├── BayesDataAnalysisWithPymc
│   ├── Data
│   │   ├── McIntyre1994data.csv
│   │   └── McDonaldSK1991data.txt
│   ├── short_hdi.py
│   ├── YmetricXsinglePyMC.py
│   ├── BernTwoPyMC.py
│   ├── plot_post.py
│   ├── BernBetaMuKappaPyMC.py
│   ├── normalize.py
│   ├── SimpleLinearRegressionPyMC.py
│   └── ANOVAOnewayPyMC.py
├── LICENSE.md
└── README.md

/BayesDataAnalysisWithPymc/Data/McIntyre1994data.csv:
--------------------------------------------------------------------------------
1 | Brand,Tar,Nic,Wt,CO
2 | Alpine,14.1,0.86,0.9853,13.6
3 | BensonAndHedges,16.0,1.06,1.0938,16.6
4 | BullDurham,29.8,2.03,1.1650,23.5
5 | CamelLights,8.0,0.67,0.9280,10.2
6 | Carlton,4.1,0.40,0.9462,5.4
7 | Chesterfield,15.0,1.04,0.8885,15.0
8 | GoldenLights,8.8,0.76,1.0267,9.0
9 | Kent,12.4,0.95,0.9225,12.3
10 | Kool,16.6,1.12,0.9372,16.3
11 | LandM,14.9,1.02,0.8858,15.4
12 | LarkLights,13.7,1.01,0.9643,13.0
13 | Marlboro,15.1,0.90,0.9316,14.4
14 | Merit,7.8,0.57,0.9705,10.0
15 | MultiFilter,11.4,0.78,1.1240,10.2
16 | NewportLights,9.0,0.74,0.8517,9.5
17 | Now,1.0,0.13,0.7851,1.5
18 | OldGold,17.0,1.26,0.9186,18.5
19 | PallMallLight,12.8,1.08,1.0395,12.6
20 | Raleigh,15.8,0.96,0.9573,17.5
21 | SalemUltra,4.5,0.42,0.9106,4.9
22 | Tareyton,14.5,1.01,1.0070,15.9
23 | True,7.3,0.61,0.9806,8.5
24 | ViceroyRichLight,8.6,0.69,0.9693,10.6
25 | VirginiaSlims,15.2,1.02,0.9496,13.9
26 | WinstonLights,12.0,0.82,1.1184,14.9
27 | 
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 | 
3 | Copyright (c) 2013, Erikson Kaszubowski
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy of
6 | this software and associated documentation files (the "Software"), to deal in
7 | the Software without restriction, including without limitation the rights to
8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
9 | the Software, and to permit persons to whom the Software is furnished to do so,
10 | subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21 | 
--------------------------------------------------------------------------------
/BayesDataAnalysisWithPymc/short_hdi.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | '''Algorithm to calculate the shortest Highest Density Interval
3 | (HDI). Adaptation of the R code from "Doing Bayesian Data Analysis",
4 | by John K. Kruschke.
5 | More info: http://doingbayesiandataanalysis.blogspot.com.br/
6 | 
7 | '''
8 | 
9 | 
10 | def short_hdi(sample, cred=0.95):
11 |     '''Calculate the shortest Highest Density Interval from
12 |     the posterior distribution sampled via MCMC.
13 | 
14 |     :Arguments:
15 |     sample: A list with the values of the posterior distribution.
16 |     cred: The mass of the posterior for which the interval is computed.
17 | Default is 95%, should be a float from 0.0 to 1.0. 18 | 19 | Returns a tuple with the limits of the HDI. 20 | 21 | PyMC has a 95% HDI algorithm, but it uses quantiles. 22 | 23 | ''' 24 | sorted_sample = sorted(sample) 25 | ci_index = int(cred * len(sorted_sample)) # Uses 'int()' for R's 'floor()' 26 | num_ci = len(sorted_sample) - ci_index 27 | 28 | ci_width = [] 29 | for i in range(num_ci): 30 | width = sorted_sample[i + ci_index] - sorted_sample[i] 31 | ci_width.append(width) 32 | 33 | hdi_min = sorted_sample[ci_width.index(min(ci_width))] 34 | hdi_max = sorted_sample[ci_width.index(min(ci_width)) + ci_index] 35 | hdi_lim = (hdi_min, hdi_max) 36 | return hdi_lim 37 | -------------------------------------------------------------------------------- /BayesDataAnalysisWithPymc/YmetricXsinglePyMC.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | '''Hierarchical Model for inferring the mean (mu) 3 | and precision (tau) of normal likelihood data via MCMC. 4 | Python (PyMC) adaptation of the R code from "Doing Bayesian Data Analysis", 5 | by John K. Krushcke. 6 | More info: http://doingbayesiandataanalysis.blogspot.com.br/ 7 | 8 | ''' 9 | from __future__ import division 10 | 11 | import pymc 12 | import numpy as np 13 | from matplotlib import pyplot as plot 14 | from plot_post import plot_post 15 | 16 | # For simplicity's sake, I will generate random data just like 17 | # the R code in the book. 18 | 19 | t_mean = 100 20 | t_sd = 15 21 | N = 20 22 | 23 | # Generate N samples, no rounding. 24 | 25 | y = np.random.normal(t_mean, t_sd, N) 26 | 27 | # Defining the priors for mu and tau. 28 | 29 | mu = pymc.Normal('mu', 0.0, 1.0e-10) # Mean: 0.0, SD: 100000 30 | tau = pymc.Gamma('tau', 0.01, 0.01) # Mean: 1.0, SD: 10 31 | 32 | # Now the likelihood function. 33 | 34 | like = pymc.Normal('like', mu, tau, value=y, observed=True) 35 | 36 | # Create the model, generate initialization values and sample its posterior. 37 | 38 | model = pymc.Model([like, mu, tau]) 39 | map_ = pymc.MAP(model) 40 | map_.fit() 41 | mcmc = pymc.MCMC(model) 42 | mcmc.sample(iter=60000, burn=40000, thin=2) 43 | 44 | # Sample the posterior for the parameter estimates. 45 | 46 | mu_sample = mcmc.trace('mu')[:] 47 | tau_sample = mcmc.trace('tau')[:] 48 | 49 | # Keeping the same idea as the book: convert the posterior samples to SD. 50 | 51 | sigma_sample = 1 / np.sqrt(tau_sample) 52 | 53 | # Plot the results. 54 | 55 | plot.figure(figsize=(8.0, 8.0)) 56 | 57 | plot.subplot(211) 58 | plot_post(mu_sample, title=r'$\mu$ posterior distribution') 59 | 60 | plot.subplot(212) 61 | plot_post(sigma_sample, title=r'$\sigma$ posterior distribution') 62 | 63 | plot.subplots_adjust(wspace=0.2, hspace=0.2) 64 | plot.show() 65 | -------------------------------------------------------------------------------- /BayesDataAnalysisWithPymc/BernTwoPyMC.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | ''' Model for inferring two binomial proportions via MCMC. 3 | Python (PyMC) adaptation of the R code from "Doing Bayesian Data Analysis", 4 | by John K. Krushcke. 5 | More info: http://doingbayesiandataanalysis.blogspot.com.br/ 6 | 7 | ''' 8 | from __future__ import division 9 | 10 | import pymc 11 | from matplotlib import pyplot as plot 12 | from plot_post import plot_post 13 | 14 | # TODO: It would be good to import data from CSV files. 
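# A rough sketch of how that could look with NumPy (untested; it assumes a
# hypothetical 'Data/BernTwoData.csv' with a header row and one 0/1 column
# per coin, which is not part of this repository):
#
#     import numpy as np
#     flips1, flips2 = np.genfromtxt('Data/BernTwoData.csv', delimiter=',',
#                                    skip_header=1, unpack=True)
#     data = [flips1.astype(int).tolist(), flips2.astype(int).tolist()]
#
# For now, the hard-coded lists below keep the script self-contained.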
15 | 16 | # Model specification in PyMC goes backwards, in comparison to JAGS: 17 | # first the prior are specified, THEN the likelihood function. 18 | 19 | # TODO: With PyMC, it´s possible to define many stochastic variables 20 | # in just one variable name using the 'size' function parameter. 21 | 22 | # But for now, I will use multiple variable names for simplicity. 23 | 24 | theta1 = pymc.Beta('theta1', alpha=3, beta=3) 25 | theta2 = pymc.Beta('theta2', alpha=3, beta=3) 26 | 27 | # Define the observed data. 28 | 29 | data = [[1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1], 30 | [1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0]] 31 | 32 | # Define the likelihood function for the observed data. 33 | 34 | like1 = pymc.Bernoulli('like1', theta1, observed=True, value=data[0]) 35 | like2 = pymc.Bernoulli('like2', theta2, observed=True, value=data[1]) 36 | 37 | # Use the PyMC 'Model' class to collect all the variables we are interested in. 38 | 39 | model = pymc.Model([theta1, theta2]) 40 | 41 | # And instantiate the MCMC class to sample the posterior. 42 | 43 | mcmc = pymc.MCMC(model) 44 | mcmc.sample(40000, 10000, 1) 45 | 46 | # Use PyMC built-in plot function to show graphs of the samples. 47 | 48 | # pymc.Matplot.plot(mcmc) 49 | # plot.show() 50 | 51 | # Let's try plotting using Matplotlib's 'pyplot'. 52 | # First, we extract the traces for the parameters of interest. 53 | 54 | theta1_samples = mcmc.trace('theta1')[:] 55 | theta2_samples = mcmc.trace('theta2')[:] 56 | theta_diff = theta2_samples - theta1_samples 57 | 58 | # Then, we plot a histogram of their individual sample values. 59 | 60 | plot.figure(figsize=(8.0, 10)) 61 | 62 | plot.subplot(311) 63 | plot_post(theta1_samples, title=r'Posterior of $\theta_1$') 64 | 65 | plot.subplot(312) 66 | plot_post(theta2_samples, title=r'Posterior of $\theta_2$') 67 | 68 | plot.subplot(313) 69 | plot_post(theta_diff, title=r'Posterior of $\Delta\theta$', comp=0.0) 70 | 71 | plot.subplots_adjust(hspace=0.5) 72 | plot.show() 73 | -------------------------------------------------------------------------------- /BayesDataAnalysisWithPymc/plot_post.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | '''Plot the histogram of the posterior distribution sample, 3 | with the mean and the 95% HDI. 4 | Adaptation of the R code from "Doing Bayesian Data Analysis", 5 | by John K. Krushcke. 6 | More info: http://doingbayesiandataanalysis.blogspot.com.br/ 7 | 8 | Histogram code based on (copied from!) 'Probabilistic Programming and 9 | Bayesian Methods for Hackers', by Cameron Davidson-Pilon. 10 | More info: https://github.com/CamDavidsonPilon/ 11 | Probabilistic-Programming-and-Bayesian-Methods-for-Hackers 12 | 13 | ''' 14 | 15 | from __future__ import division 16 | 17 | from short_hdi import short_hdi 18 | from matplotlib import pyplot as plot 19 | 20 | 21 | def plot_post(sample, title='Posterior', 22 | cred=0.95, comp=None, *args, **kwargs): 23 | '''Plot the histogram of the posterior distribution sample, 24 | with the mean and the HDI. 25 | 26 | :Arguments: 27 | sample: array of sample values. 28 | cred: credible interval (default: 95%) 29 | comp: value for comparison (default: None) 30 | title: String value for graph title. 31 | 32 | ''' 33 | # First we compute the shortest HDI using Krushcke's algorithm. 34 | 35 | sample_hdi = short_hdi(sample) 36 | 37 | # Then we plot the histogram of the sample. 
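    # (Note: 'normed=True' is the pre-Matplotlib-2.x way of requesting a
    # probability-density histogram; newer releases call this 'density'.)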
38 |     ax = plot.hist(sample,
39 |                    bins=25,
40 |                    alpha=0.85,
41 |                    label='',
42 |                    normed=True)
43 | 
44 |     # Force the y-axis to be limited to 1.1 times the max probability density.
45 |     maxy = 1.1 * max(ax[0])
46 |     plot.ylim(0.0, maxy)
47 | 
48 |     # No y-axis ticks; they are not important here.
49 |     plot.yticks([])
50 | 
51 |     # Should we plot a vertical line on the mean?
52 |     #plot.vlines(sample.mean(), 0, maxy, linestyle='--',
53 |     #            label=r'Mean (%0.3f)' % sample.mean())
54 |     # But we keep the mean value in its right place.
55 | 
56 |     plot.text(sample.mean(), 0.9 * max(ax[0]), 'Mean: %0.3f' % sample.mean())
57 | 
58 |     #plot.legend(loc='upper right') #Legends are cumbersome!
59 |     plot.title(title)
60 | 
61 |     # Plot the HDI as a horizontal line with its limit values.
62 |     plot.hlines(y=0, xmin=sample_hdi[0], xmax=sample_hdi[1], linewidth=6)
63 |     plot.text(sample_hdi[0], max(ax[0]) / 20, '%0.3f' % sample_hdi[0],
64 |               horizontalalignment='center')
65 |     plot.text(sample_hdi[1], max(ax[0]) / 20, '%0.3f' % sample_hdi[1],
66 |               horizontalalignment='center')
67 | 
68 |     # In case there is a comparison value, plot it and
69 |     # compute how much of the posterior falls at each side.
70 |     if comp is not None:
71 |         loc = max(ax[0]) / 2.0
72 |         plot.vlines(comp, 0, loc, color='green', linestyle='--')
73 |         less = 100 * (sum(sample < comp)) / len(sample)
74 |         more = 100 * (sum(sample > comp)) / len(sample)
75 |         print less, more
76 |         plot.text(comp, loc, '%0.1f%% < %0.1f < %0.1f%%' % (less, comp, more),
77 |                   color='green', horizontalalignment='center')
78 | 
79 |     #return ax # I thought the function should return something. It's not needed.
80 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Bayesian Data Analysis *with PyMC*
2 | ======================
3 | 
4 | ###For those in a hurry
5 | 
6 | My first attempt to adapt to Python (using PyMC) the R code from "Doing Bayesian Data Analysis",
7 | by John K. Kruschke.
8 | 
9 | ####Models done so far:
10 | - Inferring two binomial proportions and their difference;
11 | - Hierarchical prior for Bernoulli likelihood;
12 | - Metric variable for a single group;
13 | - Simple linear regression;
14 | - Oneway ANOVA.
15 | 
16 | ###Quick References
17 | >1. "Doing Bayesian Data Analysis", by John K. Kruschke
18 | >[http://doingbayesiandataanalysis.blogspot.com.br/](http://doingbayesiandataanalysis.blogspot.com.br/)
19 | >
20 | >2. "Probabilistic Programming and Bayesian Methods for Hackers", by Cam Davidson-Pilon
21 | >[https://github.com/CamDavidsonPilon/Probabilistic-Programming-and-Bayesian-Methods-for-Hackers/](https://github.com/CamDavidsonPilon/Probabilistic-Programming-and-Bayesian-Methods-for-Hackers/)
22 | >
23 | >3. "PyMC", by Christopher Fonnesbeck, Anand Patil and David Huard
24 | >[http://pymc-devs.github.io/pymc/](http://pymc-devs.github.io/pymc/)
25 | 
26 | 
27 | ###For those with some time to spare
28 | 
29 | After many years avoiding any book on statistics, I found myself interested in Bayesian methods
30 | when I first heard about them in the NLTK (Natural Language ToolKit for Python) Book.
31 | I read some introductory texts on Bayes' Theorem, but even though the concepts were somewhat clear, I still
32 | wasn't able to understand how useful it was.
33 | 
34 | Then I found Kruschke's "Doing Bayesian Data Analysis". It was love at first few-pages-glance. Seriously,
35 | the book is great, especially for those who know nothing about statistics, like myself. The language is
36 | accessible, there are tons of examples, the hierarchical models are cleverly illustrated using a unique style...
37 | What are you still doing here? Go buy the book!
38 | 
39 | Ahem, where was I? Oh, yeah. Kruschke's book. Have I mentioned that it uses R and JAGS (or BUGS) to implement
40 | the concepts and models? It's really a great deal: you learn Bayesian statistics, general probability AND R.
41 | But I had problems with R syntax. It's so clumsy, and weird and... Well, I don't like it. Great packages,
42 | but I really prefer Python.
43 | 
44 | Being a Python fan, I had to find a way to adapt the original code. It became possible after I discovered
45 | PyMC in Cam Davidson-Pilon's "Probabilistic Programming and Bayesian Methods for Hackers". Also a great book,
46 | published in an original medium: IPython Notebooks - you read the book, you read (and edit!) the code and run
47 | it all in your favorite browser. Did I mention it is free?
48 | 
49 | So, as an exercise in Bayesian statistics and in Python programming, I adapted some models used by Kruschke
50 | in his book to Python, using PyMC (and NumPy and Matplotlib). This is just a first attempt, and I have only
51 | "translated" Kruschke's models to Python - and in a very non-Pythonic way. There's plenty of room for improvement,
52 | but the code shouldn't break if the data is entered correctly. The code is also heavily commented - after all,
53 | the point here is to exercise the concepts and programming skills.
54 | 
55 | Feel free to fork, modify and send a pull request - all suggestions are welcome!
56 | 
--------------------------------------------------------------------------------
/BayesDataAnalysisWithPymc/Data/McDonaldSK1991data.txt:
--------------------------------------------------------------------------------
1 | # From http://udel.edu/~mcdonald/statanovasig.html
2 | # "Here are some data on a shell measurement (the length of the anterior
3 | # adductor muscle scar, standardized by dividing by length) in the mussel
4 | # Mytilus trossulus from five locations: Tillamook, Oregon; Newport, Oregon;
5 | # Petersburg, Alaska; Magadan, Russia; and Tvarminne, Finland,
6 | # taken from a much larger data set used in McDonald et al. (1991)."
7 | #
8 | # McDonald, J. H., R. Seed and R. K. Koehn. 1991.
9 | # Allozymes and morphometric characters of three species of Mytilus
10 | # in the Northern and Southern Hemispheres.
11 | # Mar. Biol. 111:323-333.
12 | # 13 | # Group code: 14 | # 1=Tillamook,Oregon 15 | # 2=Newport,Oregon 16 | # 3=Petersburg,Alaska 17 | # 4=Magadan,Russia 18 | # 5=Tvarminne,Finland 19 | Group Size Site 20 | 1 0.0571 OregonT 21 | 1 0.0813 OregonT 22 | 1 0.0831 OregonT 23 | 1 0.0976 OregonT 24 | 1 0.0817 OregonT 25 | 1 0.0859 OregonT 26 | 1 0.0735 OregonT 27 | 1 0.0659 OregonT 28 | 1 0.0923 OregonT 29 | 1 0.0836 OregonT 30 | 2 0.0873 OregonN 31 | 2 0.0662 OregonN 32 | 2 0.0672 OregonN 33 | 2 0.0819 OregonN 34 | 2 0.0749 OregonN 35 | 2 0.0649 OregonN 36 | 2 0.0835 OregonN 37 | 2 0.0725 OregonN 38 | 3 0.0974 Alaska 39 | 3 0.1352 Alaska 40 | 3 0.0817 Alaska 41 | 3 0.1016 Alaska 42 | 3 0.0968 Alaska 43 | 3 0.1064 Alaska 44 | 3 0.1050 Alaska 45 | 4 0.1033 Russia 46 | 4 0.0915 Russia 47 | 4 0.0781 Russia 48 | 4 0.0685 Russia 49 | 4 0.0677 Russia 50 | 4 0.0697 Russia 51 | 4 0.0764 Russia 52 | 4 0.0689 Russia 53 | 5 0.0703 Finland 54 | 5 0.1026 Finland 55 | 5 0.0956 Finland 56 | 5 0.0973 Finland 57 | 5 0.1039 Finland 58 | 5 0.1045 Finland 59 | # 60 | # http://udel.edu/~mcdonald/statanovaunplanned.html 61 | # shows that Tukey-Kramer method of unplanned comparisons 62 | # groups 63 | # Newport/Magadan/Tillamook (2/4/1), 64 | # Magadan/Tillamook/Tvarminne (4/1/5), 65 | # and Tvarminne/Petersburg (5/3). 66 | # 67 | # From http://udel.edu/~mcdonald/statanovaplanned.html: 68 | # Really important note about planned comparisons 69 | # Planned comparisons must be planned before you look at the data. If you 70 | # look at some data, pick out an interesting comparison, then analyze it as 71 | # if it were a planned comparison, you will be committing scientific fraud. 72 | # For example, if you look at the mean arch heights for the nine sports, see 73 | # that cross-country has the lowest mean and swimming has the highest mean, 74 | # then compare just those two means, your P-value will be much too low. This 75 | # is because there are 36 possible pairwise comparisons in a set of 9 means. 76 | # You expect 5 percent, or 1 out of 20, tests to be "significant" at the 77 | # P<0.05 level, even if all the data really fit the null hypothesis, so 78 | # there's a good chance that the most extreme comparison in a set of 36 79 | # will have a P-value less than 0.05. 80 | # It would be acceptable to run a pilot experiment and plan your planned 81 | # comparisons based on the results of the pilot experiment. However, if you 82 | # do this you could not include the data from the pilot experiment in the 83 | # analysis; you would have to limit your anova to the new data. -------------------------------------------------------------------------------- /BayesDataAnalysisWithPymc/BernBetaMuKappaPyMC.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | '''Hierarchical Model for inferring the mean (mu) 3 | and sample size (kappa) of various Bernoulli trials via MCMC. 4 | Python (PyMC) adaptation of the R code from "Doing Bayesian Data Analysis", 5 | by John K. Krushcke. 6 | More info: http://doingbayesiandataanalysis.blogspot.com.br/ 7 | 8 | ''' 9 | from __future__ import division 10 | 11 | import pymc 12 | from matplotlib import pyplot as plot 13 | from plot_post import plot_post 14 | 15 | # For better code flow, we define the data first. 16 | # Based on the original code's 'Therapeutic touch data'. 17 | 18 | z = [1, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 19 | 4, 5, 5, 5, 5, 5, 5, 5, 6, 6, 7, 7, 7, 8] 20 | N = 10 # Number of trials for each z. 
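# The list comprehension below expands each subject's z successes out of N
# trials into explicit Bernoulli outcomes; e.g. z = 3 with N = 10 becomes
# [0, 0, 0, 0, 0, 0, 0, 1, 1, 1] (seven misses followed by three hits).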
21 | 22 | data = [[0] * (N - i) + [1] * i for i in z] # Build the Bernoulli trial data. 23 | 24 | # Again, with PyMC we design the model from top to bottom. 25 | # Let's start defining the hierarchical prior's constants. 26 | # A,B constant for the overall beta distribution for mu. 27 | 28 | a_mu = 2.0 29 | b_mu = 2.0 30 | 31 | # Shape and rate constants for the overall gamma distribution for kappa. 32 | # They are reparametrized as mean and standard deviation. 33 | 34 | s_kappa = 10**2 / 10**2 35 | r_kappa = 10 / 10**2 36 | 37 | # Then, we define the overall beta and gamma distributions. 38 | 39 | mu = pymc.Beta('mu', a_mu, b_mu) 40 | kappa = pymc.Gamma('kappa', s_kappa, r_kappa) 41 | 42 | # Instead of using a 'for' loop for multiple stochastic variables, 43 | # we use the 'size' parameter of PyMC. This is why we defined the data first. 44 | # We could use a '@deterministic' wrapper, but operations already generate it. 45 | 46 | a = mu * kappa 47 | b = (1.0 - mu) * kappa 48 | 49 | theta = pymc.Beta('theta', a, b, size=len(data)) # One beta for each subject. 50 | 51 | # The priors are defined. Now we need to set the likelihood of our data. 52 | # The likelihood can't be defined the same way. We need a 'for' loop. 53 | # Or the 'Lambda()' class. 54 | # For more info: https://github.com/pymc-devs/pymc/issues/319 55 | # 56 | # for i in range(len(data)): 57 | # like_i = pymc.Bernoulli('like_%i' % i, p=theta[i], value=data[i], 58 | # observed=True) 59 | # 60 | # The code above works nicely (the posterior result is the same, and each theta 61 | # is updated with its data. But how does looping the declaration of the same 62 | # variable works? I prefer the following code, since it makes more sense. 63 | 64 | like = [] 65 | for i in range(len(data)): 66 | like.append(pymc.Bernoulli('like_%i' % i, p=theta[i], 67 | value=data[i], observed=True)) 68 | 69 | # Done! Now we need to collect the variables and fit our model. 70 | 71 | model = pymc.Model([theta, mu, kappa]) 72 | 73 | map_ = pymc.MAP(model) 74 | map_.fit() 75 | 76 | mcmc = pymc.MCMC(model) 77 | mcmc.sample(iter=60000, burn=10000, thin=2) 78 | 79 | # Extracting the parameter samples. 80 | 81 | mu_sample = mcmc.trace('mu')[:] 82 | kappa_sample = mcmc.trace('kappa')[:] 83 | theta_sample = mcmc.trace('theta')[:] 84 | 85 | # And plot them. 86 | 87 | plot.figure(figsize=(8.0, 8.0)) 88 | 89 | plot.subplot(221) 90 | plot_post(mu_sample, comp=0.5, title=r'$\mu$ posterior distribution') 91 | 92 | plot.subplot(222) 93 | plot_post(kappa_sample, title=r'$\kappa$ posterior distribution') 94 | 95 | plot.subplot(223) 96 | plot_post(theta_sample[:, 0], title=r'$\theta_1$ posterior distribution') 97 | 98 | plot.subplot(224) 99 | plot_post(theta_sample[:, 27], title=r'$\theta_{28}$ posterior distribution') 100 | 101 | plot.subplots_adjust(wspace=0.2, hspace=0.2) 102 | plot.show() 103 | -------------------------------------------------------------------------------- /BayesDataAnalysisWithPymc/normalize.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | '''Function to normalize data and convert parameter back to original scale. 3 | Python (PyMC) adaptation of the R code from "Doing Bayesian Data Analysis", 4 | by John K. Krushcke. 5 | More info: http://doingbayesiandataanalysis.blogspot.com.br/ 6 | 7 | ''' 8 | from __future__ import division 9 | import numpy as np 10 | 11 | 12 | def normalize(data): 13 | '''Normalizes a set of data. 
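    Returns the z-scores (data - mean) / sd, where sd is the population
    standard deviation; e.g. normalize([1.0, 2.0, 3.0]) is approximately
    array([-1.2247, 0., 1.2247]).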
14 | 
15 |     '''
16 | 
17 |     mean = np.mean(data)
18 |     sd = np.sqrt(np.var(data))
19 |     z_data = (data - mean) / sd
20 |     return z_data
21 | 
22 | 
23 | def convert_slope(x_data, y_data, zb1_sample):
24 |     '''Converts normalized b1 sample back to original scale.
25 | 
26 |     :Arguments:
27 |     x_data: original predictor data list.
28 |     y_data: original predicted data list.
29 |     zb1_sample: normalized parameter samples.
30 | 
31 |     '''
32 | 
33 |     y_sd = np.sqrt(np.var(y_data))
34 |     x_sd = np.sqrt(np.var(x_data))
35 |     b1 = zb1_sample * (y_sd / x_sd)
36 |     return b1
37 | 
38 | 
39 | def convert_intercept(x_data, y_data, zb0_sample, zb1_sample):
40 |     '''Converts normalized b0 sample back to original scale.
41 | 
42 |     :Arguments:
43 |     x_data: original predictor data list.
44 |     y_data: original predicted data list.
45 |     zb0_sample: normalized parameter samples.
46 |     zb1_sample: normalized parameter samples.
47 | 
48 |     '''
49 | 
50 |     y_sd = np.sqrt(np.var(y_data))
51 |     y_mean = np.mean(y_data)
52 | 
53 |     x_sd = np.sqrt(np.var(x_data))
54 |     x_mean = np.mean(x_data)
55 | 
56 |     b0 = zb0_sample * y_sd + y_mean - zb1_sample * (y_sd * x_mean) / x_sd
57 |     return b0
58 | 
59 | 
60 | def convert_tau_sigma(y_data, ztau_sample):
61 |     '''Converts normalized tau samples back to original scale SD.
62 | 
63 |     :Arguments:
64 |     y_data: original predicted data list.
65 |     ztau_sample: normalized tau parameter samples.
66 | 
67 |     '''
68 |     z_sigma = 1 / np.sqrt(ztau_sample)
69 |     y_sd = np.sqrt(np.var(y_data))
70 |     sigma = z_sigma * y_sd
71 |     return sigma
72 | 
73 | def convert_sigma(y_data, zsigma_sample):
74 |     '''Converts normalized sigma samples back to original scale SD.
75 | 
76 |     :Arguments:
77 |     y_data: original predicted data list.
78 |     zsigma_sample: normalized sigma parameter samples.
79 | 
80 |     '''
81 |     y_sd = np.sqrt(np.var(y_data))
82 |     sigma = zsigma_sample * y_sd
83 |     return sigma
84 | 
85 | 
86 | def convert_baseline(a0_sample, a_sample, x_levels, y_data):
87 |     '''Convert normalized ANOVA baseline back to original scale.
88 | 
89 |     :Arguments:
90 |     a0_sample: normalized baseline samples.
91 |     a_sample: normalized deflection samples.
92 |     x_levels: integer, levels of categorical variable.
93 |     y_data: original predicted data list.
94 | 
95 |     '''
96 |     m_sample = a0_sample.repeat(x_levels).reshape(len(a0_sample), x_levels) \
97 |         + a_sample
98 |     b0_sample = m_sample.mean(axis=1)
99 |     b0_sample = b0_sample * np.sqrt(np.var(y_data)) + np.mean(y_data)
100 |     return b0_sample
101 | 
102 | 
103 | def convert_deflection(a0_sample, a_sample, x_levels, y_data):
104 |     '''Convert normalized ANOVA deflections back to original scale.
105 | 
106 |     :Arguments:
107 |     a0_sample: normalized baseline samples.
108 |     a_sample: normalized deflection samples.
109 |     x_levels: integer, levels of categorical variable.
110 |     y_data: original predicted data list.
111 | 
112 |     '''
113 |     m_sample = a0_sample.repeat(x_levels).reshape(len(a0_sample), x_levels) \
114 |         + a_sample
115 |     b0_sample = m_sample.mean(axis=1)
116 |     b_sample = (m_sample -
117 |                 b0_sample.repeat(x_levels).reshape(len(b0_sample), x_levels))
118 |     b_sample = b_sample * np.sqrt(np.var(y_data))
119 |     return b_sample
120 | 
--------------------------------------------------------------------------------
/BayesDataAnalysisWithPymc/SimpleLinearRegressionPyMC.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | '''Hierarchical Model for estimation of simple linear regression
3 | parameters via MCMC.
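Both variables are standardized before sampling, and the intercept, slope
and scale samples are converted back to the original data scale afterwards
(see normalize.py).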
4 | Python (PyMC) adaptation of the R code from "Doing Bayesian Data Analysis", 5 | by John K. Krushcke. 6 | More info: http://doingbayesiandataanalysis.blogspot.com.br/ 7 | 8 | ''' 9 | from __future__ import division 10 | 11 | import pymc 12 | import numpy as np 13 | from matplotlib import pyplot as plot 14 | from plot_post import plot_post 15 | from normalize import (normalize, convert_intercept, 16 | convert_slope, convert_tau_sigma) 17 | from os import path 18 | 19 | # Code to find the data path. 20 | 21 | scr_dir = path.dirname(__file__) 22 | file_name = 'McIntyre1994data.csv' 23 | comp_dir = path.join(scr_dir, 'Data', file_name) 24 | 25 | # So, let's be lazy: the data are from McIntyre cigarette weight. 26 | # Use numpy to load the data we want directly in the appropriate variables. 27 | 28 | y, x = np.genfromtxt(comp_dir, delimiter=',', 29 | skip_header=1, usecols=(1, 3), unpack=True) 30 | 31 | # Let's try normalizing, as suggested by Krushcke. 32 | 33 | zy = normalize(y) 34 | zx = normalize(x) 35 | 36 | # Define the priors for the model. 37 | # First, normal priors for the slope and intercept. 38 | 39 | beta0 = pymc.Normal('b0', 0.0, 1.0e-10) 40 | beta1 = pymc.Normal('b1', 0.0, 1.0e-10) 41 | 42 | # Then, gamma and uniform prior for precision and DoF. 43 | # Krushcke suggests the use of a Student's t distribution for the likelihood. 44 | # It makes the estimation more robust in the presence of outliers. 45 | # We will use Krushcke's DoF transformation using a gain constant. 46 | 47 | tau = pymc.Gamma('tau', 0.01, 0.01) 48 | udf = pymc.Uniform('udf', 0.0, 1.0) 49 | tdf_gain = 1 50 | 51 | 52 | @pymc.deterministic 53 | def tdf(udf=udf, tdf_gain=tdf_gain): 54 | return 1 - tdf_gain * np.log(1 - udf) 55 | 56 | # Defining the linear relationship between variables. 57 | 58 | 59 | @pymc.deterministic 60 | def mu(beta0=beta0, beta1=beta1, x=zx): 61 | mu = beta0 + beta1 * x 62 | return mu 63 | 64 | 65 | # Finally, the likelihood using Student's t distribution. 66 | 67 | like = pymc.NoncentralT('like', mu=mu, lam=tau, nu=tdf, 68 | value=zy, observed=True) 69 | 70 | # For those who want a more traditional linear model: 71 | #like = pymc.Normal('like', mu=mu, tau=tau, value=zy, observed=True) 72 | 73 | # The model is ready! Sampling code below. 74 | 75 | model = pymc.Model([beta0, beta1, tau, tdf]) 76 | fit = pymc.MAP(model) 77 | fit.fit() 78 | mcmc = pymc.MCMC(model) 79 | mcmc.sample(iter=100000, burn=50000, thin=10) 80 | 81 | # Collect the sample values for the parameters. 82 | 83 | z0_sample = mcmc.trace('b0')[:] 84 | z1_sample = mcmc.trace('b1')[:] 85 | ztau_sample = mcmc.trace('tau')[:] 86 | tdf_sample = mcmc.trace('tdf')[:] 87 | 88 | # Convert the data back to scale. 89 | 90 | b0_sample = convert_intercept(x, y, z0_sample, z1_sample) 91 | b1_sample = convert_slope(x, y, z1_sample) 92 | sigma_sample = convert_tau_sigma(y, ztau_sample) 93 | 94 | # Plot the results 95 | 96 | plot.figure(figsize=(8.0, 8.0)) 97 | 98 | plot.subplot(221) 99 | plot_post(b0_sample, title=r'$\beta_0$ posterior') 100 | 101 | plot.subplot(222) 102 | plot_post(b1_sample, title=r'$\beta_1$ posterior') 103 | 104 | plot.subplot(223) 105 | plot_post(sigma_sample, title=r'$\sigma$ posterior') 106 | 107 | plot.subplot(224) 108 | plot_post(tdf_sample, title=r'tDF posterior') 109 | 110 | plot.subplots_adjust(wspace=0.2, hspace=0.2) 111 | 112 | # Plot the data with some credible regression lines. 
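# (Roughly 50 evenly spaced posterior draws of the intercept/slope pair are
# drawn as straight lines over the scatter plot, giving a quick visual
# impression of the uncertainty around the regression line.)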
113 | 114 | plot.figure(figsize=(8.0, 8.0)) 115 | 116 | plot.scatter(x, y, c='k', s=60) 117 | plot.title('Data points with credible regression lines') 118 | 119 | x1 = plot.axis()[0] 120 | x2 = plot.axis()[1] 121 | 122 | plot.autoscale(enable=False) 123 | 124 | for line in range(0, len(b1_sample), len(b1_sample) // 50): 125 | plot.plot([x1, x2], [b0_sample[line] + b1_sample[line] * x1, 126 | b0_sample[line] + b1_sample[line] * x2], 127 | c='#348ABD', lw=1) 128 | 129 | plot.show() 130 | -------------------------------------------------------------------------------- /BayesDataAnalysisWithPymc/ANOVAOnewayPyMC.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | '''Hierarchical Model for estimation of oneway ANOVA parameters via MCMC. 3 | Python (PyMC) adaptation of the R code from "Doing Bayesian Data Analysis", 4 | by John K. Krushcke. 5 | More info: http://doingbayesiandataanalysis.blogspot.com.br/ 6 | 7 | ''' 8 | from __future__ import division 9 | 10 | import pymc 11 | import numpy as np 12 | from matplotlib import pyplot as plot 13 | from plot_post import plot_post 14 | from normalize import (normalize, convert_baseline, convert_deflection, 15 | convert_sigma) 16 | from math import ceil 17 | from os import path 18 | 19 | # Code to find the data path. 20 | 21 | scr_dir = path.dirname(__file__) 22 | file_name = 'McDonaldSK1991data.txt' 23 | comp_dir = path.join(scr_dir, 'Data', file_name) 24 | 25 | # Using data from the book for easier comparison. 26 | # Data from McDonald (1991) study about geographical location and muscle size 27 | # in mussels. 28 | # Again we use Numpy to assign the data to variables. 29 | 30 | x, y = np.genfromtxt(comp_dir, delimiter=' ', 31 | skip_header=19, usecols=(0, 1), unpack=True) 32 | 33 | # Define the contrasts. 34 | # TODO: use dictionary for easier retrieval of contrast description. 35 | 36 | contrasts = np.array(((-1/3, -1/3, 1/2, -1/3, 1/2), 37 | (1, -1, 0, 0, 0), 38 | (-1/2, -1/2, 1, 0, 0), 39 | (-1/2, -1/2, 1/2, 1/2, 0), 40 | (1/3, 1/3, 1/3, -1, 0), 41 | (-1/4, -1/4, -1/4, -1/4, 1), 42 | (1/3, 1/3, 1/3, -1/2, -1/2), 43 | (0, 0, 0, -1, 1))) 44 | # Random data, for test purposes: 45 | 46 | # y_truesd = 4.0 47 | # a0_true = 100 48 | # atrue = [15, -10, -7, 8, -6] 49 | 50 | #x = [1] * 3 + [2] * 4 + [3] * 3 + [4] * 5 + [5] * 3 51 | #y = [a0_true + atrue[i - 1] + np.random.normal(0, y_truesd) for i in x] 52 | 53 | 54 | # Normalize the data for better MCMC performance. 55 | # And define the total number of levels in our categorical variable. 56 | 57 | zy = normalize(y) 58 | x_levels = len(set(x)) 59 | y_mean = np.mean(y) 60 | y_sd = np.sqrt(np.var(y)) 61 | 62 | # Begin the definition of the model. 63 | # First, we define a Gamma distribution for the precision of 64 | # the deflection parameters. 65 | 66 | a_sd = pymc.Gamma('a_sd', 1.01005, 0.1005) 67 | 68 | @pymc.deterministic 69 | def a_tau(a_sd=a_sd): 70 | return 1.0 / a_sd**2 71 | 72 | # Then we define a normal prior on the baseline and deflection parameters. 73 | 74 | a0 = pymc.Normal('a0', mu=0.0, tau=0.001) 75 | a = pymc.Normal('a', mu=0.0, tau=a_tau, size=x_levels) 76 | 77 | # Almost there! We still need to set the prior on the data variance. 78 | 79 | sigma = pymc.Uniform('sigma', 0, 10) 80 | 81 | 82 | @pymc.deterministic 83 | def tau(sigma=sigma): 84 | return 1.0 / sigma**2 85 | 86 | # The priors are all set! Now we can define the linear model. 87 | # Maybe it can be clearly defined using the 'Lambda()' class. 
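# (An untested sketch of that alternative, assuming x is the NumPy array of
# 1-based group codes loaded above:
#
#     mu = pymc.Lambda('mu', lambda a0=a0, a=a: a0 + a[x.astype(int) - 1])
# )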
88 | # But we will use a 'for' loop for easier readability. 89 | 90 | mu = [] 91 | for i in x: 92 | mu.append(a0 + a[int(i - 1)]) 93 | 94 | # And the likelihood. 95 | 96 | like_y = pymc.Normal('like_y', mu=mu, tau=tau, value=zy, observed=True) 97 | 98 | # Now we build the model, set the MAP and sample the posterior distribution. 99 | 100 | model = pymc.Model([like_y, a0, a, sigma, a_tau, a_sd]) 101 | map_ = pymc.MAP(model) 102 | map_.fit() 103 | mcmc = pymc.MCMC(model) 104 | mcmc.sample(iter=80000, burn=20000, thin=10) 105 | 106 | # Extract the samples. 107 | 108 | a0_sample = mcmc.trace('a0')[:] 109 | a_sample = mcmc.trace('a')[:] 110 | sigma_sample = mcmc.trace('sigma')[:] 111 | a_sd_sample = mcmc.trace('a_sd')[:] 112 | 113 | # Convert the values. 114 | 115 | b0_sample = convert_baseline(a0_sample, a_sample, x_levels, y) 116 | b_sample = convert_deflection(a0_sample, a_sample, x_levels, y) 117 | 118 | sig_sample = convert_sigma(y, sigma_sample) 119 | b_sd_sample = convert_sigma(y, a_sd_sample) 120 | 121 | # Plot the results. 122 | 123 | plot.figure(figsize=(6.0, 4.0)) 124 | 125 | plot.subplot(211) 126 | plot_post(sig_sample, title=r'$\sigma$ (cell SD) posterior') 127 | 128 | plot.subplot(212) 129 | plot_post(b_sd_sample, title=r'$aSD$ posterior') 130 | 131 | plot.subplots_adjust(wspace=0.2, hspace=0.5) 132 | 133 | plot.figure(figsize=(18.0, 3.0)) 134 | total_subplot = len(b_sample[0, :]) 135 | plot_n = 100 + (total_subplot + 1) * 10 + 1 136 | 137 | plot.subplot(plot_n) 138 | plot_post(b0_sample, title=r'$\beta_0$ posterior') 139 | 140 | for i in range(total_subplot): 141 | plot.subplot(plot_n + i + 1) 142 | plot_post(b_sample[:, i], title=r'$\beta_{1%i}$ posterior' % (i + 1)) 143 | 144 | plot.subplots_adjust(wspace=0.2) 145 | 146 | n_cons = len(contrasts) 147 | if n_cons > 0: 148 | plot_per_rows = 5 149 | plot_rows = ceil(n_cons / plot_per_rows) 150 | plot_cols = ceil(n_cons / plot_rows) 151 | 152 | plot.figure(figsize=(3.75 * plot_cols, 2.5 * plot_rows)) 153 | 154 | for i in range(n_cons): 155 | contrast = contrasts[i, :] 156 | comp = np.dot(b_sample, contrast) 157 | plot.subplot(plot_rows, plot_cols, i + 1) 158 | plot_post(comp, title='Contrast %i' % (i + 1), comp=0.0) 159 | 160 | plot.subplots_adjust(wspace=0.2, hspace=0.5) 161 | plot.show() 162 | --------------------------------------------------------------------------------