├── slides
│   ├── 1-linear_models
│   │   ├── wk1_slides.pdf
│   │   └── wk1_slides.Rmd
│   ├── 2-maximum_likelihood
│   │   ├── wk2_slides.pdf
│   │   └── wk2_slides.Rmd
│   ├── 3-bayes
│   │   ├── beetle.jpg
│   │   ├── diag.pdf
│   │   ├── wk3_slides.pdf
│   │   ├── lm.stan
│   │   └── wk3_slides.Rmd
│   ├── 4-poisson
│   │   ├── wk4_slides.pdf
│   │   ├── poisson_glm.stan
│   │   └── wk4_slides.Rmd
│   ├── 5-binomial
│   │   ├── wk5_slides.pdf
│   │   └── wk5_slides.Rmd
│   ├── 6-shrinkage
│   │   ├── pat.jpg
│   │   ├── zuur.jpg
│   │   ├── confused.jpg
│   │   ├── wk6_slides.pdf
│   │   ├── prior_predictive.R
│   │   └── wk6_slides.Rmd
│   └── 8-bhm
│       ├── fig2.pdf
│       ├── wk8_slides.pdf
│       └── wk8_slides.Rmd
├── assignments
│   ├── 0-intro
│   │   └── intro.pdf
│   ├── 1-linear_models
│   │   └── wk1_problems.Rmd
│   ├── 2-maximum_likelihood
│   │   └── wk2_problems.Rmd
│   ├── 3-bayes
│   │   └── wk3_problems.Rmd
│   ├── 4-poisson
│   │   └── wk4_problems.Rmd
│   ├── 5-binomial
│   │   └── wk5_problems.Rmd
│   ├── 6-shrinkage
│   │   └── wk6_problems.Rmd
│   └── 7-bhm
│       └── wk7_problems.Rmd
├── example_presentations
│   └── proposal_example.pdf
├── example_solutions
│   ├── 1-linear_models
│   │   ├── wk1_solutions.pdf
│   │   └── wk1_solutions.Rmd
│   ├── 2-maximum_likelihood
│   │   ├── wk2_solutions.pdf
│   │   ├── wk2_solutions_will.pdf
│   │   ├── wk2_solutions.Rmd
│   │   └── wk2_solutions_will.Rmd
│   ├── 3-bayes
│   │   ├── wk3_solutions.pdf
│   │   ├── wk3_solutions_will.pdf
│   │   ├── wk3_solutions.Rmd
│   │   └── wk3_solutions_will.Rmd
│   ├── 4-poisson
│   │   ├── wk4_solutions.pdf
│   │   ├── wk4_solutions_will.pdf
│   │   ├── wk4_solutions.Rmd
│   │   └── wk4_solutions_will.Rmd
│   └── 5-binomial
│       ├── wk5_solutions.pdf
│       ├── wk5_solutions_will.pdf
│       ├── bernoulli_glm.stan
│       ├── bernoulli_glm_test.stan
│       ├── tumor_glm_will.stan
│       ├── tumor_glm_test_will.stan
│       ├── wk5_solutions.Rmd
│       └── wk5_solutions_will.Rmd
├── README.md
├── schedule.md
├── roadmap.md
└── LICENSE
/slides/8-bhm/fig2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hmods/course/HEAD/slides/8-bhm/fig2.pdf -------------------------------------------------------------------------------- /slides/3-bayes/beetle.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hmods/course/HEAD/slides/3-bayes/beetle.jpg -------------------------------------------------------------------------------- /slides/3-bayes/diag.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hmods/course/HEAD/slides/3-bayes/diag.pdf -------------------------------------------------------------------------------- /slides/6-shrinkage/pat.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hmods/course/HEAD/slides/6-shrinkage/pat.jpg -------------------------------------------------------------------------------- /slides/6-shrinkage/zuur.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hmods/course/HEAD/slides/6-shrinkage/zuur.jpg -------------------------------------------------------------------------------- /slides/8-bhm/wk8_slides.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hmods/course/HEAD/slides/8-bhm/wk8_slides.pdf -------------------------------------------------------------------------------- /assignments/0-intro/intro.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hmods/course/HEAD/assignments/0-intro/intro.pdf 
-------------------------------------------------------------------------------- /slides/3-bayes/wk3_slides.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hmods/course/HEAD/slides/3-bayes/wk3_slides.pdf -------------------------------------------------------------------------------- /slides/4-poisson/wk4_slides.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hmods/course/HEAD/slides/4-poisson/wk4_slides.pdf -------------------------------------------------------------------------------- /slides/5-binomial/wk5_slides.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hmods/course/HEAD/slides/5-binomial/wk5_slides.pdf -------------------------------------------------------------------------------- /slides/6-shrinkage/confused.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hmods/course/HEAD/slides/6-shrinkage/confused.jpg -------------------------------------------------------------------------------- /slides/6-shrinkage/wk6_slides.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hmods/course/HEAD/slides/6-shrinkage/wk6_slides.pdf -------------------------------------------------------------------------------- /slides/1-linear_models/wk1_slides.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hmods/course/HEAD/slides/1-linear_models/wk1_slides.pdf -------------------------------------------------------------------------------- /example_presentations/proposal_example.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hmods/course/HEAD/example_presentations/proposal_example.pdf -------------------------------------------------------------------------------- /slides/2-maximum_likelihood/wk2_slides.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hmods/course/HEAD/slides/2-maximum_likelihood/wk2_slides.pdf -------------------------------------------------------------------------------- /example_solutions/3-bayes/wk3_solutions.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hmods/course/HEAD/example_solutions/3-bayes/wk3_solutions.pdf -------------------------------------------------------------------------------- /example_solutions/4-poisson/wk4_solutions.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hmods/course/HEAD/example_solutions/4-poisson/wk4_solutions.pdf -------------------------------------------------------------------------------- /example_solutions/5-binomial/wk5_solutions.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hmods/course/HEAD/example_solutions/5-binomial/wk5_solutions.pdf -------------------------------------------------------------------------------- /example_solutions/3-bayes/wk3_solutions_will.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hmods/course/HEAD/example_solutions/3-bayes/wk3_solutions_will.pdf 
-------------------------------------------------------------------------------- /example_solutions/4-poisson/wk4_solutions_will.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hmods/course/HEAD/example_solutions/4-poisson/wk4_solutions_will.pdf -------------------------------------------------------------------------------- /example_solutions/1-linear_models/wk1_solutions.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hmods/course/HEAD/example_solutions/1-linear_models/wk1_solutions.pdf -------------------------------------------------------------------------------- /example_solutions/5-binomial/wk5_solutions_will.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hmods/course/HEAD/example_solutions/5-binomial/wk5_solutions_will.pdf -------------------------------------------------------------------------------- /example_solutions/2-maximum_likelihood/wk2_solutions.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hmods/course/HEAD/example_solutions/2-maximum_likelihood/wk2_solutions.pdf -------------------------------------------------------------------------------- /example_solutions/2-maximum_likelihood/wk2_solutions_will.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hmods/course/HEAD/example_solutions/2-maximum_likelihood/wk2_solutions_will.pdf -------------------------------------------------------------------------------- /slides/4-poisson/poisson_glm.stan: -------------------------------------------------------------------------------- 1 | data { 2 | int n; // sample size 3 | int p; // number of coefficients 4 | matrix[n, p] X; 5 | int y[n]; 6 | } 7 | 8 | parameters { 9 | vector[p] beta; 10 | } 11 | 12 | model { 13 | beta ~ normal(0, 5); 14 | y ~ poisson_log(X * beta); 15 | } 16 | -------------------------------------------------------------------------------- /slides/3-bayes/lm.stan: -------------------------------------------------------------------------------- 1 | data { 2 | int n; 3 | int p; 4 | matrix[n, p] X; 5 | vector[n] y; 6 | } 7 | 8 | parameters { 9 | vector[p] beta; 10 | real<lower=0> sigma; // must be positive; the bound also makes the cauchy prior half-cauchy 11 | } 12 | 13 | model { 14 | beta ~ normal(0, 2); 15 | sigma ~ cauchy(0, 5); 16 | y ~ normal(X * beta, sigma); 17 | } 18 | -------------------------------------------------------------------------------- /example_solutions/5-binomial/bernoulli_glm.stan: -------------------------------------------------------------------------------- 1 | data { 2 | int n; // sample size 3 | int p; // number of coefficients 4 | matrix[n, p] X; 5 | int y[n]; 6 | } 7 | 8 | parameters { 9 | vector[p] beta; 10 | } 11 | 12 | model { 13 | beta ~ normal(0, 1); 14 | y ~ bernoulli_logit(X * beta); 15 | } 16 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # course 2 | 3 | This is the main repository for a graduate course in hierarchical Bayesian modeling taught in Spring 2016, designed to be used with the course [notes](https://github.com/hmods/notes) and Gelman and Hill's textbook on multilevel modeling. 4 | We will push slides, assignments, and solutions as the semester progresses. 
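An editorial note for readers browsing this snapshot: the Stan programs above are fit from R with the `rstan` package. Below is a minimal sketch of how `slides/3-bayes/lm.stan` might be run; it assumes `rstan` is installed and that `lm.stan` sits in the working directory, and the simulated data and "true" parameter values are illustrative only.

```
# Hedged sketch (not part of the original repo): fit lm.stan to simulated data.
library(rstan)

n <- 100
X <- cbind(1, rnorm(n))           # design matrix: intercept + one predictor
beta_true <- c(1, 2)              # illustrative true coefficients
y <- rnorm(n, X %*% beta_true, 1) # simulate responses around the linear predictor

stan_d <- list(n = n, p = ncol(X), X = X, y = y)
fit <- stan('lm.stan', data = stan_d, chains = 4, iter = 1000)
print(fit)  # check that Rhat is near 1 and that beta recovers c(1, 2)
```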
5 | -------------------------------------------------------------------------------- /example_solutions/5-binomial/bernoulli_glm_test.stan: -------------------------------------------------------------------------------- 1 | data { 2 | int n; // sample size 3 | int p; // number of coefficients 4 | matrix[n, p] X; 5 | int y[n]; 6 | int n_test; 7 | int y_test[n_test]; 8 | matrix[n_test, p] X_test; 9 | } 10 | 11 | parameters { 12 | vector[p] beta; 13 | } 14 | 15 | model { 16 | beta ~ normal(0, 1); 17 | y ~ bernoulli_logit(X * beta); 18 | } 19 | 20 | 21 | generated quantities { 22 | // I wrote this section for you as a hint 23 | real loglik_test; 24 | vector[n_test] logit_p_test; 25 | 26 | logit_p_test <- X_test * beta; 27 | loglik_test <- bernoulli_logit_log(y_test, logit_p_test); 28 | //returns the sum of the log likelihoods (the joint log-likelihood) 29 | } 30 | -------------------------------------------------------------------------------- /example_solutions/5-binomial/tumor_glm_will.stan: -------------------------------------------------------------------------------- 1 | data { 2 | // integer inputs 3 | int n; // the number of samples 4 | int n_pred; // the number of predictors 5 | int n_cohort; // the number of cohorts 6 | 7 | // integer vector inputs 8 | int y[n]; // observed malignancies 9 | 10 | // design matrix 11 | matrix[n,n_pred + n_cohort] X; 12 | } 13 | 14 | parameters { 15 | 16 | // vector intercept, betas for predictors and cohort means 17 | vector [n_pred + n_cohort] beta; // 18 | } 19 | 20 | model { 21 | 22 | // define priors for continuous predictors 23 | beta[1:n_pred] ~ cauchy(0, 3); 24 | 25 | // define priors for cohort effects 26 | beta[(n_pred + 1):11] ~ cauchy(0,5); 27 | beta[12] ~ normal(0,5); // shrinks the estimate for cohort 4 towards 50% 28 | beta[13:(n_pred + n_cohort)] ~ cauchy(0,5); 29 | 30 | // define the likelihood 31 | y ~ bernoulli_logit(X*beta); 32 | 33 | } 34 | -------------------------------------------------------------------------------- /slides/6-shrinkage/prior_predictive.R: -------------------------------------------------------------------------------- 1 | ## gall wasp intercept model prior predictive distribution 2 | # y ~ pois(lambda) 3 | # loglambda ~ n(1, 1) 4 | 5 | # start by drawing parameters 6 | ndraw <- 10000 7 | loglambda <- rnorm(ndraw, 1, 1) 8 | 9 | # then simulate data using the draws from the prior 10 | y_sim <- rpois(ndraw, lambda = exp(loglambda)) 11 | hist(y_sim, breaks = 50) 12 | summary(y_sim) 13 | 14 | ## Bayesian linear regression 15 | # y ~ n(mu, sigma) 16 | # mu = a + bx 17 | # a ~ n(0, 20) 18 | # b ~ n(0, 3) 19 | # sigma ~ half-n(0, 3) 20 | 21 | # start by drawing parameters 22 | a <- rnorm(ndraw, 0, 20) 23 | b <- rnorm(ndraw, 0, 3) 24 | sigma <- abs(rnorm(ndraw, 0, 3)) 25 | pairs(data.frame(a, b, sigma)) 26 | 27 | # simulate data using draws from the prior 28 | x <- runif(ndraw, 0, 5) 29 | y <- rnorm(ndraw, a + b * x, sigma) 30 | 31 | plot(x, y) 32 | for (i in 1:ndraw) abline(a[i], b[i], col = 2) 33 | points(x, y) 34 | -------------------------------------------------------------------------------- /example_solutions/5-binomial/tumor_glm_test_will.stan: -------------------------------------------------------------------------------- 1 | data { 2 | // integer inputs 3 | int n; // the number of samples 4 | int n_pred; // the number of predictors 5 | int n_cohort; // the number of cohorts 6 | int n_test; // number of individuals in the test data 7 | 8 | 9 | // integer vector inputs 10 | int y[n]; // observed malignancies 11 | int 
y_test[n_test]; // observed for test data 12 | 13 | // design matrix 14 | matrix[n,n_pred + n_cohort] X; 15 | matrix[n_test, n_pred + n_cohort] X_test; 16 | } 17 | 18 | parameters { 19 | 20 | // vector intercept, betas for predictors and cohort means 21 | vector [n_pred + n_cohort] beta; // 22 | } 23 | 24 | model { 25 | 26 | 27 | // define priors for continuous predictors 28 | beta[1:n_pred] ~ cauchy(0, 3); 29 | 30 | // define priors for cohort effects 31 | beta[(n_pred + 1):11] ~ cauchy(0,5); 32 | beta[12] ~ normal(0,5); // shrinks the estimate for cohort 4 towards 50% 33 | beta[13:(n_pred + n_cohort)] ~ cauchy(0,5); 34 | 35 | // define the likelihood 36 | y ~ bernoulli_logit(X*beta); 37 | 38 | } 39 | 40 | generated quantities { 41 | real loglik_test; 42 | vector[n_test] logit_p_test; 43 | 44 | logit_p_test <- X_test * beta; 45 | loglik_test <- bernoulli_logit_log(y_test, logit_p_test); 46 | //returns the sum of the log likelihoods (the joint log-likelihood) 47 | } 48 | -------------------------------------------------------------------------------- /slides/1-linear_models/wk1_slides.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Linear models" 3 | output: 4 | beamer_presentation: 5 | colortheme: "spruce" 6 | fonttheme: "structurebold" 7 | --- 8 | 9 | ## Introductions 10 | 11 | - Who you are 12 | - What you would like to learn about 13 | 14 | ## Course background 15 | 16 | * Motivation 17 | * Philosophy 18 | 19 | ## What might you learn? 20 | 21 | * R 22 | * stats 23 | * math 24 | 25 | ## What is a linear model? 26 | 27 | $$y \sim N(X \beta, \sigma)$$ 28 | 29 | or 30 | 31 | $$y = X \beta + \epsilon$$ 32 | $$\epsilon \sim N(0, \sigma)$$ 33 | 34 | ## Matrix multiplication review 35 | 36 | $X = \left[ \begin{array}{cc} 37 | 1 & 0 \\ 38 | 1 & 1.1 \\ 39 | 1 & 2.5 \end{array} \right]$, 40 | $\beta = \begin{bmatrix} 41 | -3 \\ 42 | 1 \end{bmatrix}$, what is $X \beta$? 43 | 44 | ## Linear models 45 | 46 | $y = X \beta + \epsilon$ 47 | 48 | $\begin{bmatrix} 49 | y_1 \\ 50 | y_2 \\ 51 | y_3 \end{bmatrix} = 52 | \begin{bmatrix} 53 | -3 \\ 54 | -1.9 \\ 55 | -0.5 \end{bmatrix} + 56 | \begin{bmatrix} 57 | \epsilon_1 \\ 58 | \epsilon_2 \\ 59 | \epsilon_3 \end{bmatrix}$ 60 | 61 | $X \beta$ is the **linear predictor**, $\epsilon$ is normal error 62 | 63 | ## Examples of linear models 64 | 65 | - model of the mean 66 | - linear regression 67 | - multiple regression 68 | - ANOVA 69 | - ANCOVA 70 | - factorial ANOVA 71 | - general linear models 72 | 73 | ## Case study: amniote life history trait prediction 74 | 75 | - R markdown overview 76 | - how to turn in assignments (vote) 77 | -------------------------------------------------------------------------------- /assignments/4-poisson/wk4_problems.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: 'Week 4 assignment: Poisson models' 3 | author: "Your name here" 4 | date: "February 5, 2016" 5 | output: pdf_document 6 | --- 7 | 8 | Wasps in the family Cynipidae lay their eggs on plants, which form galls around the developing larvae, providing nutrition until the larvae metamorphose and burrow out of the galls, emerging as adults. 9 | From any particular gall, there is variation in the number of host wasps that emerge. 10 | 11 | Here, you will construct a Bayesian model for the number of emerging cynipid wasps, using features of the galls as explanatory variables. 12 | The data are available in the `cleaned_galls.csv` file. 
13 | Your task is to estimate the parameters of your model, and then to do a posterior predictive check to evaluate overdispersion. 14 | 15 | # Problem 1: looking at the data 16 | 17 | Load the data and explore how the features relate to the response variable. 18 | 19 | ```{r} 20 | # your code here 21 | ``` 22 | 23 | # Problem 2: model specification 24 | 25 | What is your model? Write it in \LaTeX. 26 | 27 | *Your equation here* 28 | 29 | What is your Stan model statement? 30 | 31 | ``` 32 | your Stan code here 33 | ``` 34 | 35 | # Problem 3: parameter estimation 36 | 37 | Use the `rstan` package to estimate your model parameters. 38 | 39 | ```{r} 40 | # your code here 41 | ``` 42 | 43 | Verify convergence using traceplots and the Rhat statistic: 44 | 45 | ```{r} 46 | # your code here 47 | ``` 48 | 49 | # Problem 4: posterior predictive check 50 | 51 | Does your model adequately capture the variance in the emergence data, or is there overdispersion? 52 | 53 | ```{r} 54 | # your code here 55 | ``` 56 | 57 | *Your text answer here* 58 | -------------------------------------------------------------------------------- /schedule.md: -------------------------------------------------------------------------------- 1 | # Course schedule 2 | 3 | Here is a weekly breakdown for the course in terms of readings and topics. 4 | 5 | | Week | Date | Topic | Course notes chapter | Gelman and Hill chapter(s) | 6 | |------|------|----------------------------------|----------------------|----------------------------| 7 | | 1 | 1/15 | Linear models | 1 | 3, 4 | 8 | | 2 | 1/22 | Likelihood | 2 | 18 | 9 | | 3 | 1/29 | Bayesian inference | 3 | 18 | 10 | | 4 | 2/5 | Poisson models | 4 | 6 | 11 | | 5 | 2/12 | Binomial models | 5 | 5, 6 | 12 | | 6 | 2/19 | Partial pooling | 6 | 11, 12 | 13 | | 7 | 2/26 | Bayesian hierarchical models | 7 | 13 | 14 | | 8 | 3/4 | Project proposal presentations | | | 15 | | 9 | 3/11 | Project proposal presentations | | | 16 | | 10 | 3/18 | Constructing hierarchical models | 8 | 14, 15 | 17 | | 11 | 3/25 | Spring break: no class | | | 18 | | 12 | 4/1 | Independent projects | | | 19 | | 13 | 4/8 | Independent projects | | | 20 | | 14 | 4/15 | Independent projects | | | 21 | | 15 | 4/22 | Final student presentations | | | 22 | | 16 | 4/29 | Final student presentations | | | 23 | | 17 | 5/6 | Final student presentations | | | 24 | -------------------------------------------------------------------------------- /slides/2-maximum_likelihood/wk2_slides.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Week 2: likelihood, earthquakes, and more" 3 | output: 4 | beamer_presentation: 5 | colortheme: "spruce" 6 | fonttheme: "structurebold" 7 | highlight: kate 8 | fontsize: 10pt 9 | --- 10 | 11 | ```{r, echo=FALSE, message=FALSE} 12 | library(ggplot2) 13 | ``` 14 | 15 | ## Quick poll 16 | 17 | Help outside of class? 
18 | 19 | - open lab hours 20 | - message board 21 | 22 | ## Access to the course notes last week 23 | 24 | ![](visitors.png) 25 | 26 | ## Solutions to last week's problems 27 | 28 | Posted on GitHub course site at end of class 29 | 30 | ## Likelihood 31 | 32 | The joint conditional probability of observations $y_1, y_2, ..., y_n$: 33 | 34 | $$\mathcal{L}(\theta | y) = \prod_{i=1}^{n} p(y_i | \theta)$$ 35 | 36 | ## Maximum likelihood estimation 37 | 38 | normal model, known standard deviation 39 | 40 | $y \sim N(\mu, 1)$ 41 | 42 | ```{r, echo=FALSE} 43 | y <- rnorm(n = 100, mean = 5, sd = 1) 44 | ``` 45 | 46 | ## Normal probability density 47 | 48 | ```{r, fig.width=9, fig.height=5} 49 | x <- seq(-3, 3, .01) 50 | fx <- dnorm(x, 0, 1) 51 | plot(x, fx, ylab = 'Probability density of x') 52 | ``` 53 | 54 | 55 | ## Maximum likelihood estimation 56 | 57 | $$\mathcal{L}(\theta | y) = \prod_{i=1}^{n} p(y_i | \theta)$$ 58 | 59 | What values of $\theta$ maximize $\mathcal{L}(\theta | y)$? 60 | 61 | 62 | ## Maximum likelihood estimation 63 | 64 | $$\mathcal{L}(\theta | y) = \prod_{i=1}^{n} p(y_i | \theta)$$ 65 | 66 | ```{r} 67 | calc_lik <- function(y, mu) { 68 | likelihood <- prod(dnorm(y, mu, sd = 1)) 69 | return(likelihood) 70 | } 71 | ``` 72 | 73 | 74 | ## Making a likelihood profile 75 | 76 | ```{r} 77 | l <- 1000 78 | mu <- seq(4, 6, length.out = l) 79 | lik <- rep(NA, l) 80 | for (i in 1:l){ 81 | lik[i] <- calc_lik(y, mu[i]) 82 | } 83 | ``` 84 | 85 | 86 | ## Making a likelihood profile 87 | 88 | ```{r, fig.width=7, fig.height=5} 89 | plot(mu, lik) 90 | ``` 91 | 92 | 93 | ## Log-likelihood profile 94 | 95 | ```{r, fig.width=7, fig.height=5} 96 | plot(mu, log(lik)) 97 | ``` 98 | 99 | 100 | ## Today's class: earthquakes & optimization 101 | 102 | ![](earthquake.jpg) 103 | 104 | ## The data 105 | 106 | ```{r, fig.height=2.5, fig.width=4} 107 | ggplot(attenu, aes(x = dist, y = accel, col = mag)) + 108 | geom_point() 109 | ``` 110 | 111 | 112 | ## Your task 113 | 114 | 1. Predict peak ground acceleration at epicenter 115 | 2. Predict attenuation curve 116 | 117 | ```{r, fig.height=2.5, fig.width=4, echo=FALSE} 118 | ggplot(attenu, aes(x = dist, y = accel, col = mag)) + 119 | geom_point() 120 | ``` 121 | 122 | 123 | ## A quick primer on \LaTeX$~$math equations 124 | 125 | Demo in Atom 126 | 127 | -------------------------------------------------------------------------------- /assignments/2-maximum_likelihood/wk2_problems.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: 'Week 2 assignment: likelihood' 3 | author: "Your name here" 4 | date: "`r format(Sys.time(), '%d %B, %Y')`" 5 | output: pdf_document 6 | --- 7 | 8 | # Problem 1 9 | 10 | Earthquakes are most forceful at their epicenters, and this force attenuates with distance. 11 | R has an earthquake dataset `attenu` with measurements of peak horizontal ground acceleration by distance measured at multiple recording stations, with each earthquake coded in terms of magnitude. 12 | See the help file for more information on this dataset by typing `?attenu`. 13 | 14 | Your main task this week is to build one model that predicts both ground acceleration at the epicenter (distance = 0), and the acceleration by distance curve as a function of magnitude and distance from the epicenter. 15 | You will obtain maximum likelihood estimates for the parameters of the model using the `optim` function. 16 | The structure of your model is up to you. 
17 | You can use a combination of intuition, imagination, first principles, research, and collaboration to construct your model. 18 | (Note: there are many possible models that one could construct!) 19 | 20 | You will benefit from visualizing the data, specifically the relationships between the quantities of interest, i.e., magnitude (`mag`), distance from epicenter (`dist`), and peak acceleration (`accel`). 21 | Include your visualization code and plots below. 22 | 23 | ```{r} 24 | # your code here 25 | ``` 26 | 27 | What is the equation for your model? 28 | Write it in \LaTeX, bounded between dollar signs (e.g., $e = mc^2$), not in normal text. 29 | 30 | $Your equation here$ 31 | 32 | Define all of the parameters, and explain why you formulated it in this way. 33 | What assumptions went into your model's construction? 34 | 35 | *Your text here* 36 | 37 | Write a function called `nll` that returns the negative log likelihood for your model. 38 | The arguments to this function should be `theta` (the parameters), and `data` (the data). 39 | 40 | ```{r} 41 | # Your code here 42 | ``` 43 | 44 | Use `optim` to obtain maximum likelihood estimates for your model parameters. 45 | 46 | ```{r} 47 | # Your code here 48 | ``` 49 | 50 | Did `optim()` converge to a minimum? How do you know? 51 | 52 | *Your text here.* 53 | 54 | Create a scatterplot with fitted values vs. residuals. 55 | 56 | ```{r} 57 | # Your code here 58 | ``` 59 | 60 | What stands out in the plot of fitted values vs. residuals? 61 | Are you worried about any violations of assumptions? 62 | Why or why not? 63 | 64 | *Your text here.* 65 | 66 | Plot the distance by acceleration data along with your predicted curves (starting at a distance of 0) for earthquakes of magnitude 5 and 8. 67 | 68 | ```{r} 69 | # Your code here 70 | ``` 71 | 72 | How do your predictions compare to the data? 73 | Which characteristics of the data are captured well, and which are captured poorly by your model? 74 | 75 | *Your text here.* 76 | 77 | # Problem 2 78 | 79 | Pat loves to play basketball. 80 | You observe Pat practicing free throws at the recreation center one day. 81 | Pat misses 3 shots in a row. 82 | Generate a likelihood profile for $p$, the probability that Pat makes a free throw. 83 | 84 | ```{r} 85 | # Your code here 86 | ``` 87 | 88 | What is your MLE for $p$, and does it make sense? Why or why not? 89 | 90 | *Your text here* 91 | -------------------------------------------------------------------------------- /example_solutions/4-poisson/wk4_solutions.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: 'Week 4 assignment: Poisson models' 3 | author: "Example solutions" 4 | date: "`r format(Sys.time(), '%d %B, %Y')`" 5 | output: pdf_document 6 | --- 7 | 8 | Wasps in the family Cynipidae lay their eggs on plants, which form galls around the developing larvae, providing nutrition until the larvae metamorphose and burrow out of the galls, emerging as adults. 9 | From any particular gall, there is variation in the number of host wasps that emerge. 10 | 11 | Here, you will construct a Bayesian model for the number of emerging cynipid wasps, using features of the galls as explanatory variables. 12 | The data are available in the `cleaned_galls.csv` file. 13 | Your task is to estimate the parameters of your model, and then to do a posterior predictive check to evaluate overdispersion. 
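A brief editorial aside before the worked answers, since the task hinges on the term: for a Poisson model the variance equals the mean, so "overdispersion" means the observed variance is noticeably larger than the mean. The illustrative R sketch below (not part of the original solution) contrasts Poisson draws with negative binomial draws sharing the same mean.

```
# Illustration only: Poisson draws satisfy Var(y) ~ E(y); negative binomial
# draws with the same mean are overdispersed (Var = mu + mu^2 / size).
set.seed(1)
y_pois <- rpois(1e4, lambda = 5)
y_nb <- rnbinom(1e4, mu = 5, size = 1)
c(mean(y_pois), var(y_pois))  # both approximately 5
c(mean(y_nb), var(y_nb))      # mean ~ 5, variance ~ 30
```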
14 | 15 | # Problem 1: looking at the data 16 | 17 | Load the data and explore how the features relate to the response variable. 18 | 19 | ```{r, fig.width = 6, fig.height=4} 20 | library(ggplot2) 21 | d <- read.csv('cleaned_galls.csv') 22 | ggplot(d, aes(x = gall_size, y = n_cynip)) + 23 | geom_count() 24 | ggplot(d, aes(x = gall_size, y = n_cynip)) + 25 | geom_jitter(shape = 1, position = position_jitter(width=2.5, height=.4), 26 | alpha=.5) 27 | ggplot(d, aes(y = gall_locality, x = n_cynip)) + geom_count() 28 | ggplot(d, aes(x = gall_size, y = n_cynip)) + 29 | geom_jitter(shape = 1, position = position_jitter(width=2.5, height=.4), 30 | alpha=.5) + 31 | facet_wrap(~ gall_locality) 32 | ``` 33 | 34 | # Problem 2: model specification 35 | 36 | What is your model? Write it in \LaTeX. 37 | 38 | $y_i \sim Poisson(\lambda_i)$ 39 | 40 | $\lambda_i = e^{\beta^T X_i}$ 41 | 42 | What is your Stan model statement? 43 | 44 | ``` 45 | data { 46 | int n; // sample size 47 | int p; // number of coefficients 48 | matrix[n, p] X; 49 | int y[n]; 50 | } 51 | 52 | parameters { 53 | vector[p] beta; 54 | } 55 | 56 | model { 57 | beta ~ normal(0, 5); 58 | y ~ poisson_log(X * beta); 59 | } 60 | ``` 61 | 62 | # Problem 3: parameter estimation 63 | 64 | Use the `rstan` package to estimate your model parameters. 65 | 66 | ```{r, results = 'hide', message = FALSE} 67 | # center size 68 | mean_size <- mean(d$gall_size) 69 | sd_size <- sd(d$gall_size) 70 | d$size <- (d$gall_size - mean(d$gall_size)) / sd_size 71 | 72 | # make glm() construct a design matrix for me with a 2nd degree polynomial 73 | m <- glm(n_cynip ~ size + I(size^2) + gall_locality, data=d, family=poisson) 74 | 75 | library(rstan) 76 | rstan_options(auto_write = TRUE) 77 | options(mc.cores = parallel::detectCores()) 78 | 79 | X <- model.matrix(m) 80 | stan_d <- list(n = nrow(X), p = ncol(X), X = X, y = d$n_cynip) 81 | 82 | iter <- 1000 83 | chains <- 4 84 | m_fit <- stan('poisson_glm.stan', data = stan_d, 85 | chains = chains, iter = iter, cores = 2) 86 | ``` 87 | 88 | Verify convergence using traceplots and the Rhat statistic: 89 | 90 | ```{r} 91 | m_fit 92 | traceplot(m_fit, inc_warmup=TRUE) 93 | ``` 94 | 95 | # Problem 4: posterior predictive check 96 | 97 | Does your model adequately capture the variance in the emergence data, or is there overdispersion? 98 | 99 | ```{r} 100 | # need to simulate datasets for each posterior draw 101 | # the test statistic is var(y) 102 | post <- rstan::extract(m_fit) 103 | n_draws <- length(post$lp__) 104 | 105 | sim_y <- function(X, beta) { 106 | # little function to simulate response data 107 | n <- nrow(X) 108 | lambda <- c(exp(X %*% beta)) 109 | y <- rpois(n, lambda) 110 | return(y) 111 | } 112 | 113 | # create object to store variances of y 114 | var_y_new <- rep(NA, n_draws) 115 | 116 | # iterate through posterior draws and store variances of new data 117 | for (i in 1:n_draws) { 118 | y_new <- sim_y(X, beta = post$beta[i, ]) 119 | var_y_new[i] <- var(y_new) 120 | } 121 | 122 | # plot histogram of simulated variances 123 | hist(var_y_new, breaks = 50) 124 | # add line to indicate observed value 125 | abline(v = var(d$n_cynip), col = 'red', lty = 2, lwd = 3) 126 | ``` 127 | 128 | *The model seems to have adequately captured the variance in y*. 
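A possible numeric companion to the histogram above (an editorial sketch that reuses the `var_y_new` vector and the data frame `d` created in the solution code): a posterior predictive p-value for the variance test statistic. Values near 0 or 1 would flag misfit; intermediate values are consistent with the model.

```
# Proportion of simulated datasets whose variance is at least as large as the
# observed variance of the counts (a posterior predictive p-value).
mean(var_y_new >= var(d$n_cynip))
```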
129 | -------------------------------------------------------------------------------- /slides/5-binomial/wk5_slides.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Week 5: Binomial models" 3 | fontsize: 7pt 4 | output: 5 | beamer_presentation: 6 | colortheme: "spruce" 7 | fonttheme: "structurebold" 8 | latex_engine: xelatex 9 | header-includes: 10 | - \usepackage{listings} 11 | - \lstset{basicstyle=\small} 12 | - \setmonofont[Scale=MatchLowercase]{Courier} 13 | - \setmonofont[Scale=0.8]{Courier} 14 | --- 15 | 16 | ## Announcements 17 | 18 | 1. Sign up for proposal talks 19 | 2. For-loops vs. copypasta 20 | 3. Parameter recovery & model verification 21 | 4. Plotting results 22 | 5. Not prior priors 23 | 24 | ```{r, echo=FALSE, message=FALSE} 25 | library(ggplot2) 26 | 27 | ``` 28 | 29 | ## Design matrix activity 30 | 31 | Five ways to think about model structure 32 | 33 | 1. Design matrix 34 | 2. R formula syntax 35 | 3. Long-form equations 36 | 4. Graphical representation 37 | 5. Verbal representation 38 | 39 | ## Binomial glm 40 | 41 | $$y_i \sim Binom(k_i, p_i)$$ 42 | 43 | $$\log(\frac{p}{1 - p}) = X \beta$$ 44 | 45 | Why not $p = X \beta$? 46 | 47 | ## Bernoulli glm 48 | 49 | Equivalent to binomial with $k = 1$ 50 | 51 | $$y_i \sim Bernoulli(p_i)$$ 52 | 53 | $$\log(\frac{p}{1 - p}) = X \beta$$ 54 | 55 | ## Pro tip: 56 | 57 | Logit function: `qlogis()` 58 | 59 | Inverse logit function: `plogis()` 60 | 61 | ## Binomial distribution: properties 62 | 63 | $y \sim Binom(k, p)$ 64 | 65 | $$E(y) = kp$$ 66 | 67 | $$Var(y) = kp(1 - p)$$ 68 | 69 | ## Binomial overdispersion 70 | 71 | Test with posterior predictive check 72 | 73 | ## 2 solutions to overdispersion 74 | 75 | 1. Binomial-normal model 76 | 77 | $$y \sim Binom(k, p)$$ 78 | 79 | $$\text{ln} \Big( \frac{p}{1 - p} \Big) = X \beta + \epsilon$$ 80 | 81 | $$\epsilon \sim N(0, \sigma)$$ 82 | 83 | 2. Beta-binomial model 84 | 85 | $$y_i \sim Binom(k_i, p_i)$$ 86 | 87 | $$p_i \sim Beta(\alpha, \beta)$$ 88 | 89 | 90 | ## Recommendation 91 | 92 | 1. **Binomial-normal model** 93 | 94 | $$y \sim Binom(k, p)$$ 95 | 96 | $$\text{ln} \Big( \frac{p}{1 - p} \Big) = X \beta + \epsilon$$ 97 | 98 | $$\epsilon \sim N(0, \sigma)$$ 99 | 100 | 2. Beta-binomial model 101 | 102 | $$y_i \sim Binom(k_i, p_i)$$ 103 | 104 | $$p_i \sim Beta(\alpha, \beta)$$ 105 | 106 | ## Caution 107 | 108 | **Overdispersion is not possible with binary data** 109 | 110 | Don't try to implement an overdispersed Bernoulli model! 111 | 112 | 113 | ## Predictive accuracy 114 | 115 | 1. Estimate parameters w/ training data: 116 | 117 | $\rightarrow [\theta \mid y_{train}]$ 118 | 119 | 2. Make predictions for new observations 120 | 121 | 3. 
Compare model predictions to validation data: 122 | 123 | - classification error (ROC curves, AUC) 124 | - good for binary data, but very specific 125 | 126 | - validation log likelihood $[y_{test} \mid \theta]$ 127 | - more general 128 | - easy to compute 129 | 130 | ## Validation log likelihood example 131 | 132 | ```{r, echo = FALSE, fig.width = 5, fig.height = 4} 133 | par(bty = 'n') 134 | n <- 8 135 | x <- sort(rnorm(n)) 136 | y <- rnorm(n, 4 * x + .6 * x^2, 1.5) 137 | group <- sample(c('test', 'train'), n, replace = TRUE) 138 | plot(x, y, pch = 19, col = as.numeric(factor(group))) 139 | legend('topleft', 140 | col = 1:2, 141 | legend = c('test data', 'training data'), 142 | pch = 19, 143 | bty='n') 144 | ``` 145 | 146 | 147 | ## Obtaining estimates with training data 148 | 149 | ```{r, echo = FALSE, fig.width = 5, fig.height = 4} 150 | par(bty = 'n') 151 | library(scales) 152 | alph <- .2 153 | alphas <- ifelse(group == 'train', 1, alph) 154 | plot(x, y, pch = 19, col = alpha(as.numeric(factor(group)), alphas)) 155 | legend('topleft', 156 | col = c(alpha(1, alph), 2), 157 | legend = c('test data', 'training data'), 158 | pch = 19, 159 | bty='n') 160 | ``` 161 | 162 | 163 | ## Obtaining estimates with training data 164 | 165 | ```{r, echo = FALSE, fig.width = 5, fig.height = 4} 166 | par(bty = 'n') 167 | plot(x, y, pch = 19, col = alpha(as.numeric(factor(group)), alphas)) 168 | legend('topleft', 169 | col = c(alpha(1, alph), 2), 170 | legend = c('test data', 'training data'), 171 | pch = 19, 172 | bty='n') 173 | y_tr <- y[group == 'train'] 174 | x_tr <- x[group == 'train'] 175 | y_test <- y[group == 'test'] 176 | x_test <- x[group == 'test'] 177 | X_test <- model.matrix(lm(y_test ~ x_test)) 178 | m_train <- lm(y_tr ~ x_tr) 179 | abline(m_train, col = 2) 180 | ``` 181 | 182 | 183 | ## Validation log likelihood 184 | 185 | Joint validation log likelihood: 186 | 187 | $$\sum_{i=1}^{n_{test}} \log([y_{test_i} \mid \theta])$$ 188 | 189 | ## Today's class 190 | 191 | Mini-Kaggle competition 192 | 193 | 1. Build a model to classify tumors as malignant or not 194 | 2. Evaluate out of sample predictive power 195 | 3. Earn prizes 196 | -------------------------------------------------------------------------------- /assignments/3-bayes/wk3_problems.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: 'Week 3 assignment: Bayesian inference' 3 | author: "Your name here" 4 | date: "`r format(Sys.time(), '%d %B, %Y')`" 5 | output: pdf_document 6 | --- 7 | 8 | # Problem 1 9 | 10 | Maure et al. (2015) conducted experiments to see how ladybird beetle diet affected interactions with parasitoid wasps. 11 | Individual beetles were randomly assigned to one of five diet treatments: 12 | 13 | 1. 20 aphids per day 14 | 2. 2 aphids per day 15 | 3. 20 aphids per day + pollen 16 | 4. 2 aphids per day + pollen 17 | 5. pollen only 18 | 19 | Each beetle was exposed to a female parasitoid wasp for 3 hours, which deposited eggs in the beetle, and the beetles were then fed each day and the outcome of the interaction monitored. 20 | The authors wanted to know whether diet affected the ability to recover (i.e. not die) after the parasitoid attack. 21 | Your task is to build a Bayesian model with Stan for beetle survival as a function of experimental treatment, estimating the probability of survival for each treatment. 22 | To keep things simple, consider the treatments to be categorical and unordered. 
23 | The data are archived in Dryad [here](http://www.datadryad.org/resource/doi:10.5061/dryad.7fq4j). 24 | 25 | **Important**: do not manipulate the original raw data file! 26 | This is error-prone, leaves no written record, encourages spreadsheet farming, and is not reproducible. 27 | And, in this case, the data are already well organized. 28 | Read in the .xls file using the `readxl` package, then use R to massage the data as necessary. 29 | 30 | ```{r} 31 | ## Loading the raw data 32 | library(readxl) # use the read_excel() function to load the data 33 | # your code here 34 | 35 | ## Cleaning the data 36 | # your code here 37 | 38 | ## Visualizing the data 39 | # your code here 40 | ``` 41 | 42 | Write out your model using \LaTeX: 43 | 44 | *Your \LaTeX${}$ here* 45 | 46 | Paste your Stan model statement in the code block below, and ensure that your written model matches the notation in your Stan file: 47 | 48 | ``` 49 | Your Stan file here 50 | ``` 51 | 52 | Now, use `rstan` to fit your model. 53 | Evaluate convergence by inspecting the $\hat{R}$ statistic and the traceplots. 54 | 55 | ```{r} 56 | # your code here 57 | ``` 58 | 59 | Calculate posterior credible intervals, medians, means, and modes for the survival probabilities for each treatment. 60 | Hint: posterior draws can be extracted with the `rstan::extract` function, which returns a list of arrays. 61 | 62 | ```{r} 63 | # your code here 64 | ``` 65 | 66 | Generate a plot that shows all of the raw data along with the posterior probability distributions of recovery for each treatment: 67 | 68 | ```{r} 69 | # your code here 70 | ``` 71 | 72 | The authors reported statistically significant differences in ladybird beetle recovery between the diet treatments. 73 | What is your conclusion for the effect of diet on ladybird beetle recovery? 74 | 75 | *Your text here* 76 | 77 | # Problem 2 78 | 79 | One of the biggest advantages of Bayesian approaches is the ease with which you can make inference on **derived parameters**. 80 | For example, we might want to know which diet treatment gives the highest survival probability. 81 | In one draw from the posterior distribution, we should have five estimated probabilities. 82 | The treatment with the highest probability can be recorded and stored in an object (say `best`). 83 | We can do this for each posterior draw to produce a vector of the "best" treatments (from the beetle's perspective). 84 | To find the posterior probability that each particular treatment is best, count the frequency of each treatment in the `best` vector, and divide by the total number of posterior draws. 85 | Do this below using the results from problem 1, and report the posterior probabilities. 86 | 87 | ```{r} 88 | # your code here 89 | ``` 90 | 91 | Which treatment was best? What is the probability of that treatment being the best, conditional on the data? 92 | 93 | *Your text here* 94 | 95 | # Problem 3 96 | 97 | Simulate data from a normal distribution for three groups, each with a different mean. 98 | You can assume that the standard deviations for each group are equal. 99 | In generating data, use a design matrix to acquire the expected value for your data (somewhere in your code there should be `X %*% beta`). 100 | 101 | ```{r} 102 | # your code here 103 | ``` 104 | 105 | Write a Stan model statement for a linear model that you can use to estimate the parameters. 106 | 107 | ``` 108 | Your Stan model here 109 | ``` 110 | 111 | Use Stan to estimate the parameters of your model. 
112 | 113 | ```{r} 114 | # your code here 115 | ``` 116 | 117 | Assess convergence of the MCMC algorithm graphically and with the Rhat statistic. 118 | 119 | ```{r} 120 | # your code here 121 | ``` 122 | 123 | Plot the marginal posterior distributions for each parameter with a vertical line at the true value. 124 | 125 | ```{r} 126 | # your code here 127 | ``` 128 | -------------------------------------------------------------------------------- /assignments/7-bhm/wk7_problems.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: 'Week 7 assignment: Bayesian hierarchical models' 3 | author: "Your name here" 4 | date: "`r format(Sys.time(), '%d %B, %Y')`" 5 | output: 6 | pdf_document 7 | --- 8 | 9 | The goal of today's assignment is for you to execute a Bayesian analysis of a hierarchical model from start to finish. 10 | We will work with Gelman's radon data, which should be familiar to you if you have been keeping up with Gelman and Hill. 11 | If not, see chapter 13 for some useful content. 12 | Radon is a radioactive gas that causes lung cancer. 13 | 14 | The data are in the file `radon.csv`, which contains the log radon measurements in basements and first floors of various houses in different counties across the U.S., along with a measurement of the average log uranium level in each county. 15 | Take a moment to explore the data with the following question in mind: how do radon levels compare among floors and basements across counties in the U.S.? 16 | 17 | ```{r} 18 | # your code here 19 | ``` 20 | 21 | Your job is to model the log radon levels as a function of floor and county using a hierarchical model. 22 | This is a "choose your own adventure" exercise, as you may choose among four possible models that vary in complexity. 23 | All models should include a basement vs. first floor effect, as this is known to affect radon levels, and beyond that the options from easiest to hardest are: 24 | 25 | 1. varying intercepts at the county level (a "random intercept" model) 26 | 2. varying intercepts along with effects of county-level mean uranium 27 | 3. varying intercepts and basement effects at the county level (a "random intercept & slope" model) 28 | 4. varying intercepts and slopes with county-level mean uranium effects 29 | 30 | Detailed model descriptions can be found for all of these models. 31 | Models 1 and 2 are described in Chapter 12, and models 3 and 4 are in Chapter 13 of Gelman and Hill. 32 | 33 | Which model do you want to implement? 34 | 35 | *Your text here (1, 2, 3, or 4)* 36 | 37 | # Model specification 38 | 39 | Specify your model in mathematical notation below. 40 | Be sure to specify priors for all parameters (it's a proper Bayesian model after all, none of that `lme4` nonsense). 41 | 42 | *Your* \LaTeX 43 | 44 | # Prior predictive simulations 45 | 46 | To evaluate the implications of your priors, simulate from the prior predictive distribution below. 47 | 48 | ```{r} 49 | # your code here 50 | ``` 51 | 52 | Visualize the data in a way that makes sense for the question, and evaluate based on your knowledge (or lack thereof) of radon measurements in houses whether the priors make sense. 53 | It's perfectly acceptable to tune your priors until they make sense (but don't use the `log_radon` data to tune the priors!). 
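To make the preceding advice concrete, here is one generic shape such a simulation can take for model 1 (varying county intercepts plus a floor effect). This is an editorial sketch, not the official solution, and every prior below is an illustrative assumption that you should tune.

```
# Hedged sketch of a prior predictive simulation for model 1; all prior
# choices here are placeholders, to be tuned against what you learn about radon.
n_sim <- 10000
mu_a <- rnorm(n_sim, 1, 1)          # mean of the county intercepts
sigma_a <- abs(rnorm(n_sim, 0, 1))  # between-county sd (half-normal)
b_floor <- rnorm(n_sim, 0, 1)       # first-floor vs. basement effect
sigma_y <- abs(rnorm(n_sim, 0, 1))  # residual sd (half-normal)

alpha <- rnorm(n_sim, mu_a, sigma_a)  # one county intercept per draw
first_floor <- rbinom(n_sim, 1, 0.5)  # 0 = basement, 1 = first floor
y_sim <- rnorm(n_sim, alpha + b_floor * first_floor, sigma_y)
hist(y_sim, breaks = 50, main = 'Prior predictive log radon')
```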
54 | Unless you already know a lot about radon, do a quick internet search to get a prior sense of the mean and range of log radon levels that tend to get recorded in houses, e.g., [here](http://www.who.int/mediacentre/factsheets/fs291/en/). 55 | The units in the original data were becquerels per cubic meter $\frac{Bq}{m^3}$ (you can't make this stuff up) and these values were log transformed to get the `log_radon` column. 56 | 57 | ```{r} 58 | # your code here 59 | ``` 60 | 61 | 62 | # Parameter interpretation 63 | 64 | Write an English-language description of your model parameters (at least all of the hyperparameters): 65 | 66 | - $\sigma_y$: the residual standard deviation - accounts for unexplained variance in the log radon measurements at the observation level 67 | 68 | - *Your other parameter descriptions here and below* 69 | 70 | 71 | # Parameter estimation 72 | 73 | Write a Stan file that expresses the model and priors outlined above: 74 | 75 | ``` 76 | Your Stan code 77 | ``` 78 | 79 | Estimate the parameters, evaluate convergence, and print a summary of the posterior distribution for each parameter. 80 | 81 | ```{r, message = FALSE} 82 | # your code here 83 | ``` 84 | 85 | 86 | # Precision and sample size 87 | 88 | Plot the width of the credible intervals for the county level parameters vs. sample size for each county to evaluate how precision of the estimate relates to sample size. 89 | 90 | ```{r} 91 | # your code here 92 | ``` 93 | 94 | 95 | # Making predictions 96 | 97 | Unlike non-hierarchical approaches, hierarchical modeling allows you to make predictions for observed and unobserved groups. 98 | 99 | ### Observed groups 100 | 101 | Suppose that we are going to sample the basement of a new house in county 70. 102 | Simulate from the posterior predictive distribution for the radon measurements. 103 | Your answer should be in the form of a histogram of simulated observations. 104 | 105 | ```{r} 106 | # your code here 107 | ``` 108 | 109 | ### New groups 110 | 111 | Suppose that we are going to sample the basement of a new house in a new county with mean log uranium level `log_u` = 0.2. 112 | This county is new, meaning that it is not represented in the data. 113 | What is the posterior predictive distribution for the radon measurements? 114 | Your answer should be in the form of a histogram of simulated observations. 115 | 116 | ```{r} 117 | # your code here 118 | ``` 119 | 120 | Which predictions were more precise: the predictions for the observed county or the unobserved county, and why? 121 | 122 | ```{r} 123 | # your code here 124 | ``` 125 | 126 | *Your text here* 127 | -------------------------------------------------------------------------------- /assignments/1-linear_models/wk1_problems.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: 'Week 1 assignment: linear models' 3 | author: "Your name here" 4 | date: "January 6, 2016" 5 | output: pdf_document 6 | --- 7 | 8 | # Loading the relevant R packages 9 | 10 | To complete these problems, you'll need some R packages loaded and ready to go. 11 | We suggest starting with a package for plotting (ggplot2), and potentially some packages for manipulating data frames (dplyr and tidyr), depending on how you prefer to use R. 12 | 13 | ```{r} 14 | # your code here 15 | ``` 16 | 17 | # Problem 1 18 | 19 | You've discovered a single adult of a new species of amniote. 
20 | Though you don't know the sex, you're worried that it might lay eggs soon and you want to build something to protect them. 21 | To do so, you need to predict the mass of this new creature's eggs. 22 | You first weigh your amniote and find that it weighs 3000 grams. 23 | Luckily, some researchers have recently published a full database of amniote life history traits, which can give you some information about how amniote adult mass relates to amniote egg mass. ([Myhrvold et al. (2015)](http://www.esajournals.org/doi/abs/10.1890/15-0846R.1)) 24 | 25 | ### Loading the data 26 | 27 | Your first task will be to acquire the published dataset, which is available [here](http://www.esapubs.org/archive/ecol/E096/269/Data_Files/Amniote_Database_Aug_2015.csv). 28 | Download the data file `Amniote_Database_Aug_2015.csv` and save it in a location that makes sense (e.g., `~/hmods/class1/data/`). 29 | Then, you'll need to load the data with the `read.csv()` function. 30 | Do not use `file.choose()`. 31 | 32 | ```{r} 33 | # your code here 34 | ``` 35 | 36 | ### Preparing the data 37 | 38 | In this dataset, missing values are coded as `-999`. 39 | We want to replace these values with `NA`, which indicates missing values to R. 40 | 41 | ```{r} 42 | # your code here 43 | ``` 44 | 45 | ### Visualizing the data 46 | 47 | We want to predict egg mass from adult body mass. 48 | Visualize the relationship between these two variables below. 49 | Transformation will be helpful for both variables. 50 | 51 | ```{r} 52 | # your code here 53 | 54 | ``` 55 | 56 | ### Modeling the data 57 | 58 | Use the `lm()` function to construct a linear model that could be used to predict egg mass based on adult body mass. 59 | (Hint: what other kinds of data transformations might be helpful prior to fitting the model?) 60 | 61 | ```{r} 62 | # your code here 63 | 64 | ``` 65 | 66 | Evaluate the homoscedasticity and normality assumptions graphically (e.g., `plot(mymodel)`). 67 | 68 | ```{r} 69 | # your code here 70 | 71 | ``` 72 | 73 | Are the assumptions met? 74 | Why or why not? 75 | 76 | *Your text answer here* 77 | 78 | ### Understanding the model 79 | 80 | Produce a scatterplot as before, but this time add a trendline that represents the expected value of the response as a function of the predictor. 81 | 82 | ```{r} 83 | # your code here 84 | ``` 85 | 86 | Make an image plot of the design matrix for your model (e.g., `image(t(model.matrix(m)))`): 87 | 88 | ```{r} 89 | # your code here 90 | ``` 91 | 92 | Why does this image plot look the way it does, and what is the result of multiplying the design matrix by the vector of estimated coefficients (e.g., `model.matrix(m) %*% coef(m)`)? 93 | 94 | *Your text answer here* 95 | 96 | ### Predicting egg mass for the new critter 97 | 98 | Predict the egg mass for the new species in units of grams. 99 | 100 | ```{r} 101 | # your code here 102 | ``` 103 | Besides reporting your best estimate, provide upper and lower bounds on this estimate, in units of grams. 104 | Remember that the interval should incorporate both *predictive* uncertainty (the error term in the model) and inferential uncertainty (uncertainty about the coefficients and the amount of residual error). 105 | (Hint: there's a built-in R function that should help generate prediction intervals) 106 | 107 | ```{r} 108 | # your code here 109 | ``` 110 | 111 | # Problem 2 112 | 113 | A week later, you are told that the critter has been identified to be in the class Reptilia. 
114 | Use this new information to update your prediction in the code block below, commenting your code to document your thought process. 115 | 116 | ```{r} 117 | # your code here 118 | ``` 119 | 120 | How does your new prediction compare to your prediction from Problem 1 in terms of accuracy and precision? 121 | Is it lower or higher, and why? 122 | 123 | *Your text answer here* 124 | 125 | # Problem 3 126 | 127 | Myhrvold et al. just retracted all of the adult mass data from their data set, and have advised researchers to stop using the existing adult mass data until further notice! 128 | Given this new development, and ignoring your previous (now spurious) analysis, what's your best prediction for the critter's egg mass? 129 | Update your prediction in the block below, commenting the code as necessary. 130 | 131 | ```{r} 132 | # your code here 133 | ``` 134 | # Bonus Problem (optional) 135 | 136 | When predicting the egg mass value for your unknown amniote, you probably used a built-in function in R (i.e. *predict*) to automatically generate prediction intervals. 137 | Can you generate prediction intervals from the model without resorting to a built-in *predict* function? 138 | (Hint 1: how many parameters were estimated?) 139 | (Hint 2: can prediction intervals be simulated?) 140 | (Hint 3: check out chapter 7 of Gelman and Hill if you have it) 141 | 142 | ```{r} 143 | # your code here 144 | ``` 145 | -------------------------------------------------------------------------------- /example_solutions/2-maximum_likelihood/wk2_solutions.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: 'Week 2 assignment: likelihood' 3 | author: "Example solutions" 4 | date: "Jan 17, 2016" 5 | output: pdf_document 6 | --- 7 | 8 | # Problem 1 9 | 10 | Earthquakes are most forceful at their epicenters, and this force attenuates with distance. 11 | R has an earthquake dataset `attenu` with measurements of peak horizontal ground acceleration by distance measured at multiple recording stations, with each earthquake coded in terms of magnitude. 12 | See the help file for more information on this dataset by typing `?attenu`. 13 | 14 | Your main task this week is to build one model that predicts both ground acceleration at the epicenter (distance = 0), and the acceleration by distance curve as a function of magnitude and distance from the epicenter. 15 | You will obtain maximum likelihood estimates for the parameters of the model using the `optim` function. 16 | The structure of your model is up to you. 17 | You can use a combination of intuition, imagination, first principles, research, and collaboration to construct your model. 18 | (Note: there are many possible models that one could construct!) 19 | 20 | You will benefit from visualizing the data, specifically the relationships between the quantities of interest, i.e., magnitude (`mag`), distance from epicenter (`dist`), and peak acceleration (`accel`). 21 | Include your visualization code and plots below. 
22 | 23 | ```{r} 24 | # your code here 25 | library(ggplot2) 26 | ggplot(attenu, aes(x=dist, y=accel, col=mag)) + 27 | geom_point() + 28 | scale_color_gradientn(colors = rainbow(3)) + 29 | geom_smooth() 30 | ggplot(attenu, aes(x=dist, y=accel, col=mag)) + 31 | geom_point() + 32 | scale_color_gradientn(colors = rainbow(3)) + 33 | geom_smooth() + 34 | scale_y_log10() 35 | ggplot(attenu, aes(x=dist, y=accel, col=mag)) + 36 | geom_point() + 37 | scale_color_gradientn(colors = rainbow(3)) + 38 | scale_y_log10() + 39 | scale_x_log10() + 40 | geom_smooth() 41 | ``` 42 | 43 | What is the equation for your model? 44 | Write it in \LaTeX, bounded between dollar signs (e.g., $e = mc^2$), not in normal text. 45 | 46 | $y_i \sim N(\gamma m_i e^{-\frac{d_i}{\phi}}, \sigma)$ 47 | 48 | Define all of the parameters, and explain why you formulated it in this way. 49 | What assumptions went into your model's construction? 50 | 51 | *There are a ton of potential answers here. For me, $y_i$ is the peak ground acceleration, $m_i$ is magnitude, $d_i$ is distance from epicenter, $\gamma$ is a proportionality parameter that determines peak acceleration at the epicenter, and $\phi$ is a distance decay parameter. I'm assuming that the acceleration decays exponentially with distance, and that the rate of decay is constant with respect to magnitude. The initial peak acceleration, however, is a function of magnitude (specifically some fraction of magnitude). I'm ignoring the information on different stations, assuming that there are no systematic differences among stations other than distance to epicenter.* 52 | 53 | Write a function called `nll` that returns the negative log likelihood for your model. 54 | The arguments to this function should be `theta` (the parameters), and `data` (the data). 55 | 56 | ```{r} 57 | # function to return the expected value (the mean function) 58 | predict_accel <- function(theta, data){ 59 | gamma <- theta['gamma'] 60 | phi <- theta['phi'] 61 | sigma <- exp(theta['lsigma']) 62 | mu <- gamma * data[, 'mag'] * exp(- data[, 'dist'] / phi) 63 | mu 64 | } 65 | 66 | # function to return the negative log likelihood 67 | nll <- function(theta, data){ 68 | mu <- predict_accel(theta, data) 69 | -sum(dnorm(data[, 'accel'], mu, exp(theta['lsigma']), log = TRUE)) 70 | } 71 | ``` 72 | 73 | Use `optim` to obtain maximum likelihood estimates for your model parameters. 74 | 75 | ```{r} 76 | inits <- c(gamma = .1, phi = .1, lsigma = .1) 77 | out <- optim(inits, nll, data = attenu) 78 | out 79 | ``` 80 | 81 | Did `optim()` converge to a minimum? How do you know? 82 | 83 | *Yes, the convergence code is 0, indicating successful convergence to a minimum.* 84 | 85 | 86 | Create a scatterplot with fitted values vs. residuals. 87 | 88 | ```{r} 89 | attenu$predictions <- predict_accel(out$par, attenu) 90 | attenu$residuals <- with(attenu, accel - predictions) 91 | ggplot(attenu, aes(x = predictions, y = residuals)) + 92 | geom_point() 93 | ``` 94 | 95 | What stands out in the plot of fitted values vs. residuals? 96 | Are you worried about any violations of assumptions? 97 | Why or why not? 98 | 99 | *It looks like there is heteroscedasticity. The residual spread is much greater for higher predictions than for smaller ones.* 100 | 101 | Plot the distance by acceleration data along with your predicted curves (starting at a distance of 0) for earthquakes of magnitude 5 and 8. 
101 | Plot the distance by acceleration data along with your predicted curves (starting at a distance of 0) for earthquakes of magnitude 5 and 8.
102 | 
103 | ```{r}
104 | # create a vector of distances
105 | lo <- 100
106 | dists <- seq(0, max(attenu$dist), length.out=lo)
107 | 
108 | new_d5 <- data.frame(mag = 5, dist = dists)
109 | new_d5$accel <- predict_accel(out$par, new_d5)
110 | 
111 | new_d8 <- data.frame(mag = 8, dist = dists)
112 | new_d8$accel <- predict_accel(out$par, new_d8)
113 | 
114 | ggplot(attenu, aes(x=dist, y=accel, col=mag)) +
115 |   geom_point() +
116 |   scale_color_gradientn(colors = rainbow(3)) +
117 |   geom_line(data = new_d5) +
118 |   geom_line(data = new_d8)
119 | ```
120 | 
121 | How do your predictions compare to the data?
122 | Which characteristics of the data are captured well, and which are captured poorly by your model?
123 | 
124 | *The predictions match the decay of the data somewhat, though the predictions aren't as extreme as the data on either end. Also, the model does not respect the positivity of acceleration: for low expected values, it assigns positive probability to negative accelerations.*
125 | 
126 | # Problem 2
127 | 
128 | Pat loves to play basketball.
129 | You observe Pat practicing free throws at the recreation center one day.
130 | Pat misses 3 shots in a row.
131 | Generate a likelihood profile for $p$, the probability that Pat makes a free throw.
132 | 
133 | ```{r}
134 | p <- seq(0, 1, .01)
135 | lik <- dbinom(0, 3, p)
136 | plot(p, lik, type = 'l')
137 | ```
138 | 
139 | What is your MLE for $p$, and does it make sense? Why or why not?
140 | 
141 | *The MLE is 0. This seems unreasonable - it is very unlikely that Pat has a zero probability of making a free throw.*
142 | 
--------------------------------------------------------------------------------
/assignments/5-binomial/wk5_problems.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: 'Week 5 assignment: Binomial models'
3 | author: "Your name here"
4 | date: "`r format(Sys.time(), '%d %B, %Y')`"
5 | output: pdf_document
6 | ---
7 | 
8 | 
9 | About one out of eight women in the U.S. will develop breast cancer at some point in her lifetime.
10 | Early diagnoses help with treatment of this potentially fatal disease, and these diagnoses can be made based on a variety of cytological metrics evaluated via biopsy.
11 | Your job today is to develop a model that classifies tumors as malignant or benign based on these metrics.
12 | The student(s) with the most predictive model will get a prize.
13 | 
14 | The data are in the `breast_cancer.csv` file.
15 | Details for this dataset can be found [on the UCI machine learning data repository](https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Original)), which is useful if you ever need data to play with.
16 | I split the data into two groups at random: the *training* data, which you'll use to estimate parameters, and the *test* data, which we'll use to evaluate the predictive power of the model.
17 | There is a column in the data called `group`, which indicates whether an observation is part of the training or test set.
18 | 
19 | ## Data exploration
20 | 
21 | As usual, you will want to explore the data before constructing any statistical models.
22 | Only explore the training data, and do not use the test data for data exploration/visualization.
23 | We will pretend that we don't have access to the test data yet.
24 | 
25 | ```{r, message = FALSE}
26 | # your code here
27 | ```
28 | 
29 | 
30 | ## Model structure
31 | 
32 | What is your model? Write it out in \LaTeX. Hint: you will want to use a design matrix.
33 | 
34 | *LaTeX here*
35 | 
36 | What is your Stan model statement?
37 | 
38 | ```
39 | Your stan code
40 | ```
41 | 
42 | ## Building and understanding the design matrix
43 | 
44 | We mentioned that you would want to use a design matrix.
45 | Specifically, your model should be of the form:
46 | 
47 | $y \sim Bernoulli(p)$
48 | 
49 | And the probability of malignancy $p$ is modeled using a logit-link:
50 | 
51 | $\log \Big(\dfrac{p}{1 - p} \Big) = X \beta$
52 | 
53 | The design matrix $X$ contains the tumor features, and also dictates the interpretation of the coefficients $\beta$.
54 | In the code block below, construct your design matrix, creating an object called `X`.
55 | The included code will make an image plot of your design matrix with a horrendous color scheme.
56 | Once you fill in your code, set the argument `eval = TRUE` inside of the curly braces at the beginning of the code chunk (this is a chunk option), otherwise the code chunk will not be evaluated when you're knitting your pdf.
57 | 
58 | ```{r, eval = FALSE}
59 | # define your design matrix below
60 | X <- ...
61 | 
62 | 
63 | # the code below will plot your design matrix
64 | library(reshape2)
65 | library(ggplot2)
66 | mX <- melt(X)
67 | ggplot(mX, aes(x = Var2, y = Var1)) +
68 |   geom_raster(aes(fill = value)) +
69 |   scale_y_reverse() +
70 |   xlab('Design matrix column') +
71 |   ylab('Design matrix row') +
72 |   scale_fill_gradientn(colors = rainbow(20))
73 | ```
74 | 
75 | 
76 | For each column of $X$ you will get a coefficient, one element in $\beta$.
77 | For instance, the coefficient $\beta_1$ will be associated with the first column in $X$, which we might denote $X[, 1]$, to borrow some R syntax.
78 | There's no sense in estimating parameters if you don't know what they mean (Abraham Lincoln said that), so below, list each element in $\beta$ and briefly describe what it represents/how you would interpret it:
79 | 
80 | 
81 | 1. $\beta_1$ represents *your text here*
82 | 
83 | 2. $\beta_2$ represents *your text here*
84 | 
85 | ... and so on, for all of your coefficients
86 | 
87 | 
88 | 
89 | ## Parameter estimation
90 | 
91 | Use the **training** data (`group == 'train'`) to estimate your model's parameters.
92 | Do not use the **test** data yet.
93 | Make sure that the MCMC algorithm has converged before moving forward.
94 | 
95 | ```{r, message = FALSE}
96 | # your code here
97 | ```
98 | 
99 | 
100 | ## Out of sample predictive power
101 | 
102 | One measure of a model's ability to predict new data is the log likelihood of the new data given the parameters of the model, $[\tilde{y} \mid \theta]$, where $\tilde{y}$ is the new data (the **test** or **validation** data), and the parameters $\theta$ have been estimated from other data (e.g., the **training** data).
103 | 
104 | Hints:
105 | 
106 | - this is done most easily via a new design matrix $X_{test}$, which can be multiplied by the vector of model parameters, and must be declared in the `data` block
107 | - make sure that if you used any feature scaling or centering in the training data, the exact same scaling/centering schemes are applied to the test set
108 | - you'll use the `generated quantities` block to calculate the log-likelihood of the test data
109 | - you can obtain the joint log likelihood with the `bernoulli_logit_log` function in Stan, and I wrote a generated quantities model block for you below, which should be the last block in your new Stan model statement
110 | 
111 | What is your updated Stan model?
112 | 
113 | ```
114 | Your stan code here
115 | 
116 | 
117 | generated quantities {
118 |   real loglik_test;
119 |   vector[n_test] logit_p_test;
120 | 
121 |   logit_p_test <- X_test * beta;
122 |   loglik_test <- bernoulli_logit_log(y_test, logit_p_test);
123 |   // returns the sum of the log likelihoods (the joint log-likelihood)
124 | }
125 | 
126 | ```
127 | 
128 | Acquire the posterior distribution of the model parameters and the holdout log likelihood.
129 | 
130 | ```{r}
131 | # your code here
132 | ```
133 | 
134 | Make a histogram of the holdout log likelihood and report the posterior mean along with a 95% credible interval.
135 | 
136 | ```{r}
137 | # your code here
138 | ```
139 | 
140 | 
141 | ## Showing predictions
142 | 
143 | The whole point of building this model is to predict whether a tumor is malignant based on some features.
144 | Plot the posterior probability of tumor malignance for each holdout tumor, and show the true tumor status in the same graph.
145 | Multiple graph types are possible here, but we do not recommend simply copying and pasting code from another example (so far, only about a quarter of the plots made that way have made sense).
146 | Instead, think hard about what sort of data display would be effective, and make that plot!
147 | 
148 | ```{r}
149 | # your code here
150 | ```
151 | 
--------------------------------------------------------------------------------
/slides/4-poisson/wk4_slides.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Week 4: Poisson models"
3 | fontsize: 7pt
4 | output:
5 |   beamer_presentation:
6 |     colortheme: "spruce"
7 |     fonttheme: "structurebold"
8 |     latex_engine: xelatex
9 | header-includes:
10 | - \usepackage{listings}
11 | - \lstset{basicstyle=\small}
12 | - \setmonofont[Scale=MatchLowercase]{Courier}
13 | - \setmonofont[Scale=0.8]{Courier}
14 | ---
15 | 
16 | 
17 | ```{r, echo=FALSE, message=FALSE}
18 | library(ggplot2)
19 | ```
20 | 
21 | ## Poisson glm
22 | 
23 | $$y_i \sim Poisson(\mu_i)$$
24 | 
25 | $$\log(\mu_i) = X_i \beta$$
26 | 
27 | Why not $\mu_i = X_i \beta$?
28 | 
29 | ## Offsets
30 | 
31 | Account for exposure
32 | 
33 | $\implies$ modeling a rate
34 | 
35 | $$\log(\mu_i) = X_i \beta + \log(\text{offset}_i)$$
36 | 
37 | ## Offsets
38 | 
39 | What about the following examples?
40 | 
41 | - number of events over time interval
42 | - number of events per attempted event
43 | - number of events in an area (e.g., county)
44 | 
45 | ## Model checking
46 | 
47 | 1. Prior sensitivity analysis
48 | 2. Sensicality of inference
49 | 3. Posterior predictive checks
50 | 
51 | ## Posterior predictive distribution
52 | 
53 | Distribution of predicted data, given the observations
54 | 
55 | $$[\tilde{y} \mid y]$$
56 | 
57 | **Useful idea:**
58 | 
59 | For a *good* model, predicted data resembles the real data
60 | 
61 | 
62 | ## Posterior predictive check
63 | 
64 | Do model predictions match the data?
65 | 
66 | **Steps:**
67 | 
68 | 1. for each posterior draw:
69 |     - simulate a response vector $y_{rep}$
70 |     - calculate some test statistic $T(y^{rep})$
71 | 2. compare observed $T(y)$ to the distribution of $T(y^{rep})$
72 | 
73 | 
74 | ## Posterior predictive check example
75 | 
76 | $$y = THTHTHTHTT$$
77 | 
78 | Sequence of H and T switches consistent with Bernoulli model?
79 | 80 | ## The model 81 | 82 | $$y = THTHTHTHTT$$ 83 | 84 | Likelihood: 85 | 86 | $$[y_i \mid p] \sim Bernoulli(p)$$ 87 | 88 | Prior: 89 | 90 | $$[p] \sim Beta(100, 100)$$ 91 | 92 | Posterior: 93 | 94 | $$[p \mid y] \sim Beta(104, 106)$$ 95 | 96 | 97 | ## The posterior distribution for P(heads) 98 | 99 | ```{r, echo = FALSE} 100 | y <- c(0, 1, 0, 1, 0, 1, 0, 1, 0, 0) 101 | ``` 102 | 103 | ```{r, fig.width=8, fig.height=4, echo=FALSE} 104 | p_post <- rbeta(5000, 104, 106) 105 | hist(p_post, breaks = 50, xlab = 'p', 106 | ylab = expression(paste('[', p ,'|', y, ']')), freq = FALSE) 107 | ``` 108 | 109 | 110 | ## Simulating data from the posterior 111 | 112 | 1. for each posterior draw: 113 | - simulate a response vector $y_{rep}$ 114 | 115 | ```{r} 116 | rbinom(n = 10, size = 1, prob = p_post[1]) 117 | ``` 118 | 119 | 120 | ## Simulating data from the posterior 121 | 122 | 1. for each posterior draw: 123 | - simulate a response vector $y_{rep}$ 124 | 125 | ```{r} 126 | # make a 2d array to store new coinflips 127 | n_flips <- length(y) 128 | n_iter <- length(p_post) 129 | y_rep <- array(dim = c(n_iter, n_flips)) 130 | 131 | # simulate new coinflip sequences 132 | for (i in 1:n_iter) { 133 | y_rep[i, ] <- rbinom(n_flips, 1, p_post[i]) 134 | } 135 | ``` 136 | 137 | 138 | ## Choosing a test statistic 139 | 140 | $$y = THTHTHTHTT$$ 141 | 142 | ```{r, comment = NA} 143 | y_rep[1:4, ] 144 | ``` 145 | 146 | 147 | ## Choosing a test statistic 148 | 149 | Define $T(y) =$ number of switches between heads and tails in $y$ 150 | 151 | ```{r, comment = NA} 152 | y_rep[1:4, ] 153 | ``` 154 | 155 | 156 | ## Calculating the test statistic 157 | 158 | Define a function to calculate $T(y)$ 159 | 160 | ```{r} 161 | count_n_switches <- function(y){ 162 | n <- length(y) 163 | switches <- 0 164 | for (i in 2:n) { 165 | if (y[i - 1] != y[i]){ 166 | switches <- switches + 1 167 | } 168 | } 169 | return(switches) 170 | } 171 | ``` 172 | 173 | 174 | ## Calculating the test statistic under the model 175 | 176 | ```{r} 177 | T_rep <- apply(y_rep, 1, count_n_switches) 178 | ``` 179 | 180 | ```{r, echo = FALSE, fig.width = 9, fig.height=4} 181 | hist(T_rep, breaks = seq(0, 10, .5), right = FALSE, col = 'grey') 182 | ``` 183 | 184 | 185 | ## Compare observed $T(y)$ to the distribution of $T(y^{rep})$ 186 | 187 | $$y = THTHTHTHTT$$ 188 | 189 | ```{r} 190 | T_obs <- count_n_switches(y) 191 | ``` 192 | 193 | ```{r, fig.width = 9, fig.height=4, echo = FALSE} 194 | hist(T_rep, breaks = seq(0, 10, .5), right = FALSE, col = 'grey') 195 | abline(v = T_obs, col = 2, lwd = 2, lty = 2) 196 | ``` 197 | 198 | 199 | ## If you miss p-values... 
200 | 201 | Bayesian p-value: $[T(y_{rep}, \theta) \geq T(y, \theta)]$ 202 | 203 | ```{r} 204 | mean(T_rep >= T_obs) 205 | ``` 206 | 207 | *How extreme are the data relative to model predictions?* 208 | 209 | ## Graphical depiction: Bayesian p-value 210 | 211 | ```{r, fig.width = 8, fig.height = 5, echo = FALSE, message = FALSE} 212 | library(ggplot2) 213 | library(dplyr) 214 | d <- data.frame(T_rep = T_rep, greq = T_rep >= T_obs) %>% 215 | arrange(T_rep) %>% 216 | mutate(index = 1:n()) 217 | ggplot(d, aes(x = index, y = T_rep)) + 218 | geom_jitter(aes(color = greq), 219 | position = position_jitter(width = 0, height = 1), 220 | shape = 1, size = .3) + 221 | scale_color_discrete(guide = guide_legend(title = "T_rep >= T_obs")) + 222 | geom_hline(yintercept = T_obs, linetype = 'dashed') + 223 | geom_text(aes(label = 'T_obs', x = 2500, y = T_obs)) + 224 | scale_y_continuous(breaks = 0:10) 225 | ``` 226 | 227 | 228 | ## Graphical depiction: Bayesian p-value 229 | 230 | ```{r, echo = FALSE, fig.width = 7, fig.height = 4} 231 | ggplot(d, aes(x=T_rep, fill = greq)) + 232 | geom_bar() + 233 | scale_x_continuous(breaks = 0:10) + 234 | geom_vline(xintercept = T_obs, linetype = 'dashed') + 235 | scale_fill_discrete(guide = guide_legend(title = "T_rep >= T_obs")) + 236 | geom_text(aes(label = 'T_obs', x = T_obs, y = 0)) 237 | ``` 238 | 239 | 240 | ## Posterior predictive checks 241 | 242 | Model assessment tool 243 | 244 | - data consistent with posterior predictive distribution? 245 | - what features are captured by the model? 246 | - variance, min, max, range, skewness, kurtosis, etc. 247 | 248 | ## Bayesian vs. frequentist p-values 249 | 250 | **Bayesian** 251 | 252 | - uses good parameter values: $[\theta \mid y]$ 253 | - model criticism and expansion 254 | - many possible test statistics 255 | 256 | **Frequentist** 257 | 258 | - uses null parameter values: $\beta = 0$ 259 | - hypothesis testing 260 | - strict test statistic and rejection criteria 261 | 262 | ## This week: 263 | 264 | Gall wasp example: 265 | 266 | - develop Poisson models 267 | - conduct posterior predictive check 268 | 269 | ## The data 270 | 271 | ```{r, comment = NA} 272 | d <- read.csv('cleaned_galls.csv') 273 | head(d) 274 | ``` 275 | -------------------------------------------------------------------------------- /example_solutions/2-maximum_likelihood/wk2_solutions_will.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: 'Week 2 assignment: likelihood' 3 | author: "Your name here" 4 | date: "Jan 17, 2015" 5 | output: pdf_document 6 | --- 7 | 8 | # Problem 1 9 | 10 | Earthquakes are most forceful at their epicenters, and this force attenuates with distance. 11 | R has an earthquake dataset `attenu` with measurements of peak horizontal ground acceleration by distance measured at multiple recording stations, with each earthquake coded in terms of magnitude. 12 | See the help file for more information on this dataset by typing `?attenu`. 13 | 14 | Your main task this week is to build one model that predicts both ground acceleration at the epicenter (distance = 0), and the acceleration by distance curve as a function of magnitude and distance from the epicenter. 15 | You will obtain maximum likelihood estimates for the parameters of the model using the `optim` function. 16 | The structure of your model is up to you. 17 | You can use a combination of intuition, imagination, first principles, research, and collaboration to construct your model. 
18 | (Note: there are many possible models that one could construct!)
19 | 
20 | You will benefit from visualizing the data, specifically the relationships between the quantities of interest, i.e., magnitude (`mag`), distance from epicenter (`dist`), and peak acceleration (`accel`).
21 | Include your visualization code and plots below.
22 | 
23 | ```{r}
24 | # plot acceleration as a function of distance
25 | library(ggplot2)
26 | ggplot(data = attenu, aes(dist, accel)) +
27 |   geom_point(aes(col = mag)) +
28 |   geom_smooth(se = FALSE)
29 | # seems like an exponential decay overall, but are there differences in the
30 | # acceleration at the epicenter and the rate of decay for earthquakes of
31 | # different magnitude?
32 | 
33 | # let's bin the magnitudes to make visualization a little easier
34 | attenu[attenu$mag >= 5 & attenu$mag <= 5.5, "mag_round"] <- "5-5.5 "
35 | attenu[attenu$mag > 5.5 & attenu$mag <= 6, "mag_round"] <- "5.6-6 "
36 | attenu[attenu$mag > 6 & attenu$mag <= 6.5, "mag_round"] <- "6-6.5 "
37 | attenu[attenu$mag > 6.5 & attenu$mag <= 7, "mag_round"] <- "6.6-7 "
38 | attenu[attenu$mag > 7, "mag_round"] <- ">7"
39 | table(attenu$mag_round)
40 | 
41 | # re-plot acceleration as a function of distance
42 | ggplot(data = attenu, aes(dist, accel)) +
43 |   geom_smooth(se = FALSE, aes(color = mag_round))
44 | # doesn't look like a huge difference in the rate of decay until magnitudes
45 | # greater than 7. However, the epicenter acceleration seems maximized for
46 | # intermediate values of magnitude.
47 | 
48 | 
49 | 
50 | ```
51 | 
52 | What is the equation for your model?
53 | Write it in \LaTeX, bounded between dollar signs (e.g., $e = mc^2$), not in normal text.
54 | 
55 | $y_i \sim N((\beta m_i + \gamma m_i^2) e^{-\frac{d_i}{m_i \lambda}}, \sigma)$
56 | 
57 | Define all of the parameters, and explain why you formulated it in this way.
58 | What assumptions went into your model's construction?
59 | 
60 | *Here, $y_i$ is the predicted acceleration, $m_i$ is the magnitude and $d_i$ is the distance from the epicenter. The parameters $\beta$ and $\gamma$ together determine the amount of acceleration when distance equals zero. Because $\gamma$ is multiplied by the square of the magnitude, this means that the acceleration at the epicenter will be a quadratic (rather than linear) function of the magnitude of the earthquake. Hopefully, this accounts for the larger accelerations at the epicenter for moderate-magnitude earthquakes. $\lambda$ is the exponential decay constant, which itself is proportional to the magnitude of the earthquake. This should (slightly) reduce the rate of decay for earthquakes of higher magnitude.*
61 | 
62 | Write a function called `nll` that returns the negative log likelihood for your model.
63 | The arguments to this function should be `theta` (the parameters), and `data` (the data).
64 | 
65 | ```{r}
66 | # function to return the expected value (the mean function)
67 | predict_accel <- function(theta, data){
68 |   beta <- theta['beta']
69 |   gamma <- theta['gamma']
70 |   lambda <- theta['lambda']
71 |   # note: the residual sd, exp(theta['lsigma']), is used only in nll below
72 |   mu <- (beta * data[,'mag'] + gamma * (data[,'mag'])^2) * exp(- data[, 'dist']
73 |                                                                / (lambda * data[,'mag']))
74 |   mu
75 | }
76 | 
77 | # function to return the negative log likelihood
78 | nll <- function(theta, data){
79 |   mu <- predict_accel(theta, data)
80 |   -sum(dnorm(data[, 'accel'], mu, exp(theta['lsigma']), log = TRUE))
81 | }
82 | ```
83 | 
84 | Use `optim` to obtain maximum likelihood estimates for your model parameters.
85 | 
86 | ```{r}
87 | # initial values
88 | inits <- c(beta = 0, gamma = .1, lambda = .1, lsigma = .1)
89 | 
90 | # optimize
91 | out <- optim(inits, nll, data = attenu)
92 | out
93 | ```
94 | 
95 | Did `optim()` converge to a minimum? How do you know?
96 | 
97 | *Sure did: the convergence code is 0.*
98 | 
99 | Create a scatterplot with fitted values vs. residuals.
100 | 
101 | ```{r}
102 | attenu$predictions <- predict_accel(out$par, attenu)
103 | attenu$residuals <- with(attenu, accel - predictions)
104 | ggplot(attenu, aes(x = predictions, y = residuals)) +
105 |   geom_point()
106 | ```
107 | 
108 | What stands out in the plot of fitted values vs. residuals?
109 | Are you worried about any violations of assumptions?
110 | Why or why not?
111 | 
112 | *Definitely some heteroscedasticity present. There is much more spread around the larger predicted accelerations.*
113 | 
114 | Plot the distance by acceleration data along with your predicted curves (starting at a distance of 0) for earthquakes of magnitude 5 and 8.
115 | 
116 | ```{r}
117 | # create a vector of distances
118 | lo <- 100
119 | dists <- seq(0, max(attenu$dist), length.out=lo)
120 | 
121 | new_d5 <- data.frame(mag = 5, dist = dists)
122 | new_d5$accel <- predict_accel(out$par, new_d5)
123 | 
124 | new_d8 <- data.frame(mag = 8, dist = dists)
125 | new_d8$accel <- predict_accel(out$par, new_d8)
126 | 
127 | ggplot(attenu, aes(x=dist, y=accel, col=mag)) +
128 |   geom_point() +
129 |   scale_color_gradientn(colors = rainbow(3)) +
130 |   geom_line(data = new_d5) +
131 |   geom_line(data = new_d8)
132 | ```
133 | 
134 | How do your predictions compare to the data?
135 | Which characteristics of the data are captured well, and which are captured poorly by your model?
136 | 
137 | *Not too shabby. We did a good job of separating out the fits for low and high magnitude earthquakes, though we have little data for large magnitude earthquakes near the epicenter. We may be slightly underestimating the acceleration for large magnitude earthquakes as well.*
138 | 
139 | # Problem 2
140 | 
141 | Pat loves to play basketball.
142 | You observe Pat practicing free throws at the recreation center one day.
143 | Pat misses 3 shots in a row.
144 | Generate a likelihood profile for $p$, the probability that Pat makes a free throw.
145 | 
146 | ```{r}
147 | p <- seq(0, 1, .01)
148 | lik <- dbinom(0, 3, p)
149 | qplot(p, lik, geom = "line")
150 | ```
151 | 
152 | What is your MLE for $p$, and does it make sense? Why or why not?
153 | 
154 | *Well, technically speaking, we only saw Pat attempt three free throws. Clearly, the maximum likelihood estimate for the probability that Pat has ever, or will ever, make a free throw is zero. We are completely objective scientists after all. Data don't lie.*
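For completeness, the same answer falls out analytically: for $y$ makes in $n$ attempts, the binomial log-likelihood $\ell(p) = y \log p + (n - y) \log(1 - p)$ is maximized at $\hat{p} = y/n$, so here $\hat{p} = 0/3 = 0$ - exactly what the likelihood profile above shows.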
--------------------------------------------------------------------------------
/slides/3-bayes/wk3_slides.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Week 3: Bayesian inference"
3 | fontsize: 7pt
4 | output:
5 |   beamer_presentation:
6 |     colortheme: "spruce"
7 |     fonttheme: "structurebold"
8 |     latex_engine: xelatex
9 | header-includes:
10 | - \usepackage{listings}
11 | - \lstset{basicstyle=\small}
12 | - \setmonofont[Scale=MatchLowercase]{Courier}
13 | - \setmonofont[Scale=0.6]{Courier}
14 | ---
15 | 
16 | 
17 | ```{r, echo=FALSE, message=FALSE}
18 | library(ggplot2)
19 | library(rstan)
20 | rstan_options(auto_write = TRUE)
21 | par(bty='n')
22 | ```
23 | 
24 | ## Bayes' theorem
25 | 
26 | $$p(\theta \mid y) = \dfrac{p(y \mid \theta) p(\theta)}{p(y)}$$
27 | 
28 | $$p(\theta \mid y) \propto p(y \mid \theta) p(\theta)$$
29 | 
30 | ## Freethrow example
31 | 
32 | 0 shots made, 3 attempts
33 | 
34 | ```{r, echo=FALSE}
35 | p <- seq(0, 1, .01)
36 | lik <- dbinom(0, 3, p)
37 | plot(p, lik, type = 'l', xlab = 'Pr(make freethrow)', ylab = '')
38 | legend('topright', lty=1, col=1, legend = 'Likelihood', bty='n')
39 | ```
40 | 
41 | 
42 | ## What is the MLE?
43 | 
44 | ```{r, echo=FALSE, fig.width=5, fig.height=4}
45 | plot(p, lik, type = 'l', xlab = 'Pr(make freethrow)', ylab = '')
46 | legend('topright', lty=1, col=1, legend = 'Likelihood', bty='n')
47 | ```
48 | 
49 | 
50 | 
51 | ## What is our prior?
52 | 
53 | ```{r, echo=FALSE, fig.width=5, fig.height=4}
54 | plot(p, lik, type = 'l', xlab = 'Pr(make freethrow)', ylab = '')
55 | legend('topright', lty=1, col=1, legend = 'Likelihood', bty='n')
56 | ```
57 | 
58 | 
59 | ## Uniform prior (never heard of a "free throw")
60 | 
61 | ```{r, echo=FALSE, fig.width=5, fig.height=4}
62 | plot(p, lik, type = 'l', xlab = 'Pr(make freethrow)', ylab = '')
63 | x <- seq(0, 1, .01)
64 | lines(x, dbeta(x, 1, 1), col='red')
65 | legend('bottomleft', lty=1, col=1:2, legend = c('Likelihood', 'Prior'),
66 |        bty='n')
67 | ```
68 | 
69 | 
70 | ## The posterior: uniform prior
71 | 
72 | $$y \sim Binomial(k = 3, p)$$
73 | 
74 | $$p \sim Beta(1, 1)$$
75 | 
76 | ```{r, echo=FALSE, fig.width=7, fig.height=4}
77 | plot(p, lik, type = 'l', xlab = 'Pr(make freethrow)', ylab = '', ylim = c(0, 4))
78 | x <- seq(0, 1, .01)
79 | lines(x, dbeta(x, 1, 1), col='red')
80 | lines(x, dbeta(x, 1, 4), col='green')
81 | legend('topright', lty=1, col=1:3, legend = c('Likelihood', 'Prior', 'Posterior'),
82 |        bty='n')
83 | ```
84 | 
85 | 
86 | ## Non-uniform prior
87 | 
88 | $$y \sim Binomial(k = 3, p)$$
89 | 
90 | $$p \sim Beta(2, 2)$$
91 | 
92 | ```{r, echo=FALSE, fig.width=7, fig.height=4}
93 | plot(p, lik, type = 'l', xlab = 'Pr(make freethrow)', ylab = '', ylim = c(0, 3))
94 | x <- seq(0, 1, .01)
95 | lines(x, dbeta(x, 2, 2), col='red')
96 | lines(x, dbeta(x, 2, 5), col='green')
97 | legend('topright', lty=1, col=1:3, legend = c('Likelihood', 'Prior', 'Posterior'),
98 |        bty='n')
99 | ```
100 | 
101 | 
102 | 
103 | ## What if Pat takes a lot of free throws?
104 | 
105 | $$y \sim Binomial(k \gg 3, p)$$
106 | 
107 | $$p \sim Beta(2, 2)$$
108 | 
109 | $k \rightarrow \infty$: prior doesn't matter
110 | 
111 | ## Demo: freethrows in Stan
112 | 
113 | 
114 | 
115 | 
116 | ## MCMC animation
117 | 
118 | [`http://mbjoseph.github.io/2013/09/08/metropolis.html`](http://mbjoseph.github.io/2013/09/08/metropolis.html)
119 | 
120 | ## Bayes in practice
121 | 
122 | 1. write model
123 | 
124 | 2. translate model
125 | 
126 | 3. 
estimate parameters 127 | 128 | ## Bayes in practice 129 | 130 | 1. **write model** 131 | 132 | 2. translate model 133 | 134 | 3. estimate parameters 135 | 136 | ## Bayesian linear regression 137 | 138 | ```{r, echo=FALSE, fig.width=4, fig.height=3} 139 | n <- 50 140 | x <- runif(n, 0, 3) 141 | y <- rnorm(n, -3 + .75 * x, 1) 142 | d <- data.frame(x, y) 143 | ggplot(d, aes(x, y)) + geom_point() 144 | ``` 145 | 146 | 147 | ## Writing a model 148 | 149 | $$y \sim N(\mu, \sigma)$$ 150 | 151 | $$\mu = X \beta$$ 152 | 153 | ```{r, echo=FALSE, fig.width=2, fig.height=4} 154 | image(t(model.matrix(lm(y ~ x))), xaxt='n', yaxt='n') 155 | ``` 156 | 157 | 158 | ## Writing a model 159 | 160 | $y \sim N(\mu, \sigma)$ 161 | 162 | $\mu = X \beta$ 163 | 164 | What's missing? 165 | 166 | ## Writing a model 167 | 168 | $y \sim N(\mu, \sigma)$ 169 | 170 | $\mu = X \beta$ 171 | 172 | $\beta \sim N(0, 2)$ 173 | 174 | $\sigma \sim halfCauchy(0, 5)$ 175 | 176 | ## Writing a model 177 | 178 | 179 | \begin{columns} 180 | \begin{column}{0.48\textwidth} 181 | 182 | $y \sim N(\mu, \sigma)$ 183 | 184 | $\mu = X \beta$ 185 | 186 | $\beta \sim N(0, 2)$ 187 | 188 | $\sigma \sim halfCauchy(0, 5)$ 189 | \end{column} 190 | \begin{column}{0.48\textwidth} 191 | \includegraphics{diag} 192 | \end{column} 193 | \end{columns} 194 | 195 | 196 | ## Writing a model 197 | 198 | \begin{columns} 199 | \begin{column}{0.48\textwidth} 200 | 201 | $$\big[\theta \mid y \big] = \dfrac{\big[\theta, y \big]}{\big[y \big]}$$ 202 | 203 | $$\implies \big[\theta \mid y \big] \propto \big[\theta, y \big]$$ 204 | 205 | Factoring $\big[\theta, y \big]$ with graph: 206 | 207 | $$\big[\theta \mid y \big] \propto \big[y \mid \beta, \sigma\big] \big[\beta \big] \big[\sigma \big]$$ 208 | 209 | \end{column} 210 | \begin{column}{0.48\textwidth} 211 | \includegraphics{diag} 212 | \end{column} 213 | \end{columns} 214 | 215 | ## Components of the posterior distribution 216 | 217 | \begin{columns} 218 | \begin{column}{0.48\textwidth} 219 | 220 | $$\big[\theta \mid y \big] \propto \big[y \mid \beta, \sigma\big] \big[\beta \big] \big[\sigma \big]$$ 221 | 222 | $\big[y \mid \beta, \sigma\big]:$ likelihood 223 | 224 | $\big[\beta \big]:$ prior for slope 225 | 226 | $\big[\sigma \big]:$ prior for standard deviation 227 | 228 | \end{column} 229 | \begin{column}{0.48\textwidth} 230 | \includegraphics{diag} 231 | \end{column} 232 | \end{columns} 233 | 234 | ## Bayes in practice 235 | 236 | 1. write model 237 | 238 | 2. **translate model** 239 | 240 | 3. estimate parameters 241 | 242 | 243 | ## Stan translation 244 | 245 | \begin{columns} 246 | \begin{column}{0.48\textwidth} 247 | 248 | \lstinputlisting{lm.stan} 249 | 250 | \end{column} 251 | \begin{column}{0.48\textwidth} 252 | 253 | $\beta \sim N(0, 2)$ 254 | 255 | $\sigma \sim halfCauchy(0, 5)$ 256 | 257 | $y \sim N(X \beta, \sigma)$ 258 | \end{column} 259 | \end{columns} 260 | 261 | 262 | ## Bayes in practice 263 | 264 | 1. write model 265 | 266 | 2. translate model 267 | 268 | 3. 
**estimate parameters**
269 | 
270 | ## Estimating parameters
271 | 
272 | ```{r, echo=FALSE}
273 | X <- model.matrix(lm(y ~ x))
274 | ```
275 | 
276 | ```{r, message=FALSE, results='hide'}
277 | library(rstan)
278 | stan_d <- list(y = y, X = X, n = nrow(X), p = ncol(X))
279 | m <- stan('lm.stan', data = stan_d, iter=1000)
280 | ```
281 | 
282 | The last line does the following:
283 | 
284 | - generates an MCMC algorithm for your model
285 | - compiles it into fast C++ code
286 | - initializes parameters
287 | - runs the MCMC algorithm
288 | - formats the output into a `stanfit` object
289 | 
290 | 
291 | ## Evaluating convergence
292 | 
293 | ```{r, fig.height=4, fig.width=12}
294 | traceplot(m, inc_warmup = TRUE)
295 | ```
296 | 
297 | 
298 | ## Evaluating convergence
299 | 
300 | ```{r, fig.height=4, fig.width=12}
301 | traceplot(m)
302 | ```
303 | 
304 | 
305 | ## Evaluating convergence
306 | 
307 | ```{r}
308 | m
309 | ```
310 | 
311 | 
312 | ## Not enough iterations
313 | 
314 | ```{r, message=FALSE, echo=FALSE, results='hide'}
315 | library(rstan)
316 | m2 <- stan('lm.stan', data = stan_d, iter=100)
317 | ```
318 | 
319 | ```{r, echo=FALSE}
320 | traceplot(m2, inc_warmup = TRUE)
321 | ```
322 | 
323 | 
324 | 
325 | ## Posterior geometry
326 | 
327 | ```{r}
328 | pairs(m)
329 | ```
330 | 
331 | 
332 | 
333 | ## Correlation between slope and intercept
334 | 
335 | Is $\bar{x}$ positive or negative?
336 | 
337 | ```{r, echo=FALSE, fig.width=5, fig.height=3}
338 | post <- rstan::extract(m)
339 | beta_d <- data.frame(post$beta)
340 | ggplot(beta_d, aes(x=X1, y=X2)) +
341 |   geom_point(alpha=.2) +
342 |   xlab(expression(beta[0])) +
343 |   ylab(expression(beta[1]))
344 | ```
345 | 
346 | 
347 | 
348 | ## Visualizing posterior draws
349 | 
350 | ```{r, echo=FALSE, fig.width=5, fig.height=3}
351 | ggplot(d, aes(x, y)) +
352 |   geom_abline(data=beta_d, color='red',
353 |               aes(slope=X2, intercept=X1), alpha=.05) +
354 |   geom_point()
355 | ```
356 | 
357 | 
358 | 
359 | ## Today's class: ladybird beetles and parasitoids
360 | 
361 | ![](beetle.jpg)
362 | 
--------------------------------------------------------------------------------
/assignments/6-shrinkage/wk6_problems.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: 'Week 6 assignments: multilevel models'
3 | author: "Your name here"
4 | date: "`r format(Sys.time(), '%d %B, %Y')`"
5 | output:
6 |   pdf_document:
7 |     fig_caption: yes
8 | ---
9 | 
10 | # Understanding the motivation for multilevel models
11 | 
12 | We will return to the amniote data from week 1, and explore how to fit a four-level model with `lmer`.
13 | This week, we have written most of the code for you to alleviate some of the programming burden and allow you to focus on the concepts.
14 | This assignment is also shorter, but for those of you who want to dig deeper into the material, we have two recommended problems at the end.
15 | Your job is to grasp what is happening conceptually, and we have crafted some questions to point you in some important directions along the way.
16 | 
17 | First, we will load some packages and the data, and clean the data for you.
18 | 
19 | ```{r, message = FALSE}
20 | library(ggplot2)
21 | library(dplyr)
22 | library(tidyr)
23 | 
24 | # load the data
25 | d <- read.csv("amniotes.csv")
26 | 
27 | # replace -999 with NA (missing observations)
28 | d[d == -999] <- NA
29 | 
30 | # subset data to complete observations of longevity and mass
31 | d <- subset(d, !is.na(maximum_longevity_y) &
32 |               !is.na(adult_body_mass_g)) %>%
33 |   droplevels() %>%
34 |   select(class, order, family, genus, species,
35 |          maximum_longevity_y, adult_body_mass_g)
36 | 
37 | # create binomial name variable (Genus + species)
38 | d$binomial <- paste(d$genus, d$species)
39 | 
40 | # log transform longevity
41 | d$long <- log(d$maximum_longevity_y)
42 | 
43 | # scale log transformed mass
44 | d$clmass <- c(scale(log(d$adult_body_mass_g)))
45 | ```
46 | 
47 | Next, we will visualize the number of observations across taxonomic levels and groups.
48 | 
49 | ```{r, fig.height = 12, fig.width = 9, fig.cap='Distribution of sample sizes across taxonomic levels.'}
50 | par(mfrow=c(2, 2))
51 | barplot(sort(table(d$class)), horiz = T, las = 2, cex.names = 1,
52 |         xlab = 'Number of observations',
53 |         main = 'Class')
54 | barplot(sort(table(d$order)), horiz = T, las = 2, cex.names = .5,
55 |         xlab = 'Number of observations',
56 |         main = 'Order')
57 | barplot(sort(table(d$family)), horiz = T, las = 2, cex.names = .2,
58 |         xlab = 'Number of observations',
59 |         main = 'Family')
60 | barplot(sort(table(d$genus)), horiz = T, las = 2, cex.names = .1,
61 |         xlab = 'Number of observations',
62 |         main = 'Genus')
63 | ```
64 | 
65 | **Question 1**
66 | 
67 | Suppose we are interested in modeling the log longevity (lifespan) of amniotes with these data.
68 | We want to know what the mean longevity is for all classes, orders, families, and genera in the dataset.
69 | What in Figure 1 (the four barplots we just made) might point us toward a hierarchical model?
70 | 
71 | *Your text here*
72 | 
73 | **Question 2**
74 | 
75 | One non-hierarchical option would be a no-pooling model, which we implement with `lm()` below.
76 | 
77 | ```{r}
78 | no_pool <- lm(long ~ 0 + class + order + family + genus, data = d)
79 | # (this takes tens of seconds to run)
80 | ```
81 | 
82 | Inspect the output from this no-pooling model by printing the `no_pool` object summary in your console (`summary(no_pool)`).
83 | What do you notice about the output that seems problematic?
84 | 
85 | *Your text here*
86 | 
87 | # Estimating parameters with `lmer` & understanding their meaning
88 | 
89 | We can fit a hierarchical model for longevity as follows using `lmer`, which is in the `lme4` package:
90 | 
91 | ```{r, message = FALSE}
92 | library(lme4)
93 | m_fit <- lmer(long ~ 1 + (1 | class) + (1 | order) + (1 | family) + (1 | genus),
94 |               data = d)
95 | m_fit
96 | ```
97 | 
98 | **Question 3**
99 | 
100 | Write out the model in mathematical notation, either long-form or using matrix operations.
101 | Hint: one efficient way to do this is to use multiple design matrices.
102 | 
103 | Your \LaTeX
104 | 
105 | **Question 4**
106 | 
107 | In English, what does the `(Intercept)` parameter represent (accessible via `fixef(m_fit)`)?
108 | 
109 | *Your text here*
110 | 
111 | **Question 5**
112 | 
113 | In English, what do the random effect standard deviations represent?
114 | 
115 | *Your text here*
116 | 
117 | **Question 6**
118 | 
119 | Why does the default plotting method for the random effects (e.g., `plot(ranef(m_fit))`) make normal quantile-quantile plots?
120 | 
121 | *Your text here*
122 | 
123 | # Including more information in the model
124 | 
125 | We have a lot of other information about each species that we have ignored up to this point.
126 | For instance, we might want to include body mass as a predictor, since large-bodied species might live longer.
127 | 
128 | ```{r, fig.width = 3, fig.height = 2.5, fig.align='center'}
129 | ggplot(d, aes(x = clmass, y = long)) +
130 |   geom_point(alpha = .1) +
131 |   xlab('Centered log mass') +
132 |   ylab('log(Longevity)')
133 | ```
134 | 
135 | **Question 7**
136 | 
137 | If we include body mass as a covariate, what do you expect will happen to our estimate of the residual standard deviation?
138 | 
139 | *Your text here*
140 | 
141 | **Question 8**
142 | 
143 | What do you expect will happen to the estimated group-level standard deviations when including body mass in the model?
144 | 
145 | *Your text here*
146 | 
147 | Below, we fit a model using centered log mass as a covariate.
148 | 
149 | ```{r}
150 | m_fit2 <- lmer(long ~ 1 + clmass +
151 |                  (1 | class) + (1 | order) + (1 | family) + (1 | genus),
152 |                data = d)
153 | ```
154 | 
155 | Compare the estimates of the residual standard deviation and group-level standard deviations to those from our first model that did not include mass.
156 | 
157 | ```{r}
158 | VarCorr(m_fit)
159 | VarCorr(m_fit2)
160 | ```
161 | 
162 | Compare your predictions to the results - you may be surprised.
163 | 
164 | # Pat's free throws continued
165 | 
166 | **Question 9**
167 | 
168 | Thinking back to the day that you saw Pat miss 3 free throws, you remember that you also noticed other individuals in the gym taking some free throws.
169 | Specifically, you happen to recall 20 other people shooting, who made 2/4, 13/17, 12/12, 7/15, 4/11, 14/16, 6/8, 7/14, 10/15, 2/3, 9/13, 1/3, 7/12, 6/14, 1/10, 4/4, 10/11, 3/3, 7/18, and 3/5 free throws.
170 | Using this new information, make a hierarchical model to compute the maximum likelihood estimate for the probability that Pat makes a free throw.
171 | 
172 | ```{r}
173 | # your code here
174 | ```
175 | 
176 | Print the maximum likelihood estimate for Pat's probability of making a free throw.
177 | 
178 | ```{r}
179 | # your code here (remember that coefficients are reported on the logit scale)
180 | ```
181 | 
182 | Why isn't the estimate 0, as it was before with our non-hierarchical model?
183 | 
184 | *Your text here*
185 | 
186 | **Conceptual check (optional): visualizing estimated priors**
187 | 
188 | You just acquired maximum likelihood estimates for the parameters of the prior distribution of the probability that a person in the gym makes a free throw.
189 | These parameters are calculated on the logit scale, but we usually think of probabilities on the inverse-logit scale (bounded between 0 and 1).
190 | Below, visualize the probability density of $p$ for a new player based on the model output.
191 | You'll need to use the intercept and the among-player standard deviation, and also the `plogis` function to transform the logit probabilities to probabilities.
192 | Hint: visualize the normal density on the logit scale first with `dnorm`, so that you get your estimated "bell curve", then transform the x-axis to the probability scale.
193 | 
194 | ```{r}
195 | # your code here
196 | ```
197 | 
198 | **Programming challenge (optional): where `lme4` fails**
199 | 
200 | The `lme4` package is useful to acquire point estimates, but not very useful if you need to quantify the uncertainty in the model parameters.
201 | In particular, there is no easy way to incorporate uncertainty in the hyperparameters; more on this [here](http://stats.stackexchange.com/questions/147836/prediction-interval-for-lmer-mixed-effects-model-in-r) and [here](https://stat.ethz.ch/pipermail/r-sig-mixed-models/2010q1/003447.html).
202 | In contrast, a Bayesian approach automatically quantifies uncertainty for **all** model parameters.
203 | 
204 | As an optional challenge, implement the amniote model in Stan below and plot the posterior distributions for the hyperparameters.
205 | 
206 | ```{r}
207 | # your code here
208 | ```
209 | 
--------------------------------------------------------------------------------
/example_solutions/3-bayes/wk3_solutions.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: 'Week 3 assignment: Bayesian inference'
3 | author: "Example solutions"
4 | date: "January 17, 2015"
5 | output: pdf_document
6 | ---
7 | 
8 | # Problem 1
9 | 
10 | Maure et al. (2015) conducted experiments to see how ladybird beetle diet affected interactions with parasitoid wasps.
11 | Individual beetles were randomly assigned to one of five diet treatments:
12 | 
13 | 1. 20 aphids per day
14 | 2. 2 aphids per day
15 | 3. 20 aphids per day + pollen
16 | 4. 2 aphids per day + pollen
17 | 5. pollen only
18 | 
19 | Each beetle was exposed to a female parasitoid wasp for 3 hours, which deposited eggs in the beetle, and the beetles were then fed each day and the outcome of the interaction monitored.
20 | The authors wanted to know whether diet affected the probability of recovering from parasitoid attack.
21 | Your task is to build a Bayesian model with Stan for beetle survival as a function of experimental treatment, estimating the probability of survival for each treatment.
22 | To keep things simple, consider the treatments to be categorical and unordered.
23 | The data are archived in Dryad [here](http://www.datadryad.org/resource/doi:10.5061/dryad.7fq4j).
24 | 
25 | **Important: do not manipulate the original raw data file!**
26 | This is error-prone, leaves no written record, encourages spreadsheet farming, and is not reproducible.
27 | And, in this case, the data are already well organized.
28 | Read in the .xls file using the `readxl` package, then use R to clean the data as necessary.
29 | 
30 | ```{r, message = FALSE}
31 | ## Loading the raw data
32 | library(readxl) # use the read_excel() function to load the data
33 | library(dplyr)
34 | library(ggplot2)
35 | d <- read_excel('data/diet_data.xlsx')
36 | 
37 | ## Cleaning the data
38 | # remove NA columns
39 | d <- d[, !is.na(names(d))]
40 | 
41 | # replace spaces in column names with underscores and convert to lowercase
42 | names(d) <- gsub(' ', '_', names(d)) %>%
43 |   tolower()
44 | 
45 | # create binary recovery variable
46 | d <- d %>%
47 |   mutate(recov = ifelse(ladybird_recovery == 'YES', 1, 0))
48 | 
49 | # calculate the proportion recovered for each diet
50 | p_sample <- d %>%
51 |   group_by(diet) %>%
52 |   summarize(p_recov = mean(recov))
53 | 
54 | ## Visualizing the data
55 | ggplot(d, aes(x=diet, y=recov)) +
56 |   geom_jitter(position = position_jitter(width=.2, height=.1)) +
57 |   geom_point(aes(y = p_recov), data = p_sample, col='red', size=3)
58 | ```
59 | 
60 | Write out your model using \LaTeX:
61 | 
62 | $y_i \sim Bern(p_{j[i]})$
63 | 
64 | $p_j \sim Beta(1, 1)$
65 | 
66 | Paste your Stan model statement in the code block below, and ensure that your written model matches the notation in your Stan file:
67 | 
68 | ```
69 | data {
70 |   // define the types and names of the data
71 |   int n;            // n is an integer
72 |   int y[n];         // y is an integer vector with n elements
73 |   int n_trt;        // number of treatments
74 |   int treatment[n]; // treatment index (from 1 to n_trt) for each beetle
75 | }
76 | parameters {
77 |   vector<lower=0, upper=1>[n_trt] p; // each element of p lies between 0 and 1
78 | }
79 | model {
80 |   // define priors
81 |   p ~ beta(1, 1);
82 | 
83 |   // likelihood
84 |   for (i in 1:n)
85 |     y[i] ~ bernoulli(p[treatment[i]]);
86 | }
87 | ```
88 | 
89 | Now, use `rstan` to fit your model.
90 | Evaluate convergence by inspecting the $\hat{R}$ statistic and the traceplots.
91 | 
92 | ```{r, results = 'hide', message = FALSE}
93 | library(rstan)
94 | 
95 | # bundle data into a list
96 | stan_d <- list(n = nrow(d),
97 |                y = d$recov,
98 |                n_trt = length(unique(d$diet)),
99 |                treatment = as.numeric(factor(d$diet)))
100 | 
101 | # fit the model
102 | m_fit <- stan('bern_mod.stan', data = stan_d)
103 | ```
104 | 
105 | ```{r}
106 | # print summary
107 | m_fit
108 | 
109 | # plot the chains
110 | traceplot(m_fit, inc_warmup=TRUE, alpha=.5) + ylim(0, 1)
111 | ```
112 | 
113 | Calculate posterior credible intervals, medians, means, and modes for the survival probabilities for each treatment.
114 | Hint: posterior draws can be extracted with the `rstan::extract` function, which returns a list of arrays.
115 | 
116 | ```{r, message = FALSE}
117 | library(reshape2)
118 | library(modeest)
119 | 
120 | # extract the posterior draws
121 | post <- rstan::extract(m_fit)
122 | 
123 | # make a data frame that contains all of the posterior draws
124 | post_df <- post$p %>%
125 |   melt(varnames = c('iter', 'trt'))
126 | 
127 | # match numeric treatment indicators to diet names
128 | post_df$diet <- levels(factor(d$diet))[post_df$trt]
129 | 
130 | # summarize each treatment's posterior in terms of quantiles and central tendency
131 | post_sum <- post_df %>%
132 |   group_by(diet) %>%
133 |   summarize(lo = quantile(value, .025),
134 |             hi = quantile(value, .975),
135 |             mode = mlv(value, method='mfv')$M,
136 |             median = median(value),
137 |             mean = mean(value))
138 | 
139 | # print the posterior summary
140 | post_sum
141 | ```
142 | 
143 | Generate a plot that shows all of the raw data along with the posterior probability distributions of recovery for each treatment:
144 | 
145 | ```{r}
146 | # plot the raw data with interval estimates
147 | ggplot(d, aes(x=diet, y=recov)) +
148 |   geom_jitter(position = position_jitter(width=.2, height=.1)) +
149 |   geom_point(aes(y = p_recov), data = p_sample, col='red', size=3) +
150 |   geom_segment(aes(xend = diet, y = lo, yend = hi), data = post_sum) +
151 |   geom_point(aes(y=mode), data = post_sum, color='blue')
152 | 
153 | 
154 | ggplot(d, aes(x=recov)) +
155 |   geom_histogram() +
156 |   geom_density(aes(x=value), data = post_df) +
157 |   facet_wrap(~ diet)
158 | ```
159 | 
160 | The authors reported statistically significant differences in ladybird beetle recovery between the diet treatments.
161 | What is your conclusion for the effect of diet on ladybird beetle recovery?
162 | 
163 | *It doesn't seem like the recovery probabilities were very different between the five treatments. Beetles without aphids had the lowest probabilities of recovery, but there is considerable overlap with the other diets. See the posterior comparisons below.*
164 | 
165 | ```{r}
166 | ggplot(post_df, aes(x=value, fill=diet)) +
167 |   geom_density(alpha=.4) +
168 |   xlab('Probability of recovery') +
169 |   ylab('Posterior density')
170 | ```
171 | 
172 | 
173 | # Problem 2
174 | 
175 | One of the biggest advantages of Bayesian approaches is the ease with which you can make inference on **derived parameters**.
176 | For example, we might want to know which diet treatment gives the highest survival probability.
177 | In one draw from the posterior distribution, we should have five estimated probabilities.
178 | The treatment with the highest probability can be recorded and stored in an object (say `best`).
179 | We can do this for each posterior draw to produce a vector of the "best" treatments (from the beetle's perspective).
180 | To find the posterior probability that each particular treatment is best, count the frequency of each treatment in the `best` vector, and divide by the total number of posterior draws.
181 | Do this below using the results from problem 1, and report the posterior probabilities.
182 | 
183 | ```{r}
184 | best <- apply(post$p, 1, which.max)
185 | p_best <- rep(NA, stan_d$n_trt)
186 | for (i in 1:stan_d$n_trt){
187 |   p_best[i] <- mean(best == i)
188 | }
189 | data.frame(diet = levels(factor(d$diet)), p_best)
190 | ```
191 | 
192 | Which treatment was best? What is the probability of that treatment being the best, conditional on the data?
193 | 
194 | *The 2 aphids + pollen treatment was the best, with probability $\approx$ 0.4.*
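As an aside, the loop above is perfectly fine, but the same posterior probabilities can be computed in one line with `table`, using a factor so that treatments that never "win" still show up with probability 0:

```{r}
# same calculation, without the loop
table(factor(best, levels = 1:stan_d$n_trt)) / length(best)
```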
195 | 
196 | 
197 | # Problem 3
198 | 
199 | Simulate data from a normal distribution for three groups, each with a different mean.
200 | You can assume that the standard deviations for each group are equal.
201 | In generating data, use a design matrix to acquire the expected value for your data (somewhere in your code there should be `X %*% beta`).
202 | 
203 | ```{r}
204 | # simulate group membership for each observation
205 | n <- 50
206 | ngroup <- 3
207 | group_vector <- sample(ngroup, n, replace = TRUE) %>%
208 |   sort()
209 | 
210 | is_in_group <- function(x, group) {
211 |   # tests whether elements in x are in a group
212 |   return(as.numeric(x == group))
213 | }
214 | 
215 | # make design matrix
216 | X <- matrix(nrow = n, ncol = ngroup)
217 | for (i in 1:ngroup){
218 |   X[, i] <- is_in_group(group_vector, i)
219 | }
220 | X
221 | 
222 | # choose group means and residual sd
223 | beta <- c(-2.3, .1, 1.5)
224 | sigma <- .5
225 | 
226 | # simulate data
227 | y <- rnorm(n, X %*% beta, sigma)
228 | ```
229 | 
230 | Write a Stan model statement for a linear model that you can use to estimate the parameters.
231 | 
232 | ```
233 | data {
234 |   int n;          // number of observations
235 |   int p;          // number of groups (columns of X)
236 |   matrix[n, p] X;
237 |   vector[n] y;
238 | }
239 | parameters {
240 |   vector[p] beta;
241 |   real<lower=0> sigma;
242 | }
243 | model {
244 |   beta ~ normal(0, 5);
245 |   sigma ~ normal(0, 5);
246 |   y ~ normal(X * beta, sigma);
247 | }
248 | ```
249 | 
250 | Use Stan to estimate the parameters of your model.
251 | 
252 | ```{r}
253 | library(rstan)
254 | stan_d <- list(X = X, p = ngroup, n = n, y = y)
255 | m_fit <- stan('lm.stan', data = stan_d)
256 | ```
257 | 
258 | Assess convergence of the MCMC algorithm graphically and with the Rhat statistic.
259 | 
260 | ```{r}
261 | m_fit
262 | traceplot(m_fit)
263 | ```
264 | 
265 | Plot the marginal posterior distributions for each parameter with a vertical line at the true value.
266 | 
267 | ```{r}
268 | post <- rstan::extract(m_fit)
269 | par(mfrow=c(2, 2))
270 | for (i in 1:ngroup){
271 |   plot(density(post$beta[, i]), main = paste('Group', i, 'mean', sep = ' '))
272 |   abline(v = beta[i])
273 | }
274 | plot(density(post$sigma), main = 'sigma')
275 | abline(v = sigma)
276 | ```
277 | 
--------------------------------------------------------------------------------
/example_solutions/4-poisson/wk4_solutions_will.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: 'Week 4 assignment: Poisson models'
3 | author: "Will Stutz"
4 | date: February 5, 2016
5 | output: pdf_document
6 | ---
7 | 
8 | Wasps in the family Cynipidae lay their eggs on plants, which form galls around the developing larvae, providing nutrition until the larvae metamorphose and burrow out of the galls, emerging as adults.
9 | From any particular gall, there is variation in the number of host wasps that emerge.
10 | 
11 | Here, you will construct a Bayesian model for the number of emerging cynipid wasps, using features of the galls as explanatory variables.
12 | The data are available in the `cleaned_galls.csv` file.
13 | Your task is to estimate the parameters of your model, and then to do a posterior predictive check to evaluate overdispersion.
14 | 
15 | # Problem 1: looking at the data
16 | 
17 | Load the data and explore how the features relate to the response variable.
18 | 
19 | ```{r}
20 | # libraries
21 | library(ggplot2)
22 | library(tidyr)
23 | library(rstan)
24 | 
25 | # load the data
26 | dat <- read.csv("cleaned_galls.csv")
27 | 
28 | # what kind of data do we have?
29 | head(dat)
30 | # looks like n_cynip, gall size, and locality
31 | ```
32 | Let's take a look at how the data are distributed across sites (i.e., sample sizes).
33 | ```{r}
34 | # order of localities by sample size
35 | ordered_names <- sort(table(dat$gall_locality), decreasing = FALSE) %>%
36 |   names()
37 | 
38 | # reorder localities in the data frame
39 | dat$gall_locality <- factor(dat$gall_locality, levels = ordered_names)
40 | 
41 | # calculate mean number of wasps per sample location
42 | loc_means <- data.frame(gall_locality = levels(dat$gall_locality),
43 |                         means = tapply(dat$n_cynip, dat$gall_locality, mean))
44 | dat <- merge(dat, loc_means, by = "gall_locality")
45 | 
46 | # plot raw data with mean number of wasps
47 | ggplot(data = dat, aes(n_cynip, gall_locality)) +
48 |   geom_vline(xintercept = 0, color = "grey50") +
49 |   geom_jitter(alpha = 1/5) +
50 |   geom_point(aes(means, gall_locality), pch = 21, fill = "seagreen3", size = 2)
51 | ```
52 | 
53 | *It looks like a few sites have lots of galls while some have only a handful. Also, there are a lot of zeros in the data (all the points jittered around zero on the x-axis). There is also variation in the means across sites, so we'll probably want to account for that in our model as well.*
54 | 
55 | Let's look at the relationship between gall size and the number of wasps.
56 | ```{r}
57 | # number of wasps as a function of gall size
58 | ggplot(data = dat, aes(gall_size, n_cynip)) +
59 |   geom_jitter(width = 3, alpha = 1/10) +
60 |   stat_smooth(color = "steelblue") +
61 |   stat_smooth(method = "glm",
62 |               method.args = list(family = "poisson"), color = "seagreen3")
63 | # number of emerging wasps increases with gall size
64 | ```
65 | *It would appear that the number of wasps levels off at higher gall sizes, relative to the standard Poisson regression (green). Perhaps a quadratic term later on might help? Let's break it down by locality first:*
66 | 
67 | ```{r}
68 | 
69 | # break it down by locality
70 | ggplot(data = dat, aes(gall_size, n_cynip)) +
71 |   geom_jitter(alpha = 1/3) +
72 |   stat_smooth(color = "steelblue") +
73 |   stat_smooth(method = "glm",
74 |               method.args = list(family = "poisson"), color = "seagreen3") +
75 |   facet_wrap(~gall_locality)
76 | ```
77 | *Hmm. It's possible that the overall pattern we saw before may have to do with different slope parameters in different populations, as there isn't too much difference between the Poisson regression (green) and the smooth curve (blue) within localities, except in the case of the Butcher Rd. site.*
78 | 
79 | # Problem 2: model specification
80 | 
81 | What is your model? Write it in \LaTeX.
82 | 
83 | *Given the above, I want to fit a model that treats the number of wasps as a Poisson distributed random variable with a different mean value for each population. Additionally, I'll fit a different gall size parameter for each locality, rather than a single gall size parameter applied across all localities.*
84 | 
85 | $$y \sim Poisson(\lambda)$$
86 | 
87 | $$\log(\lambda) = X\beta$$
88 | 
89 | *Here we are modeling the number of wasps $y_i$ as a Poisson distributed random variable, where $\beta$ contains different intercept and gall size coefficients for each population (13 populations x 2 = 26 total parameters).*
90 | 
91 | What is your Stan model statement?
92 | 
93 | ```
94 | data {
95 |   int n;          // number of galls
96 |   int p;          // number of parameters
97 |   matrix[n, p] X; // design matrix
98 |   int y[n];       // the number of observed wasps emerging from each gall
99 | }
100 | 
101 | parameters {
102 |   vector[p] beta; // p length vector of parameters
103 | }
104 | 
105 | model {
106 |   beta ~ normal(0, 5);       // use a somewhat vague Normal prior centered at zero
107 |   y ~ poisson_log(X * beta); // Poisson likelihood with log link
108 | }
109 | ```
110 | *Note that I've given my intercept and my slope parameters all the same prior. We probably won't always do this, as it often makes sense to use different priors for these since they are fundamentally different parameters, but this prior is vague enough that it shouldn't matter too much.*
111 | 
112 | *Also, keep in mind that centering the intercept priors at zero means that the prior is centered around a value of 1 on the raw-data scale (hint: what does $e^0$ equal?) and that, while it's symmetric on the log scale, it's not symmetric on the raw-data scale.*
113 | 
114 | # Problem 3: parameter estimation
115 | 
116 | *First I want to center my gall size variable at zero so the site intercepts are not correlated with the site-level gall size parameter. This also means my intercept parameters will now be estimates of mean wasp number for an average-sized gall, and not for gall size equal to zero. (I could instead center gall size within each population, but I won't do that here.)*
117 | 
118 | ```{r}
119 | # what is the average gall size?
120 | mean(dat$gall_size)
121 | 
122 | # center gall size (and scale to unit variance)
123 | dat$gsize_adj <- scale(dat$gall_size, center = TRUE, scale = TRUE) %>%
124 |   as.numeric # the as.numeric() removes the attributes appended by scale()
125 | ```
126 | *Next I'll use R's glm() function to create my design matrix, which is easier than doing it by hand.*
127 | 
128 | ```{r}
129 | 
130 | # create the design matrix
131 | X <- glm(n_cynip ~ 0 + gall_locality + gall_locality:gsize_adj, data = dat,
132 |          family = "poisson") %>% model.matrix()
133 | # note suppression of global intercept
134 | 
135 | # create the data to input into Stan
136 | stan_d <- list(n = nrow(dat),
137 |                p = ncol(X),
138 |                X = X,
139 |                y = dat$n_cynip)
140 | 
141 | # tell Stan to be aware of all of my processors when fitting models
142 | options(mc.cores = parallel::detectCores())
143 | 
144 | # fit the Stan model (note: this takes a while because p is big (26!))
145 | gall_fit <- stan("poisson_glm_will.stan",
146 |                  data = stan_d,
147 |                  chains = 4,
148 |                  iter = 500,
149 |                  open_progress = FALSE)
150 | ```
151 | Verify convergence using traceplots and the Rhat statistic:
152 | 
153 | ```{r}
154 | # check Rhat
155 | gall_fit
156 | 
157 | # traceplot
158 | traceplot(gall_fit, pars = "beta")
159 | 
160 | ```
161 | *Looks good.*
162 | 
163 | # Problem 4: posterior predictive check
164 | 
165 | Does your model adequately capture the variance in the emergence data, or is there overdispersion?
166 | 
167 | *In addition to checking the variance predicted by the model, I'm also going to check whether my model adequately captures the number of zeros in the data. If there are more zeros in the actual data than my model predicts, it might be an indication of zero-inflation (which could be due to some extra effect I haven't accounted for).*
114 | # Problem 3: parameter estimation
115 | 
116 | *First I want to center my gall size variable at zero so the site intercepts are not correlated with the site level gall size parameter. This also means my intercept parameters will now be estimates of mean wasp number for an average sized gall and not for gall size equals zero. I could instead center gall size within each population, but I won't do that here.*
117 | 
118 | ```{r}
119 | # what is the average gall size?
120 | mean(dat$gall_size)
121 | 
122 | # center gall size (and scale to unit variance)
123 | dat$gsize_adj <- scale(dat$gall_size, center = TRUE, scale = TRUE) %>%
124 |   as.numeric # the as.numeric() removes the attributes appended by scale()
125 | ```
126 | *Next I'll use R's glm() function to create my design matrix, which is easier than doing it by hand.*
127 | 
128 | ```{r}
129 | 
130 | # create the design matrix
131 | X <- glm(n_cynip ~ 0 + gall_locality + gall_locality:gsize_adj, data = dat,
132 |          family = "poisson") %>% model.matrix()
133 | # note suppression of the global intercept
134 | 
135 | # create the data to input into Stan
136 | stan_d <- list(n = nrow(dat),
137 |                p = ncol(X),
138 |                X = X,
139 |                y = dat$n_cynip)
140 | 
141 | # tell Stan to use all of my processors when fitting models
142 | options(mc.cores = parallel::detectCores())
143 | 
144 | # fit the Stan model (note: this takes a while because p is big (26!))
145 | gall_fit <- stan("poisson_glm_will.stan",
146 |                  data = stan_d,
147 |                  chains = 4,
148 |                  iter = 500,
149 |                  open_progress = FALSE)
150 | ```
151 | Verify convergence using traceplots and the Rhat statistic:
152 | 
153 | ```{r}
154 | # check Rhat
155 | gall_fit
156 | 
157 | # traceplot
158 | traceplot(gall_fit, pars = "beta")
159 | 
160 | ```
161 | *Looks good.*
162 | 
163 | # Problem 4: posterior predictive check
164 | 
165 | Does your model adequately capture the variance in the emergence data, or is there overdispersion?
166 | 
167 | *In addition to checking the variance predicted by the model, I'm also going to check whether my model adequately captures the number of zeros in the data. If there are more zeros in the actual data than my model predicts, it might be an indication of zero-inflation (which could be due to some extra effect I haven't accounted for).*
168 | 
169 | ```{r}
170 | # extract the posteriors
171 | posts <- extract(gall_fit)$beta
172 | 
173 | # write a function to calculate the variance of a simulated data set
174 | calc_var <- function(X, beta){
175 |   n <- nrow(X) # number of data points
176 |   lambda <- c(exp(X %*% beta)) # simulate lambda for the draw
177 |   y <- rpois(n, lambda) # simulate a replicate data set
178 |   var(y) # return its variance
179 | }
180 | 
181 | # write a function to calculate the number of zeros in a simulated data set
182 | calc_zeros <- function(X, beta){
183 |   n <- nrow(X) # number of data points
184 |   lambda <- c(exp(X %*% beta)) # simulate lambda for the draw
185 |   y <- rpois(n, lambda) # simulate a replicate data set
186 |   sum(y == 0) # return its number of zeros
187 | }
188 | 
189 | # how many draws do we have?
190 | n_draws <- nrow(posts)
191 | 
192 | # create a data.frame to store our variances and zero counts
193 | sims <- data.frame(variance = rep(NA, n_draws),
194 |                    n_zeros = rep(NA, n_draws))
195 | 
196 | # simulate variances and zero counts for each draw
197 | for(i in 1:n_draws){
198 |   sims[i, "variance"] <- calc_var(X, beta = posts[i, ])
199 |   sims[i, "n_zeros"] <- calc_zeros(X, beta = posts[i, ])
200 | }
201 | 
202 | # calculate Bayesian P-values (values near 0 or 1 flag model-data mismatch)
203 | var_p <- sum(sims$variance < var(dat$n_cynip)) / n_draws
204 | zeros_p <- sum(sims$n_zeros > sum(dat$n_cynip == 0)) / n_draws
205 | 
206 | # now plot the distribution of simulated variances with the actual variance
207 | ggplot(data = sims, aes(variance)) +
208 |   geom_histogram() +
209 |   geom_vline(xintercept = var(dat$n_cynip), color = "steelblue", size = 2) +
210 |   annotate("text", x = 5.5, y = 75,
211 |            label = paste0("p = ", var_p))
212 | ```
213 | *Looks like our model captures the observed variance quite well, which would indicate that there is no overdispersion.*
214 | 
215 | ```{r}
216 | # now plot the distribution of simulated numbers of zeros
217 | ggplot(data = sims, aes(n_zeros)) +
218 |   geom_histogram() +
219 |   geom_vline(xintercept = sum(dat$n_cynip == 0), color = "steelblue", size = 2) +
220 |   annotate("text", x = 750, y = 100,
221 |            label = paste0("p = ", zeros_p))
222 | 
223 | ```
224 | 
225 | *However, it looks like we are undershooting the number of zeros by 100-200, so there may be some other factor that is causing there to be zero emerging wasps in some galls beyond what is predicted by random Poisson variation.*
226 | 
--------------------------------------------------------------------------------
/roadmap.md:
--------------------------------------------------------------------------------
1 | # Hierarchical modeling course roadmap
2 | 
3 | ## Learning goals
4 | 
5 | By the end of this course, graduate students should be able to comfortably:
6 | 
7 | 1. Translate biological problems to hierarchical models
8 | 2. Recall and use common probability distributions
9 | 3. Think in terms of parameter, process, and observation models
10 | 4. Understand multi-level models from a Bayesian perspective
11 | 5. Link model results to graphical outputs and vice versa
12 | 6. Understand mixed and random effects models as a subset of hierarchical models
13 | 
14 | ## Outline
15 | 
16 | ### 1. Linear models
17 | 
18 | The idea here is to understand the mechanics of linear models.
Here is where I think a number of key topics can be introduced that will make understanding hierarchical models easier (hopefully), ease the transition from non-hierarchical to hierarchical models, and allow students to build and fit models with more confidence about what's going on.
19 | 
20 | #### Priorities
21 | 
22 | - linear regression with `lm`
23 | - intercepts, "categorical" effects
24 | - varying model structure to estimate effects and standard errors
25 | - interactions as variation in slope estimates for different groups
26 | - centering input variables and interpreting resulting parameters
27 | - assumptions and unarticulated priors
28 | - understanding residual variance (Gaussian)
29 | - understanding all of the above graphically
30 | 
31 | #### Optional
32 | 
33 | - understanding and plotting output of lm
34 | - notation and linear algebra review: $X\beta$
35 | 
36 | #### Reading
37 | 
38 | Schielzeth, H. 2010. Simple means to improve the interpretability of regression coefficients. Methods in Ecology and Evolution 1:103–113.
39 | 
40 | Enqvist, L. 2005. The mistreatment of covariate interaction terms in linear model analyses of behavioural and evolutionary ecology studies. Animal Behaviour 70:967–971.
41 | 
42 | Chapters from Gelman and Hill on linear modeling
43 | 
44 | ### (2). Probability and Distributions
45 | 
46 | I'm not convinced that this material should be a class on its own in a short modeling-based course, and I wonder whether it could be spread out among other class periods where necessary.
47 | 
48 | - What is probability?
49 | - joint probabilities
50 | - independence
51 | - conditioning
52 | - marginalization
53 | - factorization & likelihood (tied to independence)
54 | - PDFs vs. PMFs
55 | - Normal
56 | - Binomial
57 | - Uniform
58 | - Cauchy
59 | - Poisson
60 | 
61 | 
62 | ### 2. Fitting linear models (Likelihood Approaches)
63 | 
64 | My thought here is to combine probability and likelihood functions directly with simple model fitting. This may or may not be too much for one class, but I worry that a class just on probability distributions and likelihood wouldn't be practical enough for a short course.
65 | 
66 | Hopefully students have used `lm` before, and we can get more into depth by approaching this problem from two new angles: optimization of the likelihood function and Bayesian approaches. I suspect this will be a good place to introduce Bayesian inference because the problem of regression should be familiar to most students.
67 | 
68 | #### Priorities
69 | 
70 | - definition of likelihood
71 | - single parameter models: MLE and optim
72 | - model of the mean with unknown variance
73 | - fitting simple linear models with likelihood (glm with gaussian link?)
74 | - assumptions and inference
75 | - separation (as a preview of why priors are nice...)
76 | 
77 | #### Optional
78 | 
79 | - restricted maximum likelihood vs. maximum likelihood
80 | 
81 | 
82 | ### 3. Fitting linear models (Bayesian Approaches)
83 | 
84 | #### Priorities
85 | 
86 | - Bayes' theorem and Bayesian probability
87 | - relationship between likelihood and Bayesian inference
88 | - priors (generally, informative vs. non-informative)
89 | - interpreting posterior distributions (mean, mode, intervals)
90 | - intro to Bayesian computation and MCMC
91 | - single parameter models: MLE vs. Bayesian treatments
92 | - Bayesian linear regression: intro to Stan & JAGS
93 | 
94 | #### Optional
95 | 
96 | - comparing posterior distributions for different parameter estimates (i.e.
are they "significantly" different)
97 | - proper vs. improper priors
98 | 
99 | ### 4. Generalized Linear models (Poisson models)
100 | 
101 | My preference is to teach Poisson models first. I (personally) prefer to start with Poisson because (1) IMO log links are more intuitive than logit (probit, etc.) links, (2) latent variables remain between zero and infinity, and (3) IMO overdispersion is easier to understand here than in binomial models.
102 | 
103 | #### Priorities
104 | 
105 | - non-gaussian data (counts, proportions, binary, exponential)
106 | - link functions and latent variables
107 | - Poisson distribution
108 | - log link as a map (what are we actually estimating -- mean or rate)
109 | - understanding effect sizes in Poisson models
110 | - dependence of mean and variance in non-Gaussian models
111 | - overdispersion: quasi-Poisson and negative-binomial
112 | - overdispersion as unaccounted-for variation (maybe a simple example -- sex effects)
113 | - implementation with `glm`, Stan, JAGS
114 | - graphical displays
115 | - model checking
116 | 
117 | 
118 | #### Optional
119 | - simulation of data & parameter recovery
120 | 
121 | 
122 | ### 5. Binomial models
123 | 
124 | Here, the students will continue to use a combination of methods for implementation. Key points to take away from this section include the properties/behavior of binomial models, ways to check binomial models, and a hint that Bayesian approaches are going to be more flexible. The binomial-Poisson hierarchical model is a classic that should reinforce the notion that Bayesian approaches will generally be easier for more complex examples.
125 | 
126 | #### Priorities
127 | 
128 | - binomial distribution (relationship between mean and variance)
129 | - logit link as a map
130 | - proportion vs. binary models (will help with understanding hierarchical models later)
131 | - implementation with `glm`
132 | - overdispersion in proportion models and understanding the difference between individual and group level probabilities
133 | - implementation with Stan, JAGS
134 | - hierarchical model number 1: occupancy model (a classic) (maybe, or we could do it later)
135 | - review marginalization
136 | - graphical displays
137 | - model checking
138 | 
139 | #### Optional
140 | 
141 | - simulation of data & parameter recovery
142 | 
143 | 
144 | 
145 | ### 6. Intro to hierarchical models (Part I - Partial pooling and likelihood)
146 | 
147 | The main dish. I'd like to avoid a recipe-based approach where we discuss varying intercept and varying slope models as primary objectives. Instead, I think it's important to cover these topics as special cases of the general approach of hierarchical modeling as a means to impose probabilistic structures on parameters. From that perspective, students should be able to better extend these methods for their own work.
148 | 
149 | #### Priorities
150 | 
151 | - definition
152 | - review previous examples
153 | - hyperparameters (they've always been there even when we don't acknowledge them)
154 | - varying intercepts (NBA freethrow example) with `lme4`
155 | - partial pooling
156 | - clearing up confusion about nestedness
157 | - simple hierarchical models with likelihood
158 | - continuous predictors for multiple levels
159 | 
160 | #### Optional
161 | 
162 | - plotting estimates for different levels from lme4 models
163 | 
164 | 
165 | ### 7.
Intro to hierarchical models (Part II - Bayesian)
166 | 
167 | #### Priorities
168 | 
169 | - varying intercepts (NBA freethrow example) with `Stan`
170 | - hierarchical models in Stan
171 | - highlight Bayesian connection to priors
172 | - classic examples:
173 |     - hierarchical model number 1: occupancy model (a classic)
174 |     - hierarchical model: binomial-Poisson hierarchy (e.g. # eggs laid & survival)
175 | - introduction to the multivariate normal distribution
176 | - priors for hierarchical variance parameters
177 | - prediction (new vs. observed groups)
178 | - priors
179 | - note crossing of the 'ease' threshold (?)
180 | 
181 | 
182 | #### Optional
183 | 
184 | - posterior prediction
185 | - basic Bayesian models in MCMCglmm
186 | - random effects, fixed effects, mixed effects models as special instances of hierarchical linear models
187 | 
188 | #### Reading
189 | 
190 | Gelman, A., J. Hill, and M. Yajima. 2012. Why We (Usually) Don't Have to Worry About Multiple Comparisons. Journal of Research on Educational Effectiveness 5:189–211.
191 | Gelman and Hill discussion of random effects terminology (very good)
192 | 
193 | ### 8. Hierarchical model construction
194 | 
195 | This is where I think we will have the greatest impact on students' future work. Translating problems to models is a key skill, and it may take a fair bit of practice. Useful tools include graphical skills (e.g. drawing DAGs) and familiarity with probability distributions.
196 | 
197 | - parameter, process, and observation models
198 | - building complexity from simple pieces
199 | - translating biological problems and observations to models
200 |     - example: what method works best for detecting a species?
201 |     - example: error in variables models
202 | - more practice in developing models (don't necessarily have to implement)
203 | 
204 | 
205 | ### 9. Comparing between models
206 | 
207 | I envision this occurring a bit more ad hoc during the second half, as students start to build their own models.
208 | 
209 | - start with simpler models and build (may be counterintuitive for those used to step-down procedures)
210 | - posterior prediction
211 | - DIC, WAIC
212 | - cross-validation
213 | - simulated data
214 | 
215 | #### Reading
216 | 
217 | Hooten, M. B., and N. T. Hobbs. 2015. A guide to Bayesian model selection for ecologists. Ecological Monographs 85:3–28.
218 | 
219 | ### 10. Student projects
220 | 
221 | This will help to make the class more valuable for the students, who doubtless have a ton of their own work to do on top of coursework.
222 | 
223 | - students develop models for their own systems
224 | - student presentations on their projects
225 | 
--------------------------------------------------------------------------------
/example_solutions/1-linear_models/wk1_solutions.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: 'Week 1 assignment: linear models'
3 | author: "Example solutions"
4 | date: "January 15, 2015"
5 | output: pdf_document
6 | ---
7 | 
8 | # Loading the relevant R packages
9 | 
10 | To complete these problems, you'll need some R packages loaded and ready to go.
11 | We suggest starting with a package for plotting (ggplot2), and potentially some packages for manipulating data frames (dplyr and tidyr), depending on how you prefer to use R.
12 | 
13 | ```{r, message=FALSE}
14 | library(ggplot2)
15 | library(dplyr)
16 | library(tidyr)
17 | ```
18 | 
19 | # Problem 1
20 | 
21 | You've discovered a single adult of a new species of amniote.
22 | Though you don't know the sex, you're worried that it might lay eggs soon and you want to build something to protect them.
23 | To do so, you need to predict the mass of this new creature's eggs.
24 | You first weigh your amniote and find that it weighs 3000 grams.
25 | Luckily, some researchers have recently published a full database of amniote life history traits, which can give you some information about how amniote adult mass relates to amniote egg mass ([Myhrvold et al. (2015)](http://www.esajournals.org/doi/abs/10.1890/15-0846R.1)).
26 | 
27 | ### Loading the data
28 | 
29 | Your first task will be to acquire the published dataset, which is available [here](http://www.esapubs.org/archive/ecol/E096/269/Data_Files/Amniote_Database_Aug_2015.csv).
30 | Download the data file `Amniote_Database_Aug_2015.csv` and save it in a location that makes sense (e.g., `~/hmods/class1/data/`).
31 | Then, you'll need to load the data with the `read.csv()` function.
32 | Do not use `file.choose()`.
33 | 
34 | ```{r}
35 | dat <- read.csv("../data/Amniote_Database_Aug_2015.csv")
36 | ```
37 | 
38 | ### Preparing the data
39 | 
40 | In this dataset, missing values are coded as `-999`.
41 | We want to replace these values with `NA`, which indicates missing values to R.
42 | 
43 | ```{r}
44 | dat[dat == -999] <- NA
45 | ```
46 | 
47 | ### Visualizing the data
48 | 
49 | We want to predict egg mass from adult body mass.
50 | Visualize the relationship between these two variables below.
51 | Transformation will be helpful for both variables.
52 | 
53 | ```{r}
54 | # remove points without the data we want
55 | dat_use <- dat %>%
56 |   filter(!is.na(adult_body_mass_g) & !is.na(egg_mass_g))
57 | 
58 | # plot the raw data
59 | ggplot(dat_use, aes(adult_body_mass_g, egg_mass_g)) +
60 |   geom_point(shape = 1) # plots data as points
61 | 
62 | # plot transformed
63 | ggplot(dat_use, aes(adult_body_mass_g, egg_mass_g)) +
64 |   geom_point(shape = 1) +
65 |   xlab('Adult mass') + # specifies axis labels
66 |   ylab('Egg mass') +
67 |   scale_y_log10() +
68 |   scale_x_log10()
69 | ```
70 | 
71 | 
72 | ### Modeling the data
73 | 
74 | Use the `lm()` function to construct a linear model that could be used to predict egg mass based on adult body mass.
75 | (Hint: what other kinds of data transformations might be helpful prior to fitting the model?)
76 | 
77 | ```{r}
78 | # create a log transformed input variable
79 | dat_use <- dat_use %>%
80 |   mutate(lmass = log(adult_body_mass_g),
81 |          legg = log(egg_mass_g))
82 | 
83 | # fit the model
84 | model <- lm(legg ~ lmass, data = dat_use)
85 | summary(model)
86 | # note that we could center lmass via scale(lmass), but not necessary here
87 | ```
88 | 
89 | Evaluate the homoscedasticity and normality assumptions graphically (e.g., `plot(mymodel)`).
90 | 
91 | ```{r, fig.width=8, fig.height=6}
92 | par(mfrow=c(2, 2))
93 | plot(model)
94 | ```
95 | 
96 | Are the assumptions met?
97 | Why or why not?
98 | 
99 | *The downward curve in the Q-Q plot shows that the residuals are becoming consistently negative for larger values of adult mass. This suggests that we might benefit from adding a polynomial (i.e. squared) term to the model, though predictions for most values of adult mass will likely be pretty good. Nothing else is overly concerning, though the Q-Q plot is a bit wonky on the ends.*
100 | 
101 | ### Understanding the model
102 | 
103 | Produce a scatterplot as before, but this time add a trendline that represents the expected value of the response as a function of the predictor.
104 | 
105 | ```{r}
106 | # plot with best fit linear trendline
107 | ggplot(dat_use, aes(lmass, legg)) +
108 |   geom_point(shape = 1) +
109 |   geom_smooth(method = "lm") + # adds line via lm()
110 |   labs(x = "log(adult mass)", y = "log(egg mass)")
111 | 
112 | # plot with polynomial trendline
113 | ggplot(dat_use, aes(lmass, legg)) +
114 |   geom_point(shape = 1) +
115 |   geom_smooth(method = "lm", formula = y ~ poly(x, 2)) +
116 |   labs(x = "log(adult mass)", y = "log(egg mass)")
117 | # note that geom_smooth is now calling lm() with a quadratic
118 | # polynomial in log adult body mass in the formula
119 | ```
120 | 
121 | Make an image plot of the design matrix for your model (e.g., `image(t(model.matrix(m)))`):
122 | 
123 | ```{r}
124 | par(mfrow=c(1, 1))
125 | # below, I use piping to avoid "onion" code
126 | # but the result is the same as image(t(model.matrix(model)))
127 | model %>%
128 |   model.matrix() %>%
129 |   t() %>%
130 |   image()
131 | ```
132 | 
133 | Why does this image plot look the way it does, and what is the result of multiplying the design matrix by the vector of estimated coefficients (e.g., `model.matrix(m) %*% coef(m)`)?
134 | 
135 | *The first column in the matrix is all ones, and corresponds to the intercept (or global mean egg mass). The second column contains the adult mass values for each individual. Multiplying the model matrix by the estimated model coefficients yields the predicted values for each individual, i.e. the linear predictor.*
136 | 
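A quick check makes that last point concrete (purely illustrative; which rows `head()` shows is arbitrary):

```{r}
# multiplying the design matrix by the coefficient vector reproduces
# the fitted values (the linear predictor, on the log scale)
head(model.matrix(model) %*% coef(model))
head(fitted(model)) # the same numbers
```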
137 | ### Predicting egg mass for the new critter
138 | 
139 | Predict the egg mass for the new species and provide upper and lower bounds on this estimate, in units of grams.
140 | Remember that the interval should incorporate both *predictive* uncertainty (the error term in the model) and inferential uncertainty (uncertainty about the coefficients and the amount of residual error).
141 | (Hint: there's a built-in R function that should help generate prediction intervals)
142 | 
143 | ```{r}
144 | # predict the new value
145 | new_logmass <- log(3000) # model uses log values
146 | 
147 | # predict() is a useful function (see ?predict for syntax)
148 | pred <- predict(model, data.frame(lmass = new_logmass),
149 |                 se.fit=TRUE, interval = 'prediction')
150 | 
151 | # convert prediction from log grams to grams scale
152 | exp(pred$fit)
153 | ```
154 | 
155 | # Problem 2
156 | 
157 | A week later, you are told that the critter has been identified to be in the class Reptilia.
158 | Use this new information to update your prediction in the code block below, commenting your code to document your thought process.
159 | 
160 | ```{r}
161 | # how many classes are there?
162 | table(dat_use$class)
163 | 
164 | # does the relationship between adult mass and egg mass differ for the two classes?
165 | ggplot(dat_use, aes(lmass, legg, color = class)) +
166 |   geom_point(shape = 1) +
167 |   stat_smooth(method = "lm")
168 | 
169 | # fit a new model
170 | model2 <- lm(legg ~ lmass * class, data = dat_use)
171 | summary(model2)
172 | ```
173 | 
174 | ```{r, fig.width=8, fig.height=6}
175 | # evaluate diagnostic plots
176 | par(mfrow=c(2, 2))
177 | plot(model2)
178 | ```
179 | 
180 | ```{r}
181 | # new prediction
182 | pred2 <- predict(model2, data.frame(lmass = new_logmass, class = "Reptilia"),
183 |                  se.fit=TRUE, interval = 'prediction')
184 | 
185 | # convert to grams
186 | exp(pred2$fit)
187 | 
188 | # compare on the natural scale (grams)
189 | round(rbind(exp(pred$fit), exp(pred2$fit)), digits = 1)
190 | # compare on the log scale
191 | round(rbind(pred$fit, pred2$fit), digits = 1)
192 | ```
193 | 
194 | How does your new prediction compare to your prediction from Problem 1 in terms of accuracy and precision?
195 | Is it lower or higher, and why?
196 | 
197 | *No idea about the accuracy (because we don't know what the actual underlying value is), but the prediction interval is a bit narrower, so the precision has increased a bit. The new prediction is lower, because the slope is different for reptiles vs. birds.*
198 | 
199 | # Problem 3
200 | 
201 | Myhrvold et al. just retracted all of the adult mass data from their data set, and have advised researchers to stop using the existing adult mass data until further notice!
202 | Given this new development, what's your best prediction for the critter's egg mass?
203 | Update your prediction in the block below, commenting the code as necessary.
204 | 
205 | ```{r}
206 | # our best guess will be to use the mean egg size for Reptilia,
207 | # which we can estimate using a linear model
208 | model3 <- lm(legg ~ 0 + class, data = dat_use)
209 | 
210 | # prediction interval
211 | pred3 <- predict(model3, data.frame(class = "Reptilia"),
212 |                  se.fit=TRUE, interval = 'prediction')
213 | 
214 | # compare to the previous fits
215 | round(rbind(exp(pred$fit), exp(pred2$fit), exp(pred3$fit)), digits = 1)
216 | ```
217 | 
218 | *Ouch! Our predictive ability really takes quite a hit when we don't have any information about adult body mass!*
219 | 
220 | # Bonus Problem (optional)
221 | 
222 | When predicting the egg mass value for your unknown amniote, you probably used a built-in function in R (i.e. *predict*) to automatically generate prediction intervals.
223 | Can you generate prediction intervals from the model without resorting to a built-in *predict* function?
224 | (Hint 1: how many parameters were estimated?)
225 | (Hint 2: can prediction intervals be simulated?)
226 | (Hint 3: check out chapter 7 of Gelman and Hill if you have it)
227 | 
228 | ```{r}
229 | # first we need to use the uncertainty in the fitted parameters to simulate a
230 | # range of new parameter values consistent with the data we have.
231 | 
232 | # to do so we will use the sim() function in the 'arm' package
233 | 
234 | # load the package
235 | library(arm)
236 | 
237 | # simulate new values for each parameter using sim()
238 | n.sims <- 10000
239 | sim.1 <- sim(model, n.sims)
240 | sim_params <- data.frame(alpha = sim.1@coef[, 1], beta = sim.1@coef[, 2],
241 |                          sigma = sim.1@sigma)
242 | 
243 | # what does sim_params look like?
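# (note, added for clarity: sim() draws coefficient vectors and residual
# sds from their approximate joint posterior, so each row of sim_params
# is one plausible version of the fitted model)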
244 | head(sim_params)
245 | # okay, so it's generated a three-column data frame with one row per simulation
246 | # and one column per fitted parameter in the model
247 | 
248 | ## calculate prediction for our critter for each simulation
249 | new_preds <- rnorm(n = n.sims,
250 |                    mean = sim_params$alpha + sim_params$beta * new_logmass,
251 |                    sd = sim_params$sigma)
252 | 
253 | # what does the simulated distribution of predicted values look like for our
254 | # new creature?
255 | qplot(new_preds, geom = "histogram", binwidth = 0.2)
256 | 
257 | # calculate new 95% prediction intervals
258 | pred4 <- data.frame(fit = mean(new_preds),
259 |                     lwr = quantile(new_preds, probs = 0.025),
260 |                     upr = quantile(new_preds, probs = 0.975),
261 |                     row.names = "simulated prediction")
262 | 
263 | # how does it compare to our original *predict()* prediction?
264 | round(rbind(pred$fit, pred4), digits = 3)
265 | # nice!
266 | ```
267 | 
--------------------------------------------------------------------------------
/example_solutions/5-binomial/wk5_solutions.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: 'Week 5 assignment: Binomial models'
3 | author: "Example solutions"
4 | date: "`r format(Sys.time(), '%d %B, %Y')`"
5 | output: pdf_document
6 | ---
7 | 
8 | About one out of eight women in the U.S. will develop breast cancer at some point in her lifetime.
9 | Early diagnoses help with treatment of this potentially fatal disease, and these diagnoses can be made based on a variety of cytological metrics evaluated via biopsy.
10 | Your job today is to develop a model that classifies tumors as malignant or benign based on these metrics.
11 | The student(s) with the most predictive model will get a prize.
12 | 
13 | The data are in the `breast_cancer.csv` file.
14 | Details for this dataset can be found [on the UCI machine learning data repository](https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Original)), which is useful if you ever need data to play with.
15 | I split the data into two groups at random: the *training* data, which you'll use to estimate parameters, and the *test* data, which we'll use to evaluate the predictive power of the model.
16 | There is a column in the data called `group`, which indicates whether an observation is part of the training or test set.
17 | 
18 | ## Data exploration
19 | 
20 | As usual, you will want to explore the data before constructing any statistical models.
21 | Only explore the training data, and do not use the test data for data exploration/visualization.
22 | We will pretend that we don't have access to the test data yet.
23 | 
24 | ```{r, message = FALSE, fig.width = 12, fig.height = 7}
25 | d <- read.csv('data/breast_cancer.csv')
26 | library(dplyr)
27 | str(d)
28 | # center the continuous explanatory variables
29 | d[, 2:10] <- apply(d[, 2:10], 2, FUN = function(x) unlist(scale(x)))
30 | # create subsetted data frames for the training and test data
31 | d_train <- subset(d, group == 'train')
32 | d_test <- subset(d, group == 'test')
33 | par(mfrow=c(2, 5))
34 | for (i in 2:10){
35 |   plot(x = jitter(d_train[, i]),
36 |        y = jitter(d_train[, 'malignant']),
37 |        xlab = names(d_train)[i], ylab = 'malignant')
38 | }
39 | plot(malignant ~ cohort, data = d_train)
40 | ```
41 | 
42 | 
43 | ## Model structure
44 | 
45 | What is your model? Write it out in \LaTeX. Hint: you will want to use a design matrix.
46 | 
47 | $y_i \sim Bernoulli(p_i)$, with $log \big(\frac{p_i}{1 - p_i}\big) = X_i \beta$ (spelled out in the design matrix section below)
48 | 
49 | What is your Stan model statement?
50 | 
51 | ```
52 | data {
53 |   int n; // sample size
54 |   int p; // number of coefficients
55 |   matrix[n, p] X;
56 |   int y[n];
57 | }
58 | 
59 | parameters {
60 |   vector[p] beta;
61 | }
62 | 
63 | model {
64 |   beta ~ normal(0, 1);
65 |   y ~ bernoulli_logit(X * beta);
66 | }
67 | 
68 | ```
69 | 
70 | ## Building and understanding the design matrix
71 | 
72 | We mentioned that you would want to use a design matrix.
73 | Specifically, your model should be of the form:
74 | 
75 | $y \sim Bernoulli(p)$
76 | 
77 | And the probability of malignancy $p$ is modeled using a logit-link:
78 | 
79 | $log \Big(\dfrac{p}{1 - p} \Big) = X \beta$
80 | 
81 | The design matrix $X$ contains the tumor features, and also dictates the interpretation of the coefficients $\beta$.
82 | In the code block below, construct your design matrix, creating an object called `X`.
83 | The included code will make an image plot of your design matrix with a horrendous color scheme.
84 | Once you fill in your code, set the argument `eval = TRUE` inside of the curly braces at the beginning of the code chunk (this is a chunk option), otherwise the code chunk will not be evaluated when you're knitting your pdf.
85 | 
86 | ```{r}
87 | # define your design matrix below
88 | X <- model.matrix(~ 0 + clump_thickness +
89 |                     size_uniformity +
90 |                     shape_uniformity +
91 |                     marginal_adhesion +
92 |                     epithelial_size +
93 |                     bare_nuclei +
94 |                     bland_chromatin +
95 |                     normal_nucleoli +
96 |                     mitoses +
97 |                     cohort,
98 |                   data = d_train)
99 | 
100 | # the code below will plot your design matrix
101 | library(reshape2)
102 | library(ggplot2)
103 | mX <- melt(X)
104 | ggplot(mX, aes(x = Var2, y = Var1)) +
105 |   geom_raster(aes(fill = value)) +
106 |   scale_y_reverse() +
107 |   xlab('Design matrix column') +
108 |   ylab('Design matrix row') +
109 |   scale_fill_gradientn(colors = rainbow(20)) +
110 |   theme_minimal() +
111 |   theme(axis.text.x = element_text(angle=90))
112 | ```
113 | 
114 | For each column of $X$ you will get a coefficient, one element in $\beta$.
115 | For instance, the coefficient $\beta_1$ will be associated with the first column in $X$, which we might denote $X[, 1]$, to borrow some R syntax.
116 | There's no sense in estimating parameters if you don't know what they mean (Abraham Lincoln said that), so below, list each element in $\beta$ and briefly describe what it represents/how you would interpret it:
117 | 
118 | 1. $\beta_1$ represents the increase in the logit probability of malignance for an increase of one standard deviation in clump thickness
119 | 
120 | 2. $\beta_2$ represents the increase in the logit probability of malignance for an increase of one standard deviation in size uniformity
121 | 
122 | 3. $\beta_3$ represents the increase in the logit probability of malignance for an increase of one standard deviation in shape uniformity
123 | 
124 | 4. $\beta_4$ represents the increase in the logit probability of malignance for an increase of one standard deviation in marginal adhesion
125 | 
126 | 5. $\beta_5$ represents the increase in the logit probability of malignance for an increase of one standard deviation in epithelial size
127 | 
128 | 6. $\beta_6$ represents the increase in the logit probability of malignance for an increase of one standard deviation in bare nuclei
129 | 
130 | 7. $\beta_7$ represents the increase in the logit probability of malignance for an increase of one standard deviation in bland chromatin
131 | 
132 | 8.
$\beta_8$ represents the increase in the logit probability of malignance for an increase of one standard deviation in normal nucleoli
133 | 
134 | 9. $\beta_9$ represents the increase in the logit probability of malignance for an increase of one standard deviation in mitoses
135 | 
136 | The remaining columns (10 through 17) are group-level intercepts, whose coefficients will represent the logit probability of malignance for an average tumor in each cohort.
137 | 
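As a quick interpretation aid (a sketch; the -1.2 value is invented, not estimated from these data):

```{r}
# logit-scale quantities can be mapped back to probabilities with plogis();
# e.g., a hypothetical group-level intercept of -1.2 would imply
plogis(-1.2) # about a 0.23 probability of malignance for an average tumor
```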
138 | ## Parameter estimation
139 | 
140 | Use the **training** data to estimate your model's parameters (`group == 'train'`).
141 | Do not use the **test** data yet.
142 | Make sure that the MCMC algorithm has converged before moving forward.
143 | 
144 | ```{r, message = FALSE}
145 | library(rstan)
146 | rstan_options(auto_write = TRUE)
147 | stan_d <- list(n = nrow(X),
148 |                p = ncol(X),
149 |                X = X,
150 |                y = d_train$malignant)
151 | m_fit <- stan('bernoulli_glm.stan',
152 |               data = stan_d, cores = 2)
153 | m_fit
154 | traceplot(m_fit, inc_warmup = TRUE, 'beta')
155 | ```
156 | 
157 | 
158 | ## Out of sample predictive power
159 | 
160 | One measure of a model's ability to predict new data is the log likelihood of new data, given the parameters of the model $[\tilde{y} \mid \theta]$, where $\tilde{y}$ is the new data (the **test** or **validation** data), and the parameters $\theta$ have been estimated from other data (e.g., the **training** data).
161 | 
162 | Hints:
163 | 
164 | - this is done most easily via a new design matrix $X_{test}$, which can be multiplied by the vector of model parameters, and must be declared in the `data` block
165 | - make sure that if you used any feature scaling or centering in the training data, the exact same scaling/centering schemes are applied to the test set
166 | - you'll use the `generated quantities` block to calculate the log-likelihood of the test data
167 | - you can obtain the joint log likelihood with the `bernoulli_logit_log` function in Stan, and I wrote a generated quantities model block for you below, which should be the last block in your new Stan model statement
168 | 
169 | What is your updated Stan model?
170 | 
171 | ```
172 | data {
173 |   int n; // sample size
174 |   int p; // number of coefficients
175 |   matrix[n, p] X;
176 |   int y[n];
177 |   int n_test;
178 |   int y_test[n_test];
179 |   matrix[n_test, p] X_test;
180 | }
181 | 
182 | parameters {
183 |   vector[p] beta;
184 | }
185 | 
186 | model {
187 |   beta ~ normal(0, 1);
188 |   y ~ bernoulli_logit(X * beta);
189 | }
190 | 
191 | 
192 | generated quantities {
193 |   // I wrote this section for you as a hint
194 |   real loglik_test;
195 |   vector[n_test] logit_p_test;
196 | 
197 |   logit_p_test <- X_test * beta;
198 |   loglik_test <- bernoulli_logit_log(y_test, logit_p_test);
199 |   // returns the sum of the log likelihoods (the joint log-likelihood)
200 | }
201 | 
202 | ```
203 | 
204 | Acquire the posterior distribution of the model parameters and the holdout log likelihood.
205 | 
206 | ```{r}
207 | X_test <- model.matrix(~ 0 + clump_thickness +
208 |                          size_uniformity +
209 |                          shape_uniformity +
210 |                          marginal_adhesion +
211 |                          epithelial_size +
212 |                          bare_nuclei +
213 |                          bland_chromatin +
214 |                          normal_nucleoli +
215 |                          mitoses +
216 |                          cohort,
217 |                        data = d_test)
218 | 
219 | stan_d <- list(n = nrow(X),
220 |                p = ncol(X),
221 |                X = X,
222 |                y = d_train$malignant,
223 |                n_test = nrow(X_test),
224 |                y_test = d_test$malignant,
225 |                X_test = X_test)
226 | m_fit <- stan('bernoulli_glm_test.stan',
227 |               data = stan_d, cores = 2)
228 | print(m_fit, pars = c('beta', 'loglik_test'))
229 | traceplot(m_fit, inc_warmup = TRUE, c('beta', 'loglik_test'))
230 | ```
231 | 
232 | Make a histogram of the holdout log likelihood and report the posterior mean along with a 95% credible interval.
233 | 
234 | ```{r}
235 | post <- rstan::extract(m_fit)
236 | par(mfrow=c(1, 1))
237 | hist(post$loglik_test, breaks=40)
238 | c(mean = mean(post$loglik_test), quantile(post$loglik_test, c(0.025, 0.975)))
239 | ```
240 | 
241 | 
242 | ## Showing predictions
243 | 
244 | The whole point of building this model is to diagnose whether a tumor is malignant based on some features.
245 | Plot the posterior probability of tumor malignance for each holdout tumor, and show the true tumor status in the same graph.
246 | Multiple graph types are possible here, but we do not recommend simply copying and pasting code from another example (so far about a quarter of plots made in this way have made sense).
247 | Instead, think hard about what sort of data display would be effective, and make that plot!
248 | 
249 | ```{r}
250 | library(reshape2)
251 | p_df <- melt(post$logit_p_test, varnames = c('iter', 'obs'))
252 | 
253 | subset(d, group == 'test') %>%
254 |   mutate(obs = 1:n()) %>%
255 |   full_join(p_df) %>%
256 |   ggplot(aes(x = value,
257 |              group = obs,
258 |              fill = factor(malignant))) +
259 |   geom_density(alpha = .1) +
260 |   facet_wrap(~ cohort, nrow = 2) +
261 |   theme_bw() +
262 |   xlab('Predicted logit probability of tumor malignance') +
263 |   ylab('Posterior density') +
264 |   theme(legend.position = "top")
265 | ```
266 | 
--------------------------------------------------------------------------------
/slides/6-shrinkage/wk6_slides.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Hierarchical models: 1"
3 | fontsize: 7pt
4 | output:
5 |   beamer_presentation:
6 |     colortheme: "spruce"
7 |     fonttheme: "structurebold"
8 |     latex_engine: xelatex
9 | header-includes:
10 |   - \usepackage{listings}
11 |   - \lstset{basicstyle=\small}
12 |   - \setmonofont[Scale=MatchLowercase]{Courier}
13 |   - \setmonofont[Scale=0.8]{Courier}
14 | ---
15 | 
16 | ## So you're having a hard time choosing priors...
17 | 
18 | ![](confused.jpg)
19 | 
20 | ## So you're having a hard time choosing priors...
21 | 
22 | - Not surprising!
23 | 
24 | - Takes practice
25 | 
26 | 
27 | ## Useful tips for prior selection
28 | 
29 | 1. Any constraints on parameter?
30 | 
31 |     - variance parameters: $\sigma > 0$
32 |     - probabilities: $0 \leq p \leq 1$
33 |     - correlations: $-1 \leq \rho \leq 1$
34 | 
35 | ## Useful tips for prior selection
36 | 
37 | 1. Any constraints on parameter?
38 | 
39 | 2.
Prior predictive distribution: 40 | 41 | $$[y]$$ 42 | 43 | ## Review: posterior predictive distribution 44 | 45 | Distribution of predicted data, given the observations 46 | 47 | $$[\tilde{y} \mid y]$$ 48 | 49 | **Concept:** 50 | 51 | For a *good* model, predicted data resembles the real data 52 | 53 | 54 | ## Prior predictive distribution 55 | 56 | Distribution of predicted data, given your priors 57 | 58 | $$[y]$$ 59 | 60 | **Concept:** 61 | 62 | For *good* priors, predicted data resembles your expectations for the data 63 | 64 | 65 | ## Prior predictive distribution simulations 66 | 67 | 1. Simulate parameter draws from prior 68 | 69 | 2. Simulate data using these parameters 70 | 71 | * how different from posterior predictive simulation? 72 | 73 | ## Useful tips for prior selection 74 | 75 | 1. Constraints 76 | 77 | 2. Prior predictive distribution 78 | 79 | 3. Expert recommendations [https://github.com/stan-dev/stan/wiki/Prior-Choice-Recommendations](https://github.com/stan-dev/stan/wiki/Prior-Choice-Recommendations) 80 | 81 | ## Useful tips for prior selection 82 | 83 | 1. Constraints 84 | 85 | 2. Prior predictive distribution 86 | 87 | 3. Expert recommendations 88 | 89 | 4. Treat the prior parameters as unknown! 90 | 91 | - aka use a hierarchical model 92 | 93 | ## Hierarchical models: why bother? 94 | 95 | ![](zuur.jpg) 96 | 97 | 98 | ## Gall wasp example 99 | 100 | **Goal**: Estimate mean number of wasps for each location 101 | 102 | 1. Sample locations $j=1,..., J$ 103 | 104 | 2. Sample galls at each location 105 | 106 | 3. Gall $i$ is from site $j$ 107 | 108 | 109 | ## The data 110 | 111 | ```{r, echo = FALSE, message = FALSE} 112 | library(reshape2) 113 | library(ggplot2) 114 | library(dplyr) 115 | d <- read.csv('cleaned_galls.csv') 116 | ggplot(d, aes(x=n_cynip, y = gall_locality)) + 117 | geom_count() + 118 | xlab('Number of wasps emerging') + 119 | ylab('Location') 120 | ``` 121 | 122 | 123 | ## Sample sizes by location 124 | 125 | ```{r, echo = FALSE} 126 | d %>% 127 | group_by(gall_locality) %>% 128 | summarize(n = n()) %>% 129 | ggplot(aes(x = n, y = reorder(gall_locality, -n))) + 130 | geom_point() + 131 | xlab('Sample size') + 132 | ylab('Location') 133 | ``` 134 | 135 | 136 | ## Two extreme choices to estimate means 137 | 138 | 1. Complete pooling: all locations are the same 139 | 140 | 2. 
No pooling: locations have different means 141 | 142 | ## Complete pooling 143 | 144 | ```{r} 145 | complete_pool <- glm(n_cynip ~ 1, 146 | data = d, family = poisson) 147 | ``` 148 | 149 | $$y_i \sim Poisson(\lambda)$$ 150 | 151 | $$log(\lambda) = \beta_0$$ 152 | 153 | ## Complete pooling 154 | 155 | ```{r, echo = FALSE} 156 | ggplot(d, aes(x=n_cynip, y = gall_locality)) + 157 | geom_count() + 158 | xlab('Number of wasps emerging') + 159 | ylab('Location') + 160 | geom_vline(xintercept = exp(coef(complete_pool)), 161 | col = 2, linetype = 'dashed') 162 | ``` 163 | 164 | 165 | ## No pooling: locations different and independent 166 | 167 | ```{r} 168 | no_pool <- glm(n_cynip ~ 0 + gall_locality, 169 | data = d, family = poisson) 170 | ``` 171 | 172 | $$y_i \sim Poisson(\lambda_i)$$ 173 | 174 | $$log(\lambda_i) = \beta_{j[i]}$$ 175 | 176 | 177 | ## No pooling 178 | 179 | ```{r, echo = FALSE} 180 | np_df <- data.frame(gall_locality = sort(levels(d$gall_locality)), 181 | n_cynip = exp(coef(no_pool))) 182 | ggplot(d, aes(x=n_cynip, y = gall_locality)) + 183 | geom_point(data = np_df, col = 'red', shape = 19, size = 6) + 184 | geom_count() + 185 | xlab('Number of wasps emerging') + 186 | ylab('Location') 187 | ``` 188 | 189 | 190 | ## Uncertainty and sample size 191 | 192 | ```{r, message = FALSE, echo = FALSE, warning=FALSE} 193 | cis <- confint(no_pool) 194 | np_df$lo <- exp(cis[, 1]) 195 | np_df$hi <- exp(cis[, 2]) 196 | 197 | d %>% 198 | group_by(gall_locality) %>% 199 | summarize(n = n()) %>% 200 | full_join(np_df) %>% 201 | ggplot(aes(x = n, y = n_cynip)) + 202 | geom_point() + 203 | geom_segment(aes(x = n, xend = n, 204 | y = lo, yend = hi)) + 205 | xlab('Sample size') + 206 | ylab('Estimated mean number of wasps') + 207 | scale_x_log10() 208 | ``` 209 | 210 | 211 | ## Which estimates do we trust? 212 | 213 | ```{r, echo = FALSE, message = FALSE} 214 | d %>% 215 | group_by(gall_locality) %>% 216 | summarize(n = n()) %>% 217 | full_join(np_df) %>% 218 | ggplot(aes(x = n, y = n_cynip)) + 219 | geom_point() + 220 | geom_segment(aes(x = n, xend = n, 221 | y = lo, yend = hi)) + 222 | xlab('Sample size') + 223 | ylab('Estimated mean number of wasps') + 224 | scale_x_log10() 225 | ``` 226 | 227 | 228 | ## How can we improve estimates with small $n$? 229 | 230 | ```{r, echo = FALSE, message = FALSE} 231 | d %>% 232 | group_by(gall_locality) %>% 233 | summarize(n = n()) %>% 234 | full_join(np_df) %>% 235 | ggplot(aes(x = n, y = n_cynip)) + 236 | geom_point() + 237 | geom_segment(aes(x = n, xend = n, 238 | y = lo, yend = hi)) + 239 | xlab('Sample size') + 240 | ylab('Estimated mean number of wasps') + 241 | scale_x_log10() 242 | ``` 243 | 244 | 245 | ## Gall wasp hierarchical model 246 | 247 | $$y_i \sim \text{Poisson}(\lambda_i)$$ 248 | 249 | $$log(\lambda_i) = \alpha_0 + \alpha_{j[i]}$$ 250 | 251 | $$\alpha_j \sim Normal(0, \sigma_\alpha)$$ 252 | 253 | ## Parameter interpretation 254 | 255 | $$y_i \sim \text{Poisson}(\lambda_i)$$ 256 | 257 | $$log(\lambda_i) = \alpha_0 + \alpha_{j[i]}$$ 258 | 259 | $$\alpha_j \sim Normal(0, \sigma_\alpha)$$ 260 | 261 | ## Fitting a hierarchical model 262 | 263 | ```{r, message=FALSE} 264 | library(lme4) 265 | partial_pool <- glmer(n_cynip ~(1 | gall_locality), 266 | data = d, family = poisson) 267 | ``` 268 | 269 | ## Understanding the model object 270 | 271 | ```{r} 272 | partial_pool 273 | ``` 274 | 275 | 276 | ## Is this a Bayesian model? 
277 | 278 | $$y_i \sim \text{Poisson}(\lambda_i)$$ 279 | 280 | $$log(\lambda_i) = \alpha_0 + \alpha_{j[i]}$$ 281 | 282 | $$\alpha_j \sim Normal(0, \sigma_\alpha)$$ 283 | 284 | 285 | ## Comparing estimates: which estimates were shrunk? 286 | 287 | ```{r, echo = FALSE, message = FALSE} 288 | # create data frame to plot results 289 | n_locality <- length(unique(d$gall_locality)) 290 | shrink_df <- data.frame(gall_locality = rep(sort(unique(d$gall_locality)), 3), 291 | model = rep(c('1-complete_pooling', 292 | '3-no pooling', 293 | '2-partial pooling'), 294 | each = n_locality)) 295 | 296 | # add intercepts from model objects 297 | shrink_df$intercept[1:n_locality] <- coef(complete_pool) 298 | shrink_df$intercept[(1 + n_locality):(2 * n_locality)] <- coef(no_pool) 299 | shrink_df$intercept[(1 + 2 * n_locality):(3 * n_locality)] <- coef(partial_pool)$gall_locality[, 1] 300 | 301 | # make a data frame with the sample sizes 302 | shrink_df <- d %>% 303 | group_by(gall_locality) %>% 304 | summarize(n = n()) %>% 305 | right_join(shrink_df) 306 | 307 | 308 | # create a plot to illustrate the compromise between no pooling and partial pooling 309 | library(ggrepel) 310 | ggplot(shrink_df, aes(x = model, y = exp(intercept))) + 311 | theme_minimal() + 312 | geom_point(color = 'blue', size = 3) + 313 | geom_line(aes(group = gall_locality), color = 'blue', alpha = .6) + 314 | geom_text_repel(aes(label = paste('n =', n)), 315 | data = subset(shrink_df, model == '3-no pooling'), 316 | nudge_x = .3, size = 3) + 317 | ylab('Expected number of emerging wasps') + 318 | xlab('Model') 319 | ``` 320 | 321 | 322 | ## Bayesian connections 323 | 324 | $$y_i \sim \text{Poisson}(\lambda_i)$$ 325 | 326 | $$log(\lambda_i) = \alpha_{j[i]}$$ 327 | 328 | $$\alpha_j \sim Normal(\alpha_0, \sigma_\alpha)$$ 329 | 330 | 331 | ## Bayesian connections 332 | 333 | Estimated distribution of intercepts 334 | 335 | $$\alpha_j \sim Normal(\alpha_0, \sigma_\alpha)$$ 336 | 337 | ```{r, echo = FALSE} 338 | xvals <- seq(-2.5, 2, .001) 339 | alpha_sd <- VarCorr(partial_pool) %>% unlist() %>% sqrt() 340 | plot(xvals, dnorm(xvals, fixef(partial_pool), sd = alpha_sd), type = 'l', 341 | xlab = expression(paste('Varying intercept: ', alpha)), 342 | ylab = expression(paste('Estimated probability density: ', alpha)), 343 | col = 2) 344 | ``` 345 | 346 | 347 | ## Bayesian connections 348 | 349 | Estimated distribution of intercepts 350 | 351 | 352 | ```{r, echo = FALSE} 353 | plot(xvals, dnorm(xvals, fixef(partial_pool), sd = alpha_sd), type = 'l', 354 | xlab = expression(paste('Varying intercept: ', alpha)), 355 | ylab = expression(paste('Estimated probability density: ', alpha)), 356 | col = 2) 357 | text(x = fixef(partial_pool), y = 0.1, labels = expression(alpha[0]), 358 | col = 2, cex = 2) 359 | text(x = c(fixef(partial_pool) - alpha_sd, fixef(partial_pool) + alpha_sd), 360 | y = 0.05, 361 | labels = c(expression(alpha[0] - sigma[alpha]), 362 | expression(alpha[0] + sigma[alpha])), col = 2) 363 | rug(unlist(coef(partial_pool)), col = 4, lwd = 2) 364 | ``` 365 | 366 | 367 | ## Bayesian connections 368 | 369 | 370 | Complete pooling: $\sigma_\alpha \rightarrow 0$ 371 | 372 | ```{r, echo = FALSE} 373 | plot(xvals, dnorm(xvals, fixef(partial_pool), sd = alpha_sd), type = 'l', 374 | xlab = expression(paste('Varying intercept: ', alpha)), 375 | ylab = expression(paste('Estimated probability density: ', alpha)), 376 | col = 2) 377 | lines(xvals, dnorm(xvals, fixef(partial_pool), sd = 1E-4)) 378 | ``` 379 | 380 | 381 | ## Bayesian connections 382 | 
383 | No pooling: $\sigma_\alpha \rightarrow \infty$ 384 | 385 | ```{r, echo = FALSE} 386 | plot(xvals, dnorm(xvals, fixef(partial_pool), sd = alpha_sd), type = 'l', 387 | xlab = expression(paste('Varying intercept: ', alpha)), 388 | ylab = expression(paste('Estimated probability density: ', alpha)), 389 | col = 2) 390 | lines(xvals, dnorm(xvals, fixef(partial_pool), sd = 1E4)) 391 | ``` 392 | 393 | 394 | ## Partial pooling: a reasonable compromise 395 | 396 | Complete pooling: $\sigma_\alpha \rightarrow 0$ 397 | 398 | No pooling: $\sigma_\alpha \rightarrow \infty$ 399 | 400 | Partial pooling: $0 < \sigma_\alpha < \infty$ 401 | 402 | ## Hierarchical models 403 | 404 | Why bother? 405 | 406 | 1. Shrinkage & partial pooling 407 | - sharing information among groups 408 | 409 | 410 | ## Hierarchical models 411 | 412 | Why bother? 413 | 414 | 1. Shrinkage & partial pooling 415 | - sharing information among groups 416 | 417 | How many groups do we need to justify hierarchical modeling? 418 | 419 | 420 | ## Hierarchical models 421 | 422 | Why bother? 423 | 424 | 1. Shrinkage & partial pooling 425 | 426 | 2. Predictions for new groups 427 | 428 | ## This week 429 | 430 | Amniotes & free throws redux 431 | 432 | ![](pat.jpg) 433 | -------------------------------------------------------------------------------- /slides/8-bhm/wk8_slides.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Hierarchical model construction" 3 | fontsize: 7pt 4 | output: 5 | beamer_presentation: 6 | colortheme: "spruce" 7 | fonttheme: "structurebold" 8 | latex_engine: xelatex 9 | header-includes: 10 | - \usepackage{listings} 11 | - \lstset{basicstyle=\small} 12 | - \setmonofont[Scale=MatchLowercase]{Courier} 13 | - \setmonofont[Scale=0.8]{Courier} 14 | --- 15 | 16 | 17 | ```{r setup, include=FALSE} 18 | knitr::opts_chunk$set(echo = FALSE) 19 | par(bty='n') 20 | ``` 21 | 22 | # 23 | 24 | 1. Classic hierarchical Bayesian models 25 | 2. Gaussian process models 26 | 3. 
Projects
27 | 
28 | # Occupancy models
29 | 
30 | $$z_i \sim Bernoulli(\psi_i)$$
31 | 
32 | $$y_{ij} \sim Bernoulli(z_i p)$$
33 | 
34 | Sites $i = 1, ..., N$
35 | 
36 | Repeat visits $j = 1, ..., J$
37 | 
38 | # N-mixture models
39 | 
40 | $$N_i \sim Poisson(\lambda_i)$$
41 | 
42 | $$y_{ij} \sim Binomial(N_i, p)$$
43 | 
44 | # Error in variables models
45 | 
46 | $$y_i \sim Normal(\alpha + \beta \tilde{x}_i, \sigma_y)$$
47 | 
48 | $$x_i \sim Normal(\tilde{x}_i, \sigma_x)$$
49 | 
50 | # Example: modeling poop at ponds
51 | 
52 | $$y_i \sim Poisson(\mu_i)$$
53 | 
54 | $$\mu_i = \alpha_0 + log(\tilde{\pi}_i)$$
55 | 
56 | $$\pi_i \sim N(\tilde{\pi}_i, \sigma_\pi)$$
57 | 
58 | # Zero inflated Poisson
59 | 
60 | $$p(y_i|\theta,\lambda)
61 | =
62 | \begin{cases}
63 | \theta + (1 - \theta) Poisson(0 \mid \lambda) & \mbox{if } y_i = 0 \\
64 | (1 - \theta) Poisson(y_i \mid \lambda) & \mbox{if } y_i > 0
65 | \end{cases}$$
66 | 
67 | $\theta$: mixing parameter
68 | 
69 | # Zero inflated gamma
70 | 
71 | $$p(y_i|\theta,\lambda)
72 | =
73 | \begin{cases}
74 | \theta & \mbox{if } y_i = 0 \\
75 | (1 - \theta) Gamma(y_i \mid \alpha, \beta) & \mbox{if } y_i > 0
76 | \end{cases}$$
77 | 
78 | $\theta$: mixing parameter
79 | 
80 | # Beta glm
81 | 
82 | $$y_i \sim Beta(\alpha, \beta)$$
83 | 
84 | $$\alpha = \mu \phi$$
85 | 
86 | $$\beta = (1 - \mu) \phi$$
87 | 
88 | $$logit(\mu) = X \beta$$
89 | 
90 | # Hierarchical Bayesian structural equation models
91 | 
92 | ![](fig2.pdf)
93 | 
94 | # Background: univariate normal
95 | 
96 | $$x \sim N(\mu, \sigma^2)$$
97 | 
98 | ```{r, echo=FALSE, fig.height=4, fig.width=5}
99 | x <- seq(-3, 3, .1)
100 | plot(x, dnorm(x), type='l', main='Normal(0, 1) probability density')
101 | ```
102 | 
103 | # Background: multivariate normal
104 | 
105 | $$\boldsymbol{x} \sim N(\boldsymbol{\mu}, \boldsymbol{\Sigma})$$
106 | 
107 | $\boldsymbol{\mu}$: vector of means
108 | 
109 | $\boldsymbol{\Sigma}$: covariance matrix
110 | 
111 | # Bivariate normal probability density
112 | 
113 | $$\boldsymbol{x} \sim N(\boldsymbol{\mu}, \boldsymbol{\Sigma})$$
114 | 
115 | ```{r, fig.width=8, fig.height=4}
116 | # let's first simulate a bivariate normal sample
117 | library(MASS)
118 | bivn <- mvrnorm(100000, mu = c(0, 0), Sigma = matrix(c(1, .5, .5, 1), 2))
119 | 
120 | # now we do a kernel density estimate
121 | bivn.kde <- kde2d(bivn[,1], bivn[,2], n = 50)
122 | 
123 | par(mfrow=c(1, 2))
124 | # now plot your results
125 | persp(bivn.kde, phi = 45, theta = 30, xlab='x1', ylab='x2', zlab='p(x1, x2)')
126 | 
127 | # fancy contour with image
128 | image(bivn.kde, xlab='x1', ylab='x2'); contour(bivn.kde, add = T)
129 | ```
130 | 
131 | # Bivariate normal parameters
132 | 
133 | $\boldsymbol{\mu} = \begin{bmatrix}
134 | \mu_1 \\
135 | \mu_2
136 | \end{bmatrix},$
137 | $\boldsymbol{\Sigma} = \begin{bmatrix}
138 | Cov[X_1, X_1] & Cov[X_1, X_2] \\
139 | Cov[X_2, X_1] & Cov[X_2, X_2]
140 | \end{bmatrix}$
141 | 
142 | ```{r, fig.width=6, fig.height=6}
143 | Sigma <- matrix(c(1, .5, .5, 1), nrow=2)
144 | n <- 10000
145 | z <- matrix(rnorm(n), nrow=nrow(Sigma))
146 | y <- t(chol(Sigma)) %*% z
147 | plot(y[1, ], y[2, ], xlab=expression(x[1]), ylab=expression(x[2]))
148 | text(0, 0, labels=expression(bold(mu)), col='red', cex=2)
149 | ```
150 | 
151 | # Bivariate normal parameters
152 | 
153 | $\boldsymbol{\mu} = \begin{bmatrix}
154 | 0 \\
155 | 0
156 | \end{bmatrix},$
157 | $\boldsymbol{\Sigma} = \begin{bmatrix}
158 | 1 & 0.5 \\
159 | 0.5 & 1
160 | \end{bmatrix}$
161 | 
162 | ```{r, fig.width=6, fig.height=6}
163 | plot(y[1, ], y[2, ], xlab=expression(x[1]), ylab=expression(x[2]))
164 | text(0, 0, labels=expression(bold(mu)), col='red', cex=2)
165 | ```
166 | 
167 | # Uncorrelated bivariate normal
168 | 
169 | $\boldsymbol{\mu} = \begin{bmatrix}
170 | 0 \\
171 | 0
172 | \end{bmatrix},$
173 | $\boldsymbol{\Sigma} = \begin{bmatrix}
174 | 1 & 0 \\
175 | 0 & 1
176 | \end{bmatrix}$
177 | 
178 | ```{r, fig.width=6, fig.height=6}
179 | Sigma <- matrix(c(1, 0, 0, 1), nrow=2)
180 | y <- t(chol(Sigma)) %*% z
181 | plot(y[1, ], y[2, ], xlab=expression(x[1]), ylab=expression(x[2]))
182 | text(0, 0, labels=expression(bold(mu)), col='red', cex=2)
183 | ```
184 | 
185 | # Common notation
186 | 
187 | $\boldsymbol{\mu} = \begin{bmatrix}
188 | \mu_1 \\
189 | \mu_2
190 | \end{bmatrix},$
191 | $\boldsymbol{\Sigma} = \begin{bmatrix}
192 | \sigma_1^2 & \rho \sigma_1 \sigma_2 \\
193 | \rho \sigma_1 \sigma_2 & \sigma_2^2
194 | \end{bmatrix}$
195 | 
196 | # Common notation
197 | 
198 | $\boldsymbol{\mu} = \begin{bmatrix}
199 | \mu_1 \\
200 | \mu_2
201 | \end{bmatrix},$
202 | $\boldsymbol{\Sigma} = \begin{bmatrix}
203 | \sigma_1^2 & \rho \sigma_1 \sigma_2 \\
204 | \rho \sigma_1 \sigma_2 & \sigma_2^2
205 | \end{bmatrix}$
206 | 
207 | $Cov[X_1, X_1] = Var[X_1] = \sigma_1^2$
208 | 
209 | $Cov[X_1, X_2] = \rho \sigma_1 \sigma_2$
210 | 
211 | $\Sigma$ must be symmetric and positive semi-definite
212 | 
213 | # Classic linear modeling
214 | 
215 | $$y = X\beta + \epsilon$$
216 | 
217 | $$\epsilon \sim N(0, \sigma^2)$$
218 | 
219 | Functional form determined by $X \beta$
220 | 
221 | # Linear model functional forms
222 | 
223 | e.g. $y = \mu(x) + \epsilon$
224 | 
225 | ```{r}
226 | n <- 20
227 | x <- rnorm(n)
228 | beta <- c(1, 1, -.5)
229 | X <- matrix(c(rep(1, n), x, x^3), ncol=3)
230 | y <- c(scale(X %*% beta + rnorm(n)))
231 | plot(x, y, ylim=range(y) * 2)
232 | m1 <- lm(y ~ x)
233 | m2 <- lm(y ~ x + I(x^2))
234 | abline(m1)
235 | newx <- seq(min(x), max(x), .01)
236 | p1 <- predict(m1, data.frame(x=newx), interval='prediction')
237 | p2 <- predict(m2, data.frame(x=newx), interval='prediction')
238 | matlines(newx, p1, lty=c(1, 2, 2), col=1)
239 | matlines(newx, p2, lty=c(1, 2, 2), col=2)
240 | ```
241 | 
242 | # Why not set a prior on $\mu(x)$?
243 | 
244 | *Gaussian process* as a prior for $\mu(x)$
245 | 
246 | $$y \sim N(\mu(x), \sigma^2)$$
247 | 
248 | $$\mu(x) \sim GP(m, k)$$
249 | 
250 | # GP prior for $\mu(x)$
251 | 
252 | $y \sim N(\mu(x), \sigma^2)$
253 | 
254 | $\mu(x) \sim GP(m, k)$
255 | 
256 | ```{r, fig.width=6, fig.height=4}
257 | library(scales)
258 | plot(x, y, ylim=range(y) * 2, pch=19)
259 | D <- as.matrix(dist(x))
260 | C <- function(sigma, d, rho){
261 |   stopifnot(sigma > 0)
262 |   stopifnot(rho > 0)
263 |   sigma ^ 2 * exp(-d^2 / (2 * rho ^ 2))
264 | }
265 | sigma_e <- .00001
266 | n_p <- 1000
267 | x_p <- sort(runif(n_p, min(x), max(x)))
268 | d_mat <- as.matrix(dist(x_p))
269 | Emat <- diag(rep(sigma_e^2, n_p))
270 | # simulate realizations
271 | for (i in 1:100){
272 |   sigma <- runif(1, 0, 5)
273 |   rho <- runif(1, 0, 4)
274 |   Cmat <- C(sigma, d_mat, rho)
275 |   L_c <- t(chol(Cmat + Emat))
276 |   z <- rnorm(n_p)
277 |   y_p <- L_c %*% z
278 |   lines(x_p, y_p + mean(y), col=alpha(1, .1))
279 | }
280 | title('Data and realizations from a GP prior')
281 | ```
282 | 
283 | # Wait, what's Gaussian about that?
284 | 
285 | If $\mu(x) \sim GP(m, k)$, then
286 | 
287 | $\mu(x_1), ..., \mu(x_n) \sim N\big(m(x_1), ..., m(x_n), \; K(x_1, ..., x_n)\big)$
288 | 
289 | $m$ and $k$ are functions!
290 | 291 | # Mean function: m 292 | 293 | Classic example: $m(x) = X \beta$ 294 | 295 | e.g., $\mu(x) \sim GP(X \beta, k(x))$ 296 | 297 | But, the covariance function $k(x)$ is the real star. 298 | 299 | # Covariance functions 300 | 301 | $k$ specifies covariance between to $x$ values 302 | 303 | Squared exponential covariance: 304 | 305 | $$k(x, x') = \tau^2 exp\Big(-\dfrac{|x - x'|^2}{\phi}\Big)$$ 306 | 307 | Lots of [options](http://www.gaussianprocess.org/gpml/chapters/RW4.pdf): smooth, jaggety, periodic 308 | 309 | # Example of squared exponential 310 | 311 | $$ \boldsymbol{K} = \begin{bmatrix} 312 | \tau^2 exp(-\frac{|x_1 - x_1|^2}{\phi}) & \tau^2 exp(-\frac{|x_1 - x_2|^2}{\phi}) \\ 313 | \tau^2 exp(-\frac{|x_2 - x_1|^2}{\phi}) & \tau^2 exp(-\frac{|x_2 - x_2|^2}{\phi}) 314 | \end{bmatrix}$$ 315 | 316 | # Example of squared exponential 317 | 318 | $$ \boldsymbol{K} = \begin{bmatrix} 319 | \tau^2 exp(-\frac{0^2}{\phi}) & \tau^2 exp(-\frac{|x_1 - x_2|^2}{\phi}) \\ 320 | \tau^2 exp(-\frac{|x_2 - x_1|^2}{\phi}) & \tau^2 exp(-\frac{0^2}{\phi}) 321 | \end{bmatrix}$$ 322 | 323 | # Example of squared exponential 324 | 325 | $$ \boldsymbol{K} = \begin{bmatrix} 326 | \tau^2 exp(0) & \tau^2 exp(-\frac{|x_1 - x_2|^2}{\phi}) \\ 327 | \tau^2 exp(-\frac{|x_2 - x_1|^2}{\phi}) & \tau^2 exp(0) 328 | \end{bmatrix}$$ 329 | 330 | # Example of squared exponential 331 | 332 | $$ \boldsymbol{K} = \begin{bmatrix} 333 | \tau^2 & \tau^2 exp(-\frac{|x_1 - x_2|^2}{\phi}) \\ 334 | \tau^2 exp(-\frac{|x_2 - x_1|^2}{\phi}) & \tau^2 335 | \end{bmatrix}$$ 336 | 337 | $Cor(\mu(x_1), \mu(x_2)) = exp(-\frac{|x_1 - x_2|^2}{\phi})$. 338 | 339 | # Correlation function 340 | 341 | $Cor(\mu(x_1), \mu(x_2)) = exp(-\frac{|x_1 - x_2|^2}{\phi})$. 342 | 343 | ```{r, echo = FALSE} 344 | d <- seq(0, 1, .01) 345 | sq_exp <- function(d, phi) { 346 | exp(-d^2 / phi) 347 | } 348 | plot(d, sq_exp(d, .1), type='l', 349 | xlab='Distance between x1 & x2', 350 | ylab='Correlation: mu(x1) & mu(x2)', 351 | col = alpha(1, .1)) 352 | 353 | for (i in 1:100){ 354 | lines(d, sq_exp(d, abs(rnorm(1, .1, .1))), 355 | col = alpha(1, .1)) 356 | } 357 | ``` 358 | 359 | # Gaussian process realizations 360 | 361 | ```{r} 362 | # Simulating Gaussian processes 363 | library(akima) 364 | library(scales) 365 | 366 | ## Univariate inputs --------------------------- 367 | n <- 1000 368 | x <- sort(runif(n, -10, 10)) 369 | 370 | # squared exponential -------------------------- 371 | l <- 1 # range parameter 372 | dmat <- as.matrix(dist(x)) 373 | C <- exp(-dmat^2 / (2 * l^2)) + diag(rep(1E-6, n)) 374 | 375 | # z %*% L produces multivariate normal draws from MVN(0, Sigma), 376 | # where L %*% t(L) = Sigma. 
i.e., L = chol(Sigma) is the upper-triangular Cholesky factor of Sigma 377 | # and z ~ normal(0, 1) 378 | z <- rnorm(n) 379 | y <- z %*% chol(C) 380 | plot(x, y, type='l') 381 | ``` 382 | 383 | # Gaussian process realizations 384 | 385 | ```{r} 386 | z <- rnorm(n) 387 | y <- z %*% chol(C) 388 | plot(x, y, type='l') 389 | ``` 390 | 391 | 392 | # Gaussian process realizations 393 | 394 | ```{r} 395 | z <- rnorm(n) 396 | y <- z %*% chol(C) 397 | plot(x, y, type='l') 398 | ``` 399 | 400 | 401 | # Gaussian process realizations 402 | 403 | ```{r} 404 | z <- rnorm(n) 405 | y <- z %*% chol(C) 406 | plot(x, y, type='l') 407 | ``` 408 | 409 | 410 | # Gaussian process realizations 411 | 412 | ```{r} 413 | z <- rnorm(n) 414 | y <- z %*% chol(C) 415 | plot(x, y, type='l') 416 | ``` 417 | 418 | # Gaussian process with nonzero mean function 419 | 420 | $y \sim N(\mu, \sigma_y)$ 421 | 422 | $\mu \sim GP(X \beta, k(x))$ 423 | 424 | ```{r} 425 | # or, built into a linear model 426 | alpha <- -1 427 | beta <- .5 428 | y <- alpha + beta * x + z %*% chol(C) 429 | plot(x, y, type='l') 430 | ``` 431 | 432 | # Ornstein–Uhlenbeck Gaussian process 433 | 434 | $k(x_1, x_2) = e^{-\frac{d_{x_1, x_2}}{\phi}}$ 435 | 436 | ```{r} 437 | C <- exp(-dmat / l) 438 | z <- rnorm(n) 439 | y <- z %*% chol(C) 440 | plot(x, y, type='l') 441 | ``` 442 | 443 | # Periodic Gaussian process 444 | 445 | $k(x_1, x_2) = exp(-\frac{2 \text{sin}^2 (d / 2)}{\phi})$ 446 | 447 | ```{r} 448 | C <- exp(- 2 * sin(dmat / 2) * sin(dmat / 2) / l) + diag(rep(.000001, n)) 449 | z <- rnorm(n) 450 | y <- z %*% chol(C) 451 | plot(x, y, type='l') 452 | ``` 453 | 454 | # Combining Gaussian processes 455 | 456 | e.g., sums and products of covariance functions 457 | 458 | ```{r} 459 | C <- exp(- 2 * sin(dmat / 2) * sin(dmat / 2) / l) + diag(rep(.000001, n)) 460 | C2 <- exp(-dmat / l) 461 | z <- rnorm(n) 462 | y <- z %*% chol(2 * C + .1 * C2) 463 | plot(x, y, type='l') 464 | title('Periodic + OU Gaussian process') 465 | ``` 466 | 467 | # Combining Gaussian processes 468 | 469 | ```{r} 470 | C <- exp(- 2 * sin(dmat / 2) * sin(dmat / 2) / l * 30) + diag(rep(.000001, n)) 471 | C2 <- exp(- 2 * sin(dmat / 2) * sin(dmat / 2) / l) + diag(rep(.000001, n)) 472 | z <- rnorm(n) 473 | y <- z %*% chol(.1 * C + 2 * C2) 474 | plot(x, y, type='l') 475 | title('Doubly periodic Gaussian process') 476 | ``` 477 | 478 | # Multidimensional inputs 479 | 480 | ```{r} 481 | ## 2d gaussian process ------------------------------- 482 | # squared exponential covariance function: 483 | C <- function(sigma, d, rho){ 484 | stopifnot(sigma > 0) 485 | stopifnot(rho > 0) 486 | sigma ^ 2 * exp(-d^2 / (2 * rho ^ 2)) 487 | } 488 | 489 | sigma <- 1 490 | rho <- .7 491 | sigma_e <- .001 492 | n <- 2000 493 | x1 <- runif(n, 0, 10) 494 | x2 <- runif(n, 0, 10) 495 | d_mat <- as.matrix(dist(cbind(x1, x2))) 496 | Cmat <- C(sigma, d_mat, rho) 497 | Emat <- diag(rep(sigma_e^2, n)) 498 | # simulate realizations 499 | L_c <- t(chol(Cmat + Emat)) 500 | z <- rnorm(n) 501 | y <- L_c %*% z 502 | s <- interp(x1, x2, y, nx=300, ny=300) 503 | image(s, col=rainbow(100)) 504 | title('Squared exponential 2d Gaussian process') 505 | ``` 506 | 507 | # Multidimensional inputs 508 | 509 | ```{r} 510 | ## 2d gaussian process ------------------------------- 511 | # OU covariance function: 512 | C <- function(sigma, d, rho){ 513 | stopifnot(sigma > 0) 514 | stopifnot(rho > 0) 515 | sigma ^ 2 * exp(-d / (rho)) 516 | } 517 | 518 | sigma <- 1 519 | rho <- .7 520 | sigma_e <- .001 521 | n <- 2000 522 | x1 <- runif(n, 0, 10) 523 | x2 <- runif(n, 0, 10) 524 |
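# dist() on cbind(x1, x2) gives pairwise Euclidean distances between the 2d
# input locations; the kernel maps those distances to covariances as in 1d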
d_mat <- as.matrix(dist(cbind(x1, x2))) 525 | Cmat <- C(sigma, d_mat, rho) 526 | Emat <- diag(rep(sigma_e^2, n)) 527 | # simulate realizations 528 | L_c <- t(chol(Cmat + Emat)) 529 | z <- rnorm(n) 530 | y <- L_c %*% z 531 | s <- interp(x1, x2, y, nx=300, ny=300) 532 | image(s, col=rainbow(100)) 533 | title('OU 2d Gaussian process') 534 | ``` 535 | 536 | # Other inputs 537 | 538 | Generally, $k(x)$ maps **distance** to **correlation** 539 | 540 | - phylogenetic distance $\rightarrow$ phylogenetic correlation 541 | - pedigree distance $\rightarrow$ additive genetic correlation 542 | - distance in time $\rightarrow$ temporal correlation 543 | 544 | 545 | # Student projects 546 | 547 | Before fitting your model to your data: 548 | 549 | 1. Write out your model in mathematical notation (ideally \LaTeX) 550 | 2. Prior predictive simulations (do your priors make sense?) 551 | 3. Model verification (given known parameters from PPS, do you recover parameters?) 552 | 553 | -------------------------------------------------------------------------------- /example_solutions/3-bayes/wk3_solutions_will.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: 'Week 3 assignment: Bayesian inference' 3 | author: "Example solutions" 4 | date: "December 20, 2015" 5 | output: pdf_document 6 | --- 7 | 8 | # Problem 1 9 | 10 | Maure et al. (2015) conducted experiments to see how ladybird beetle diet affected interactions with parasitoid wasps. 11 | Individual beetles were randomly assigned to one of five diet treatments: 12 | 13 | 1. 20 aphids per day 14 | 2. 2 aphids per day 15 | 3. 20 aphids per day + pollen 16 | 4. 2 aphids per day + pollen 17 | 5. pollen only 18 | 19 | Each beetle was exposed to a female parasitoid wasp for 3 hours, which deposited eggs in the beetle, and the beetles were then fed each day and the outcome of the interaction monitored. 20 | The authors wanted to know whether diet affected the probability of surviving the parasitoid attack. 21 | Your task is to build a Bayesian model with Stan for beetle survival as a function of experimental treatment, estimating the probability of survival for each treatment. 22 | To keep things simple, consider the treatments to be categorical and unordered. 23 | The data are archived in Dryad [here](http://www.datadryad.org/resource/doi:10.5061/dryad.7fq4j). 24 | 25 | **Important**: do not manipulate the original raw data file! 26 | This is error prone, leaves no written record, encourages spreadsheet farming, and is not reproducible. 27 | And, in this case, the data are already well organized. 28 | Read in the .xlsx file using the `readxl` package, then use R to massage the data as necessary. 29 | 30 | ```{r} 31 | ## libraries 32 | library(readxl) # use the read_excel() function to load the data 33 | library(ggplot2) 34 | library(rstan) 35 | 36 | # tell stan to find all of my processors when fitting models 37 | options(mc.cores = parallel::detectCores()) 38 | 39 | # load the dataset 40 | options(warn = -1) 41 | dat <- read_excel("data/diet_data.xlsx") 42 | options(warn = 0) 43 | 44 | ## Cleaning the data 45 | # manually rename columns to remove spaces and capital letters 46 | names(dat)[names(dat) == "Ladybird emergence date"] <- "ladybird_emergence_date" 47 | names(dat)[names(dat) == "Parasitoid oviposition date"] <- "parasitoid_oviposition_date" 48 | # ugh...there has to be a quicker way to do this!
49 | 50 | # replace spaces with underscores using gsub 51 | names(dat) <- gsub(pattern = " ", replacement = "_", x = names(dat)) 52 | 53 | # change capitals to lowercase 54 | names(dat) <- tolower(names(dat)) 55 | # much better 56 | 57 | # also, that pound sign means something else in R 58 | names(dat)[names(dat) == "#"] <- "ID" 59 | 60 | # let's remove the rows from the data we won't use here 61 | dat_use <- dat[is.na(dat$ladybird_recovery) == FALSE,] 62 | 63 | # our response variable is recovery, so make a new binary version 64 | dat_use[dat_use$ladybird_recovery == "YES", "ladybird_recovery"] <- 1 65 | dat_use[dat_use$ladybird_recovery == "NO", "ladybird_recovery"] <- 0 66 | dat_use$ladybird_recovery <- as.numeric(dat_use$ladybird_recovery) 67 | 68 | # finally, let's change diet treatment to a factor 69 | dat_use$diet <- factor(dat_use$diet) 70 | 71 | ## Working with the data 72 | 73 | # calculate the proportion that recovered for each diet treatment 74 | recov_summ <- data.frame(diet = levels(dat_use$diet), 75 | prop_recov = tapply(dat_use$ladybird_recovery, dat_use$diet, mean)) 76 | rownames(recov_summ) <- seq(1:nrow(recov_summ)) # cleaner 77 | 78 | # for kicks, let's calculate maximum likelihood estimates and 95% CIs 79 | ml <- glm(ladybird_recovery ~ 0 + diet, data = dat_use, family = "binomial") 80 | 81 | # now adding it back to our summary data frame 82 | recov_summ <- data.frame(recov_summ, 83 | ml = plogis(ml$coef), 84 | lwr = plogis(suppressMessages(confint(ml)[,"2.5 %"])), 85 | upr = plogis(suppressMessages(confint(ml)[,"97.5 %"]))) 86 | recov_summ 87 | # good, the ml estimates match the calculated averages 88 | 89 | ## Visualizing the data 90 | 91 | # now we'll plot the mean proportions in each category along with the raw data 92 | ggplot(data = dat_use, aes(diet, ladybird_recovery)) + 93 | geom_point(size = 2, 94 | alpha = 1/2, 95 | position = position_jitter(width = 0.3, height = 0.1)) + 96 | geom_point(data = recov_summ, 97 | aes(diet, ml), 98 | size = 3, 99 | color = "seagreen3") + 100 | geom_errorbar(data = recov_summ, 101 | aes(y = ml, ymin = lwr, ymax = upr), 102 | color = "seagreen3", 103 | width = 0.1) + 104 | labs(y = "proportion recovered\n") + 105 | theme(axis.title.x = element_blank()) 106 | 107 | ``` 108 | 109 | Write out your model using \LaTeX: 110 | 111 | 112 | $y_i \sim Bernoulli(p_{j[i]})$ 113 | $p_j \sim Beta(1, 1)$ 114 | 115 | *This should model the recovery (yes/no) as a Bernoulli random variable with an underlying probability $p_j$, which varies across the $J$ treatments.
The probabilities themselves are given a uniform prior between zero and one, reflecting my complete lack of prior knowledge about ladybird recovery* 116 | 117 | Paste your Stan model statement in the code block below, and ensure that your written model matches the notation in your Stan file: 118 | 119 | ``` 120 | data { 121 | 122 | // integer inputs 123 | int n; // the number of samples 124 | int n_treat; // the number of treatments 125 | 126 | // integer vector inputs 127 | int y[n]; // recovery outcomes (zeros and ones) 128 | int treatment[n]; // vector of treatments (1 to n_treat) 129 | 130 | } 131 | 132 | parameters { 133 | 134 | // probabilities (p) of recovery 135 | vector<lower=0, upper=1>[n_treat] p; // each p is constrained between 0 and 1 136 | } 137 | 138 | model { 139 | 140 | // define prior for p 141 | p ~ beta(1, 1); 142 | 143 | // define the likelihood 144 | for (i in 1:n) 145 | y[i] ~ bernoulli(p[treatment[i]]); // one likelihood term per observation 146 | 147 | } 148 | ``` 149 | 150 | Now, use `rstan` to fit your model. 151 | Evaluate convergence by inspecting the $\hat{R}$ statistic and the traceplots. 152 | 153 | ```{r} 154 | # make a list that mirrors the input in the stan model 155 | input <- list(n = nrow(dat_use), 156 | n_treat = length(levels(dat_use$diet)), 157 | y = dat_use$ladybird_recovery, 158 | treatment = as.numeric(dat_use$diet)) 159 | # don't forget that we defined treatment as a number in the stan model! 160 | 161 | # fit the model 162 | mod <- stan('bern_mod_will.stan', 163 | data = input) 164 | 165 | # check Rhat using model output 166 | mod 167 | # looks good (Rhat all < 1.1) 168 | 169 | # check traceplots including warmup samples 170 | traceplot(mod, inc_warmup = TRUE) 171 | # that's what convergence looks like 172 | ``` 173 | 174 | Calculate posterior credible intervals, medians, means, and modes for the survival probabilities for each treatment. 175 | Hint: posterior draws can be extracted with the `rstan::extract` function, which returns a list of arrays.
176 | 177 | ```{r} 178 | library(modeest) # allows me to calculate the mode of a continuous distribution 179 | 180 | # extract the posterior samples 181 | posts <- extract(mod) 182 | 183 | # pull posteriors for probabilities into data frame 184 | prob_posts <- as.data.frame(posts$p) 185 | colnames(prob_posts) <- levels(dat_use$diet) 186 | 187 | # create summary table 188 | posterior_summary <- c() # empty table 189 | 190 | for(i in 1:ncol(prob_posts)){ 191 | posterior_summary <- rbind(posterior_summary, 192 | data.frame(diet = colnames(prob_posts)[i], 193 | lwr = quantile(prob_posts[,i], prob = 0.025), 194 | upr = quantile(prob_posts[,i], prob = 0.975), 195 | mean = mean(prob_posts[,i]), 196 | median = median(prob_posts[,i]), 197 | mode = mlv(prob_posts[,i], method = "mfv")$M 198 | ), 199 | make.row.names = FALSE 200 | ) 201 | } 202 | 203 | # show the summary 204 | posterior_summary 205 | ``` 206 | 207 | Generate a plot that shows all of the raw data along with the posterior probability distributions of recovery for each treatment: 208 | 209 | ```{r} 210 | 211 | ## plot of raw data 212 | ggplot(data = dat_use, aes(diet, ladybird_recovery)) + 213 | geom_point(size = 2, alpha = 1/2, 214 | position = position_jitter(width = 0.3, height = 0.1)) + 215 | geom_point(data = recov_summ, aes(diet, ml), size = 3, color = "seagreen3") + 216 | geom_point(data = posterior_summary, 217 | aes(diet, median), 218 | color = "steelblue", 219 | size = 3) + 220 | geom_errorbar(data = posterior_summary, 221 | aes(y = median, ymin = lwr, ymax = upr), 222 | color = "steelblue", 223 | width = 0.1) + 224 | labs(y = "proportion recovered\n") + 225 | theme(axis.title.x = element_blank()) 226 | 227 | ## plot the posterior distributions overlaid 228 | library(reshape) # for melt function 229 | over <- suppressMessages(melt(prob_posts)) 230 | 231 | ggplot(data = over, aes(value)) + 232 | geom_density(aes(fill = variable), color = "black", alpha = 1/3) + 233 | labs(x = "probability of recovery") 234 | ``` 235 | 236 | The authors reported statistically significant differences in ladybird beetle recovery between the diet treatments. 237 | What is your conclusion for the effect of diet on ladybird beetle recovery? 238 | 239 | *There don't appear to be substantial differences between treatments in the probability of recovery. There is a suggestion that having at least 2 aphids increases the probability of recovery, but the posterior distributions still overlap quite a bit. We would need more data to say one way or the other.* 240 | 241 | *Inspecting the methods, results, and figure 2 (dynamite!) of Maure et al. (2015), it would appear that they actually achieved roughly the same estimates of proportions as we did. However, they used a frequentist approach of first asking whether including treatment as a predictor improved model fit versus a model of the mean only (it did, barely, at P = 0.04 when using a log-likelihood ratio test). Then they "performed pairwise comparisons to reveal differences among means", though they don't report what method they used to "reveal" nor its associated statistics, and only report the results with "a" and "b" lettering on figure 2.* 242 | 243 | *This is a classic example of how ANOVA or analysis-of-deviance methods can be misleading. I like our way of comparing estimated proportions much better.* 244 | 245 | # Problem 2 246 | 247 | One of the biggest advantages of Bayesian approaches is the ease with which you can make inference on **derived parameters**.
248 | For example, we might want to know which diet treatment gives the highest survival probability. 249 | In one draw from the posterior distribution, we should have five estimated probabilities. 250 | The treatment with the highest probability can be recorded and stored in an object (say `best`). 251 | We can do this for each posterior draw to produce a vector of the "best" treatments (from the beetle's perspective). 252 | To find the posterior probability that each particular treatment is best, count the frequency of each treatment in the `best` vector, and divide by the total number of posterior draws. 253 | Do this below using the results from problem 1, and report the posterior probabilities. 254 | 255 | ```{r} 256 | # which treatment had the highest probability of recovery for each sample 257 | best <- apply(prob_posts, 1, which.max) 258 | worst <- apply(prob_posts, 1, which.min) 259 | 260 | # convert to treatments 261 | best <- factor(levels(dat_use$diet)[best]) 262 | worst <- factor(levels(dat_use$diet)[worst]) 263 | 264 | # make sure levels match in case any treatments have prob = 0 265 | best <- factor(best, levels = levels(dat_use$diet)) 266 | worst <- factor(worst, levels = levels(dat_use$diet)) 267 | 268 | # proportion best 269 | p_best <- round(table(best)/length(best), digits = 2) 270 | 271 | # proportion worst (for kicks) 272 | p_worst <- round(table(worst)/length(worst), digits = 2) 273 | 274 | data.frame(diet = levels(dat_use$diet), 275 | proportion_best = as.numeric(p_best), 276 | proportion_worst = as.numeric(p_worst)) 277 | ``` 278 | 279 | Which treatment was best? What is the probability of that treatment being the best, conditional on the data? 280 | 281 | *Well, the "best" treatment was 2 aphids + pollen, though the probability of that treatment being best was only ~ 0.4. On the other hand, the pollen-only treatment did have a ~ 0.9 probability of being the worst, so there's more suggestive evidence that not having aphids in the diet is worse than having aphids.* 282 | 283 | # Problem 3 284 | 285 | Simulate data from a normal distribution for three groups, each with a different mean. 286 | You can assume that the standard deviations for each group are equal. 287 | In generating data, use a design matrix to acquire the expected value for your data (somewhere in your code there should be `X %*% beta`). 288 | 289 | ```{r} 290 | # set up 291 | n_groups <- 3 # number of groups 292 | n <- 200 # total number of individuals 293 | 294 | # assign individuals to groups 295 | groups <- sample(1:n_groups, n, replace = TRUE) 296 | 297 | # create design matrix 298 | X <- matrix(0, nrow = n, ncol = n_groups) 299 | for(i in 1:nrow(X)){ 300 | X[i, groups[i]] <- 1 301 | } 302 | 303 | # simulate y values 304 | beta <- c(5, 10, 15) # vector of group means 305 | sigma <- 3 # shared standard deviation 306 | y <- rnorm(n, mean = X %*% beta, sd = sigma) 307 | ``` 308 | 309 | Write a Stan model statement for a linear model that you can use to estimate the parameters.
310 | 311 | ``` 312 | data { 313 | 314 | // integer inputs 315 | int n; // number of individuals 316 | int n_groups; // number of groups 317 | 318 | // vector inputs 319 | vector[n] y; // the observed data of length n 320 | 321 | // matrix inputs 322 | matrix[n, n_groups] X; // the n x n_groups design matrix 323 | 324 | } 325 | parameters { 326 | 327 | // vector of betas 328 | vector[n_groups] beta; 329 | 330 | // value for standard deviation 331 | real<lower=0> sigma; // constrained to be greater than zero 332 | } 333 | model { 334 | 335 | // priors 336 | beta ~ normal(0, 5); 337 | sigma ~ normal(0, 5); 338 | 339 | // likelihood 340 | y ~ normal(X * beta, sigma); 341 | } 342 | ``` 343 | 344 | Use Stan to estimate the parameters of your model. 345 | 346 | ```{r} 347 | input <- list(n = n, 348 | n_groups = n_groups, 349 | X = X, 350 | y = y) 351 | 352 | mod2 <- stan("lm_fit_will.stan", data = input, open_progress = FALSE) 353 | 354 | ``` 355 | 356 | Assess convergence of the MCMC algorithm graphically and with the Rhat statistic. 357 | 358 | ```{r} 359 | mod2 360 | traceplot(mod2) 361 | ``` 362 | 363 | Plot the marginal posterior distributions for each parameter with a vertical line at the true value. 364 | 365 | ```{r} 366 | ## extract posteriors 367 | posts <- data.frame(b = extract(mod2)$beta, 368 | s = extract(mod2)$sigma) 369 | names(posts) <- c("Group 1 mean", "Group 2 mean", "Group 3 mean", "SD") 370 | 371 | ## melt 372 | posts_melted <- suppressWarnings(melt(posts)) 373 | 374 | ## convert parameters to data frame 375 | params <- data.frame(variable = names(posts), 376 | value = c(beta, sigma)) 377 | 378 | ## plot 379 | ggplot(data = posts_melted, aes(value)) + 380 | geom_density(fill = "grey50") + 381 | facet_wrap(~variable) + 382 | geom_vline(data = params, aes(xintercept = value), 383 | color = "black", lty = "dashed") 384 | 385 | 386 | ``` 387 | -------------------------------------------------------------------------------- /example_solutions/5-binomial/wk5_solutions_will.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: 'Week 5 assignment: Binomial models' 3 | author: "Will Stutz" 4 | date: "`r format(Sys.time(), '%d %B, %Y')`" 5 | output: pdf_document 6 | --- 7 | 8 | 9 | About one out of eight women in the U.S. will develop breast cancer at some point in her lifetime. 10 | Early diagnoses help with treatment of this potentially fatal disease, and these diagnoses can be made based on a variety of cytological metrics evaluated via biopsy. 11 | Your job today is to develop a model that classifies tumors as malignant or benign based on these metrics. 12 | The student(s) with the most predictive model will get a prize. 13 | 14 | The data are in the `breast_cancer.csv` file. 15 | Details for this dataset can be found [on the UCI machine learning data repository](https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Original)), which is useful if you ever need data to play with. 16 | I split the data into two groups at random: the *training* data, which you'll use to estimate parameters, and the *test* data, which we'll use to evaluate the predictive power of the model. 17 | There is a column in the data called `group`, which indicates whether an observation is part of the training or test set. 18 | 19 | ## Data exploration 20 | 21 | As usual, you will want to explore the data before constructing any statistical models. 22 | Only explore the training data, and do not use the test data for data exploration/visualization.
23 | We will pretend that we don't have access to the test data yet. 24 | 25 | ```{r, message = FALSE} 26 | # some useful libraries 27 | library(ggplot2) 28 | library(rstan) 29 | library(reshape) 30 | library(tidyr) 31 | 32 | # set some options 33 | options(mc.cores = parallel::detectCores()) 34 | 35 | # upload the data 36 | dat <- read.csv("data/breast_cancer.csv") 37 | 38 | # let's go ahead and just pull out the training data 39 | train <- dat[dat$group == "train",] %>% droplevels 40 | 41 | # how big is the data set? 42 | dim(train) 43 | # 333 observations and 13 data columns 44 | 45 | # what kind of data do we have? 46 | head(train) 47 | summary(train) 48 | # looks like a bunch of integer variables that vary between 1 and 10 49 | # 'malignant' must be whether the tumor was malignant or not 50 | 51 | # how many cohorts? 52 | table(train$cohort) 53 | # 8! 54 | 55 | # are there differences between cohorts? 56 | 57 | # calculate means 58 | cohort_means <- data.frame(cohort = levels(train$cohort), 59 | prob = as.vector(tapply(train$malignant, train$cohort, mean))) 60 | 61 | # add to data 62 | train <- merge(train, cohort_means, by = c("cohort")) 63 | 64 | # plot raw data with cohort means 65 | ggplot(data = train, aes(cohort, malignant)) + 66 | geom_jitter(height = 0.05) + 67 | geom_point(aes(y = prob), color = "steelblue", size = 3) 68 | 69 | # let's melt all the predictor variables into a single column 70 | train_melt <- melt(train, id.vars = c("id","cohort","group","malignant","prob")) 71 | 72 | # are there correlations between the predictors? 73 | library(GGally) # for the ggpairs function 74 | ggpairs(train[,c(2:10)]) 75 | # hard to see but shape and size uniformity are highly correlated 76 | 77 | # plot the predictor data versus malignancy 78 | ggplot(data = train_melt, aes(value, malignant)) + 79 | facet_wrap(~variable) + 80 | geom_jitter(height = 0.05, color = "grey30") + 81 | geom_smooth(method = "glm", method.args = list(family = "binomial")) 82 | # all are positively associated with the probability of malignancy 83 | 84 | ``` 85 | 86 | 87 | ## Model structure 88 | 89 | What is your model? Write it out in \LaTeX. Hint: you will want to use a design matrix. 90 | 91 | $y \sim Bernoulli(p)$ 92 | 93 | $logit(p) = X \beta$ 94 | 95 | What is your Stan model statement? 96 | 97 | ``` 98 | data { 99 | // integer inputs 100 | int n; // the number of samples 101 | int n_pred; // the number of predictors 102 | int n_cohort; // the number of cohorts 103 | 104 | // integer vector inputs 105 | int y[n]; // observed malignancies 106 | 107 | // design matrix 108 | matrix[n, n_pred + n_cohort] X; 109 | } 110 | 111 | parameters { 112 | 113 | // betas for the continuous predictors and the cohort means 114 | vector[n_pred + n_cohort] beta; 115 | } 116 | 117 | model { 118 | 119 | // define priors for continuous predictors 120 | beta[1:n_pred] ~ cauchy(0, 3); 121 | 122 | // define priors for cohort effects 123 | beta[(n_pred + 1):11] ~ cauchy(0, 5); 124 | beta[12] ~ normal(0, 5); 125 | beta[13:(n_pred + n_cohort)] ~ cauchy(0, 5); 126 | 127 | // define the likelihood 128 | y ~ bernoulli_logit(X * beta); 129 | 130 | } 131 | 132 | ``` 133 | 134 | ## Building and understanding the design matrix 135 | 136 | We mentioned that you would want to use a design matrix.
137 | Specifically, your model should be of the form: 138 | 139 | $y \sim Bernoulli(p)$ 140 | 141 | And the probability of malignancy $p$ is modeled using a logit link: 142 | 143 | $log \Big(\dfrac{p}{1 - p} \Big) = X \beta$ 144 | 145 | The design matrix $X$ contains the tumor features, and also dictates the interpretation of the coefficients $\beta$. 146 | In the code block below, construct your design matrix, creating an object called `X`. 147 | The included code will make an image plot of your design matrix with a horrendous color scheme. 148 | Once you fill in your code, set the argument `eval = TRUE` inside the curly braces at the beginning of the code chunk (this is a chunk option); otherwise the code chunk will not be evaluated when you knit your pdf. 149 | 150 | ```{r, eval = TRUE} 151 | # calculate principal components of uniformity 152 | unif_pca <- prcomp(~shape_uniformity + size_uniformity, data = train, 153 | center = TRUE, scale = TRUE) 154 | 155 | # add to data 156 | train$uniformity <- unif_pca$x[,1] 157 | 158 | # center variables 159 | train_centered <- train # create new data frame 160 | train_centered[,c(3,6:11)] <- train_centered[,c(3,6:11)] - 5.5 161 | # center variables at 5.5 since they are on a 1-10 scale 162 | 163 | 164 | # define your design matrix below 165 | X <- model.matrix(~0 + clump_thickness + uniformity + marginal_adhesion + 166 | epithelial_size + bare_nuclei + bland_chromatin + normal_nucleoli + mitoses 167 | + cohort, data = train_centered) 168 | 169 | 170 | # the code below will plot your design matrix 171 | mX <- melt(X) 172 | ggplot(mX, aes(x = X2, y = X1)) + 173 | geom_raster(aes(fill = value)) + 174 | scale_y_reverse() + 175 | xlab('Design matrix column') + 176 | ylab('Design matrix row') + 177 | scale_fill_gradient2(low = "steelblue", mid = "white", high = "seagreen3") 178 | ``` 179 | 180 | 181 | For each column of $X$ you will get a coefficient, one element in $\beta$. 182 | For instance, the coefficient $\beta_1$ will be associated with the first column in $X$, which we might denote $X[, 1]$, to borrow some R syntax. 183 | There's no sense in estimating parameters if you don't know what they mean (Abraham Lincoln said that), so below, list each element in $\beta$ and briefly describe what it represents/how you would interpret it: 184 | 185 | 186 | 1. $\beta_1$ represents *the increase in the logit probability that a tumor is malignant given an increase of 1 point in clump thickness* 187 | 188 | 2. $\beta_2$ represents *the increase in the logit probability that a tumor is malignant given an increase of 1 point in uniformity* 189 | 190 | 3. $\beta_3$ represents *the increase in the logit probability that a tumor is malignant given an increase of 1 point in marginal adhesion* 191 | 192 | 4. $\beta_4$ represents *the increase in the logit probability that a tumor is malignant given an increase of 1 point in epithelial size* 193 | 194 | 5. $\beta_5$ represents *the increase in the logit probability that a tumor is malignant given an increase of 1 point in bare nuclei* 195 | 196 | 6. $\beta_6$ represents *the increase in the logit probability that a tumor is malignant given an increase of 1 point in bland chromatin* 197 | 198 | 7. $\beta_7$ represents *the increase in the logit probability that a tumor is malignant given an increase of 1 point in normal nucleoli* 199 | 200 | 8. $\beta_8$ represents *the increase in the logit probability that a tumor is malignant given an increase of 1 point in mitoses* 201 | 202 | 9.
$\beta_9$ represents *the logit probability that a tumor is malignant if the carrier was in cohort 1* 203 | 204 | 10. $\beta_{10}$ represents *the logit probability that a tumor is malignant if the carrier was in cohort 2* 205 | 206 | 11. $\beta_{11}$ represents *the logit probability that a tumor is malignant if the carrier was in cohort 3* 207 | 208 | 12. $\beta_{12}$ represents *the logit probability that a tumor is malignant if the carrier was in cohort 4* 209 | 210 | 13. $\beta_{13}$ represents *the logit probability that a tumor is malignant if the carrier was in cohort 5* 211 | 212 | 14. $\beta_{14}$ represents *the logit probability that a tumor is malignant if the carrier was in cohort 6* 213 | 214 | 15. $\beta_{15}$ represents *the logit probability that a tumor is malignant if the carrier was in cohort 7* 215 | 216 | 16. $\beta_{16}$ represents *the logit probability that a tumor is malignant if the carrier was in cohort 8* 217 | 218 | 219 | ## Parameter estimation 220 | 221 | Use the **training** data to estimate your model's parameters (`group == 'train'`). 222 | Do not use the **test** data yet. 223 | Make sure that the MCMC algorithm has converged before moving forward. 224 | 225 | ```{r, message = FALSE} 226 | 227 | # build the data list 228 | stan_d <- list(n = nrow(train), 229 | n_pred = 8, 230 | n_cohort = length(levels(train$cohort)), 231 | X = X, 232 | y = train$malignant) 233 | 234 | # fit the model 235 | tumor_fit <- stan("tumor_glm_will.stan", data = stan_d) 236 | 237 | # check Rhat 238 | print(tumor_fit) 239 | 240 | # check traceplots 241 | rstan::traceplot(tumor_fit, pars = "beta") 242 | ``` 243 | 244 | 245 | ## Out of sample predictive power 246 | 247 | One measure of a model's ability to predict new data is the log likelihood of new data, given the parameters of the model $[\tilde{y} \mid \theta]$, where $\tilde{y}$ is the new data (the **test** or **validation** data), and the parameters $\theta$ have been estimated from other data (e.g., the **training** data). 248 | 249 | Hints: 250 | 251 | - this is done most easily via a new design matrix $X_{test}$, which can be multiplied by the vector of model parameters, and must be declared in the `data` block 252 | - make sure that if you used any feature scaling or centering in the training data, that the exact same scaling/centering schemes are applied to the test set 253 | - you'll use the `generated quantities` block to calculate the log-likelihood of the test data 254 | - you can obtain the joint log likelihood with the `bernoulli_logit_log` function in Stan, and I wrote a generated quantities model block for you below, which should be the last block in your new Stan model statement 255 | 256 | What is your updated Stan model?
257 | 258 | ``` 259 | data { 260 | // integer inputs 261 | int n; // the number of samples 262 | int n_pred; // the number of predictors 263 | int n_cohort; // the number of cohorts 264 | int n_test; // number of individuals in the test data 265 | 266 | 267 | // integer vector inputs 268 | int y[n]; // observed malignancies 269 | int y_test[n_test]; // observed for test data 270 | 271 | // design matrix 272 | matrix[n, n_pred + n_cohort] X; 273 | matrix[n_test, n_pred + n_cohort] X_test; 274 | } 275 | 276 | parameters { 277 | 278 | // betas for the continuous predictors and the cohort means 279 | vector[n_pred + n_cohort] beta; 280 | } 281 | 282 | model { 283 | 284 | 285 | // define priors for continuous predictors 286 | beta[1:n_pred] ~ cauchy(0, 3); 287 | 288 | // define priors for cohort effects 289 | beta[(n_pred + 1):11] ~ cauchy(0, 5); 290 | beta[12] ~ normal(0, 5); // shrinks the estimate for cohort 4 toward 50% 291 | beta[13:(n_pred + n_cohort)] ~ cauchy(0, 5); 292 | 293 | // define the likelihood 294 | y ~ bernoulli_logit(X * beta); 295 | 296 | } 297 | 298 | generated quantities { 299 | real loglik_test; 300 | vector[n_test] logit_p_test; 301 | 302 | logit_p_test <- X_test * beta; 303 | loglik_test <- bernoulli_logit_log(y_test, logit_p_test); 304 | // returns the sum of the log likelihoods (the joint log-likelihood) 305 | } 306 | 307 | ``` 308 | 309 | Acquire the posterior distribution of the model parameters and the holdout log likelihood. 310 | 311 | ```{r} 312 | # calculate new PCA using all the data 313 | dat$uniformity <- prcomp(~size_uniformity + shape_uniformity, data = dat, 314 | center = TRUE, scale = TRUE)$x[,1] 315 | 316 | # center variables 317 | dat_centered <- dat # create new data frame 318 | dat_centered[,c(2,5:10)] <- dat_centered[,c(2,5:10)] - 5.5 319 | # center variables at 5.5 since they are on a 1-10 scale 320 | 321 | # pull test data (from the centered data, so test and train get the same centering) 322 | test_centered <- dat_centered[dat_centered$group == "test",] 323 | 324 | # pull training data 325 | train_centered <- dat_centered[dat_centered$group == "train",] 326 | 327 | # define training design matrix 328 | X <- model.matrix(~0 + clump_thickness + uniformity + marginal_adhesion + 329 | epithelial_size + bare_nuclei + bland_chromatin + normal_nucleoli + mitoses 330 | + cohort, data = train_centered) 331 | 332 | # define test design matrix 333 | X_test <- model.matrix(~0 + clump_thickness + uniformity + marginal_adhesion + 334 | epithelial_size + bare_nuclei + bland_chromatin + normal_nucleoli + mitoses 335 | + cohort, data = test_centered) 336 | 337 | # build the data list (names must match the declarations in the data block) 338 | stan_d <- list(n = nrow(train_centered), 339 | n_pred = 8, 340 | n_cohort = length(levels(train_centered$cohort)), 341 | n_test = nrow(test_centered), 342 | X = X, 343 | X_test = X_test, 344 | y = train_centered$malignant, 345 | y_test = test_centered$malignant) 346 | 347 | # fit the model 348 | tumor_fit_test <- stan("tumor_glm_test_will.stan", data = stan_d) 349 | 350 | # check Rhat 351 | print(tumor_fit_test, pars = c("beta","loglik_test")) 352 | 353 | # check traceplots 354 | rstan::traceplot(tumor_fit_test, pars = c("beta","loglik_test")) 355 | 356 | 357 | 358 | ``` 359 | 360 | Make a histogram of the holdout log likelihood and report the posterior mean along with a 95% credible interval.
361 | 362 | ```{r} 363 | # extract summed log-likelihoods for each draw 364 | logliks <- rstan::extract(tumor_fit_test)$loglik_test 365 | 366 | # create histogram 367 | qplot(logliks, geom = "histogram") 368 | 369 | # calculate mean and 95% CI 370 | data.frame(mean = mean(logliks), 371 | low_ci = quantile(logliks, prob = c(0.025)), 372 | hi_ci = quantile(logliks, prob = c(0.975))) 373 | 374 | ``` 375 | 376 | 377 | ## Showing predictions 378 | 379 | The whole point of building this model is to predict whether a tumor is malignant based on some features. 380 | Plot the posterior probability of tumor malignance for each holdout tumor, and show the true tumor status in the same graph. 381 | Multiple graph types are possible here, but we do not recommend simply copying and pasting code from another example (so far about a quarter of plots made in this way have made sense). 382 | Instead, think hard about what sort of data display would be effective, and make that plot! 383 | 384 | ```{r} 385 | library(coda) 386 | # extract logit probabilities for each test tumor 387 | logit_probs <- as.mcmc(rstan::extract(tumor_fit_test)$logit_p_test) 388 | 389 | # calculate mean and 95% credible intervals for each patient 390 | prob_summary <- summary(logit_probs) 391 | 392 | # create data frame for plotting 393 | plot_data <- data.frame(patient = as.character(test_centered$id), 394 | cohort = test_centered$cohort, 395 | malignant = as.factor(test_centered$malignant), 396 | mean = prob_summary$statistics[,"Mean"], 397 | lwr = prob_summary$quantiles[,"2.5%"], 398 | upr = prob_summary$quantiles[,"97.5%"]) 399 | 400 | # plot 401 | ggplot(data = plot_data, aes(patient, mean)) + 402 | geom_hline(yintercept = 0, lty = "dotted") + 403 | geom_errorbar(aes(ymin = lwr, ymax = upr), color = "grey70",width = 0.02) + 404 | geom_point(aes(color = malignant)) + 405 | facet_wrap(~cohort,ncol = 3,scales = "free_x") + 406 | labs(y = "logit probability of malignancy", x = "patient") + 407 | theme_bw() + 408 | theme(axis.text.x = element_blank(), 409 | axis.ticks.x = element_blank(), 410 | panel.grid = element_blank()) 411 | ``` 412 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 2, June 1991 3 | 4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc., 5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | Preamble 10 | 11 | The licenses for most software are designed to take away your 12 | freedom to share and change it. By contrast, the GNU General Public 13 | License is intended to guarantee your freedom to share and change free 14 | software--to make sure the software is free for all its users. This 15 | General Public License applies to most of the Free Software 16 | Foundation's software and to any other program whose authors commit to 17 | using it. (Some other Free Software Foundation software is covered by 18 | the GNU Lesser General Public License instead.) You can apply it to 19 | your programs, too. 20 | 21 | When we speak of free software, we are referring to freedom, not 22 | price. 
Our General Public Licenses are designed to make sure that you 23 | have the freedom to distribute copies of free software (and charge for 24 | this service if you wish), that you receive source code or can get it 25 | if you want it, that you can change the software or use pieces of it 26 | in new free programs; and that you know you can do these things. 27 | 28 | To protect your rights, we need to make restrictions that forbid 29 | anyone to deny you these rights or to ask you to surrender the rights. 30 | These restrictions translate to certain responsibilities for you if you 31 | distribute copies of the software, or if you modify it. 32 | 33 | For example, if you distribute copies of such a program, whether 34 | gratis or for a fee, you must give the recipients all the rights that 35 | you have. You must make sure that they, too, receive or can get the 36 | source code. And you must show them these terms so they know their 37 | rights. 38 | 39 | We protect your rights with two steps: (1) copyright the software, and 40 | (2) offer you this license which gives you legal permission to copy, 41 | distribute and/or modify the software. 42 | 43 | Also, for each author's protection and ours, we want to make certain 44 | that everyone understands that there is no warranty for this free 45 | software. If the software is modified by someone else and passed on, we 46 | want its recipients to know that what they have is not the original, so 47 | that any problems introduced by others will not reflect on the original 48 | authors' reputations. 49 | 50 | Finally, any free program is threatened constantly by software 51 | patents. We wish to avoid the danger that redistributors of a free 52 | program will individually obtain patent licenses, in effect making the 53 | program proprietary. To prevent this, we have made it clear that any 54 | patent must be licensed for everyone's free use or not licensed at all. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | GNU GENERAL PUBLIC LICENSE 60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 61 | 62 | 0. This License applies to any program or other work which contains 63 | a notice placed by the copyright holder saying it may be distributed 64 | under the terms of this General Public License. The "Program", below, 65 | refers to any such program or work, and a "work based on the Program" 66 | means either the Program or any derivative work under copyright law: 67 | that is to say, a work containing the Program or a portion of it, 68 | either verbatim or with modifications and/or translated into another 69 | language. (Hereinafter, translation is included without limitation in 70 | the term "modification".) Each licensee is addressed as "you". 71 | 72 | Activities other than copying, distribution and modification are not 73 | covered by this License; they are outside its scope. The act of 74 | running the Program is not restricted, and the output from the Program 75 | is covered only if its contents constitute a work based on the 76 | Program (independent of having been made by running the Program). 77 | Whether that is true depends on what the Program does. 78 | 79 | 1. 
You may copy and distribute verbatim copies of the Program's 80 | source code as you receive it, in any medium, provided that you 81 | conspicuously and appropriately publish on each copy an appropriate 82 | copyright notice and disclaimer of warranty; keep intact all the 83 | notices that refer to this License and to the absence of any warranty; 84 | and give any other recipients of the Program a copy of this License 85 | along with the Program. 86 | 87 | You may charge a fee for the physical act of transferring a copy, and 88 | you may at your option offer warranty protection in exchange for a fee. 89 | 90 | 2. You may modify your copy or copies of the Program or any portion 91 | of it, thus forming a work based on the Program, and copy and 92 | distribute such modifications or work under the terms of Section 1 93 | above, provided that you also meet all of these conditions: 94 | 95 | a) You must cause the modified files to carry prominent notices 96 | stating that you changed the files and the date of any change. 97 | 98 | b) You must cause any work that you distribute or publish, that in 99 | whole or in part contains or is derived from the Program or any 100 | part thereof, to be licensed as a whole at no charge to all third 101 | parties under the terms of this License. 102 | 103 | c) If the modified program normally reads commands interactively 104 | when run, you must cause it, when started running for such 105 | interactive use in the most ordinary way, to print or display an 106 | announcement including an appropriate copyright notice and a 107 | notice that there is no warranty (or else, saying that you provide 108 | a warranty) and that users may redistribute the program under 109 | these conditions, and telling the user how to view a copy of this 110 | License. (Exception: if the Program itself is interactive but 111 | does not normally print such an announcement, your work based on 112 | the Program is not required to print an announcement.) 113 | 114 | These requirements apply to the modified work as a whole. If 115 | identifiable sections of that work are not derived from the Program, 116 | and can be reasonably considered independent and separate works in 117 | themselves, then this License, and its terms, do not apply to those 118 | sections when you distribute them as separate works. But when you 119 | distribute the same sections as part of a whole which is a work based 120 | on the Program, the distribution of the whole must be on the terms of 121 | this License, whose permissions for other licensees extend to the 122 | entire whole, and thus to each and every part regardless of who wrote it. 123 | 124 | Thus, it is not the intent of this section to claim rights or contest 125 | your rights to work written entirely by you; rather, the intent is to 126 | exercise the right to control the distribution of derivative or 127 | collective works based on the Program. 128 | 129 | In addition, mere aggregation of another work not based on the Program 130 | with the Program (or with a work based on the Program) on a volume of 131 | a storage or distribution medium does not bring the other work under 132 | the scope of this License. 133 | 134 | 3. 
You may copy and distribute the Program (or a work based on it, 135 | under Section 2) in object code or executable form under the terms of 136 | Sections 1 and 2 above provided that you also do one of the following: 137 | 138 | a) Accompany it with the complete corresponding machine-readable 139 | source code, which must be distributed under the terms of Sections 140 | 1 and 2 above on a medium customarily used for software interchange; or, 141 | 142 | b) Accompany it with a written offer, valid for at least three 143 | years, to give any third party, for a charge no more than your 144 | cost of physically performing source distribution, a complete 145 | machine-readable copy of the corresponding source code, to be 146 | distributed under the terms of Sections 1 and 2 above on a medium 147 | customarily used for software interchange; or, 148 | 149 | c) Accompany it with the information you received as to the offer 150 | to distribute corresponding source code. (This alternative is 151 | allowed only for noncommercial distribution and only if you 152 | received the program in object code or executable form with such 153 | an offer, in accord with Subsection b above.) 154 | 155 | The source code for a work means the preferred form of the work for 156 | making modifications to it. For an executable work, complete source 157 | code means all the source code for all modules it contains, plus any 158 | associated interface definition files, plus the scripts used to 159 | control compilation and installation of the executable. However, as a 160 | special exception, the source code distributed need not include 161 | anything that is normally distributed (in either source or binary 162 | form) with the major components (compiler, kernel, and so on) of the 163 | operating system on which the executable runs, unless that component 164 | itself accompanies the executable. 165 | 166 | If distribution of executable or object code is made by offering 167 | access to copy from a designated place, then offering equivalent 168 | access to copy the source code from the same place counts as 169 | distribution of the source code, even though third parties are not 170 | compelled to copy the source along with the object code. 171 | 172 | 4. You may not copy, modify, sublicense, or distribute the Program 173 | except as expressly provided under this License. Any attempt 174 | otherwise to copy, modify, sublicense or distribute the Program is 175 | void, and will automatically terminate your rights under this License. 176 | However, parties who have received copies, or rights, from you under 177 | this License will not have their licenses terminated so long as such 178 | parties remain in full compliance. 179 | 180 | 5. You are not required to accept this License, since you have not 181 | signed it. However, nothing else grants you permission to modify or 182 | distribute the Program or its derivative works. These actions are 183 | prohibited by law if you do not accept this License. Therefore, by 184 | modifying or distributing the Program (or any work based on the 185 | Program), you indicate your acceptance of this License to do so, and 186 | all its terms and conditions for copying, distributing or modifying 187 | the Program or works based on it. 188 | 189 | 6. Each time you redistribute the Program (or any work based on the 190 | Program), the recipient automatically receives a license from the 191 | original licensor to copy, distribute or modify the Program subject to 192 | these terms and conditions. 
You may not impose any further 193 | restrictions on the recipients' exercise of the rights granted herein. 194 | You are not responsible for enforcing compliance by third parties to 195 | this License. 196 | 197 | 7. If, as a consequence of a court judgment or allegation of patent 198 | infringement or for any other reason (not limited to patent issues), 199 | conditions are imposed on you (whether by court order, agreement or 200 | otherwise) that contradict the conditions of this License, they do not 201 | excuse you from the conditions of this License. If you cannot 202 | distribute so as to satisfy simultaneously your obligations under this 203 | License and any other pertinent obligations, then as a consequence you 204 | may not distribute the Program at all. For example, if a patent 205 | license would not permit royalty-free redistribution of the Program by 206 | all those who receive copies directly or indirectly through you, then 207 | the only way you could satisfy both it and this License would be to 208 | refrain entirely from distribution of the Program. 209 | 210 | If any portion of this section is held invalid or unenforceable under 211 | any particular circumstance, the balance of the section is intended to 212 | apply and the section as a whole is intended to apply in other 213 | circumstances. 214 | 215 | It is not the purpose of this section to induce you to infringe any 216 | patents or other property right claims or to contest validity of any 217 | such claims; this section has the sole purpose of protecting the 218 | integrity of the free software distribution system, which is 219 | implemented by public license practices. Many people have made 220 | generous contributions to the wide range of software distributed 221 | through that system in reliance on consistent application of that 222 | system; it is up to the author/donor to decide if he or she is willing 223 | to distribute software through any other system and a licensee cannot 224 | impose that choice. 225 | 226 | This section is intended to make thoroughly clear what is believed to 227 | be a consequence of the rest of this License. 228 | 229 | 8. If the distribution and/or use of the Program is restricted in 230 | certain countries either by patents or by copyrighted interfaces, the 231 | original copyright holder who places the Program under this License 232 | may add an explicit geographical distribution limitation excluding 233 | those countries, so that distribution is permitted only in or among 234 | countries not thus excluded. In such case, this License incorporates 235 | the limitation as if written in the body of this License. 236 | 237 | 9. The Free Software Foundation may publish revised and/or new versions 238 | of the General Public License from time to time. Such new versions will 239 | be similar in spirit to the present version, but may differ in detail to 240 | address new problems or concerns. 241 | 242 | Each version is given a distinguishing version number. If the Program 243 | specifies a version number of this License which applies to it and "any 244 | later version", you have the option of following the terms and conditions 245 | either of that version or of any later version published by the Free 246 | Software Foundation. If the Program does not specify a version number of 247 | this License, you may choose any version ever published by the Free Software 248 | Foundation. 249 | 250 | 10. 
If you wish to incorporate parts of the Program into other free 251 | programs whose distribution conditions are different, write to the author 252 | to ask for permission. For software which is copyrighted by the Free 253 | Software Foundation, write to the Free Software Foundation; we sometimes 254 | make exceptions for this. Our decision will be guided by the two goals 255 | of preserving the free status of all derivatives of our free software and 256 | of promoting the sharing and reuse of software generally. 257 | 258 | NO WARRANTY 259 | 260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN 262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 268 | REPAIR OR CORRECTION. 269 | 270 | 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 278 | POSSIBILITY OF SUCH DAMAGES. 279 | 280 | END OF TERMS AND CONDITIONS 281 | 282 | How to Apply These Terms to Your New Programs 283 | 284 | If you develop a new program, and you want it to be of the greatest 285 | possible use to the public, the best way to achieve this is to make it 286 | free software which everyone can redistribute and change under these terms. 287 | 288 | To do so, attach the following notices to the program. It is safest 289 | to attach them to the start of each source file to most effectively 290 | convey the exclusion of warranty; and each file should have at least 291 | the "copyright" line and a pointer to where the full notice is found. 292 | 293 | {description} 294 | Copyright (C) {year} {fullname} 295 | 296 | This program is free software; you can redistribute it and/or modify 297 | it under the terms of the GNU General Public License as published by 298 | the Free Software Foundation; either version 2 of the License, or 299 | (at your option) any later version. 300 | 301 | This program is distributed in the hope that it will be useful, 302 | but WITHOUT ANY WARRANTY; without even the implied warranty of 303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 304 | GNU General Public License for more details. 305 | 306 | You should have received a copy of the GNU General Public License along 307 | with this program; if not, write to the Free Software Foundation, Inc., 308 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 309 | 310 | Also add information on how to contact you by electronic and paper mail. 
311 | 312 | If the program is interactive, make it output a short notice like this 313 | when it starts in an interactive mode: 314 | 315 | Gnomovision version 69, Copyright (C) year name of author 316 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 317 | This is free software, and you are welcome to redistribute it 318 | under certain conditions; type `show c' for details. 319 | 320 | The hypothetical commands `show w' and `show c' should show the appropriate 321 | parts of the General Public License. Of course, the commands you use may 322 | be called something other than `show w' and `show c'; they could even be 323 | mouse-clicks or menu items--whatever suits your program. 324 | 325 | You should also get your employer (if you work as a programmer) or your 326 | school, if any, to sign a "copyright disclaimer" for the program, if 327 | necessary. Here is a sample; alter the names: 328 | 329 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program 330 | `Gnomovision' (which makes passes at compilers) written by James Hacker. 331 | 332 | {signature of Ty Coon}, 1 April 1989 333 | Ty Coon, President of Vice 334 | 335 | This General Public License does not permit incorporating your program into 336 | proprietary programs. If your program is a subroutine library, you may 337 | consider it more useful to permit linking proprietary applications with the 338 | library. If this is what you want to do, use the GNU Lesser General 339 | Public License instead of this License. 340 | 341 | --------------------------------------------------------------------------------