├── _config.yml ├── .gitignore ├── statsr_labs.Rproj ├── Makefile ├── 4.2_conjugate_priors ├── data │ └── gen_data.R └── credible_interval.Rmd ├── README.md ├── 2.3_inf_for_numerical_data └── inf_for_numerical_data.Rmd ├── 2.4_inf_for_categorical_data └── inf_for_categorical_data.Rmd ├── 2.2_confidence_intervals └── confidence_intervals.Rmd ├── 4.1_two_armed_bandit └── two_armed_bandit.Rmd ├── 1.3_probability └── probability.Rmd ├── 2.1_sampling_distributions └── sampling_distributions.Rmd ├── 3.1_simple_regression └── simple_regression.Rmd ├── 1.2_intro_to_data └── intro_to_data.Rmd ├── 3.2_multiple_regression └── multiple_regression.Rmd ├── 1.1_intro_to_r └── intro_to_r.Rmd └── 4.3_bayesian_inference └── bayesian_inference.Rmd /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-slate -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | *cache 6 | 4.4_bayes_regression/bayes_regression_Coursera_files/ 7 | *.html 8 | -------------------------------------------------------------------------------- /statsr_labs.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: knitr 13 | LaTeX: pdfLaTeX 14 | 15 | BuildType: Makefile 16 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | RMD_FILES := $(shell find ./ -name *.Rmd) 2 | HTML_FILES := $(subst Rmd,html,$(RMD_FILES)) 3 | LAB_DIRS := $(dir $(RMD_FILES)) 4 | 5 | 6 | all: $(HTML_FILES) 7 | 8 | $(HTML_FILES): %.html: %.Rmd 9 | @echo "Compiling $(@F)" 10 | @cd $(@D); Rscript -e "library(rmarkdown);render('$(% 16 | select(weight2, height3, sex, exerany2, fruit1, fvgreen) %>% 17 | mutate(weight2 = as.numeric(as.character(weight2))) %>% 18 | filter(height3 >= 200 & height3 <= 711) %>% 19 | filter(weight2 > 50 & weight2 < 500) %>% 20 | filter(fruit1 %in% c(0,101:109)) %>% 21 | filter(fvgreen %in% c(0,101:109)) %>% 22 | na.omit() %>% 23 | transmute(weight = weight2, 24 | height = floor(height3/100)*12+height3 %% 100, 25 | sex = sex, 26 | exercise = exerany2, 27 | fruit_per_day = fruit1 %% 100, 28 | vege_per_day = fvgreen %% 100) %>% 29 | tbl_df() 30 | 31 | 32 | n_male = rbinom(1, n, 0.4862) 33 | n_female = n - n_male 34 | 35 | brfss_male = brfss %>% filter(sex == "Male") %>% sample_n(n_male) 36 | brfss_female = brfss %>% filter(sex == "Female") %>% sample_n(n_female) 37 | 38 | brfss = rbind(brfss_male, brfss_female) %>% sample_n(n) 39 | 40 | save(brfss, file="brfss.rda") 41 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Statistics with R Specialization 2 | 3 | This repository contains the most recent versions of all R Markdown based labs for the [Statistics with R](https://www.coursera.org/specializations/statistics) Coursera specialization. 4 | 5 | ### 1. 
Introduction to Probability and Data 6 | 7 | - Lab 1 - [Introduction to R](1.1_intro_to_r/intro_to_r.Rmd) 8 | - Lab 2 - [Introduction to Data](1.2_intro_to_data/intro_to_data.Rmd) 9 | - Lab 3 - [Probability](1.3_probability/probability.Rmd) 10 | 11 | ### 2. Inferential Statistics 12 | 13 | - Lab 1 - [Sampling Distributions](2.1_sampling_distributions/sampling_distributions.Rmd) 14 | - Lab 2 - [Confidence Intervals](2.2_confidence_intervals/confidence_intervals.Rmd) 15 | - Lab 3 - [Inference for Numerical Data](2.3_inf_for_numerical_data/inf_for_numerical_data.Rmd) 16 | - Lab 4 - [Inference for Categorical Data](2.4_inf_for_categorical_data/inf_for_categorical_data.Rmd) 17 | 18 | ### 3. Linear Regression and Modeling 19 | 20 | - Lab 1 - [Simple Linear Regression](3.1_simple_regression/simple_regression.Rmd) 21 | - Lab 2 - [Multiple Linear Regression](3.2_multiple_regression/multiple_regression.Rmd) 22 | 23 | ### 4. Bayesian Statistics 24 | 25 | - Lab 1 - [Two Armed Bandit](4.1_two_armed_bandit/two_armed_bandit.Rmd) 26 | - Lab 2 - [Conjugate Priors](4.2_conjugate_priors/credible_interval.Rmd) 27 | - Lab 3 - [Bayesian Inference](4.3_bayesian_inference/bayesian_inference.Rmd) 28 | - Lab 4 - [Bayesian Linear Regression](4.4_bayes_regression/bayes_regression.Rmd) 29 | -------------------------------------------------------------------------------- /2.3_inf_for_numerical_data/inf_for_numerical_data.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Inference for numerical data" 3 | output: statsr:::statswithr_lab 4 | --- 5 | 6 |
7 | Complete all **Exercises**, and submit answers to **Questions** on the Coursera 8 | platform. 9 |
10 | 11 | ## Getting Started 12 | 13 | ### Load packages 14 | 15 | In this lab we will explore the data using the `dplyr` package and visualize it 16 | using the `ggplot2` package for data visualization. The data can be found in the 17 | companion package for this course, `statsr`. 18 | 19 | Let's load the packages. 20 | 21 | ```{r load-packages, message=FALSE} 22 | library(statsr) 23 | library(dplyr) 24 | library(ggplot2) 25 | ``` 26 | 27 | ### The data 28 | 29 | In 2004, the state of North Carolina released a large data set containing 30 | information on births recorded in this state. This data set is useful to 31 | researchers studying the relation between habits and practices of expectant 32 | mothers and the birth of their children. We will work with a random sample of 33 | observations from this data set. 34 | 35 | Load the `nc` data set into our workspace. 36 | 37 | ```{r load-data} 38 | data(nc) 39 | ``` 40 | 41 | We have observations on 13 different variables, some categorical and some 42 | numerical. The meaning of each variable is as follows. 43 | 44 | variable | description 45 | ---------------- | --------------------------------------------- 46 | `fage` | father's age in years. 47 | `mage` | mother's age in years. 48 | `mature` | maturity status of mother. 49 | `weeks` | length of pregnancy in weeks. 50 | `premie` | whether the birth was classified as premature (premie) or full-term. 51 | `visits` | number of hospital visits during pregnancy. 52 | `marital` | whether mother is `married` or `not married` at birth. 53 | `gained` | weight gained by mother during pregnancy in pounds. 54 | `weight` | weight of the baby at birth in pounds. 55 | `lowbirthweight` | whether baby was classified as low birthweight (`low`) or not (`not low`). 56 | `gender` | gender of the baby, `female` or `male`. 57 | `habit` | status of the mother as a `nonsmoker` or a `smoker`. 58 | `whitemom` | whether mom is `white` or `not white`. 59 | 60 |
61 | There are 1,000 cases in this data set, what do the cases represent? 62 | 63 | * The hospitals where the births took place 64 | * The fathers of the children 65 | * The days of the births 66 | * The births 67 |
68 | 69 | As a first step in the analysis, we should take a look at the variables in the dataset. 70 | This can be done using the `str` command: 71 | 72 | ```{r str} 73 | str(nc) 74 | ``` 75 | 76 | As you review the variable summaries, consider which variables are categorical and which 77 | are numerical. For numerical variables, are there outliers? If you aren't sure or want to 78 | take a closer look at the data, make a graph. 79 | 80 | ## Exploratory data analysis 81 | 82 | We will first start with analyzing the weight gained by mothers throughout the 83 | pregnancy: `gained`. 84 | 85 | Using visualization and summary statistics, describe the distribution of weight 86 | gained by mothers during pregnancy. The `summary` function can also be useful. 87 | 88 | ```{r summary} 89 | summary(nc$gained) 90 | ``` 91 | 92 |
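As a hedged starting point (one of many valid approaches), the sketch below computes a few summary statistics and draws a histogram of `gained`; the choice of `binwidth = 5` is arbitrary.

```{r gained-eda-sketch}
# Summary statistics for weight gained (NAs removed), plus a simple histogram
nc %>%
  summarise(mean_gained = mean(gained, na.rm = TRUE),
            median_gained = median(gained, na.rm = TRUE),
            sd_gained = sd(gained, na.rm = TRUE))
ggplot(data = nc, aes(x = gained)) +
  geom_histogram(binwidth = 5)
```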
93 | How many mothers are we missing weight gain data from? 94 | 95 | * 0 96 | * 13 97 | * 27 98 | * 31 99 |
100 | 101 | Next, consider the possible relationship between a mother's smoking habit and the 102 | weight of her baby. Plotting the data is a useful first step because it helps 103 | us quickly visualize trends, identify strong associations, and develop research 104 | questions. 105 | 106 |
107 | Make side-by-side boxplots of `habit` and `weight`. Which of the following is 108 | false about the relationship between habit and weight? 109 | 110 | * Median birth weight of babies born to non-smoker mothers is slightly higher than that of babies born to smoker mothers. 111 | * Range of birth weights of babies born to non-smoker mothers is greater than that of babies born to smoker mothers. 112 | * Both distributions are extremely right skewed. 113 | * The IQRs of the distributions are roughly equal. 114 |
115 | ```{r habit-weight-box} 116 | # type your code for the Question 3 here, and Knit 117 | 118 | ``` 119 | 120 | The box plots show how the medians of the two distributions compare, but we can 121 | also compare the means of the distributions using the following to 122 | first group the data by the `habit` variable, and then calculate the mean 123 | `weight` in these groups using the `mean` function. 124 | 125 | ```{r by-means} 126 | nc %>% 127 | group_by(habit) %>% 128 | summarise(mean_weight = mean(weight)) 129 | ``` 130 | 131 | There is an observed difference, but is this difference statistically 132 | significant? In order to answer this question we will conduct a hypothesis 133 | test. 134 | 135 | ## Inference 136 | 137 | 138 |
Are all conditions necessary for inference satisfied? Comment on each. You can compute the group sizes using the same `group_by` and `summarise` pipeline shown above, but replacing `mean(weight)` with `n()` (a sketch of this check appears just below).
141 | 142 |
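A minimal sketch of the group-size check described in the exercise above (the column name `group_size` is just illustrative):

```{r habit-group-sizes-sketch}
# Sample sizes in the smoker and nonsmoker groups
nc %>%
  group_by(habit) %>%
  summarise(group_size = n())
```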
What are the hypotheses for testing if the average weights of babies born to 144 | smoking and non-smoking mothers are different? 145 | 146 | * $H_0: \mu_{smoking} = \mu_{non-smoking}$; $H_A: \mu_{smoking} > \mu_{non-smoking}$ 147 | * $H_0: \mu_{smoking} = \mu_{non-smoking}$; $H_A: \mu_{smoking} \ne \mu_{non-smoking}$ 148 | * $H_0: \bar{x}_{smoking} = \bar{x}_{non-smoking}$; $H_A: \bar{x}_{smoking} > \bar{x}_{non-smoking}$ 149 | * $H_0: \bar{x}_{smoking} = \bar{x}_{non-smoking}$; $H_A: \bar{x}_{smoking} \ne \bar{x}_{non-smoking}$ 150 | * $H_0: \mu_{smoking} \ne \mu_{non-smoking}$; $H_A: \mu_{smoking} = \mu_{non-smoking}$
152 | 153 | Next, we introduce a new function, `inference`, that we will use for conducting 154 | hypothesis tests and constructing confidence intervals. 155 | 156 | Then, run the following: 157 | 158 | ```{r inf-weight-habit-ht, tidy=FALSE} 159 | inference(y = weight, x = habit, data = nc, statistic = "mean", type = "ht", null = 0, 160 | alternative = "twosided", method = "theoretical") 161 | ``` 162 | 163 | Let's pause for a moment to go through the arguments of this custom function. 164 | The first argument is `y`, which is the response variable that we are 165 | interested in: `weight`. The second argument is the explanatory variable, 166 | `x`, which is the variable that splits the data into two groups, smokers and 167 | non-smokers: `habit`. The third argument, `data`, is the data frame these 168 | variables are stored in. Next is `statistic`, which is the sample statistic 169 | we're using, or similarly, the population parameter we're estimating. In future labs 170 | we can also work with "median" and "proportion". Next we decide on the `type` of inference 171 | we want: a hypothesis test (`"ht"`) or a confidence interval (`"ci"`). When performing a 172 | hypothesis test, we also need to supply the `null` value, which in this case is `0`, 173 | since the null hypothesis sets the two population means equal to each other. 174 | The `alternative` hypothesis can be `"less"`, `"greater"`, or `"twosided"`. 175 | Lastly, the `method` of inference can be `"theoretical"` or `"simulation"` based. 176 | 177 | For more information on the inference function see the help file with `?inference`. 178 | 179 |
180 | What is the conclusion of the hypothesis test? 181 |
182 | 183 |
Change the `type` argument to `"ci"` to construct and record a confidence 185 | interval for the difference between the weights of babies born to nonsmoking and 186 | smoking mothers, and interpret this interval in the context of the data. Note that by 187 | default you'll get a 95% confidence interval. If you want to change the 188 | confidence level, add a new argument (`conf_level`) which takes on a value 189 | between 0 and 1. Also note that when constructing a confidence interval, arguments like 190 | `null` and `alternative` are not useful, so make sure to remove them. 191 | 192 | * We are 95% confident that babies born to nonsmoker mothers are on average 0.05 to 0.58 pounds lighter at birth than babies born to smoker mothers. 193 | * We are 95% confident that the difference in average weights of babies whose moms are smokers and nonsmokers is between 0.05 to 0.58 pounds. 194 | * We are 95% confident that the difference in average weights of babies in this sample whose moms are smokers and nonsmokers is between 0.05 to 0.58 pounds. 195 | * We are 95% confident that babies born to nonsmoker mothers are on average 0.05 to 0.58 pounds heavier at birth than babies born to smoker mothers.
197 | ```{r habit-weight-ci} 198 | # type your code for the Question 5 here, and Knit 199 | 200 | ``` 201 | 202 | By default the function reports an interval for ($\mu_{nonsmoker} - \mu_{smoker}$) 203 | . We can easily change this order by using the `order` argument: 204 | 205 | ```{r inf-weight-habit-ci, tidy=FALSE} 206 | inference(y = weight, x = habit, data = nc, statistic = "mean", type = "ci", 207 | method = "theoretical", order = c("smoker","nonsmoker")) 208 | ``` 209 | 210 |
211 | Calculate a 99% confidence interval for the average length of pregnancies 212 | (`weeks`). Note that since you're doing inference on a single population 213 | parameter, there is no explanatory variable, so you can omit the `x` variable 214 | from the function. Which of the following is the correct interpretation of this 215 | interval? 216 |
217 | 218 | ```{r weeks-ci-99} 219 | # type your code for Question 6 here, and Knit 220 | 221 | ``` 222 | 223 |
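For reference, one way such a call might look, assuming the `inference` arguments described above; this is only a sketch, so adapt it to your own analysis.

```{r weeks-ci-99-sketch, eval=FALSE}
# 99% confidence interval for the average length of pregnancy (single mean, no x)
inference(y = weeks, data = nc, statistic = "mean", type = "ci",
          conf_level = 0.99, method = "theoretical")
```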
Calculate a new confidence interval for the same parameter at the 90% confidence level. Comment on the width of this interval versus the one obtained in the previous exercise.
226 | ```{r weeks-ci-90} 227 | # type your code for the Exercise here, and Knit 228 | 229 | ``` 230 | 231 |
Conduct a hypothesis test evaluating whether the average weight gained by younger mothers is different from the average weight gained by mature mothers.
234 | ```{r gained-mature-ht} 235 | # type your code for the Exercise here, and Knit 236 | 237 | ``` 238 | 239 | 7. Now, a non-inference task: Determine the age cutoff for younger and mature 240 | mothers. Use a method of your choice, and explain how your method works. 241 | ```{r cutoff-mature} 242 | # type your code for Question 7 here, and Knit 243 | 244 | ``` 245 | 246 |
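One possible (not the only) method for the age-cutoff task is sketched below: look at the range of `mage` within each `mature` group; the cutoff lies between the oldest "younger mom" and the youngest "mature mom".

```{r mature-cutoff-sketch}
# Age range of mothers within each maturity classification
nc %>%
  group_by(mature) %>%
  summarise(youngest = min(mage, na.rm = TRUE), oldest = max(mage, na.rm = TRUE))
```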
Pick a pair of variables: one numerical (response) and one categorical 248 | (explanatory). Come up with a research question evaluating the relationship between 249 | these variables. Formulate the question in a way that it can be answered using a 250 | hypothesis test and/or a confidence interval. Answer your question using the `inference` 251 | function, report the statistical results, and also provide an explanation in 252 | plain language. Be sure to check all assumptions, state your $\alpha$ level, and conclude 253 | in context. (Note: Picking your own variables, coming up with a research question, 254 | and analyzing the data to answer this question is basically what you'll need to do for 255 | your project as well.)
257 | 258 | ```{r pick-your-own} 259 | # type your code for the Exercise here, and Knit 260 | 261 | ``` 262 | 263 | 264 |
265 | This is a product of OpenIntro that is released under a [Creative Commons 266 | Attribution-ShareAlike 3.0 Unported](http://creativecommons.org/licenses/by-sa/3.0). 267 | This lab was written for OpenIntro by Andrew Bray and Mine Çetinkaya-Rundel. 268 |
-------------------------------------------------------------------------------- /2.4_inf_for_categorical_data/inf_for_categorical_data.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Inference for categorical data" 3 | output: statsr:::statswithr_lab 4 | --- 5 | 6 |
7 | Complete all **Exercises**, and submit answers to **Questions** on the Coursera 8 | platform. 9 |
10 | 11 | ## Introduction 12 | 13 | In August of 2012, news outlets ranging from the [Washington Post](http://www.washingtonpost.com/national/on-faith/poll-shows-atheism-on-the-rise-in-the-us/2012/08/13/90020fd6-e57d-11e1-9739-eef99c5fb285_story.html) to the [Huffington Post](http://www.huffingtonpost.com/2012/08/14/atheism-rise-religiosity-decline-in-america_n_1777031.html) ran a story about the rise of atheism in America. The source for the story was a poll that asked people, "Irrespective of whether you attend a place of worship or not, would you say you are a religious person, not a religious person or a convinced atheist?" This type of question, which asks people to classify themselves in one way or another, is common in polling and generates categorical data. In this lab we take a look at the atheism survey and explore what's at play when making inference about population proportions using categorical data. 14 | 15 | ## Getting Started 16 | 17 | ### Load packages 18 | 19 | In this lab we will explore the data using the `dplyr` package and visualize it 20 | using the `ggplot2` package for data visualization. The data can be found in the 21 | companion package for this course, `statsr`. 22 | 23 | Let's load the packages. 24 | 25 | ```{r load-packages, message=FALSE} 26 | library(statsr) 27 | library(dplyr) 28 | library(ggplot2) 29 | ``` 30 | 31 | ### The survey 32 | 33 | The press release for the poll, conducted by WIN-Gallup International, can be accessed [here](http://www.wingia.com/web/files/richeditor/filemanager/Global_INDEX_of_Religiosity_and_Atheism_PR__6.pdf). 34 | 35 | Take a moment to review the report then address the following questions. 36 | 37 |
38 | How many people were interviewed for this survey? 39 | 40 | * A poll conducted by WIN-Gallup International surveyed 51,000 people from 57 countries. 41 | * A poll conducted by WIN-Gallup International surveyed 52,000 people from 57 countries. 42 | * A poll conducted by WIN-Gallup International surveyed 51,917 people from 57 countries. 43 | * A poll conducted by WIN-Gallup International surveyed 51,927 people from 57 countries. 44 |
45 | 46 |
47 | Which of the following methods were used to gather information? 48 | 49 | * Face to face 50 | * Telephone 51 | * Internet 52 | * All of the above 53 |
54 | 55 |
56 | True / False: In the first paragraph, several key findings are reported. These percentages appear to be **sample statistics**. 57 | 58 | * True 59 | * False 60 |
61 | 62 |
True / False: The title of the report is "Global Index of Religiosity and Atheism". To generalize the report's findings to the global human population, we must assume that the sample was a random sample of the entire population. This seems to be a reasonable assumption. 64 | 65 | * True 66 | * False
68 | 69 | ### The data 70 | 71 | Turn your attention to Table 6 (pages 15 and 16), which reports the sample size and response percentages for all 57 countries. While this is a useful format to summarize the data, we will base our analysis on the original data set of individual responses to the survey. Load this data set into R with the following command. 72 | 73 | ```{r load-data} 74 | data(atheism) 75 | ``` 76 | 77 |
78 | What does each row of Table 6 correspond to? 79 | 80 | * Countries 81 | * Individual Persons 82 | * Religions 83 |
84 | 85 |
86 | What does each row of `atheism` correspond to? 87 | 88 | * Countries 89 | * Individual Persons 90 | * Religions 91 |
92 | 93 | To investigate the link between these two ways of organizing this data, take a look at the estimated proportion of atheists in the United States. Towards the bottom of Table 6, we see that this is 5%. We should be able to come to the same number using the `atheism` data. 94 | 95 | Create a new data frame called `us12` that contains only the rows in `atheism` associated with respondents to the 2012 survey from the United States: 96 | 97 | ```{r us-atheism} 98 | us12 <- atheism %>% 99 | filter(nationality == "United States", year == "2012") 100 | ``` 101 | 102 |
Next, calculate the proportion of atheist responses in the United States in 2012, i.e. in `us12`. True / False: This percentage agrees with the percentage in Table 6. 105 | 106 | * True 107 | * False
109 | ```{r perc-atheist-us12} 110 | # type your code for Question 7 here, and Knit 111 | 112 | ``` 113 | 114 | 115 | ## Inference on proportions 116 | 117 | As was hinted earlier, Table 6 provides **sample statistics**, that is, calculations made from the sample of 51,927 people. What we'd like, though, is insight into the **population parameters**. You answer the question, "What proportion of people in your sample reported being atheists?" with a statistic, while the question "What proportion of people on earth would report being atheists?" is answered with an estimate of the parameter. 118 | 119 | The inferential tools for estimating population proportion are analogous to those used for means in the last lab: the confidence interval and the hypothesis test. 120 | 121 |
122 | Write out the conditions for inference to construct a 95% confidence interval for the proportion of atheists in the United States in 2012. Are you confident all conditions are met? 123 |
124 | 125 | If the conditions for inference are reasonable, we can either calculate the standard error and construct the interval by hand, or allow the `inference` function to do it for us. 126 | 127 | ```{r us-atheism-ci} 128 | inference(y = response, data = us12, statistic = "proportion", type = "ci", method = "theoretical", success = "atheist") 129 | ``` 130 | 131 | Note that since the goal is to construct an interval estimate for a proportion, it's necessary to specify what constitutes a ``success'', which here is a response of `atheist`. 132 | 133 | Although formal confidence intervals and hypothesis tests don't show up in the report, suggestions of inference appear at the bottom of page 7: "In general, the error margin for surveys of this kind is $\pm$ 3-5% at 95% confidence." 134 | 135 |
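For comparison, a by-hand version of roughly the same interval is sketched below, assuming a response of `atheist` counts as a success and using the familiar 1.96 critical value; the object names are only illustrative.

```{r us-atheism-ci-by-hand-sketch}
# Hand-computed 95% confidence interval for the proportion of atheists in us12
n_us <- nrow(us12)
p_hat <- mean(us12$response == "atheist")
se <- sqrt(p_hat * (1 - p_hat) / n_us)
c(lower = p_hat - 1.96 * se, upper = p_hat + 1.96 * se)
```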
136 | Imagine that, after reading a front page story about the latest public opinion poll, a family member asks you, "What is a margin of error?" In one sentence, and ignoring the mechanics behind the calculation, how would you respond in a way that conveys the general concept? 137 |
138 | 139 |
Based on the R output, what is the margin of error for the estimate of the proportion of atheists in the US in 2012? 141 | 142 | * The margin of error for the estimate of the proportion of atheists in the US in 2012 is 0.05. 143 | * The margin of error for the estimate of the proportion of atheists in the US in 2012 is 0.025. 144 | * The margin of error for the estimate of the proportion of atheists in the US in 2012 is 0.0135.
146 | ```{r me-perc-atheist-us12} 147 | # type your code for Question 8 here, and Knit 148 | 149 | ``` 150 | 151 |
152 | Using the inference function, calculate confidence intervals for the proportion of atheists in 2012 in two other countries of your choice, and report the associated margins of error. Be sure to note whether the conditions for inference are met. It may be helpful to create new data sets for each of the two countries first, and then use these data sets in the `inference` function to construct the confidence intervals. 153 |
154 | ```{r me-perc-atheist-other-countries} 155 | # type your code for the Exercise here, and Knit 156 | 157 | ``` 158 | 159 | ## How does the proportion affect the margin of error? 160 | 161 | Imagine you've set out to survey 1000 people on two questions: are you female? and are you left-handed? Since both of these sample proportions were calculated from the same sample size, they should have the same margin of error, right? Wrong! While the margin of error does change with sample size, it is also affected by the proportion. 162 | 163 | Think back to the formula for the standard error: $SE = \sqrt{p(1-p)/n}$. This is then used in the formula for the margin of error for a 95% confidence interval: $ME = 1.96\times SE = 1.96\times\sqrt{p(1-p)/n}$. Since the population proportion $p$ is in this $ME$ formula, it should make sense that the margin of error is in some way dependent on the population proportion. We can visualize this relationship by creating a plot of $ME$ vs. $p$. 164 | 165 | The first step is to make a vector `p` that is a sequence from $0$ to $1$ with each number separated by $0.01$. We can then create a vector of the margin of error (`me`) associated with each of these values of `p` using the familiar approximate formula ($ME = 1.96 \times SE$). Lastly, we plot the two vectors against each other to reveal their relationship. 166 | 167 | ```{r me-plot} 168 | d <- data.frame(p = seq(0, 1, 0.01)) 169 | n <- 1000 170 | d <- d %>% 171 | mutate(me = 1.96*sqrt(p*(1 - p)/n)) 172 | ggplot(d, aes(x = p, y = me)) + 173 | geom_line() 174 | ``` 175 | 176 |
Which of the following is false about the relationship between $p$ and $ME$? 178 | 179 | * The $ME$ reaches a minimum at $p = 0$. 180 | * The $ME$ reaches a minimum at $p = 1$. 181 | * The $ME$ is maximized when $p = 0.5$. 182 | * The most conservative estimate when calculating a confidence interval occurs when $p$ is set to 1.
184 | 185 | The question of atheism was asked by WIN-Gallup International in a similar survey that was conducted in 2005. We assume here that sample sizes have remained the same. Table 4 on page 13 of the report summarizes survey results from 2005 and 2012 for 39 countries. 186 | 187 | Answer the following two questions using the `inference` function. As always, write out the hypotheses for any tests you conduct and outline the status of the conditions for inference. 188 | 189 |
190 | True / False: There is convincing evidence that Spain has seen a change in its atheism index between 2005 and 2012.

*Hint:* Create a new data set for respondents from Spain. Then use their responses as the response (`y`) variable in the `inference` function, and use `year` as the grouping (`x`) variable. 191 | 192 | * True 193 | * False
195 | ```{r spain-05-12} 196 | # type your code for Question 10 here, and Knit 197 | 198 | ``` 199 | 200 |
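A sketch of how the comparison for one country might be set up, assuming `year` should be treated as a categorical grouping variable (whether the explicit factor conversion is needed may depend on your version of `statsr`):

```{r spain-change-sketch, eval=FALSE}
# Two-proportion hypothesis test comparing Spain's atheism index in 2005 vs 2012
spain <- atheism %>%
  filter(nationality == "Spain") %>%
  mutate(year = as.factor(year))
inference(y = response, x = year, data = spain, statistic = "proportion",
          type = "ht", null = 0, alternative = "twosided",
          method = "theoretical", success = "atheist")
```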
201 | True / False: There is convincing evidence that the United States has seen a change in its atheism index between 2005 and 2012. 202 | 203 | * True 204 | * False 205 |
206 | ```{r us-05-12} 207 | # type your code for Question 11 here, and Knit 208 | 209 | ``` 210 | 211 | 212 |
213 | If in fact there has been no change in the atheism index in the countries listed in Table 4, in how many of those countries would you expect to detect a change (at a significance level of 0.05) simply by chance?

*Hint:* Type 1 error. 214 | 215 | * 0 216 | * 1 217 | * 1.95 218 | * 5 219 |
220 | 221 | ```{r type1} 222 | # type your code for Question 12 here, and Knit 223 | 224 | ``` 225 | 226 |
227 | Suppose you're hired by the local government to estimate the proportion of residents that attend a religious service on a weekly basis. According to the guidelines, the estimate must have a margin of error no greater than 1% with 95% confidence. You have no idea what to expect for $p$. How many people would you have to sample to ensure that you are within the guidelines?

*Hint:* Refer to your plot of the relationship between $p$ and margin of error. Do not use the data set to answer this question. 228 | 229 | * 2401 people 230 | * At least 2401 people 231 | * 9604 people 232 | * At least 9604 people 233 |
234 | 235 | ```{r sample-size} 236 | # type your code for Question 13 here, and Knit 237 | 238 | ``` 239 | 240 |
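One way to reason about the required sample size is to rearrange the margin-of-error formula used earlier; the sketch below shows only the algebra, with the numbers left for you to plug in:

$$ ME = z^\star \sqrt{\frac{p(1-p)}{n}} \quad\Longrightarrow\quad n = \frac{(z^\star)^2 \, p(1-p)}{ME^2}, $$

where the most conservative choice is $p = 0.5$, since that maximizes $p(1-p)$.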
241 | This is a product of OpenIntro that is released under a [Creative Commons 242 | Attribution-ShareAlike 3.0 Unported](http://creativecommons.org/licenses/by-sa/3.0). 243 | This lab was written for OpenIntro by Andrew Bray and Mine Çetinkaya-Rundel. 244 |
-------------------------------------------------------------------------------- /2.2_confidence_intervals/confidence_intervals.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Foundations for inference - Confidence intervals" 3 | output: statsr:::statswithr_lab 4 | --- 5 | 6 |
7 | Complete all **Exercises**, and submit answers to **Questions** on the Coursera 8 | platform. 9 |
10 | 11 | ## Introduction 12 | 13 | If you have access to data on an entire population, say the size of every 14 | house in Ames, Iowa, it's straightforward to answer questions like, "How big 15 | is the typical house in Ames?" and "How much variation is there in sizes of 16 | houses?". If you have access to only a sample of the population, as is often 17 | the case, the task becomes more complicated. What is your best guess for the 18 | typical size if you only know the sizes of several dozen houses? This sort of 19 | situation requires that you use your sample to make inference on what your 20 | population looks like. 21 | 22 | ### Setting a seed 23 | 24 | We will take some random samples and calculate confidence intervals based 25 | on these samples in this lab, which means you should set a seed at the top of your lab. If 26 | this concept is new to you, review the previous lab and ask your TA. 27 | 28 | Setting a seed will cause R to select the same sample each time you knit your document. 29 | This will make sure your results don't change each time you knit, and it will also 30 | ensure reproducibility of your work (by setting the same seed it will be possible to 31 | reproduce your results). You can set a seed like this: 32 | ```{r set-seed} 33 | set.seed(9102015) # make sure to change the seed 34 | ``` 35 | The number above is completely arbitrary. If you need inspiration, you can use your 36 | ID, birthday, or just a random string of numbers. The important thing is that you 37 | use each seed only once. You only need to do this once in your R Markdown document, 38 | but make sure it comes before sampling. 39 | 40 | 41 | 42 | ## Getting Started 43 | 44 | ### Load packages 45 | 46 | In this lab we will explore the data using the `dplyr` package and visualize it 47 | using the `ggplot2` package for data visualization. The data can be found in the 48 | companion package for this course, `statsr`. 49 | 50 | Let's load the packages. 51 | 52 | ```{r load-packages, message=FALSE} 53 | library(statsr) 54 | library(dplyr) 55 | library(tidyr) 56 | library(ggplot2) 57 | ``` 58 | 59 | ### The data 60 | 61 | We consider real estate data from the city of Ames, Iowa. This is the same 62 | dataset used in the previous lab. The details of 63 | every real estate transaction in Ames are recorded by the City Assessor's 64 | office. Our particular focus for this lab will be all residential home sales 65 | in Ames between 2006 and 2010. This collection represents our population of 66 | interest. In this lab we would like to learn about these home sales by taking 67 | smaller samples from the full population. Let's load the data. 68 | 69 | ```{r load-data} 70 | data(ames) 71 | ``` 72 | 73 | In this lab we'll start with a simple random sample of size 60 from the 74 | population. Note that 75 | the data set has information on many housing variables, but for the first 76 | portion of the lab we'll focus on the size of the house, represented by the 77 | variable `area`. 78 | 79 | ```{r sample} 80 | n <- 60 81 | samp <- sample_n(ames, n) 82 | ``` 83 | 84 |
Describe the distribution of home areas in your sample. What would you 86 | say is the "typical" size within your sample? Also state precisely what you 87 | interpreted "typical" to mean.
89 | ```{r describe-sample} 90 | # type your code for the Exercise here, and Knit 91 | 92 | ``` 93 | 94 |
95 | True or False: My distribution should be similar to others' distributions who also collect random samples from this population, but it is likely not exactly the same since it's a random sample. 96 | 97 | * True. 98 | * False. 99 |
100 | 101 | ## Confidence intervals 102 | 103 | Return for a moment to the question that first motivated this lab: based on 104 | this sample, what can we infer about the population? Based only on this single 105 | sample, the best estimate of the average living area of houses sold in Ames 106 | would be the sample mean, usually denoted as $\bar{x}$ (here we're calling it 107 | `x_bar`). That serves as a good **point estimate** but it would be useful 108 | to also communicate how uncertain we are of that estimate. This uncertainty 109 | can be quantified using a **confidence interval**. 110 | 111 | A confidence interval for a population mean is of the following form 112 | \[ \bar{x} \pm z^\star \frac{s}{\sqrt{n}} \] 113 | 114 | You should by now be comfortable with calculating the mean and standard deviation of 115 | a sample in R. And we know that the sample size is 60. So the only remaining building 116 | block is finding the appropriate critical value for a given confidence level. We can 117 | use the `qnorm` function for this task, which will give the critical value associated 118 | with a given percentile under the normal distribution. Remember that confidence levels 119 | and percentiles are not equivalent. For example, a 95% confidence level refers to the 120 | middle 95% of the distribution, and the critical value associated with this area will 121 | correspond to the 97.5th percentile. 122 | 123 | We can find the critical value for a 95% confidence interval using 124 | ```{r z_star_95} 125 | z_star_95 <- qnorm(0.975) 126 | z_star_95 127 | ``` 128 | which is roughly equal to the critical value of 1.96 that you're likely 129 | familiar with by now. 130 | 131 | Let's finally calculate the confidence interval: 132 | ```{r ci} 133 | samp %>% 134 | summarise(lower = mean(area) - z_star_95 * (sd(area) / sqrt(n)), 135 | upper = mean(area) + z_star_95 * (sd(area) / sqrt(n))) 136 | ``` 137 | 138 | To recap: even though we don't know what the full population looks like, we're 95% 139 | confident that the true average size of houses in Ames lies between the values *lower* 140 | and *upper*. There are a few conditions that must be met for this interval to be valid. 141 | 142 |
143 | For the confidence interval to be valid, the sample mean must be normally distributed and have standard error $s / \sqrt{n}$. Which of the following is not a condition needed for this to be true? 144 | 145 | * The sample is random. 146 | * The sample size, 60, is less than 10% of all houses. 147 | * The sample distribution must be nearly normal. 148 |
149 | 150 | 151 | ## Confidence levels 152 | 153 |
154 | What does "95% confidence" mean? 155 | 156 | * 95% of the time the true average area of houses in Ames, Iowa, will be in this interval. 157 | * 95% of random samples of size 60 will yield confidence intervals that contain the true average area of houses in Ames, Iowa. 158 | * 95% of the houses in Ames have an area in this interval. 159 | * 95% confident that the sample mean is in this interval. 160 |
161 | 162 | In this case we have the rare luxury of knowing the true population mean since we 163 | have data on the entire population. Let's calculate this value so that 164 | we can determine if our confidence intervals actually capture it. We'll store it in a 165 | data frame called `params` (short for population parameters), and name it `mu`. 166 | 167 | ```{r pop-mean} 168 | params <- ames %>% 169 | summarise(mu = mean(area)) 170 | ``` 171 | 172 |
173 | Does your confidence interval capture the true average size of houses in 174 | Ames? 175 |
176 | ```{r check-ci-contain-true-mean} 177 | # type your code for the Exercise here, and Knit 178 | 179 | ``` 180 | 181 |
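A minimal sketch of one way to check this, recomputing the bounds from `samp` and comparing them against `params$mu` (object names are illustrative):

```{r check-capture-sketch}
# Does the 95% interval from our sample contain the true population mean?
ci_samp <- samp %>%
  summarise(lower = mean(area) - z_star_95 * (sd(area) / sqrt(n)),
            upper = mean(area) + z_star_95 * (sd(area) / sqrt(n)))
ci_samp$lower < params$mu & params$mu < ci_samp$upper
```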
182 | What proportion of 95% confidence intervals would you expect to capture the true population mean? 183 | 184 | * 1% 185 | * 5% 186 | * 95% 187 | * 99% 188 |
189 | 190 | Using R, we're going to collect many samples to learn more about how sample 191 | means and confidence intervals vary from one sample to another. 192 | 193 | Here is the rough outline: 194 | 195 | - Obtain a random sample. 196 | - Calculate the sample's mean and standard deviation, and use these to calculate 197 | and store the lower and upper bounds of the confidence intervals. 198 | - Repeat these steps 50 times. 199 | 200 | We can accomplish this using the `rep_sample_n` function. The following lines of 201 | code takes 50 random samples of size `n` from population (and remember we defined 202 | $n = 60$ earlier), and computes the upper and lower bounds of the confidence intervals based on these samples. 203 | 204 | ```{r calculate-50-cis} 205 | ci <- ames %>% 206 | rep_sample_n(size = n, reps = 50, replace = TRUE) %>% 207 | summarise(lower = mean(area) - z_star_95 * (sd(area) / sqrt(n)), 208 | upper = mean(area) + z_star_95 * (sd(area) / sqrt(n))) 209 | ``` 210 | 211 | Let's view the first five intervals: 212 | 213 | ```{r first-five-intervals} 214 | ci %>% 215 | slice(1:5) 216 | ``` 217 | 218 | Next we'll create a plot similar to Figure 4.8 on page 175 of [OpenIntro Statistics, 3rd 219 | Edition](https://www.openintro.org/os). First step will be to create a new variable in 220 | the `ci` data frame that indicates whether the interval does or does not capture the 221 | true population mean. Note that capturing this value would mean the lower bound of the 222 | confidence interval is below the value and upper bound of the confidence interval is 223 | above the value. Remember that we create new variables using the `mutate` function. 224 | 225 | ```{r capture-mu} 226 | ci <- ci %>% 227 | mutate(capture_mu = ifelse(lower < params$mu & upper > params$mu, "yes", "no")) 228 | ``` 229 | 230 | The `ifelse` function is new. It takes three arguments: first is a logical statement, 231 | second is the value we want if the logical statement yields a true result, and the 232 | third is the value we want if the logical statement yields a false result. 233 | 234 | We now have all the information we need to create the plot, but we need to re-organize 235 | our data a bit for easy plotting. Specifically, we need to organize the data in a new 236 | data frame where each row represents one bound, as opposed to one interval. So this 237 | 238 | ~~~ 239 | lower upper capture_mu 240 | 1 1350.540 1544.360 yes 241 | 2 1333.441 1584.425 yes 242 | 3 1412.133 1663.801 yes 243 | ... 244 | ~~~ 245 | 246 | should instead look something like 247 | 248 | ~~~ 249 | replicate type bound capture_mu 250 | 1 1 lower 1350.540 yes 251 | 2 2 lower 1333.441 yes 252 | 3 3 lower 1412.133 yes 253 | 4 1 upper 1544.360 yes 254 | 5 2 upper 1584.425 yes 255 | 6 3 upper 1663.801 yes 256 | ... 257 | ~~~ 258 | 259 | We can accomplish this using the following: 260 | 261 | ```{r create-ci-data-for-plot} 262 | ci_data <- gather(ci, type, bound, lower:upper) 263 | ``` 264 | 265 | And finally we can create the plot using the following: 266 | 267 | ```{r plot-ci} 268 | ggplot(data = ci_data, aes(x = bound, y = replicate, 269 | group = replicate, color = capture_mu)) + 270 | geom_point(size = 2) + # add points at the ends, size = 2 271 | geom_line() + # connect with lines 272 | geom_vline(xintercept = params$mu, color = "darkgray") # draw vertical line 273 | ``` 274 | 275 |
276 | What proportion of your confidence intervals include the true population mean? 277 | Is this proportion exactly equal to the confidence level? If not, explain why. 278 |
279 | 280 | 281 |
282 | What is the appropriate critical value for a 99% confidence level? 283 | 284 | * 0.01 285 | * 0.99 286 | * 1.96 287 | * 2.33 288 | * 2.58 289 |
290 | ```{r find-99-perc-crit-val} 291 | # type your code for the Question 5 here, and Knit 292 | 293 | ``` 294 | 295 |
296 | Calculate 50 confidence intervals at the 99% confidence level. You do not need to obtain new samples, simply calculate new intervals based on the sample means and standard deviations you have already collected. Plot all intervals and calculate the proportion of intervals that include the true population mean. 297 |
298 | ```{r plot-99-perc-cis} 299 | # type your code for the Exercise here, and Knit 300 | 301 | ``` 302 | 303 |
304 | We would expect 99% of the intervals to contain the true population mean. 305 | 306 | * True 307 | * False 308 |
309 | 310 | 311 |
312 | This is a product of OpenIntro that is released under a [Creative Commons 313 | Attribution-ShareAlike 3.0 Unported](http://creativecommons.org/licenses/by-sa/3.0). 314 | This lab was written for OpenIntro by Andrew Bray and Mine Çetinkaya-Rundel. 315 |
-------------------------------------------------------------------------------- /4.1_two_armed_bandit/two_armed_bandit.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Bayes' Rule and the Two Armed Bandit" 3 | runtime: shiny 4 | output: statsr:::statswithr_lab 5 | --- 6 | 7 | ```{r echo=FALSE, message=FALSE} 8 | library(tidyverse) 9 | library(statsr) 10 | ``` 11 | 12 |
13 | Complete all **Exercises**, and submit answers to **Questions** in the **Quiz: Week 1 Lab** on Coursera. 14 |
15 | 16 | ## Background 17 | 18 | Some people refer to slot machines as "One-armed Bandits" due to the older style 19 | of machine requiring the player to pull a mechanical handle to play. Statisticians 20 | and mathematicians often develop theories / models based on games of chance which 21 | turn out to be more generally useful. One general class of probability / optimization 22 | problems is known as the multi-armed bandit problem which is based on the following 23 | analogy: A player walks into a casino and sees a wall of slot machines. All of the 24 | machines pay out at different rates - some pay out more often than others, some 25 | pay out less often. Since the player does not know which machines are "good" and 26 | which are "bad", how should he / she play so as to make as much money (or at least 27 | lose as little) as possible? 28 | 29 | ## Simulated Slots 30 | 31 | Today we will examine a simplified case where there are only two machines (one "Good" 32 | and one "Bad"). We will also assume that we know the probability of winning on 33 | the "Good" machine and the probability of winning on the "Bad" machine - what we 34 | don't know is which machine is which. 35 | 36 | The Shiny App below will let you simulate playing slot machines when the 37 | probability of winning on the "good" machine is 1/2 and the probability of 38 | winning on the "bad" machine is 1/3. Each time you play, the App will "flip a 39 | coin" and randomly select either Machine 1 or Machine 2 to be the "good" machine, 40 | and the other to be the "bad" machine. 41 | 42 | 43 | ```{r bandit, echo=FALSE, error=TRUE} 44 | bandit_sim() 45 | ``` 46 | 47 |
Use the Shiny App above to play the slot machines a total of *10* times. You can 49 | divide your plays up however you like between the two machines. Once you have 50 | played 10 times, use the results of your plays to decide which machine you think 51 | has the better payout (i.e. the good machine) - click the button on the right 52 | that corresponds to your guess, and the App will tell you if you are right. Whether you 53 | were right or wrong, press the reset button, play again, and guess which 54 | machine you think is the good machine. As you are playing, think about what it 55 | is about your results that enabled you to make the correct guess.
57 | 58 |
59 | Press the Reset button again, now play *30* times and use those results to guess 60 | which machine is the good one. Do you think it was easier or harder to make a 61 | decision with the additional plays? Why do you think that is? 62 |
63 | 64 | Hopefully what you have observed is that as you played the slot machine, initially 65 | it was difficult to determine which machine was which. But as you played more, it 66 | became more and more clear. In particular, each time you played you naturally 67 | reassessed which machine you thought was good. With the initial handful of plays, 68 | your beliefs stayed close to 50-50, potentially with a small bias towards the 69 | machine you had won more on. By the time you got to 30 plays you should have had 70 | a very strong belief about which machine was the "good" one. 71 | 72 | This is the way in which we usually interact with the world - we try something 73 | and modify our mental model based on the outcome we have received. This idea of 74 | updating beliefs based on observed data is one of the core tenets of Bayesian 75 | statistics - in the following sections we will work through the probability 76 | calculations and see how they correspond with our intuitive understanding. 77 | 78 | 79 | ## Posterior Probabilities 80 | 81 | We will start by examining the result of playing just once. Imagine that you 82 | play Machine 1 and you win. What do we now know about the probability of the two 83 | machines being "good" or "bad"? It is reasonable to believe that each machine is 84 | equally likely to be the "good" machine, so we can express our belief as follows: 85 | 86 | $$P(M_1 \text{ is Good})=P(M_2 \text{ is Bad})=1/2$$ 87 | $$P(M_1 \text{ is Bad})=P(M_2 \text{ is Good})=1/2.$$ 88 | 89 | We have also been told that the probabilities of winning for each type of 90 | machine are: 91 | 92 | $$P(\text{Win on }M_1 ~|~ M_1 \text{ is Good}) = 1/2 \qquad P(\text{Win on }M_1 ~|~ M_1 \text{ is Bad}) = 1/3.$$ 93 | 94 | We can use these probabilities to calculate the probabilities of 95 | losing for each type of machine: 96 | 97 | $$P(\text{Lose on }M_1 ~|~ M_1 \text{ is Good}) = 1/2 \qquad P(\text{Lose on }M_1 ~|~ M_1 \text{ is Bad}) = 2/3.$$ 98 | 99 | Note that while these probabilities are all for Machine 1, they are exactly the 100 | same as the probabilities for Machine 2. We have seen how we can use Bayes' rule 101 | to calculate $P(M_1 \text{ is Good} ~|~ \text{Win on } M_1)$ 102 | 103 | $$ 104 | \begin{aligned} 105 | P(M_1 \text{ is Good} ~|~ \text{Win on } M_1) 106 | &= \frac{P(\text{Win on } M_1 ~|~ M_1 \text{ is Good})~P(M_1 \text{ is Good})}{P(\text{Win on } M_1)} \\ 108 | &= \frac{P(\text{Win on } M_1 ~|~ M_1 \text{ is Good})~P(M_1 \text{ is Good})}{P(\text{Win on } M_1 ~|~ M_1 \text{ is Good})~P(M_1 \text{ is Good})+P(\text{Win on } M_1 ~|~ M_1 \text{ is Bad})~P(M_1 \text{ is Bad})} \\ 110 | &= \frac{1/2 \times 1/2}{1/2 \times 1/2+1/3 \times 1/2} = 0.6 111 | \end{aligned} 112 | $$ 113 | 114 |
115 | Based on the preceding result, what is the probability that Machine 1 is "Bad" 116 | given you won a game playing on Machine 1? 117 | 118 | * 0.3 119 | * 0.4 120 | * 0.5 121 | * 0.6 122 | * 0.7 123 |
124 | 125 |
126 | Based on the preceding result, what is the probability that Machine 2 is "Good" 127 | given you won a game playing on Machine 1? 128 | 129 | * 0.3 130 | * 0.4 131 | * 0.5 132 | * 0.6 133 | * 0.7 134 |
135 | 136 |
137 | Under the Bayesian paradigm, which of the following correctly matches the 138 | probabilities with their names? 139 | 140 | 141 | * Posterior - $P(M_1 \text{ is Good} ~|~ \text{Win on } M_1)$
Prior - $P(M_1 \text{ is Good})$
Likelihood - $P(\text{Win on } M_1 ~|~ M_1 \text{ is Good})$ 142 | * Posterior - $P(M_1 \text{ is Good} ~|~ \text{Win on } M_1)$
Prior - $P(\text{Win on } M_1 ~|~ M_1 \text{ is Good})$
Likelihood - $P(M_1 \text{ is Good})$ 143 | * Posterior - $P(\text{Win on } M_1 ~|~ M_1 \text{ is Good})$
Prior - $P(M_1 \text{ is Good} ~|~ \text{Win on } M_1)$
Likelihood - $P(M_1 \text{ is Good})$ 144 | * Posterior - $P(\text{Win on } M_1 ~|~ M_1 \text{ is Good})$
Prior - $P(M_1 \text{ is Good})$
Likelihood - $P(M_1 \text{ is Good} ~|~ \text{Win on } M_1)$ 145 |
146 | 147 | 148 | 149 | ## Bayesian Updating 150 | 151 | We have implemented a function for calculating the posterior probability of 152 | Machine 1 and Machine 2 being the "good" machine after one or more plays of 153 | either machine. The function `bandit_posterior` expects a data frame representing 154 | your play history that contains two columns, `machine` which records which machine 155 | was played (e.g. either a 1 or 2) and `outcome` which records whether you won 156 | (`"W"`) or lost (`"L"`). An optional parameter to `bandit_posterior` is `prior`, 157 | a vector of length two that specifies the prior probability of each machine 158 | being "good". If left unspecified, equal prior probabilities (0.5, 0.5) are 159 | assumed. We can repeat the calculation from the previous section using the 160 | following code in the RStudio Console: 161 | 162 | ```{r} 163 | bandit_posterior(data = data.frame(machine = 1, outcome = "W")) 164 | ``` 165 | 166 | We can also use this function to calculate the posterior probabilities for 167 | additional plays, for example playing Machine 1 twice, first winning and then 168 | losing. 169 | 170 | ```{r} 171 | bandit_posterior(data = data.frame(machine = c(1,1), outcome = c("W","L"))) 172 | ``` 173 | 174 | We have discussed how the Bayesian approach allows for updating procedures where 175 | for each new data observation we are able to use the previous posterior 176 | probabilities as our new prior probabilities and thereby simplify the calculation 177 | (e.g. multiple simple updates can be used instead of one single large calculation). 178 | We can explore this process by **chaining** multiple calls to `bandit_posterior` 179 | together using `%>%`. With `prior = .` in the next call of `bandit_posterior`, 180 | we use the returned posterior values as the prior. 181 | 182 | ```{r} 183 | # Initiate the two plays on Machine 1 184 | data1 <- data.frame(machine = c(1), outcome = c("W")) 185 | data2 <- data.frame(machine = c(1), outcome = c("L")) 186 | 187 | # Use piping and bandit_posterior to calculuate the posterior probabilities after each play 188 | bandit_posterior(data1) %>% 189 | bandit_posterior(data2, prior = .) 190 | ``` 191 | 192 | Note that this exactly matches the probabilities we calculated when we provided 193 | the outcome of two plays all at once. 194 | 195 |
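To illustrate the optional `prior` argument described above, the sketch below starts from an unequal prior; that the first element corresponds to Machine 1 is an assumption based on the description above.

```{r custom-prior-sketch}
# Same single winning play on Machine 1, but starting from a 70/30 prior
# (first element assumed to be the prior probability that Machine 1 is "good")
bandit_posterior(data = data.frame(machine = 1, outcome = "W"),
                 prior = c(0.7, 0.3))
```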
196 | Use the `bandit_posterior` function to calculate the posterior probabilities of 197 | Machine 1 and 2 being "good" after playing Machine 1 twice and winning both 198 | times, and then playing Machine 2 three times with 2 wins then 1 loss. 199 | 200 | * $P(M_1\text{ is good}~|~\text{data}) = 0.250$,
$P(M_2\text{ is good}~|~\text{data}) = 0.750$ 201 | * $P(M_1\text{ is good}~|~\text{data}) = 0.429$,
$P(M_2\text{ is good}~|~\text{data}) = 0.571$ 202 | * $P(M_1\text{ is good}~|~\text{data}) = 0.571$,
$P(M_2\text{ is good}~|~\text{data}) = 0.429$ 203 | * $P(M_1\text{ is good}~|~\text{data}) = 0.750$,
$P(M_2\text{ is good}~|~\text{data}) = 0.250$ 204 |
205 | 206 | ```{r Q4} 207 | # Type your code for Question 4 here 208 | ``` 209 | 210 |
211 | What would the posterior probabilities be if we had instead played Machine 2 212 | first, playing three times with 2 wins and 1 loss, and then playing Machine 1 213 | twice and winning both times? 214 | 215 | * $P(M_1\text{ is good}~|~\text{data}) = 0.250$,
$P(M_2\text{ is good}~|~\text{data}) = 0.750$ 216 | * $P(M_1\text{ is good}~|~\text{data}) = 0.429$,
$P(M_2\text{ is good}~|~\text{data}) = 0.571$ 217 | * $P(M_1\text{ is good}~|~\text{data}) = 0.571$,
$P(M_2\text{ is good}~|~\text{data}) = 0.429$ 218 | * $P(M_1\text{ is good}~|~\text{data}) = 0.750$,
$P(M_2\text{ is good}~|~\text{data}) = 0.250$ 219 |
220 | 221 | ```{r Q5} 222 | # Type your code for Question 5 here 223 | ``` 224 | 225 | 226 |
227 | Confirm the updating property we discussed previously by connecting two calls of 228 | `bandit_posterior` in Question 4. The first call calculates the posterior 229 | probability for the first two plays on Machine 1. The second call should use 230 | these values as its prior and then calculate a new posterior using the data from 231 | the subsequent three plays on Machine 2. 232 |
233 | 234 | ```{r E3} 235 | # Type your code for Exercise 3 here 236 | ``` 237 | 238 | 239 | 240 | ## Back to the Bandits 241 | 242 | You may have notice that if you click on the Data tab in the middle of the App 243 | above you are given code for a data frame that represents the results of your 244 | plays within the machine. 245 | 246 |
247 | **Exercise 4**: Use this data frame with the `bandit_posterior` function to 248 | calculate the exact posterior probability of each machine being "good". Do these 249 | probabilities match with your intuition about which machine was good? 250 |
251 | 252 | ```{r E4} 253 | # Type your code for Exercise 4 here 254 | ``` 255 | 256 |
Reset the simulation and then play at least *50* times. Suppose you pause after 258 | every 10 plays and record the results of those 10 plays; that is, subset the data 259 | into `data[1:10, ]`, `data[11:20, ]`, and so on to represent the results of each block of 10 260 | plays. Then use the chaining method we discussed earlier to **update** the 261 | posterior probability **sequentially** after each additional 10 plays (see the sketch below). 262 | Observe how the posterior probability changes as more plays are made.
264 | 265 | ```{r E5} 266 | # Type your code for Exercise 5 here 267 | ``` 268 | 269 | We can visualize how these posterior probabilities update using the `plot_bandit_posterior` function. This function calculates and plots the 270 | posterior probability after each play. Here is an example to visualize the 271 | posterior probability of the two machines. 272 | 273 | ```{r} 274 | # Store play results for each Machine into `data` 275 | data <- data.frame(machine = c(rep(1, 20), rep(2, 20)), 276 | outcome = c("L", "W", "W", "W", "L", "L", "L", "W", "W", "L", 277 | "L", "W", "W", "W", "W", "L", "W", "L", "L", "L", 278 | "W", "L", "L", "W", "L", "L", "L", "W", "L", "W", 279 | "L", "L", "W", "L", "L", "L", "W", "W", "L", "W")) 280 | 281 | # Calculate posterior probabilities after each play and plot the result 282 | plot_bandit_posterior(data) 283 | ``` 284 | 285 |
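A sketch of one way to organize the sequential updates, assuming `data` is the data frame copied from the app's Data tab and holds at least 50 plays (the object names are illustrative):

```{r sequential-update-sketch, eval=FALSE}
# Update the posterior after each block of 10 plays, feeding it back in as the prior
post_10 <- bandit_posterior(data[1:10, ])
post_20 <- bandit_posterior(data[11:20, ], prior = post_10)
post_30 <- bandit_posterior(data[21:30, ], prior = post_20)
post_40 <- bandit_posterior(data[31:40, ], prior = post_30)
post_50 <- bandit_posterior(data[41:50, ], prior = post_40)
post_50
```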
Plot the result of your last 50 plays using the code `plot_bandit_posterior(data)` as shown in the example above, and describe the pattern you see in the two posterior probabilities.
288 | 289 | ```{r E6} 290 | # Type your code for Exercise 6 here 291 | ``` 292 | 293 |
From the plot generated in the example above, we can see that the posterior probabilities for Machine 1 and Machine 2 mirror each other. Why does this happen? 295 | 296 | * $P(M_1~|~\text{data})$ and $P(M_2~|~\text{data})$ are complementary 297 | * Machine 1 and Machine 2 being "good" are mutually exclusive events 298 | * Both of the above
300 | 301 |
302 | This work is licensed under [GNU General Public License v3.0](https://www.gnu.org/licenses/quick-guide-gplv3.html). 303 |
304 | 305 | -------------------------------------------------------------------------------- /1.3_probability/probability.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Probability" 3 | output: statsr:::statswithr_lab 4 | --- 5 | 6 |
7 | Complete all **Exercises**, and submit answers to **Questions** on the Coursera 8 | platform. 9 |
10 | 11 | ## Hot Hands 12 | 13 | Basketball players who make several baskets in succession are described as 14 | having a *hot hand*. Fans and players have long believed in the hot hand 15 | phenomenon, which refutes the assumption that each shot is independent of the 16 | next. However, [a 1985 paper](http://www.sciencedirect.com/science/article/pii/0010028585900106) by Gilovich, Vallone, and Tversky collected evidence 17 | that contradicted this belief and showed that successive shots are independent 18 | events. This paper started a great controversy that continues to this day, as you can 19 | see by Googling *hot hand basketball*. 20 | 21 | We do not expect to resolve this controversy today. However, in this lab we'll 22 | apply one approach to answering questions like this. The goals for this lab are 23 | to (1) think about the effects of independent and dependent events, (2) learn 24 | how to simulate shooting streaks in R, and (3) to compare a simulation to actual 25 | data in order to determine if the hot hand phenomenon appears to be real. 26 | 27 | ## Getting Started 28 | 29 | ### Load packages 30 | 31 | In this lab we will explore the data using the `dplyr` package and visualize it 32 | using the `ggplot2` package for data visualization. The data can be found in the 33 | companion package for this course, `statsr`. 34 | 35 | Let's load the packages. 36 | 37 | ```{r load-packages, message=FALSE} 38 | library(statsr) 39 | library(dplyr) 40 | library(ggplot2) 41 | ``` 42 | 43 | ### Data 44 | 45 | Our investigation will focus on the performance of one player: Kobe Bryant of 46 | the Los Angeles Lakers. His performance against the Orlando Magic in the 2009 47 | NBA finals earned him the title *Most Valuable Player* and many spectators 48 | commented on how he appeared to show a hot hand. Let's load some necessary files 49 | that we will need for this lab. 50 | 51 | ```{r load-data} 52 | data(kobe_basket) 53 | ``` 54 | 55 | This data frame contains 133 observations and 6 variables, where every 56 | row records a shot taken by Kobe Bryant. The `shot` variable in this dataset 57 | indicates whether the shot was a hit (`H`) or a miss (`M`). 58 | 59 | Just looking at the string of hits and misses, it can be difficult to gauge 60 | whether or not it seems like Kobe was shooting with a hot hand. One way we can 61 | approach this is by considering the belief that hot hand shooters tend to go on 62 | shooting streaks. For this lab, we define the length of a shooting streak to be 63 | the *number of consecutive baskets made until a miss occurs*. 64 | 65 | For example, in Game 1 Kobe had the following sequence of hits and misses from 66 | his nine shot attempts in the first quarter: 67 | 68 | \[ \textrm{H M | M | H H M | M | M | M} \] 69 | 70 | You can verify this by viewing the first 8 rows of the data in the data viewer. 71 | 72 | Within the nine shot attempts, there are six streaks, which are separated by a 73 | "|" above. Their lengths are one, zero, two, zero, zero, zero (in order of 74 | occurrence). 75 | 76 |
77 | Fill in the blank: A streak length of 1 means one \_\_\_ followed by one miss. 78 | 79 | * hit 80 | * miss 81 |
82 | 83 | 84 |
85 | Fill in the blank: A streak length of 0 means one \_\_\_ which must occur after a 86 | miss that ended the preceding streak. 87 | 88 | * hit 89 | * miss 90 |
91 | 92 | Counting streak lengths manually for all 133 shots would get tedious, so we'll 93 | use the custom function `calc_streak` to calculate them, and store the results 94 | in a data frame called `kobe_streak` as the `length` variable. 95 | 96 | ```{r calc-streak-kobe} 97 | kobe_streak <- calc_streak(kobe_basket$shot) 98 | ``` 99 | 100 | We can then take a look at the distribution of these streak lengths. 101 | 102 | ```{r plot-streak-kobe} 103 | ggplot(data = kobe_streak, aes(x = length)) + 104 | geom_histogram(binwidth = 1) 105 | ``` 106 | 107 |
108 | Which of the following is false about the distribution of Kobe's streak lengths 109 | from the 2009 NBA finals? 110 | 111 | * The distribution of Kobe's streaks is unimodal and right skewed. 112 | * The typical length of a streak is 0 since the median of the distribution is at 0. 113 | * The IQR of the distribution is 1. 114 | * The longest streak of baskets is of length 4. 115 | * The shortest streak is of length 1. 116 |
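If you would like to check the claims above numerically, the summary statistics of the streak lengths can be computed with the same `summarise` tools used in the earlier labs. The sketch below is optional and not part of the graded questions; it only assumes `kobe_streak` has been created as shown above.

```{r kobe-streak-summary}
# Optional check of the streak-length distribution (not a graded question)
kobe_streak %>%
  summarise(median_length = median(length),
            iqr = IQR(length),
            longest = max(length))
```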
117 | 118 | ## Compared to What? 119 | 120 | We've shown that Kobe had some long shooting streaks, but are they long enough 121 | to support the belief that he had hot hands? What can we compare them to? 122 | 123 | To answer these questions, let's return to the idea of *independence*. Two 124 | processes are independent if the outcome of one process doesn't affect the outcome 125 | of the second. If each shot that a player takes is an independent process, 126 | having made or missed your first shot will not affect the probability that you 127 | will make or miss your second shot. 128 | 129 | A shooter with a hot hand will have shots that are *not* independent of one 130 | another. Specifically, if the shooter makes his first shot, the hot hand model 131 | says he will have a *higher* probability of making his second shot. 132 | 133 | Let's suppose for a moment that the hot hand model is valid for Kobe. During his 134 | career, the percentage of time Kobe makes a basket (i.e. his shooting 135 | percentage) is about 45%, or in probability notation, 136 | 137 | \[ P(\textrm{shot 1 = H}) = 0.45 \] 138 | 139 | If he makes the first shot and has a hot hand (*not* independent shots), then 140 | the probability that he makes his second shot would go up to, let's say, 60%, 141 | 142 | \[ P(\textrm{shot 2 = H} \, | \, \textrm{shot 1 = H}) = 0.60 \] 143 | 144 | As a result of these increased probabilities, you'd expect Kobe to have longer 145 | streaks. Compare this to the skeptical perspective where Kobe does *not* have a 146 | hot hand, where each shot is independent of the next. If he hit his first shot, 147 | the probability that he makes the second is still 0.45. 148 | 149 | \[ P(\textrm{shot 2 = H} \, | \, \textrm{shot 1 = H}) = 0.45 \] 150 | 151 | In other words, making the first shot did nothing to affect the probability that 152 | he'd make his second shot. If Kobe's shots are independent, then he'd have the 153 | same probability of hitting every shot regardless of his past shots: 45%. 154 | 155 | Now that we've phrased the situation in terms of independent shots, let's return 156 | to the question: how do we tell if Kobe's shooting streaks are long enough to 157 | indicate that he has hot hands? We can compare his streak lengths to someone 158 | without hot hands: an independent shooter. 159 | 160 | ## Simulations in R 161 | 162 | While we don't have any data from a shooter we know to have independent shots, 163 | that sort of data is very easy to simulate in R. In a simulation, you set the 164 | ground rules of a random process and then the computer uses random numbers to 165 | generate an outcome that adheres to those rules. As a simple example, you can 166 | simulate flipping a fair coin with the following. 167 | 168 | ```{r head-tail} 169 | coin_outcomes <- c("heads", "tails") 170 | sample(coin_outcomes, size = 1, replace = TRUE) 171 | ``` 172 | 173 | The vector `coin_outcomes` can be thought of as a hat with two slips of paper in it: 174 | one slip says `heads` and the other says `tails`. The function `sample` draws 175 | one slip from the hat and tells us if it was a head or a tail. 176 | 177 | Run the second command listed above several times. Just like when flipping a 178 | coin, sometimes you'll get a heads, sometimes you'll get a tails, but in the 179 | long run, you'd expect to get roughly equal numbers of each.
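As a quick, optional side check (this is not one of the lab exercises), you can draw a much larger number of flips and look at the proportions, which should settle near 0.5 for each side. The seed below is arbitrary and only makes the random draws reproducible.

```{r long-run-fair-coin}
# Optional: a larger simulation to see the long-run behaviour of a fair coin.
# set.seed() makes the random draws reproducible; any seed value works.
set.seed(1234)
many_flips <- sample(coin_outcomes, size = 10000, replace = TRUE)
prop.table(table(many_flips))   # proportions should each be close to 0.5
```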
180 | 181 | If you wanted to simulate flipping a fair coin 100 times, you could either run 182 | the function 100 times or, more simply, adjust the `size` argument, which 183 | governs how many samples to draw (the `replace = TRUE` argument indicates we put 184 | the slip of paper back in the hat before drawing again). Save the resulting 185 | vector of heads and tails in a new object called `sim_fair_coin`. 186 | 187 | ```{r sim-fair-coin} 188 | sim_fair_coin <- sample(coin_outcomes, size = 100, replace = TRUE) 189 | ``` 190 | 191 | To view the results of this simulation, type the name of the object and then use 192 | `table` to count up the number of heads and tails. 193 | 194 | ```{r table-sim-fair-coin} 195 | sim_fair_coin 196 | table(sim_fair_coin) 197 | ``` 198 | 199 | Since there are only two elements in `coin_outcomes`, the probability that we "flip" 200 | a coin and it lands heads is 0.5. Say we're trying to simulate an unfair coin 201 | that we know only lands heads 20% of the time. We can adjust for this by adding 202 | an argument called `prob`, which provides a vector of two probability weights. 203 | 204 | ```{r sim-unfair-coin} 205 | sim_unfair_coin <- sample(coin_outcomes, size = 100, replace = TRUE, 206 | prob = c(0.2, 0.8)) 207 | ``` 208 | 209 | `prob = c(0.2, 0.8)` indicates that for the two elements in the `coin_outcomes` vector, 210 | we want to select the first one, `heads`, with probability 0.2 and the second 211 | one, `tails`, with probability 0.8. Another way of thinking about this is to 212 | think of the outcome space as a bag of 10 chips, where 2 chips are labeled 213 | "head" and 8 chips "tail". Therefore at each draw, the probability of drawing a 214 | chip that says "head" is 20%, and "tail" is 80%. 215 | 216 |
217 | In your simulation of flipping the unfair coin 100 times, how many flips came up heads? 218 |
219 | 220 | In a sense, we've shrunken the size of the slip of paper that says "heads", 221 | making it less likely to be drawn, and we've increased the size of the slip of 222 | paper saying "tails", making it more likely to be drawn. When we simulated the 223 | fair coin, both slips of paper were the same size. This happens by default if 224 | you don't provide a `prob` argument; all elements in the `coin_outcomes` vector have 225 | an equal probability of being drawn. 226 | 227 | If you want to learn more about `sample` or any other function, recall that you 228 | can always check out its help file with `?sample`. 229 | 230 | 231 | ## Simulating the Independent Shooter 232 | 233 | Simulating a basketball player who has independent shots uses the same mechanism 234 | that we use to simulate a coin flip. To simulate a single shot from an 235 | independent shooter with a shooting percentage of 50% we type, 236 | 237 | ```{r sim-basket} 238 | shot_outcomes <- c("H", "M") 239 | sim_basket <- sample(shot_outcomes, size = 1, replace = TRUE) 240 | ``` 241 | 242 | To make a valid comparison between Kobe and our simulated independent shooter, 243 | we need to align both their shooting percentage and the number of attempted shots. 244 | 245 | 246 |
247 | What change needs to be made to the `sample` function so that it reflects a shooting percentage of 45%? Make this adjustment, then run a simulation to sample 133 shots. Assign the output of this simulation to a new object called `sim_basket`. 248 |
249 | ```{r} 250 | # type your code for the Exercise here, and Knit 251 | 252 | ``` 253 | 254 | 255 | Note that we've named the new vector `sim_basket`, the same name that we gave to 256 | the previous vector reflecting a shooting percentage of 50%. In this situation, 257 | R overwrites the old object with the new one, so always make sure that you don't 258 | need the information in an old vector before reassigning its name. 259 | 260 | With the results of the simulation saved as `sim_basket`, we have the data 261 | necessary to compare Kobe to our independent shooter. 262 | 263 | Both data sets represent the results of 133 shot attempts, each with the same 264 | shooting percentage of 45%. We know that our simulated data is from a shooter 265 | that has independent shots. That is, we know the simulated shooter does not have 266 | a hot hand. 267 | 268 | ### Comparing Kobe Bryant to the Independent Shooter 269 | 270 |
271 | Using `calc_streak`, compute the streak lengths of `sim_basket`, and save the results in a data frame called `sim_streak`. Note that since the `sim_basket` object is just a vector and not a variable in a data frame, we don't need to first select it from a data frame like we did earlier when we calculated the streak lengths for Kobe's shots. 272 |
273 | ```{r sim-streak-lengths} 274 | # type your code for the Exercise here, and Knit 275 | 276 | ``` 277 | 278 |
279 | Make a plot of the distribution of simulated streak lengths of the independent shooter. What is the typical streak length for this simulated independent shooter with a 45% shooting percentage? How long is the player's longest streak of baskets in 133 shots? 280 |
281 | ```{r plot-sim-streaks} 282 | # type your code for the Exercise here, and Knit 283 | 284 | ``` 285 | 286 |
287 | If you were to run the simulation of the independent shooter a second time, how 288 | would you expect its streak distribution to compare to the distribution from the 289 | exercise above? 290 | * Exactly the same 291 | * Somewhat similar 292 | * Totally different 293 |
294 | 295 | 296 |
297 | How does Kobe Bryant's distribution of streak lengths compare to the distribution 298 | of streak lengths for the simulated shooter? Using this comparison, do you have 299 | evidence that the hot hand model fits Kobe's shooting patterns? 300 | 301 | * The distributions look very similar. Therefore, there doesn't appear to be evidence for Kobe Bryant's hot hand. 302 | * The distributions look very similar. Therefore, there appears to be evidence for Kobe Bryant's hot hand. 303 | * The distributions look very different. Therefore, there doesn't appear to be evidence for Kobe Bryant's hot hand. 304 | * The distributions look very different. Therefore, there appears to be evidence for Kobe Bryant's hot hand. 305 |
306 | 307 |
308 | What concepts from the course videos are covered in this lab? What 309 | concepts, if any, are not covered in the videos? Have you seen these concepts 310 | elsewhere, e.g. textbook, previous labs, or practice problems? 311 |
312 | 313 |
314 | This is a derivative of an [OpenIntro](https://www.openintro.org/stat/labs.php) lab, and is released under an [Attribution-NonCommercial-ShareAlike 3.0 United States](https://creativecommons.org/licenses/by-nc-sa/3.0/us/) license. 315 |
-------------------------------------------------------------------------------- /2.1_sampling_distributions/sampling_distributions.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Foundations for inference - Sampling distributions" 3 | output: statsr:::statswithr_lab 4 | runtime: shiny 5 | --- 6 | 7 |
8 | Complete all **Exercises**, and submit answers to **Questions** on the Coursera 9 | platform. 10 |
11 | 12 | ## Getting Started 13 | 14 | ### Load packages 15 | 16 | In this lab we will explore the data using the `dplyr` package and visualize it 17 | using the `ggplot2` package for data visualization. The data can be found in the 18 | companion package for this course, `statsr`. 19 | 20 | Let's load the packages. 21 | 22 | ```{r load-packages, message=FALSE} 23 | library(statsr) 24 | library(dplyr) 25 | library(ggplot2) 26 | ``` 27 | 28 | ### The data 29 | 30 | We consider real estate data from the city of Ames, Iowa. The details of 31 | every real estate transaction in Ames is recorded by the City Assessor's 32 | office. Our particular focus for this lab will be all residential home sales 33 | in Ames between 2006 and 2010. This collection represents our population of 34 | interest. In this lab we would like to learn about these home sales by taking 35 | smaller samples from the full population. Let's load the data. 36 | 37 | ```{r load-data} 38 | data(ames) 39 | ``` 40 | 41 | We see that there are quite a few variables in the data set, enough to do a 42 | very in-depth analysis. For this lab, we'll restrict our attention to just 43 | two of the variables: the above ground living area of the house in square feet 44 | (`area`) and the sale price (`price`). 45 | 46 | We can explore the distribution of areas of homes in the population of home 47 | sales visually and with summary statistics. Let's first create a visualization, 48 | a histogram: 49 | 50 | ```{r area-hist} 51 | ggplot(data = ames, aes(x = area)) + 52 | geom_histogram(binwidth = 250) 53 | ``` 54 | 55 | Let's also obtain some summary statistics. Note that we can do this using the 56 | `summarise` function. We can calculate as many statistics as we want using this 57 | function, and just string along the results. Some of the functions below should 58 | be self explanatory (like `mean`, `median`, `sd`, `IQR`, `min`, and `max`). A 59 | new function here is the `quantile` function which we can use to calculate 60 | values corresponding to specific percentile cutoffs in the distribution. For 61 | example `quantile(x, 0.25)` will yield the cutoff value for the 25th percentile (Q1) 62 | in the distribution of x. Finding these values are useful for describing the 63 | distribution, as we can use them for descriptions like *"the middle 50% of the 64 | homes have areas between such and such square feet"*. 65 | 66 | ```{r area-stats} 67 | ames %>% 68 | summarise(mu = mean(area), pop_med = median(area), 69 | sigma = sd(area), pop_iqr = IQR(area), 70 | pop_min = min(area), pop_max = max(area), 71 | pop_q1 = quantile(area, 0.25), # first quartile, 25th percentile 72 | pop_q3 = quantile(area, 0.75)) # third quartile, 75th percentile 73 | ``` 74 | 75 |
76 | Which of the following is **false**? 77 | 78 | * The distribution of areas of houses in Ames is unimodal and right-skewed. 79 | * 50\% of houses in Ames are smaller than 1,499.69 square feet. 80 | * The middle 50\% of the houses range between approximately 1,126 square feet and 1,742.7 square feet. 81 | * The IQR is approximately 616.7 square feet. 82 | * The smallest house is 334 square feet and the largest is 5,642 square feet. 83 |
84 | 85 | 86 | ## The unknown sampling distribution 87 | 88 | In this lab we have access to the entire population, but this is rarely the 89 | case in real life. Gathering information on an entire population is often 90 | extremely costly or impossible. Because of this, we often take a sample of 91 | the population and use that to understand the properties of the population. 92 | 93 | If we were interested in estimating the mean living area in Ames based on a 94 | sample, we can use the following command to survey the population. 95 | 96 | ```{r samp1} 97 | samp1 <- ames %>% 98 | sample_n(size = 50) 99 | ``` 100 | 101 | This command collects a simple random sample of `size` 50 from the `ames` dataset, 102 | which is assigned to `samp1`. This is like going into the City 103 | Assessor's database and pulling up the files on 50 random home sales. Working 104 | with these 50 files would be considerably simpler than working with all 2930 105 | home sales. 106 | 107 |
108 | Describe the distribution of this sample. How does it compare to the distribution of the population? **Hint:** The `sample_n` function takes a random sample of observations (i.e. rows) from the dataset; you can still refer to the variables in the dataset by the same names. Code you used in the previous exercise will also be helpful for visualizing and summarizing the sample, however be careful not to label the values `mu` and `sigma` anymore since these are sample statistics, not population parameters. You can customize the labels of any of the statistics to indicate that these come from the sample. 109 |
110 | ```{r samp1-dist} 111 | # type your code for the Exercise here, and Run Document 112 | 113 | ``` 114 | 115 | 116 | If we're interested in estimating the average living area in homes in Ames 117 | using the sample, our best single guess is the sample mean. 118 | 119 | ```{r mean-samp1} 120 | samp1 %>% 121 | summarise(x_bar = mean(area)) 122 | ``` 123 | 124 | Depending on which 50 homes you selected, your estimate could be a bit above 125 | or a bit below the true population mean of 1,499.69 square feet. In general, 126 | though, the sample mean turns out to be a pretty good estimate of the average 127 | living area, and we were able to get it by sampling less than 3\% of the 128 | population. 129 | 130 |
131 | Suppose we took two more samples, one of size 100 and one of size 1000. Which would you think would provide a more accurate estimate of the population mean? 132 | 133 | * Sample size of 50. 134 | * Sample size of 100. 135 | * Sample size of 1000. 136 |
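As an optional illustration of this idea (not one of the graded questions), you can draw samples of a few different sizes and compare their means to the population mean; the larger samples will typically, though not always, land closer. The seed below is arbitrary and only makes the draws reproducible.

```{r sample-size-illustration}
# Optional illustration: sample means from different sample sizes versus the
# population mean. Larger samples typically (but not always) land closer.
set.seed(42)
mean(sample_n(ames, size = 50)$area)
mean(sample_n(ames, size = 1000)$area)
mean(ames$area)    # the population mean, for reference
```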
137 | 138 | Let's take one more sample of size 50, and view the mean area in this sample: 139 | ```{r mean-samp2} 140 | ames %>% 141 | sample_n(size = 50) %>% 142 | summarise(x_bar = mean(area)) 143 | ``` 144 | 145 | Not surprisingly, every time we take another random sample, we get a different 146 | sample mean. It's useful to get a sense of just how much variability we 147 | should expect when estimating the population mean this way. The distribution 148 | of sample means, called the *sampling distribution*, can help us understand 149 | this variability. In this lab, because we have access to the population, we 150 | can build up the sampling distribution for the sample mean by repeating the 151 | above steps many times. Here we will generate 15,000 samples and compute the 152 | sample mean of each. Note that we are sampling with replacement, 153 | `replace = TRUE` since sampling distributions are constructed with sampling 154 | with replacement. 155 | 156 | ```{r loop} 157 | sample_means50 <- ames %>% 158 | rep_sample_n(size = 50, reps = 15000, replace = TRUE) %>% 159 | summarise(x_bar = mean(area)) 160 | 161 | ggplot(data = sample_means50, aes(x = x_bar)) + 162 | geom_histogram(binwidth = 20) 163 | ``` 164 | 165 | Here we use R to take 15,000 samples of size 50 from the population, calculate 166 | the mean of each sample, and store each result in a vector called 167 | `sample_means50`. Next, we review how this set of code works. 168 | 169 |
170 | How many elements are there in `sample_means50`? Describe the sampling distribution, and be sure to specifically note its center. Make sure to include a plot of the distribution in your answer. 171 |
172 | ```{r sampling-dist} 173 | # type your code for the Exercise here, and Run Document 174 | 175 | ``` 176 | 177 | ## Interlude: Sampling distributions 178 | 179 | The idea behind the `rep_sample_n` function is *repetition*. Earlier we took 180 | a single sample of size `n` (50) from the population of all houses in Ames. With 181 | this new function we are able to repeat this sampling procedure `rep` times in order 182 | to build a distribution of a series of sample statistics, which is called the 183 | **sampling distribution**. 184 | 185 | Note that in practice one rarely gets to build sampling distributions, 186 | because we rarely have access to data from the entire population. 187 | 188 | Without the `rep_sample_n` function, this would be painful. We would have to 189 | manually run the following code 15,000 times 190 | ```{r sample-code, eval=FALSE} 191 | ames %>% 192 | sample_n(size = 50) %>% 193 | summarise(x_bar = mean(area)) 194 | ``` 195 | as well as store the resulting sample means each time in a separate vector. 196 | 197 | Note that for each of the 15,000 times we computed a mean, we did so from a 198 | **different** sample! 199 | 200 |
201 | To make sure you understand how sampling distributions are built, and exactly what the `rep_sample_n` function does, try modifying the code to create a sampling distribution of **25 sample means** from **samples of size 10**, and put them in a data frame named `sample_means_small`. Print the output. How many observations are there in this object called `sample_means_small`? What does each observation represent? 202 |
203 | ```{r practice-sampling-dist} 204 | # type your code for the Exercise here, and Run Document 205 | 206 | ``` 207 | 208 |
How many elements are there in this object called `sample_means_small`? 209 | 210 | * 0 211 | * 3 212 | * 25 213 | * 100 214 | * 5,000 215 |
216 | ```{r sample-means-small} 217 | # type your code for Question 3 here, and Run Document 218 | 219 | ``` 220 | 221 |
222 | Which of the following is **true** about the elements in the sampling distributions you created? 223 | 224 | * Each element represents a mean square footage from a simple random sample of 10 houses. 225 | * Each element represents the square footage of a house. 226 | * Each element represents the true population mean of square footage of houses. 227 |
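To connect this back to the simulation, the center and spread of the sampling distribution can be summarized directly. The short optional check below assumes the `sample_means50` object created earlier is still in your environment; its mean should be very close to the population mean of `area`, and its standard deviation is the standard error of the sample mean for samples of size 50.

```{r sampling-dist-center-spread}
# Optional check: center and spread of the simulated sampling distribution
sample_means50 %>%
  summarise(center = mean(x_bar), spread = sd(x_bar))
```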
228 | 229 | 230 | ## Sample size and the sampling distribution 231 | 232 | Mechanics aside, let's return to the reason we used the `rep_sample_n` function: to 233 | compute a sampling distribution, specifically, this one. 234 | 235 | ```{r hist} 236 | ggplot(data = sample_means50, aes(x = x_bar)) + 237 | geom_histogram(binwidth = 20) 238 | ``` 239 | 240 | The sampling distribution that we computed tells us much about estimating 241 | the average living area in homes in Ames. Because the sample mean is an 242 | unbiased estimator, the sampling distribution is centered at the true average 243 | living area of the population, and the spread of the distribution 244 | indicates how much variability is induced by sampling only 50 home sales. 245 | 246 | In the remainder of this section we will work on getting a sense of the effect that 247 | sample size has on our sampling distribution. 248 | 249 |
250 | Use the app below to create sampling distributions of means of `area`s from samples of size 10, 50, and 100. Use 5,000 simulations. What does each observation in the sampling distribution represent? How does the mean, standard error, and shape of the sampling distribution change as the sample size increases? How (if at all) do these values change if you increase the number of simulations? 251 |
252 | 253 | ```{r shiny, echo=FALSE, error=TRUE} 254 | ames_sampling_dist() 255 | ``` 256 | 257 |
258 | It makes intuitive sense that as the sample size increases, the center of the sampling distribution becomes a more reliable estimate for the true population mean. Also as the sample size increases, the variability of the sampling distribution ________. 259 | 260 | * decreases 261 | * increases 262 | * stays the same 263 |
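To see this shrinking variability numerically (an optional aside, not a graded question), recall that the standard error of the sample mean is the population standard deviation divided by the square root of the sample size, so it decreases as $n$ grows:

```{r se-shrinks}
# Standard error of the mean of area for samples of size 10, 50, and 100
sd(ames$area) / sqrt(c(10, 50, 100))
```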
264 | 265 |
266 | Take a random sample of size 50 from `price`. Using this sample, what is your best point estimate of the population mean? 267 |
268 | ```{r price-sample} 269 | # type your code for this Exercise here, and Run Document 270 | 271 | ``` 272 | 273 |
274 | Since you have access to the population, simulate the sampling distribution for $\bar{x}_{price}$ by taking 5000 samples of size 50 from the population and computing 5000 sample means. Store these means in a vector called `sample_means50`. Plot the data, then describe the shape of this sampling distribution. Based on this sampling distribution, what would you guess the mean home price of the population to be? 275 |
276 | ```{r price-sampling} 277 | # type your code for this Exercise here, and Run Document 278 | 279 | ``` 280 | 281 |
282 | Change your sample size from 50 to 150, then compute the sampling distribution using the same method as above, and store these means in a new vector called `sample_means150`. Describe the shape of this sampling distribution, and compare it to the sampling distribution for a sample size of 50. Based on this sampling distribution, what would you guess to be the mean sale price of homes in Ames? 283 |
284 | ```{r price-sampling-more} 285 | # type your code for this Exercise here, and Run Document 286 | 287 | ``` 288 | 289 | * * * 290 | 291 | So far, we have only focused on estimating the mean living area in homes in 292 | Ames. Now you'll try to estimate the mean home price. 293 | 294 | Note that while you might be able to answer some of these questions using the app 295 | you are expected to write the required code and produce the necessary plots and 296 | summary statistics. You are welcomed to use the app for exploration. 297 | 298 |
299 | Take a sample of size 15 from the population and calculate the mean `price` of the homes in this sample. Using this sample, what is your best point estimate of the population mean of prices of homes? 300 |
301 | ```{r price-sample-small} 302 | # type your code for this Exercise here, and Run Document 303 | 304 | ``` 305 | 306 |
307 | Since you have access to the population, simulate the sampling distribution for $\bar{x}_{price}$ by taking 2000 samples of size 15 from the population and computing 2000 sample means. Store these means in a vector called `sample_means15`. Plot the data, then describe the shape of this sampling distribution. Based on this sampling distribution, what would you guess the mean home price of the population to be? Finally, calculate and report the population mean. 308 |
309 | ```{r price-sampling-small} 310 | # type your code for this Exercise here, and Run Document 311 | 312 | ``` 313 | 314 |
315 | Change your sample size from 15 to 150, then compute the sampling distribution using the same method as above, and store these means in a new vector called `sample_means150`. Describe the shape of this sampling distribution, and compare it to the sampling distribution for a sample size of 15. Based on this sampling distribution, what would you guess to be the mean sale price of homes in Ames? 316 |
317 | ```{r price-sampling-big} 318 | # type your code for this Exercise here, and Run Document 319 | 320 | ``` 321 | 322 |
323 | Which of the following is false? 324 | 325 | * The variability of the sampling distribution with the smaller sample size (`sample_means50`) is smaller than the variability of the sampling distribution with the larger sample size (`sample_means150`). 326 | * The means for the two sampling distributions are roughly similar. 327 | * Both sampling distributions are symmetric. 328 |
329 | ```{r price-sampling-compare} 330 | # type your code for Question 6 here, and Run Document 331 | 332 | ``` 333 | 334 |
335 | This is a derivative of an [OpenIntro](https://www.openintro.org/stat/labs.php) lab, and is released under an [Attribution-NonCommercial-ShareAlike 3.0 United States](https://creativecommons.org/licenses/by-nc-sa/3.0/us/) license. 336 |
-------------------------------------------------------------------------------- /3.1_simple_regression/simple_regression.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Introduction to linear regression" 3 | output: statsr:::statswithr_lab 4 | --- 5 | 6 |
7 | Complete all **Exercises**, and submit answers to **Questions** on the Coursera 8 | platform. 9 |
10 | 11 | ## Batter up 12 | 13 | The movie [Moneyball](http://en.wikipedia.org/wiki/Moneyball_(film)) focuses on 14 | the "quest for the secret of success in baseball". It follows a low-budget team, 15 | the Oakland Athletics, who believed that underused statistics, such as a player's 16 | ability to get on base, better predict the ability to score runs than typical 17 | statistics like home runs, RBIs (runs batted in), and batting average. Obtaining 18 | players who excelled in these underused statistics turned out to be much more 19 | affordable for the team. 20 | 21 | In this lab we'll be looking at data from all 30 Major League Baseball teams and 22 | examining the linear relationship between runs scored in a season and a number 23 | of other player statistics. Our aim will be to summarize these relationships 24 | both graphically and numerically in order to find which variable, if any, helps 25 | us best predict a team's runs scored in a season. 26 | 27 | ## Getting Started 28 | 29 | ### Load packages 30 | 31 | In this lab we will explore the data using the `dplyr` package and visualize it 32 | using the `ggplot2` package for data visualization. The data can be found in the 33 | companion package for this course, `statsr`. 34 | 35 | Let's load the packages. 36 | 37 | ```{r load-packages, message=FALSE} 38 | library(statsr) 39 | library(dplyr) 40 | library(ggplot2) 41 | ``` 42 | 43 | ### The data 44 | 45 | Let's load up the data for the 2011 season. 46 | 47 | ```{r load-data} 48 | data(mlb11) 49 | ``` 50 | 51 | In addition to runs scored, there are seven traditionally used variables in the 52 | data set: at-bats, hits, home runs, batting average, strikeouts, stolen bases, 53 | and wins. There are also three newer variables: on-base percentage, slugging 54 | percentage, and on-base plus slugging. For the first portion of the analysis 55 | we'll consider the seven traditional variables. At the end of the lab, you'll 56 | work with the three newer variables on your own. 57 | 58 |
59 | What type of plot would you use to display the relationship between `runs` and one of the other numerical variables? 60 | 61 | * histogram 62 | * box plot 63 | * scatterplot 64 | * bar plot 65 |
66 | 67 |
68 | Plot the relationship between `runs` and `at_bats`, using `at_bats` as the explanatory variable.
The relationship appears to be ... 69 | 70 | * linear 71 | * negative 72 | * horseshoe-shaped ($\cap$) 73 | * u-shaped ($\cup$) 74 |
75 | ```{r runs-vs-at_bats} 76 | # type your code for Question 2 here, and Knit 77 | 78 | ``` 79 | 80 |
81 | If you knew a team's `at_bats`, would you be comfortable using 82 | a linear model to predict their number of runs? 83 |
84 | 85 | If the relationship looks linear, we can quantify the strength of the 86 | relationship with the correlation coefficient. 87 | 88 | ```{r cor} 89 | mlb11 %>% 90 | summarise(cor(runs, at_bats)) 91 | ``` 92 | 93 | ## Sum of squared residuals 94 | 95 |
96 | In this section you will use an interactive function to investigate what we mean by "sum 97 | of squared residuals". You will need to run this function in your console, not in your 98 | markdown document. Running the function also requires that the `mlb11` dataset is loaded 99 | in your environment. 100 |
101 | 102 | Think back to the way that we described the distribution of a single variable. 103 | Recall that we discussed characteristics such as center, spread, and shape. It's 104 | also useful to be able to describe the relationship of two numerical variables, 105 | such as `runs` and `at_bats` above. 106 | 107 |
108 | Looking at your plot from the previous exercise, which of the following best describe the relationship between these two variables? 109 | 110 | * The relationship is negative, linear, and moderately strong. One of the potential outliers is a team with approximately 5520 at bats. 111 | * The relationship is positive, linear, and moderately strong. One of the potential outliers is a team with approximately 5520 at bats. 112 | * The relationship is positive, linear, and very weak. There are no outliers. 113 | * The relationship is positive, linear, and very weak. One of the potential outliers is a team with approximately 5520 at bats. 114 |
115 | 116 | Just as we used the mean and standard deviation to summarize a single variable, 117 | we can summarize the relationship between these two variables by finding the 118 | line that best follows their association. Use the following interactive 119 | function to select the line that you think does the best job of going through 120 | the cloud of points. 121 | 122 | ```{r plotss-atbats-runs, eval=FALSE} 123 | plot_ss(x = at_bats, y = runs, data = mlb11) 124 | ``` 125 | 126 | After running this command, you'll be prompted to click two points on the plot 127 | to define a line. Once you've done that, the line you specified will be shown in 128 | black and the residuals in blue. Note that there are 30 residuals, one for each 129 | of the 30 observations. Recall that the residuals are the difference between the 130 | observed values and the values predicted by the line: 131 | 132 | \[ 133 | e_i = y_i - \hat{y}_i 134 | \] 135 | 136 | The most common way to do linear regression is to select the line that minimizes 137 | the sum of squared residuals. To visualize the squared residuals, you can rerun 138 | the plot command and add the argument `showSquares = TRUE`. 139 | 140 | ```{r plotss-atbats-runs-squares, eval=FALSE} 141 | plot_ss(x = at_bats, y = runs, data = mlb11, showSquares = TRUE) 142 | ``` 143 | 144 | Note that the output from the `plot_ss` function provides you with the slope and 145 | intercept of your line as well as the sum of squares. 146 | 147 |
148 | Using `plot_ss`, choose a line that does a good job of minimizing 149 | the sum of squares. Run the function several times. Report your smallest sum 150 | of squares. 151 |
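As a side note before moving on: the best-fitting line does not actually have to be found by trial and error. For simple linear regression the least-squares slope and intercept have closed-form expressions in terms of the correlation, standard deviations, and means of the two variables. The optional sketch below computes them directly; it should agree, up to rounding, with the `lm` output introduced in the next section.

```{r least-squares-by-hand}
# Least-squares estimates from summary statistics:
#   slope     = r * sd(y) / sd(x)
#   intercept = mean(y) - slope * mean(x)
mlb11 %>%
  summarise(slope = cor(runs, at_bats) * sd(runs) / sd(at_bats),
            intercept = mean(runs) - slope * mean(at_bats))
```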
152 | 153 | ## The linear model 154 | 155 | It is rather cumbersome to try to get the correct least squares line, i.e. the 156 | line that minimizes the sum of squared residuals, through trial and error. 157 | Instead we can use the `lm` function in R to fit the linear model (a.k.a. 158 | regression line). 159 | 160 | ```{r m1} 161 | m1 <- lm(runs ~ at_bats, data = mlb11) 162 | ``` 163 | 164 | The first argument in the function `lm` is a formula that takes the form 165 | `y ~ x`. Here it can be read that we want to make a linear model of `runs` as a 166 | function of `at_bats`. The second argument specifies that R should look in the 167 | `mlb11` data frame to find the `runs` and `at_bats` variables. 168 | 169 | The output of `lm` is an object that contains all of the information we need 170 | about the linear model that was just fit. We can access this information using 171 | the summary function. 172 | 173 | ```{r summary-m1} 174 | summary(m1) 175 | ``` 176 | 177 | Let's consider this output piece by piece. First, the formula used to describe 178 | the model is shown at the top. After the formula you find the five-number 179 | summary of the residuals. The "Coefficients" table shown next is key; its first 180 | column displays the linear model's y-intercept and the coefficient of `at_bats`. 181 | With this table, we can write down the least squares regression line for the 182 | linear model: 183 | 184 | \[ 185 | \hat{y} = -2789.2429 + 0.6305 \times at\_bats 186 | \] 187 | 188 | One last piece of information we will discuss from the summary output is the 189 | Multiple R-squared, or more simply, $R^2$. The $R^2$ value represents the 190 | proportion of variability in the response variable that is explained by the 191 | explanatory variable. For this model, 37.3% of the variability in runs is 192 | explained by at-bats. 193 | 194 |
195 | Fit a new model that uses `homeruns` to predict `runs`. Using the estimates from the R output, write the equation of the regression line. What does the slope tell us in the context of the relationship between success of a team and its home runs? 196 | 197 | * For each additional home run, the model predicts 1.83 more runs, on average. 198 | * Each additional home run increases runs by 1.83. 199 | * For each additional home run, the model predicts 1.83 fewer runs, on average. 200 | * For each additional home run, the model predicts 415.24 more runs, on average. 201 | * For each additional home run, the model predicts 415.24 fewer runs, on average. 202 |
203 | ```{r homeruns-vs-runs} 204 | # type your code for Question 4 here, and Knit 205 | 206 | ``` 207 | 208 | ## Prediction and prediction errors 209 | 210 | Let's create a scatterplot with the least squares line for `m1` laid on top. 211 | 212 | ```{r reg-with-line} 213 | ggplot(data = mlb11, aes(x = at_bats, y = runs)) + 214 | geom_point() + 215 | stat_smooth(method = "lm", se = FALSE) 216 | ``` 217 | 218 | Here we are literally adding a layer on top of our plot. `stat_smooth` creates 219 | the line by fitting a linear model. It can also show us the standard error `se` 220 | associated with our line, but we'll suppress that for now. 221 | 222 | This line can be used to predict $y$ at any value of $x$. When 223 | predictions are made for values of $x$ that are beyond the range of the observed 224 | data, it is referred to as *extrapolation* and is not usually recommended. 225 | However, predictions made within the range of the data are more reliable. 226 | They're also used to compute the residuals. 227 | 228 |
229 | If a team manager saw the least squares regression line and 230 | not the actual data, how many runs would he or she predict for a team with 231 | 5,579 at-bats? Is this an overestimate or an underestimate, and by how much? 232 |
233 | 234 | To find the observed number of runs for the team with 5,579 at-bats you can use the following: 235 | ```{r eval=FALSE} 236 | mlb11 %>% 237 | filter(at_bats == 5579) %>% 238 | select(runs) 239 | ``` 240 | This code first filters for the observation (row) whose `at_bats` is 5579, and then shows the 241 | value of the `runs` variable for that observation. 242 |
244 | What is the residual for the prediction of runs for a team with 5,579 at-bats? Choose the closest answer. 245 | 246 | * -15.32 247 | * 15.32 248 | * 713 249 | * 5579 250 |
251 | ```{r residual} 252 | # type your code for Question 5 here, and Knit 253 | 254 | ``` 255 | 256 | ## Model diagnostics 257 | 258 | To assess whether the linear model is reliable, we need to check for (1) 259 | linearity, (2) nearly normal residuals, and (3) constant variability. 260 | 261 | **Linearity**: You already checked if the relationship between runs and at-bats 262 | is linear using a scatterplot. We should also verify this condition with a plot 263 | of the residuals vs. fitted (predicted) values. 264 | 265 | ```{r residuals} 266 | ggplot(data = m1, aes(x = .fitted, y = .resid)) + 267 | geom_point() + 268 | geom_hline(yintercept = 0, linetype = "dashed") + 269 | xlab("Fitted values") + 270 | ylab("Residuals") 271 | ``` 272 | 273 | Notice here that our model object `m1` can also serve as a data set because stored within it are the fitted values ($\hat{y}$) and the residuals. Also note that we're getting fancy with the code here. After creating the scatterplot on the first layer (first line of code), we overlay a horizontal dashed line at $y = 0$ (to help us check whether residuals are distributed around 0), and we also adjust the axis labels to be more informative. 274 | 275 |
276 | Which of the following statements about the residual plot is false? 277 | 278 | * The residuals appear to be randomly distributed around 0. 279 | * The residuals show a curved pattern. 280 | * The plot is indicative of a linear relationship between runs and at-bats. 281 | * The team with a very high residual compared to the others appears to be an outlier. 282 |
283 | 284 | **Nearly normal residuals**: To check this condition, we can look at a histogram 285 | 286 | ```{r hist-res} 287 | ggplot(data = m1, aes(x = .resid)) + 288 | geom_histogram(binwidth = 25) + 289 | xlab("Residuals") 290 | ``` 291 | 292 | or a normal probability plot of the residuals. 293 | 294 | ```{r qq-res} 295 | ggplot(data = m1, aes(sample = .resid)) + 296 | stat_qq() 297 | ``` 298 | 299 | Note that the syntax for making a normal probability plot is a bit different than what you're used to seeing: we set `sample` equal to the residuals instead of `x`, and we set a statistical method `qq`, which stands for "quantile-quantile", another name commonly used for normal probability plots. 300 | 301 |
Which of the following is true? 302 | 303 | * The residuals are extremely right skewed, hence the normal distribution of residuals condition is not met. 304 | * The residuals are extremely left skewed, hence the normal distribution of residuals condition is not met. 305 | * The residuals are perfectly symmetric, hence the normal distribution of residuals condition is met. 306 | * The residuals are fairly symmetric, with only a slightly longer tail on the right, hence it would be appropriate to deem the normal distribution of residuals condition met. 307 |
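An optional aside on reading normal probability plots: if your installed version of `ggplot2` provides `stat_qq_line()` (version 3.0.0 or later), overlaying a reference line can make departures from normality easier to judge; points that fall close to the line are consistent with nearly normal residuals.

```{r qq-res-line}
# Optional: add a reference line to the normal probability plot
# (requires ggplot2 >= 3.0.0 for stat_qq_line)
ggplot(data = m1, aes(sample = .resid)) +
  stat_qq() +
  stat_qq_line()
```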
308 | 309 | **Constant variability**: 310 | 311 |
Based on the residuals plot from earlier, the constant variability condition appears to be met. 312 | 313 | * True 314 | * False 315 |
316 | 317 |
318 | Choose another one of the seven traditional variables from 319 | `mlb11` besides `at_bats` that you think might be a good predictor of `runs`. 320 | Produce a scatterplot of the two variables and fit a linear model. At a glance, 321 | does there seem to be a linear relationship? 322 |
323 | ```{r runs-vs-sth-else} 324 | # type your code for the Exercise here, and Knit 325 | 326 | ``` 327 | 328 |
329 | How does this relationship compare to the relationship between 330 | `runs` and `at_bats`? Use the R$^2$ values from the two model summaries to 331 | compare. Does your variable seem to predict `runs` better than `at_bats`? How 332 | can you tell? 333 |
334 | ```{r pick-model} 335 | # type your code for the Exercise here, and Knit 336 | 337 | ``` 338 | 339 |
340 | Now that you can summarize the linear relationship between two variables, investigate the relationships between `runs` and each of the other five traditional variables. Which variable best predicts `runs`? Support your conclusion using the graphical and numerical methods we've discussed. 341 | 342 | * at bats 343 | * hits 344 | * wins 345 | * batting average 346 |
347 | ```{r runs-vs-others} 348 | # type your code for Question 9 here, and Knit 349 | 350 | ``` 351 | 352 |
353 | Now examine the three newer variables. These are the statistics used by the author of *Moneyball* to predict a team's success. In general, are they more or less effective at predicting runs than the old variables? Explain using appropriate graphical and numerical evidence. Of all ten variables we've analyzed, which seems to be the best predictor of `runs`? 354 | 355 | * on-base plus slugging (`new_obs`) 356 | * slugging percentage (`new_slug`) 357 | * on-base percentage (`new_onbase`) 358 |
359 | ```{r runs-vs-new-vars} 360 | # type your code for Question 10 here, and Knit 361 | 362 | ``` 363 | 364 |
365 | Check the model diagnostics for the regression model with the 366 | variable you decided was the best predictor for runs. 367 |
368 | ```{r diag} 369 | # type your code for the Exercise here, and Knit 370 | 371 | ``` 372 | 373 |
374 | This is a product of OpenIntro that is released under a [Creative Commons 375 | Attribution-ShareAlike 3.0 Unported](http://creativecommons.org/licenses/by-sa/3.0). 376 | This lab was adapted for OpenIntro by Andrew Bray and Mine Çetinkaya-Rundel 377 | from a lab written by the faculty and TAs of UCLA Statistics. 378 |
-------------------------------------------------------------------------------- /1.2_intro_to_data/intro_to_data.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Introduction to data" 3 | output: statsr:::statswithr_lab 4 | --- 5 | 6 |
7 | Complete all **Exercises**, and submit answers to **Questions** on the Coursera 8 | platform. 9 |
10 | 11 | ## Introduction 12 | 13 | Some define statistics as the field that focuses on turning information into 14 | knowledge. The first step in that process is to summarize and describe the raw 15 | information - the data. In this lab we explore flights, specifically a random 16 | sample of domestic flights that departed from the three major 17 | New York City airport in 2013. We will generate simple graphical and numerical 18 | summaries of data on these flights and explore delay times. As this is a large 19 | data set, along the way you'll also learn the indispensable skills of data 20 | processing and subsetting. 21 | 22 | ## Getting started 23 | 24 | ### Load packages 25 | 26 | In this lab we will explore the data using the `dplyr` package and visualize it 27 | using the `ggplot2` package for data visualization. The data can be found in the 28 | companion package for this course, `statsr`. 29 | 30 | Let's load the packages. 31 | 32 | ```{r load-packages, message=FALSE} 33 | library(statsr) 34 | library(dplyr) 35 | library(ggplot2) 36 | ``` 37 | 38 | ### Data 39 | 40 | The [Bureau of Transportation Statistics](http://www.rita.dot.gov/bts/about/) 41 | (BTS) is a statistical agency that is a part of the Research and Innovative 42 | Technology Administration (RITA). As its name implies, BTS collects and makes 43 | available transportation data, such as the flights data we will be working with 44 | in this lab. 45 | 46 | We begin by loading the `nycflights` data frame. Type the following in your console 47 | to load the data: 48 | 49 | ```{r load-data} 50 | data(nycflights) 51 | ``` 52 | 53 | The data frame containing `r nrow(nycflights)` flights that shows up in your 54 | workspace is a *data matrix*, with each row representing an *observation* and each 55 | column representing a *variable*. R calls this data format a **data frame**, which is 56 | a term that will be used throughout the labs. 57 | 58 | To view the names of the variables, type the command 59 | 60 | ```{r names} 61 | names(nycflights) 62 | ``` 63 | 64 | This returns the names of the variables in this data frame. The **codebook** 65 | (description of the variables) is included below. This information can also be 66 | found in the help file for the data frame which can be accessed by typing 67 | `?nycflights` in the console. 68 | 69 | - `year`, `month`, `day`: Date of departure 70 | - `dep_time`, `arr_time`: Departure and arrival times, local timezone. 71 | - `dep_delay`, `arr_delay`: Departure and arrival delays, in minutes. Negative times represent early departures/arrivals. 72 | - `carrier`: Two letter carrier abbreviation. 73 | + `9E`: Endeavor Air Inc. 74 | + `AA`: American Airlines Inc. 75 | + `AS`: Alaska Airlines Inc. 76 | + `B6`: JetBlue Airways 77 | + `DL`: Delta Air Lines Inc. 78 | + `EV`: ExpressJet Airlines Inc. 79 | + `F9`: Frontier Airlines Inc. 80 | + `FL`: AirTran Airways Corporation 81 | + `HA`: Hawaiian Airlines Inc. 82 | + `MQ`: Envoy Air 83 | + `OO`: SkyWest Airlines Inc. 84 | + `UA`: United Air Lines Inc. 85 | + `US`: US Airways Inc. 86 | + `VX`: Virgin America 87 | + `WN`: Southwest Airlines Co. 88 | + `YV`: Mesa Airlines Inc. 89 | - `tailnum`: Plane tail number 90 | - `flight`: Flight number 91 | - `origin`, `dest`: Airport codes for origin and destination. (Google can help 92 | you with what code stands for which airport.) 93 | - `air_time`: Amount of time spent in the air, in minutes. 94 | - `distance`: Distance flown, in miles. 
95 | - `hour`, `minute`: Time of departure broken in to hour and minutes. 96 | 97 | A very useful function for taking a quick peek at your data frame, and viewing 98 | its dimensions and data types is `str`, which stands for **str**ucture. 99 | 100 | ```{r str} 101 | str(nycflights) 102 | ``` 103 | 104 | The `nycflights` data frame is a massive trove of information. Let's think about 105 | some questions we might want to answer with these data: 106 | 107 | - We might want to find out how delayed flights headed to a particular 108 | destination tend to be. 109 | - We might want to evaluate how departure delays vary over months. 110 | - Or we might want to determine which of the three major NYC airports has a better 111 | on time percentage for departing flights. 112 | 113 | ### Seven verbs 114 | 115 | The `dplyr` package offers seven verbs (functions) for basic data 116 | manipulation: 117 | 118 | - `filter()` 119 | - `arrange()` 120 | - `select()` 121 | - `distinct()` 122 | - `mutate()` 123 | - `summarise()` 124 | - `sample_n()` 125 | 126 | We will use some of these functions in this lab, and learn about others in a 127 | future lab. 128 | 129 | 130 | ## Analysis 131 | 132 | ### Departure delays in flights to Raleigh-Durham (RDU) 133 | 134 | We can examine the distribution of departure delays of all flights with a 135 | histogram. 136 | 137 | ```{r hist-dep-delay} 138 | ggplot(data = nycflights, aes(x = dep_delay)) + 139 | geom_histogram() 140 | ``` 141 | 142 | This function says to plot the `dep_delay` variable from the `nycflights` data 143 | frame on the x-axis. It also defines a `geom` (short for geometric object), 144 | which describes the type of plot you will produce. 145 | 146 | Histograms are generally a very good way to see the shape of a single 147 | distribution, but that shape can change depending on how the data is split 148 | between the different bins. You can easily define the binwidth you want to use: 149 | 150 | ```{r hist-dep-delay-bins} 151 | ggplot(data = nycflights, aes(x = dep_delay)) + 152 | geom_histogram(binwidth = 15) 153 | ggplot(data = nycflights, aes(x = dep_delay)) + 154 | geom_histogram(binwidth = 150) 155 | ``` 156 | 157 |
158 | How do these three histograms with the various binwidths compare? 159 |
160 | 161 | If we want to focus on departure delays of flights headed to RDU only, we need to 162 | first `filter` the data for flights headed to RDU (`dest == "RDU"`) and then make 163 | a histogram of only departure delays of only those flights. 164 | 165 | ```{r rdu-flights-hist} 166 | rdu_flights <- nycflights %>% 167 | filter(dest == "RDU") 168 | ggplot(data = rdu_flights, aes(x = dep_delay)) + 169 | geom_histogram() 170 | ``` 171 | 172 | Let's decipher these three lines of code: 173 | 174 | - Line 1: Take the `nycflights` data frame, `filter` for flights headed to RDU, and 175 | save the result as a new data frame called `rdu_flights`. 176 | + `==` means "if it's equal to". 177 | + `RDU` is in quotation marks since it is a character string. 178 | - Line 2: Basically the same `ggplot` call from earlier for making a histogram, 179 | except that it uses the data frame for flights headed to RDU instead of all 180 | flights. 181 | 182 |
183 | **Logical operators: ** Filtering for certain observations (e.g. flights from a 184 | particular airport) is often of interest in data frames where we might want to 185 | examine observations with certain characteristics separately from the rest of 186 | the data. To do so we use the `filter` function and a series of 187 | **logical operators**. The most commonly used logical operators for data 188 | analysis are as follows: 189 | 190 | - `==` means "equal to" 191 | - `!=` means "not equal to" 192 | - `>` or `<` means "greater than" or "less than" 193 | - `>=` or `<=` means "greater than or equal to" or "less than or equal to" 194 |
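For example (purely as an illustration, not one of the graded questions), these operators can be combined with `filter` to pull out flights that departed an hour or more late:

```{r filter-long-delays}
# Illustration of a logical operator inside filter():
# flights that departed 60 or more minutes late
nycflights %>%
  filter(dep_delay >= 60)
```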
195 | 196 | We can also obtain numerical summaries for these flights: 197 | 198 | ```{r rdu-flights-summ} 199 | rdu_flights %>% 200 | summarise(mean_dd = mean(dep_delay), sd_dd = sd(dep_delay), n = n()) 201 | ``` 202 | 203 | Note that in the `summarise` function we created a list of two elements. The 204 | names of these elements are user defined, like `mean_dd`, `sd_dd`, `n`, and 205 | you could customize these names as you like (just don't use spaces in your 206 | names). Calculating these summary statistics also require that you know the 207 | function calls. Note that `n()` reports the sample size. 208 | 209 |
210 | **Summary statistics: ** Some useful function calls for summary statistics for a 211 | single numerical variable are as follows: 212 | 213 | - `mean` 214 | - `median` 215 | - `sd` 216 | - `var` 217 | - `IQR` 218 | - `range` 219 | - `min` 220 | - `max` 221 |
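Any of these can be dropped into `summarise` in the same way as `mean` and `sd`. For instance (an optional illustration, not a graded question), the median and IQR of departure delays for the RDU flights created above:

```{r rdu-flights-more-summ}
# Optional illustration: other summary statistics inside summarise()
rdu_flights %>%
  summarise(median_dd = median(dep_delay), iqr_dd = IQR(dep_delay), n = n())
```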
222 | 223 | We can also filter based on multiple criteria. Suppose we are interested in 224 | flights headed to San Francisco (SFO) in February: 225 | 226 | ```{r} 227 | sfo_feb_flights <- nycflights %>% 228 | filter(dest == "SFO", month == 2) 229 | ``` 230 | 231 | Note that we can separate the conditions using commas if we want flights that 232 | are both headed to SFO **and** in February. If we are interested in either 233 | flights headed to SFO **or** in February we can use the `|` instead of the comma. 234 | 235 |
236 | Create a new data frame that includes flights headed to SFO in February, and save 237 | this data frame as `sfo_feb_flights`. How many flights meet these criteria? 238 | 239 | * 68 240 | * 1345 241 | * 2286 242 | * 3563 243 | * 32735 244 |
245 | 246 | ```{r sfo-feb-flights} 247 | # type your code for Question 1 here, and Knit 248 | 249 | ``` 250 | 251 | 252 |
253 | Make a histogram and calculate appropriate summary statistics for **arrival** 254 | delays of `sfo_feb_flights`. Which of the following is false? 255 | 256 | * The distribution is unimodal. 257 | * The distribution is right skewed. 258 | * No flight is delayed more than 2 hours. 259 | * The distribution has several extreme values on the right side. 260 | * More than 50% of flights arrive on time or earlier than scheduled. 261 |
262 | 263 | ```{r sfo-feb-flights-arrival-delays} 264 | # type your code for Question 2 here, and Knit 265 | 266 | ``` 267 | 268 | Another useful functionality is being able to quickly calculate summary 269 | statistics for various groups in your data frame. For example, we can modify the 270 | above command using the `group_by` function to get the same summary stats for 271 | each origin airport: 272 | 273 | ```{r summary-custom-list-origin} 274 | rdu_flights %>% 275 | group_by(origin) %>% 276 | summarise(mean_dd = mean(dep_delay), sd_dd = sd(dep_delay), n = n()) 277 | ``` 278 | 279 | Here, we first grouped the data by `origin`, and then calculated the summary 280 | statistics. 281 | 282 |
283 | Calculate the median and interquartile range for `arr_delay`s of flights in the 284 | `sfo_feb_flights` data frame, grouped by carrier. Which carrier has the highest 285 | IQR of arrival delays? 286 | 287 | * American Airlines 288 | * JetBlue Airways 289 | * Virgin America 290 | * Delta and United Airlines 291 | * Frontier Airlines 292 |
293 | 294 | ```{r sfo-feb-flights-arrival-delays-carrier} 295 | # type your code for Question 3 here, and Knit 296 | 297 | ``` 298 | 299 | ### Departure delays over months 300 | 301 | Which month would you expect to have the highest average delay departing 302 | from an NYC airport? 303 | 304 | Let's think about how we would answer this question: 305 | 306 | - First, calculate monthly averages for departure delays. With the new language 307 | we are learning, we need to 308 | + `group_by` months, then 309 | + `summarise` mean departure delays. 310 | - Then, we need to `arrange` these average delays in `desc`ending order 311 | 312 | ```{r mean-dep-delay-months} 313 | nycflights %>% 314 | group_by(month) %>% 315 | summarise(mean_dd = mean(dep_delay)) %>% 316 | arrange(desc(mean_dd)) 317 | ``` 318 | 319 |
320 | Which month has the highest average departure delay from an NYC airport? 321 | 322 | * January 323 | * March 324 | * July 325 | * October 326 | * December 327 |
328 | 329 | ```{r highest-avg-dep-delay-month} 330 | # type your code for Question 4 here, and Knit 331 | 332 | ``` 333 | 334 | 335 |
336 | Which month has the highest median departure delay from an NYC airport? 337 | 338 | * January 339 | * March 340 | * July 341 | * October 342 | * December 343 |
344 | 345 | ```{r highest-median-dep-delay-month} 346 | # type your code for Question 5 here, and Knit 347 | 348 | ``` 349 | 350 | 351 |
352 | Is the mean or the median a more reliable measure for deciding which month(s) to 353 | avoid flying if you really dislike delayed flights, and why? 354 | 355 | * Mean would be more reliable as it gives us the true average. 356 | * Mean would be more reliable as the distribution of delays is symmetric. 357 | * Median would be more reliable as the distribution of delays is skewed. 358 | * Median would be more reliable as the distribution of delays is symmetric. 359 | * Both give us useful information. 360 |
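As a quick optional illustration of why these two measures can disagree, compare the overall mean and median departure delay; with a right-skewed distribution of delays, the mean is pulled well above the median by the extreme values.

```{r mean-vs-median-dd}
# Optional: overall mean versus median departure delay
nycflights %>%
  summarise(mean_dd = mean(dep_delay), median_dd = median(dep_delay))
```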
361 | 362 | 363 | We can also visualize the distributions of departure delays across months using 364 | side-by-side box plots: 365 | 366 | ```{r delay-month-box} 367 | ggplot(nycflights, aes(x = factor(month), y = dep_delay)) + 368 | geom_boxplot() 369 | ``` 370 | 371 | There is some new syntax here: We want departure delays on the y-axis and the 372 | months on the x-axis to produce side-by-side box plots. Side-by-side box plots 373 | require a categorical variable on the x-axis, however in the data frame `month` is 374 | stored as a numerical variable (numbers 1 - 12). Therefore we can force R to treat 375 | this variable as categorical, what R calls a **factor**, variable with 376 | `factor(month)`. 377 | 378 | ### On time departure rate for NYC airports 379 | 380 | Suppose you will be flying out of NYC and want to know which of the 381 | three major NYC airports has the best on time departure rate of departing flights. 382 | Suppose also that for you a flight that is delayed for less than 5 minutes is 383 | basically "on time". You consider any flight delayed for 5 minutes of more to be 384 | "delayed". 385 | 386 | In order to determine which airport has the best on time departure rate, 387 | we need to 388 | 389 | - first classify each flight as "on time" or "delayed", 390 | - then group flights by origin airport, 391 | - then calculate on time departure rates for each origin airport, 392 | - and finally arrange the airports in descending order for on time departure 393 | percentage. 394 | 395 | Let's start with classifying each flight as "on time" or "delayed" by 396 | creating a new variable with the `mutate` function. 397 | 398 | ```{r dep-type} 399 | nycflights <- nycflights %>% 400 | mutate(dep_type = ifelse(dep_delay < 5, "on time", "delayed")) 401 | ``` 402 | 403 | The first argument in the `mutate` function is the name of the new variable 404 | we want to create, in this case `dep_type`. Then if `dep_delay < 5` we classify 405 | the flight as `"on time"` and `"delayed"` if not, i.e. if the flight is delayed 406 | for 5 or more minutes. 407 | 408 | Note that we are also overwriting the `nycflights` data frame with the new 409 | version of this data frame that includes the new `dep_type` variable. 410 | 411 | We can handle all the remaining steps in one code chunk: 412 | 413 | ```{r} 414 | nycflights %>% 415 | group_by(origin) %>% 416 | summarise(ot_dep_rate = sum(dep_type == "on time") / n()) %>% 417 | arrange(desc(ot_dep_rate)) 418 | ``` 419 | 420 |
421 | If you were selecting an airport simply based on on time departure percentage, 422 | which NYC airport would you choose to fly out of? 423 | 424 | * EWR 425 | * JFK 426 | * LGA 427 |
428 | 
429 | ```{r on-time-dep-perc-airport} 
430 | # type your code for Question 7 here, and Knit 
431 | 
432 | ``` 
433 | 
434 | We can also visualize the distribution of on time and delayed departures across 
435 | the three airports using a segmented bar plot. 
436 | 
437 | ```{r} 
438 | ggplot(data = nycflights, aes(x = origin, fill = dep_type)) + 
439 |   geom_bar() 
440 | ``` 
441 | 
442 |
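Note that `geom_bar()` shows raw counts of on time and delayed flights at each airport. If you would rather compare proportions directly, which lines up more naturally with on time departure *rates*, one option is to rescale each bar to 1. This is only a sketch for exploration; the `position = "fill"` argument is the only change from the plot above.

```{r on-time-dep-rate-fill}
# Each bar is rescaled to a total of 1, so the fill shows the proportion of
# "on time" versus "delayed" departures at each origin airport
ggplot(data = nycflights, aes(x = origin, fill = dep_type)) +
  geom_bar(position = "fill") +
  labs(y = "proportion of flights")
```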
443 | Mutate the data frame so that it includes a new variable that contains the 
444 | average speed, `avg_speed`, traveled by the plane for each flight (in mph). What is 
445 | the tail number of the plane with the fastest `avg_speed`? **Hint:** Average speed 
446 | can be calculated as distance divided by the number of hours of travel, and note that 
447 | `air_time` is given in minutes. If you just want to show the `avg_speed` and 
448 | `tailnum` and none of the other variables, use the `select` function at the end of your 
449 | pipe to select just these two variables with `select(avg_speed, tailnum)`. You can 
450 | Google this tail number to find out more about the aircraft. 
451 | 
452 | * N666DN 
453 | * N755US 
454 | * N779JB 
455 | * N947UW 
456 | * N959UW 
457 |
458 | 459 | ```{r fastest-avg-speed-tailnum} 460 | # type your code for Question 8 here, and Knit 461 | 462 | ``` 463 | 464 | 465 |
466 | Make a scatterplot of `avg_speed` vs. `distance`. Which of the following is true 
467 | about the relationship between average speed and distance? 
468 | 
469 | * As distance increases the average speed of flights decreases. 
470 | * The relationship is linear. 
471 | * There is an overall positive association between distance and average speed. 
472 | * There are no outliers. 
473 | * The distribution of distances is uniform over 0 to 5000 miles. 
474 |
475 | 476 | ```{r avg-speed-dist-scatter} 477 | # type your code for Question 9 here, and Knit 478 | 479 | ``` 480 | 481 |
482 | Suppose you define a flight to be "on time" if it gets to the destination on 
483 | time or earlier than expected, regardless of any departure delays. Mutate the data 
484 | frame to create a new variable called `arr_type` with levels `"on time"` and 
485 | `"delayed"` based on this definition. Also mutate to create a new variable called 
486 | `dep_type` with levels `"on time"` and `"delayed"` depending on whether the flight was delayed for fewer than 5 minutes or for 5 minutes or more, respectively. In other words, if `arr_delay` is 0 minutes or fewer, `arr_type` is `"on time"`. If `dep_delay` is less than 5 minutes, `dep_type` is `"on time"`. Then, determine the on time arrival percentage based on whether the flight departed on time or not. What percent of flights that were `"delayed"` departing arrive `"on time"`? 
487 |
488 | 489 | ```{r on-time-arr-perc} 490 | # type your code for Question 10 here, and Knit 491 | 492 | ``` 493 | -------------------------------------------------------------------------------- /4.2_conjugate_priors/credible_interval.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Bayesian credible intervals" 3 | runtime: shiny 4 | output: statsr:::statswithr_lab 5 | --- 6 | 7 | ```{r echo=FALSE, message=FALSE} 8 | library(tidyverse) 9 | ``` 10 | 11 |
12 | Complete all **Exercises**, and submit answers to **Questions** in the **Quiz: Week 2 Lab** on Coursera. 13 |
14 | 
15 | ## Getting Started 
16 | 
17 | ### Load packages 
18 | 
19 | In this lab we will explore some basic Bayesian inference using conjugate priors 
20 | and credible intervals to examine some categorical and count data from the 
21 | [CDC's Behavioral Risk Factor Surveillance System](http://www.cdc.gov/brfss/) 
22 | (BRFSS). A subset of these data from 2013 has been made available in the 
23 | `statsr` package; as usual, we will first load the package and then the data set. 
24 | 
25 | Let's load the package, 
26 | 
27 | ```{r load-packages, message=FALSE} 
28 | library(statsr) 
29 | data(brfss) 
30 | ``` 
31 | 
32 | This data set contains 5000 observations of 6 variables: 
33 | 
34 | variable | description 
35 | ---------------- | --------------------------------------------- 
36 | `weight` | Respondent's weight in pounds. 
37 | `height` | Respondent's height in inches. 
38 | `sex` | Respondent's sex 
39 | `exercise` | Has the respondent exercised in the last 30 days 
40 | `fruit_per_day` | How many servings of fruit does the respondent consume per day 
41 | `vege_per_day` | How many servings of dark green vegetables does the respondent consume per day 
42 | 
43 | 
44 | ## Credible Interval Calculator 
45 | 
46 | Recall that the probability distribution (prior/posterior distribution) of a 
47 | parameter that describes the distribution of the data is given by: 
48 | 
49 | $$\begin{align*} 
50 | \text{Beta distribution} ~-~ & \pi(p; \alpha, \beta) = \text{Beta}(\alpha, \beta)\\ 
51 | \text{Gamma distribution} ~-~ & \pi(\lambda; \alpha, \beta) = \text{Gamma}(\alpha, \beta)\\ 
52 | \text{Normal distribution} ~-~ & \pi(\mu; \nu, \tau) = \mathscr{N}(\nu, \tau) 
53 | \end{align*}$$ 
54 | 
55 | Here, $p$, $\lambda$, and $\mu$ are the variables of their own distributions 
56 | (their values define the distributions of the data), and other parameters 
57 | such as $\alpha,\ \beta,\ \nu$, and $\tau$ are the parameters of the 
58 | distributions of $p$, $\lambda$, and $\mu$. 
59 | 
60 | (**Note:** In this lab, we use the following definition of the Gamma distribution: 
61 | $$ \pi(\lambda; \alpha, \beta) = \text{Gamma}(\alpha, \beta) = \frac{\beta^\alpha}{\Gamma(\alpha)}\lambda^{\alpha-1}e^{-\beta\lambda}$$ 
62 | This definition of the Gamma distribution is different from the one introduced 
63 | in the video lecture.) 
64 | 
65 | Below is an interactive app for visualizing posterior distributions and credible 
66 | intervals of $p$, $\lambda$, and $\mu$ given different values of parameters. We 
67 | will use this app to explore how both our choice of prior distribution, as well 
68 | as our data, affect the posterior distribution and the credible interval for 
69 | $p$, $\lambda$, and $\mu$. 
70 | 
71 | Note that this app assumes you know the posterior distribution as well as the 
72 | parameters of this distribution. In the remainder of the lab we will walk you 
73 | through how to calculate the posterior distribution in the Beta-Binomial 
74 | Conjugacy and the Gamma-Poisson Conjugacy cases based on real world data from 
75 | `BRFSS`. Then you will be asked to calculate the credible interval using code 
76 | similar to the one shown under the graph of the app. 
77 | 
78 | First, let us do some exercises to learn how to use this app. 
79 | 
80 | ```{r error=TRUE} 
81 | credible_interval_app() 
82 | ``` 
83 |
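To preview the kind of code the app displays, here is a minimal sketch of computing a credible interval directly from a quantile function. It uses a made-up Normal posterior with mean 0 and variance 1, deliberately not one of the posteriors in the questions below; note that `qnorm` expects a standard deviation, so a variance needs `sqrt()`.

```{r ci-quantile-sketch}
# Hypothetical posterior: mu | data ~ N(mean = 0, variance = 1)
post_mean <- 0
post_var  <- 1

# A 95% credible interval is bounded by the 2.5th and 97.5th posterior percentiles
qnorm(c(0.025, 0.975), mean = post_mean, sd = sqrt(post_var))
```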
85 | Suppose the posterior distribution of $\mu$ follows a Normal distribution with 86 | mean 10 and variance 5. Which of the following are the bounds of a 95% credible 87 | interval for $\mu$? Answer this question using the app. 88 | 89 | * (-1.96, 1.96) 90 | * (0.419, 0.872) 91 | * (0.959, 3.417) 92 | * (5.618, 14.382) 93 |
94 | 95 |
96 | Confirm your answer by running the code given below the distribution plot in 97 | the app. 98 |
99 | 100 | ```{r normal-ci} 101 | # Type your code for Exercise 1 here. 102 | ``` 103 | 104 |
105 | Suppose the posterior distribution of $p$ follows a Beta distribution with 106 | $\alpha = 2$ and $\beta = 5$. Which of the following are the bounds of a 90% 107 | credible interval for $p$? Answer this question using the app. 108 | 109 | * (-1.678, 5.678) 110 | * (0.043, 0.641) 111 | * (0.063, 0.582) 112 | * (0.071, 0.949) 113 |
114 | 115 |
116 | Confirm your answer by running the code given below the distribution plot in 117 | the app. 118 |
119 | 120 | ```{r beta-ci} 121 | # Type your code for the Exercise 2 here. 122 | ``` 123 | 124 | 125 |
126 | Suppose the posterior distribution of $\lambda$ follows a Gamma distribution 127 | with $\alpha = 4$ and $\beta = 8$. Which of the following are the bounds of a 128 | 99% credible interval for $\lambda$? Answer this question using the app. 129 | 130 | * (-3.284, 11.284) 131 | * (0.069, 0.693) 132 | * (0.084, 1.372) 133 | * (0.171, 0.969) 134 |
135 | 136 |
137 | Confirm your answer by running the code given below the distribution plot in 138 | the app. 139 |
140 | 
141 | ```{r gamma-ci} 
142 | # Type your code for the Exercise 3 here. 
143 | ``` 
144 | 
145 | 
146 | ## Beta-Binomial Conjugacy 
147 | 
148 | As we discussed in the videos, the Beta distribution is conjugate to the 
149 | Binomial distribution - meaning that if we use a Beta prior for the parameter 
150 | $p$ of the Binomial distribution then the posterior distribution of $p$ after 
151 | observing data will be another Beta distribution. 
152 | 
153 | $$ \pi(p) = \text{Beta}(a, b) $$ 
154 | $$ x\,|\,n,p ~\sim~ \text{Binom}(n,p) $$ 
155 | $$ p \,|\, x,n ~\sim~ \text{Beta}(\alpha, \beta).$$ 
156 | 
157 | Our goal with inference in general is to take specific observations (data) and 
158 | use them to make useful statements about unknown population parameters of 
159 | interest. The Beta-Binomial Conjugacy is a Bayesian approach for inference about 
160 | a single population proportion $p$. Whereas with the frequentist approach we 
161 | used $\hat{p} = x / n$, we will now just use $x$ and $n$ directly, with $x$ being 
162 | the number of successes obtained from $n$ identical Bernoulli trials. (A 
163 | Bernoulli trial is a random experiment with exactly two possible outcomes, 
164 | "success" and "failure", in which the probability of success is the same every 
165 | time the experiment is conducted.) As such, we can view $x$ as a Binomial random 
166 | variable with $n$ the number of trials, and $p$ the probability of success. 
167 | 
168 | To complete our Bayesian approach to inference, all we need to do is define our 
169 | prior beliefs about $p$ by specifying a prior distribution. Our choice of the prior 
170 | hyperparameters ($a$ and $b$) should reflect our prior beliefs about $p$. In 
171 | the following, we will use the term **hyperparameter** for the parameters of 
172 | prior/posterior distributions, and the term **parameter** for the unknown 
173 | parameters of the likelihood, such as $p$. For most conjugate distributions 
174 | there is usually a straightforward interpretation of these hyperparameters as 
175 | previously observed data -- in the case of the Beta-Binomial Conjugacy, we 
176 | can think of our hyperparameters as representing $a-1$ previous successes and $b-1$ previous failures. 
177 | 
178 | ### Data and the updating rule 
179 | 
180 | We will start by performing inference on the sex ratio of respondents to 
181 | `BRFSS`. We define success as being `Female`, and we would like to make 
182 | some statement about the overall sex ratio of American adults based on our 
183 | sample from `BRFSS`. We will do this by estimating $p$, the true proportion 
184 | of females in the American population, using credible intervals. For each 
185 | credible interval you compute, always check back in with your intuition, which 
186 | hopefully says that $p$ should be around 0.5 since we would expect roughly 50% 
187 | females and 50% males in the population. 
188 | 
189 | Here is the observed sex distribution in the data: 
190 | ```{r} 
191 | table(brfss$sex) 
192 | ``` 
193 | 
194 | Let's store the relevant quantities, the total sample size and the number of 
195 | females, for use in later calculations: 
196 | ```{r} 
197 | n <- length(brfss$sex) 
198 | x <- sum(brfss$sex == "Female") 
199 | ``` 
200 | 
201 | For each observed Binomial data point ($n$ and $x$) we can calculate 
202 | the values of the posterior parameters using the following updating rule: 
203 | 
204 | $$ \alpha = a + x $$ 
205 | $$ \beta = b + n - x $$ 
206 | 
207 | From the data we now have $x = `r x`$ (the number of females), and 
208 | $n - x = `r n - x`$ (the number of males). We'll start with a Beta prior where 
209 | $a = 1$ and $b = 1$. Remember that this is equivalent to a Uniform distribution. 
210 | By combining the data with the prior, we arrive at a posterior where 
211 | 
212 | $$ p \,|\, x,n ~\sim~ \text{Beta}(\alpha = 1 + `r x`,~ \beta = 1 + `r n - x`) $$ 
213 |
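As a minimal sketch of how this updating rule and the resulting interval can be computed in code (rather than read off the app), here is a made-up example with 10 successes in 40 trials and a uniform Beta(1, 1) prior; it intentionally does not use the `brfss` counts above, so it does not answer the next question for you. The `_hyp` names are just illustrative.

```{r beta-binomial-update-sketch}
# Hypothetical data: 10 successes out of 40 trials, with a uniform Beta(1, 1) prior
a_hyp <- 1; b_hyp <- 1
x_hyp <- 10; n_hyp <- 40

# Updating rule: alpha = a + x, beta = b + n - x
alpha_hyp <- a_hyp + x_hyp
beta_hyp  <- b_hyp + n_hyp - x_hyp

# 95% credible interval from the quantiles of the Beta posterior
qbeta(c(0.025, 0.975), shape1 = alpha_hyp, shape2 = beta_hyp)
```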
215 | What is the 95% credible interval for $p$, the proportion of females in the 216 | population, based on the posterior distribution obtained with the updating rule 217 | shown above. Use the credible interval app to answer this question. 218 | 219 | * (0.500, 0.536) 220 | * (0.503, 0.531) 221 | * (0.507, 0.530) 222 | * (0.468, 0.496) 223 |
224 | 225 |
226 | Which of the following is the correct Bayesian interpretation of this interval? 227 | 228 | * The probability that the true proportion of females lies in this interval is either 0 or 1. 229 | * The probability that the true proportion of females lies in this interval is 0.95. 230 | * 95\% of the time the true proportion of females is in this interval. 231 | * 95\% of true proportions of females are in this interval. 232 |
233 |
234 | 235 | Let's now use a more informative prior that reflects a **stronger** belief that 236 | the sex ratio should be 50-50. For this, we use a Beta prior with hyperparameters 237 | $a = 500$ and $b = 500$. 238 | 239 |
240 | Confirm by plotting the following two Beta distributions $\text{Beta}(a = 1, b = 1)$ 241 | and $\text{Beta}(a = 500, b = 500)$ using the app above to show that the 242 | $\text{Beta}(a = 500, b = 500)$ distribution is centered around 0.5 and much more 243 | narrow than the uniform distribution, i.e. $Beta(a = 1, b = 1)$. 244 |
245 | 246 |
247 | What is the 95% credible interval for $p$, the proportion of females in the 248 | population, based on a prior distribution of $\text{Beta}(a = 500, b = 500)$. 249 | **Hint:** You need to determine the hyperparameters of the posterior distribution, 250 | then use the app to construct the credible interval. 251 | 252 | * (0.498, 0.531) 253 | * (0.500, 0.528) 254 | * (0.504, 0.532) 255 | * (0.502, 0.527) 256 |
257 | 258 | Let's consider one other prior distribution: $\text{Beta}(a = 5, b = 200)$. 259 | 260 |
261 | Which is of the following is the center of the $\text{Beta}(a = 5, b = 200)$ 262 | distribution? **Hint:** modify the code under the distribution plot to get the 263 | center. 264 | 265 | * approximately 0.03 266 | * approximately 0.15 267 | * approximately 0.50 268 | * approximately 0.97 269 |
270 | 271 | ```{r q7} 272 | # Type your code for Question 7 here. 273 | ``` 274 | 275 |
276 | What is the 95% credible interval for $p$, the proportion of females in the 277 | population, based on a prior distribution of $\text{Beta}(a = 5, b = 200)$. 278 | **Hint:** You need to determine the posterior distribution first, then use the 279 | app to construct the credible interval. 280 | 281 | * (0.503, 0.531) 282 | * (0.499, 0.535) 283 | * (0.486, 0.509) 284 | * (0.484, 0.511) 285 |
286 | 
287 | In summary, when we used a prior distribution that was centered around a 
288 | realistic value for $p$ (the center is around 0.5), the credible interval we 
289 | obtained was also more realistic. However, when we used a strong prior distribution 
290 | that was centered around a clearly unrealistic value for $p$ (say the 
291 | $\text{Beta}(5, 200)$ prior), the credible interval we obtained did not match the 
292 | distribution of the data (with the proportion of female respondents 
293 | $2586/(2586+2414) \approx 0.517$). Hence, a good prior helps; however, a bad 
294 | prior can hurt your results. 
295 | 
296 | Next, let's turn our attention to the `exercise` variable, which indicates 
297 | whether the respondent exercised in the last 30 days. While for the `sex` variable 
298 | we had some intuition about the true proportion of females (we would expect it 
299 | to be around 0.5), many of us probably do not have a strong prior belief about 
300 | the proportion of Americans who exercise. In this case we would be more inclined 
301 | to use a non-informative prior, e.g. a uniform distribution, which says that $p$ 
302 | is equally likely to be anywhere between 0 and 1. 
303 | 
304 | Here is the observed exercise distribution in the data: 
305 | 
306 | ```{r} 
307 | table(brfss$exercise) 
308 | ``` 
309 |
311 | What is the 90% credible interval for $p$, the proportion of Americans who 312 | exercise, based on a uniform prior distribution? 313 | 314 | * (0.762, 0.785) 315 | * (0.764, 0.783) 316 | * (0.718, 0.737) 317 | * (0.758, 0.789) 318 |
319 | 
320 | 
321 | ## Gamma-Poisson Conjugacy 
322 | 
323 | Since the Poisson distribution describes the number of counts in a given 
324 | interval, we will use this distribution to model the `fruit_per_day` variable, 
325 | which records the servings of fruit the respondents consume per day. The Poisson 
326 | distribution has a single parameter, $\lambda$, which is the expected number of 
327 | counts per time period. 
328 | 
329 | The Gamma-Poisson conjugacy is another example of conjugate families where we use 
330 | the Gamma distribution as the prior for the count parameter $\lambda$. In this 
331 | lab, we use the following definition of the Gamma distribution: 
332 | $$ \pi(\lambda; \alpha, \beta) = \text{Gamma}(\alpha, \beta) = \frac{\beta^\alpha}{\Gamma(\alpha)}\lambda^{\alpha-1}e^{-\beta\lambda}$$ 
333 | 
334 | With Bayes' Rule and the likelihood, which is given by the Poisson distribution, 
335 | we will get a Gamma posterior for $\lambda$. 
336 | 
337 | $$ \pi(\lambda) = \text{Gamma}(a,b) $$ 
338 | $$ x\,|\,\lambda ~\sim~ \text{Poisson}(\lambda) $$ 
339 | $$ \lambda \,|\, x ~\sim~ \text{Gamma}(\alpha,\beta).$$ 
340 | 
341 | Once again, our choice of the prior parameters ($a$ and $b$) should reflect our 
342 | prior beliefs about the parameter $\lambda$. In the case of the Gamma-Poisson 
343 | conjugacy, we can view $a$ as the prior total count and $b$ as the prior 
344 | number of observations. For example, setting $a = 12$ and $b = 3$ reflects a 
345 | prior belief equivalent to having observed 3 respondents who consume a total of 12 fruits 
346 | per day. At first glance, this might sound equivalent to setting $a = 4$ and 
347 | $b = 1$ or $a = 120$ and $b = 30$; however, these three distributions, 
348 | $Gamma(a = 4, b = 1)$, $Gamma(a = 12, b = 3)$, and $Gamma(a = 120, b = 30)$, 
349 | while they all have the same expected value of 4, differ in their spreads, which 
350 | indicates a different degree of belief about the parameter $\lambda$. 
351 |
353 | Use the app to plot the following three prior Gamma distributions, 354 | $Gamma(a = 4, b = 1)$, $Gamma(a = 12, b = 3)$, and $Gamma(a = 120, b = 30)$. 355 | Confirm that they all have the same center but different spreads. Order them in 356 | ascending order of spreads, from least to most variable. 357 |
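If you would like to double-check what you see in the app, here is a minimal sketch that overlays the three prior densities with `dgamma`, using the rate parameterization that matches the definition of the Gamma distribution above. The relative spreads can also be compared through the prior variances, $a/b^2$.

```{r gamma-priors-sketch}
# All three priors have mean a/b = 4, but their variances a/b^2 differ
ggplot(data.frame(lambda = c(0, 12)), aes(x = lambda)) +
  stat_function(fun = dgamma, args = list(shape = 4,   rate = 1),  aes(color = "Gamma(4, 1)")) +
  stat_function(fun = dgamma, args = list(shape = 12,  rate = 3),  aes(color = "Gamma(12, 3)")) +
  stat_function(fun = dgamma, args = list(shape = 120, rate = 30), aes(color = "Gamma(120, 30)")) +
  labs(x = expression(lambda), y = "density", color = "prior")
```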
358 | 
359 | ### Data and the updating rule 
360 | 
361 | For each observed data point from the Poisson distribution ($x$) we can calculate 
362 | the values of the posterior parameters using the following updating rule: 
363 | 
364 | $$ \alpha = a + x $$ 
365 | $$ \beta = b + 1 $$ 
366 | 
367 | However, in this case we have 5000 observations, and we would like to avoid applying 
368 | this update for every single count individually. As we saw last week, we can use our 
369 | sequentially updated posterior as a new prior. As such, a more general multi-observation 
370 | updating rule is 
371 | 
372 | $$ \alpha = a + \sum_{i = 1}^n x_i $$ 
373 | $$ \beta = b + n $$ 
374 |
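Here is a minimal sketch of this multi-observation rule in code, using a made-up prior Gamma($a = 2$, $b = 1$) and made-up counts so that it does not give away the next question; the `_hyp` names are just illustrative.

```{r gamma-poisson-update-sketch}
# Hypothetical prior Gamma(a = 2, b = 1) and hypothetical observed counts
a_hyp <- 2; b_hyp <- 1
counts_hyp <- c(1, 3, 2)

# Multi-observation updating rule: alpha = a + sum(x_i), beta = b + n
alpha_hyp <- a_hyp + sum(counts_hyp)
beta_hyp  <- b_hyp + length(counts_hyp)
c(alpha = alpha_hyp, beta = beta_hyp)

# A 90% credible interval from the Gamma posterior (rate parameterization)
qgamma(c(0.05, 0.95), shape = alpha_hyp, rate = beta_hyp)
```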
376 | Using the multi-observation updating rule, what should the posterior distribution 377 | be when the hyperparameters of the Gamma prior are $a = 4$ and $b = 1$, and we 378 | have observed the data $x = \{2, 3, 4, 5, 4\}$. 379 | 380 | * Gamma($a = 22$, $b = 6$) 381 | * Gamma($a = 18$, $b = 5$) 382 | * Gamma($a = 18$, $b = 6$) 383 | * Gamma($a = 19$, $b = 8$) 384 |
385 | 386 | ```{r q10} 387 | # Type your code for Question 10 here. 388 | ``` 389 | 390 |
391 | The government recommends that Americans consume approximately 5 servings of 392 | fruits per day. Which of the following represents a weak prior that Americans 393 | on average follow this recommendation? 394 | 395 | * Gamma($a = 1$, $b = 5$) 396 | * Gamma($a = 5$, $b = 1$) 397 | * Gamma($a = 100$, $b = 500$) 398 | * Gamma($a = 500$, $b = 100$) 399 |
400 | 401 |
402 | Using the correct prior distribution from the previous question and the data of 403 | `fruit_per_day` in the `BRFSS` dataset, calculate the hyperparameters of the 404 | posterior distribution. 405 | 406 | * Gamma($\alpha = 8114$, $\beta = 5000$) 407 | * Gamma($\alpha = 8118$, $\beta = 5001$) 408 | * Gamma($\alpha = 8119$, $\beta = 5001$) 409 | * Gamma($\alpha = 8115$, $\beta = 5005$) 410 |
411 | 412 | ```{r q12} 413 | # Type your code for Question 12 here. 414 | ``` 415 | 416 | 417 |
418 | Using the correct posterior distribution from the previous question, calculate 419 | the 90% credible interval for $\lambda$, the expected number of servings of 420 | fruit Americans consume per day. 421 | 422 | * (1.575, 1.668) 423 | * (1.588, 1.659) 424 | * (1.592, 1.651) 425 | * (1.594, 1.653) 426 |
427 | 428 |
429 | Based on this result, do Americans appear to follow the government guidelines 430 | which recommend consuming 5 servings of fruits per day? 431 | 432 | * Yes 433 | * No 434 |
435 | 436 | 437 |
438 | Repeat the preceding analysis for number of servings of vegetables per day 439 | (`vege_per_day`), and evaluate whether Americans follow the government guidelines 440 | which recommend consuming 5 servings of vegetables per day. 441 |
442 | 443 | ```{r veg_day} 444 | # Type your code for the Exercise 6 here. 445 | ``` 446 | 447 |
448 | This work is licensed under [GNU General Public License v3.0](https://www.gnu.org/licenses/quick-guide-gplv3.html). 449 |
450 | -------------------------------------------------------------------------------- /3.2_multiple_regression/multiple_regression.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Multiple linear regression" 3 | output: statsr:::statswithr_lab 4 | 5 | references: 6 | - id: Hamermesh2005 7 | title: Beauty in the Classroom - Instructors' Pulchritude and Putative Pedagogical Productivity 8 | author: 9 | - family: Hamermesh 10 | given: Daniel S. 11 | - family: Parker 12 | given: Amy 13 | volume: 24 14 | URL: 'http://www.sciencedirect.com/science/article/pii/S0272775704001165' 15 | DOI: 10.1016/j.econedurev.2004.07.013 16 | publisher: Economics of Education Review 17 | ISSN: 0272-7757 18 | issue: 4 19 | page: 369-376 20 | type: article-journal 21 | issued: 22 | year: 2005 23 | month: 8 24 | - id: Gelman2007 25 | title: Data Analysis Using Regression and Multilevel/Hierarchical Models 26 | author: 27 | - family: Gelman 28 | given: Andrew 29 | - family: Hill 30 | given: Jennifer 31 | publisher: Cambridge University Press 32 | city: 33 | type: book 34 | issued: 35 | year: 2007 36 | edition: 1 37 | ISBN: 9780521686891 38 | --- 39 | 40 |
41 | Complete all **Exercises**, and submit answers to **Questions** on the Coursera 42 | platform. 43 |
44 | 45 | ## Grading the professor 46 | 47 | Many college courses conclude by giving students the opportunity to evaluate 48 | the course and the instructor anonymously. However, the use of these student 49 | evaluations as an indicator of course quality and teaching effectiveness is 50 | often criticized because these measures may reflect the influence of 51 | non-teaching related characteristics, such as the physical appearance of the 52 | instructor. The article titled, "Beauty in the classroom: instructors' 53 | pulchritude and putative pedagogical productivity" [@Hamermesh2005] 54 | found that instructors who are viewed to be better looking receive higher 55 | instructional ratings. 56 | 57 | In this lab we will analyze the data from this study in order to learn what goes 58 | into a positive professor evaluation. 59 | 60 | ## Getting Started 61 | 62 | ### Load packages 63 | 64 | In this lab we will explore the data using the `dplyr` package and visualize it 65 | using the `ggplot2` package for data visualization. The data can be found in the 66 | companion package for this course, `statsr`. 67 | 68 | Let's load the packages. 69 | 70 | ```{r load-packages, message=FALSE} 71 | library(statsr) 72 | library(dplyr) 73 | library(ggplot2) 74 | library(GGally) 75 | ``` 76 | 77 | This is the first time we're using the `GGally` package. We will be using the 78 | `ggpairs` function from this package later in the lab. 79 | 80 | ### The data 81 | 82 | The data were gathered from end of semester student evaluations for a large 83 | sample of professors from the University of Texas at Austin. In addition, six 84 | students rated the professors' physical appearance. (This is a slightly modified 85 | version of the original data set that was released as part of the replication 86 | data for *Data Analysis Using Regression and Multilevel/Hierarchical Models* 87 | [@Gelman2007].) The result is a data frame where each row contains a 88 | different course and columns represent variables about the courses and 89 | professors. 90 | 91 | Let's load the data: 92 | 93 | ```{r load-data, message=FALSE} 94 | data(evals) 95 | ``` 96 | 97 | variable | description 98 | ---------------- | ----------- 99 | `score` | average professor evaluation score: (1) very unsatisfactory - (5) excellent. 100 | `rank` | rank of professor: teaching, tenure track, tenured. 101 | `ethnicity` | ethnicity of professor: not minority, minority. 102 | `gender` | gender of professor: female, male. 103 | `language` | language of school where professor received education: english or non-english. 104 | `age` | age of professor. 105 | `cls_perc_eval` | percent of students in class who completed evaluation. 106 | `cls_did_eval` | number of students in class who completed evaluation. 107 | `cls_students` | total number of students in class. 108 | `cls_level` | class level: lower, upper. 109 | `cls_profs` | number of professors teaching sections in course in sample: single, multiple. 110 | `cls_credits` | number of credits of class: one credit (lab, PE, etc.), multi credit. 111 | `bty_f1lower` | beauty rating of professor from lower level female: (1) lowest - (10) highest. 112 | `bty_f1upper` | beauty rating of professor from upper level female: (1) lowest - (10) highest. 113 | `bty_f2upper` | beauty rating of professor from second upper level female: (1) lowest - (10) highest. 114 | `bty_m1lower` | beauty rating of professor from lower level male: (1) lowest - (10) highest. 
115 | `bty_m1upper` | beauty rating of professor from upper level male: (1) lowest - (10) highest. 116 | `bty_m2upper` | beauty rating of professor from second upper level male: (1) lowest - (10) highest. 117 | `bty_avg` | average beauty rating of professor. 118 | `pic_outfit` | outfit of professor in picture: not formal, formal. 119 | `pic_color` | color of professor's picture: color, black & white. 120 | 121 |
122 | Is this an observational study or an experiment? 123 | 124 | * Observational study 125 | * Experiment 126 |
127 | 128 |
129 | The original research question posed in the paper is whether beauty leads directly to the differences in course evaluations. Given the study design, should the question be rephrased? If so, how? 130 | 131 | * No, the question is worded accurately. 132 | * Yes, revise wording to "Is there an association between beauty and course evaluations?" 133 | * Yes, revise wording to "Does beauty score increase the professor's course evaluations?" 134 | * Yes, revise wording to "Does beauty score decrease the professor's course evaluations?" 135 |
136 | 137 | ## Exploring the data 138 | 139 |
140 | Which of the following statements is **false** about the distribution of `score`? 
141 | 
142 | * The median of the distribution is 4.3. 
143 | * 25\% of the students gave their professors a score of over 4.6. 
144 | * 11 students gave a professor a score below 3. 
145 | * The left skewness of the data suggests that the students are less likely to rate the professors highly. 
146 |
147 | ```{r score-dist} 148 | # type your code for Question 3 here, and Knit 149 | 150 | ``` 151 | 152 |
153 | **Exercise:** Excluding `score`, select two other variables and describe their 154 | relationship using an appropriate visualization (scatterplot, side-by-side boxplots, 155 | or mosaic plot). 156 |
157 | ```{r two-vars-eda} 158 | # type your code for the Exercise here, and Knit 159 | 160 | ``` 161 | 162 | ## Simple linear regression 163 | 164 | The fundamental phenomenon suggested by the study is that better looking teachers 165 | are evaluated more favorably. Let's create a scatterplot to see if this appears 166 | to be the case: 167 | 168 | ```{r scatter-score-bty_avg} 169 | ggplot(data = evals, aes(x = bty_avg, y = score)) + 170 | geom_point() 171 | ``` 172 | 173 | Before we draw conclusions about the trend, compare the number of observations 174 | in the data frame with the approximate number of points on the scatterplot. 175 | Is anything awry? 176 | 177 |
178 | **Exercise:** Replot the scatterplot, but this time replace the `geom_point()` layer 179 | with a `geom_jitter()` layer. (Use `?geom_jitter` to learn more.) What was misleading 180 | about the initial scatterplot? 181 |
182 | ```{r jitter} 183 | # type your code for the Exercise here, and Knit 184 | 185 | ``` 186 | 187 | Let's see if the apparent trend in the plot is something more than 188 | natural variation. Fit a linear model called `m_bty` to predict average professor 189 | score by average beauty rating and add the line to your plot using the following. If 190 | you do not remember how to do this, refer to the previous lab. 191 | 192 | ```{r scatter-score-bty_avg-line} 193 | ggplot(data = evals, aes(x = bty_avg, y = score)) + 194 | geom_jitter() + 195 | geom_smooth(method = "lm") 196 | ``` 197 | 198 | The blue line is the model. The shaded gray area around the line tells us about the 199 | variability we might expect in our predictions. To turn that off, use `se = FALSE`. 200 | 201 | ```{r scatter-score-bty_avg-line-no-se} 202 | ggplot(data = evals, aes(x = bty_avg, y = score)) + 203 | geom_jitter() + 204 | geom_smooth(method = "lm", se = FALSE) 205 | ``` 206 | 207 |
208 | **Exercise:** Print a summary of the linear model, write out the equation, and 209 | interpret the slope. 210 |
211 | 212 | ```{r summary-model} 213 | # type your code for the Exercise here, and Knit 214 | 215 | ``` 216 | 217 |
218 | Average beauty score is a statistically significant predictor of evaluation score. 219 | 220 | * True 221 | * False 222 |
223 | 224 |
225 | Use residual plots to evaluate whether the conditions of least squares regression are reasonable. Which of the following statements is an incorrect analysis of the residual plots and conditions? 226 | 227 | * Linear association: The residuals plot shows a random scatter. 228 | * Constant variance of residuals: No fan shape in residuals plot. 229 | * Nearly normal residuals: Residuals are right skewed, but the sample size is large, so this may not be an important violation of conditions. 230 | * Independent observations: Classes sampled randomly, no order effect. 231 |
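If you do not remember how to produce residual plots from the previous lab, here is a minimal sketch you could adapt in the chunk below. It assumes the simple model has been stored as `m_bty`, as suggested earlier in the lab.

```{r diag-m-bty-sketch}
# Fit the simple model if you have not already done so
m_bty <- lm(score ~ bty_avg, data = evals)

# Residuals vs. fitted values: look for a random scatter around zero
ggplot(data = m_bty, aes(x = .fitted, y = .resid)) +
  geom_point() +
  geom_hline(yintercept = 0, linetype = "dashed") +
  labs(x = "Fitted values", y = "Residuals")

# Histogram of residuals: look for approximate normality
ggplot(data = m_bty, aes(x = .resid)) +
  geom_histogram(binwidth = 0.25) +
  labs(x = "Residuals")
```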
232 | 
233 | ```{r diag-score-bty_avg} 
234 | # type your code for Question 5 here, and Knit 
235 | 
236 | ``` 
237 | 
238 | ## Multiple linear regression 
239 | 
240 | The data set contains several variables on the beauty score of the professor: 
241 | individual ratings from each of the six students who were asked to score the 
242 | physical appearance of the professors and the average of these six scores. Let's 
243 | take a look at the relationship between one of these scores and the average 
244 | beauty score. 
245 | 
246 | ```{r bty-rel} 
247 | ggplot(data = evals, aes(x = bty_f1lower, y = bty_avg)) + 
248 |   geom_jitter() 
249 | evals %>% 
250 |   summarise(cor(bty_avg, bty_f1lower)) 
251 | ``` 
252 | 
253 | As expected, the relationship is quite strong - after all, the average score is 
254 | calculated using the individual scores. We can actually take a look at the 
255 | relationships between all beauty variables (columns 13 through 19) using the 
256 | following command: 
257 | 
258 | ```{r bty-rels, fig.width = 12, fig.height= 5} 
259 | ggpairs(evals, columns = 13:19) 
260 | ``` 
261 | 
262 | These variables are collinear (correlated), and adding more than one of these 
263 | variables to the model would not add much value. In this 
264 | application and with these highly-correlated predictors, it is reasonable to use 
265 | the average beauty score as the single representative of these variables. 
266 | 
267 | In order to see if beauty is still a significant predictor of professor score 
268 | after we've accounted for the gender of the professor, we can add the gender 
269 | term into the model. 
270 | 
271 | ```{r scatter-score-bty_avg_gender} 
272 | m_bty_gen <- lm(score ~ bty_avg + gender, data = evals) 
273 | summary(m_bty_gen) 
274 | ``` 
275 | 
276 |
277 | P-values and parameter estimates should only be trusted if the conditions for the regression are reasonable. Using diagnostic plots, we can conclude that the conditions for this model are reasonable. 278 | 279 | * True 280 | * False 281 |
282 |
283 | ```{r diag-mlr-model} 284 | # type your code for the Exercise here, and Knit 285 | 286 | ``` 287 | 288 |
289 | **Exercise:** Print a summary of the multiple linear regression model. Is `bty_avg` 290 | still a significant predictor of `score`? Has the addition of `gender` to the 291 | model changed the parameter estimate for `bty_avg`? 292 |
293 | 294 | ```{r summary-mlr-model} 295 | # type your code for the Exercise here, and Knit 296 | 297 | ``` 298 | 299 | Note that the estimate for `gender` is now called `gendermale`. You'll see this 300 | name change whenever you introduce a categorical variable. The reason is that R 301 | recodes `gender` from having the values of `female` and `male` to being an 302 | indicator variable called `gendermale` that takes a value of $0$ for females and 303 | a value of $1$ for males. (Such variables are often referred to as "dummy" 304 | variables.) 305 | 306 | As a result, for females, the parameter estimate is multiplied by zero, leaving 307 | the intercept and slope form familiar from simple regression. 308 | 309 | $$ 310 | \begin{aligned} 311 | \widehat{score} 312 | &= \hat{\beta}_0 + \hat{\beta}_1 \times bty\_avg + \hat{\beta}_2 \times (0) \\ 313 | &= \hat{\beta}_0 + \hat{\beta}_1 \times bty\_avg 314 | \end{aligned} 315 | $$ 316 | 317 |
317 | 
318 | For two professors (one male and one female) who received the same beauty rating, the male professor is predicted to have a higher course evaluation score than the female professor. 
319 | 
320 | * True 
321 | * False 
322 |
323 | 
324 | The decision to call the indicator variable `gendermale` instead of `genderfemale` 
325 | has no deeper meaning. R simply codes the category that comes first 
326 | alphabetically as a $0$. (You can change the reference level of a categorical 
327 | variable, which is the level that is coded as a 0, using the `relevel` function. 
328 | Use `?relevel` to learn more.) 
329 |
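For instance, a minimal sketch of switching the reference level to `male` is shown below; this is not needed for the lab, it assumes `gender` is stored as a factor, and the name `evals_relevel` is just for illustration.

```{r relevel-sketch}
# Make "male" the reference level, so the indicator becomes genderfemale instead
evals_relevel <- evals %>%
  mutate(gender = relevel(gender, ref = "male"))

lm(score ~ bty_avg + gender, data = evals_relevel)
```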
331 | **Exercise:** Create a new model called `m_bty_rank` with `gender` removed and 332 | `rank` added in. How does R appear to handle categorical variables that have 333 | more than two levels? Note that the rank variable has three levels: teaching, 334 | tenure track, tenured. 335 |
336 | 337 | ```{r new-mlr-model} 338 | # type your code for the Exercise here, and Knit 339 | 340 | ``` 341 | 342 |
343 | Which of the following is the correct order of the three levels of rank if we were to order them from lowest predicted course evaluation score to highest predicted course evaluation score? 344 | 345 | * Teaching, Tenure Track, Tenured 346 | * Tenure track, Tenured 347 | * Tenure Track, Tenured, Teaching 348 | * Teaching, Tenured, Tenure Track 349 |
350 | 351 | The interpretation of the coefficients in multiple regression is slightly 352 | different from that of simple regression. The estimate for `bty_avg` reflects 353 | how much higher a group of professors is expected to score if they have a beauty 354 | rating that is one point higher *while holding all other variables constant*. In 355 | this case, that translates into considering only professors of the same rank 356 | with `bty_avg` scores that are one point apart. 357 | 358 | ## Prediction 359 | 360 | Suppose we want to use the model we created earlier, `m_bty_gen` to predict 361 | the evaluation score for a professor, Dr. Hypo Thetical, who is a male tenure track 362 | professor with an average beauty of 3. 363 | 364 | If we wanted to do this by hand, we would simply plug in these values into the 365 | linear model. 366 | 367 | We can also calculate the predicted value in R. 368 | 369 | First, we need to create a new data frame for this professor. 370 | ```{r new-prof} 371 | newprof <- data.frame(gender = "male", bty_avg = 3) 372 | ``` 373 | 374 | Note that I didn't need to add `rank = "tenure track"` to this data frame since 375 | this variable is not used in our model. 376 | 377 | Then, I can do the prediction using the `predict` function: 378 | ```{r new-prof-predict} 379 | predict(m_bty_gen, newprof) 380 | ``` 381 | 382 | We can also construct a prediction interval around this prediction, which will 383 | provide a measure of uncertainty around the prediction. 384 | ```{r new-prof-predict-interval} 385 | predict(m_bty_gen, newprof, interval = "prediction", level = 0.95) 386 | ``` 387 | 388 | Hence, the model predicts, with 95% confidence, that a male professor with an 389 | average beauty score of 3 is expected to have an evaluation score between 3.1 390 | and 5.18. 391 | 392 | ## The search for the best model 393 | 394 | We will start with a full model that predicts professor score based on rank, 395 | ethnicity, gender, language of the university where they got their degree, age, 396 | proportion of students that filled out evaluations, class size, course level, 397 | number of professors, number of credits, average beauty rating, outfit, and 398 | picture color. 399 | 400 |
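A minimal sketch of fitting such a full model is given below, assuming it uses exactly the predictors listed above; the name `m_full` is only a label chosen here for illustration. Before looking at the output, ask yourself which variable you would expect to have the highest p-value, that is, the weakest association with `score`.

```{r full-model-sketch}
# Full model with all of the predictors listed above
m_full <- lm(score ~ rank + ethnicity + gender + language + age + cls_perc_eval
             + cls_students + cls_level + cls_profs + cls_credits + bty_avg
             + pic_outfit + pic_color, data = evals)
summary(m_full)
```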
416 | **Exercise:** Check your suspicions from the previous exercise. Include the model 417 | output in your response. 418 |
419 | 420 |
421 | Which of the following is the correct interpretation of the coefficient associated with the ethnicity variable?
Non-minority professors are expected on average to score ... 422 | 423 | * 0.12 points lower than minority professors, all else held constant. 424 | * 0.12 points higher than minority professors, all else held constant. 425 | * 0.02 points lower than minority professors, all else held constant. 426 | * 0.02 points higher than minority professors, all else held constant. 427 |
428 | 429 |
430 | **Exercise:** Drop the variable with the highest p-value and re-fit the model. 431 | Did the coefficients and significance of the other explanatory variables change? 432 | (One of the things that makes multiple regression interesting is that coefficient 433 | estimates depend on the other variables that are included in the model.) If not, 434 | what does this say about whether or not the dropped variable was collinear with 435 | the other explanatory variables? 436 |
437 | ```{r p-val-select} 
438 | # type your code for the Exercise here, and Knit 
439 | 
440 | ``` 
441 | 
442 | Now we try a different model selection method: adjusted $R^2$. Create a new model, 
443 | `m1`, where you remove `rank` from the list of explanatory variables. Check out the 
444 | adjusted $R^2$ of this new model and compare it to the adjusted $R^2$ of the full model. 
445 | 
446 | ```{r eval=FALSE} 
447 | m1 <- lm(score ~ ethnicity + gender + language + age + cls_perc_eval 
448 |          + cls_students + cls_level + cls_profs + cls_credits + bty_avg, data = evals) 
449 | summary(m1)$adj.r.squared 
450 | ``` 
451 | 
452 | Then, try dropping the next variable from the full model (`ethnicity`): 
453 | 
454 | ```{r eval=FALSE} 
455 | m2 <- lm(score ~ rank + gender + language + age + cls_perc_eval + 
456 |          cls_students + cls_level + cls_profs + cls_credits + bty_avg, data = evals) 
457 | summary(m2)$adj.r.squared 
458 | ``` 
459 |
460 | 
461 | **Exercise:** Repeat this process until you have tried removing each variable 
462 | from the full model, one at a time, and determine which variable's removal yields 
463 | the highest improvement in the adjusted $R^2$. 
464 |
465 | 466 | ```{r adj-rsq-select} 467 | # type your code for the Exercise here, and Knit 468 | 469 | ``` 470 | 471 |
472 | Elimination of which variable from the full model yielded the highest adjusted R-squared? 473 | 474 | * `bty_avg` 475 | * `cls_profs` 476 | * `cls_students` 477 | * `rank` 478 |
479 | 480 | To complete the model selection we would continue removing variables one at a 481 | time until removal of another variable did not increase adjusted $R^2$. 482 | 483 |
484 | **Exercise:** The original paper describes how these data were gathered by taking 485 | a sample of professors from the University of Texas at Austin and including all 486 | courses that they have taught. Considering that each row represents a course, 487 | could this new information have an impact on any of the conditions of linear regression? 488 |
489 | 490 |
491 | **Exercise:** Based on your final model, describe the characteristics of a 492 | professor and course at University of Texas at Austin that would be associated 493 | with a high evaluation score. 494 |
495 | 496 |
497 | **Exercise:** Would you be comfortable generalizing your conclusions to apply 498 | to professors generally (at any university)? Why or why not? 499 |
500 | 501 |
502 | This is a product of OpenIntro that is released under a [Creative Commons Attribution-ShareAlike 3.0 Unported](http://creativecommons.org/licenses/by-sa/3.0). 503 | This lab was written by Mine Çetinkaya-Rundel and Andrew Bray. 504 |
505 | 506 | ## References -------------------------------------------------------------------------------- /1.1_intro_to_r/intro_to_r.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Introduction to R and RStudio" 3 | output: statsr:::statswithr_lab 4 | --- 5 | 6 |
7 | Complete all **Exercises**, and submit answers to **Questions** on the Coursera platform. 8 |
9 | 
10 | ## Introduction 
11 | 
12 | The goal of this lab is to introduce you to R and RStudio, which you'll be using 
13 | throughout the course both to learn the statistical concepts discussed in the 
14 | course and to analyze real data and come to informed conclusions. To straighten 
15 | out which is which: R is the name of the programming language itself and RStudio 
16 | is a convenient interface. 
17 | 
18 | As the labs progress, you are encouraged to explore beyond what the labs dictate; 
19 | a willingness to experiment will make you a much better programmer. Before we 
20 | get to that stage, however, you need to build some basic fluency in R. Today we 
21 | begin with the fundamental building blocks of R and RStudio: the interface, 
22 | reading in data, and basic commands. 
23 | 
24 | ## RStudio 
25 | 
26 | Your RStudio window has four panels. 
27 | 
28 | Your R Markdown file (this document) is in the upper left panel. 
29 | 
30 | The panel on the lower left is where the action happens. It's called the *console*. 
31 | Every time you launch RStudio, it will have the same text at the top of the 
32 | console telling you the version of R that you're running. Below that information 
33 | is the *prompt*. As its name suggests, this prompt is really a request, a 
34 | request for a command. Initially, interacting with R is all about typing commands 
35 | and interpreting the output. These commands and their syntax have evolved over 
36 | decades (literally) and now provide what many users feel is a fairly natural way 
37 | to access data and organize, describe, and invoke statistical computations. 
38 | 
39 | The panel in the upper right contains your *workspace* as well as a history of 
40 | the commands that you've previously entered. 
41 | 
42 | Any plots that you generate will show up in the panel in the lower right corner. 
43 | This is also where you can browse your files, access help, manage packages, etc. 
44 | 
45 | 
46 | ## R Packages 
47 | 
48 | R is an open-source programming language, meaning that users can contribute 
49 | packages that make our lives easier, and we can use them for free. For this lab, 
50 | and many others in the future, we will use the following R packages: 
51 | 
52 | - `statsr`: for data files and functions used in this course 
53 | - `dplyr`: for data wrangling 
54 | - `ggplot2`: for data visualization 
55 | 
56 | You should have already installed these packages using commands like 
57 | `install.packages` and `install_github`. 
58 | 
59 | Next, you need to load the packages in your working environment. We do this with 
60 | the `library` function. Note that you only need to **install** packages once, but 
61 | you need to **load** them each time you relaunch RStudio. 
62 | 
63 | ```{r load-packages, message = FALSE} 
64 | library(dplyr) 
65 | library(ggplot2) 
66 | library(statsr) 
67 | ``` 
68 | 
69 | To do so, you can 
70 | 
71 | - click on the green arrow at the top of the code chunk in the R Markdown (Rmd) 
72 | file, or 
73 | - highlight these lines, and hit the **Run** button on the upper right corner of the 
74 | pane, or 
75 | - type the code in the console. 
76 | 
77 | Going forward, you will be asked to load any relevant packages at the beginning 
78 | of each lab. 
79 | 
80 | ## Dataset 1: Dr. Arbuthnot's Baptism Records 
81 | 
82 | To get you started, run the following command to load the data. 
83 | 
84 | ```{r load-arbuthnot-data} 
85 | data(arbuthnot) 
86 | ``` 
87 | 
88 | To do so, once again, you can 
89 | 
90 | - click on the green arrow at the top of the code chunk in the R Markdown (Rmd) 
91 | file, or 
92 | - put your cursor on this line, and hit the **Run** button on the upper right 
93 | corner of the pane, or 
94 | - type the code in the console. 
95 | 
96 | This command instructs R to load some data: the Arbuthnot baptism counts for boys 
97 | and girls. You should see that the workspace area in the upper righthand corner of 
98 | the RStudio window now lists a data set called `arbuthnot` that has 82 observations 
99 | on 3 variables. As you interact with R, you will create a series of objects. 
100 | Sometimes you load them as we have done here, and sometimes you create them yourself 
101 | as the byproduct of a computation or some analysis you have performed. 
102 | 
103 | The Arbuthnot data set refers to Dr. John Arbuthnot, an 18th century 
104 | physician, writer, and mathematician. He was interested in the ratio of newborn 
105 | boys to newborn girls, so he gathered the baptism records for children born in 
106 | London for every year from 1629 to 1710. We can take a look at the data by 
107 | typing its name into the console. 
108 | 
109 | ```{r view-data} 
110 | arbuthnot 
111 | ``` 
112 | 
113 | However, printing the whole dataset in the console is not that useful. 
114 | One advantage of RStudio is that it comes with a built-in data viewer. Click on 
115 | the name `arbuthnot` in the *Environment* pane (upper right window) that lists 
116 | the objects in your workspace. This will bring up an alternative display of the 
117 | data set in the *Data Viewer* (upper left window). You can close the data viewer 
118 | by clicking on the *x* in the upper lefthand corner. 
119 | 
120 | What you should see are four columns of numbers, each row representing a 
121 | different year: the first entry in each row is simply the row number (an index 
122 | we can use to access the data from individual years if we want), the second is 
123 | the year, and the third and fourth are the numbers of boys and girls baptized 
124 | that year, respectively. Use the scrollbar on the right side of the console 
125 | window to examine the complete data set. 
126 | 
127 | Note that the row numbers in the first column are not part of Arbuthnot's data. 
128 | R adds them as part of its printout to help you make visual comparisons. You can 
129 | think of them as the index that you see on the left side of a spreadsheet. In 
130 | fact, the comparison to a spreadsheet will generally be helpful. R has stored 
131 | Arbuthnot's data in a kind of spreadsheet or table called a *data frame*. 
132 | 
133 | You can see the dimensions of this data frame by typing: 
134 | 
135 | ```{r dim-data} 
136 | dim(arbuthnot) 
137 | ``` 
138 | 
139 | This command should output `[1] 82 3`, indicating that there are 82 rows and 3 
140 | columns (we'll get to what the `[1]` means in a bit), just as it says next to 
141 | the object in your workspace. You can see the names of these columns (or 
142 | variables) by typing: 
143 | 
144 | ```{r names-data} 
145 | names(arbuthnot) 
146 | ``` 
147 | 
148 |
149 | How many variables are included in this data set? 150 | 151 | * 2 152 | * 3 153 | * 4 154 | * 82 155 | * 1710 156 |
157 | 158 |
159 | What years are included in this dataset? Hint: Take a look at the year 160 | variable in the Data Viewer to answer this question. 161 |
162 | 163 | You should see that the data frame contains the columns `year`, `boys`, and 164 | `girls`. At this point, you might notice that many of the commands in R look a 165 | lot like functions from math class; that is, invoking R commands means supplying 166 | a function with some number of arguments. The `dim` and `names` commands, for 167 | example, each took a single argument, the name of a data frame. 168 | 169 |
170 | **Tip: ** If you use the up and down arrow keys, you can scroll through your 171 | previous commands, your so-called command history. You can also access it 172 | by clicking on the history tab in the upper right panel. This will save 173 | you a lot of typing in the future. 174 |
175 | 
176 | ### R Markdown 
177 | 
178 | So far we have asked you to type your commands in the console. The console is a great 
179 | place for playing around with some code; however, it is not a good place for 
180 | documenting your work. Working in the console exclusively makes it difficult to 
181 | document your work as you go, and reproduce it later. 
182 | 
183 | R Markdown is a great solution for this problem. And you have already worked with 
184 | an R Markdown document -- this lab! Going forward, type the code for the questions 
185 | in the code chunks provided in the R Markdown (Rmd) document for the lab, and **Knit** 
186 | the document to see the results. 
187 | 
188 | ### Some Exploration 
189 | 
190 | Let's start to examine the data a little more closely. We can access the data in 
191 | a single column of a data frame separately using a command like 
192 | 
193 | ```{r view-boys} 
194 | arbuthnot$boys 
195 | ``` 
196 | 
197 | This command will only show the number of boys baptized each year. The dollar 
198 | sign basically says "go to the data frame that comes before me, and find the 
199 | variable that comes after me". 
200 |
202 | What command would you use to extract just the counts of girls born? 203 | 204 | * `arbuthnot$boys` 205 | * `arbuthnot$girls` 206 | * `girls` 207 | * `arbuthnot[girls]` 208 | * `$girls` 209 |
210 | 211 | ```{r extract-counts-of-girls-born} 212 | # type your code for the Question 2 here, and Knit 213 | 214 | ``` 215 | 216 | Notice that the way R has printed these data is different. When we looked at the 217 | complete data frame, we saw 82 rows, one on each line of the display. These data 218 | are no longer structured in a table with other variables, so they are displayed 219 | one right after another. Objects that print out in this way are called vectors; 220 | they represent a set of numbers. R has added numbers in [brackets] along the left 221 | side of the printout to indicate locations within the vector. For example, 5218 222 | follows [1], indicating that 5218 is the first entry in the vector. And if [43] 223 | starts a line, then that would mean the first number on that line would represent 224 | the 43rd entry in the vector. 225 | 226 | R has some powerful functions for making graphics. We can create a simple plot 227 | of the number of girls baptized per year with the command 228 | 229 | ```{r plot-girls-vs-year} 230 | ggplot(data = arbuthnot, aes(x = year, y = girls)) + 231 | geom_point() 232 | ``` 233 | 234 | Before we review the code for this plot, let's summarize the trends we see in the 235 | data. 236 | 237 |
238 | Which of the following best describes the number of girls baptised over the years included in this dataset? 239 | 240 | * There appears to be no trend in the number of girls baptised from 1629 to 1710. 241 | * There is initially an increase in the number of girls baptised, which peaks around 1640. After 1640 there is a decrease in the number of girls baptised, but the number begins to increase again in 1660. Overall the trend is an increase in the number of girls baptised. 242 | * There is initially an increase in the number of girls baptised. This number peaks around 1640 and then after 1640 the number of girls baptised decreases. 243 | * The number of girls baptised has decreased over time. 244 | * There is an initial increase in the number of girls baptised but this number appears to level around 1680 and not change after that time point. 245 |
246 | 
247 | Back to the code... We use the `ggplot()` function to build plots. If you run the 
248 | plotting code in your console, you should see the plot appear under the *Plots* tab 
249 | of the lower right panel of RStudio. Notice that the command above again looks like 
250 | a function, this time with arguments separated by commas. 
251 | 
252 | - The first argument is always the dataset. 
253 | - Next, we provide the variables from the dataset to be assigned to `aes`thetic 
254 | elements of the plot, e.g. the x and the y axes. 
255 | - Finally, we use another layer, separated by a `+`, to specify the `geom`etric 
256 | object for the plot. Since we want a scatterplot, we use `geom_point`. 
257 | 
258 | You might wonder how you are supposed to know the syntax for the `ggplot` function. 
259 | Thankfully, R documents all of its functions extensively. To read what a function 
260 | does and learn the arguments that are available to you, just type in a question mark 
261 | followed by the name of the function that you're interested in. Try the following in 
262 | your console: 
263 | 
264 | ```{r plot-help, tidy = FALSE} 
265 | ?ggplot 
266 | ``` 
267 | 
268 | Notice that the help file replaces the plot in the lower right panel. You can 
269 | toggle between plots and help files using the tabs at the top of that panel. 
270 | 
271 |
272 | More extensive help for plotting with the `ggplot2` package can be found at 
273 | http://docs.ggplot2.org/current/. The best (and easiest) way to learn the syntax is 
274 | to take a look at the sample plots provided on that page, and modify the code 
275 | bit by bit until you achieve the plot you want. 
276 |
277 | 278 | ### R as a big calculator 279 | 280 | Now, suppose we want to plot the total number of baptisms. To compute this, we 281 | could use the fact that R is really just a big calculator. We can type in 282 | mathematical expressions like 283 | 284 | ```{r calc-total-bapt-numbers} 285 | 5218 + 4683 286 | ``` 287 | 288 | to see the total number of baptisms in 1629. We could repeat this once for each 289 | year, but there is a faster way. If we add the vector for baptisms for boys to 290 | that of girls, R will compute all sums simultaneously. 291 | 292 | ```{r calc-total-bapt-vars} 293 | arbuthnot$boys + arbuthnot$girls 294 | ``` 295 | 296 | What you will see are 82 numbers (in that packed display, because we aren’t 297 | looking at a data frame here), each one representing the sum we’re after. Take a 298 | look at a few of them and verify that they are right. 299 | 300 | ### Adding a new variable to the data frame 301 | 302 | We'll be using this new vector to generate some plots, so we'll want to save it 303 | as a permanent column in our data frame. 304 | 305 | ```{r calc-total-bapt-vars-save} 306 | arbuthnot <- arbuthnot %>% 307 | mutate(total = boys + girls) 308 | ``` 309 | 310 | What in the world is going on here? The `%>%` operator is called the **piping** 311 | operator. Basically, it takes the output of the current line and pipes it into 312 | the following line of code. 313 | 314 |
315 | **A note on piping: ** Note that we can read these three lines of code as the following: 316 | 317 | *"Take the `arbuthnot` dataset and **pipe** it into the `mutate` function. 318 | Using this mutate a new variable called `total` that is the sum of the variables 319 | called `boys` and `girls`. Then assign this new resulting dataset to the object 320 | called `arbuthnot`, i.e. overwrite the old `arbuthnot` dataset with the new one 321 | containing the new variable."* 322 | 323 | This is essentially equivalent to going through each row and adding up the boys 324 | and girls counts for that year and recording that value in a new column called 325 | total. 326 |
327 | 328 |
329 | **Where is the new variable?** When you make changes to variables in your dataset,
330 | click on the name of the dataset again to update it in the data viewer.
331 |
332 | 333 | You'll see that there is now a new column called `total` that has been tacked on 334 | to the data frame. The special symbol `<-` performs an *assignment*, taking the 335 | output of one line of code and saving it into an object in your workspace. In 336 | this case, you already have an object called `arbuthnot`, so this command updates 337 | that data set with the new mutated column. 338 | 339 | We can make a plot of the total number of baptisms per year with the following command. 340 | 341 | ```{r plot-total-vs-year-line} 342 | ggplot(data = arbuthnot, aes(x = year, y = total)) + 343 | geom_line() 344 | ``` 345 | 346 | Note that using `geom_line()` instead of `geom_point()` results in a line plot instead 347 | of a scatter plot. You want both? Just layer them on: 348 | 349 | ```{r plot-total-vs-year-line-and-point} 350 | ggplot(data = arbuthnot, aes(x = year, y = total)) + 351 | geom_line() + 352 | geom_point() 353 | ``` 354 | 355 |
356 | Now, generate a plot of the proportion of boys born over time. What 357 | do you see? 358 |
359 |
360 | ```{r plot-proportion-of-boys-over-time}
361 | # type your code for the Exercise here, and Knit
362 |
363 | ```
364 |
365 | Finally, in addition to simple mathematical operators like subtraction and
366 | division, you can ask R to make comparisons like greater than, `>`, less than,
367 | `<`, and equality, `==`. For example, we can ask if boys outnumber girls in each
368 | year with the expression
369 |
370 | ```{r boys-more-than-girls}
371 | arbuthnot <- arbuthnot %>%
372 |   mutate(more_boys = boys > girls)
373 | ```
374 |
375 | This command adds a new variable to the `arbuthnot` data frame containing the values
376 | of either `TRUE` if that year had more boys than girls, or `FALSE` if that year
377 | did not (the answer may surprise you). This variable contains a different kind of
378 | data than we have considered so far. All other columns in the `arbuthnot` data
379 | frame have numerical values (the year, the number of boys and girls). Here,
380 | we've asked R to create *logical* data, data where the values are either `TRUE`
381 | or `FALSE`. In general, data analysis will involve many different kinds of data
382 | types, and one reason for using R is that it is able to represent and compute
383 | with many of them.
384 |
385 |
386 | ## Dataset 2: Present birth records
387 |
388 | In the previous few pages, you recreated some of the displays and preliminary
389 | analysis of Arbuthnot's baptism data. Next you will do a similar analysis,
390 | but for present day birth records in the United States. Load up the
391 | present day data with the following command.
392 |
393 | ```{r load-present-data}
394 | data(present)
395 | ```
396 |
397 | The data are stored in a data frame called `present`, which should now be loaded in
398 | your workspace.
399 |
400 |
401 | How many variables are included in this data set? 402 | 403 | * 2 404 | * 3 405 | * 4 406 | * 74 407 | * 2013 408 |
409 | 410 | ```{r variables-in-present} 411 | # type your code for Question 4 here, and Knit 412 | 413 | ``` 414 | 415 |
416 | What years are included in this dataset? **Hint:** Use the `range` 417 | function and `present$year` as its argument. 418 |
419 | 420 | ```{r years-in-present-data} 421 | # type your code for Exercise here, and Knit 422 | 423 | ``` 424 | 425 |
426 | Calculate the total number of births for each year and store these values in a new 427 | variable called `total` in the `present` dataset. Then, calculate the proportion of 428 | boys born each year and store these values in a new variable called `prop_boys` in 429 | the same dataset. Plot these values over time and based on the plot determine if the 430 | following statement is true or false: The proportion of boys born in the US has 431 | decreased over time. 432 | 433 | * True 434 | * False 435 |
436 | 437 | ```{r prop-boys-over-time} 438 | # type your code for Question 5 here, and Knit 439 | 440 | ``` 441 | 442 | 443 |
444 | Create a new variable called `more_boys` which contains the value of either `TRUE` 445 | if that year had more boys than girls, or `FALSE` if that year did not. Based on this 446 | variable which of the following statements is true? 447 | 448 | * Every year there are more girls born than boys. 449 | * Every year there are more boys born than girls. 450 | * Half of the years there are more boys born, and the other half more girls born. 451 |
452 | 453 | ```{r more-boys-per-year} 454 | # type your code for Question 6 here, and Knit 455 | 456 | ``` 457 | 458 | 459 |
460 | Calculate the boy-to-girl ratio each year, and store these values in a new variable called `prop_boy_girl` in the `present` dataset. Plot these values over time. Which of the following best describes the trend? 461 | 462 | * There appears to be no trend in the boy-to-girl ratio from 1940 to 2013. 463 | * There is initially an increase in boy-to-girl ratio, which peaks around 1960. After 1960 there is a decrease in the boy-to-girl ratio, but the number begins to increase in the mid 1970s. 464 | * There is initially a decrease in the boy-to-girl ratio, and then an increase between 1960 and 1970, followed by a decrease. 465 | * The boy-to-girl ratio has increased over time. 466 | * There is an initial decrease in the boy-to-girl ratio born but this number appears to level around 1960 and remain constant since then. 467 |
468 | 469 | ```{r prop-boy-girl-over-time} 470 | # type your code for Question 7 here, and Knit 471 | 472 | ``` 473 | 474 |
475 | In what year did we see the most total number of births in the U.S.? *Hint:* Sort 476 | your dataset in descending order based on the `total` column. You can do this 477 | interactively in the data viewer by clicking on the arrows next to the variable 478 | names. Or to arrange the data in a descenting order with new function: `desc` (for 479 | descending order). 480 | 481 | * 1940 482 | * 1957 483 | * 1961 484 | * 1991 485 | * 2007 486 |
487 |
488 | ```{r most-total-births}
489 | # type your code for Question 8 here
490 | # sample code is provided below, edit as necessary, uncomment, and then Knit
491 | #present %>%
492 | #  mutate(total = ?) %>%
493 | #  arrange(desc(total))
494 | ```
495 |
496 | ## Resources for learning R and working in RStudio
497 |
498 | That was a short introduction to R and RStudio, but we will provide you with more
499 | functions and a more complete sense of the language as the course progresses. You
500 | might find the following tips and resources helpful.
501 |
502 | - In this course we will be using the `dplyr` (for data wrangling) and `ggplot2` (for
503 | data visualization) packages extensively. If you are googling for R code, make sure
504 | to also include these package names in your search query. For example, instead
505 | of googling "scatterplot in R", google "scatterplot in R with ggplot2".
506 |
507 | - The following cheatsheets may come in handy throughout the course. Note that some
508 | of the code on these cheatsheets may be too advanced for this course; however, the
509 | majority of it will become useful as you progress through the course material.
510 |     - [Data wrangling cheatsheet](http://www.rstudio.com/wp-content/uploads/2015/02/data-wrangling-cheatsheet.pdf)
511 |     - [Data visualization cheatsheet](http://www.rstudio.com/wp-content/uploads/2015/12/ggplot2-cheatsheet-2.0.pdf)
512 |     - [R Markdown](http://www.rstudio.com/wp-content/uploads/2016/03/rmarkdown-cheatsheet-2.0.pdf)
513 |
514 | - While you will get plenty of exercise working with these packages in the labs of
515 | this course, if you would like further opportunities to practice we recommend
516 | checking out the relevant courses at [DataCamp](https://www.datacamp.com/courses).
517 |
518 |
519 | This is a derivative of an [OpenIntro](https://www.openintro.org/stat/labs.php) lab, and is released under a [Attribution-NonCommercial-ShareAlike 3.0 United States](https://creativecommons.org/licenses/by-nc-sa/3.0/us/) license. 520 |
521 | -------------------------------------------------------------------------------- /4.3_bayesian_inference/bayesian_inference.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Bayesian Inference for Numerical Data" 3 | output: statsr:::statswithr_lab 4 | --- 5 | 6 |
7 | Complete all **Exercises**, and submit answers to **Questions** in the **Quiz: Week 3 Lab** on Coursera. 8 |
9 |
10 | ## Getting Started
11 |
12 | In this lab we will review exploratory data analysis using the `ggplot2`
13 | package for data visualization, which is included in the `tidyverse`. The main
14 | focus of this lab is to be able to obtain and interpret credible intervals and
15 | hypothesis tests using Bayesian methods for numerical variables. The data and
16 | functions for inference can be found in the companion package for this course,
17 | `statsr`.
18 |
19 | Let's load the necessary packages for this week's lab:
20 |
21 | ```{r load-packages, message=FALSE, warning=FALSE}
22 | library(PairedData)
23 | library(tidyverse)
24 | library(statsr)
25 | ```
26 |
27 |
28 | ### The data
29 |
30 | In 2004, the state of North Carolina released a large data set containing
31 | information on births recorded in this state. This data set is useful to
32 | researchers studying the relation between habits and practices of expectant
33 | mothers and the birth of their children. We will work with a random sample
34 | of observations from this data set.
35 |
36 | Let's load the `nc` data set into our workspace from the `statsr` package.
37 |
38 | ```{r load-data}
39 | data(nc)
40 | ```
41 |
42 | We have observations on 13 different variables, some categorical and some
43 | numerical. The meaning of each variable is as follows.
44 |
45 | variable | description
46 | ---------------- | ---------------------------------------------
47 | `fage` | father's age in years.
48 | `mage` | mother's age in years.
49 | `mature` | maturity status of mother.
50 | `weeks` | length of pregnancy in weeks.
51 | `premie` | whether the birth was classified as premature (premie) or full-term.
52 | `visits` | number of hospital visits during pregnancy.
53 | `marital` | whether mother is `married` or `not married` at birth.
54 | `gained` | weight gained by mother during pregnancy in pounds.
55 | `weight` | weight of the baby at birth in pounds.
56 | `lowbirthweight` | whether baby was classified as low birthweight (`low`) or not (`not low`).
57 | `gender` | gender of the baby, `female` or `male`.
58 | `habit` | status of the mother as a `nonsmoker` or a `smoker`.
59 | `whitemom` | whether mom is `white` or `not white`.
60 |
61 | Note: These data should be familiar to those who took the
62 | *Inferential Statistics* course as part of the *Statistics with R*
63 | specialization, where the `nc` data were used in the
64 | **Inference for Numerical Data** lab.
65 |
66 | ### EDA
67 |
68 | As a first step in the analysis, let's take a look at the variables in the
69 | dataset and how `R` has encoded them. The most straightforward way of doing
70 | this is with the `glimpse` function.
71 |
72 | ```{r str}
73 | glimpse(nc)
74 | ```
75 |
76 | Another useful function is `summary`, which provides the range, quartiles, and
77 | means for numeric variables and counts for categorical variables. Additionally,
78 | if there are any missing observations (denoted `NA`), summary will provide the
79 | number of missing cases for each variable. Note that the output of the summary
80 | function can be too long and difficult to parse visually and interpret if the
81 | dataset has a large number of variables.
82 |
83 | ```{r nc-summary}
84 | summary(nc)
85 | ```
86 |
87 | As you review the variable summaries, consider which variables are categorical
88 | and which are numerical. For numerical variables, are there outliers? If you
89 | aren't sure or want to take a closer look at the data, you can make a graph.
90 |
91 | For example, we can examine the distribution of the amount of weight that a
92 | mother `gained` with a histogram.
93 |
94 | ```{r hist-weight}
95 | ggplot(data = nc, aes(x = gained)) +
96 |   geom_histogram(binwidth = 5)
97 | ```
98 |
99 | This code says to plot the `gained` variable from the `nc` data frame on the
100 | x-axis. It also defines a `geom` (short for geometric object), which describes
101 | the type of plot you will produce. We used a binwidth of 5; however, you can
102 | change this value and see how it affects the shape of the histogram. Also note
103 | that the function results in a warning saying that 27 rows have been removed.
104 | This is because 27 observations in the data have `NA` values for weight gained.
105 | You can confirm this by peeking back at the summary output above. If you need a
106 | refresher on using `ggplot2`, you may want to take some time to review the
107 | material in the earlier courses in this specialization.
108 |
109 |
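As a quick illustration of how the binwidth changes the picture, here is a sketch with narrower bins (the chunk name and the `eval = FALSE` option are ours):

```{r hist-weight-narrow, eval = FALSE}
# Same histogram as above, but with a binwidth of 1 pound;
# smaller bins show more detail at the cost of a noisier looking plot
ggplot(data = nc, aes(x = gained)) +
  geom_histogram(binwidth = 1)
```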
110 | How many of the 13 variables are categorical? 111 | 112 | * 5 113 | * 6 114 | * 7 115 | * 8 116 |
117 | 118 | ```{r Q1-summay} 119 | # Type your code for Question 1 here. 120 | ``` 121 | 122 | We will start with analyzing the weight of the babies at birth, which is 123 | contained in the variable `weight`. 124 | 125 |
126 | Use a visualization such as a histogram and summary statistics tools in R to 127 | analyze the distribution of `weight`. Which of the following best describes the 128 | distribution of `weight`? 129 | 130 | * Left skewed 131 | * Right skewed 132 | * Uniformly distributed 133 | * Normally distributed 134 |
135 |
136 | ```{r Q2-weight-dist}
137 | # Type your code for Question 2 here.
138 | ```
139 |
140 | The variable `premie` in the dataframe classifies births according to whether they
141 | were full term or premature (premie). We can use some of the functions from `dplyr`
142 | to create a new dataframe that limits the analysis to full term births.
143 |
144 | ```{r nc-omitNA}
145 | nc_fullterm = filter(nc, premie == 'full term')
146 | summary(nc_fullterm)
147 | ```
148 |
149 | The `filter` function keeps all observations (rows) from the dataframe `nc`
150 | for which the condition `premie == 'full term'` is met.
151 |
152 |
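Conditions given to `filter` can also be combined; a small sketch (the chunk name, the `eval = FALSE` option, and the `nc_fullterm_nonsmoker` object name are ours):

```{r filter-two-conditions, eval = FALSE}
# Keep only full term births to mothers recorded as nonsmokers;
# multiple conditions in filter() are combined with a logical AND
nc_fullterm_nonsmoker = filter(nc, premie == 'full term', habit == 'nonsmoker')
```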
153 | Repeat the visualization and summary with the weights from full term
154 | births. Does `weight` appear to be approximately normally distributed?
155 |
156 |
157 | ## Inference
158 |
159 | Next we will introduce the function `bayes_inference`, which we will use for
160 | constructing credible intervals and conducting hypothesis tests. The following
161 | illustrates how we would use `bayes_inference` to construct a 95%
162 | credible interval for `weight`, the Bayesian analogue of a 95% confidence
163 | interval.
164 |
165 |
166 | ```{r mean-inference, fig.align='center', out.width = "70%"}
167 | bayes_inference(y = weight, data = nc_fullterm,
168 |                 statistic = "mean", type = "ci",
169 |                 prior_family = "JZS", mu_0 = 7.7, rscale = 1,
170 |                 method = "simulation",
171 |                 cred_level = 0.95)
172 | ```
173 |
174 | Let's look at the meanings of the arguments of this custom function. The first
175 | argument, `y`, specifies the response variable that we are interested in: `weight`.
176 | The second argument, `data`, specifies the dataset `nc_fullterm` that contains
177 | the variable `weight`. The third argument, `statistic`, is the sample statistic
178 | we're using, or similarly, the population parameter we're estimating. The
179 | argument `type` specifies the type of inference that we want: credible
180 | intervals (`type = "ci"`) or hypothesis tests (`type = "ht"`). The argument `prior_family`
181 | indicates which prior distribution for any unknown parameters we will use for inference or testing, with options `JZS` (the Jeffreys-Zellner-Siow prior, which
182 | is the Jeffreys prior for the unknown variance and a Cauchy prior for the mean),
183 | `JUI` (the Jeffreys-Unit Information prior, which is the Jeffreys prior for the
184 | variance and the Unit Information Gaussian prior for the mean), `NG` (the
185 | conjugate Normal-Gamma prior for the mean and inverse of the variance), or `ref`
186 | (the independent Jeffreys reference prior for the variance and the uniform prior
187 | for the mean). Because we would like to use the same prior for constructing credible
188 | intervals and, later, for hypothesis testing, with results that are robust if we
189 | mis-specify the prior, we will use the `JZS` option. For all of the
190 | `prior_family` options, we need to specify prior hyperparameters. For `JZS`,
191 | the prior on the standardized effect $\mu/\sigma$ is a Cauchy centered at `mu_0`
192 | and with a scale of `rscale`. By default these are zero and one, respectively.
193 | The average birthweight for full term births in the US in 1994-1996 was 7.7
194 | pounds, which we will use as the center of our prior distribution via the
195 | argument `mu_0 = 7.7`. We will use the default argument `rscale = 1`. The method of inference can be either
196 | theory based (`method = "theoretical"`) or simulation based (`method = "simulation"`); in the
197 | case of the `JZS` prior for credible intervals, `"simulation"` is the only
198 | option as there are no closed-form results. We
199 | also specify that we are looking for the 95% credible interval by setting
200 | `cred_level = 0.95`, which is the default.
201 | For more information on the `bayes_inference` function, see the help file with
202 | `?bayes_inference`.
203 |
204 |
205 | Which of the following corresponds to the **95%** credible interval for the
206 | average birth weight of all full-term babies born in North Carolina?
207 |
208 |
209 | * There is a 95% chance that babies weigh 7.4 to 7.5 pounds.
210 | * There is a 95% chance that the average weight of babies in this sample is between 7.4 and 7.5 pounds.
211 | * There is a 95% chance that babies on average weigh 7.4 to 7.5 pounds.
212 |
213 |
214 | We can also conduct a Bayesian hypothesis test by calculating Bayes factors or
215 | posterior probabilities. Let us test whether the average birth weight in North
216 | Carolina for full term births is significantly different from the US average of
217 | 7.7 pounds from 1994-96. The two competing hypotheses are:
218 |
219 | $$ H_1: \mu = 7.7 $$
220 | $$ H_2: \mu \ne 7.7 $$
221 |
222 | To conduct this hypothesis test, we need to change the `type` argument to
223 | `type = "ht"` (hypothesis test) in the `bayes_inference` function. In addition, we
224 | need to specify the type of alternative hypothesis with the argument
225 | `alternative = "twosided"`. For faster calculation, change the method to
226 | `theoretical` and add `show_plot = FALSE`.
227 |
228 |
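Putting those pieces together, a sketch of the call could look like the following (the chunk name and the `eval = FALSE` option are ours; run your own version in the exercise chunk further below to see the Bayes factor):

```{r ht-weight-sketch, eval = FALSE}
# Hypothesis test version of the earlier credible interval call:
# type = "ht" with a two-sided alternative, theoretical method, no plot
bayes_inference(y = weight, data = nc_fullterm,
                statistic = "mean", type = "ht",
                alternative = "twosided",
                prior_family = "JZS", mu_0 = 7.7, rscale = 1,
                method = "theoretical", show_plot = FALSE)
```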
229 | Based on Jeffreys' scale for the interpretation of Bayes factors, how should we
230 | describe the evidence against $H_1$ from your results for the hypothesis test?
231 |
232 | * Not worth a bare mention
233 | * Positive
234 | * Strong
235 | * Very Strong
236 |
237 |
238 | ```{r Q4-HT-weight}
239 | # Type your code for the Exercise here.
240 | ```
241 |
242 | ## Prediction using MCMC
243 |
244 | A key advantage of Bayesian statistics is prediction and the probabilistic
245 | interpretation of predictions. Much of Bayesian prediction is done using
246 | simulation techniques, some of which were discussed near the end of this module.
247 | We will go over a simple simulation example to obtain the predictive distribution
248 | of the variable `weight` using the output of `bayes_inference`, which we will
249 | save to the object `weight_post`:
250 |
251 | ```{r weight-inference, fig.align='center', out.width = "70%"}
252 | weight_post = bayes_inference(y = weight, data = nc_fullterm,
253 |                               statistic = "mean", type = "ci",
254 |                               prior_family = "JZS", mu_0 = 7.7, rscale = 1,
255 |                               method = "simulation",
256 |                               cred_level = 0.95)
257 | ```
258 |
259 | The `names` function lists the objects that are stored in the output created by `bayes_inference`:
260 |
261 | ```{r}
262 | names(weight_post)
263 | ```
264 |
265 | In particular, the `samples` object is a `matrix` or table which contains the draws from the MCMC
266 | simulation, and includes columns for `mu` and `sig2`, which are posterior samples of the mean and variance, respectively. Let's see how we can use these to make predictions.
267 |
268 | ### Posterior predictive distribution of new observation $y_{n+1}$
269 |
270 | The distribution of any new observation conditional on the mean and variance is
271 | $$N(\mu, \sigma^2)$$ and if we knew $\mu$ and $\sigma^2$ we could draw a sample
272 | for the new observation from this normal distribution.
273 | While we do not know $\mu$ and $\sigma^2$, we do have draws of $\mu$ and $\sigma^2$
274 | from their posterior distributions. If we substitute these values into the
275 | normal distribution for $Y_{n+1}$, we can obtain samples from the predictive
276 | distribution of the birth weight for any new observation $y_{n+1}$.
277 |
278 | We'll first convert our `samples` into a dataframe and then use `mutate` to create draws from the predictive distribution using `rnorm`:
279 |
280 | ```{r pred-distribution}
281 | samples = as.data.frame(weight_post$samples)
282 | nsim = nrow(samples)
283 | samples = mutate(samples, y_pred = rnorm(nsim, mu, sqrt(sig2)))
284 | ```
285 |
286 | We can view an estimate of the predictive distribution by looking at a
287 | smoothed version of the histogram of the simulated data:
288 |
289 | ```{r preddens, fig.align="center", out.width="70%"}
290 | ggplot(data = samples, aes(x = y_pred)) +
291 |   geom_histogram(aes(y = ..density..), bins = 100) +
292 |   geom_density() +
293 |   xlab(expression(y[new]))
294 | ```
295 |
296 | A 95% central credible interval for a new observation is the interval (L, U)
297 | where $P(Y_{new} < L \mid Y) = 0.05/2$ and $P(Y_{new} > U \mid Y) = 0.05/2$.
298 | In this case, since the posterior distributions of $\mu$ and $Y_{new}$ are both
299 | symmetric, we can set L to be the 0.025 quantile and U to be the 0.975 quantile.
300 | Using the `quantile` function in `R`, we can find the 0.025 and 0.975 quantiles, as well as
301 | the median (0.50), of the predictive distribution:
302 |
303 | ```{r}
304 | dplyr::select(samples, mu, y_pred) %>%
305 |   map(quantile, probs=c(0.025, 0.50, 0.975))
306 | ```
307 | In the above code we are using `dplyr::select` to select just the columns `mu` and `y_pred` from `samples`. The use of `dplyr::` preceding `select` ensures that we are using the `select` function from the `dplyr` package, avoiding possible name conflicts, as several packages have a `select` function. We also take advantage of the pipe operator to send the selected columns to the `map` function, which applies the `quantile` function to each selected column with the probabilities given in the `probs` argument.
308 |
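If you only need the summary for a single column, you can also call `quantile` directly; a small sketch (the chunk name and the `eval = FALSE` option are ours):

```{r quantile-direct, eval = FALSE}
# 95% central credible interval and median for the predictive draws alone
quantile(samples$y_pred, probs = c(0.025, 0.50, 0.975))
```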
310 | For predicting the birth weight of a new full term baby in NC, 311 | 312 | * there is a 95% chance that their birth weight will be 7.4 to 7.5 pounds. 313 | * there is a 95% chance that their birth weight will be on average 7.4 to 7.5 pounds. 314 | * there is a 95% chance that their birth weight will be 5.4 to 9.5 pounds. 315 | * there is 50% chance that their birth weight will be 7.4 pounds. 316 |
317 | 318 | ```{r Q5-cred-int} 319 | # Type your code for Question 5 here. 320 | ``` 321 | 322 | 323 |
324 | Repeat the above analysis but find the predictive distribution for babies that 325 | were premature. 326 |
327 |
328 | ```{r E2-t-dist}
329 | # Type your code for Exercise 2 here.
330 | ```
331 |
332 | ## Bayesian inference for two independent means
333 |
334 | Next, let us consider whether there is a difference in baby weights between babies
335 | born to smokers and non-smokers. Here we will use the variable `habit` to
336 | distinguish between babies born to mothers who smoked and babies born to mothers
337 | who were non-smokers. Plotting the data is a useful first step because it helps
338 | us quickly visualize trends, identify strong associations, and develop research
339 | questions.
340 |
341 | To create side-by-side boxplots of a numerical variable by the levels of a
342 | categorical variable, we can use the following:
343 |
344 | ```{r weight-habit-box}
345 | ggplot(nc, aes(x = habit, y = weight)) +
346 |   geom_boxplot()
347 | ```
348 |
349 | which creates side-by-side boxplots of `weight` for smokers and non-smokers.
350 |
352 | Construct a side-by-side boxplot of `habit` and `weight` for the data using full 353 | term births and compare the two distributions. Which of the following is *false* 354 | about the relationship between `habit` and `weight`? 355 | 356 | * Median birth weight of babies born to non-smokers is slightly higher than that of babies born to smokers. 357 | * Range of birth weights of female babies are roughly the same as that of male babies. 358 | * Both distributions are approximately symmetric. 359 | * The IQRs of the distributions are roughly equal. 360 |
361 |
362 | ```{r habit-weight-box}
363 | # Type your code for the question here.
364 | ```
365 |
366 |
367 | The box plots show how the medians of the two distributions compare, but we can
368 | also compare the means of the distributions. The following code first groups
369 | the data by the `habit` variable and then calculates the mean `weight` in these
370 | groups using the `mean` function; as before, the `%>%` pipe operator takes the output
371 | of one function and *pipes* it into the next function.
372 |
373 | ```{r by-means}
374 | nc_fullterm %>%
375 |   group_by(habit) %>%
376 |   summarise(mean_weight = mean(weight))
377 | ```
378 |
379 | There is an observed difference, but is this difference statistically
380 | significant? In order to answer this question we will conduct a
381 | Bayesian hypothesis test.
382 |
383 | As before, we can use the `bayes_inference` function to test the hypothesis that the
384 | mean weight of babies born to non-smokers is different from the mean weight of
385 | babies born to smokers. The call is almost identical to the single mean case,
386 | except now we will provide `habit` as an explanatory variable (argument
387 | `x = habit`). Here, we use the theoretical method instead of simulation
388 | (argument `method = "theoretical"`).
389 |
390 | ```{r ht-habit-weight}
391 | bayes_inference(y = weight, x = habit, data = nc_fullterm,
392 |                 statistic = "mean",
393 |                 type = "ht", alternative = "twosided", null = 0,
394 |                 prior = "JZS", rscale = 1,
395 |                 method = "theoretical", show_plot = FALSE)
396 | ```
397 |
398 |
399 | Based on the Bayes factor calculated above, how strong is evidence against $H_1$? 400 | 401 | * Not worth a bare mention 402 | * Positive 403 | * Strong 404 | * Very Strong 405 |
406 | 407 |
408 | How would the Bayes factor above change if we were to increase the prior 409 | probability of $H_2$ to 0.75? (Hint: you may change the prior of $H_1$ and $H_2$ 410 | by specifying `hypothesis_prior = c(a, b)` where $P(H_1) = a$, $P(H_2) = b$, and 411 | $a+b = 1$.) 412 | 413 | * Get bigger 414 | * Get smaller 415 | * Stay the same 416 |
417 | 418 | ```{r Q8-ht-habit-weight-increase-H2} 419 | # Type your code for the question here. 420 | ``` 421 | 422 | If differences between the groups are expected to be small, using a value of 423 | `rscale = sqrt(2)/2` in the `JZS` prior is recommended. 424 | 425 |
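In practice this just means rerunning the two-sample test above with only the `rscale` argument changed; a sketch (the chunk name and the `eval = FALSE` option are ours):

```{r ht-habit-weight-small-rscale, eval = FALSE}
# Same two-sample hypothesis test as before, but with the smaller Cauchy
# scale recommended when the difference between groups is expected to be small
bayes_inference(y = weight, x = habit, data = nc_fullterm,
                statistic = "mean",
                type = "ht", alternative = "twosided", null = 0,
                prior = "JZS", rscale = sqrt(2)/2,
                method = "theoretical", show_plot = FALSE)
```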
426 | How would the Bayes factor for H2 to H1 change if we were to change the scale 427 | in the Cauchy prior `rscale = sqrt(2)/2`? 428 | 429 | * Get bigger 430 | * Get smaller 431 | * Stay the same 432 |
433 |
434 | ```{r Q9-ht-gender-weight-increase-rscale}
435 | # Type your code for Question 9 here.
436 | ```
437 |
438 |
439 | To quantify the magnitude of the difference in mean birth weight, we can use a
440 | credible interval. Change the `type` argument to `"ci"` to construct and record
441 | a credible interval for the difference between the weights of babies born to
442 | nonsmoking and smoking mothers, and interpret this interval in the context of the
443 | data. Note that by default you'll get a 95% credible interval. If you want to
444 | change the credible level, change the value of `cred_level`, which takes on a
445 | value between 0 and 1. Also note that when constructing a credible interval, arguments
446 | like `null` and `alternative` are not useful, so make sure to remove them, but
447 | include the prior mean `mu_0`.
448 |
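As a rough sketch of how those pieces fit together (the chunk name and the `eval = FALSE` option are ours, and so is the choice of `mu_0 = 0`, i.e. a prior centered at no difference between the groups):

```{r ci-habit-weight-sketch, eval = FALSE}
# Credible interval for the difference in mean birth weight between the
# habit groups; null and alternative are dropped and mu_0 is supplied
bayes_inference(y = weight, x = habit, data = nc_fullterm,
                statistic = "mean", type = "ci",
                prior = "JZS", mu_0 = 0, rscale = 1,
                method = "simulation", cred_level = 0.95)
```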
450 | Based on the 95% credible interval for the differences in full term birth 451 | weights for nonsmokers and smoker: 452 | 453 | * there is a 95% chance that babies born to nonsmoker mothers are on average 0.11 to 0.54 pounds lighter at birth than babies born to smoker mothers. 454 | * there is a 95% chance that the difference in average weights of babies whose moms are smokers and nonsmokers is between 0.11 to 0.54 pounds. 455 | * there is a 95% chance that the difference in average weights of babies in this sample whose moms are nonsmokers and smokers is between 0.11 to 0.54 pounds. 456 | * there is a 95% chance that babies born to nonsmoker mothers are on average 0.11 to 0.54 pounds heavier at birth than babies born to smoker mothers. 457 |
458 |
459 | ```{r Q10-ci-weight-habit-increase}
460 | # Type your code for Question 10 here.
461 | ```
462 |
463 | ## Bayesian inference for two paired means
464 |
465 | The second data set comes from a 2008 study, *A simple tool to ameliorate
466 | detainees' mood and well-being in Prison: Physical activities*. The study was
467 | performed in a penitentiary of the Rhone-Alpes region (France), which includes
468 | two establishments: one for remand prisoners and short sentences (Jail) and the
469 | second for sentenced persons (Detention Centre, DC). A total of 26 male
470 | subjects, imprisoned for between 3 and 48 months, participated in the study. The
471 | participants were divided into two groups: 15 "Sportsmen", who chose
472 | spontaneously to follow the physical program, and 11 "References", who did not
473 | and wished to remain sedentary. These data provide the perceived stress scale
474 | (PSS) of the participants in prison at the entry (`PSSbefore`) and at the exit
475 | (`PSSafter`).
476 |
477 | We can load the `PrisonStress` data set into our workspace using the `data`
478 | function once the `PairedData` package is loaded.
479 |
480 | ```{r load-data1}
481 | data("PrisonStress")
482 | ```
483 |
484 | This data set consists of 26 observations on 4 variables. They are summarized as
485 | follows:
486 |
487 | variable | description
488 | ------------|-----------------------------------------------------
489 | `Subject` | anonymous subjects
490 | `Group` | whether the subject chose to follow the physical programme (`Sport`) or not (`Control`)
491 | `PSSbefore` | perceived stress measurement at the entry
492 | `PSSafter` | perceived stress measurement at the exit
493 |
494 |
495 | We have two groups of observations: the `Sport` group, the ones who chose to
496 | follow the physical training program, and the `Control` group, the ones who
497 | chose not to. We are interested in whether, on average, there is any
498 | difference in the perceived stress scale (PSS) before they started the training
499 | (at the entry) and after the training (at the exit).
500 |
501 | We first analyze the `Control` group data. We subset the data according to the
502 | `Group` variable using the `dplyr` package, and save this into a smaller data
503 | set, `pss_control`.
504 |
505 | ```{r select-control}
506 | pss_control = PrisonStress %>%
507 |   filter(Group == "Control") %>%
508 |   mutate(diff = PSSbefore - PSSafter)
509 | ```
510 |
511 | where the third line calculates the difference between each subject's PSS before
512 | and after the training and saves it as a new variable, `diff`.
513 |
514 |
515 | We can now conduct the following hypothesis test:
516 | $$ H_1: \mu_{\text{before}} = \mu_{\text{after}}\qquad \Longrightarrow \qquad H_1: \mu_{\text{diff}} = 0, $$
517 |
518 | $$ H_2: \mu_{\text{before}} \neq \mu_{\text{after}}\qquad \Longrightarrow \qquad H_2: \mu_{\text{diff}} \neq 0. $$
519 |
520 | We use the `bayes_inference` function to calculate the Bayes factor. The code is
521 | similar to the one we used for inference for one mean, except that we need to
522 | set `null = 0`, because we are comparing the mean of the difference to 0.
523 |
524 | ```{r ht-control-diff}
525 | bayes_inference(y = diff, data = pss_control,
526 |                 statistic = "mean",
527 |                 type = "ht", alternative = "twosided", null = 0,
528 |                 prior = "JZS", rscale = 1,
529 |                 method = "simulation", show_plot = FALSE)
530 | ```
531 |
532 | While there appears to be an increase in stress, based on Jeffreys' scale of
533 | evidence, the evidence against $H_1$ is *worth no more than a bare mention*.
534 |
535 |
537 | Conduct the same hypothesis test for the mean of the difference in perceived 538 | stress scale for the `sport` group. Based of Jeffrey's scale for interpretation 539 | of a Bayes factors how should we describe the evidence against $H_1$ from the 540 | results? 541 | 542 | * Not worth a bare mention 543 | * Positive 544 | * Strong 545 | * Very strong 546 |
547 |
548 | ```{r Q11-ht-sport-diff}
549 | # Type your code for Question 11 here.
550 | ```
551 |
552 | It is possible that other factors during this period of time could affect stress.
553 | By combining data from both groups we can increase our sample size, which
554 | provides greater power to detect a difference due to the intervention. If we
555 | assume that the variation in the two groups is comparable, we can use the
556 | differences in the before and after measurements to test whether the
557 | intervention had an effect on stress levels.
558 |
559 |
560 | Create a new data frame with a variable that is the difference in pre and post
561 | stress measurements, and test the hypothesis that the mean difference in the
562 | control group is equal to the mean difference in the sport group versus the
563 | hypothesis that the means are not equal.
564 |
565 | 566 | ```{r} 567 | # Type your code for Exercise 2 here. 568 | ``` 569 | 570 | 571 | 572 | --------------------------------------------------------------------------------