2 |
// [[Rcpp::export]]
double square(double number){
    // Return the square of `number`.
    // Plain multiplication; equivalent to pow(number, 2) for doubles.
    return number * number;
}
7 |
8 | // [[Rcpp::export]]
9 | double to_radians_cpp(double degrees){
10 | return(degrees * 3.141593 / 180);
11 | }
12 |
13 |
14 | // [[Rcpp::export]]
15 | double sum2(Rcpp::NumericVector a_vector){
16 | double running_sum = 0;
17 | int length = a_vector.size();
18 | for( int i = 0; i < length; i++ ){
19 | running_sum = running_sum + a_vector(i);
20 | }
21 | return(running_sum);
22 | }
23 |
24 |
25 |
// [[Rcpp::export]]
double haversine_cpp(double lat1, double long1,
                     double lat2, double long2,
                     std::string unit="km"){
    // Great-circle distance between two points given in decimal degrees,
    // computed with the haversine formula.
    //
    // lat1, long1 : coordinates of the first point (degrees)
    // lat2, long2 : coordinates of the second point (degrees)
    // unit        : "km" returns kilometres; any other value returns
    //               miles (original contract, preserved)
    //
    // Fixes: degrees are converted with a full-precision pi (the old
    // helper used 3.141593, ~1e-7 relative error) and the Earth radius
    // is a double rather than an int.
    const double pi = 3.14159265358979323846;
    const double deg2rad = pi / 180.0;
    const double radius = 6378.0;   // equatorial Earth radius, km

    double phi1 = lat1 * deg2rad;
    double phi2 = lat2 * deg2rad;
    double delta_phi = (lat2 - lat1) * deg2rad;
    double delta_lambda = (long2 - long1) * deg2rad;

    double term1 = pow(sin(delta_phi / 2), 2);
    double term2 = cos(phi1) * cos(phi2) * pow(sin(delta_lambda / 2), 2);
    double the_terms = term1 + term2;
    double delta_sigma = 2 * atan2(sqrt(the_terms), sqrt(1 - the_terms));
    double distance = radius * delta_sigma;

    /* if it is anything *but* km it is miles */
    if (unit != "km") {
        return distance * 0.621371;
    }
    return distance;
}
48 |
49 |
50 |
51 | // [[Rcpp::export]]
52 | double single_core_cpp(Rcpp::NumericMatrix mat){
53 | int nrows = mat.nrow();
54 | int numcomps = nrows*(nrows-1)/2;
55 | double running_sum = 0;
56 | for( int i = 0; i < nrows; i++ ){
57 | for( int j = i+1; j < nrows; j++){
58 | double this_dist = haversine_cpp(mat(i,0), mat(i,1),
59 | mat(j,0), mat(j,1));
60 |
61 | running_sum = running_sum + this_dist;
62 | }
63 | }
64 | return running_sum / numcomps;
65 | }
66 |
67 |
68 |
--------------------------------------------------------------------------------
/Chapter16/Chapter16.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Data-Analysis-with-R-Second-Edition/ece3101ca2f43f725c13a67d27810297bd31142a/Chapter16/Chapter16.txt
--------------------------------------------------------------------------------
/Chapter17/Chapter17.txt:
--------------------------------------------------------------------------------
1 | RStudio
2 |
3 | library(ggplot2)
4 | nothing <- data.frame(a=rbinom(1000, 20, .5),
5 | b=c("red", "white"),
6 | c=rnorm(1000, mean=100, sd=10))
7 | qplot(c, data=nothing, geom="histogram")
8 | write.csv(nothing, "nothing.csv")
9 | Execute the statements one by one. Notice that
10 |
11 |
12 |
13 | Running R scripts
14 |
15 |
16 | R CMD BATCH nothing.R
17 |
18 | R --vanilla CMD BATCH nothing.R
19 |
20 | Rscript nothing.R
21 |
22 | Rscript --vanilla nothing.R
23 |
24 |
25 |
26 |
27 |
28 | An example script
29 |
30 | #!/usr/bin/Rscript --vanilla
31 | ###########################################################
32 | ## ##
33 | ## nyc-sat-scores.R ##
34 | ## ##
35 | ## Author: Tony Fischetti ##
36 | ## tony.fischetti@gmail.com ##
37 | ## ##
38 | ###########################################################
39 | ##
40 | ## Aim: to use Bayesian analysis to compare NYC's 2010
41 | ## combined SAT scores against the average of the
42 | ## rest of the country, which, according to
43 | ## FairTest.com, is 1509
44 | ##
45 | # workspace cleanup
46 | rm(list=ls())
47 | # options
48 | options(echo=TRUE)
49 | options(stringsAsFactors=FALSE)
50 | # libraries
51 | library(assertr) # for data checking
52 | library(runjags) # for MCMC
53 | # make sure everything is all set with JAGS
54 | testjags()
55 | # yep!
56 | ## read data file
57 | # data was retrieved from NYC Open Data portal
58 | # direct link:
59 | # https://data.cityofnewyork.us/api/views/zt9s-n5aj/rows.csv?accessType=DOWNLOAD
60 |
61 | nyc.sats <- read.csv("./data/SAT_Scores_NYC_2010.csv")
62 | # let's give the columns easier names
63 | better.names <- c("id", "school.name", "n", "read.mean",
64 | "math.mean", "write.mean")
65 | names(nyc.sats) <- better.names
66 | # there are 460 rows but almost 700 NYC schools
67 | # we will *assume*, then, that this is a random
68 | # sample of NYC schools
69 | # let's first check the veracity of this data...
70 | #nyc.sats <- assert(nyc.sats, is.numeric,
71 | # n, read.mean, math.mean, write.mean)
72 | # It looks like check failed because there are "s"s for some
73 | # rows. (??) A look at the data set descriptions indicates
74 | # that the "s" is for schools with 5 or fewer students.
75 | # For our purposes, let's just exclude them.
76 | # This is a function that takes a vector, replaces all "s"s
77 | # with NAs and converts all non-"s"s into numerics
78 | remove.s <- function(vec){
79 | ifelse(vec=="s", NA, vec)
80 | }
81 | nyc.sats$n <- as.numeric(remove.s(nyc.sats$n))
82 | nyc.sats$read.mean <- as.numeric(remove.s(nyc.sats$read.mean))
83 | nyc.sats$math.mean <- as.numeric(remove.s(nyc.sats$math.mean))
84 | nyc.sats$write.mean <- as.numeric(remove.s(nyc.sats$write.mean))
85 | # Remove schools with fewer than 5 test takers
86 | nyc.sats <- nyc.sats[complete.cases(nyc.sats), ]
87 | # Calculate a total combined SAT score
88 | nyc.sats$combined.mean <- (nyc.sats$read.mean +
89 | nyc.sats$math.mean +
90 | nyc.sats$write.mean)
91 | # Let's build a posterior distribution of the true mean
92 | # of NYC high schools' combined SAT scores.
93 | # We're not going to look at the summary statistics, because
94 | # we don't want to bias our priors
95 | # Specify a standard gaussian model
96 | the.model <- "
97 | model {
98 | # priors
99 | mu ~ dunif(0, 2400)
100 | stddev ~ dunif(0, 500)
101 | tau <- pow(stddev, -2)
102 | # likelihood
103 | for(i in 1:theLength){
104 | samp[i] ~ dnorm(mu, tau)
105 | }
106 | }"
107 | the.data <- list(
108 | samp = nyc.sats$combined.mean,
109 | theLength = length(nyc.sats$combined.mean)
110 | )
111 | results <- autorun.jags(the.model, data=the.data,
112 | n.chains = 3,
113 | monitor = c('mu', 'stddev'))
114 | # View the results of the MCMC
115 | print(results)
116 | # Plot the MCMC diagnostics
117 | plot(results, plot.type=c("histogram", "trace"), layout=c(2,1))
118 | # Looks good!
119 | # Let's extract the MCMC samples of the mean and get the
120 | # bounds of the middle 95%
121 | results.matrix <- as.matrix(results$mcmc)
122 | mu.samples <- results.matrix[,'mu']
123 | bounds <- quantile(mu.samples, c(.025, .975))
124 | # We are 95% sure that the true mean is between 1197 and 1232
125 | # Now let's plot the marginal posterior distribution for the mean
126 | # of the NYC high schools' combined SAT grades and draw the 95%
127 | # percent credible interval.
128 | plot(density(mu.samples),
129 | main=paste("Posterior distribution of mean combined SAT",
130 | "score in NYC high schools (2010)", sep="\n"))
131 | lines(c(bounds[1], bounds[2]), c(0, 0), lwd=3, col="red")
132 | # Given the results, the SAT scores for NYC high schools in 2010
133 | # are *incontrovertibly* not on par with the average SAT scores of
134 | # the nation.
135 |
136 |
137 |
138 |
139 |
140 | Scripting and reproducibility
141 |
142 | > devtools::session_info()
143 | Session info ---------------------------------
144 | setting value
145 | version R version 3.2.1 (2015-06-18)
146 | system x86_64, darwin13.4.0
147 | ui RStudio (0.99.486)
148 | language (EN)
149 | collate en_US.UTF-8
150 | tz America/New_York
151 | date 1969-07-20
152 | Packages -------------------------------------
153 | package * version date source
154 | assertr * 1.0.0 2015-06-26 CRAN (R 3.2.1)
155 | coda 0.17-1 2015-03-03 CRAN (R 3.2.0)
156 | devtools 1.9.1 2015-09-11 CRAN (R 3.2.0)
157 | digest 0.6.8 2014-12-31 CRAN (R 3.2.0)
158 | lattice 0.20-33 2015-07-14 CRAN (R 3.2.0)
159 | memoise 0.2.1 2014-04-22 CRAN (R 3.2.0)
160 | modeest 2.1 2012-10-15 CRAN (R 3.2.0)
161 |
162 |
163 | rjags 3-15 2015-04-15 CRAN (R 3.2.0)
164 | runjags * 2.0.2-8 2015-09-14 CRAN (R 3.2.0)
165 |
166 |
167 |
168 |
169 |
170 |
171 |
172 | R projects
173 |
174 | read.csv("/Users/bensisko/Desktop/SAT_Scores_NYC_2010.csv")
175 |
176 |
177 | #!/usr/bin/Rscript --vanilla
178 | source("./code/load-and-clean-sat-data.R")
179 | source("./code/analyze-sat-data.R")
180 |
181 |
182 |
183 |
184 | Communicating results
185 |
186 |
187 | ---
188 | title: "NYC SAT Scores Analysis"
189 | author: "Tony Fischetti"
190 | date: "November 1, 2015"
191 | output: html_document
192 | ---
193 | #### Aim:
194 | To use Bayesian analysis to compare NYC's 2010
195 | combined SAT scores against the average of the
196 | rest of the country, which, according to
197 | FairTest.com, is 1509
198 | ```{r, echo=FALSE}
199 | # options
200 | options(echo=TRUE)
201 | options(stringsAsFactors=FALSE)
202 | ```
203 | We are going to use the `assertr` and `runjags`
204 | packages for data checking and MCMC, respectively.
205 | ```{r}
206 | # libraries
207 | library(assertr) # for data checking
208 | library(runjags) # for MCMC
209 | ```
210 | Let's make sure everything is all set with JAGS!
211 | ```{r}
212 | testjags()
213 | ...
214 |
215 | Great!
216 | This data was found in the NYC Open Data Portal:
217 | https://nycopendata.socrata.com
218 | ```{r}
219 | link.to.data <-
220 | "http://data.cityofnewyork.us/api/views/zt9s-n5aj/rows.csv?accessType=DOWNLOAD"
221 |
222 | download.file(link.to.data, "./data/SAT_Scores_NYC_2010.csv")
223 | nyc.sats <- read.csv("./data/SAT_Scores_NYC_2010.csv")
224 | ```
225 | Let's give the columns easier names
226 | ```{r}
227 | better.names <- c("id", "school.name", "n", "read.mean",
228 | "math.mean", "write.mean")
229 | names(nyc.sats) <- better.names
230 | ```
231 | There are `r nrow(nyc.sats)` rows but almost 700 NYC schools. We will,
232 | therefore, *assume* that this is a random sample of NYC schools.
233 | Let's first check the veracity of this data...
234 | ```{r, error=TRUE}
235 | nyc.sats <- assert(nyc.sats, is.numeric,
236 | n, read.mean, math.mean, write.mean)
237 | ```
238 | It looks like check failed because there are "s"s for some rows. (??)
239 | A look at the data set descriptions indicates that the "s" is for schools
240 | with 5 or fewer students. For our purposes, let's just exclude them.
241 | This is a function that takes a vector, replaces all "s"s
242 | with NAs and converts all non-"s"s into numerics
243 | ```{r}
244 | remove.s <- function(vec){
245 | ifelse(vec=="s", NA, vec)
246 | }
247 | nyc.sats$n <- as.numeric(remove.s(nyc.sats$n))
248 | nyc.sats$read.mean <- as.numeric(remove.s(nyc.sats$read.mean))
249 | nyc.sats$math.mean <- as.numeric(remove.s(nyc.sats$math.mean))
250 | nyc.sats$write.mean <- as.numeric(remove.s(nyc.sats$write.mean))
251 |
252 |
253 |
254 | Now we are going to remove schools with fewer than 5 test takers
255 | and calculate a combined SAT score
256 | ```{r}
257 | nyc.sats <- nyc.sats[complete.cases(nyc.sats), ]
258 | # Calculate a total combined SAT score
259 | nyc.sats$combined.mean <- (nyc.sats$read.mean +
260 | nyc.sats$math.mean +
261 | nyc.sats$write.mean)
262 | ```
263 | Let's now build a posterior distribution of the true mean of NYC high
264 | schools' combined SAT scores. We're not going to look at the summary
265 | statistics, because we don't want to bias our priors.
266 | We will use a standard gaussian model.
267 | ```{r, cache=TRUE, results="hide", warning=FALSE, message=FALSE}
268 | the.model <- "
269 | model {
270 | # priors
271 | mu ~ dunif(0, 2400)
272 | stddev ~ dunif(0, 500)
273 | tau <- pow(stddev, -2)
274 | # likelihood
275 | for(i in 1:theLength){
276 | samp[i] ~ dnorm(mu, tau)
277 | }
278 | }"
279 | the.data <- list(
280 | samp = nyc.sats$combined.mean,
281 | theLength = length(nyc.sats$combined.mean)
282 | )
283 | results <- autorun.jags(the.model, data=the.data,
284 | n.chains = 3,
285 | monitor = c('mu'))
286 | ```
287 | Let's view the results of the MCMC.
288 | ```{r}
289 | print(results)
290 | ```
291 | Now let's plot the MCMC diagnostics
292 | ```{r, message=FALSE}
293 | plot(results, plot.type=c("histogram", "trace"), layout=c(2,1))
294 | ```
295 | Looks good!
296 | Let's extract the MCMC samples of the mean, and get the
297 | bounds of the middle 95%
298 | ```{r}
299 | results.matrix <- as.matrix(results$mcmc)
300 | mu.samples <- results.matrix[,'mu']
301 | bounds <- quantile(mu.samples, c(.025, .975))
302 | ```
303 | We are 95% sure that the true mean is between
304 | `r round(bounds[1], 2)` and `r round(bounds[2], 2)`.
305 | Now let's plot the marginal posterior distribution for the mean
306 | of the NYC high schools' combined SAT grades, and draw the 95%
307 | percent credible interval.
308 | ```{r}
309 | plot(density(mu.samples),
310 | main=paste("Posterior distribution of mean combined SAT",
311 | "score in NYC high schools (2010)", sep="\n"))
312 | lines(c(bounds[1], bounds[2]), c(0, 0), lwd=3, col="red")
313 | ```
314 | Given the results, the SAT scores for NYC high schools in 2010
315 | are **incontrovertibly** not on par with the average SAT scores of
316 | the nation.
317 | ------------------------------------
318 | This is some session information for reproducibility:
319 | ```{r}
320 | devtools::session_info()
--------------------------------------------------------------------------------
/Chapter17/nyc-sat-scores.R:
--------------------------------------------------------------------------------
1 | #!/usr/bin/Rscript --vanilla
2 |
3 | ###########################################################
4 | ## ##
5 | ## nyc-sat-scores.R ##
6 | ## ##
7 | ## Author: Tony Fischetti ##
8 | ## tony.fischetti@gmail.com ##
9 | ## ##
10 | ###########################################################
11 |
12 | ##
13 | ## Aim: to use Bayesian analysis to compare NYC's 2010
14 | ## combined SAT scores against the average of the
15 | ## rest of the country, which, according to
16 | ## FairTest.com, is 1509
17 | ##
18 |
19 | # workspace cleanup
20 | rm(list=ls())
21 |
22 | # options
23 | options(echo=TRUE)
24 | options(stringsAsFactors=FALSE)
25 |
26 | # libraries
27 | library(assertr) # for data checking
28 | library(runjags) # for MCMC
29 |
30 | # make sure everything is all set with JAGS
31 | testjags()
32 | # yep!
33 |
34 |
35 | ## read data file
36 | # data was retrieved from NYC Open Data portal
37 | # direct link: https://data.cityofnewyork.us/api/views/zt9s-n5aj/rows.csv?accessType=DOWNLOAD
38 | nyc.sats <- read.csv("./data/SAT_Scores_NYC_2010.csv")
39 |
40 | # let's give the columns easier names
41 | better.names <- c("id", "school.name", "n", "read.mean",
42 | "math.mean", "write.mean")
43 | names(nyc.sats) <- better.names
44 |
45 |
46 | # there are 460 rows but almost 700 NYC schools
47 | # we will *assume*, then, that this is a random
48 | # sample of NYC schools
49 |
50 | # let's first check the veracity of this data...
51 | #nyc.sats <- assert(nyc.sats, is.numeric,
52 | # n, read.mean, math.mean, write.mean)
53 |
54 | # It looks like check failed because there are "s"s for some rows. (??)
55 | # A look at the data set descriptions indicates that the "s" is for schools
56 | # with 5 or fewer students. For our purposes, let's just exclude them.
57 |
58 |
59 | # This is a function that takes a vector, replaces all "s"s
60 | # with NAs and converts all non-"s"s into numerics
61 | remove.s <- function(vec){
62 | ifelse(vec=="s", NA, vec)
63 | }
64 |
65 | nyc.sats$n <- as.numeric(remove.s(nyc.sats$n))
66 | nyc.sats$read.mean <- as.numeric(remove.s(nyc.sats$read.mean))
67 | nyc.sats$math.mean <- as.numeric(remove.s(nyc.sats$math.mean))
68 | nyc.sats$write.mean <- as.numeric(remove.s(nyc.sats$write.mean))
69 |
70 | # Remove schools with fewer than 5 test takers
71 | nyc.sats <- nyc.sats[complete.cases(nyc.sats), ]
72 |
73 | # Calculate a total combined SAT score
74 | nyc.sats$combined.mean <- (nyc.sats$read.mean +
75 | nyc.sats$math.mean +
76 | nyc.sats$write.mean)
77 |
78 | # Let's build a posterior distribution of the true mean
79 | # of NYC high school's combined SAT scores.
80 |
81 | # We're not going to look at the summary statistics because
82 | # we don't want to bias our priors
83 |
84 | # Specify a standard gaussian model
85 | the.model <- "
86 | model {
87 | # priors
88 | mu ~ dunif(0, 2400)
89 | stddev ~ dunif(0, 500)
90 | tau <- pow(stddev, -2)
91 |
92 | # likelihood
93 | for(i in 1:theLength){
94 | samp[i] ~ dnorm(mu, tau)
95 | }
96 | }"
97 |
98 | the.data <- list(
99 | samp = nyc.sats$combined.mean,
100 | theLength = length(nyc.sats$combined.mean)
101 | )
102 |
103 | results <- autorun.jags(the.model, data=the.data,
104 | n.chains = 3,
105 | monitor = c('mu', 'stddev'))
106 |
107 | # View the results of the MCMC
108 | print(results)
109 |
110 | # Plot the MCMC diagnostics
111 | plot(results, plot.type=c("histogram", "trace"), layout=c(2,1))
112 | # Looks good!
113 |
114 | # Let's extract the MCMC samples of the mean and get the
115 | # bounds of the middle 95%
116 | results.matrix <- as.matrix(results$mcmc)
117 | mu.samples <- results.matrix[,'mu']
118 | bounds <- quantile(mu.samples, c(.025, .975))
119 |
120 | # We are 95% sure that the true mean is between 1197 and 1232
121 |
122 | # Now let's plot the marginal posterior distribution for the mean
123 | # of the NYC high schools' combined SAT grades and draw the 95%
124 | # percent credible interval.
125 | plot(density(mu.samples), main=paste("Posterior distribution of mean combined SAT",
126 | "score in NYC high schools (2010)", sep="\n"))
127 | lines(c(bounds[1], bounds[2]), c(0, 0), lwd=3, col="red")
128 |
129 |
130 | # Given the results, the SAT scores for NYC high schools in 2010
131 | # are *incontrovertibly* not on par with the average SAT scores of
132 | # the nation.
133 |
--------------------------------------------------------------------------------
/Chapter17/nyc-sat-scores.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "NYC SAT Scores Analysis"
3 | author: "Tony Fischetti"
4 | date: "November 1, 2015"
5 | output: html_document
6 | ---
7 |
8 | #### Aim:
9 | To use Bayesian analysis to compare NYC's 2010
10 | combined SAT scores against the average of the
11 | rest of the country, which, according to
12 | FairTest.com, is 1509
13 |
14 |
15 | ```{r, echo=FALSE}
16 | # options
17 | options(echo=TRUE)
18 | options(stringsAsFactors=FALSE)
19 | ```
20 |
21 | We are going to use the `assertr` and `runjags`
22 | packages for data checking and MCMC, respectively.
23 | ```{r}
24 | # libraries
25 | library(assertr) # for data checking
26 | library(runjags) # for MCMC
27 | ```
28 |
29 | Let's make sure everything is all set with JAGS!
30 | ```{r}
31 | testjags()
32 | ```
33 | Great!
34 |
35 | This data was found in the NYC Open Data Portal:
36 | https://nycopendata.socrata.com
37 | ```{r}
38 | link.to.data <- "http://data.cityofnewyork.us/api/views/zt9s-n5aj/rows.csv?accessType=DOWNLOAD"
39 | download.file(link.to.data, "./data/SAT_Scores_NYC_2010.csv")
40 |
41 | nyc.sats <- read.csv("./data/SAT_Scores_NYC_2010.csv")
42 | ```
43 |
44 | Let's give the columns easier names
45 | ```{r}
46 | better.names <- c("id", "school.name", "n", "read.mean",
47 | "math.mean", "write.mean")
48 | names(nyc.sats) <- better.names
49 | ```
50 |
51 | There are `r nrow(nyc.sats)` rows but almost 700 NYC schools. We will,
52 | therefore, *assume* that this is a random sample of NYC schools.
53 |
54 |
55 | Let's first check the veracity of this data...
56 | ```{r, error=TRUE}
57 | nyc.sats <- assert(nyc.sats, is.numeric,
58 | n, read.mean, math.mean, write.mean)
59 | ```
60 |
61 | It looks like check failed because there are "s"s for some rows. (??)
62 | A look at the data set descriptions indicates that the "s" is for schools
63 | with 5 or fewer students. For our purposes, let's just exclude them.
64 |
65 |
66 | This is a function that takes a vector, replaces all "s"s
67 | with NAs and converts all non-"s"s into numerics
68 | ```{r}
69 | remove.s <- function(vec){
70 | ifelse(vec=="s", NA, vec)
71 | }
72 |
73 | nyc.sats$n <- as.numeric(remove.s(nyc.sats$n))
74 | nyc.sats$read.mean <- as.numeric(remove.s(nyc.sats$read.mean))
75 | nyc.sats$math.mean <- as.numeric(remove.s(nyc.sats$math.mean))
76 | nyc.sats$write.mean <- as.numeric(remove.s(nyc.sats$write.mean))
77 | ```
78 |
79 | Now we are going to remove schools with fewer than 5 test takers
80 | and calculate a combined SAT score
81 | ```{r}
82 | nyc.sats <- nyc.sats[complete.cases(nyc.sats), ]
83 |
84 | # Calculate a total combined SAT score
85 | nyc.sats$combined.mean <- (nyc.sats$read.mean +
86 | nyc.sats$math.mean +
87 | nyc.sats$write.mean)
88 | ```
89 |
90 | Let's now build a posterior distribution of the true mean
91 | of NYC high school's combined SAT scores. We're not going to look
92 | at the summary statistics because we don't want to bias our priors.
93 | We will use a standard gaussian model.
94 |
95 | ```{r, cache=TRUE, results="hide", warning=FALSE, message=FALSE}
96 | the.model <- "
97 | model {
98 | # priors
99 | mu ~ dunif(0, 2400)
100 | stddev ~ dunif(0, 500)
101 | tau <- pow(stddev, -2)
102 |
103 | # likelihood
104 | for(i in 1:theLength){
105 | samp[i] ~ dnorm(mu, tau)
106 | }
107 | }"
108 |
109 | the.data <- list(
110 | samp = nyc.sats$combined.mean,
111 | theLength = length(nyc.sats$combined.mean)
112 | )
113 |
114 | results <- autorun.jags(the.model, data=the.data,
115 | n.chains = 3,
116 | monitor = c('mu'))
117 | ```
118 |
119 | Let's view the results of the MCMC.
120 | ```{r}
121 | print(results)
122 | ```
123 |
124 | Now let's plot the MCMC diagnostics
125 | ```{r, message=FALSE}
126 | plot(results, plot.type=c("histogram", "trace"), layout=c(2,1))
127 | ```
128 |
129 | Looks good!
130 |
131 |
132 | Let's extract the MCMC samples of the mean and get the
133 | bounds of the middle 95%
134 | ```{r}
135 | results.matrix <- as.matrix(results$mcmc)
136 | mu.samples <- results.matrix[,'mu']
137 | bounds <- quantile(mu.samples, c(.025, .975))
138 | ```
139 |
140 | We are 95% sure that the true mean is between `r bounds[1]` and
141 | `r bounds[2]`.
142 |
143 | Now let's plot the marginal posterior distribution for the mean
144 | of the NYC high schools' combined SAT grades and draw the 95%
145 | percent credible interval.
146 | ```{r}
147 | plot(density(mu.samples),
148 | main=paste("Posterior distribution of mean combined SAT",
149 | "score in NYC high schools (2010)", sep="\n"))
150 | lines(c(bounds[1], bounds[2]), c(0, 0), lwd=3, col="red")
151 | ```
152 |
153 | Given the results, the SAT scores for NYC high schools in 2010
154 | are **incontrovertibly** not on par with the average SAT scores of
155 | the nation.
156 |
157 | ------------------------------------
158 |
159 | This is some session information for reproducibility:
160 | ```{r}
161 | devtools::session_info()
162 | ```
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018 Packt
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | # Data Analysis with R - Second Edition
5 | This is the code repository for [Data Analysis with R - Second Edition](https://www.packtpub.com/big-data-and-business-intelligence/data-analysis-r-second-edition?utm_source=github&utm_medium=repository&utm_campaign=9781788393720), published by [Packt](https://www.packtpub.com/?utm_source=github). It contains all the supporting project files necessary to work through the book from start to finish.
6 | ## About the Book
7 | Frequently the tool of choice for academics, R has spread deep into the private sector and can be found in the production pipelines at some of the most advanced and successful enterprises. The power and domain-specificity of R allows the user to express complex analytics easily, quickly, and succinctly.
8 |
9 | Starting with the basics of R and statistical reasoning, this book dives into advanced predictive analytics, showing how to apply those techniques to real-world data through real-world examples.
10 |
11 |
12 | ## Instructions and Navigation
13 | All of the code is organized into folders. Each folder starts with a number followed by the application name. For example, Chapter02.
14 |
15 |
16 |
17 | The code will look like the following:
18 | ```
19 | # don't worry about memorizing this
20 | temp.density <- density(airquality$Temp)
21 | pdf <- approxfun(temp.density$x, temp.density$y, rule=2)
22 | integrate(pdf, 80, 90)
23 | ```
24 |
25 | All code in this book has been written against the latest version of R—3.4.3 at time of
26 | writing. As a matter of good practice, you should keep your R version up to date but most,
27 | if not all, code should work with any reasonably recent version of R. Some of the R
28 | packages we will be installing will require more recent versions though. For the other
29 | software that this book uses, instructions will be furnished pro re nata. If you want to get a
30 | head start, however, install RStudio, JAGS, and a C++ compiler (or Rtools if you use
31 | windows).
32 |
33 | ## Related Products
34 | * [Data Analysis with R](https://www.packtpub.com/big-data-and-business-intelligence/data-analysis-r?utm_source=github&utm_medium=repository&utm_campaign=9781785288142)
35 |
36 | * [Mastering Data Analysis with R](https://www.packtpub.com/big-data-and-business-intelligence/mastering-data-analysis-r?utm_source=github&utm_medium=repository&utm_campaign=9781783982028)
37 |
38 | * [Hands-On Geospatial Analysis with R and QGIS](https://www.packtpub.com/application-development/hands-geospatial-analysis-r-and-qgis?utm_source=github&utm_medium=repository&utm_campaign=9781788991674)
39 |
40 | ### Download a free PDF
41 |
42 | If you have already purchased a print or Kindle version of this book, you can get a DRM-free PDF version at no cost.
Simply click on the link to claim your free PDF.
43 | https://packt.link/free-ebook/9781788393720
--------------------------------------------------------------------------------