├── .Rhistory ├── ProgAssignment4.R ├── Quiz_1.md ├── Quiz_2.md ├── Quiz_3.md ├── Quiz_4.md ├── README.md ├── best.R ├── cachematrix.R ├── complete.R ├── corr.R ├── plot1.R ├── plot2.R ├── plot3.R ├── plot4.R ├── pollutantmean.R ├── rankall.R └── rankhospital.R /.Rhistory: -------------------------------------------------------------------------------- 1 | source("getdata-data-ss06hid.csv") 2 | file <- read.csv("getdata-data-ss06hid.csv", sep = ",") 3 | file 4 | View(file) 5 | print(length(file[file$VAL > 1000000])) 6 | print(length(file[,file$VAL > 1000000])) 7 | length(file[file$VAL > 1000000]) 8 | length(file[,file$VAL > 1000000]) 9 | length(file[file$VAL > 1000000,]) 10 | print(file[file$VAL > 1000000,]) 11 | length(file$VAL > 1000000) 12 | vals <- file$VAL 13 | vals <- vals[!is.na(vals)] 14 | test <- vals > 1000000 15 | test <- vals[vals > 1000000,] 16 | test <- vals[,vals > 1000000] 17 | test <- vals[vals > 1000000,] 18 | vals > 1000000 19 | vals[vals > 1000000,] 20 | vals 21 | sort(vals) 22 | test <- vals[vals == 24,] 23 | vals[vals == 24] 24 | test <- vals[vals == 24] 25 | length(test) 26 | source("getdata-data-DATA.gov_NGAP.xlsx") 27 | file <- read.table("getdata-data-DATA.gov_NGAP.xlsx") 28 | file <- read.table("getdata-data-DATA.gov_NGAP.xlsx", sep = ",") 29 | library(xlsx) 30 | install.packages("xlsx") 31 | library(xlsx) 32 | file <- read.xlsx("getdata-data-DATA.gov_NGAP.xlsx", header = TRUE) 33 | file <- read.xlsx("getdata-data-DATA.gov_NGAP.xlsx", header = TRUE, sheetIndex = 1) 34 | dat <- read.xlsx("getdata-data-DATA.gov_NGAP.xlsx", header = TRUE, sheetIndex = 1, rowIndex = 18:23, colIndex = 7:15) 35 | sum(dat$Zip*dat$Ext,na.rm=T) 36 | install.packages(XML) 37 | install.packages("XML") 38 | library(XML) 39 | file <- xmlTreeParse("https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2Frestaurants.xml", useInternal = TRUE) 40 | fileURL <- https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2Frestaurants.xml 41 | fileURL <- 
https:/d396qusza40orc.cloudfront.net/getdata%2Fdata%2Frestaurants.xml 42 | fileURL <- "https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2Frestaurants.xml" 43 | doc <- xmlTreeParse(fileURL, useInternal = TRUE) 44 | doc <- xmlTreeParse(fileURL, useInternal = TRUE) 45 | doc = xmlTreeParse(fileURL, useInternal = TRUE) 46 | z <- sum(x ^ 2 * p) - sum(x * p) ^ 2 47 | x <- 1:4 48 | p <- x/sum(x) 49 | temp <- rbind(x, p) 50 | rownames(temp) <- c("X", "Prob") 51 | temp 52 | mean(X) 53 | mean(x) 54 | round((0.75*0.3)/( (0.75*0.3) + ((1-0.52)*(1-0.3)) )*100) 55 | a <- c(140, 138, 150, 148, 135) 56 | b <- c(132, 135, 151, 146, 130) 57 | t.test(a, b, alternative = "two.sided", paired = T) 58 | round(pbinom(2,size=4,prob=0.5,lower.tail=FALSE),2) 59 | pbinom (3,size=4,prob = 0.5) 60 | n1 <- n2 <- 9 61 | x1 <- -3 ##treated 62 | x2 <- 1 ##placebo 63 | s1 <- 1.5 ##treated 64 | s2 <- 1.8 ##placebo 65 | spsq <- ( (n1 - 1) * s1^2 + (n2 - 1) * s2^2) / (n1 + n2 - 2) 66 | t=(x1-x2)/(spsq*sqrt(1/n1 + 1/n2)) 67 | 2*pt(t,n1+n2-2) 68 | (qnorm(.95) + qnorm(.9))^2 * .04^2 / .01^2 69 | round( (qnorm(0.95) + qnorm(0.90))^2 * 0.04^2 / (0.01^2) ) 70 | smm=sqrt(12^2/288 + 12^2/288) 71 | 12^2/288 72 | x=44 73 | y=42.04 74 | sx=12 75 | sy=12 76 | n1=288 77 | n2=288 78 | SDpooled <- sqrt( ((n1 - 1) * sx^2 + (n2-1) * sy^2) / (n1 + n2 -2)) 79 | SE_est= sqrt(SDpooled^2/n1 + SDpooled^2/n2) 80 | 2*pnorm((44-42.04),lower.tail=F) 81 | myPlot <- function(beta){ 82 | y <- galton$child - mean(galton$child) 83 | x <- galton$parent - mean(galton$parent) 84 | freqData <- as.data.frame(table(x, y)) 85 | names(freqData) <- c("child", "parent", "freq") 86 | plot( 87 | as.numeric(as.vector(freqData$parent)), 88 | as.numeric(as.vector(freqData$child)), 89 | pch = 21, col = "black", bg = "lightblue", 90 | cex = .15 * freqData$freq, 91 | xlab = "parent", 92 | ylab = "child" 93 | ) 94 | abline(0, beta, lwd = 3) 95 | points(0, 0, cex = 2, pch = 19) 96 | mse <- mean( (y - beta * x)^2 ) 97 | title(paste("beta = ", beta, 
"mse = ", round(mse, 3))) 98 | } 99 | manipulate(myPlot(beta), beta = slider(0.6, 1.2, step = 0.02)) 100 | setwd("~/Documents/Coursera/Data Science Specialization/R Programming") 101 | x <- c(1, 3, 5) 102 | y <- c(3, 2, 10) 103 | rbind(x, y) 104 | x <- list(2, "a", "b", TRUE) 105 | x[[2]] 106 | x <- 1:4 107 | y <- 2:3 108 | x + y 109 | x <- c(3, 5, 1, 10, 12, 6) 110 | x[x < 6] <- 0 111 | x[x < 6] 112 | x 113 | x <- c(3, 5, 1, 10, 12, 6) 114 | x[x < 6] <- 0 115 | x 116 | cube <- function(x, n) { 117 | x^3 118 | } 119 | cube(3) 120 | x <- 1:10 121 | if(x > 5) { 122 | x <- 0 123 | } 124 | f <- function(x) { 125 | g <- function(y) { 126 | y + z 127 | } 128 | z <- 4 129 | x + g(x) 130 | } 131 | z <- 10 132 | f(3) 133 | x <- 5 134 | y <- if(x < 3) { 135 | NA 136 | } else { 137 | 10 138 | } 139 | y 140 | library(datasets) 141 | data(iris) 142 | ?iris 143 | tapply(iris$Sepal.Length, mean) 144 | tapply(iris$Sepal.Length, iris, mean) 145 | tapply(iris$Sepal.Length, iris$virginica, mean) 146 | tapply(iris[Species == virginica], iris$Special.Length, mean) 147 | tapply(iris[Species == virginica,], iris$Special.Length, mean) 148 | tapply(iris[,Species == virginica], iris$Special.Length, mean) 149 | library(datasets) 150 | data(iris) 151 | ?iris 152 | tapply(iris$Sepal.Length, iris[Species == "virginica",], mean) 153 | tapply(iris$Sepal.Length, iris[iris$Species == "virginica",], mean) 154 | iris$Species 155 | tapply(iris$Sepal.Length[,iris$Species == "virginica"], mean) 156 | tapply(iris$Sepal.Length[iris$Species == "virginica",], mean) 157 | test <- iris[iris$Species == "virginica",] 158 | test 159 | test <- iris[iris$Species == "virginica",] 160 | library(datasets) 161 | data(iris) 162 | ?iris 163 | virginica <- iris[iris$Species == "virginica",] 164 | tapply(Sepal.Length, virginica, mean) 165 | tapply(iris$Sepal.Length, virginica, mean) 166 | tapply(iris$Sepal.Length, virginica$Sepal.Length, mean) 167 | tapply(virginica$Sepal.Length, virginica, mean) 168 | 
tapply(virginica$Sepal.Length, virginica[!is.na(virginica$Sepal.Length)], mean) 169 | tapply(virginica$Sepal.Length, virginica[!is.na(virginica$Sepal.Length),], mean) 170 | tapply(virginica$Sepal.Length, virginica[,!is.na(virginica$Sepal.Length)], mean) 171 | tapply(iris$Sepal.Length, iris, mean) 172 | tapply(iris$Sepal.Length, iris[iris$Species == "virginica" & !is.na(iris$Sepal.Length)], mean) 173 | tapply(iris$Sepal.Length, iris[iris$Species == "virginica" & !is.na(iris$Sepal.Length),], mean) 174 | -------------------------------------------------------------------------------- /ProgAssignment4.R: -------------------------------------------------------------------------------- 1 | best <- function(state, outcome) { 2 | ## Reads outcome data 3 | cat(outcome) 4 | cat(state) 5 | ## Check that state and outcome are valid 6 | ## Return hospital name in that state with lowest 30-day death 7 | ## rate 8 | } -------------------------------------------------------------------------------- /Quiz_1.md: -------------------------------------------------------------------------------- 1 | Quiz 1 2 | ====== 3 | 4 | |Attempts|Score| 5 | |--------|-----| 6 | | 1/3|20/20| 7 | 8 | 9 | Question 1 10 | ---------- 11 | The R language is a dialect of which of the following programming languages? 12 | 13 | ### Answer 14 | S 15 | 16 | ### Explanation 17 | R is a dialect of the S language which was developed at Bell Labs. 18 | 19 | 20 | Question 2 21 | ---------- 22 | The definition of free software consists of four freedoms (freedoms 0 through 3). Which of the following is NOT one of the freedoms that are part of the definition? 23 | 24 | ### Answer 25 | The freedom to sell the software for any price. 26 | 27 | ### Explanation 28 | This is not part of the free software definition. The free software definition does not mention anything about selling software (although it does not disallow it). 
29 | 30 | 31 | Question 3 32 | ---------- 33 | In R the following are all atomic data types EXCEPT 34 | 35 | ### Answer 36 | matrix 37 | 38 | ### Explanation 39 | 'matrix' is not an atomic data type in R. 40 | 41 | 42 | Question 4 43 | ---------- 44 | If I execute the expression x <- 4 in R, what is the class of the object 'x' as determined by the 'class()' function? 45 | 46 | ### Answer 47 | numeric 48 | 49 | ### Explanation 50 | 51 | > x <- 4 52 | > class(x) 53 | [1] "numeric" 54 | 55 | 56 | Question 5 57 | ---------- 58 | What is the class of the object defined by x <- c(4, TRUE)? 59 | 60 | ### Answer 61 | numeric 62 | 63 | ### Explanation 64 | The numeric class is the "lowest common denominator" here and so all elements will be coerced into that class. 65 | 66 | R does automatic coercion of vectors so that all elements of the vector are the same data class. 67 | 68 | > x <- c(4, TRUE) 69 | > class(x) 70 | [1] "numeric" 71 | 72 | 73 | Question 6 74 | ---------- 75 | If I have two vectors x <- c(1,3, 5) and y <- c(3, 2, 10), what is produced by the expression rbind(x, y)? 76 | 77 | ### Answer 78 | a 2 by 3 numeric matrix 79 | 80 | ### Explanation 81 | The 'rbind' function treats vectors as if they were rows of a matrix. It then takes those vectors and binds them together row-wise to create a matrix. 82 | 83 | > x <- c(1, 3, 5) 84 | > y <- c(3, 2, 10) 85 | > rbind(x, y) 86 | [,1] [,2] [,3] 87 | x 1 3 5 88 | y 3 2 10 89 | 90 | 91 | Question 7 92 | ---------- 93 | A key property of vectors in R is that 94 | 95 | ### Answer 96 | elements of a vector all must be of the same class 97 | 98 | 99 | Question 8 100 | ---------- 101 | Suppose I have a list defined as x <- list(2, "a", "b", TRUE). What does x[[2]] give me? 102 | 103 | ### Answer 104 | a character vector of length 1. 105 | 106 | ### Explanation 107 | 108 | > x <- list(2, "a", "b", TRUE) 109 | > x[[2]] 110 | [1] "a" 111 | 112 | Question 9 113 | ---------- 114 | Suppose I have a vector x <- 1:4 and y <- 2:3. 
What is produced by the expression x + y? 115 | 116 | ### Answer 117 | an integer vector with the values 3, 5, 5, 7. 118 | 119 | ### Explanation 120 | 121 | > x <- 1:4 122 | > y <- 2:3 123 | > x + y 124 | [1] 3 5 5 7 125 | 126 | 127 | Question 10 128 | ----------- 129 | Suppose I have a vector x <- c(3, 5, 1, 10, 12, 6) and I want to set all elements of this vector that are less than 6 to be equal to zero. What R code achieves this? 130 | 131 | ### Answer 132 | x[x < 6] <- 0 133 | 134 | ### Explanation 135 | You can create a logical vector with the expression x < 6 and then use the [ operator to subset the original vector x. 136 | 137 | > x <- c(3, 5, 1, 10, 12, 6) 138 | > x[x < 6] <- 0 139 | > x 140 | [1] 0 0 0 10 12 6 141 | 142 | 143 | Question 11 144 | ----------- 145 | In the dataset provided for this Quiz, what are the column names of the dataset? 146 | 147 | ### Answer 148 | Ozone, Solar.R, Wind, Temp, Month, Day 149 | 150 | ### Explanation 151 | You can get the column names of a data frame with the 'names()' function. 152 | 153 | > hw1 = read.csv('hw1_data.csv') 154 | > names(hw1) 155 | [1] "Ozone" "Solar.R" "Wind" "Temp" "Month" "Day" 156 | 157 | 158 | Question 12 159 | ----------- 160 | Extract the first 2 rows of the data frame and print them to the console. What does the output look like? 161 | 162 | ### Answer 163 | Ozone Solar.R Wind Temp Month Day 164 | 1 41 190 7.4 67 5 1 165 | 2 36 118 8.0 72 5 2 166 | 167 | ### Explanation 168 | You can extract the first two rows using the [ operator and an integer sequence to index the rows. 169 | 170 | > hw1 = read.csv('hw1_data.csv') 171 | > hw1[c(1,2),] 172 | Ozone Solar.R Wind Temp Month Day 173 | 1 41 190 7.4 67 5 1 174 | 2 36 118 8.0 72 5 2 175 | 176 | 177 | Question 13 178 | ----------- 179 | How many observations (i.e. rows) are in this data frame? 180 | 181 | ### Answer 182 | 153 183 | 184 | ### Explanation 185 | You can use the 'nrow()' function to compute the number of rows in a data frame.
186 | 187 | > hw1 = read.csv('hw1_data.csv') 188 | > nrow(hw1) 189 | [1] 153 190 | 191 | 192 | Question 14 193 | ----------- 194 | Extract the last 2 rows of the data frame and print them to the console. What does the output look like? 195 | 196 | ### Answer 197 | 198 | Ozone Solar.R Wind Temp Month Day 199 | 152 18 131 8.0 76 9 29 200 | 153 20 223 11.5 68 9 30 201 | 202 | ### Explanation 203 | The 'tail()' function is an easy way to extract the last few elements of an R object. 204 | 205 | > hw1 = read.csv('hw1_data.csv') 206 | > tail(hw1,2) 207 | Ozone Solar.R Wind Temp Month Day 208 | 152 18 131 8.0 76 9 29 209 | 153 20 223 11.5 68 9 30 210 | 211 | 212 | Question 15 213 | ----------- 214 | What is the value of Ozone in the 47th row? 215 | 216 | ### Answer 217 | 21 218 | 219 | ### Explanation 220 | The single bracket [ operator can be used to extract individual rows of a data frame. 221 | 222 | > hw1 = read.csv('hw1_data.csv') 223 | > hw1[47,] 224 | Ozone Solar.R Wind Temp Month Day 225 | 47 21 191 14.9 77 6 16 226 | 227 | 228 | Question 16 229 | ----------- 230 | How many missing values are in the Ozone column of this data frame? 231 | 232 | ### Answer 233 | 37 234 | 235 | ### Explanation 236 | The 'is.na' function can be used to test for missing values. 237 | 238 | > hw1 = read.csv('hw1_data.csv') 239 | > sub = subset(hw1, is.na(Ozone)) 240 | > nrow(sub) 241 | [1] 37 242 | 243 | 244 | Question 17 245 | ----------- 246 | What is the mean of the Ozone column in this dataset? Exclude missing values (coded as NA) from this calculation. 247 | 248 | ### Answer 249 | 42.1 250 | 251 | ### Explanation 252 | The 'mean' function can be used to calculate the mean.
253 | 254 | > hw1 = read.csv('hw1_data.csv') 255 | > sub = subset(hw1, !is.na(Ozone), select = Ozone) 256 | > apply(sub, 2, mean) 257 | Ozone 258 | 42.12931 259 | 260 | 261 | Question 18 262 | ----------- 263 | Extract the subset of rows of the data frame where Ozone values are above 31 and Temp values are above 90. What is the mean of Solar.R in this subset? 264 | 265 | ### Answer 266 | 212.8 267 | 268 | ### Explanation 269 | You need to construct a logical vector in R to match the question's requirements. Then use that logical vector to subset the data frame. 270 | 271 | > hw1 = read.csv('hw1_data.csv') 272 | > sub = subset(hw1, Ozone > 31 & Temp > 90, select = Solar.R) 273 | > apply(sub, 2, mean) 274 | Solar.R 275 | 212.8 276 | 277 | 278 | Question 19 279 | ----------- 280 | What is the mean of "Temp" when "Month" is equal to 6? 281 | 282 | ### Answer 283 | 79.1 284 | 285 | ### Explanation 286 | 287 | > hw1 = read.csv('hw1_data.csv') 288 | > sub = subset(hw1, Month == 6, select = Temp) 289 | > apply(sub, 2, mean) 290 | Temp 291 | 79.1 292 | 293 | 294 | Question 20 295 | ----------- 296 | What was the maximum ozone value in the month of May (i.e. Month = 5)? 297 | 298 | ### Answer 299 | 115 300 | 301 | ### Explanation 302 | 303 | > hw1 = read.csv('hw1_data.csv') 304 | > sub = subset(hw1, Month == 5 & !is.na(Ozone), select = Ozone) 305 | > apply(sub, 2, max) 306 | Ozone 307 | 115 308 | -------------------------------------------------------------------------------- /Quiz_2.md: -------------------------------------------------------------------------------- 1 | Quiz 2 2 | ====== 3 | 4 | |Attempts|Score| 5 | |--------|-----| 6 | | 2/3|10/10| 7 | 8 | 9 | Question 1 10 | ---------- 11 | Suppose I define the following function in R 12 | 13 | cube <- function(x, n) { 14 | x^3 15 | } 16 | 17 | What is the result of running 18 | 19 | cube(3) 20 | 21 | in R after defining this function?
22 | 23 | ### Answer 24 | The number 27 is returned 25 | 26 | ### Explanation 27 | Because 'n' is not evaluated, it is not needed even though it is a formal argument. 28 | 29 | > cube <- function(x, n) { 30 | + x^3 31 | + } 32 | > cube(3) 33 | [1] 27 34 | 35 | 36 | Question 2 37 | ---------- 38 | The following code will produce a warning in R. 39 | 40 | x <- 1:10 41 | if(x > 5) { 42 | x <- 0 43 | } 44 | 45 | Why? 46 | 47 | ### Answer 48 | 'x' is a vector of length 10 and 'if' can only test a single logical statement. 49 | 50 | ### Explanation 51 | 52 | > if(x > 5) { 53 | + x <- 0 54 | + } 55 | Warning message: 56 | In if (x > 5) { : 57 | the condition has length > 1 and only the first element will be used 58 | 59 | Question 3 60 | ---------- 61 | Consider the following function 62 | 63 | f <- function(x) { 64 | g <- function(y) { 65 | y + z 66 | } 67 | z <- 4 68 | x + g(x) 69 | } 70 | 71 | If I then run in R 72 | 73 | z <- 10 74 | f(3) 75 | 76 | What value is returned? 77 | 78 | ### Answer 79 | 10 80 | 81 | ### Explanation 82 | 83 | > f <- function(x) { 84 | + g <- function(y) { 85 | + y + z 86 | + } 87 | + z <- 4 88 | + x + g(x) 89 | + } 90 | > z <- 10 91 | > f(3) 92 | [1] 10 93 | 94 | 95 | Question 4 96 | ---------- 97 | Consider the following expression: 98 | 99 | x <- 5 100 | y <- if(x < 3) { 101 | NA 102 | } else { 103 | 10 104 | } 105 | 106 | What is the value of 'y' after evaluating this expression? 107 | 108 | ### Answer 109 | 10 110 | 111 | ### Explanation 112 | 113 | > x <- 5 114 | > y <- if(x < 3) { 115 | + NA 116 | + } else { 117 | + 10 118 | + } 119 | > y 120 | [1] 10 121 | 122 | 123 | Question 5 124 | ---------- 125 | Consider the following R function 126 | 127 | h <- function(x, y = NULL, d = 3L) { 128 | z <- cbind(x, d) 129 | if(!is.null(y)) 130 | z <- z + y 131 | else 132 | z <- z + f 133 | g <- x + y / z 134 | if(d == 3L) 135 | return(g) 136 | g <- g + 10 137 | g 138 | } 139 | 140 | Which symbol in the above function is a free variable? 
141 | 142 | ### Answer 143 | f 144 | 145 | ### Explanation 146 | A free variable is a variable that is not defined in the function nor an argument of the function. 147 | 148 | 149 | Question 6 150 | ---------- 151 | What is an environment in R? 152 | 153 | ### Answer 154 | a collection of symbol/value pairs 155 | 156 | 157 | Question 7 158 | ---------- 159 | The R language uses what type of scoping rule for resolving free variables? 160 | 161 | ### Answer 162 | lexical scoping 163 | 164 | 165 | Question 8 166 | ---------- 167 | How are free variables in R functions resolved? 168 | 169 | ### Answer 170 | The values of free variables are searched for in the environment in which the function was defined 171 | 172 | 173 | Question 9 174 | ---------- 175 | What is one of the consequences of the scoping rules used in R? 176 | 177 | ### Answer 178 | All objects must be stored in memory 179 | 180 | 181 | Question 10 182 | ----------- 183 | In R, what is the parent frame? 184 | 185 | ### Answer 186 | It is the environment in which a function was called 187 | -------------------------------------------------------------------------------- /Quiz_3.md: -------------------------------------------------------------------------------- 1 | Quiz 3 2 | ====== 3 | 4 | |Attempts|Score| 5 | |--------|-----| 6 | | 1/3| 5/5| 7 | 8 | 9 | Question 1 10 | ---------- 11 | Take a look at the 'iris' dataset that comes with R. The data can be loaded with the code: 12 | 13 | library(datasets) 14 | data(iris) 15 | 16 | A description of the dataset can be found by running 17 | 18 | ?iris 19 | 20 | There will be an object called 'iris' in your workspace. In this dataset, what is the mean of 'Sepal.Length' for the species virginica? (Please only enter the numeric result and nothing else.) 
21 | 22 | ### Answer 23 | 6.588 24 | 25 | ### Explanation 26 | 27 | > library(datasets) 28 | > data(iris) 29 | > ?iris 30 | > mean(iris[iris$Species == "virginica",]$Sepal.Length) 31 | [1] 6.588 32 | 33 | 34 | Question 2 35 | ---------- 36 | Continuing with the 'iris' dataset from the previous Question, what R code returns a vector of the means of the variables 'Sepal.Length', 'Sepal.Width', 'Petal.Length', and 'Petal.Width'? 37 | 38 | ### Answer 39 | apply(iris[, 1:4], 2, mean) 40 | 41 | ### Explanation 42 | 43 | > library(datasets) 44 | > data(iris) 45 | > apply(iris[, 1:4], 2, mean) 46 | Sepal.Length Sepal.Width Petal.Length Petal.Width 47 | 5.843333 3.057333 3.758000 1.199333 48 | 49 | Question 3 50 | ---------- 51 | Load the 'mtcars' dataset in R with the following code 52 | 53 | library(datasets) 54 | data(mtcars) 55 | 56 | There will be an object named 'mtcars' in your workspace. You can find some information about the dataset by running 57 | 58 | ?mtcars 59 | 60 | How can one calculate the average miles per gallon (mpg) by number of cylinders in the car (cyl)? 61 | 62 | ### Answer 63 | sapply(split(mtcars$mpg, mtcars$cyl), mean) 64 | 65 | ### Explanation 66 | 67 | > library(datasets) 68 | > data(mtcars) 69 | > ?mtcars 70 | > sapply(split(mtcars$mpg, mtcars$cyl), mean) 71 | 4 6 8 72 | 26.66364 19.74286 15.10000 73 | 74 | 75 | Question 4 76 | ---------- 77 | Continuing with the 'mtcars' dataset from the previous Question, what is the absolute difference between the average horsepower of 4-cylinder cars and the average horsepower of 8-cylinder cars? 78 | 79 | ### Answer 80 | 126.5779 81 | 82 | ### Explanation 83 | 84 | > library(datasets) 85 | > data(mtcars) 86 | > mean(mtcars[mtcars$cyl == "8",]$hp) - mean(mtcars[mtcars$cyl == "4",]$hp) 87 | [1] 126.5779 88 | 89 | 90 | Question 5 91 | ---------- 92 | If you run 93 | 94 | debug(ls) 95 | 96 | what happens when you next call the 'ls' function?
97 | 98 | ### Answer 99 | Execution of 'ls' will suspend at the beginning of the function and you will be in the browser. 100 | -------------------------------------------------------------------------------- /Quiz_4.md: -------------------------------------------------------------------------------- 1 | Quiz 4 2 | ====== 3 | 4 | |Attempts|Score| 5 | |--------|-----| 6 | | 1/3|10/10| 7 | 8 | 9 | Question 1 10 | ---------- 11 | What is produced at the end of this snippet of R code? 12 | 13 | set.seed(1) 14 | rpois(5, 2) 15 | 16 | ### Answer 17 | A vector with the numbers 1, 1, 2, 4, 1 18 | 19 | ### Explanation 20 | Because the 'set.seed()' function is used, 'rpois()' will always output the same vector in this code. 21 | 22 | > set.seed(1) 23 | > rpois(5, 2) 24 | [1] 1 1 2 4 1 25 | 26 | 27 | Question 2 28 | ---------- 29 | What R function can be used to generate standard Normal random variables? 30 | 31 | ### Answer 32 | rnorm 33 | 34 | ### Explanation 35 | Functions beginning with the 'r' prefix are used to simulate random variates. 36 | 37 | Standard probability distributions in R have a set of four functions that can be used to simulate variates, evaluate the density, evaluate the cumulative density, and evaluate the quantile function. 38 | 39 | 40 | Question 3 41 | ---------- 42 | When simulating data, why is using the set.seed() function important? 43 | 44 | ### Answer 45 | It ensures that the sequence of random numbers starts in a specific place and is therefore reproducible. 46 | 47 | 48 | Question 4 49 | ---------- 50 | Which function can be used to evaluate the inverse cumulative distribution function for the Poisson distribution? 51 | 52 | ### Answer 53 | qpois 54 | 55 | ### Explanation 56 | Probability distribution functions beginning with the 'q' prefix are used to evaluate the quantile function. 57 | 58 | 59 | Question 5 60 | ---------- 61 | What does the following code do? 
62 | 63 | set.seed(10) 64 | x <- rbinom(10, 10, 0.5) 65 | e <- rnorm(10, 0, 20) 66 | y <- 0.5 + 2 * x + e 67 | 68 | ### Answer 69 | Generate data from a Normal linear model 70 | 71 | 72 | Question 6 73 | ---------- 74 | What R function can be used to generate Binomial random variables? 75 | 76 | ### Answer 77 | rbinom 78 | 79 | 80 | Question 7 81 | ---------- 82 | What aspect of the R runtime does the profiler keep track of when an R expression is evaluated? 83 | 84 | ### Answer 85 | the function call stack 86 | 87 | 88 | Question 8 89 | ---------- 90 | Consider the following R code 91 | 92 | library(datasets) 93 | Rprof() 94 | fit <- lm(y ~ x1 + x2) 95 | Rprof(NULL) 96 | 97 | (Assume that y, x1, and x2 are present in the workspace.) Without running the code, what percentage of the run time is spent in the 'lm' function, based on the 'by.total' method of normalization shown in 'summaryRprof()'? 98 | 99 | ### Answer 100 | 100% 101 | 102 | ### Explanation 103 | When using 'by.total' normalization, the top-level function (in this case, 'lm()') always takes 100% of the time. 104 | 105 | 106 | Question 9 107 | ---------- 108 | When using 'system.time()', what is the user time? 109 | 110 | ### Answer 111 | It is the time spent by the CPU evaluating an expression 112 | 113 | 114 | Question 10 115 | ----------- 116 | If a computer has more than one available processor and R is able to take advantage of that, then which of the following is true when using 'system.time()'? 117 | 118 | ### Answer 119 | elapsed time may be smaller than user time 120 | 121 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | R Programming 2 | ============= 3 | 4 | This is a repository for any and all code written for the R Programming Coursera course through Johns Hopkins University.
## Finds the hospital with the lowest 30-day mortality rate for one outcome
## within one state.
##
## Args:
##   state:   state abbreviation; must occur in the State column of the
##            data file (validated against the data, not a fixed list).
##   outcome: one of "heart attack", "heart failure" or "pneumonia".
##   file:    path to the outcome-of-care CSV file; defaults to the file
##            name in the working directory that was previously hard-coded.
##
## Returns the hospital name as a length-one character vector. Ties on the
## lowest rate are broken by taking the alphabetically first name. Returns
## NA_character_ when no hospital in the state reports a rate.
##
## Stops with "invalid state" or "invalid outcome" on bad arguments.
best <- function(state, outcome, file = "outcome-of-care-measures.csv") {

  ## Maps each supported outcome to its (read.csv-mangled) column name
  rate_columns <- c(
    "heart attack" = "Hospital.30.Day.Death..Mortality..Rates.from.Heart.Attack",
    "heart failure" = "Hospital.30.Day.Death..Mortality..Rates.from.Heart.Failure",
    "pneumonia" = "Hospital.30.Day.Death..Mortality..Rates.from.Pneumonia"
  )

  ## Reads outcome data as text so names and rates keep their exact spelling
  care <- read.csv(file, colClasses = "character")

  ## Checks that state and outcome are valid
  if (length(state) != 1L || is.na(state) || !(state %in% care$State)) {
    stop("invalid state")
  }
  if (length(outcome) != 1L || is.na(outcome) ||
      !(outcome %in% names(rate_columns))) {
    stop("invalid outcome")
  }

  ## which() drops rows whose State is missing instead of yielding NA rows
  in_state <- care[which(care$State == state), , drop = FALSE]

  ## Converts rates to numbers; "Not Available" becomes NA
  raw_rates <- in_state[[rate_columns[[outcome]]]]
  raw_rates[raw_rates %in% "Not Available"] <- NA
  rates <- as.numeric(raw_rates)
  if (all(is.na(rates))) {
    return(NA_character_)
  }

  ## Compares numbers with numbers: comparing the text column with the
  ## numeric minimum fails for values such as "9.0" (9 is rendered as "9")
  lowest <- which(rates == min(rates, na.rm = TRUE))
  sort(in_state$Hospital.Name[lowest])[1]
}
## The following is a pair of functions that cache and compute the
## inverse of a matrix.

## This function creates a special "matrix" object
## that can cache its inverse.
##
## Args:
##   mtx: the matrix to wrap (defaults to an empty 1x1 matrix).
##
## Returns a list of four closures sharing `mtx` and the cached inverse:
##   set(x)      replaces the matrix and clears the cached inverse
##   get()       returns the matrix
##   setinv(inv) stores an inverse in the cache
##   getinv()    returns the cached inverse, or NULL when none is stored
makeCacheMatrix <- function(mtx = matrix()) {
  inverse <- NULL
  set <- function(x) {
    ## `<<-` updates the variables of the enclosing makeCacheMatrix call,
    ## which is where the other closures read them from
    mtx <<- x
    inverse <<- NULL
  }
  get <- function() mtx
  setinv <- function(inv) inverse <<- inv
  getinv <- function() inverse
  list(set = set, get = get, setinv = setinv, getinv = getinv)
}

## This function computes the inverse of the special
## "matrix" returned by `makeCacheMatrix` above. If the inverse has
## already been calculated (and the matrix has not changed), then
## `cacheSolve` retrieves the inverse from the cache.
##
## Args:
##   mtx: an object created by `makeCacheMatrix`.
##   ...: further arguments forwarded to `solve()`.
##
## Returns the inverse of the wrapped matrix.
cacheSolve <- function(mtx, ...) {
  inverse <- mtx$getinv()
  if (!is.null(inverse)) {
    message("Getting cached data...")
    return(inverse)
  }
  ## Store the freshly computed inverse under the same name that is cached
  ## and returned (a misspelled variable here would cache and return NULL)
  inverse <- solve(mtx$get(), ...)
  mtx$setinv(inverse)
  inverse
}
## Counts the completely observed rows in each requested monitor file.
##
## 'directory' is a character vector of length 1 indicating
## the location of the CSV files
##
## 'id' is an integer vector indicating the monitor ID numbers
## to be used. An ID selects the file at that position in the sorted
## "*.csv" listing of 'directory'.
## NOTE(review): presumably the files are named so that alphabetical
## position equals monitor ID (e.g. zero-padded) - verify against the data.
##
## Returns a data frame of the form:
## id nobs
## 1  117
## 2  1041
## ...
## where 'id' is the monitor ID number and 'nobs' is the
## number of complete cases. An empty 'id' gives a zero-row data frame.
## Stops when an ID does not correspond to a file in 'directory'.
complete <- function(directory, id = 1:332) {
  files <- Sys.glob(file.path(directory, "*.csv"))

  if (anyNA(id) || any(id < 1 | id > length(files))) {
    stop("'id' must refer to existing monitor files in 'directory'")
  }

  ## One count per file; vapply guarantees an integer vector even for
  ## empty input and avoids growing a vector inside a loop
  nobs <- vapply(
    files[id],
    function(path) sum(complete.cases(read.csv(path))),
    integer(1),
    USE.NAMES = FALSE
  )

  data.frame(id = id, nobs = nobs)
}

## Computes the sulfate/nitrate correlation for every monitor file that has
## enough completely observed rows.
##
## 'directory' is a character vector of length 1 indicating
## the location of the CSV files
##
## 'threshold' is a numeric vector of length 1 indicating the
## number of completely observed observations (on all
## variables) required to compute the correlation between
## nitrate and sulfate; the default is 0. A file is used only when its
## count is strictly greater than 'threshold'.
##
## Returns a numeric vector of correlations, in sorted file order; it has
## length 0 when no file meets the threshold.
corr <- function(directory, threshold = 0) {
  files <- Sys.glob(file.path(directory, "*.csv"))

  per_file <- lapply(files, function(path) {
    readings <- read.csv(path)
    observed <- readings[complete.cases(readings), , drop = FALSE]
    if (nrow(observed) > threshold) {
      cor(observed$sulfate, observed$nitrate)
    } else {
      NULL  # dropped by unlist() below
    }
  })

  ## as.numeric() turns the NULL from an all-skipped run into numeric(0)
  as.numeric(unlist(per_file))
}
## plot2.R: line chart of global active power for 1-2 February 2007,
## copied to plot2.png.

## Loads the complete data set; "?" marks a missing reading
base_data <- read.table(
  "household_power_consumption.txt",
  header = TRUE,
  sep = ";",
  quote = "\"",
  na.strings = "?",
  comment.char = "",
  check.names = FALSE,
  stringsAsFactors = FALSE
)
base_data$Date <- as.Date(base_data$Date, format = "%d/%m/%Y")

## Keeps only the two days of interest, then releases the full table
## (which() drops rows whose date is missing, as subset() does)
data <- base_data[
  which(base_data$Date >= "2007-02-01" & base_data$Date <= "2007-02-02"),
]
rm(base_data)

## Builds one timestamp column from the date and the time-of-day text
date_time <- paste(data$Date, data$Time)
data[["Datetime"]] <- as.POSIXct(date_time)

## Plot 2
with(
  data,
  plot(
    Global_active_power ~ Datetime,
    type = "l",
    xlab = "",
    ylab = "Global Active Power (kilowatts)"
  )
)

## Copies the current plot to a 480x480 PNG file and closes that device
dev.copy(png, file = "plot2.png", width = 480, height = 480)
dev.off()
paste(as.Date(data$Date), data$Time) 11 | data$Datetime <- as.POSIXct(date_time) 12 | 13 | ## Plot 3 14 | with(data, { 15 | plot(Sub_metering_1~Datetime, type = "l", 16 | ylab = "Global Active Power (kilowatts)", xlab = "") 17 | lines(Sub_metering_2~Datetime, col = 'Red') 18 | lines(Sub_metering_3~Datetime, col = 'Blue') 19 | }) 20 | legend("topright", col=c("black", "red", "blue"), lty = 1, lwd = 2, 21 | legend = c("Sub_metering_1", "Sub_metering_2", "Sub_metering_3")) 22 | 23 | ## Saves data to file 24 | dev.copy(png, file = "plot3.png", height = 480, width = 480) 25 | dev.off() -------------------------------------------------------------------------------- /plot4.R: -------------------------------------------------------------------------------- 1 | ## Fetches full dataset 2 | base_data <- read.table("household_power_consumption.txt", header = TRUE, sep = ';', na.strings = "?", check.names = FALSE, stringsAsFactors = FALSE, comment.char="", quote='\"') 3 | base_data$Date <- as.Date(base_data$Date, format="%d/%m/%Y") 4 | 5 | ## Subsets the data 6 | data <- subset(base_data, subset = (Date >= "2007-02-01" & Date <= "2007-02-02")) 7 | rm(base_data) 8 | 9 | ## Converts dates 10 | date_time <- paste(as.Date(data$Date), data$Time) 11 | data$Datetime <- as.POSIXct(date_time) 12 | 13 | ## Plot 4 14 | par(mfrow = c(2, 2), mar = c(4, 4, 2, 1), oma = c(0, 0, 2, 0)) 15 | with(data, { 16 | plot(Global_active_power~Datetime, type = "l", 17 | ylab = "Global Active Power (kilowatts)", xlab = "") 18 | plot(Voltage~Datetime, type = "l", 19 | ylab = "Voltage (volt)", xlab = "") 20 | plot(Sub_metering_1~Datetime, type = "l", 21 | ylab = "Global Active Power (kilowatts)", xlab = "") 22 | lines(Sub_metering_2~Datetime, col = 'Red') 23 | lines(Sub_metering_3~Datetime, col = 'Blue') 24 | legend("topright", col = c("black", "red", "blue"), lty = 1, lwd = 2, bty = "n", 25 | legend = c("Sub_metering_1", "Sub_metering_2", "Sub_metering_3")) 26 | plot(Global_reactive_power~Datetime, type = 
pollutantmean <- function(directory, pollutant, id = 1:332) {

  ## 'directory' is a character vector of length 1 indicating
  ## the location of the CSV files

  ## 'pollutant' is a character vector of length 1 indicating
  ## the name of the pollutant for which we will calculate the
  ## mean; either "sulfate" or "nitrate".

  ## 'id' is an integer vector indicating the monitor ID numbers
  ## to be used. An id selects the file at that position in the sorted
  ## listing of 'directory'.

  ## Returns the mean of the pollutant across all monitors listed
  ## in the 'id' vector (ignoring NA values). Stops with an error when an
  ## id has no corresponding file.

  ## Look in the directory the caller asked for instead of a fixed path.
  files <- list.files(directory, pattern = "\\.csv$", full.names = TRUE)[id]

  ## Indexing past the end of the listing yields NA; fail with a clear
  ## message instead of letting read.csv() choke on an NA path.
  if (anyNA(files)) {
    stop("'id' refers to a monitor with no CSV file in 'directory'",
         call. = FALSE)
  }

  ## Collect the requested column from every file in one pass.
  readings <- lapply(files, function(file) read.csv(file)[, pollutant])

  ## as.numeric() keeps the empty case a numeric vector, so the result is
  ## NaN when there is nothing to average.
  mean(as.numeric(unlist(readings)), na.rm = TRUE)
}
rankhospital <- function(state, outcome, num = "best") {

  ## Returns the name of the hospital in 'state' that holds rank 'num'
  ## for the 30-day death rate of 'outcome'.
  ##
  ## 'state' is a two-letter state abbreviation (one of the 50 in
  ## state.abb); 'outcome' is "heart attack", "heart failure" or
  ## "pneumonia"; 'num' is "best", "worst" or a numeric rank, where rank 1
  ## is the lowest rate.
  ##
  ## Hospitals are ordered by rate, ties broken by hospital name. Hospitals
  ## whose rate is missing or "Not Available" are not ranked. A rank beyond
  ## the number of ranked hospitals gives NA. Stops with "invalid state" or
  ## "invalid outcome" on bad arguments.

  rate_columns <- c(
    "heart attack" =
      "Hospital.30.Day.Death..Mortality..Rates.from.Heart.Attack",
    "heart failure" =
      "Hospital.30.Day.Death..Mortality..Rates.from.Heart.Failure",
    "pneumonia" =
      "Hospital.30.Day.Death..Mortality..Rates.from.Pneumonia"
  )

  ## Checks that state and outcome are valid
  if (!isTRUE(state %in% datasets::state.abb)) stop("invalid state")
  if (!isTRUE(outcome %in% names(rate_columns))) stop("invalid outcome")
  header_name <- rate_columns[[outcome]]

  ## Reads outcome data
  file_data <- read.csv("outcome-of-care-measures.csv",
                        stringsAsFactors = FALSE)
  data <- file_data[file_data$State == state, ]

  ## Drop unavailable rates BEFORE converting to numeric, so the conversion
  ## does not warn about "Not Available" on every call.
  rates <- as.character(data[[header_name]])
  hospitals <- as.character(data[["Hospital.Name"]])
  available <- !is.na(rates) & rates != "Not Available"
  rates <- as.numeric(rates[available])
  hospitals <- hospitals[available]

  ## Lowest rate first; the hospital name breaks ties.
  ranked <- hospitals[order(rates, hospitals)]

  ## "best" is simply rank 1 of this ordering, so it is resolved here
  ## without a second pass over the file through another function.
  if (identical(num, "best")) {
    position <- 1L
  } else if (identical(num, "worst")) {
    position <- length(ranked)
  } else {
    position <- num
  }

  ranked[position]
}