├── .Rhistory
├── ProgAssignment4.R
├── Quiz_1.md
├── Quiz_2.md
├── Quiz_3.md
├── Quiz_4.md
├── README.md
├── best.R
├── cachematrix.R
├── complete.R
├── corr.R
├── plot1.R
├── plot2.R
├── plot3.R
├── plot4.R
├── pollutantmean.R
├── rankall.R
└── rankhospital.R


/.Rhistory:
--------------------------------------------------------------------------------
  1 | source("getdata-data-ss06hid.csv")
  2 | file <- read.csv("getdata-data-ss06hid.csv", sep = ",")
  3 | file
  4 | View(file)
  5 | print(length(file[file$VAL > 1000000]))
  6 | print(length(file[,file$VAL > 1000000]))
  7 | length(file[file$VAL > 1000000])
  8 | length(file[,file$VAL > 1000000])
  9 | length(file[file$VAL > 1000000,])
 10 | print(file[file$VAL > 1000000,])
 11 | length(file$VAL > 1000000)
 12 | vals <- file$VAL
 13 | vals <- vals[!is.na(vals)]
 14 | test <- vals > 1000000
 15 | test <- vals[vals > 1000000,]
 16 | test <- vals[,vals > 1000000]
 17 | test <- vals[vals > 1000000,]
 18 | vals > 1000000
 19 | vals[vals > 1000000,]
 20 | vals
 21 | sort(vals)
 22 | test <- vals[vals == 24,]
 23 | vals[vals == 24]
 24 | test <- vals[vals == 24]
 25 | length(test)
 26 | source("getdata-data-DATA.gov_NGAP.xlsx")
 27 | file <- read.table("getdata-data-DATA.gov_NGAP.xlsx")
 28 | file <- read.table("getdata-data-DATA.gov_NGAP.xlsx", sep = ",")
 29 | library(xlsx)
 30 | install.packages("xlsx")
 31 | library(xlsx)
 32 | file <- read.xlsx("getdata-data-DATA.gov_NGAP.xlsx", header = TRUE)
 33 | file <- read.xlsx("getdata-data-DATA.gov_NGAP.xlsx", header = TRUE, sheetIndex = 1)
 34 | dat <- read.xlsx("getdata-data-DATA.gov_NGAP.xlsx", header = TRUE, sheetIndex = 1, rowIndex = 18:23, colIndex = 7:15)
 35 | sum(dat$Zip*dat$Ext,na.rm=T)
 36 | install.packages(XML)
 37 | install.packages("XML")
 38 | library(XML)
 39 | file <- xmlTreeParse("https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2Frestaurants.xml", useInternal = TRUE)
 40 | fileURL <- https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2Frestaurants.xml
 41 | fileURL <- https:/d396qusza40orc.cloudfront.net/getdata%2Fdata%2Frestaurants.xml
 42 | fileURL <- "https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2Frestaurants.xml"
 43 | doc <- xmlTreeParse(fileURL, useInternal = TRUE)
 44 | doc <- xmlTreeParse(fileURL, useInternal = TRUE)
 45 | doc = xmlTreeParse(fileURL, useInternal = TRUE)
 46 | z <- sum(x ^ 2 * p) - sum(x * p) ^ 2
 47 | x <- 1:4
 48 | p <- x/sum(x)
 49 | temp <- rbind(x, p)
 50 | rownames(temp) <- c("X", "Prob")
 51 | temp
 52 | mean(X)
 53 | mean(x)
 54 | round((0.75*0.3)/(  (0.75*0.3) +  ((1-0.52)*(1-0.3))  )*100)
 55 | a <- c(140, 138, 150, 148, 135)
 56 | b <- c(132, 135, 151, 146, 130)
 57 | t.test(a, b, alternative = "two.sided", paired = T)
 58 | round(pbinom(2,size=4,prob=0.5,lower.tail=FALSE),2)
 59 | pbinom (3,size=4,prob = 0.5)
 60 | n1 <- n2 <- 9
 61 | x1 <- -3  ##treated
 62 | x2 <- 1  ##placebo
 63 | s1 <- 1.5  ##treated
 64 | s2 <- 1.8  ##placebo
 65 | spsq <- ( (n1 - 1) * s1^2 + (n2 - 1) * s2^2) / (n1 + n2 - 2)
 66 | t=(x1-x2)/(spsq*sqrt(1/n1 + 1/n2))
 67 | 2*pt(t,n1+n2-2)
 68 | (qnorm(.95) + qnorm(.9))^2 * .04^2 / .01^2
 69 | round( (qnorm(0.95) + qnorm(0.90))^2 * 0.04^2 / (0.01^2) )
 70 | smm=sqrt(12^2/288 + 12^2/288)
 71 | 12^2/288
 72 | x=44
 73 | y=42.04
 74 | sx=12
 75 | sy=12
 76 | n1=288
 77 | n2=288
 78 | SDpooled <- sqrt( ((n1 - 1) * sx^2 + (n2-1) * sy^2) / (n1 + n2 -2))
 79 | SE_est= sqrt(SDpooled^2/n1 + SDpooled^2/n2)
 80 | 2*pnorm((44-42.04),lower.tail=F)
 81 | myPlot <- function(beta){
 82 | y <- galton$child - mean(galton$child)
 83 | x <- galton$parent - mean(galton$parent)
 84 | freqData <- as.data.frame(table(x, y))
 85 | names(freqData) <- c("child", "parent", "freq")
 86 | plot(
 87 | as.numeric(as.vector(freqData$parent)),
 88 | as.numeric(as.vector(freqData$child)),
 89 | pch = 21, col = "black", bg = "lightblue",
 90 | cex = .15 * freqData$freq,
 91 | xlab = "parent",
 92 | ylab = "child"
 93 | )
 94 | abline(0, beta, lwd = 3)
 95 | points(0, 0, cex = 2, pch = 19)
 96 | mse <- mean( (y - beta * x)^2 )
 97 | title(paste("beta = ", beta, "mse = ", round(mse, 3)))
 98 | }
 99 | manipulate(myPlot(beta), beta = slider(0.6, 1.2, step = 0.02))
100 | setwd("~/Documents/Coursera/Data Science Specialization/R Programming")
101 | x <- c(1, 3, 5)
102 | y <- c(3, 2, 10)
103 | rbind(x, y)
104 | x <- list(2, "a", "b", TRUE)
105 | x[[2]]
106 | x <- 1:4
107 | y <- 2:3
108 | x + y
109 | x <- c(3, 5, 1, 10, 12, 6)
110 | x[x < 6] <- 0
111 | x[x < 6]
112 | x
113 | x <- c(3, 5, 1, 10, 12, 6)
114 | x[x < 6] <- 0
115 | x
116 | cube <- function(x, n) {
117 | x^3
118 | }
119 | cube(3)
120 | x <- 1:10
121 | if(x > 5) {
122 | x <- 0
123 | }
124 | f <- function(x) {
125 | g <- function(y) {
126 | y + z
127 | }
128 | z <- 4
129 | x + g(x)
130 | }
131 | z <- 10
132 | f(3)
133 | x <- 5
134 | y <- if(x < 3) {
135 | NA
136 | } else {
137 | 10
138 | }
139 | y
140 | library(datasets)
141 | data(iris)
142 | ?iris
143 | tapply(iris$Sepal.Length, mean)
144 | tapply(iris$Sepal.Length, iris, mean)
145 | tapply(iris$Sepal.Length, iris$virginica, mean)
146 | tapply(iris[Species == virginica], iris$Special.Length, mean)
147 | tapply(iris[Species == virginica,], iris$Special.Length, mean)
148 | tapply(iris[,Species == virginica], iris$Special.Length, mean)
149 | library(datasets)
150 | data(iris)
151 | ?iris
152 | tapply(iris$Sepal.Length, iris[Species == "virginica",], mean)
153 | tapply(iris$Sepal.Length, iris[iris$Species == "virginica",], mean)
154 | iris$Species
155 | tapply(iris$Sepal.Length[,iris$Species == "virginica"], mean)
156 | tapply(iris$Sepal.Length[iris$Species == "virginica",], mean)
157 | test <- iris[iris$Species == "virginica",]
158 | test
159 | test <- iris[iris$Species == "virginica",]
160 | library(datasets)
161 | data(iris)
162 | ?iris
163 | virginica <- iris[iris$Species == "virginica",]
164 | tapply(Sepal.Length, virginica, mean)
165 | tapply(iris$Sepal.Length, virginica, mean)
166 | tapply(iris$Sepal.Length, virginica$Sepal.Length, mean)
167 | tapply(virginica$Sepal.Length, virginica, mean)
168 | tapply(virginica$Sepal.Length, virginica[!is.na(virginica$Sepal.Length)], mean)
169 | tapply(virginica$Sepal.Length, virginica[!is.na(virginica$Sepal.Length),], mean)
170 | tapply(virginica$Sepal.Length, virginica[,!is.na(virginica$Sepal.Length)], mean)
171 | tapply(iris$Sepal.Length, iris, mean)
172 | tapply(iris$Sepal.Length, iris[iris$Species == "virginica" & !is.na(iris$Sepal.Length)], mean)
173 | tapply(iris$Sepal.Length, iris[iris$Species == "virginica" & !is.na(iris$Sepal.Length),], mean)
174 | 


--------------------------------------------------------------------------------
/ProgAssignment4.R:
--------------------------------------------------------------------------------
best <- function(state, outcome) {
    ## Assignment skeleton: nothing is read or validated yet. The function
    ## only echoes its two arguments so the call wiring can be checked.
    ##
    ## Still to be implemented:
    ##   * read the outcome data
    ##   * check that 'state' and 'outcome' are valid
    ##   * return the hospital name in that state with the lowest 30-day
    ##     death rate

    ## 'outcome' is written first, then 'state', with nothing in between
    cat(outcome)
    cat(state)

    ## There is no result yet; hand back NULL without printing it
    invisible(NULL)
}


--------------------------------------------------------------------------------
/Quiz_1.md:
--------------------------------------------------------------------------------
  1 | Quiz 1
  2 | ======
  3 | 
  4 | |Attempts|Score|
  5 | |--------|-----|
  6 | |     1/3|20/20|
  7 | 
  8 | 
  9 | Question 1
 10 | ----------
 11 | The R language is a dialect of which of the following programming languages?
 12 | 
 13 | ### Answer
 14 | S
 15 | 
 16 | ### Explanation
 17 | R is a dialect of the S language which was developed at Bell Labs.
 18 | 
 19 | 
 20 | Question 2
 21 | ----------
 22 | The definition of free software consists of four freedoms (freedoms 0 through 3). Which of the following is NOT one of the freedoms that are part of the definition?
 23 | 
 24 | ### Answer
 25 | The freedom to sell the software for any price.
 26 | 
 27 | ### Explanation
 28 | This is not part of the free software definition. The free software definition does not mention anything about selling software (although it does not disallow it).
 29 | 
 30 | 
 31 | Question 3
 32 | ----------
 33 | In R the following are all atomic data types EXCEPT
 34 | 
 35 | ### Answer
 36 | matrix
 37 | 
 38 | ### Explanation
 39 | 'matrix' is not an atomic data type in R.
 40 | 
 41 | 
 42 | Question 4
 43 | ----------
 44 | If I execute the expression x <- 4 in R, what is the class of the object 'x' as determined by the 'class()' function?
 45 | 
 46 | ### Answer
 47 | numeric
 48 | 
 49 | ### Explanation
 50 | 
 51 | 	> x <- 4
 52 | 	> class(x)
 53 | 	[1] "numeric"
 54 | 
 55 | 
 56 | Question 5
 57 | ----------
 58 | What is the class of the object defined by x <- c(4, TRUE)?
 59 | 
 60 | ### Answer
 61 | numeric
 62 | 
 63 | ### Explanation
 64 | The numeric class is the "lowest common denominator" here and so all elements will be coerced into that class.
 65 | 
 66 | R does automatic coercion of vectors so that all elements of the vector are the same data class.
 67 | 
 68 | 	> x <- c(4, TRUE)
 69 | 	> class(x)
 70 | 	[1] "numeric"
 71 | 
 72 | 
 73 | Question 6
 74 | ----------
 75 | If I have two vectors x <- c(1,3, 5) and y <- c(3, 2, 10), what is produced by the expression rbind(x, y)?
 76 | 
 77 | ### Answer
 78 | a 2 by 3 numeric matrix	
 79 | 
 80 | ### Explanation
 81 | The 'rbind' function treats vectors as if they were rows of a matrix. It then takes those vectors and binds them together row-wise to create a matrix.
 82 | 
 83 | 	> x <- c(1, 3, 5)
 84 | 	> y <- c(3, 2, 10)
 85 | 	> rbind(x, y)
 86 | 	    [,1] [,2] [,3]
 87 | 	  x    1    3    5
 88 | 	  y    3    2   10
 89 | 
 90 | 
 91 | Question 7
 92 | ----------
 93 | A key property of vectors in R is that
 94 | 
 95 | ### Answer
 96 | elements of a vector all must be of the same class
 97 | 
 98 | 
 99 | Question 8
100 | ----------
101 | Suppose I have a list defined as x <- list(2, "a", "b", TRUE). What does x[[2]] give me?
102 | 
103 | ### Answer
104 | a character vector of length 1.
105 | 
106 | ### Explanation
107 | 
108 | 	> x <- list(2, "a", "b", TRUE)
109 | 	> x[[2]]
110 | 	  [1] "a"
111 | 
112 | Question 9
113 | ----------
114 | Suppose I have a vector x <- 1:4 and y <- 2:3. What is produced by the expression x + y?
115 | 
116 | ### Answer
117 | an integer vector with the values 3, 5, 5, 7.
118 | 
119 | ### Explanation
120 | 
121 | 	> x <- 1:4
122 | 	> y <- 2:3
123 | 	> x + y
124 | 	[1] 3 5 5 7
125 | 
126 | 
127 | Question 10
128 | -----------
129 | Suppose I have a vector x <- c(3, 5, 1, 10, 12, 6) and I want to set all elements of this vector that are less than 6 to be equal to zero. What R code achieves this?
130 | 
131 | ### Answer
132 | x[x < 6] <- 0
133 | 
134 | ### Explanation
135 | You can create a logical vector with the expression x < 6 and then use the [ operator to subset the original vector x.
136 | 
137 | 	> x <- c(3, 5, 1, 10, 12, 6)
138 | 	> x[x < 6] <- 0
139 | 	> x
140 | 	[1]  0  0  0 10 12  6
141 | 
142 | 
143 | Question 11
144 | -----------
145 | In the dataset provided for this Quiz, what are the column names of the dataset?
146 | 
147 | ### Answer
148 | Ozone, Solar.R, Wind, Temp, Month, Day
149 | 
150 | ### Explanation
151 | You can get the column names of a data frame with the 'names()' function.
152 | 
153 | 	> hw1 = read.csv('hw1_data.csv')
154 | 	> names(hw1)
155 | 	[1] "Ozone"   "Solar.R" "Wind"    "Temp"    "Month"   "Day"
156 | 
157 | 
158 | Question 12
159 | -----------
160 | Extract the first 2 rows of the data frame and print them to the console. What does the output look like?
161 | 
162 | ### Answer
163 | 	  Ozone Solar.R Wind Temp Month Day
164 | 	1    41     190  7.4   67     5   1
165 | 	2    36     118  8.0   72     5   2
166 | 
167 | ### Explanation
168 | You can extract the first two rows using the [ operator and an integer sequence to index the rows.
169 | 
170 | 	> hw1 = read.csv('hw1_data.csv')
171 | 	> hw1[c(1,2),]
172 | 	  Ozone Solar.R Wind Temp Month Day
173 | 	1    41     190  7.4   67     5   1
174 | 	2    36     118  8.0   72     5   2
175 | 
176 | 
177 | Question 13
178 | -----------
179 | How many observations (i.e. rows) are in this data frame?
180 | 
181 | ### Answer
182 | 153
183 | 
184 | ### Explanation
185 | You can use the 'nrow()' function to compute the number of rows in a data frame.
186 | 
187 | 	> hw1 = read.csv('hw1_data.csv')
188 | 	> nrow(hw1)
189 | 	[1] 153
190 | 
191 | 
192 | Question 14
193 | -----------
194 | Extract the last 2 rows of the data frame and print them to the console. What does the output look like?
195 | 
196 | ### Answer
197 | 
198 | 	   Ozone Solar.R Wind Temp Month Day
199 | 	152    18     131  8.0   76     9  29
200 | 	153    20     223 11.5   68     9  30
201 | 
202 | ### Explanation
203 | The 'tail()' function is an easy way to extract the last few elements of an R object.
204 | 
205 | 	> hw1 = read.csv('hw1_data.csv')
206 | 	> tail(hw1,2)
207 | 	    Ozone Solar.R Wind Temp Month Day
208 | 	152    18     131  8.0   76     9  29
209 | 	153    20     223 11.5   68     9  30
210 | 
211 | 
212 | Question 15
213 | -----------
214 | What is the value of Ozone in the 47th row?
215 | 
216 | ### Answer
217 | 21
218 | 
219 | ### Explanation
220 | The single bracket [ operator can be used to extract individual rows of a data frame.
221 | 
222 | 	> hw1 = read.csv('hw1_data.csv')
223 | 	> hw1[47,]
224 | 	   Ozone Solar.R Wind Temp Month Day
225 | 	47    21     191 14.9   77     6  16
226 | 
227 | 
228 | Question 16
229 | -----------
230 | How many missing values are in the Ozone column of this data frame?
231 | 
232 | ### Answer
233 | 37
234 | 
235 | ### Explanation
236 | The 'is.na' function can be used to test for missing values.
237 | 
238 | 	> hw1 = read.csv('hw1_data.csv')
239 | 	> sub = subset(hw1, is.na(Ozone))
240 | 	> nrow(sub)
241 | 	[1] 37
242 | 
243 | 
244 | Question 17
245 | -----------
246 | What is the mean of the Ozone column in this dataset? Exclude missing values (coded as NA) from this calculation.
247 | 
248 | ### Answer
249 | 42.1
250 | 
251 | ### Explanation
252 | The 'mean' function can be used to calculate the mean.
253 | 
254 | 	> hw1 = read.csv('hw1_data.csv')
255 | 	> sub = subset(hw1, !is.na(Ozone), select = Ozone)
256 | 	> apply(sub, 2, mean)
257 | 	   Ozone 
258 | 	42.12931 
259 | 
260 | 
261 | Question 18
262 | -----------
263 | Extract the subset of rows of the data frame where Ozone values are above 31 and Temp values are above 90. What is the mean of Solar.R in this subset?
264 | 
265 | ### Answer
266 | 212.8
267 | 
268 | ### Explanation
269 | You need to construct a logical vector in R to match the question's requirements. Then use that logical vector to subset the data frame.
270 | 
271 | 	> hw1 = read.csv('hw1_data.csv')
272 | 	> sub = subset(hw1, Ozone > 31 & Temp > 90, select = Solar.R)
273 | 	> apply(sub, 2, mean)
274 | 	Solar.R 
275 | 	  212.8 
276 | 
277 | 
278 | Question 19
279 | -----------
280 | What is the mean of "Temp" when "Month" is equal to 6?
281 | 
282 | ### Answer
283 | 79.1
284 | 
285 | ### Explanation
286 | 
287 | 	> hw1 = read.csv('hw1_data.csv')
288 | 	> sub = subset(hw1, Month == 6, select = Temp)
289 | 	> apply(sub, 2, mean)
290 | 	Temp 
291 | 	79.1
292 | 
293 | 
294 | Question 20
295 | -----------
296 | What was the maximum ozone value in the month of May (i.e. Month = 5)?
297 | 
298 | ### Answer
299 | 115
300 | 
301 | ### Explanation
302 | 
303 | 	> hw1 = read.csv('hw1_data.csv')
304 | 	> sub = subset(hw1, Month == 5 & !is.na(Ozone), select = Ozone)
305 | 	> apply(sub, 2, max)
306 | 	Ozone 
307 | 	  115
308 | 


--------------------------------------------------------------------------------
/Quiz_2.md:
--------------------------------------------------------------------------------
  1 | Quiz 2
  2 | ======
  3 | 
  4 | |Attempts|Score|
  5 | |--------|-----|
  6 | |     2/3|10/10|
  7 | 
  8 | 
  9 | Question 1
 10 | ----------
 11 | Suppose I define the following function in R
 12 | 
 13 | 	cube <- function(x, n) {
 14 | 	        x^3
 15 | 	}
 16 |   
 17 | What is the result of running
 18 | 
 19 | 	cube(3)
 20 | 
 21 | in R after defining this function?
 22 | 
 23 | ### Answer
 24 | The number 27 is returned
 25 | 
 26 | ### Explanation
 27 | Because 'n' is not evaluated, it is not needed even though it is a formal argument.
 28 | 
 29 |     > cube <- function(x, n) {
 30 |     +     x^3
 31 |     + }
 32 |     > cube(3)
 33 |     [1] 27
 34 | 
 35 | 
 36 | Question 2
 37 | ----------
 38 | The following code will produce a warning in R.
 39 | 
 40 |     x <- 1:10
 41 |     if(x > 5) {
 42 |             x <- 0
 43 |     }
 44 | 
 45 | Why?
 46 | 
 47 | ### Answer
 48 | 'x' is a vector of length 10 and 'if' can only test a single logical statement.
 49 | 
 50 | ### Explanation
 51 | 
 52 |     > if(x > 5) {
 53 |     +     x <- 0
 54 |     + }
 55 |     Warning message:
 56 |     In if (x > 5) { :
 57 |       the condition has length > 1 and only the first element will be used
 58 | 
 59 | Question 3
 60 | ----------
 61 | Consider the following function
 62 | 
 63 |     f <- function(x) {
 64 |             g <- function(y) {
 65 |                     y + z
 66 |             }
 67 |             z <- 4
 68 |             x + g(x)
 69 |     }
 70 |     
 71 | If I then run in R
 72 | 
 73 |     z <- 10
 74 |     f(3)
 75 |     
 76 | What value is returned?
 77 | 
 78 | ### Answer
 79 | 10
 80 | 
 81 | ### Explanation
 82 | 
 83 |     > f <- function(x) {
 84 |     +     g <- function(y) {
 85 |     +         y + z
 86 |     +     }
 87 |     +     z <- 4
 88 |     +     x + g(x)
 89 |     + }
 90 |     > z <- 10
 91 |     > f(3)
 92 |     [1] 10
 93 | 
 94 | 
 95 | Question 4
 96 | ----------
 97 | Consider the following expression:
 98 | 
 99 |     x <- 5
100 |     y <- if(x < 3) {
101 |             NA
102 |     } else {
103 |             10
104 |     }
105 |     
106 | What is the value of 'y' after evaluating this expression?
107 | 
108 | ### Answer
109 | 10
110 | 
111 | ### Explanation
112 | 
113 |     > x <- 5
114 |     > y <- if(x < 3) {
115 |     +     NA
116 |     + } else {
117 |     +     10
118 |     + }
119 |     > y
120 |     [1] 10
121 | 
122 | 
123 | Question 5
124 | ----------
125 | Consider the following R function
126 | 
127 |     h <- function(x, y = NULL, d = 3L) {
128 |             z <- cbind(x, d)
129 |             if(!is.null(y))
130 |                     z <- z + y
131 |             else
132 |                     z <- z + f
133 |             g <- x + y / z
134 |             if(d == 3L)
135 |                     return(g)
136 |             g <- g + 10
137 |             g
138 |     }
139 |     
140 | Which symbol in the above function is a free variable?
141 | 
142 | ### Answer
143 | f
144 | 
145 | ### Explanation
146 | A free variable is a variable that is not defined in the function nor an argument of the function.
147 | 
148 | 
149 | Question 6
150 | ----------
151 | What is an environment in R?
152 | 
153 | ### Answer
154 | a collection of symbol/value pairs
155 | 
156 | 
157 | Question 7
158 | ----------
159 | The R language uses what type of scoping rule for resolving free variables?
160 | 
161 | ### Answer
162 | lexical scoping
163 | 
164 | 
165 | Question 8
166 | ----------
167 | How are free variables in R functions resolved?
168 | 
169 | ### Answer
170 | The values of free variables are searched for in the environment in which the function was defined
171 | 
172 | 
173 | Question 9
174 | ----------
175 | What is one of the consequences of the scoping rules used in R?
176 | 
177 | ### Answer
178 | All objects must be stored in memory
179 | 
180 | 
181 | Question 10
182 | -----------
183 | In R, what is the parent frame?
184 | 
185 | ### Answer
186 | It is the environment in which a function was called
187 | 


--------------------------------------------------------------------------------
/Quiz_3.md:
--------------------------------------------------------------------------------
  1 | Quiz 3
  2 | ======
  3 | 
  4 | |Attempts|Score|
  5 | |--------|-----|
  6 | |     1/3|  5/5|
  7 | 
  8 | 
  9 | Question 1
 10 | ----------
 11 | Take a look at the 'iris' dataset that comes with R. The data can be loaded with the code:
 12 | 
 13 |     library(datasets)
 14 |     data(iris)
 15 |   
 16 | A description of the dataset can be found by running
 17 | 
 18 | 	?iris
 19 | 
 20 | There will be an object called 'iris' in your workspace. In this dataset, what is the mean of 'Sepal.Length' for the species virginica? (Please only enter the numeric result and nothing else.)
 21 | 
 22 | ### Answer
 23 | 6.588
 24 | 
 25 | ### Explanation
 26 | 
 27 |     > library(datasets)
 28 |     > data(iris)
 29 |     > ?iris
 30 |     > mean(iris[iris$Species == "virginica",]$Sepal.Length)
 31 |     [1] 6.588
 32 | 
 33 | 
 34 | Question 2
 35 | ----------
 36 | Continuing with the 'iris' dataset from the previous Question, what R code returns a vector of the means of the variables 'Sepal.Length', 'Sepal.Width', 'Petal.Length', and 'Petal.Width'?
 37 | 
 38 | ### Answer
 39 | apply(iris[, 1:4], 2, mean)
 40 | 
 41 | ### Explanation
 42 | 
 43 |     > library(datasets)
 44 |     > data(iris)
 45 |     > apply(iris[, 1:4], 2, mean)
 46 |     Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
 47 |         5.843333     3.057333     3.758000     1.199333 
 48 | 
 49 | Question 3
 50 | ----------
 51 | Load the 'mtcars' dataset in R with the following code
 52 | 
 53 |     library(datasets)
 54 |     data(mtcars)
 55 |     
 56 | There will be an object named 'mtcars' in your workspace. You can find some information about the dataset by running
 57 | 
 58 |     ?mtcars
 59 |     
 60 | How can one calculate the average miles per gallon (mpg) by number of cylinders in the car (cyl)?
 61 | 
 62 | ### Answer
 63 | sapply(split(mtcars$mpg, mtcars$cyl), mean)
 64 | 
 65 | ### Explanation
 66 | 
 67 |     > library(datasets)
 68 |     > data(mtcars)
 69 |     > ?mtcars
 70 |     > sapply(split(mtcars$mpg, mtcars$cyl), mean)
 71 |            4        6        8 
 72 |     26.66364 19.74286 15.10000 
 73 | 
 74 | 
 75 | Question 4
 76 | ----------
 77 | Continuing with the 'mtcars' dataset from the previous Question, what is the absolute difference between the average horsepower of 4-cylinder cars and the average horsepower of 8-cylinder cars?
 78 | 
 79 | ### Answer
 80 | 126.5779
 81 | 
 82 | ### Explanation
 83 | 
 84 |     > library(datasets)
 85 |     > data(mtcars)
 86 |     > mean(mtcars[mtcars$cyl == "8",]$hp) - mean(mtcars[mtcars$cyl == "4",]$hp)
 87 |     [1] 126.5779
 88 | 
 89 | 
 90 | Question 5
 91 | ----------
 92 | If you run
 93 | 
 94 |     debug(ls)
 95 |     
 96 | what happens when you next call the 'ls' function?
 97 | 
 98 | ### Answer
 99 | Execution of 'ls' will suspend at the beginning of the function and you will be in the browser.
100 | 


--------------------------------------------------------------------------------
/Quiz_4.md:
--------------------------------------------------------------------------------
  1 | Quiz 4
  2 | ======
  3 | 
  4 | |Attempts|Score|
  5 | |--------|-----|
  6 | |     1/3|10/10|
  7 | 
  8 | 
  9 | Question 1
 10 | ----------
 11 | What is produced at the end of this snippet of R code?
 12 | 
 13 |     set.seed(1)
 14 |     rpois(5, 2)
 15 | 
 16 | ### Answer
 17 | A vector with the numbers 1, 1, 2, 4, 1
 18 | 
 19 | ### Explanation
 20 | Because the 'set.seed()' function is used, 'rpois()' will always output the same vector in this code.
 21 | 
 22 |     > set.seed(1)
 23 |     > rpois(5, 2)
 24 |     [1] 1 1 2 4 1
 25 | 
 26 | 
 27 | Question 2
 28 | ----------
 29 | What R function can be used to generate standard Normal random variables?
 30 | 
 31 | ### Answer
 32 | rnorm
 33 | 
 34 | ### Explanation
 35 | Functions beginning with the 'r' prefix are used to simulate random variates.
 36 | 
 37 | Standard probability distributions in R have a set of four functions that can be used to simulate variates, evaluate the density, evaluate the cumulative density, and evaluate the quantile function.
 38 | 
 39 | 
 40 | Question 3
 41 | ----------
 42 | When simulating data, why is using the set.seed() function important?
 43 | 
 44 | ### Answer
 45 | It ensures that the sequence of random numbers starts in a specific place and is therefore reproducible.
 46 | 
 47 | 
 48 | Question 4
 49 | ----------
 50 | Which function can be used to evaluate the inverse cumulative distribution function for the Poisson distribution?
 51 | 
 52 | ### Answer
 53 | qpois
 54 | 
 55 | ### Explanation
 56 | Probability distribution functions beginning with the 'q' prefix are used to evaluate the quantile function.
 57 | 
 58 | 
 59 | Question 5
 60 | ----------
 61 | What does the following code do?
 62 | 
 63 |     set.seed(10)
 64 |     x <- rbinom(10, 10, 0.5)
 65 |     e <- rnorm(10, 0, 20)
 66 |     y <- 0.5 + 2 * x + e
 67 | 
 68 | ### Answer
 69 | Generate data from a Normal linear model
 70 | 
 71 | 
 72 | Question 6
 73 | ----------
 74 | What R function can be used to generate Binomial random variables?
 75 | 
 76 | ### Answer
 77 | rbinom
 78 | 
 79 | 
 80 | Question 7
 81 | ----------
 82 | What aspect of the R runtime does the profiler keep track of when an R expression is evaluated?
 83 | 
 84 | ### Answer
 85 | the function call stack
 86 | 
 87 | 
 88 | Question 8
 89 | ----------
 90 | Consider the following R code
 91 | 
 92 |     library(datasets)
 93 |     Rprof()
 94 |     fit <- lm(y ~ x1 + x2)
 95 |     Rprof(NULL)
 96 |     
 97 | (Assume that y, x1, and x2 are present in the workspace.) Without running the code, what percentage of the run time is spent in the 'lm' function, based on the 'by.total' method of normalization shown in 'summaryRprof()'?
 98 | 
 99 | ### Answer
100 | 100%
101 | 
102 | ### Explanation
103 | When using 'by.total' normalization, the top-level function (in this case, `lm()`) always takes 100% of the time.
104 | 
105 | 
106 | Question 9
107 | ----------
108 | When using 'system.time()', what is the user time?
109 | 
110 | ### Answer
111 | It is the time spent by the CPU evaluating an expression
112 | 
113 | 
114 | Question 10
115 | -----------
116 | If a computer has more than one available processor and R is able to take advantage of that, then which of the following is true when using 'system.time()'?
117 | 
118 | ### Answer 
119 | elapsed time may be smaller than user time
120 | 
121 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | R Programming
2 | =============
3 | 
4 | This is a repository for any and all code written for the R Programming Coursera course through Johns Hopkins University.
5 | 


--------------------------------------------------------------------------------
/best.R:
--------------------------------------------------------------------------------
 1 | best <- function(state, outcome) {
 2 |     
 3 |     ## Reads outcome data
 4 |     file_data <- read.csv("outcome-of-care-measures.csv", sep = ",")
 5 |     
 6 |     ## Checks that state and outcome are valid
 7 |     valid_states <- c("AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "FL", "GA", "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD", "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ", "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC", "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY")
 8 |     valid_outcomes <- c("heart attack", "heart failure", "pneumonia")
 9 |     if (!is.element(state, valid_states)) stop("invalid state")
10 |     if (!is.element(outcome, valid_outcomes)) stop("invalid outcome")
11 |     
12 |     ## Returns hospital name in that state with lowest 30-day death
13 |     data <- file_data[file_data$State == state,]
14 |     header_name <- NULL
15 |     if (outcome == "heart attack") {
16 |         header_name <- "Hospital.30.Day.Death..Mortality..Rates.from.Heart.Attack"
17 |     } else if (outcome == "heart failure") {
18 |         header_name <- "Hospital.30.Day.Death..Mortality..Rates.from.Heart.Failure"
19 |     } else {
20 |         header_name <- "Hospital.30.Day.Death..Mortality..Rates.from.Pneumonia"
21 |     }
22 |     mortality_rate <- data[,header_name]
23 |     mortality_rate <- mortality_rate[!mortality_rate == "Not Available"]
24 |     mortality_rate <- as.numeric(as.character(mortality_rate))
25 |     min_rate <- min(mortality_rate)
26 |     best_hosps <- data[data[,header_name] == min_rate,]
27 |     hosp_names <- sort(best_hosps[,"Hospital.Name"])
28 |     return(as.character(hosp_names[1]))
29 | }


--------------------------------------------------------------------------------
/cachematrix.R:
--------------------------------------------------------------------------------
 1 | 
 2 | ## The following is a pair of functions that cache and compute the 
 3 | ## inverse of a matrix.
 4 | 
 5 | ## This function creates a special "matrix" object
 6 | ## that can cache its inverse.
 7 | 
 8 | makeCacheMatrix <- function(mtx = matrix()) {
 9 |     inverse <- NULL
10 |     set <- function(x) {
11 |         mtx <<- x;
12 |         inverse <<- NULL;
13 |     }
14 |     get <- function() return(mtx);
15 |     setinv <- function(inv) inverse <<- inv;
16 |     getinv <- function() return(inverse);
17 |     return(list(set = set, get = get, setinv = setinv, getinv = getinv))
18 | }
19 | 
## Returns the inverse of the special "matrix" produced by
## `makeCacheMatrix`. If an inverse is already cached on the object it is
## returned directly (with a message); otherwise the inverse is computed
## with `solve`, stored on the object through `setinv`, and returned.
##
## 'mtx' is the list returned by `makeCacheMatrix`.
## '...' is forwarded unchanged to `solve`.

cacheSolve <- function(mtx, ...) {
    inverse <- mtx$getinv()
    if (!is.null(inverse)) {
        message("Getting cached data...")
        return(inverse)
    }

    ## Cache miss: compute the inverse and store that same value, so the
    ## next call can reuse it
    inverse <- solve(mtx$get(), ...)
    mtx$setinv(inverse)
    inverse
}


--------------------------------------------------------------------------------
/complete.R:
--------------------------------------------------------------------------------
 1 | complete <- function(directory, id = 1:332) {
 2 |     
 3 |     ## 'directory' is a character vector of length 1 indicating
 4 |     ## the location of the CSV files
 5 |     
 6 |     ## 'id' is an integer vector indicating the monitor ID numbers
 7 |     ## to be used
 8 |     
 9 |     ## Returns a data frame of the form:
10 |     ## id nobs
11 |     ## 1  117
12 |     ## 2  1041
13 |     ## ...
14 |     ## where 'id' is the monitor ID number and 'nobs' is the
15 |     ## number of complete cases
16 |     
17 |     files <- (Sys.glob("specdata//*.csv"));
18 |     nobs <- c();
19 |     
20 |     for (index in id) {
21 |         file_data <- read.csv(files[index], sep = ",");
22 |         complete_cases <- file_data[complete.cases(file_data),];
23 |         nobs <- c(nobs, nrow(complete_cases));
24 |     }
25 |     
26 |     return(data.frame(cbind(id, nobs)));
27 | }


--------------------------------------------------------------------------------
/corr.R:
--------------------------------------------------------------------------------
 1 | corr <- function(directory, threshold = 0) {
 2 |     
 3 |     ## Computes the sulfate/nitrate correlation for every monitor file
 4 |     ## that has more than 'threshold' completely observed rows.
 5 |     
 6 |     ## 'directory' is a character vector of length 1 indicating
 7 |     ## the location of the CSV files
 8 |     
 9 |     ## 'threshold' is a numeric vector of length 1 indicating the
10 |     ## number of completely observed observations (on all
11 |     ## variables) required to compute the correlation between
12 |     ## nitrate and sulfate; the default is 0
13 |     
14 |     ## Returns a numeric vector of correlations (length 0 when no
15 |     ## monitor exceeds the threshold)
16 |     
17 |     ## Looks in the caller-supplied directory instead of a fixed path
18 |     files <- Sys.glob(file.path(directory, "*.csv"))
19 |     
20 |     per_file <- lapply(files, function(path) {
21 |         file_data <- read.csv(path, sep = ",")
22 |         complete_cases <- file_data[complete.cases(file_data),]
23 |         if (nrow(complete_cases) > threshold) {
24 |             cor(complete_cases$sulfate, complete_cases$nitrate)
25 |         } else {
26 |             NULL
27 |         }
28 |     })
29 |     
30 |     ## unlist() drops the NULL entries; as.numeric() turns an empty
31 |     ## result into numeric(0) instead of NULL
32 |     return(as.numeric(unlist(per_file)))
33 | }


--------------------------------------------------------------------------------
/plot1.R:
--------------------------------------------------------------------------------
 1 | ## Loads the complete household power consumption table
 2 | base_data <- read.table(
 3 |     "household_power_consumption.txt",
 4 |     header = TRUE, sep = ';', na.strings = "?",
 5 |     check.names = FALSE, stringsAsFactors = FALSE,
 6 |     comment.char = "", quote = '\"'
 7 | )
 8 | base_data$Date <- as.Date(base_data$Date, format = "%d/%m/%Y")
 9 | 
10 | ## Keeps only the rows dated 2007-02-01 or 2007-02-02, then frees the full table
11 | data <- base_data[with(base_data, which(Date >= "2007-02-01" & Date <= "2007-02-02")), ]
12 | rm(base_data)
13 | 
14 | ## Builds a combined date-time column
15 | date_time <- paste(as.Date(data$Date), data$Time)
16 | data[["Datetime"]] <- as.POSIXct(date_time)
17 | 
18 | ## Plot 1: histogram of global active power
19 | hist(
20 |     data$Global_active_power,
21 |     col = "Red",
22 |     main = "Global Active Power",
23 |     xlab = "Global Active Power (kilowatts)",
24 |     ylab = "Frequency"
25 | )
26 | 
27 | ## Copies the current plot to a 480x480 PNG file and closes that device
28 | dev.copy(png, file = "plot1.png", width = 480, height = 480)
29 | dev.off()


--------------------------------------------------------------------------------
/plot2.R:
--------------------------------------------------------------------------------
 1 | ## Loads the complete household power consumption table
 2 | base_data <- read.table(
 3 |     "household_power_consumption.txt",
 4 |     header = TRUE, sep = ';', na.strings = "?",
 5 |     check.names = FALSE, stringsAsFactors = FALSE,
 6 |     comment.char = "", quote = '\"'
 7 | )
 8 | base_data$Date <- as.Date(base_data$Date, format = "%d/%m/%Y")
 9 | 
10 | ## Keeps only the rows dated 2007-02-01 or 2007-02-02, then frees the full table
11 | data <- base_data[with(base_data, which(Date >= "2007-02-01" & Date <= "2007-02-02")), ]
12 | rm(base_data)
13 | 
14 | ## Builds a combined date-time column
15 | date_time <- paste(as.Date(data$Date), data$Time)
16 | data[["Datetime"]] <- as.POSIXct(date_time)
17 | 
18 | ## Plot 2: global active power over time as a line
19 | plot(
20 |     Global_active_power ~ Datetime,
21 |     data = data,
22 |     type = "l",
23 |     xlab = "",
24 |     ylab = "Global Active Power (kilowatts)"
25 | )
26 | 
27 | ## Copies the current plot to a 480x480 PNG file and closes that device
28 | dev.copy(png, file = "plot2.png", width = 480, height = 480)
29 | dev.off()


--------------------------------------------------------------------------------
/plot3.R:
--------------------------------------------------------------------------------
 1 | ## Fetches full dataset ("?" marks missing values in the file)
 2 | base_data <- read.table("household_power_consumption.txt", header = TRUE, sep = ';', na.strings = "?", check.names = FALSE, stringsAsFactors = FALSE, comment.char="", quote='\"')
 3 | base_data$Date <- as.Date(base_data$Date, format="%d/%m/%Y")
 4 | 
 5 | ## Subsets the data to 2007-02-01 and 2007-02-02
 6 | data <- subset(base_data, subset = (Date >= as.Date("2007-02-01") & Date <= as.Date("2007-02-02")))
 7 | rm(base_data)
 8 | 
 9 | ## Converts dates: joins the date and time columns into one POSIXct column
10 | date_time <- paste(as.Date(data$Date), data$Time)
11 | data$Datetime <- as.POSIXct(date_time)
12 | 
13 | ## Plot 3: the three sub-metering series over time
14 | ## The y label names the plotted sub-metering variables; the previous
15 | ## "Global Active Power (kilowatts)" label was copied from another plot
16 | with(data, {
17 |     plot(Sub_metering_1~Datetime, type = "l",
18 |          ylab = "Energy sub metering", xlab = "")
19 |     lines(Sub_metering_2~Datetime, col = 'Red')
20 |     lines(Sub_metering_3~Datetime, col = 'Blue')
21 | })
22 | legend("topright", col=c("black", "red", "blue"), lty = 1, lwd = 2, 
23 |        legend = c("Sub_metering_1", "Sub_metering_2", "Sub_metering_3"))
24 | 
25 | ## Saves the plot to a 480x480 PNG file
26 | dev.copy(png, file = "plot3.png", height = 480, width = 480)
27 | dev.off()


--------------------------------------------------------------------------------
/plot4.R:
--------------------------------------------------------------------------------
 1 | ## Fetches full dataset ("?" marks missing values in the file)
 2 | base_data <- read.table("household_power_consumption.txt", header = TRUE, sep = ';', na.strings = "?", check.names = FALSE, stringsAsFactors = FALSE, comment.char="", quote='\"')
 3 | base_data$Date <- as.Date(base_data$Date, format="%d/%m/%Y")
 4 | 
 5 | ## Subsets the data to 2007-02-01 and 2007-02-02
 6 | data <- subset(base_data, subset = (Date >= as.Date("2007-02-01") & Date <= as.Date("2007-02-02")))
 7 | rm(base_data)
 8 | 
 9 | ## Converts dates: joins the date and time columns into one POSIXct column
10 | date_time <- paste(as.Date(data$Date), data$Time)
11 | data$Datetime <- as.POSIXct(date_time)
12 | 
13 | ## Plot 4: a 2x2 grid of panels, filled row by row
14 | par(mfrow = c(2, 2), mar = c(4, 4, 2, 1), oma = c(0, 0, 2, 0))
15 | with(data, {
16 |     plot(Global_active_power~Datetime, type = "l", 
17 |          ylab = "Global Active Power (kilowatts)", xlab = "")
18 |     plot(Voltage~Datetime, type = "l", 
19 |          ylab = "Voltage (volt)", xlab = "")
20 |     ## The y label names the plotted sub-metering variables; the previous
21 |     ## "Global Active Power (kilowatts)" label was copied from the first panel
22 |     plot(Sub_metering_1~Datetime, type = "l", 
23 |          ylab = "Energy sub metering", xlab = "")
24 |     lines(Sub_metering_2~Datetime, col = 'Red')
25 |     lines(Sub_metering_3~Datetime, col = 'Blue')
26 |     ## bty = "n" draws the legend without a box
27 |     legend("topright", col = c("black", "red", "blue"), lty = 1, lwd = 2, bty = "n",
28 |            legend = c("Sub_metering_1", "Sub_metering_2", "Sub_metering_3"))
29 |     ## Label typo fixed: "Rective" -> "Reactive"
30 |     plot(Global_reactive_power~Datetime, type = "l", 
31 |          ylab = "Global Reactive Power (kilowatts)", xlab = "")
32 | })
33 | 
34 | ## Saves the plot to a 480x480 PNG file
35 | dev.copy(png, file = "plot4.png", height = 480, width = 480)
36 | dev.off()


--------------------------------------------------------------------------------
/pollutantmean.R:
--------------------------------------------------------------------------------
 1 | pollutantmean <- function(directory, pollutant, id = 1:332) {
 2 |   
 3 |     ## Averages one pollutant over the selected monitor files.
 4 |     
 5 |     ## 'directory' is a character vector of length 1 indicating
 6 |     ## the location of the CSV files
 7 |     
 8 |     ## 'pollutant' is a character vector of length 1 indicating
 9 |     ## the name of the pollutant for which we will calculate the
10 |     ## mean; either "sulfate" or "nitrate".
11 |     
12 |     ## 'id' is an integer vector indicating the monitor ID numbers
13 |     ## to be used; each ID is used as a position in the list of CSV
14 |     ## files that Sys.glob() finds in 'directory'
15 |     
16 |     ## Returns the mean of the pollutant across all monitors listed
17 |     ## in the 'id' vector (ignoring NA values)
18 |     
19 |     ## Looks in the caller-supplied directory instead of a fixed path
20 |     files <- Sys.glob(file.path(directory, "*.csv"))[id]
21 |     
22 |     ## Collects the pollutant column of every file, then pools them;
23 |     ## this avoids growing one vector inside a loop
24 |     readings <- lapply(files, function(path) {
25 |         file_data <- read.csv(path, sep = ",")
26 |         file_data[,pollutant]
27 |     })
28 |     
29 |     return(mean(unlist(readings), na.rm = TRUE))
30 | }


--------------------------------------------------------------------------------
/rankall.R:
--------------------------------------------------------------------------------
 1 | rankall <- function(outcome, num = "best") {
 2 |     
 3 |     ## Finds, for every state in 'valid_states', the hospital holding
 4 |     ## the requested rank for 30-day mortality on the given outcome.
 5 |     
 6 |     ## 'outcome' is one of "heart attack", "heart failure" or
 7 |     ## "pneumonia"; anything else stops with "invalid outcome"
 8 |     
 9 |     ## 'num' is "best", "worst", or a position in the ranking
10 |     ## (lowest rate first, ties broken by hospital name)
11 |     
12 |     ## Returns a data frame with columns 'hospital' and 'state'; for
13 |     ## "worst" and positional ranks, a state with no hospital at the
14 |     ## requested rank gets NA
15 |     
16 |     ## Reads outcome data
17 |     file_data <- read.csv("outcome-of-care-measures.csv", sep = ",")
18 |     
19 |     ## Checks that the outcome is valid
20 |     valid_states <- c("AK", "AL", "AR", "AZ", "CA", "CO", "CT", "DE", "FL", "GA", "HI", "IA", "ID", "IL", "IN", "KS", "KY", "LA", "MA", "MD", "ME", "MI", "MN", "MO", "MS", "MT", "NC", "ND", "NE", "NH", "NJ", "NM", "NV", "NY", "OH", "OK", "OR", "PA", "RI", "SC", "SD", "TN", "TX", "UT", "VA", "VT", "WA", "WI", "WV", "WY")
21 |     valid_outcomes <- c("heart attack", "heart failure", "pneumonia")
22 |     if (!is.element(outcome, valid_outcomes)) stop("invalid outcome")
23 |     
24 |     ## Maps the outcome to its column; the unnamed entry covers "pneumonia"
25 |     header_name <- switch(outcome,
26 |         "heart attack" = "Hospital.30.Day.Death..Mortality..Rates.from.Heart.Attack",
27 |         "heart failure" = "Hospital.30.Day.Death..Mortality..Rates.from.Heart.Failure",
28 |         "Hospital.30.Day.Death..Mortality..Rates.from.Pneumonia")
29 |     
30 |     ## For each state, finds the hospital of the given rank
31 |     hosps <- lapply(valid_states, function(state) {
32 |         ## "best" is delegated to best(), which is defined outside this file
33 |         if (num == "best") return(best(state, outcome))
34 |         
35 |         data <- file_data[file_data$State == state,]
36 |         ## Drops unreported rates before converting, so as.numeric()
37 |         ## does not warn about "Not Available" for every state
38 |         data <- data[!data[,header_name] == "Not Available",]
39 |         rates <- as.numeric(as.character(data[,header_name]))
40 |         hosp_names <- as.character(data[,"Hospital.Name"])
41 |         names_by_rank <- hosp_names[order(rates, hosp_names)]
42 |         
43 |         ## An empty ranking yields NA so that every state keeps its row
44 |         if (length(names_by_rank) == 0) return(NA_character_)
45 |         if (num == "worst") return(tail(names_by_rank, n = 1))
46 |         names_by_rank[num]
47 |     })
48 |     result <- data.frame(unlist(hosps), valid_states)
49 |     colnames(result) <- c("hospital", "state")
50 |     return(result)
51 | }


--------------------------------------------------------------------------------
/rankhospital.R:
--------------------------------------------------------------------------------
 1 | rankhospital <- function(state, outcome, num = "best") {
 2 |     
 3 |     ## Finds the hospital holding the requested rank for 30-day
 4 |     ## mortality on the given outcome within one state.
 5 |     
 6 |     ## 'state' is a two-letter code from 'valid_states'; anything
 7 |     ## else stops with "invalid state"
 8 |     
 9 |     ## 'outcome' is one of "heart attack", "heart failure" or
10 |     ## "pneumonia"; anything else stops with "invalid outcome"
11 |     
12 |     ## 'num' is "best", "worst", or a position in the ranking
13 |     ## (lowest rate first, ties broken by hospital name)
14 |     
15 |     ## Returns the hospital name as a character vector; for "worst"
16 |     ## and positional ranks, NA when no hospital holds that rank
17 |     
18 |     ## Reads outcome data
19 |     file_data <- read.csv("outcome-of-care-measures.csv", sep = ",")
20 |     
21 |     ## Checks that state and outcome are valid
22 |     valid_states <- c("AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "FL", "GA", "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD", "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ", "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC", "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY")
23 |     valid_outcomes <- c("heart attack", "heart failure", "pneumonia")
24 |     if (!is.element(state, valid_states)) stop("invalid state")
25 |     if (!is.element(outcome, valid_outcomes)) stop("invalid outcome")
26 |     
27 |     ## "best" is delegated to best(), which is defined outside this file;
28 |     ## returning here skips a sort whose result would be discarded
29 |     if (num == "best") {
30 |         return(best(state, outcome))
31 |     }
32 |     
33 |     ## Maps the outcome to its column; the unnamed entry covers "pneumonia"
34 |     header_name <- switch(outcome,
35 |         "heart attack" = "Hospital.30.Day.Death..Mortality..Rates.from.Heart.Attack",
36 |         "heart failure" = "Hospital.30.Day.Death..Mortality..Rates.from.Heart.Failure",
37 |         "Hospital.30.Day.Death..Mortality..Rates.from.Pneumonia")
38 |     
39 |     data <- file_data[file_data$State == state,]
40 |     ## Drops unreported rates before converting, so as.numeric()
41 |     ## does not warn about "Not Available" on every call
42 |     data <- data[!data[,header_name] == "Not Available",]
43 |     rates <- as.numeric(as.character(data[,header_name]))
44 |     hosp_names <- as.character(data[,"Hospital.Name"])
45 |     names_by_rank <- hosp_names[order(rates, hosp_names)]
46 |     
47 |     ## An empty ranking has no hospital at any rank
48 |     if (length(names_by_rank) == 0) {
49 |         return(NA_character_)
50 |     }
51 |     if (num == "worst") {
52 |         return(tail(names_by_rank, n = 1))
53 |     }
54 |     return(names_by_rank[num])
55 | }


--------------------------------------------------------------------------------