├── README.md
└── random_forest_demo.R


/README.md:
--------------------------------------------------------------------------------
1 | This is the code from the StatQuest...
2 | 
3 | * Random Forests in R: https://youtu.be/6EXPYzbfLCE
4 | 


--------------------------------------------------------------------------------
/random_forest_demo.R:
--------------------------------------------------------------------------------
  1 | library(ggplot2)
  2 | library(cowplot)
  3 | library(randomForest)
  4 | 
  5 | ## NOTE: The data used in this demo comes from the UCI machine learning
  6 | ## repository.
  7 | ## http://archive.ics.uci.edu/ml/index.php
  8 | ## Specifically, this is the heart disease data set.
  9 | ## http://archive.ics.uci.edu/ml/datasets/Heart+Disease
 10 | 
 11 | url <- "http://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"
 12 | 
 13 | data <- read.csv(url, header=FALSE)
 14 | 
 15 | #####################################
 16 | ##
 17 | ## Reformat the data so that it is 
 18 | ## 1) Easy to use (add nice column names)
 19 | ## 2) Interpreted correctly by randomForest..
 20 | ##
 21 | #####################################
 22 | head(data) # you see data, but no column names
 23 | 
 24 | colnames(data) <- c(
 25 |   "age",
 26 |   "sex",# 0 = female, 1 = male
 27 |   "cp", # chest pain 
 28 |           # 1 = typical angina, 
 29 |           # 2 = atypical angina, 
 30 |           # 3 = non-anginal pain, 
 31 |           # 4 = asymptomatic
 32 |   "trestbps", # resting blood pressure (in mm Hg)
 33 |   "chol", # serum cholestoral in mg/dl
 34 |   "fbs",  # fasting blood sugar if less than 120 mg/dl, 1 = TRUE, 0 = FALSE
 35 |   "restecg", # resting electrocardiographic results
 36 |           # 1 = normal
 37 |           # 2 = having ST-T wave abnormality
 38 |           # 3 = showing probable or definite left ventricular hypertrophy
 39 |   "thalach", # maximum heart rate achieved
 40 |   "exang",   # exercise induced angina, 1 = yes, 0 = no
 41 |   "oldpeak", # ST depression induced by exercise relative to rest
 42 |   "slope", # the slope of the peak exercise ST segment 
 43 |           # 1 = upsloping 
 44 |           # 2 = flat 
 45 |           # 3 = downsloping 
 46 |   "ca", # number of major vessels (0-3) colored by fluoroscopy
 47 |   "thal", # this is short of thalium heart scan
 48 |           # 3 = normal (no cold spots)
 49 |           # 6 = fixed defect (cold spots during rest and exercise)
 50 |           # 7 = reversible defect (when cold spots only appear during exercise)
 51 |   "hd" # (the predicted attribute) - diagnosis of heart disease 
 52 |           # 0 if less than or equal to 50% diameter narrowing
 53 |           # 1 if greater than 50% diameter narrowing
 54 |   )
 55 | 
 56 | head(data) # now we have data and column names
 57 | 
 58 | str(data) # this shows that we need to tell R which columns contain factors
 59 |           # it also shows us that there are some missing values. There are "?"s
 60 |           # in the dataset.
 61 | 
 62 | ## First, replace "?"s with NAs.
 63 | data[data == "?"] <- NA
 64 | 
 65 | ## Now add factors for variables that are factors and clean up the factors
 66 | ## that had missing data...
 67 | data[data$sex == 0,]$sex <- "F"
 68 | data[data$sex == 1,]$sex <- "M"
 69 | data$sex <- as.factor(data$sex)
 70 | 
 71 | data$cp <- as.factor(data$cp)
 72 | data$fbs <- as.factor(data$fbs)
 73 | data$restecg <- as.factor(data$restecg)
 74 | data$exang <- as.factor(data$exang)
 75 | data$slope <- as.factor(data$slope)
 76 | 
 77 | data$ca <- as.integer(data$ca) # since this column had "?"s in it (which
 78 |                                # we have since converted to NAs) R thinks that
 79 |                                # the levels for the factor are strings, but
 80 |                                # we know they are integers, so we'll first
 81 |                                # convert the strings to integiers...
 82 | data$ca <- as.factor(data$ca)  # ...then convert the integers to factor levels
 83 | 
 84 | data$thal <- as.integer(data$thal) # "thal" also had "?"s in it.
 85 | data$thal <- as.factor(data$thal)
 86 | 
 87 | ## This next line replaces 0 and 1 with "Healthy" and "Unhealthy"
 88 | data$hd <- ifelse(test=data$hd == 0, yes="Healthy", no="Unhealthy")
 89 | data$hd <- as.factor(data$hd) # Now convert to a factor
 90 | 
 91 | str(data) ## this shows that the correct columns are factors and we've replaced
 92 |   ## "?"s with NAs because "?" no longer appears in the list of factors
 93 |   ## for "ca" and "thal"
 94 | 
 95 | 
 96 | #####################################
 97 | ##
 98 | ## Now we are ready to build a random forest.
 99 | ##
100 | #####################################
101 | set.seed(42)
102 | 
103 | ## NOTE: For most machine learning methods, you need to divide the data
104 | ## manually into a "training" set and a "test" set. This allows you to train 
105 | ## the method using the training data, and then test it on data it was not
106 | ## originally trained on. 
107 | ##
108 | ## In contrast, Random Forests split the data into "training" and "test" sets 
109 | ## for you. This is because Random Forests use bootstrapped
110 | ## data, and thus, not every sample is used to build every tree. The 
111 | ## "training" dataset is the bootstrapped data and the "test" dataset is
112 | ## the remaining samples. The remaining samples are called
113 | ## the "Out-Of-Bag" (OOB) data.
114 | 
115 | ## impute any missing values in the training set using proximities
116 | data.imputed <- rfImpute(hd ~ ., data = data, iter=6)
117 | ## NOTE: iter = the number of iterations to run. Breiman says 4 to 6 iterations
118 | ## is usually good enough. With this dataset, when we set iter=6, OOB-error
119 | ## bounces around between 17% and 18%. When we set iter=20, 
120 | # set.seed(42)
121 | # data.imputed <- rfImpute(hd ~ ., data = data, iter=20)
122 | ## we get values a little better and a little worse, so doing more 
123 | ## iterations doesn't improve the situation.
124 | ##
125 | ## NOTE: If you really want to micromanage how rfImpute(),
126 | ## you can change the number of trees it makes (the default is 300) and the
127 | ## number of variables that it will consider at each step.
128 | 
129 | ## Now we are ready to build a random forest.
130 | 
131 | ## NOTE: If the thing we're trying to predict (in this case it is 
132 | ## whether or not someone has heart disease) is a continuous number 
133 | ## (i.e. "weight" or "height"), then by default, randomForest() will set 
134 | ## "mtry", the number of variables to consider at each step, 
135 | ## to the total number of variables divided by 3 (rounded down), or to 1 
136 | ## (if the division results in a value less than 1).
137 | ## If the thing we're trying to predict is a "factor" (i.e. either "yes/no"
138 | ## or "ranked"), then randomForest() will set mtry to 
139 | ## the square root of the number of variables (rounded down to the next
140 | ## integer value).
141 | 
142 | ## In this example, "hd", the thing we are trying to predict, is a factor and
143 | ## there are 13 variables. So by default, randomForest() will set 
144 | ## mtry = sqrt(13) = 3.6 rounded down = 3
145 | ## Also, by default random forest generates 500 trees (NOTE: rfImpute() only
146 | ## generates 300 tress by default)
147 | model <- randomForest(hd ~ ., data=data.imputed, proximity=TRUE)
148 | 
149 | ## RandomForest returns all kinds of things
150 | model # gives us an overview of the call, along with...
151 |       # 1) The OOB error rate for the forest with ntree trees. 
152 |       #    In this case ntree=500 by default
153 |       # 2) The confusion matrix for the forest with ntree trees.
154 |       #    The confusion matrix is laid out like this:
155 | #          
156 | #                Healthy                      Unhealthy
157 | #          --------------------------------------------------------------
158 | # Healthy  | Number of healthy people   | Number of healthy people      |
159 | #          | correctly called "healthy" | incorectly called "unhealthy" |
160 | #          | by the forest.             | by the forest                 |
161 | #          --------------------------------------------------------------
162 | # Unhealthy| Number of unhealthy people | Number of unhealthy peole     |
163 | #          | incorrectly called         | correctly called "unhealthy"  |
164 | #          | "healthy" by the forest    | by the forest                 |
165 | #          --------------------------------------------------------------
166 | 
167 | ## Now check to see if the random forest is actually big enough...
168 | ## Up to a point, the more trees in the forest, the better. You can tell when
169 | ## you've made enough when the OOB no longer improves.
170 | oob.error.data <- data.frame(
171 |   Trees=rep(1:nrow(model$err.rate), times=3),
172 |   Type=rep(c("OOB", "Healthy", "Unhealthy"), each=nrow(model$err.rate)),
173 |   Error=c(model$err.rate[,"OOB"], 
174 |     model$err.rate[,"Healthy"], 
175 |     model$err.rate[,"Unhealthy"]))
176 | 
177 | ggplot(data=oob.error.data, aes(x=Trees, y=Error)) +
178 |   geom_line(aes(color=Type))
179 | # ggsave("oob_error_rate_500_trees.pdf")
180 | 
181 | ## Blue line = The error rate specifically for calling "Unheathly" patients that
182 | ## are OOB.
183 | ##
184 | ## Green line = The overall OOB error rate.
185 | ##
186 | ## Red line = The error rate specifically for calling "Healthy" patients 
187 | ## that are OOB.
188 | 
189 | ## NOTE: After building a random forest with 500 tress, the graph does not make 
190 | ## it clear that the OOB-error has settled on a value or, if we added more 
191 | ## trees, it would continue to decrease.
192 | ## So we do the whole thing again, but this time add more trees.
193 | 
194 | model <- randomForest(hd ~ ., data=data.imputed, ntree=1000, proximity=TRUE)
195 | model
196 | 
197 | oob.error.data <- data.frame(
198 |   Trees=rep(1:nrow(model$err.rate), times=3),
199 |   Type=rep(c("OOB", "Healthy", "Unhealthy"), each=nrow(model$err.rate)),
200 |   Error=c(model$err.rate[,"OOB"], 
201 |     model$err.rate[,"Healthy"], 
202 |     model$err.rate[,"Unhealthy"]))
203 | 
204 | ggplot(data=oob.error.data, aes(x=Trees, y=Error)) +
205 |   geom_line(aes(color=Type))
206 | # ggsave("oob_error_rate_1000_trees.pdf")
207 | 
208 | ## After building a random forest with 1,000 trees, we get the same OOB-error
209 | ## 16.5% and we can see convergence in the graph. So we could have gotten
210 | ## away with only 500 trees, but we wouldn't have been sure that number
211 | ## was enough.
212 | 
213 | 
214 | ## If we want to compare this random forest to others with different values for
215 | ## mtry (to control how many variables are considered at each step)...
216 | oob.values <- vector(length=10)
217 | for(i in 1:10) {
218 |   temp.model <- randomForest(hd ~ ., data=data.imputed, mtry=i, ntree=1000)
219 |   oob.values[i] <- temp.model$err.rate[nrow(temp.model$err.rate),1]
220 | }
221 | oob.values
222 | ## find the minimum error
223 | min(oob.values)
224 | ## find the optimal value for mtry...
225 | which(oob.values == min(oob.values))
226 | ## create a model for proximities using the best value for mtry
227 | model <- randomForest(hd ~ ., 
228 |                       data=data.imputed,
229 |                       ntree=1000, 
230 |                       proximity=TRUE, 
231 |                       mtry=which(oob.values == min(oob.values)))
232 | 
233 | ## Now let's create an MDS-plot to show how the samples are related to each 
234 | ## other.
235 | ##
236 | ## Start by converting the proximity matrix into a distance matrix.
237 | distance.matrix <- as.dist(1-model$proximity)
238 | 
239 | mds.stuff <- cmdscale(distance.matrix, eig=TRUE, x.ret=TRUE)
240 | 
241 | ## calculate the percentage of variation that each MDS axis accounts for...
242 | mds.var.per <- round(mds.stuff$eig/sum(mds.stuff$eig)*100, 1)
243 | 
244 | ## now make a fancy looking plot that shows the MDS axes and the variation:
245 | mds.values <- mds.stuff$points
246 | mds.data <- data.frame(Sample=rownames(mds.values),
247 |   X=mds.values[,1],
248 |   Y=mds.values[,2],
249 |   Status=data.imputed$hd)
250 | 
251 | ggplot(data=mds.data, aes(x=X, y=Y, label=Sample)) + 
252 |   geom_text(aes(color=Status)) +
253 |   theme_bw() +
254 |   xlab(paste("MDS1 - ", mds.var.per[1], "%", sep="")) +
255 |   ylab(paste("MDS2 - ", mds.var.per[2], "%", sep="")) +
256 |   ggtitle("MDS plot using (1 - Random Forest Proximities)")
257 | # ggsave(file="random_forest_mds_plot.pdf")
258 | 


--------------------------------------------------------------------------------