├── README.md └── random_forest_demo.R /README.md: -------------------------------------------------------------------------------- 1 | This is the code from the StatQuest... 2 | 3 | * Random Forests in R: https://youtu.be/6EXPYzbfLCE 4 | -------------------------------------------------------------------------------- /random_forest_demo.R: -------------------------------------------------------------------------------- 1 | library(ggplot2) 2 | library(cowplot) 3 | library(randomForest) 4 | 5 | ## NOTE: The data used in this demo comes from the UCI machine learning 6 | ## repository. 7 | ## http://archive.ics.uci.edu/ml/index.php 8 | ## Specifically, this is the heart disease data set. 9 | ## http://archive.ics.uci.edu/ml/datasets/Heart+Disease 10 | 11 | url <- "http://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data" 12 | 13 | data <- read.csv(url, header=FALSE) 14 | 15 | ##################################### 16 | ## 17 | ## Reformat the data so that it is 18 | ## 1) Easy to use (add nice column names) 19 | ## 2) Interpreted correctly by randomForest.. 20 | ## 21 | ##################################### 22 | head(data) # you see data, but no column names 23 | 24 | colnames(data) <- c( 25 | "age", 26 | "sex",# 0 = female, 1 = male 27 | "cp", # chest pain 28 | # 1 = typical angina, 29 | # 2 = atypical angina, 30 | # 3 = non-anginal pain, 31 | # 4 = asymptomatic 32 | "trestbps", # resting blood pressure (in mm Hg) 33 | "chol", # serum cholestoral in mg/dl 34 | "fbs", # fasting blood sugar if less than 120 mg/dl, 1 = TRUE, 0 = FALSE 35 | "restecg", # resting electrocardiographic results 36 | # 1 = normal 37 | # 2 = having ST-T wave abnormality 38 | # 3 = showing probable or definite left ventricular hypertrophy 39 | "thalach", # maximum heart rate achieved 40 | "exang", # exercise induced angina, 1 = yes, 0 = no 41 | "oldpeak", # ST depression induced by exercise relative to rest 42 | "slope", # the slope of the peak exercise ST segment 43 | # 1 = upsloping 44 | # 2 = flat 45 | # 3 = downsloping 46 | "ca", # number of major vessels (0-3) colored by fluoroscopy 47 | "thal", # this is short of thalium heart scan 48 | # 3 = normal (no cold spots) 49 | # 6 = fixed defect (cold spots during rest and exercise) 50 | # 7 = reversible defect (when cold spots only appear during exercise) 51 | "hd" # (the predicted attribute) - diagnosis of heart disease 52 | # 0 if less than or equal to 50% diameter narrowing 53 | # 1 if greater than 50% diameter narrowing 54 | ) 55 | 56 | head(data) # now we have data and column names 57 | 58 | str(data) # this shows that we need to tell R which columns contain factors 59 | # it also shows us that there are some missing values. There are "?"s 60 | # in the dataset. 61 | 62 | ## First, replace "?"s with NAs. 63 | data[data == "?"] <- NA 64 | 65 | ## Now add factors for variables that are factors and clean up the factors 66 | ## that had missing data... 67 | data[data$sex == 0,]$sex <- "F" 68 | data[data$sex == 1,]$sex <- "M" 69 | data$sex <- as.factor(data$sex) 70 | 71 | data$cp <- as.factor(data$cp) 72 | data$fbs <- as.factor(data$fbs) 73 | data$restecg <- as.factor(data$restecg) 74 | data$exang <- as.factor(data$exang) 75 | data$slope <- as.factor(data$slope) 76 | 77 | data$ca <- as.integer(data$ca) # since this column had "?"s in it (which 78 | # we have since converted to NAs) R thinks that 79 | # the levels for the factor are strings, but 80 | # we know they are integers, so we'll first 81 | # convert the strings to integiers... 82 | data$ca <- as.factor(data$ca) # ...then convert the integers to factor levels 83 | 84 | data$thal <- as.integer(data$thal) # "thal" also had "?"s in it. 85 | data$thal <- as.factor(data$thal) 86 | 87 | ## This next line replaces 0 and 1 with "Healthy" and "Unhealthy" 88 | data$hd <- ifelse(test=data$hd == 0, yes="Healthy", no="Unhealthy") 89 | data$hd <- as.factor(data$hd) # Now convert to a factor 90 | 91 | str(data) ## this shows that the correct columns are factors and we've replaced 92 | ## "?"s with NAs because "?" no longer appears in the list of factors 93 | ## for "ca" and "thal" 94 | 95 | 96 | ##################################### 97 | ## 98 | ## Now we are ready to build a random forest. 99 | ## 100 | ##################################### 101 | set.seed(42) 102 | 103 | ## NOTE: For most machine learning methods, you need to divide the data 104 | ## manually into a "training" set and a "test" set. This allows you to train 105 | ## the method using the training data, and then test it on data it was not 106 | ## originally trained on. 107 | ## 108 | ## In contrast, Random Forests split the data into "training" and "test" sets 109 | ## for you. This is because Random Forests use bootstrapped 110 | ## data, and thus, not every sample is used to build every tree. The 111 | ## "training" dataset is the bootstrapped data and the "test" dataset is 112 | ## the remaining samples. The remaining samples are called 113 | ## the "Out-Of-Bag" (OOB) data. 114 | 115 | ## impute any missing values in the training set using proximities 116 | data.imputed <- rfImpute(hd ~ ., data = data, iter=6) 117 | ## NOTE: iter = the number of iterations to run. Breiman says 4 to 6 iterations 118 | ## is usually good enough. With this dataset, when we set iter=6, OOB-error 119 | ## bounces around between 17% and 18%. When we set iter=20, 120 | # set.seed(42) 121 | # data.imputed <- rfImpute(hd ~ ., data = data, iter=20) 122 | ## we get values a little better and a little worse, so doing more 123 | ## iterations doesn't improve the situation. 124 | ## 125 | ## NOTE: If you really want to micromanage how rfImpute(), 126 | ## you can change the number of trees it makes (the default is 300) and the 127 | ## number of variables that it will consider at each step. 128 | 129 | ## Now we are ready to build a random forest. 130 | 131 | ## NOTE: If the thing we're trying to predict (in this case it is 132 | ## whether or not someone has heart disease) is a continuous number 133 | ## (i.e. "weight" or "height"), then by default, randomForest() will set 134 | ## "mtry", the number of variables to consider at each step, 135 | ## to the total number of variables divided by 3 (rounded down), or to 1 136 | ## (if the division results in a value less than 1). 137 | ## If the thing we're trying to predict is a "factor" (i.e. either "yes/no" 138 | ## or "ranked"), then randomForest() will set mtry to 139 | ## the square root of the number of variables (rounded down to the next 140 | ## integer value). 141 | 142 | ## In this example, "hd", the thing we are trying to predict, is a factor and 143 | ## there are 13 variables. So by default, randomForest() will set 144 | ## mtry = sqrt(13) = 3.6 rounded down = 3 145 | ## Also, by default random forest generates 500 trees (NOTE: rfImpute() only 146 | ## generates 300 tress by default) 147 | model <- randomForest(hd ~ ., data=data.imputed, proximity=TRUE) 148 | 149 | ## RandomForest returns all kinds of things 150 | model # gives us an overview of the call, along with... 151 | # 1) The OOB error rate for the forest with ntree trees. 152 | # In this case ntree=500 by default 153 | # 2) The confusion matrix for the forest with ntree trees. 154 | # The confusion matrix is laid out like this: 155 | # 156 | # Healthy Unhealthy 157 | # -------------------------------------------------------------- 158 | # Healthy | Number of healthy people | Number of healthy people | 159 | # | correctly called "healthy" | incorectly called "unhealthy" | 160 | # | by the forest. | by the forest | 161 | # -------------------------------------------------------------- 162 | # Unhealthy| Number of unhealthy people | Number of unhealthy peole | 163 | # | incorrectly called | correctly called "unhealthy" | 164 | # | "healthy" by the forest | by the forest | 165 | # -------------------------------------------------------------- 166 | 167 | ## Now check to see if the random forest is actually big enough... 168 | ## Up to a point, the more trees in the forest, the better. You can tell when 169 | ## you've made enough when the OOB no longer improves. 170 | oob.error.data <- data.frame( 171 | Trees=rep(1:nrow(model$err.rate), times=3), 172 | Type=rep(c("OOB", "Healthy", "Unhealthy"), each=nrow(model$err.rate)), 173 | Error=c(model$err.rate[,"OOB"], 174 | model$err.rate[,"Healthy"], 175 | model$err.rate[,"Unhealthy"])) 176 | 177 | ggplot(data=oob.error.data, aes(x=Trees, y=Error)) + 178 | geom_line(aes(color=Type)) 179 | # ggsave("oob_error_rate_500_trees.pdf") 180 | 181 | ## Blue line = The error rate specifically for calling "Unheathly" patients that 182 | ## are OOB. 183 | ## 184 | ## Green line = The overall OOB error rate. 185 | ## 186 | ## Red line = The error rate specifically for calling "Healthy" patients 187 | ## that are OOB. 188 | 189 | ## NOTE: After building a random forest with 500 tress, the graph does not make 190 | ## it clear that the OOB-error has settled on a value or, if we added more 191 | ## trees, it would continue to decrease. 192 | ## So we do the whole thing again, but this time add more trees. 193 | 194 | model <- randomForest(hd ~ ., data=data.imputed, ntree=1000, proximity=TRUE) 195 | model 196 | 197 | oob.error.data <- data.frame( 198 | Trees=rep(1:nrow(model$err.rate), times=3), 199 | Type=rep(c("OOB", "Healthy", "Unhealthy"), each=nrow(model$err.rate)), 200 | Error=c(model$err.rate[,"OOB"], 201 | model$err.rate[,"Healthy"], 202 | model$err.rate[,"Unhealthy"])) 203 | 204 | ggplot(data=oob.error.data, aes(x=Trees, y=Error)) + 205 | geom_line(aes(color=Type)) 206 | # ggsave("oob_error_rate_1000_trees.pdf") 207 | 208 | ## After building a random forest with 1,000 trees, we get the same OOB-error 209 | ## 16.5% and we can see convergence in the graph. So we could have gotten 210 | ## away with only 500 trees, but we wouldn't have been sure that number 211 | ## was enough. 212 | 213 | 214 | ## If we want to compare this random forest to others with different values for 215 | ## mtry (to control how many variables are considered at each step)... 216 | oob.values <- vector(length=10) 217 | for(i in 1:10) { 218 | temp.model <- randomForest(hd ~ ., data=data.imputed, mtry=i, ntree=1000) 219 | oob.values[i] <- temp.model$err.rate[nrow(temp.model$err.rate),1] 220 | } 221 | oob.values 222 | ## find the minimum error 223 | min(oob.values) 224 | ## find the optimal value for mtry... 225 | which(oob.values == min(oob.values)) 226 | ## create a model for proximities using the best value for mtry 227 | model <- randomForest(hd ~ ., 228 | data=data.imputed, 229 | ntree=1000, 230 | proximity=TRUE, 231 | mtry=which(oob.values == min(oob.values))) 232 | 233 | ## Now let's create an MDS-plot to show how the samples are related to each 234 | ## other. 235 | ## 236 | ## Start by converting the proximity matrix into a distance matrix. 237 | distance.matrix <- as.dist(1-model$proximity) 238 | 239 | mds.stuff <- cmdscale(distance.matrix, eig=TRUE, x.ret=TRUE) 240 | 241 | ## calculate the percentage of variation that each MDS axis accounts for... 242 | mds.var.per <- round(mds.stuff$eig/sum(mds.stuff$eig)*100, 1) 243 | 244 | ## now make a fancy looking plot that shows the MDS axes and the variation: 245 | mds.values <- mds.stuff$points 246 | mds.data <- data.frame(Sample=rownames(mds.values), 247 | X=mds.values[,1], 248 | Y=mds.values[,2], 249 | Status=data.imputed$hd) 250 | 251 | ggplot(data=mds.data, aes(x=X, y=Y, label=Sample)) + 252 | geom_text(aes(color=Status)) + 253 | theme_bw() + 254 | xlab(paste("MDS1 - ", mds.var.per[1], "%", sep="")) + 255 | ylab(paste("MDS2 - ", mds.var.per[2], "%", sep="")) + 256 | ggtitle("MDS plot using (1 - Random Forest Proximities)") 257 | # ggsave(file="random_forest_mds_plot.pdf") 258 | --------------------------------------------------------------------------------