├── classification-tree-and-rule-based-models ├── test ├── random_forests_143.R └── classification_trees_rules_142.R ├── README.md ├── linear-regression-and-cousins ├── IR_data_61_files │ └── figure-html │ │ ├── unnamed-chunk-2-1.png │ │ ├── unnamed-chunk-2-2.png │ │ ├── unnamed-chunk-4-1.png │ │ ├── unnamed-chunk-4-2.png │ │ ├── unnamed-chunk-4-3.png │ │ ├── unnamed-chunk-5-1.png │ │ ├── unnamed-chunk-5-2.png │ │ ├── unnamed-chunk-5-3.png │ │ ├── unnamed-chunk-5-4.png │ │ ├── unnamed-chunk-5-5.png │ │ ├── unnamed-chunk-5-6.png │ │ └── unnamed-chunk-7-1.png ├── permeability_data_62_files │ └── figure-html │ │ ├── unnamed-chunk-2-1.png │ │ ├── unnamed-chunk-3-1.png │ │ ├── unnamed-chunk-3-2.png │ │ ├── unnamed-chunk-3-3.png │ │ ├── unnamed-chunk-3-4.png │ │ └── unnamed-chunk-4-1.png ├── permeability_data_62.Rmd ├── chem_manufact_data_63.R ├── IR_data_61.Rmd ├── permeability_data_62.md └── IR_data_61.md ├── measuring-predictor-importance ├── oil_data_182_files │ └── figure-html │ │ ├── unnamed-chunk-2-1.png │ │ ├── unnamed-chunk-2-2.png │ │ └── unnamed-chunk-6-1.png ├── abalone_data_183_files │ └── figure-html │ │ ├── unnamed-chunk-2-1.png │ │ ├── unnamed-chunk-2-2.png │ │ ├── unnamed-chunk-3-1.png │ │ ├── unnamed-chunk-3-2.png │ │ ├── unnamed-chunk-4-1.png │ │ └── unnamed-chunk-4-2.png ├── churn_data_181_files │ └── figure-html │ │ ├── unnamed-chunk-1-1.png │ │ ├── unnamed-chunk-10-1.png │ │ ├── unnamed-chunk-11-1.png │ │ ├── unnamed-chunk-2-1.png │ │ ├── unnamed-chunk-4-1.png │ │ └── unnamed-chunk-9-1.png ├── oil_data_182.Rmd ├── oil_data_182.md ├── abalone_data_183.Rmd ├── churn_data_181.Rmd ├── abalone_data_183.md └── churn_data_181.md ├── non-linear-regression-models ├── svm_simulation_71.R └── friedman_data_72.R ├── linear-classification-models └── linear_classification_hepatic_121.R └── data-pre-processing └── data_pre_proc.R /classification-tree-and-rule-based-models/test: -------------------------------------------------------------------------------- 1 | hello 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Applied-Predictive-Modeling 2 | Exercises and R code related to the book Applied Predictive Modeling by Max Kuhn and Kjell Johnson 3 | -------------------------------------------------------------------------------- /linear-regression-and-cousins/IR_data_61_files/figure-html/unnamed-chunk-2-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oizin/applied-predictive-modeling/HEAD/linear-regression-and-cousins/IR_data_61_files/figure-html/unnamed-chunk-2-1.png -------------------------------------------------------------------------------- /linear-regression-and-cousins/IR_data_61_files/figure-html/unnamed-chunk-2-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oizin/applied-predictive-modeling/HEAD/linear-regression-and-cousins/IR_data_61_files/figure-html/unnamed-chunk-2-2.png -------------------------------------------------------------------------------- /linear-regression-and-cousins/IR_data_61_files/figure-html/unnamed-chunk-4-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oizin/applied-predictive-modeling/HEAD/linear-regression-and-cousins/IR_data_61_files/figure-html/unnamed-chunk-4-1.png 
-------------------------------------------------------------------------------- /linear-regression-and-cousins/IR_data_61_files/figure-html/unnamed-chunk-4-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oizin/applied-predictive-modeling/HEAD/linear-regression-and-cousins/IR_data_61_files/figure-html/unnamed-chunk-4-2.png -------------------------------------------------------------------------------- /linear-regression-and-cousins/IR_data_61_files/figure-html/unnamed-chunk-4-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oizin/applied-predictive-modeling/HEAD/linear-regression-and-cousins/IR_data_61_files/figure-html/unnamed-chunk-4-3.png -------------------------------------------------------------------------------- /linear-regression-and-cousins/IR_data_61_files/figure-html/unnamed-chunk-5-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oizin/applied-predictive-modeling/HEAD/linear-regression-and-cousins/IR_data_61_files/figure-html/unnamed-chunk-5-1.png -------------------------------------------------------------------------------- /linear-regression-and-cousins/IR_data_61_files/figure-html/unnamed-chunk-5-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oizin/applied-predictive-modeling/HEAD/linear-regression-and-cousins/IR_data_61_files/figure-html/unnamed-chunk-5-2.png -------------------------------------------------------------------------------- /linear-regression-and-cousins/IR_data_61_files/figure-html/unnamed-chunk-5-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oizin/applied-predictive-modeling/HEAD/linear-regression-and-cousins/IR_data_61_files/figure-html/unnamed-chunk-5-3.png -------------------------------------------------------------------------------- /linear-regression-and-cousins/IR_data_61_files/figure-html/unnamed-chunk-5-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oizin/applied-predictive-modeling/HEAD/linear-regression-and-cousins/IR_data_61_files/figure-html/unnamed-chunk-5-4.png -------------------------------------------------------------------------------- /linear-regression-and-cousins/IR_data_61_files/figure-html/unnamed-chunk-5-5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oizin/applied-predictive-modeling/HEAD/linear-regression-and-cousins/IR_data_61_files/figure-html/unnamed-chunk-5-5.png -------------------------------------------------------------------------------- /linear-regression-and-cousins/IR_data_61_files/figure-html/unnamed-chunk-5-6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oizin/applied-predictive-modeling/HEAD/linear-regression-and-cousins/IR_data_61_files/figure-html/unnamed-chunk-5-6.png -------------------------------------------------------------------------------- /linear-regression-and-cousins/IR_data_61_files/figure-html/unnamed-chunk-7-1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/oizin/applied-predictive-modeling/HEAD/linear-regression-and-cousins/IR_data_61_files/figure-html/unnamed-chunk-7-1.png -------------------------------------------------------------------------------- /measuring-predictor-importance/oil_data_182_files/figure-html/unnamed-chunk-2-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oizin/applied-predictive-modeling/HEAD/measuring-predictor-importance/oil_data_182_files/figure-html/unnamed-chunk-2-1.png -------------------------------------------------------------------------------- /measuring-predictor-importance/oil_data_182_files/figure-html/unnamed-chunk-2-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oizin/applied-predictive-modeling/HEAD/measuring-predictor-importance/oil_data_182_files/figure-html/unnamed-chunk-2-2.png -------------------------------------------------------------------------------- /measuring-predictor-importance/oil_data_182_files/figure-html/unnamed-chunk-6-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oizin/applied-predictive-modeling/HEAD/measuring-predictor-importance/oil_data_182_files/figure-html/unnamed-chunk-6-1.png -------------------------------------------------------------------------------- /measuring-predictor-importance/abalone_data_183_files/figure-html/unnamed-chunk-2-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oizin/applied-predictive-modeling/HEAD/measuring-predictor-importance/abalone_data_183_files/figure-html/unnamed-chunk-2-1.png -------------------------------------------------------------------------------- /measuring-predictor-importance/abalone_data_183_files/figure-html/unnamed-chunk-2-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oizin/applied-predictive-modeling/HEAD/measuring-predictor-importance/abalone_data_183_files/figure-html/unnamed-chunk-2-2.png -------------------------------------------------------------------------------- /measuring-predictor-importance/abalone_data_183_files/figure-html/unnamed-chunk-3-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oizin/applied-predictive-modeling/HEAD/measuring-predictor-importance/abalone_data_183_files/figure-html/unnamed-chunk-3-1.png -------------------------------------------------------------------------------- /measuring-predictor-importance/abalone_data_183_files/figure-html/unnamed-chunk-3-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oizin/applied-predictive-modeling/HEAD/measuring-predictor-importance/abalone_data_183_files/figure-html/unnamed-chunk-3-2.png -------------------------------------------------------------------------------- /measuring-predictor-importance/abalone_data_183_files/figure-html/unnamed-chunk-4-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oizin/applied-predictive-modeling/HEAD/measuring-predictor-importance/abalone_data_183_files/figure-html/unnamed-chunk-4-1.png -------------------------------------------------------------------------------- 
/measuring-predictor-importance/abalone_data_183_files/figure-html/unnamed-chunk-4-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oizin/applied-predictive-modeling/HEAD/measuring-predictor-importance/abalone_data_183_files/figure-html/unnamed-chunk-4-2.png -------------------------------------------------------------------------------- /measuring-predictor-importance/churn_data_181_files/figure-html/unnamed-chunk-1-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oizin/applied-predictive-modeling/HEAD/measuring-predictor-importance/churn_data_181_files/figure-html/unnamed-chunk-1-1.png -------------------------------------------------------------------------------- /measuring-predictor-importance/churn_data_181_files/figure-html/unnamed-chunk-10-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oizin/applied-predictive-modeling/HEAD/measuring-predictor-importance/churn_data_181_files/figure-html/unnamed-chunk-10-1.png -------------------------------------------------------------------------------- /measuring-predictor-importance/churn_data_181_files/figure-html/unnamed-chunk-11-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oizin/applied-predictive-modeling/HEAD/measuring-predictor-importance/churn_data_181_files/figure-html/unnamed-chunk-11-1.png -------------------------------------------------------------------------------- /measuring-predictor-importance/churn_data_181_files/figure-html/unnamed-chunk-2-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oizin/applied-predictive-modeling/HEAD/measuring-predictor-importance/churn_data_181_files/figure-html/unnamed-chunk-2-1.png -------------------------------------------------------------------------------- /measuring-predictor-importance/churn_data_181_files/figure-html/unnamed-chunk-4-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oizin/applied-predictive-modeling/HEAD/measuring-predictor-importance/churn_data_181_files/figure-html/unnamed-chunk-4-1.png -------------------------------------------------------------------------------- /measuring-predictor-importance/churn_data_181_files/figure-html/unnamed-chunk-9-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oizin/applied-predictive-modeling/HEAD/measuring-predictor-importance/churn_data_181_files/figure-html/unnamed-chunk-9-1.png -------------------------------------------------------------------------------- /linear-regression-and-cousins/permeability_data_62_files/figure-html/unnamed-chunk-2-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oizin/applied-predictive-modeling/HEAD/linear-regression-and-cousins/permeability_data_62_files/figure-html/unnamed-chunk-2-1.png -------------------------------------------------------------------------------- /linear-regression-and-cousins/permeability_data_62_files/figure-html/unnamed-chunk-3-1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/oizin/applied-predictive-modeling/HEAD/linear-regression-and-cousins/permeability_data_62_files/figure-html/unnamed-chunk-3-1.png -------------------------------------------------------------------------------- /linear-regression-and-cousins/permeability_data_62_files/figure-html/unnamed-chunk-3-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oizin/applied-predictive-modeling/HEAD/linear-regression-and-cousins/permeability_data_62_files/figure-html/unnamed-chunk-3-2.png -------------------------------------------------------------------------------- /linear-regression-and-cousins/permeability_data_62_files/figure-html/unnamed-chunk-3-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oizin/applied-predictive-modeling/HEAD/linear-regression-and-cousins/permeability_data_62_files/figure-html/unnamed-chunk-3-3.png -------------------------------------------------------------------------------- /linear-regression-and-cousins/permeability_data_62_files/figure-html/unnamed-chunk-3-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oizin/applied-predictive-modeling/HEAD/linear-regression-and-cousins/permeability_data_62_files/figure-html/unnamed-chunk-3-4.png -------------------------------------------------------------------------------- /linear-regression-and-cousins/permeability_data_62_files/figure-html/unnamed-chunk-4-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oizin/applied-predictive-modeling/HEAD/linear-regression-and-cousins/permeability_data_62_files/figure-html/unnamed-chunk-4-1.png -------------------------------------------------------------------------------- /non-linear-regression-models/svm_simulation_71.R: -------------------------------------------------------------------------------- 1 | # Chapter 7: Non-linear Regression models 2 | # 3 | # Script based on Ex 7.1 of Applied Predictive Modeling (Kuhn & Johnson, 2013) 4 | # Exercise covers data simulation and the SVM tuning parameters 5 | # 6 | ## SVM Notes: 7 | # Can be seen as a form of robust regression. 8 | # e-insensitive regression: data points with residuals inside the e boundary do not 9 | # contribute to the model while those outside contribute a linearly scaled amount. 10 | # MIN Cost*SUM[L(y - y_hat)] + SUM(beta^2); L is the e-insensitive function. 11 | # An over-parameterised model, with as many beta as points; however these are zero for 12 | # points inside the e boundary. 13 | # The support vectors are those points with non-zero beta (or alpha!). 14 | # New points enter via dot products with the support vectors. An extension to non-linear 15 | # situations is to use another kernel function around the dot product (e.g. radial 16 | # basis). Non-linear kernels have scaling parameters requiring tuning.
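#
# For concreteness, a minimal sketch of the e-insensitive loss described above
# (an illustration, not from the book or from kernlab); epsilon is the
# half-width of the no-penalty boundary around the fitted curve.
eps_insensitive <- function(resid, epsilon) {
  pmax(abs(resid) - epsilon, 0)  # zero inside the boundary, linear outside
}
# e.g. eps_insensitive(c(-0.3, 0.05, 0.8), epsilon = 0.1) returns 0.2, 0.0, 0.7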
17 | # 18 | # 19 | # Packages ==================================================================== 20 | library(kernlab) 21 | library(RColorBrewer) 22 | 23 | # Simulate the data =========================================================== 24 | set.seed(100) 25 | x <- runif(100, min = 2, max = 10) 26 | y <- sin(x) + rnorm(length(x)) * .25 27 | sin_data <- data.frame(x = x, y = y) 28 | plot(x, y) 29 | 30 | # Create a grid of x values to use for prediction 31 | data_grid <- data.frame(x = seq(2, 10, length = 100)) 32 | 33 | # Fit different models using a radial basis function and different values of === 34 | # the cost (the C parameter) 35 | 36 | mypalette <- brewer.pal(9, "Greens") 37 | 38 | par(mar=c(5.1, 4.1, 4.1, 8.1), xpd=TRUE) 39 | plot(x, y) 40 | for (i in 1:9) { 41 | radial_SVM <- ksvm(x = x, y = y, data = sin_data, 42 | kernel ="rbfdot", kpar = "automatic", 43 | C = i/2, epsilon = 0.1) # move the cost from 0.5 to 4.5 44 | model_preds <- predict(radial_SVM, newdata = data_grid) 45 | points(x = data_grid$x, y = model_preds[,1], type = "l", col = mypalette[i]) 46 | } 47 | legend("topright", inset=c(-0.2,0), lty = 1, legend=c(seq(0.5, 4.5, by = 0.5)), 48 | col = mypalette[1:9], title="Cost", cex = 0.5) 49 | par(xpd = FALSE) 50 | points(seq(1, 11, by = 0.1), sin(seq(1, 11, by = 0.1)), type = "l", lwd = 2) 51 | 52 | # Fit different models using a radial basis function and different values of === 53 | # the epsilon boundary parameter 54 | 55 | mypalette <- brewer.pal(9, "Blues") 56 | 57 | par(mar=c(5.1, 4.1, 4.1, 8.1), xpd=TRUE) 58 | plot(x, y, bty='L') 59 | for (i in 1:9) { 60 | radial_SVM <- ksvm(x = x, y = y, data = sin_data, 61 | kernel ="rbfdot", kpar = "automatic", 62 | C = 1, epsilon = i/10) # move epsilon from 0.1 to 0.9 63 | model_preds <- predict(radial_SVM, newdata = data_grid) 64 | points(x = data_grid$x, y = model_preds[,1], type = "l", 65 | col = mypalette[i], 66 | lwd = 2) 67 | } 68 | legend("topright", inset=c(-0.2,0), lty = 1, legend=c(seq(0.1, 0.9, by = 0.1)), 69 | col = mypalette[1:9], title="Epsilon", cex = 0.5) 70 | par(xpd = FALSE) 71 | points(seq(1, 11, by = 0.1), sin(seq(1, 11, by = 0.1)), type = "l", lwd = 1) 72 | 73 | # Fit different models using a radial basis function and different values of === 74 | # sigma (the scaling parameter) 75 | 76 | mypalette <- brewer.pal(9, "Reds") 77 | 78 | par(mar=c(5.1, 4.1, 4.1, 8.1), xpd=TRUE) 79 | plot(x, y) 80 | for (i in 1:9) { 81 | radial_SVM <- ksvm(x = x, y = y, data = sin_data, 82 | kernel ="rbfdot", kpar = list(sigma = i), # move sigma from 1 to 9 83 | C = 1, epsilon = 0.1) 84 | model_preds <- predict(radial_SVM, newdata = data_grid) 85 | points(x = data_grid$x, y = model_preds[,1], type = "l", 86 | col = mypalette[i], 87 | lwd = 2) 88 | } 89 | legend("topright", inset=c(-0.2,0), lty = 1, legend=c(seq(1, 9, by = 1)), 90 | col = mypalette[1:9], title="Sigma", cex = 0.5) 91 | par(xpd = FALSE) 92 | points(seq(1, 11, by = 0.1), sin(seq(1, 11, by = 0.1)), type = "l", lwd = 1) 93 | 94 | 95 | # sigma determines the extent of smoothing; if zero we are left with only beta_0...
96 | # Cost determines the impact of errors on the model; e determines the number of support vectors 97 | # the book suggests fixing e and tuning the cost (there is a relationship between the two) 98 | -------------------------------------------------------------------------------- /non-linear-regression-models/friedman_data_72.R: -------------------------------------------------------------------------------- 1 | # Chapter 7: Non-linear Regression models 2 | # 3 | # Script based on Ex 7.2 of Applied Predictive Modeling (Kuhn & Johnson, 2013) 4 | # Exercise covers data simulation and model fitting 5 | # 6 | # 7 | # Packages ==================================================================== 8 | library(mlbench) 9 | library(caret) 10 | library(reshape2) 11 | 12 | # Simulate the data =========================================================== 13 | set.seed(200) 14 | training <- mlbench.friedman1(200, sd = 1) # data is in list format 15 | training$x <- data.frame(training$x) # convert predictor matrix to dataframe 16 | 17 | # slight pattern in some of the generating variables X1 through X5 compared to the 18 | # noise variables X6 through X10 19 | featurePlot(training$x, training$y) 20 | 21 | # simulate a large test set for greater precision in error estimation 22 | test <- mlbench.friedman1(5000, sd = 1) 23 | test$x <- data.frame(test$x) 24 | 25 | # Fit and test a KNN model ====================================================== 26 | ctrl <- trainControl(method = 'repeatedcv', number = 10, repeats = 5) 27 | 28 | set.seed(100) 29 | knn_model <- train(x = training$x, 30 | y = training$y, 31 | method = "knn", 32 | preProc = c("center", "scale"), 33 | trControl = ctrl, 34 | tuneLength = 10) 35 | knn_model 36 | 37 | knn_pred <- predict(knn_model, newdata = test$x) 38 | postResample(pred = knn_pred, obs = test$y) 39 | 40 | # the predictions are within a narrower range, tendency to underpredict high values 41 | # and overpredict low values 42 | plot(x = knn_pred, y = test$y, xlab = 'prediction', ylab = 'observed') 43 | abline(a = 0, b = 1, lty = 2, col = 2) 44 | 45 | # Fit and test a SVM model ====================================================== 46 | set.seed(100) 47 | svmR_model <- train(x = training$x, 48 | y = training$y, 49 | method = "svmRadial", 50 | preProc = c("center", "scale"), 51 | trControl = ctrl, 52 | tuneLength = 10) 53 | svmR_model 54 | svmR_model$finalModel # epsilon = 0.1 55 | 56 | svmR_pred <- predict(svmR_model, newdata = test$x) 57 | postResample(pred = svmR_pred, obs = test$y) 58 | 59 | # Much better fit than the knn model 60 | plot(x = svmR_pred, y = test$y, xlab = 'prediction', ylab = 'observed', 61 | main = 'SVM model') 62 | abline(a = 0, b = 1, lty = 2, col = 2) 63 | 64 | # Fit and test a MARS model ==================================================== 65 | set.seed(100) 66 | mars_grid <- expand.grid(.degree = 1:2, .nprune = seq(7, 20, by = 2)) 67 | mars_model <- train(x = training$x, 68 | y = training$y, 69 | method = "earth", 70 | preProc = c("center", "scale"), 71 | trControl = ctrl, 72 | tuneGrid = mars_grid) 73 | mars_model 74 | 75 | # Model differentiated signal from noise variables, only X6 was not pruned 76 | # from the models (and then only for degree = 1) 77 | plotmo::plotmo(mars_model$finalModel) # plotmo is installed alongside earth 78 | varImp(mars_model) 79 | 80 | mars_pred <- predict(mars_model, test$x) 81 | postResample(pred = mars_pred, obs = test$y) 82 | 83 | # Close fit...
84 | plot(x = mars_pred, y = test$y, xlab = 'prediction', ylab = 'observed', 85 | main = 'MARS model') 86 | abline(a = 0, b = 1, lty = 2, col = 2) 87 | 88 | # Fit and test a Neural Networks model ========================================= 89 | cor(training$x) 90 | 91 | nnet_grid <- expand.grid(.decay = seq(0.5, 1.5, by = 0.1), # book recommends decay is 92 | .size = seq(1, 15, by = 2), .bag = FALSE) # between 0 and 0.1 93 | set.seed(100) 94 | nnet_model <- train(x = training$x, 95 | y = training$y, 96 | method = "avNNet", # avNNet performs model averaging 97 | preProc = c("center", "scale"), 98 | trControl = ctrl, 99 | tuneGrid = nnet_grid, 100 | linout = TRUE, 101 | maxit = 500) 102 | nnet_model 103 | nnet_model$finalModel # there are 5 final models 104 | 105 | nnet_pred <- predict(nnet_model, test$x) 106 | postResample(pred = nnet_pred, obs = test$y) 107 | 108 | # Slight tendency, as with knn, towards a thinner tailed prediction distribution 109 | # than actually observed 110 | plot(x = nnet_pred, y = test$y, xlab = 'prediction', ylab = 'observed', 111 | main = 'Neural Networks model') 112 | abline(a = 0, b = 1, lty = 2, col = 2) 113 | 114 | nnet_pred_dist <- data.frame(training_data = training$y, predictions = nnet_pred, 115 | population_data = test$y) 116 | nnet_pred_dist <- melt(nnet_pred_dist); str(nnet_pred_dist) 117 | ggplot(data = nnet_pred_dist, aes(value, ..density.., colour = variable)) + 118 | geom_density() + 119 | theme_bw() 120 | 121 | -------------------------------------------------------------------------------- /classification-tree-and-rule-based-models/random_forests_143.R: -------------------------------------------------------------------------------- 1 | # Classification Trees and Rule Based Models 2 | # 3 | # Script based on Ex 14.3 of Applied Predictive Modeling (Kuhn & Johnson, 2013) 4 | # 5 | # AIM: investigate differences between CART and conditional inference random 6 | # forest models 7 | # 8 | # In particular: 9 | # a) The resulting models - 10-fold cv performance, optimal mtry 10 | # b) Fitting time 11 | # c) Most important variables - any differences?
12 | # d) Impact of preprocessing on the CART model's most important variables 13 | # 14 | # Load data and packages ======================================================= 15 | library(caret) 16 | library(AppliedPredictiveModeling) 17 | library(party) 18 | 19 | data(hepatic) 20 | 21 | 22 | # Fit a random forest model using both CART trees and conditional inference trees 23 | set.seed(714) 24 | indx <- createFolds(injury, returnTrain = TRUE) 25 | ctrl <- trainControl(method = "cv", index = indx) 26 | mtryValues <- c(5, 10, 25, 50, 75, 100) 27 | 28 | # CART random forest 29 | rfCART <- train(chem, injury, 30 | method = "rf", 31 | metric = "Kappa", 32 | ntree = 1000, 33 | tuneGrid = data.frame(.mtry = mtryValues), 34 | trControl = ctrl, do.trace = TRUE) # ctrl supplies the 10-fold CV folds 35 | rfCART 36 | 37 | # Conditional inference random forest 38 | rfcForest <- train(chem, injury, # conditional inference trees 39 | method = "cforest", 40 | metric = "Kappa", 41 | tuneGrid = data.frame(.mtry = mtryValues), trControl = ctrl) 42 | rfcForest 43 | 44 | 45 | # Fitting times --------------------------------------------------------------- 46 | rfCART$times$everything 47 | rfcForest$times$everything 48 | 49 | 50 | # Variable importance --------------------------------------------------------- 51 | varImp(rfCART) 52 | temp <- varImp(rfCART)$importance 53 | imp_var_rfCART <- rownames(temp)[order(temp$Overall, decreasing = TRUE)][1:20] 54 | round(sapply(chem[imp_var_rfCART], var), 2) # variance of variables 55 | sapply(chem[imp_var_rfCART], range) # range of variables 56 | 57 | 58 | varImp(rfcForest) 59 | temp <- varImp(rfcForest)$importance 60 | imp_var_rfcForest <- rownames(temp)[order(temp$Overall, decreasing = TRUE)][1:20] 61 | round(sapply(chem[imp_var_rfcForest], var), 2) # variance of variables 62 | sapply(chem[imp_var_rfcForest], range) # range of variables 63 | 64 | 65 | # The preference of the CART model for continuous rather than binary or discrete 66 | # variables that take on few values is clear from the variances of the top 20 67 | # variables of each model. 68 | # 69 | # Does preprocessing reduce this bias in CART models? ------------------------- 70 | 71 | rfCART.2 <- train(chem, injury, 72 | method = "rf", 73 | metric = "Kappa", 74 | ntree = 1000, 75 | tuneGrid = data.frame(.mtry = mtryValues), 76 | do.trace = TRUE, trControl = ctrl, 77 | preProcess = c("center", "scale")) 78 | rfCART.2 79 | 80 | varImp(rfCART.2) 81 | temp <- varImp(rfCART.2)$importance 82 | imp_var_rfCART.2 <- rownames(temp)[order(temp$Overall, decreasing = TRUE)][1:20] 83 | round(sapply(chem[imp_var_rfCART.2], var), 2) # variance of variables 84 | sapply(chem[imp_var_rfCART.2], range) # range of variables 85 | 86 | # Pre-processing alters the variables considered most important although cross 87 | # validation performance is reduced. The preference towards high variance 88 | # variables taking on many values is reduced. A different mtry value is returned. 89 | 90 | 91 | # Random Forests (CART vs cforest) ============================================ 92 | # CART random forest: Trees are high variance, low bias. Bagging attempts to 93 | # reduce the variance by bootstrap aggregating a number of trees. However as 94 | # each tree uses the same set of predictors they are not independent of one 95 | # another. Random forest attempts to deal with this by selecting a subset m of 96 | # the full set of predictors at each split. 97 | # For a categorical response, m is typically set at sqrt(p), p being the number 98 | # of predictors. The trees are then fit as normal, i.e. using the Gini index 99 | # or an information criterion for classification.
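#
# A small illustration of the Gini index just mentioned (a sketch, not the
# randomForest/party internals): the impurity of a node holding class labels.
gini_index <- function(classes) {
  p <- table(classes) / length(classes)  # class proportions in the node
  1 - sum(p^2)                           # 0 = pure node; larger = more mixed
}
gini_index(injury)  # e.g. impurity of the root node for the hepatic response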
100 | # 101 | # One issue with CART random forest is a bias in variable selection, specifically 102 | # a preference for continuous over categorical variables. Conditional inference 103 | # trees (Hothorn et al. 2006) are one technique to overcome this. These trees 104 | # are fit by performing a hypothesis test at each candidate split (across an 105 | # exhaustive list) and generating p-values. This allows p-values rather than 106 | # raw differences to be compared, which decreases bias as they are on the same 107 | # scale. Multiple comparison corrections are also applied; as a result the trees 108 | # are not pruned, since additional splits reduce the power of the test and so 109 | # make false-positive splits less likely. 110 | -------------------------------------------------------------------------------- /measuring-predictor-importance/oil_data_182.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: 'Ex18.1 Measuring predictor importance: oil data set' 3 | author: "Oisin Fitzgerald" 4 | output: 5 | html_document: 6 | keep_md: true 7 | --- 8 | Assessing variable importance (and model fitting) may be more difficult with 9 | a response factor with multiple levels. This script outlines the quantification 10 | of variable importance in a data set where the response is a factor with seven 11 | levels. The data set describes the fatty acid composition of seven oil types (e.g. 12 | pumpkin (A), sunflower (B)....) measured using gas chromatography. More can be 13 | found in ?caret::oil. 14 | 15 | Four approaches are taken to evaluating predictor importance, answering the following questions: 16 | 17 | 1. Graphical display of the predictors for each level of the response (boxplots!): 18 | how well separated do the classes appear? 19 | 2. Analysis of variance: for each predictor is there more variance between rather 20 | than within the response categories? 21 | 3. Area under the ROC curve: what is the ability of a predictor to separate classes? 22 | 4. ReliefF (incl permutation test): what is the ability of a predictor to separate classes and is this greater than would occur by chance? 23 | ```{r} 24 | # load data 25 | library(caret) 26 | data(oil) 27 | str(oilType) 28 | str(fattyAcids) 29 | 30 | # load packages 31 | library(reshape2) 32 | library(CORElearn) 33 | library(AppliedPredictiveModeling) 34 | suppressMessages(library(pROC)) 35 | ``` 36 | 37 | ### 1. Boxplots 38 | Although not particularly pretty, the boxplots do show that 39 | oleic and linoleic have the most variation between and within classes. 40 | ```{r} 41 | summary(fattyAcids) 42 | # prepare data for boxplots 43 | df <- data.frame(fattyAcids, oilType) 44 | df <- melt(df) 45 | df1 <- df[df$variable %in% c("Linoleic", "Oleic", "Palmitic"), ] 46 | df2 <- df[!(df$variable %in% c("Linoleic", "Oleic", "Palmitic")), ] 47 | # for linoleic, oleic and palmitic 48 | ggplot(data = df1, aes(y = value, x = oilType)) + 49 | geom_boxplot() + 50 | facet_grid(. ~variable) 51 | # the remaining variables 52 | ggplot(data = df2, aes(y = value, x = oilType)) + 53 | geom_boxplot() + 54 | facet_grid(. ~variable) 55 | ``` 56 | 57 | ### 2. ANOVA 58 | By modelling the fatty acids as the dependent variables and the oil 59 | types as the independent variables (a realistic description of the original 60 | experiment) we can gain an understanding of likely differences between the means of 61 | factor levels for each fatty acid.
The results suggest there are differences, although 62 | an ANOVA won't indicate where the differences lie. Computing pairwise t-tests would 63 | be required for this, but the number of comparisons involved makes this impractical compared to the other methods. 64 | ```{r} 65 | anova_res <- function(x, y) { 66 | test <- anova(lm(x ~ y)) 67 | out <- c(F_value = test$`F value`[1], p_value = test$`Pr(>F)`[1]) 68 | out 69 | } 70 | anova_data <- apply(fattyAcids, MARGIN = 2, FUN = anova_res, y = oilType) 71 | anova_data <- data.frame(t(anova_data)) 72 | anova_data[order(anova_data$p_value, decreasing = FALSE), ] 73 | 74 | # method of pairwise comparison 75 | # pairwise.t.test(x = fattyAcids$Palmitic, g = oilType, pool.sd = FALSE) 76 | ``` 77 | 78 | ### 3. ROC and AUC 79 | When there are three or more classes, filterVarImp will compute ROC curves 80 | for each class versus the others and then returns the largest area under the 81 | curve. In this case, with AUCs of 1, there are clearly some predictors that 82 | can perfectly separate one class from the rest (but we could have guessed this 83 | looking at the boxplots!). 84 | ```{r} 85 | # AUC results 86 | auc_res <- filterVarImp(y = oilType, x = fattyAcids) 87 | ``` 88 | 89 | ### 4. ReliefF and Permutation Test 90 | The ReliefF algorithm takes a random sample of the data and calculates how far each 91 | observation is from a randomly chosen observation of the same and a different (response) class. 92 | ```{r} 93 | relief_values <- attrEval(oilType ~ ., data = fattyAcids, 94 | estimator = "ReliefFequalK", # calculation method 95 | ReliefIterations = 50) # number of iterations 96 | relief_values[order(relief_values, decreasing = TRUE)] 97 | ``` 98 |
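For intuition, here is a hedged sketch of the idea just described (randomly drawn comparisons for a single predictor; not the `CORElearn` implementation): a predictor scores highly when it differs a lot between classes but little within a class.

```{r}
pick <- function(idx) idx[sample.int(length(idx), 1)]  # safe single draw
relief_sketch <- function(x, y, iterations = 50) {
  x <- (x - min(x)) / (max(x) - min(x))  # scale differences to [0, 1]
  score <- 0
  for (i in seq_len(iterations)) {
    r <- pick(seq_along(x))                    # a random observation
    hit <- pick(setdiff(which(y == y[r]), r))  # comparison from the same class
    miss <- pick(which(y != y[r]))             # and from a different class
    score <- score + abs(x[r] - x[miss]) - abs(x[r] - x[hit])
  }
  score / iterations
}
set.seed(1)
relief_sketch(fattyAcids$Linoleic, oilType)
```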
99 | A permutation test adds to the ReliefF algorithm by allowing us to observe how the ReliefF score compares to a distribution of scores calculated on permuted data 100 | (i.e. where no predictor has any *real* relationship to the response) 101 | ```{r} 102 | relief_perm <- permuteRelief(x = fattyAcids, y = oilType, nperm = 500, 103 | estimator = "ReliefFequalK", 104 | ReliefIterations = 50) 105 | # format data for plotting 106 | relief_values <- data.frame(Predictor = names(relief_values), 107 | value = relief_values, 108 | row.names = NULL) 109 | # ggplot: facet wrapped histograms of permutations 110 | ggplot(data = relief_perm$permutations, aes(x = value)) + 111 | geom_histogram(binwidth = .01, colour = 1) + 112 | geom_vline(aes(xintercept = value), relief_values, colour = "red", linetype = 2) + 113 | facet_wrap(~ Predictor, scales = "free_x") + 114 | labs(title = "Relief Scores and Permutation Distributions", 115 | x = "Relief Scores\n*note free axes") 116 | ``` 117 | 118 | ```{r} 119 | # Standard deviations of the permuted distribution from the non-permuted score 120 | relief_perm$standardized[order(relief_perm$standardized)] 121 | ``` 122 | 123 | -------------------------------------------------------------------------------- /linear-classification-models/linear_classification_hepatic_121.R: -------------------------------------------------------------------------------- 1 | # Linear Classification Methods 2 | # 3 | # Script based on Ex 12.1 of Applied Predictive Modeling (Kuhn & Johnson, 2013) 4 | # 5 | # AIM: predict whether a compound is likely to cause liver damage based 6 | # on biological and chemical variables 7 | # 8 | # In particular answering: 9 | # a) What is a good train/test split given an imbalanced response 10 | # b) Which of the linear classification techniques works best and what are the 11 | # most important variables 12 | # c) How does performance compare based on the bio and chem data sets separately 13 | # and together 14 | # 15 | # Load data and packages ====================================================== 16 | 17 | library(caret) 18 | library(AppliedPredictiveModeling) 19 | data(hepatic) 20 | ?AppliedPredictiveModeling::bio 21 | 22 | library(MASS) 23 | library(corrplot) 24 | 25 | # Create a training and test set ============================================== 26 | table(injury) # table of response levels 27 | length(injury) # how many obs 28 | 29 | # Stratified sample: i.e. sample by factor level 30 | # Create stratified sampling function 31 | stratified_sample <- function(x, p) { 32 | 33 | out <- NULL # initialise output 34 | 35 | for (i in 1:length(levels(x))) { 36 | 37 | strata <- levels(x)[i] # factor level to sample 38 | 39 | temp <- sample(which(x %in% strata), # take sample by current level 40 | size = floor(length(x[x == strata])*p), # whole-number size 41 | replace = FALSE) 42 | 43 | out <- c(out, temp) # add sample to previous sample 44 | 45 | } 46 | out <- sort(out) 47 | out # output vector 48 | } 49 | 50 | training_index <- stratified_sample(x = injury, p = 0.6) 51 | 52 | # Create the training/test sets 53 | training_bio <- bio[training_index, ] 54 | test_bio <- bio[-training_index, ] 55 | training_chem <- chem[training_index, ] 56 | test_chem <- chem[-training_index, ] 57 | 58 | rm(bio, chem) # remove unnecessary data 59 | 60 |
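# Aside: caret also ships a stratified splitter; the commented line below
# should be broadly equivalent to the helper above (an untested sketch):
# training_index <- createDataPartition(injury, p = 0.6, list = FALSE)[, 1]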
61 | # Pre-process the data ======================================================== 62 | # 1. Near Zero Variance 63 | nearZeroVar(training_bio, freqCut = 99/1, saveMetrics = TRUE) 64 | remove <- nearZeroVar(training_bio, freqCut = 95/5) 65 | training_bio <- training_bio[ ,-remove] 66 | 67 | nearZeroVar(training_chem, freqCut = 99/1, saveMetrics = TRUE) 68 | remove <- nearZeroVar(training_chem, freqCut = 95/5) 69 | training_chem <- training_chem[ ,-remove] 70 | 71 | # 2. Correlations 72 | corr_bio <- cor(training_bio) 73 | corrplot(corr_bio, tl.cex = 0.5) 74 | remove <- findCorrelation(corr_bio) 75 | training_bio <- training_bio[ ,-remove] 76 | 77 | corr_chem <- cor(training_chem) 78 | corrplot(corr_chem, tl.cex = 0.5) 79 | remove <- findCorrelation(corr_chem) 80 | training_chem <- training_chem[ ,-remove] 81 | 82 | # 3. Transform, center and scale 83 | preProc_bio <- preProcess(training_bio, method = c("center", "scale", "BoxCox")) 84 | training_bio <- predict(preProc_bio, training_bio) 85 | 86 | preProc_chem <- preProcess(training_chem, method = c("center", "scale", "BoxCox")) 87 | training_chem <- predict(preProc_chem, training_chem) 88 | 89 | rm(corr_bio, corr_chem) # keep clean workspace 90 | 91 | # 4. Prep test data sets (following training alterations) 92 | temp <- colnames(test_bio) %in% colnames(training_bio) 93 | test_bio <- test_bio[ ,temp] 94 | test_bio <- predict(preProc_bio, test_bio) # same preProc object 95 | 96 | temp <- colnames(test_chem) %in% colnames(training_chem) 97 | test_chem <- test_chem[ ,temp] 98 | test_chem <- predict(preProc_chem, test_chem) # same preProc object 99 | 100 | rm(preProc_bio, preProc_chem) 101 | 102 | # Fit linear classification models to optimise AUC ============================ 103 | 104 | # LDA ------------------------------------------------------------------------- 105 | # Briefly: background to linear discriminant analysis (LDA)... 106 | # LDA models the distribution of the predictors X given the response Y (with k 107 | # classes) and uses Bayes theorem to flip these around into estimates of 108 | # Pr(y = k|X = x). Let p(k) represent the overall (prior) probability that any 109 | # random observation is from the kth class of Y. Let f(X = x|Y = k) denote the 110 | # density function of X for an observation that comes from the kth class. fk(x) will 111 | # be high when a given value of X is likely to be associated with class k of Y. 112 | # Then Bayes theorem states Pr(y = k|X = x) = pk(x) = p(k)*fk(x)/SUM[p(l)*fl(x)]. 113 | # p(k), the prior probability, can be estimated using sample fractions of 114 | # classes; however fk(x) is more difficult without assuming a probability distribution 115 | # for X. 116 | # By modelling these distributions as Gaussian, and assuming equal variances, the 117 | # discriminant function can be derived. 118 | 119 | # 1a. fit LDA - biological data 120 | lda_bio <- lda(injury[training_index] ~ ., 121 | data = training_bio) 122 | 123 | # 1b. fit LDA - chem data 124 | # first attempt failed - collinearity exists in the predictor matrix 125 | # so remove linear combinations 126 | remove <- findLinearCombos(training_chem)$remove 127 | training_chem <- training_chem[ ,-remove] 128 | temp <- colnames(test_chem) %in% colnames(training_chem) 129 | test_chem <- test_chem[ ,temp] 130 | 131 | lda_chem <- lda(injury[training_index] ~ ., 132 | data = training_chem) 133 | 134 | # 2a. predict on test data - bio 135 | lda_bio_preds <- predict(lda_bio, test_bio) 136 | confusionMatrix(lda_bio_preds$class, injury[-training_index]) 137 |
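# Aside: the Bayes step from the LDA notes above, sketched numerically for a
# single predictor with Gaussian class densities (illustrative values only)
posterior_k <- function(x, k, means, sds, priors) {
  f <- dnorm(x, means, sds)           # f(x | each class)
  priors[k] * f[k] / sum(priors * f)  # Pr(y = k | X = x)
}
posterior_k(1.5, k = 2, means = c(0, 2), sds = c(1, 1), priors = c(0.7, 0.3))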
138 | # 2b. predict on test data - chem 139 | lda_chem_preds <- predict(lda_chem, test_chem) 140 | confusionMatrix(lda_chem_preds$class, injury[-training_index]) 141 | 142 | 143 | 144 | 145 | -------------------------------------------------------------------------------- /data-pre-processing/data_pre_proc.R: -------------------------------------------------------------------------------- 1 | # Chapter 3: Data Pre-Processing 2 | # 3 | # Script based on Ex 3.1 to 3.3 of Applied Predictive Modeling (Kuhn & Johnson, 2013) 4 | # Exercises focus on data pre-processing incl. missingness, distributions 5 | # (skewness/outliers), transformations, near zero variance and visualising the above... 6 | # 7 | # ============================================================================== 8 | # Exercise 3.1 9 | # AIM: Visual data exploration (distributions/outliers) and explore possible 10 | # transformations 11 | # 12 | # Notes: 13 | # 14 | # 15 | # The data --------------------------------------------------------------------- 16 | # 6 types of glass; defined in terms of their oxide content (i.e. Na, Fe, K, etc) 17 | library(mlbench) 18 | data(Glass) 19 | str(Glass) 20 | 21 | library(corrplot) 22 | library(reshape2) 23 | library(ggplot2) 24 | library(caret) 25 | library(e1071) 26 | 27 | # Correlation matrix of the variables ------------------------------------------ 28 | Glass.corr <- cor(Glass[ ,1:9]) # correlation matrix of predictors 29 | corrplot(Glass.corr, method = "number", type = "upper") # correlation matrix visualisation 30 | plot(Glass$RI, Glass$Ca, 31 | main = "Scatterplot of RI and Ca") # evidence of linear relationship 32 | 33 | # Histogram of the variables --------------------------------------------------- 34 | Glass.stack <- melt(Glass) # 3 columns w/ Type and variable name as factor variables 35 | str(Glass.stack) 36 | p <- ggplot(Glass.stack, aes(x = value)) 37 | p + geom_histogram(col = "blue") + facet_wrap(~ variable, scales = "free") 38 | # RI, Na, Si, AL, Ca are relatively normal though with outliers 39 | # Ba, Fe, K and Mg have large proportions of zero/near zero values 40 | table(Glass$Ba == 0)/length(Glass$Ba) 41 | 42 | # Near zero variances ---------------------------------------------------------- 43 | nearZeroVar(Glass, saveMetrics = TRUE) # saveMetrics returns useful info 44 | # Fe, Ba, K have large numbers of zero values 45 | 46 | # Transformations: Skewness and Box Cox lambda estimates ----------------------- 47 | Glass.pred <- Glass[ ,1:9] 48 | skewValues <- apply(Glass.pred, 2, skewness) 49 | skewValues # K > Ba > Ca > Fe > RI (all over |1|) 50 | apply(Glass.pred[abs(skewValues) > 1], 2, BoxCoxTrans) # BoxCox estimates 51 |
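# For reference, the Box-Cox family whose lambda BoxCoxTrans estimates above
# (y must be positive; a sketch, not the caret implementation):
box_cox <- function(y, lambda) {
  if (abs(lambda) < 1e-8) log(y) else (y^lambda - 1) / lambda
}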
52 | # Principal Component Analysis ------------------------------------------------ 53 | pcaObject <- prcomp(Glass.pred, center = TRUE, scale. = TRUE) 54 | percentVariance <- pcaObject$sd^2/sum(pcaObject$sd^2)*100 # variance each component accounts for 55 | round(percentVariance, 2) 56 | round(head(pcaObject$rotation[ ,1:5]),2) # variable loadings 57 | 58 | rm(list = ls()) # clear environment 59 | 60 | # ============================================================================== 61 | # Exercise 3.2 62 | # AIM: Exploratory analysis focusing on identifying missing values and 63 | # near zero variance 64 | # 65 | # Notes: 66 | # 67 | # 68 | # The data --------------------------------------------------------------------- 69 | library(mlbench) 70 | data(Soybean) 71 | ?mlbench::Soybean # dna = does not apply 72 | str(Soybean) # factors and ordinal factors 73 | 74 | library(caret) 75 | library(ggplot2) 76 | library(mi) # for visualising missing values 77 | library(ipred) # imputation 78 | 79 | # Near Zero Variances --------------------------------------------------------- 80 | SoyZeroVar <- nearZeroVar(Soybean, freqCut = 95/5); SoyZeroVar # 3 variables 81 | qplot(Soybean[ ,19]) 82 | qplot(Soybean[ ,26]) 83 | qplot(Soybean[ ,28]) 84 | nearZeroVar(Soybean, freqCut = 90/10) # 8 variables 85 | 86 | # Missing Values a) Predictors ----------------------------------------------- 87 | table(is.na(Soybean))/(683*36) # how many missing values? 88 | tab.na <- matrix(data = NA, nrow = 2, ncol = 35) # init matrix for storing NA info 89 | rownames(tab.na) <- c("not NA", "NA") # table(is.na(x)) counts FALSE (not NA) first 90 | colnames(tab.na) <- colnames(Soybean[-1]) # [-1] to leave out Class 91 | 92 | # Create a table with NA vs not NA values for each variable 93 | for (i in 2:36) { 94 | tab.na[ , i-1] <- table(is.na(Soybean[ , i]))/(length(Soybean[ , i])) 95 | } 96 | tab.na <- tab.na[ ,order(tab.na[1, ])] 97 | 98 | # Visualise the results 99 | barplot(tab.na[1, -12], cex.names = 0.8, las = 2, 100 | main = "Proportion of non-missing values") 101 | 102 | # Alternate visualisation 103 | mSoybean <- missing_data.frame(Soybean) 104 | image(mSoybean) 105 |
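# The same per-column NA proportions as tab.na in a single vectorised line
# (an equivalent to the loop above):
colMeans(is.na(Soybean[-1]))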
106 | # Missing Values b) response (incl. general data missingness by response level) 107 | table(is.na(Soybean[,1])) # all the class obs are present 108 | 109 | # count missing values per row (vectorised: a row-by-row loop would start 110 | # counting the new column itself after its first assignment) 111 | Soybean[ , 37] <- rowSums(is.na(Soybean)) 112 | colnames(Soybean)[37] <- "missing" 113 | Soybean.na <- subset(Soybean, select = c(Class, missing)) # subset by Class 114 | Soybean[ , 37] <- NULL # rm extra column 115 | missing.by.Class <- aggregate(missing ~ Class, data = Soybean.na, FUN = "sum") 116 | plot(missing.by.Class, 117 | cex.axis = 0.7, 118 | las = 2, 119 | xlab = "", 120 | main = "Missing Values per Class") 121 | # phytophthora-rot most impacted by missing values, followed by 2-4-d-injury and cyst-nematode 122 | 123 | # Initial attempt to deal with the missing values / NZV ----------------------- 124 | Soybean.filtered <- Soybean[ , -SoyZeroVar] # rm zero var variables 125 | dummies <- dummyVars(Class ~ ., data = Soybean.filtered, na.action = na.pass) 126 | Soybean.filtered <- predict(dummies, newdata = Soybean.filtered) # dummy variables 127 | imputeModel <- preProcess(Soybean.filtered, method = "bagImpute") # build the bagged-tree 128 | # imputation model; apply it with predict(imputeModel, Soybean.filtered) 129 | 130 | rm(list = ls()) # clear environment 131 | 132 | # ============================================================================== 133 | # Exercise 3.3 134 | # AIM: Data exploration with a focus on associated predictors / near zero variance 135 | # 136 | # Notes: 137 | # 138 | # 139 | # The data --------------------------------------------------------------------- 140 | library(caret) 141 | data(BloodBrain) 142 | ?BloodBrain # each of the 208 compounds has multiple descriptors 143 | str(bbbDescr) 144 | 145 | library(corrplot); ?corrplot 146 | library(e1071) 147 | 148 | # Correlation matrix of the variables ------------------------------------------ 149 | bbbDescr.corr <- cor(bbbDescr) 150 | corrplot(bbbDescr.corr, method = "shade", tl.cex = 0.4, order = "hclust") 151 | # corrplot indicates evidence of linear relationships between many variables 152 |
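# A sketch of the idea behind caret::findCorrelation used below (simplified;
# the real function works greedily and re-checks after each removal):
flag_high_cor <- function(cmat, cutoff = 0.75) {
  avg <- rowMeans(abs(cmat))        # mean absolute correlation per variable
  hits <- which(abs(cmat) > cutoff & upper.tri(cmat), arr.ind = TRUE)
  flagged <- integer(0)
  for (r in seq_len(nrow(hits))) {  # flag the worse variable of each pair
    i <- hits[r, 1]; j <- hits[r, 2]
    flagged <- union(flagged, if (avg[i] > avg[j]) i else j)
  }
  sort(flagged)
}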
153 | highCorr <- findCorrelation(bbbDescr.corr, cutoff = .75) # which variables? 154 | length(highCorr) # 66 variables included 155 | bbbDescr.filtered <- bbbDescr[, -highCorr] # rm algorithm-selected high correlations 156 | 157 | # Skewness and transformations ------------------------------------------------ 158 | SkewValues <- apply(bbbDescr, 2, skewness) 159 | SkewedVariables <- subset(SkewValues, abs(SkewValues) > 1) 160 | length(SkewedVariables) # 71 variables 161 | 162 | # BoxCox Transformations 163 | preProc <- preProcess(bbbDescr.filtered, method = "BoxCox") 164 | bbbDescr.proc <- predict(preProc, bbbDescr.filtered) 165 | 166 | # Near Zero Variance ---------------------------------------------------------- 167 | nearZeroVar(bbbDescr) # 7 variables 168 | bbbDescr.nearzero <- nearZeroVar(bbbDescr.proc) 169 | bbbDescr.final <- bbbDescr.proc[ , -bbbDescr.nearzero] 170 | 171 | # Leading to the final dataframe ----------------------------------------------- 172 | # remaining correlations 173 | corrplot(cor(bbbDescr.final), method = "shade", tl.cex = 0.4, order = "hclust") 174 | # skewness following transformations 175 | table(apply(bbbDescr.final, 2, skewness) > 1) 176 | 177 | rm(list = ls()) # clear environment 178 | 179 | 180 | 181 | -------------------------------------------------------------------------------- /linear-regression-and-cousins/permeability_data_62.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: 'Ex6.2 - Linear regression: molecule permeability' 3 | author: "Oisin Fitzgerald" 4 | date: "27 January 2016" 5 | output: 6 | html_document: 7 | keep_md: true 8 | --- 9 | 10 | ### The data 11 | 12 | This pharmaceutical data set was used to develop a model for predicting compounds' 13 | permeability, a measure of a molecule's ability to cross a membrane. Permeability 14 | impacts on a potential drug's usefulness, i.e. it needs to be able to cross 15 | certain membranes to be effective. There exist assays to measure a compound's 16 | permeability. The goal here is to develop a predictive model for permeability 17 | in an attempt to potentially reduce the need for the assay. 18 | (brief description, more on ?permeability) 19 | 20 | ### Outline 21 | 22 | * Training/test split 23 | * Pre-process the data 24 | + Skewness of response 25 | + Sparseness 26 | * Fit and test PLS models 27 | * Fit ridge regression, LASSO and elastic net models 28 | 29 | ```{r} 30 | # Load data and packages 31 | library(AppliedPredictiveModeling) 32 | data(permeability) 33 | 34 | suppressMessages(library(caret)) 35 | suppressMessages(library(pls)) 36 | suppressMessages(library(elasticnet)) 37 | suppressMessages(library(lars)) 38 | ``` 39 | 40 | ### Data splitting and pre-processing 41 | 42 | ```{r} 43 | fingerprints <- data.frame(fingerprints) 44 | permeability <- as.vector(permeability) 45 | # Create training/test split index 46 | Split <- createDataPartition(permeability, times = 1, p = 0.75) 47 | Split <- Split$Resample1 48 | # Create training and test splits 49 | training <- fingerprints[Split, ] 50 | test <- fingerprints[-Split, ] 51 | response_train <- permeability[Split] 52 | response_test <- permeability[-Split] 53 | 54 | # Pre-processing 55 | training_filter <- training[ ,-nearZeroVar(training, freqCut = 95/5)] # near zero variances 56 | # binary data...
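# Quick check of the sparseness noted above: the fingerprint predictors are
# 0/1 and heavily dominated by zeros (an illustrative aside, not the exercise)
mean(as.matrix(fingerprints) == 0)      # overall proportion of zeros
ncol(training) - ncol(training_filter)  # predictors dropped as near zero variance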
57 | 58 | # positively skewed 59 | ggplot() + geom_histogram(aes(x = permeability), binwidth = 4, col = 1) + 60 | labs(title = "Histogram of permeability", x = "Molecule permeability") + 61 | theme_bw() 62 | preProcess(data.frame(permeability), method = c("YeoJohnson")) 63 | ``` 64 | 65 | ### Fit and test partial least squares models 66 | 67 | Three variations on a PLS model were fit and tested: 68 | 1. A model fit to the full training set 69 | 2. A model fit to the near zero variance reduced training set 70 | 3. A model fit to a log transformed response 71 | 72 | None of the models created appears to have the predictive ability to replace the 73 | assay method mentioned above. The log(response) 74 | PLS model (and others) appears to show promise within a certain range. Possibly 75 | the current linear technique is too restrictive. Some models are producing negative 76 | predictions - multicollinearity? 77 | 78 | ```{r} 79 | # Fit PLS models 80 | ctrl <- trainControl("repeatedcv", number = 5, repeats = 5) 81 | pls_model.1 <- train(y = response_train, 82 | x = training, 83 | method = "pls", 84 | metric = "Rsquared", 85 | tuneLength = 10, 86 | trControl = ctrl) 87 | pls_model.1 88 | 89 | pls_model.2 <- train(y = response_train, 90 | x = training_filter, 91 | method = "pls", 92 | metric = "Rsquared", 93 | tuneLength = 10, 94 | trControl = ctrl) 95 | pls_model.2 96 | 97 | pls_model.3 <- train(y = log(response_train), 98 | x = training, 99 | method = "pls", 100 | metric = "Rsquared", 101 | tuneLength = 10, 102 | trControl = ctrl) 103 | pls_model.3 104 | 105 | # post hoc: remove multicollinearity and refit 106 | remove <- findLinearCombos(training_filter) 107 | training_filter2 <- training_filter[ ,-remove$remove] # drop the flagged columns 108 | 109 | pls_model.4 <- train(y = response_train, 110 | x = training_filter2, 111 | method = "pls", 112 | metric = "Rsquared", 113 | tuneLength = 10, 114 | trControl = ctrl) 115 | pls_model.4 116 | 117 | 118 | # Predict on test data 119 | pls1_preds <- predict(pls_model.1, test) 120 | RMSE(pls1_preds, response_test) 121 | cor(pls1_preds, response_test)^2 # Rsquared 122 | ggplot() + 123 | geom_point(aes(x = pls1_preds, y = response_test)) + 124 | theme_bw() + 125 | labs(title = "PLS model 1 predictions vs. observed", 126 | x = "predicted permeability", 127 | y = "observed permeability") 128 | 129 | pls2_preds <- predict(pls_model.2, test) 130 | RMSE(pls2_preds, response_test) 131 | cor(pls2_preds, response_test)^2 # Rsquared 132 | ggplot() + 133 | geom_point(aes(x = pls2_preds, y = response_test)) + 134 | theme_bw() + 135 | labs(title = "PLS model 2 predictions vs. observed", 136 | x = "predicted permeability", 137 | y = "observed permeability") 138 | 139 | pls3_preds <- predict(pls_model.3, test) 140 | RMSE(exp(pls3_preds), response_test) 141 | cor(exp(pls3_preds), response_test)^2 # Rsquared 142 | ggplot() + 143 | geom_point(aes(x = pls3_preds, y = response_test)) + 144 | theme_bw() + 145 | labs(title = "PLS model 3 predictions vs. observed", 146 | x = "predicted permeability", 147 | y = "observed permeability") 148 | # can it predict well within a certain range?? 149 | RMSE(exp(pls3_preds[response_test<20]), response_test[response_test<20]) 150 | 151 | pls4_preds <- predict(pls_model.4, test) 152 | RMSE(pls4_preds, response_test) 153 | cor(pls4_preds, response_test)^2 # Rsquared 154 | ggplot() + 155 | geom_point(aes(x = pls4_preds, y = response_test)) + 156 | theme_bw() + 157 | labs(title = "PLS model 4 predictions vs.
observed", 158 | x = "predicted permeability", 159 | y = "observed permeability") 160 | 161 | ``` 162 | 163 | #### Fit and ridge regression, LASSO and elastic net models 164 | 165 | As with the PLS models, none of the models were stong predictors of the data, suggesting 166 | that laboratory methods of measuring permeability are prefereable (at least) to the 167 | models fitted. The PLS models actually outperformed the shrinkage methods. Further the 168 | cross validated estimates of RMSE and R2 were quite inaccurate compared to the test fits. 169 | There appears to be residual instability in the cofficients even in these shrinkage methods, 170 | with enet and ridge producing hugely negative predictions and RMSE before being tuned 171 | over a predefined range of shrinkage coefficients. 172 | 173 | ```{r} 174 | # Fit shrinkage models 175 | ctrl <- trainControl("cv", number = 5) 176 | 177 | ridge_grid <- expand.grid(.lambda = seq(0.05, 0.2, 0.01)) 178 | ridge_model <- train(y = response_train, 179 | x = training_filter2, # model fitting impacted by zero variance 180 | method = "ridge", 181 | tuneGrid = ridge_grid, 182 | metric = "RMSE", 183 | trControl = ctrl) 184 | ridge_model 185 | # remaining instability in the coefficients? 186 | plot(ridge_model$finalModel) 187 | title(main = "Ridge regression coefficient stability") 188 | 189 | lasso_model <- train(y = response_train, 190 | x = training_filter2, 191 | method = "lasso", 192 | tuneLength = 10, 193 | metric = "RMSE", 194 | trControl = ctrl) 195 | lasso_model 196 | 197 | enet_grid <- expand.grid(.lambda = c(0.01, 0.02, 0.03, 0.05), .fraction = c(seq(0.00001, 0.2, 0.02))) 198 | enet_model <- train(y = response_train, 199 | x = training_filter, 200 | method = "enet", 201 | tuneGrid = enet_grid, 202 | metric = "RMSE", 203 | trControl = ctrl) 204 | enet_model 205 | 206 | 207 | # Test shrinkage models 208 | ridge_preds <- predict(ridge_model, test) 209 | RMSE(ridge_preds, response_test) 210 | cor(ridge_preds, response_test)^2 211 | 212 | lasso_preds <- predict(lasso_model, test) 213 | RMSE(lasso_preds, response_test) 214 | cor(lasso_preds, response_test)^2 215 | 216 | enet_preds <- predict(enet_model, test) 217 | RMSE(enet_preds, response_test) 218 | cor(enet_preds, response_test)^2 219 | ``` 220 | 221 | 222 | 223 | -------------------------------------------------------------------------------- /measuring-predictor-importance/oil_data_182.md: -------------------------------------------------------------------------------- 1 | # Ex18.1 Measuring predictor importance: oil data set 2 | Oisin Fitzgerald 3 | Assessing varaible importance (and model fitting) may be more difficult with 4 | a response factor with multiple levels. This script outlines the quantification 5 | of variable importance in a data set where the response is a factor with seven 6 | levels. The data set describes the fatty acid composition of seven oil types (e.g. 7 | pumpkin (A), sunflower (B)....) measured using gas chromatography. More can be 8 | found in ?caret::oil. 9 | 10 | Four approaches are taken to evaluating predictor importance, answer the following questions: 11 | 12 | 1. Graphical display of the predictors for each level of the response (boxplots!): 13 | how well separated do the classes appear? 14 | 2. Analysis of variance: for each predictor is there more variance between rather 15 | than within the response categories? 16 | 3. Area under the ROC curve: what is the ability of a predictor to separate classes? 17 | 4. 
ReliefF (incl permutation test): what is the ability of a predictor to separate classes and is this greater than would occur by chance? 18 | 19 | ```r 20 | # load data 21 | library(caret) 22 | ``` 23 | 24 | ``` 25 | ## Loading required package: lattice 26 | ## Loading required package: ggplot2 27 | ``` 28 | 29 | ```r 30 | data(oil) 31 | str(oilType) 32 | ``` 33 | 34 | ``` 35 | ## Factor w/ 7 levels "A","B","C","D",..: 1 1 1 1 1 1 1 1 1 1 ... 36 | ``` 37 | 38 | ```r 39 | str(fattyAcids) 40 | ``` 41 | 42 | ``` 43 | ## 'data.frame': 96 obs. of 7 variables: 44 | ## $ Palmitic : num 9.7 11.1 11.5 10 12.2 9.8 10.5 10.5 11.5 10 ... 45 | ## $ Stearic : num 5.2 5 5.2 4.8 5 4.2 5 5 5.2 4.8 ... 46 | ## $ Oleic : num 31 32.9 35 30.4 31.1 43 31.8 31.8 35 30.4 ... 47 | ## $ Linoleic : num 52.7 49.8 47.2 53.5 50.5 39.2 51.3 51.3 47.2 53.5 ... 48 | ## $ Linolenic : num 0.4 0.3 0.2 0.3 0.3 2.4 0.4 0.4 0.2 0.3 ... 49 | ## $ Eicosanoic: num 0.4 0.4 0.4 0.4 0.4 0.4 0.4 0.4 0.4 0.4 ... 50 | ## $ Eicosenoic: num 0.1 0.1 0.1 0.1 0.1 0.5 0.1 0.1 0.1 0.1 ... 51 | ``` 52 | 53 | ```r 54 | # load packages 55 | library(reshape2) 56 | library(CORElearn) 57 | library(AppliedPredictiveModeling) 58 | suppressMessages(library(pROC)) 59 | ``` 60 | 61 | ### 1. Boxplots 62 | Although not particularly pretty the boxplots does show that 63 | oleic and linoleic have the most variation between and within classes. 64 | 65 | ```r 66 | summary(fattyAcids) 67 | ``` 68 | 69 | ``` 70 | ## Palmitic Stearic Oleic Linoleic 71 | ## Min. : 4.50 Min. :1.700 Min. :22.80 Min. : 7.90 72 | ## 1st Qu.: 6.20 1st Qu.:3.475 1st Qu.:26.30 1st Qu.:43.10 73 | ## Median : 9.85 Median :4.200 Median :30.70 Median :50.80 74 | ## Mean : 9.04 Mean :4.200 Mean :36.73 Mean :46.49 75 | ## 3rd Qu.:11.12 3rd Qu.:5.000 3rd Qu.:38.62 3rd Qu.:58.08 76 | ## Max. :14.90 Max. :6.700 Max. :76.70 Max. :66.10 77 | ## Linolenic Eicosanoic Eicosenoic 78 | ## Min. :0.100 Min. :0.100 Min. :0.1000 79 | ## 1st Qu.:0.375 1st Qu.:0.100 1st Qu.:0.1000 80 | ## Median :0.800 Median :0.400 Median :0.1000 81 | ## Mean :2.272 Mean :0.399 Mean :0.3115 82 | ## 3rd Qu.:2.650 3rd Qu.:0.400 3rd Qu.:0.3000 83 | ## Max. :9.500 Max. :2.800 Max. :1.8000 84 | ``` 85 | 86 | ```r 87 | # prepare data for boxplots 88 | df <- data.frame(fattyAcids, oilType) 89 | df <- melt(df) 90 | ``` 91 | 92 | ``` 93 | ## Using oilType as id variables 94 | ``` 95 | 96 | ```r 97 | df1 <- df[df$variable %in% c("Linoleic", "Oleic", "Palmitic"), ] 98 | df2 <- df[!(df$variable %in% c("Linoleic", "Oleic", "Palmitic")), ] 99 | # for linoleic and oleic 100 | ggplot(data = df1, aes(y = value, x = oilType)) + 101 | geom_boxplot() + 102 | facet_grid(. ~variable) 103 | ``` 104 | 105 | ![](oil_data_182_files/figure-html/unnamed-chunk-2-1.png) 106 | 107 | ```r 108 | # the remaining variables 109 | ggplot(data = df2, aes(y = value, x = oilType)) + 110 | geom_boxplot() + 111 | facet_grid(. ~variable) 112 | ``` 113 | 114 | ![](oil_data_182_files/figure-html/unnamed-chunk-2-2.png) 115 | 116 | ### 2. ANOVA 117 | By modelling the fatty acids as the dependent variables and the oil 118 | types as the independent variables (a realistic description of the original 119 | experiment) we can gain an understanding of likely differences between the means of 120 | factor levels for each fatty acid. The results suggest there are differences, although 121 | an ANOVA won't indicate where the difference lie. 
Computing pairwise t-tests would 122 | be required for this, but the amount of numbers to compare in this case is prohibitive compared to other methods. 123 | 124 | ```r 125 | anova_res <- function(x, y) { 126 | test <- anova(lm(x ~ y)) 127 | out <- c(F_value = test$`F value`[1], p_value = test$`Pr(>F)`[1]) 128 | out 129 | } 130 | anova_data <- apply(fattyAcids, MARGIN = 2, FUN = anova_res, y = oilType) 131 | anova_data <- data.frame(t(anova_data)) 132 | anova_data[order(anova_data$p_value, decreasing = FALSE), ] 133 | ``` 134 | 135 | ``` 136 | ## F_value p_value 137 | ## Linoleic 317.956365 7.400916e-58 138 | ## Oleic 283.022304 1.018427e-55 139 | ## Linolenic 186.689720 3.445215e-48 140 | ## Stearic 97.030804 7.197315e-37 141 | ## Palmitic 95.029863 1.598196e-36 142 | ## Eicosenoic 20.851957 4.033448e-15 143 | ## Eicosanoic 5.894469 3.263021e-05 144 | ``` 145 | 146 | ```r 147 | # method of pairwise comparison 148 | # pairwise.t.test(x = fattyAcids$Palmitic, g = oilType, pool.sd = FALSE) 149 | ``` 150 | 151 | ### 3. ROC and AUC 152 | When there are three or more classes, filterVarImp will compute ROC curves 153 | for each class versus the others and then returns the largest area under the 154 | curve. In this case, with AUCs of 1, there are clearly some predictors that 155 | can perfectly separate one class from the rest (but we could have guessed this 156 | looking at the boxplots!). 157 | 158 | ```r 159 | # AUC results 160 | auc_res <- filterVarImp(y = oilType, x = fattyAcids) 161 | ``` 162 | 163 | ### 4. ReliefF and Permutation Test 164 | The ReliefF algorithm takes a random sample of the data and calculates how far each 165 | observation is from a randomly chosen observation of the same and a different (response) class. 166 | 167 | ```r 168 | relief_values <- attrEval(oilType ~ ., data = fattyAcids, 169 | estimator = "ReliefFequalK", # calculation method 170 | ReliefIterations = 50) # num iteration 171 | relief_values[order(relief_values, decreasing = TRUE)] 172 | ``` 173 | 174 | ``` 175 | ## Linoleic Stearic Oleic Palmitic Linolenic Eicosanoic 176 | ## 0.63730668 0.54491315 0.52274769 0.43404607 0.24963549 0.10965929 177 | ## Eicosenoic 178 | ## 0.08903402 179 | ``` 180 | 181 | A permutation test adds to the ReliefF algorithm by allowing us to observe how the ReliefF score compares to a distribution of scores calculated on permutated data 182 | (i.e. 
where no predictor has any *real* relationship to the response). 183 | 184 | ```r 185 | relief_perm <- permuteRelief(x = fattyAcids, y = oilType, nperm = 500, 186 | estimator = "ReliefFequalK", 187 | ReliefIterations = 50) 188 | # format data for plotting 189 | relief_values <- data.frame(Predictor = names(relief_values), 190 | value = relief_values, 191 | row.names = NULL) 192 | # ggplot: facet wrapped histograms of permutations 193 | ggplot(data = relief_perm$permutations, aes(x = value)) + 194 | geom_histogram(binwidth = .01, colour = 1) + 195 | geom_vline(aes(xintercept = value), relief_values, colour = "red", linetype = 2) + 196 | facet_wrap(~ Predictor, scales = "free_x") + 197 | labs(title = "Relief Scores and Permutation Distributions", 198 | x = "Relief Scores\n*note free axes") 199 | ``` 200 | 201 | ![](oil_data_182_files/figure-html/unnamed-chunk-6-1.png) 202 | 203 | 204 | ```r 205 | # Standard deviation of permuted distribution from non-permuted score 206 | relief_perm$standardized[order(relief_perm$standardized)] 207 | ``` 208 | 209 | ``` 210 | ## Eicosanoic Eicosenoic  Linolenic   Palmitic      Oleic    Stearic 211 | ##   2.989642   5.748575   9.965823  12.783484  15.531254  16.228189 212 | ##   Linoleic 213 | ##  18.022441 214 | ``` 215 | 216 | -------------------------------------------------------------------------------- /linear-regression-and-cousins/chem_manufact_data_63.R: -------------------------------------------------------------------------------- 1 | # Chapter 6: Linear Regression and its Cousins 2 | # 3 | # Script based on Ex 6.3 of Applied Predictive Modeling (Kuhn & Johnson, 2013) 4 | # Exercise covers data pre-processing, train/test splitting, missing value 5 | # imputation, model fitting and assessing variable importance scores 6 | # 7 | # Notes: 8 | # 9 | # 10 | # The data and packages ======================================================= 11 | # Response: % yield from a pharmaceutical product's manufacture 12 | # Predictors: biological and manufacturing process variables 13 | 14 | library(AppliedPredictiveModeling) 15 | data(ChemicalManufacturingProcess) 16 | str(ChemicalManufacturingProcess) 17 | 18 | library(caret) 19 | library(fBasics) 20 | library(corrplot) 21 | library(car) 22 | 23 | # Split into training and test data =========================================== 24 | yield <- ChemicalManufacturingProcess$Yield 25 | Split <- createDataPartition(yield, times = 1, p = 0.75) 26 | # predictors... 27 | training <- ChemicalManufacturingProcess[Split$Resample1, -1] 28 | test <- ChemicalManufacturingProcess[-Split$Resample1, -1] 29 | # response ... 30 | resp_train <- yield[Split$Resample1] 31 | resp_test <- yield[-Split$Resample1] 32 | 33 | # Preprocess (incl. missing data) ============================================= 34 | preproc_1 <- preProcess(training, 35 | method = c("knnImpute", "center", "scale"), k = 5) 36 | # centering and scaling are important when using e.g. PLS/PCR as the methods 37 | # seek directions of maximal variation 38 | 39 | training <- predict(preproc_1, training) 40 | test <- predict(preproc_1, test) 41 | 42 | XX <- cor(training) 43 | corrplot(XX, tl.cex = 0.3) # some strong correlations 44 | rk(XX) # rank less than num vars; linear combinations 45 | remove <- findLinearCombos(XX) 46 | training <- training[ ,-remove$remove] 47 | test <- test[ ,-remove$remove] 48 | 49 | # what columns should be removed to reduce the correlations? 
50 | remove <- findCorrelation(cor(training), cutoff = .90) # recompute correlations after dropping linear combos 51 | training_filter <- training[ ,-remove] 52 | test_filter <- test[ ,-remove] 53 | 54 | # Transform data - estimate lambda with Yeo Johnson 55 | preproc_3 <- preProcess(training, method = "YeoJohnson") 56 | training <- predict(preproc_3, training) 57 | test <- predict(preproc_3, test) 58 | 59 | # distribution of the response 60 | hist(resp_train, xlab = "% yield", main = "Histogram of % yield") 61 | 62 | # Model fitting =============================================================== 63 | ctrl <- trainControl(method = "repeatedcv", number = 5, repeats = 5) 64 | 65 | # 1. A linear regression model ------------------------------------------------- 66 | linear_model <- train(y = resp_train, x = training_filter, 67 | method = "lm", 68 | trControl = ctrl) 69 | linear_model 70 | # plots allow search for ill fit or potential quadratic effects 71 | plot(linear_model$finalModel$fitted.values, 72 | linear_model$finalModel$residuals, xlab = "fitted", ylab = "residuals") 73 | avPlots(linear_model$finalModel) 74 | 75 | # how does the final model fit look vs. the data? 76 | plot(linear_model$finalModel$fitted.values, resp_train, xlab = "predicted", 77 | ylab = "observed") 78 | abline(a = 0, b = 1, col = 2) 79 | cor(linear_model$finalModel$fitted.values, resp_train)^2 # R2 fit 80 | 81 | # 2. Principal components regression ------------------------------------------ 82 | pcr_results <- list(results = data.frame(RMSE = NA, RMSE_sd = NA), final = NA) 83 | for (i in 1:20) { 84 | # fit model 85 | train_data <- princomp(training)$scores[ ,1:i] 86 | train_data <- data.frame(train_data) 87 | pcr_model <- train(y = resp_train, 88 | x = train_data, 89 | method = "lm", 90 | trControl = ctrl) 91 | 92 | # extract results 93 | pcr_results$results[i, 1] <- pcr_model$results$RMSE 94 | pcr_results$results[i, 2] <- pcr_model$results$RMSESD 95 | 96 | # extract model 97 | if (all(pcr_model$results$RMSE <= pcr_results$results$RMSE)) { 98 | pcr_results$final <- pcr_model 99 | } 100 | } 101 | pcr_results 102 | xyplot(pcr_results$results$RMSE ~ 1:20, xlab = "ncomp", ylab = "RMSE") 103 | 104 | # 3. Partial least squares ----------------------------------------------------- 105 | pls_model <- train(y = resp_train, x = training, 106 | method = "pls", 107 | trControl = ctrl, 108 | tuneLength = 10) 109 | pls_model 110 | plot(pls_model) 111 | 112 | # 4. Ridge regression ---------------------------------------------------------- 113 | ridge_grid <- expand.grid(.lambda = seq(0.05, 0.2, 0.01)) 114 | ridge_model <- train(y = resp_train, x = training, 115 | method = "ridge", 116 | trControl = ctrl, 117 | tuneGrid = ridge_grid) 118 | ridge_model 119 | plot(ridge_model) 120 | 121 | # y-axis: coefficient, x-axis: shrinkage proportion 122 | plot(ridge_model$finalModel) 123 | 124 | # 5. LASSO ---------------------------------------------------------------------- 125 | lasso_grid <- expand.grid(.fraction = seq(0.01, 0.20, 0.01)) 126 | lasso_model <- train(y = resp_train, x = training, 127 | method = "lasso", 128 | trControl = ctrl, 129 | tuneGrid = lasso_grid) 130 | lasso_model 131 | plot(lasso_model) 132 | 133 | # 6. Elastic Net --------------------------------------------------------------- 134 | enet_model <- train(y = resp_train, x = training, 135 | method = "enet", 136 | trControl = ctrl, 137 | tuneLength = 10) 138 | enet_model 139 | plot(enet_model) 140 | 141 | # Make predictions ============================================================ 142 | # 1. 
linear regression model 143 | lm_preds <- predict(linear_model, test_filter) 144 | RMSE(lm_preds, resp_test) 145 | plot(lm_preds, resp_test, xlab = "prediction", ylab = "observed", pch = 20, 146 | main = "Linear Regression: RMSE = 1.172") 147 | abline(a = 0, b = 1, col = 2, lty = 2) 148 | 149 | # 2. PCR model 150 | pca_object <- princomp(training) 151 | test_pc <- predict(pca_object, test) 152 | pcr_preds <- predict(pcr_results$final, test_pc) 153 | RMSE(pcr_preds, resp_test) 154 | plot(pcr_preds, resp_test, xlab = "prediction", ylab = "observed", pch = 20, 155 | main = "PCR: RMSE = 1.144") 156 | abline(a = 0, b = 1, col = 2, lty = 2) 157 | 158 | # 3. PLS model 159 | pls_preds <- predict(pls_model, test) 160 | RMSE(pls_preds, resp_test, na.rm = TRUE) 161 | plot(pls_preds, resp_test, xlab = "prediction", ylab = "observed", 162 | pch = 20, main = "Partial Least Squares: RMSE = 1.079") 163 | abline(a = 0, b = 1, col = 2, lty = 2) 164 | 165 | # 4. Ridge model 166 | ridge_preds <- predict(ridge_model, test) 167 | RMSE(ridge_preds, resp_test) 168 | plot(ridge_preds, resp_test, xlab = "prediction", ylab = "observed", 169 | pch = 20, main = "Ridge Reg: RMSE = 1.086") 170 | abline(a = 0, b = 1, col = 2, lty = 2) 171 | 172 | # 5. LASSO model 173 | lasso_preds <- predict(lasso_model, test) 174 | RMSE(lasso_preds, resp_test) 175 | plot(lasso_preds, resp_test, xlab = "prediction", ylab = "observed", 176 | pch = 20, main = "LASSO: RMSE = 1.112") 177 | abline(a = 0, b = 1, col = 2, lty = 2) 178 | 179 | # 6. enet model 180 | enet_preds <- predict(enet_model, test) 181 | RMSE(enet_preds, resp_test) 182 | plot(enet_preds, resp_test, xlab = "prediction", ylab = "observed", pch = 20, 183 | main = "Elastic Net: RMSE = 1.040") 184 | abline(a = 0, b = 1, col = 2, lty = 2) 185 | 186 | # Variable Importance ========================================================= 187 | varImp(pls_model) 188 | varImp(ridge_model) 189 | varImp(enet_model) 190 | 191 | plot(training$ManufacturingProcess13, resp_train, pch = 20, 192 | xlab = "ManufacturingProcess13", ylab = "% yield") 193 | lines(loess.smooth(training$ManufacturingProcess13, resp_train), col = 2) 194 | 195 | plot(training$ManufacturingProcess32, resp_train, pch = 20, 196 | xlab = "ManufacturingProcess32", ylab = "% yield") 197 | lines(loess.smooth(training$ManufacturingProcess32, resp_train), col = 2) 198 | 199 | plot(training$ManufacturingProcess36, resp_train, pch = 20, 200 | xlab = "ManufacturingProcess36", ylab = "% yield") 201 | lines(loess.smooth(training$ManufacturingProcess36, resp_train), col = 2) 202 | 206 | 207 | # The variable importance scores reveal several relationships worth examining in 208 | # any experiments aimed at improving the manufacturing process. -------------------------------------------------------------------------------- /measuring-predictor-importance/abalone_data_183.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Ex 18.3 Measuring predictor importance: abalone data set" 3 | author: "Oisin Fitzgerald" 4 | output: 5 | html_document: 6 | keep_md: true 7 | --- 8 | The UCI Abalone data (http://archive.ics.uci.edu/ml/datasets/Abalone) 9 | consist of data from 4,177 abalones (sea snails). 
The data contain measurements 10 | of the type (male, female, and infant), the longest shell measurement, the diameter, 11 | height, and several weights (whole, shucked, viscera, and shell). The outcome 12 | is the number of rings. 13 | This script: 14 | 1. Visualises how the predictors relate to the response 15 | 2. Visualises how the predictors relate to each other 16 | 3. Evaluates predictor importance based on several methods 17 | 4. Filters redundant predictors and uses PCA to create a set of orthogonal predictors 18 | 19 | ```{r} 20 | library(AppliedPredictiveModeling) 21 | data(abalone) 22 | str(abalone) 23 | head(abalone) 24 | 25 | library(ggplot2) 26 | library(tidyr) 27 | library(scales) 28 | library(corrplot) 29 | library(CORElearn) 30 | library(car) 31 | library(minerva) 32 | suppressMessages(library(caret)) 33 | suppressMessages(library(pROC)) 34 | ``` 35 | 36 | ### 1. How do the predictors relate to the number of rings? 37 | Visually displaying the data shows some clear relationships and also outlying 38 | values. In the plots of rings vs. the continuous variables there are both linear 39 | (e.g. diameter) and non-linear (e.g. the weight variables) patterns. The similar 40 | shapes of some of the point clouds suggest that some of the variables likely 41 | contain the same information. Of course this makes sense given likely strong 42 | relationships between the various weight and length variables. Height shows two 43 | outlying points with values about 4 to 10 times greater than normal, suggesting 44 | they may be incorrectly entered values. The boxplots show an expected pattern, 45 | with infants having fewer rings than adults. 46 | ```{r, fig.width = 10, fig.height = 10} 47 | # format data for plotting 48 | gg_data <- gather(abalone, Rings) 49 | names(gg_data) <- c("Rings", "variable", "value") 50 | gg_data <- subset(gg_data, gg_data$variable != "Type") 51 | gg_data$value <- as.numeric(gg_data$value) 52 | # scatter plots for continuous variables 53 | ggplot(aes(x = value, y = Rings), data = gg_data) + 54 | geom_point() + 55 | facet_wrap(~variable, scales = "free_x") + 56 | scale_x_continuous(breaks = pretty_breaks(n = 8)) 57 | ``` 58 | 59 | ```{r} 60 | # boxplot for Type variable 61 | ggplot(aes(x = Type, y = Rings), data = abalone) + 62 | geom_boxplot() 63 | ``` 64 | 65 | ### 2. How do the predictors relate to each other? 66 | The car package's scatterplotMatrix function shows clear relationships 67 | between the variables. This is further emphasised by the correlation plot. There are 68 | clearly near linear dependencies in the data. 69 | ```{r, fig.width = 10, fig.height = 10} 70 | X <- abalone[ , sapply(abalone, is.numeric) ] 71 | X <- X[ ,-8] # remove Rings 72 | 73 | # matrix scatter plots 74 | scatterplotMatrix(X, smoother = FALSE, reg.line = FALSE) 75 | 76 | # LOESS fit 77 | loess_results <- filterVarImp(x = X, y = abalone$Rings, nonpara = TRUE) 78 | loess_results 79 | 80 | # correlations 81 | XX <- cor(X) 82 | corrplot(XX, "number", tl.cex = 0.7) 83 | ``` 84 | 85 | ### 3. Predictor importance scores 86 | A downside of all the measures used in this section is that they solely reveal bivariate 87 | relationships. We cannot know, for example, if the interaction of two predictors is 88 | an important term to include in any model. Regardless, the various measures of linear, 89 | rank, and information-based association provide a useful way to gauge the likely importance of a variable in 90 | improving the predictive ability of a model (i.e. screening!). 
91 | Pearson's *r* provides a measure of the linear relationship between two variables 92 | while Spearman's *rho* is the rank correlation between the variables and so is better 93 | suited to picking up non-linear relationships. All variables have a greater Spearman's 94 | *rho* than Pearson's *r*, suggesting future models should take into account this non-linearity. 95 | The ANOVA and pairwise t-tests confirm what the boxplot showed: that infants are 96 | most different from the other groups in number of rings. 97 | ```{r} 98 | # linear correlations 99 | pearsonsR <- apply(X, MARGIN = 2, FUN = cor, y = abalone$Rings, method = "pearson") 100 | pearsonsR 101 | 102 | # rank correlations 103 | spearmansRho <- apply(X, MARGIN = 2, FUN = cor, y = abalone$Rings, method = "spearman") 104 | spearmansRho 105 | 106 | # ANOVA and t tests (Type variable) 107 | anova(lm(Rings ~ Type, data = abalone)) 108 | pairwise.t.test(abalone$Rings, abalone$Type, pool.sd = FALSE) 109 | ``` 110 | 111 | The maximal information coefficient (MIC) is an information theory based measure 112 | of the strength of linear and/or non-linear relationship between two variables. It 113 | bins continuous variables in such a way as to maximise the mutual information, the 114 | amount of information you gain about the likely value of one variable given the 115 | value of another. The results suggest that all variables are moderately related to 116 | the number of rings. MIC minus R^2 is suggested as a measure of the degree of 117 | non-linearity in the relationship; here all values of this measure are close to zero, 118 | implying only mild non-linearity. 119 | ```{r} 120 | # MIC 121 | mic_values <- mine(x = X, y = abalone$Rings) 122 | mic_values$MIC 123 | ``` 124 | 125 | The RReliefF algorithm is an adaptation of ReliefF to a regression setting 126 | (Robnik-Šikonja & Kononenko, 1997). It is a measure of how likely nearby instances 127 | of randomly selected observations are to give a similar prediction in the response. 128 | It can be combined with a permutation test to give an indication of how much the 129 | results differ from chance. 130 | ```{r} 131 | #RreliefF (optimistic!!) 132 | relief_values <- attrEval(abalone$Rings ~ ., data = X, 133 | estimator = "RReliefFbestK", # calculation method 134 | ReliefIterations = 50) # num iterations 135 | relief_values <- data.frame(Predictor = names(relief_values), 136 | value = relief_values, 137 | row.names = NULL) 138 | 139 | # RreliefF permutation test 140 | relief_perm <- permuteRelief(x = X, y = abalone$Rings, nperm = 500, 141 | estimator = "RReliefFbestK", 142 | ReliefIterations = 50) 143 | 144 | # standard deviations from permutation score distribution 145 | relief_perm$standardized[order(relief_perm$standardized)] 146 | ``` 147 | 148 | ### 4. Filter redundant predictors and create a set of orthogonal predictors with principal component analysis 149 | Given the relationships between the variables this function filters out highly 150 | correlated variables, leaving a reduced set. It follows the heuristic algorithm in 151 | Kuhn and Johnson's book, removing from each correlated pair the variable most related to the 152 | other variables. PCA is then performed, with the first two principal components 153 | accounting for 90% of the variance. The filter method at *r* = 0.75 also leads to 154 | the conclusion there are only two non-redundant "pieces of information" in this data set. 
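For comparison, caret ships a built-in filter with a similar heuristic; a minimal cross-check of the hand-rolled function below against it might look like this (a sketch, assuming the numeric predictor matrix `X` from section 2 is still in scope):

```{r, eval = FALSE}
# caret's high-correlation filter; returns the column indices it would drop
drop_idx <- findCorrelation(cor(X), cutoff = 0.75)
# predictors retained by caret's heuristic, for comparison with filter_vars()
colnames(X)[-drop_idx]
```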
155 | ```{r} 156 | # Filter predictors 157 | # returns name of predictors to keep, possibly still highly correlated if only 158 | # two, in this case examine how they relate to the response in making decisions 159 | filter_vars <- function(X, cor_level = 0.75,...) { 160 | XX <- cor(X) 161 | XX <- XX - diag(diag(XX)) 162 | while (any(XX > cor_level)) { 163 | if (ncol(XX) <= 2) { # prevent entering filtering loop 164 | return(colnames(XX)) 165 | } else { 166 | var_ind <- which(XX == max(XX), arr.ind = TRUE) 167 | var.1 <- row.names(var_ind)[1] 168 | var.2 <- row.names(var_ind)[2] 169 | var.1_av <- sum(XX[var.1, ])/(length(XX[var.1, ]) - 1) 170 | var.2_av <- sum(XX[var.2, ])/(length(XX[var.2, ]) - 1) 171 | if (var.1_av > var.2_av) { 172 | XX <- XX[!(row.names(XX) == var.1),!(colnames(XX) == var.1)] 173 | } else { 174 | XX <- XX[!(row.names(XX) == var.2),!(colnames(XX) == var.2)] 175 | } 176 | } 177 | } 178 | colnames(XX) 179 | } 180 | 181 | filter_vars(X) # works... 182 | 183 | # PCA 184 | pca_object <- prcomp(X, center = TRUE, scale. = TRUE) 185 | percent_variance <- pca_object$sdev^2/sum(pca_object$sdev^2)*100 186 | percent_variance # in agreement with the filtering method that there are really only two pieces of information 187 | ``` 188 | 189 | 190 | -------------------------------------------------------------------------------- /measuring-predictor-importance/churn_data_181.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Ex18.1 - Measuring predictor importance: churn data set" 3 | author: "Oisin Fitzgerald" 4 | output: 5 | html_document: 6 | keep_md: true 7 | --- 8 | 9 | The “churn” data set was developed to predict telecom customer churn based on 10 | information about their account. It contains 20 variables, with the 19 11 | predictors including continuous and factor variables that describe an individual 12 | account. The response churn has two levels "yes" and "no". This script demonstrates 13 | methods for examining categorical and continuous predictor importance for 14 | classification problems. 15 | 16 | (a) Examining the correlation between predictors 17 | ------------------------------------------------ 18 | ```{r} 19 | # load packages 20 | library(AppliedPredictiveModeling) 21 | library(C50) 22 | library(corrplot) 23 | library(caret) 24 | library(CORElearn) 25 | library(ggplot2) 26 | library(pROC) 27 | 28 | # load data 29 | data(churn) 30 | str(churnTrain) 31 | 32 | # correlation between the continuous variables 33 | numeric_vars <- sapply(churnTrain, is.numeric) 34 | corrs <- cor(churnTrain[numeric_vars]) 35 | corrplot(corrs, method = "number", tl.cex = 0.75) 36 | ``` 37 | 38 | Finding perfect collinearity between the four pairs of total charge and 39 | total minutes variables is of course no surprise; phone charges are set per time 40 | period! However it is somewhat surprising that there is no linear relation 41 | between any other pairs of variables. A scatterplot matrix of a subset of 42 | the continuous variables reveals the extent of any pairwise relationship and 43 | how certain variables contain numerous zeros. 
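One way to act on the "make conditional on churn" note in the next chunk is to colour the points by the response; a minimal sketch (the column selection mirrors the chunk below):

```{r, eval = FALSE}
# colour the scatterplot matrix by churn status (base graphics)
pairs(churnTrain[numeric_vars][1:4],
      col = ifelse(churnTrain$churn == "yes", "red", "grey40"),
      main = "Scatterplot Matrix by churn")
```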
44 | 45 | ```{r, echo=FALSE} 46 | pairs(~ ., data = churnTrain[numeric_vars][1:4], main="Scatterplot Matrix") 47 | # make conditional on churn 48 | ``` 49 | 50 | (b) Assessing the importance of categorical predictors 51 | ------------------------------------------------------------------------------- 52 | Odds ratios, Fisher's exact test and chi-square tests provide methods to examine 53 | the extent of association between factor levels and the response categories. 54 | Fisher's exact test is considered more reliable than chi-square; however, it is more 55 | computationally intensive. 56 | ```{r} 57 | # A function to calculate chi-square, odds ratios and fisher's exact test 58 | association_tests <- function(x, y) { # x is predictors, y is response 59 | 60 | x <- x[ ,sapply(x, is.factor)] 61 | n <- length(x) 62 | names <- colnames(x) 63 | 64 | out <- data.frame( 65 | chisq = rep(NA, n), 66 | chi.p.value = rep(NA, n), 67 | odds.ratio = rep(NA, n), 68 | fisher.p.value = rep(NA, n)) 69 | 70 | for (i in 1:n) { 71 | row.names(out)[i] <- names[i] 72 | if (nlevels(x[ ,i]) > 7) { 73 | fish_res <- fisher.test(x = x[ ,i], y = y, simulate.p.value = TRUE) 74 | out$fisher.p.value[i] <- fish_res$p.value 75 | } else { 76 | fish_res <- fisher.test(x = x[ ,i], y = y) 77 | out$fisher.p.value[i] <- fish_res$p.value 78 | if (nlevels(x[ ,i]) <= 2) out$odds.ratio[i] <- fish_res$estimate 79 | } 80 | 81 | chi_res <- chisq.test(x = x[ ,i], y = y, simulate.p.value = TRUE) # chisq test 82 | out$chisq[i] <- chi_res$statistic 83 | out$chi.p.value[i] <- chi_res$p.value 84 | } 85 | out 86 | } 87 | 88 | res <- association_tests(x = churnTrain[ ,1:19], y = churnTrain$churn) 89 | ``` 90 | 91 | The results suggest international plan to be an important variable, while area code 92 | shows little value as a predictor. The extremely low chi-square statistic and 93 | high p-value suggest the data may have been purposely balanced by area code. Voice 94 | mail plan and state seem to have value, though without the same strength of association 95 | as international plan. 96 | ```{r} 97 | ggplot(data = res, aes(x = chisq, y = -log(chi.p.value))) + 98 | geom_point(size = 3) + 99 | annotate("text", x = res$chisq + 10, 100 | y = -log(res$chi.p.value) -.3, label = c(row.names(res))) + 101 | labs(title = "Chi square vs. -log(p.values)") + 102 | xlim(NA, 275) + ylim(NA, 9) 103 | ``` 104 | 105 | Receiver operating characteristic (ROC) curves offer a method to examine the extent 106 | to which a predictor variable distinguishes between the two levels of a response 107 | factor, e.g. to what extent does the "account_length" variable allow us to 108 | distinguish between customers likely to churn, and those who are not. The area 109 | under the ROC curve (AUC) quantifies the ability of a predictor variable to separate 110 | the classes. 111 | The AUC leads to different conclusions than the association tests, with international 112 | plan now considered the least important variable. Area code is now ranked the second 113 | most important variable. 
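As a sanity check on what filterVarImp reports, the AUC for a single predictor can also be computed directly with pROC; a minimal sketch (using total_day_charge, a continuous predictor, since the calculation is easiest to see there):

```{r, eval = FALSE}
# AUC for one predictor computed directly; pROC picks the direction automatically
roc_day_charge <- roc(churnTrain$churn, churnTrain$total_day_charge)
auc(roc_day_charge)
```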
114 | ```{r} 115 | # Calculate the area under the ROC curve 116 | factor_pred <- churnTrain[ ,sapply(churnTrain, is.factor)] # subset the factors 117 | factor_pred$churn <- NULL 118 | auc_factors <- filterVarImp(y = churnTrain$churn, x = factor_pred) 119 | # variables ranked by auc 120 | auc_factors[order(auc_factors$yes, decreasing = TRUE), ] 121 | ``` 122 | 123 | (c) Assessing the importance of continuous predictors 124 | ------------------------------------------------------------------------------- 125 | ```{r} 126 | # create a subset of the continuous predictors 127 | cont_pred <- churnTrain[ ,sapply(churnTrain, is.numeric)] 128 | ``` 129 | 130 | Where the response is a category with two outcomes, t-tests can be used to assess 131 | the difference in the distributions of the continuous predictors by the response 132 | categories. As a signal-to-noise ratio, the t-statistic quantifies the separation in 133 | the distributions, with the associated p value indicating the extent to which this 134 | would occur based on an assumption of no difference. 135 | ```{r} 136 | get_tstats <- function(x, y) { 137 | test <- t.test(x ~ y) # Welch's t test 138 | out <- c(t_stat = test$statistic, p = test$p.value) 139 | out 140 | } 141 | 142 | t_values <- apply(cont_pred, MARGIN = 2, FUN = get_tstats, y = churnTrain$churn) 143 | t_values <- data.frame(t(t_values)) # transpose 144 | round(t_values[order(t_values$p), ], 6) 145 | ``` 146 | 147 | The AUC and t-test for the continuous predictors both agree to a large extent. 148 | They share the same top 3 predictors, and there is only slight re-shuffling 149 | otherwise. 150 | ```{r} 151 | # Calculate the area under the ROC curve 152 | auc_numeric <- filterVarImp(y = churnTrain$churn, x = cont_pred) 153 | # continuous variables ranked by AUC 154 | auc_numeric[order(auc_numeric$yes, decreasing = TRUE), ] 155 | ``` 156 | 157 | 158 | (d) Use ReliefF to jointly assess the importance of predictors 159 | ------------------------------------------------------------------------------- 160 | The Relief algorithm is another method to measure the importance of predictors 161 | for a two class response problem (although it can deal with other situations 162 | as well). It begins by randomly selecting a set of observations, R, of size m. 163 | The algorithm then evaluates each predictor in isolation by looping through each 164 | point in the random set and for each point (1) finding the two nearest observations 165 | that are a hit (i.e. share the same response class) and a miss (i.e. do not share 166 | the same response class) and (2) updating the score for that predictor, 167 | S = S - diff(R, Hit)^2/m + diff(R, Miss)^2/m. 168 | ```{r} 169 | relief_values <- attrEval(churn ~ ., data = churnTrain, 170 | estimator = "ReliefFequalK", # calculation method 171 | ReliefIterations = 50) # num iterations 172 | relief_values[order(relief_values, decreasing = TRUE)] 173 | 174 | ``` 175 | 176 | An addition to the Relief algorithm is to permute the response observations so as to 177 | gain an understanding of the predictor's score when it has no relevance. This method can be iterated several times giving a somewhat normal distribution of scores that can then be compared to the true Relief score in terms of standard 178 | deviations. This indicates how much greater the Relief score is than what could 179 | be expected by chance alone. 
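The permutation idea itself is simple enough to sketch by hand for a single iteration (a hypothetical one-off illustration; permuteRelief below repeats this nperm times and standardises the result):

```{r, eval = FALSE}
# one permutation iteration by hand: shuffle the response and rescore
perm_data <- churnTrain
perm_data$churn <- sample(perm_data$churn)
perm_scores <- attrEval(churn ~ ., data = perm_data,
                        estimator = "ReliefFequalK", ReliefIterations = 50)
# scores on shuffled labels estimate the "no relevance" baseline
sort(perm_scores, decreasing = TRUE)[1:3]
```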
180 | ```{r} 181 | relief_perm <- permuteRelief(x = churnTrain[ ,-20], y = churnTrain$churn, nperm = 500, 182 | estimator = "ReliefFequalK", 183 | ReliefIterations = 50) 184 | ``` 185 | 186 | The results suggest that total day charge (and therefore total day minutes - 187 | one is a multiple of the other) and number of customer service calls are highly 188 | important variables. International plan is also quite far from its permutation 189 | distribution mean in terms of standard deviations, putting this method more in 190 | agreement with chisq/fishers than the AUC. However, its relief score is still quite 191 | low. This may be a result of a heavy bias towards "no" international plan and "no" 192 | churn but ambiguity otherwise (poor sensitivity -> low AUC curve?). There are several predictors that appear to be without value including state, area code, total intl minutes (and charges), and total night minutes (and charges). 193 | ```{r, fig.width = 10} 194 | # Histograms of the permuted relief scores 195 | relief_values <- data.frame(Predictor = names(relief_values), 196 | value = relief_values, 197 | row.names = NULL) 198 | ggplot(data = relief_perm$permutations, aes(x = value)) + 199 | geom_histogram(binwidth = .01, colour = 1) + 200 | geom_vline(aes(xintercept = value), relief_values, colour = "red", linetype = 2) + 201 | facet_wrap(~ Predictor) + 202 | labs(title = "Relief Scores and Permutation Distributions", x = "Relief Scores") 203 | 204 | # Standard deviation of permuted distribution from non-permuted score 205 | relief_perm$standardized[order(relief_perm$standardized)] 206 | ``` 207 | 208 | 209 | 210 | -------------------------------------------------------------------------------- /linear-regression-and-cousins/IR_data_61.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Ex6.1 - Linear regression: IR spectrum of food" 3 | author: "Oisin Fitzgerald" 4 | date: "26 January 2016" 5 | output: 6 | html_document: 7 | keep_md: true 8 | --- 9 | ### The data: 10 | The data provides an infrared (IR) profile and analytical chemistry determined 11 | percent content of water, fat, and protein for meat samples. If a predictive 12 | relationship between the IR spectrum and fat content can be established, then food scientists 13 | could predict a sample’s fat content with IR instead of using analytical chemistry. 14 | 15 | ### Outline: 16 | 1. What is the relationship between the predictors? Are they highly correlated given 17 | the same food sample is measured at many IR wavelengths? 18 | 2. Create training/test split 19 | 3. Fit different models 20 | + Linear regression 21 | + Ridge regression, lasso and elastic net 22 | + PCR and PLS 23 | 4. Compare the models' predictive ability 24 | 25 | ```{r} 26 | # load data and packages 27 | library(car) 28 | library(lars) 29 | library(broom) 30 | library(reshape2) 31 | suppressMessages(library(elasticnet)) 32 | suppressMessages(library(pls)) 33 | suppressMessages(library(caret)) 34 | 35 | data(tecator) # from caret 36 | ``` 37 | 38 | ### 1. Relationship between predictors and distributions 39 | 40 | ```{r} 41 | # correlation 42 | XX <- cor(absorp) 43 | XX[1:5, 1:5] # everything is related to everything! 
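# an added quick summary: the range of all off-diagonal pairwise correlations
# (every value sits very close to 1, confirming the comment above)
range(XX[upper.tri(XX)])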
44 | 45 | # PCA 46 | pca_object <- prcomp(absorp) 47 | percent_variance <- pca_object$sdev^2/sum(pca_object$sdev^2)*100 48 | head(percent_variance) 49 | 50 | # Predictor distributions 51 | ggplot(data = data.frame(absorp)) + 52 | geom_histogram(aes(x = X1), bins = 20, col = 1) + 53 | labs(title = "Histogram of IR wavelength no. 1", 54 | x = "Wavelength predictor 1") # positive skew 55 | ``` 56 | 57 | ### 2. Create a training/test split 58 | 59 | * 75% of the data to the training set 60 | * The predictor variables show positive skew, which the Yeo-Johnson transformation 61 | (estimated lambda of -1, i.e. the reciprocal) corrected 62 | 63 | ```{r} 64 | length(endpoints[ ,1]) # how many observations? 65 | # create partition index 66 | data_split <- createDataPartition(endpoints[ ,1], p = .75) 67 | data_split <- data_split$Resample1 68 | 69 | # split data 70 | training <- absorp[data_split, ] 71 | test <- absorp[-data_split, ] 72 | train_resp <- endpoints[data_split, 2] # column 2 is fat content 73 | test_resp <- endpoints[-data_split, 2] 74 | 75 | # de-skew variables 76 | training <- data.frame(training) 77 | test <- data.frame(test) 78 | proc_object <- preProcess(training, 79 | method = c("YeoJohnson", "center", "scale")) 80 | training <- predict(proc_object, training) 81 | test <- predict(proc_object, test) 82 | ``` 83 | 84 | ### 3. Model fitting 85 | * Linear regression 86 | + Unsurprisingly, prior removal of highly correlated predictors resulted in a model 87 | with only one independent variable. The performance on cross-validation was poor. 88 | * Ridge regression 89 | + The ridge model quickly highlighted the ability to improve on the linear regression 90 | model. However, subsequent fitting of a lasso model showed that an ability to drive 91 | the coefficients to zero was an advantage in the highly correlated predictor environment. 92 | * The lasso and elastic net 93 | + As noted the lasso model outperformed the ridge model. The optimal solution resulted 94 | in a large number of the coefficients being shrunk to zero 95 | + Enet performed similarly to the lasso, with the best performing model having a 96 | low lambda for the ridge function 97 | * Principal components and partial least squares regression 98 | + These both performed quite well. The similarity of the PCR model to the PLS 99 | models is likely related to the variance in the predictors (IR response) very much 100 | being a consequence of the variance in the response (food fat content), thus the 101 | unsupervised nature of PCA causing little detriment. 102 | + The number of principal components was tuned rather than using the first two, 103 | or the first few that explained 90% of variance etc. (see the quick check below) 
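For reference, the cumulative variance from the PCA in section 1 shows how aggressive a "90% of variance" rule would have been here (a sketch, assuming percent_variance is still in scope):

```{r, eval = FALSE}
# cumulative % variance by component; the first component dominates
cumsum(percent_variance)[1:5]
```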
104 | 105 | ```{r} 106 | ctrl <- trainControl(method = "cv", number = 5) 107 | # Linear regression 108 | mc <- findCorrelation(cor(training), cutoff = 0.95) 109 | training_linear <- data.frame(training[ ,-mc]) 110 | # colnames(training_linear) <- "X1" 111 | linear_model <- train(y = train_resp, 112 | x = training_linear, 113 | method = "lm", 114 | trControl = ctrl) 115 | linear_model 116 | 117 | # Ridge Regression - penalise square of coefficient 118 | ridge_model <- train(y = train_resp, 119 | x = training, 120 | method = "ridge", 121 | trControl = ctrl, 122 | tuneLength = 10) 123 | ridge_model 124 | plot(ridge_model) 125 | 126 | # Lasso - penalise absolute value of coefficient 127 | lasso_grid <- expand.grid(.fraction = seq(0.001, 0.1, 0.01)) 128 | lasso_model <- train(y = train_resp, 129 | x = training, 130 | method = "lasso", 131 | trControl = ctrl, 132 | tuneGrid = lasso_grid) 133 | lasso_model 134 | plot(lasso_model) 135 | 136 | # Elastic Net - combination of ridge and lasso 137 | enet_grid <- expand.grid(.fraction = seq(0.001, 0.1, 0.01), .lambda = c(0, 0.0001, 0.001, 0.01)) 138 | enet_model <- train(y = train_resp, 139 | x = training, 140 | method = "enet", 141 | trControl = ctrl, 142 | tuneGrid = enet_grid) 143 | enet_model 144 | plot(enet_model) 145 | 146 | # PCR - principal components regression 147 | pcr_results <- list(results = data.frame(RMSE = NA, RMSE_sd = NA), final = NA) 148 | for (i in 1:20) { 149 | # fit model 150 | train_data <- princomp(training)$scores[ ,1:i] 151 | train_data <- data.frame(train_data) 152 | pcr_model <- train(y = train_resp, 153 | x = train_data, 154 | method = "lm", 155 | trControl = ctrl) 156 | 157 | # extract results 158 | pcr_results$results[i, 1] <- pcr_model$results$RMSE 159 | pcr_results$results[i, 2] <- pcr_model$results$RMSESD 160 | 161 | # extract model 162 | if (all(pcr_model$results$RMSE <= pcr_results$results$RMSE)) { 163 | pcr_results$final <- pcr_model 164 | } 165 | } 166 | pcr_results 167 | 168 | 169 | # PLS 170 | pls_grid <- expand.grid(.ncomp = seq(10, 20, 1)) 171 | pls_model <- train(y = train_resp, 172 | x = training, 173 | method = "pls", 174 | trControl = ctrl, 175 | preProcess = c("center", "scale"), 176 | tuneGrid = pls_grid) 177 | pls_model 178 | ``` 179 | 180 | ### 4. Compare performance on test set 181 | * The results from fitting on the test set followed from the cross-validation 182 | included in model fitting. Linear regression did very poorly, with ridge regression 183 | slightly worse off than the group of lasso, elastic net, PCR and PLS. 184 | * In context the RMSE and correlation between predicted and observed results are 185 | superb, and surely suggest that any of these models could be used in measuring the 186 | fat content of food using infrared. 187 | * Given the similarities in the model performances I was interested in constructing 188 | confidence intervals around the RMSE. A function to calculate a bootstrap 189 | estimate and its results are shown below. 
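The manual RMSE expressions in the next chunk are just the usual root-mean-square error; a tiny helper makes that explicit (an added convenience, equivalent to caret::RMSE up to NA handling):

```{r, eval = FALSE}
# root-mean-square error, written out once instead of inline each time
rmse <- function(obs, pred) sqrt(mean((obs - pred)^2))
```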
190 | 191 | ```{r} 192 | # Linear regression 193 | test_linear <- data.frame(test[ ,-mc]) 194 | colnames(test_linear) <- colnames(training_linear) 195 | linear_pred <- predict(linear_model, test_linear) 196 | ggplot() + geom_point(aes(x = linear_pred, y = test_resp)) 197 | 198 | n <- length(test_resp) 199 | RMSE_lm <- sqrt(sum((test_resp - linear_pred)^2)/n); RMSE_lm 200 | 201 | # Ridge regression 202 | ridge_preds <- predict(ridge_model, test) 203 | ggplot() + geom_point(aes(x = ridge_preds, y = test_resp)) 204 | 205 | RMSE_ridge <- sqrt(sum((test_resp - ridge_preds)^2)/n); RMSE_ridge 206 | 207 | # Lasso 208 | lasso_preds <- predict(lasso_model, test) 209 | ggplot() + geom_point(aes(x = lasso_preds, y = test_resp)) 210 | 211 | RMSE_lasso <- sqrt(sum((test_resp - lasso_preds)^2)/n); RMSE_lasso 212 | 213 | # Elastic net 214 | enet_preds <- predict(enet_model, test) 215 | ggplot() + geom_point(aes(x = enet_preds, y = test_resp)) 216 | 217 | RMSE_enet <- sqrt(sum((test_resp - enet_preds)^2)/n); RMSE_enet 218 | 219 | # PCR 220 | pca_train <- princomp(training) 221 | test_pcs <- predict(pca_train, test) 222 | pcr_preds <- predict(pcr_results$final, test_pcs) 223 | ggplot() + geom_point(aes(x = pcr_preds, y = test_resp)) 224 | 225 | RMSE_pcr <- sqrt(sum((test_resp - pcr_preds)^2)/n); RMSE_pcr 226 | 227 | # PLS 228 | pls_preds <- predict(pls_model, test) 229 | ggplot() + geom_point(aes(x = pls_preds, y = test_resp)) 230 | 231 | RMSE_pls <- sqrt(sum((test_resp - pls_preds)^2)/n); RMSE_pls 232 | cor(pls_preds, test_resp) 233 | ``` 234 | 235 | ### 4. Compare performance on test set contd. 236 | * Bootstrap estimate of RMSE confidence interval 237 | + PLS appears to be the prefered model, it shows the least variation in its RMSE scores 238 | across the bootstrap samples. PCR is likely similar, an issue with variable naming meant 239 | I excluded it. 
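The function below uses the percentile method: resample the test set, recompute the RMSE each time, and read the interval off the empirical quantiles. In miniature (a toy illustration with made-up numbers):

```{r, eval = FALSE}
# percentile interval from a hypothetical vector of resampled RMSE values
toy_rmse <- rnorm(1000, mean = 2, sd = 0.2)
quantile(toy_rmse, probs = c(0.025, 0.975))
```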
240 | 241 | ```{r} 242 | bootstrap_RMSE <- function(model, data, obs, trials = 1000, CI = 0.95) { 243 | 244 | n <- nrow(data) 245 | out <- list(results = data.frame(RMSE = NA), lower = NA, upper = NA) 246 | 247 | for (i in 1:trials) { 248 | # create bootstrap sample 249 | samp <- sample(n, size = n, replace = TRUE) 250 | boot_obs <- obs[samp] 251 | boot_data <- data.frame(data[samp, ]) 252 | colnames(boot_data) <- colnames(data) 253 | # predict 254 | preds <- predict(model, newdata = boot_data) 255 | RMSE <- sqrt(sum((boot_obs - preds)^2)/n) 256 | 257 | out$results[i, 1] <- RMSE 258 | } 259 | 260 | temp <- out$results$RMSE 261 | temp <- quantile(temp, probs = c((1 - CI)/2, 1 - (1 - CI)/2), na.rm = TRUE) 262 | 263 | out$lower <- temp[1] 264 | out$upper <- temp[2] 265 | 266 | out 267 | } 268 | ``` 269 | 270 | 271 | ```{r} 272 | # The bootstrap results 273 | bRMSE_lm <- bootstrap_RMSE(linear_model, test_linear, test_resp) 274 | bRMSE_ridge <- bootstrap_RMSE(ridge_model, test, test_resp) 275 | bRMSE_lasso <- bootstrap_RMSE(lasso_model, test, test_resp) 276 | bRMSE_enet <- bootstrap_RMSE(enet_model, test, test_resp) 277 | # bRMSE_pcr <- bootstrap_RMSE(pcr_model, test, test_resp) 278 | bRMSE_pls <- bootstrap_RMSE(pls_model, test, test_resp) 279 | 280 | model_results <- data.frame(bRMSE_lm$results, bRMSE_ridge$results, 281 | bRMSE_lasso$results, bRMSE_enet$results, bRMSE_pls$results) 282 | colnames(model_results) <- c('lm', 'ridge', 'lasso', 'enet', 'pls') 283 | 284 | temp <- melt(model_results) 285 | 286 | ggplot(data = temp, aes(x = variable, y = value)) + 287 | geom_boxplot(width = 0.5) + 288 | theme_bw() + 289 | labs(title = 'Bootstrap Estimates of Model Performance', 290 | x = 'Model', 291 | y = 'RMSE') 292 | ``` 293 | 294 | ### Conclusion 295 | 296 | * The clear signal in the data meant that despite multicollinearity issues, several 297 | linear model fitting methods had no problem producing extremely predictive models. 298 | * The predictors were highly correlated and likely possessed variations on the same 299 | information. Therefore, possibly as a result of their ability to extract the low 300 | dimensional signal from several correlated variables, PCR and PLS appear to have a slight 301 | performance advantage over other models. 302 | -------------------------------------------------------------------------------- /measuring-predictor-importance/abalone_data_183.md: -------------------------------------------------------------------------------- 1 | # Ex 18.3 Measuring predictor importance: abalone data set 2 | Oisin Fitzgerald 3 | The UCI Abalone data (http://archive.ics.uci.edu/ml/datasets/Abalone) 4 | consist of data from 4,177 abalones (sea snails). The data contain measurements 5 | of the type (male, female, and infant), the longest shell measurement, the diameter, 6 | height, and several weights (whole, shucked, viscera, and shell). The outcome 7 | is the number of rings. 8 | This script: 9 | 1. Visualises how the predictors relate to the response 10 | 2. Visualises how the predictors relate to each other 11 | 3. Evaluates predictor importance based on several methods 12 | 4. Filters redundant predictors and uses PCA to create a set of orthogonal predictors 13 | 14 | 15 | ```r 16 | library(AppliedPredictiveModeling) 17 | data(abalone) 18 | str(abalone) 19 | ``` 20 | 21 | ``` 22 | ## 'data.frame': 4177 obs. of 9 variables: 23 | ## $ Type : Factor w/ 3 levels "F","I","M": 3 3 1 3 2 2 1 1 3 1 ... 24 | ## $ LongestShell : num 0.455 0.35 0.53 0.44 0.33 0.425 0.53 0.545 0.475 0.55 ... 
25 | ## $ Diameter : num 0.365 0.265 0.42 0.365 0.255 0.3 0.415 0.425 0.37 0.44 ... 26 | ## $ Height : num 0.095 0.09 0.135 0.125 0.08 0.095 0.15 0.125 0.125 0.15 ... 27 | ## $ WholeWeight : num 0.514 0.226 0.677 0.516 0.205 ... 28 | ## $ ShuckedWeight: num 0.2245 0.0995 0.2565 0.2155 0.0895 ... 29 | ## $ VisceraWeight: num 0.101 0.0485 0.1415 0.114 0.0395 ... 30 | ## $ ShellWeight : num 0.15 0.07 0.21 0.155 0.055 0.12 0.33 0.26 0.165 0.32 ... 31 | ## $ Rings : int 15 7 9 10 7 8 20 16 9 19 ... 32 | ``` 33 | 34 | ```r 35 | head(abalone) 36 | ``` 37 | 38 | ``` 39 | ## Type LongestShell Diameter Height WholeWeight ShuckedWeight 40 | ## 1 M 0.455 0.365 0.095 0.5140 0.2245 41 | ## 2 M 0.350 0.265 0.090 0.2255 0.0995 42 | ## 3 F 0.530 0.420 0.135 0.6770 0.2565 43 | ## 4 M 0.440 0.365 0.125 0.5160 0.2155 44 | ## 5 I 0.330 0.255 0.080 0.2050 0.0895 45 | ## 6 I 0.425 0.300 0.095 0.3515 0.1410 46 | ## VisceraWeight ShellWeight Rings 47 | ## 1 0.1010 0.150 15 48 | ## 2 0.0485 0.070 7 49 | ## 3 0.1415 0.210 9 50 | ## 4 0.1140 0.155 10 51 | ## 5 0.0395 0.055 7 52 | ## 6 0.0775 0.120 8 53 | ``` 54 | 55 | ```r 56 | library(ggplot2) 57 | library(tidyr) 58 | library(scales) 59 | library(corrplot) 60 | library(CORElearn) 61 | library(car) 62 | library(minerva) 63 | suppressMessages(library(caret)) 64 | suppressMessages(library(pROC)) 65 | ``` 66 | 67 | ### 1. How do the predictors relate to the number of rings? 68 | Visually displaying the data shows some clear relationships and also outlying 69 | values. In the plots of rings vs. the continouos variables there are both linear 70 | (e.g. diameter) and non-linear (e.g. the weight variables) patterns. The similar 71 | shapes of some of the point clouds are suggestive that some of the variables likely 72 | contain the same information. Of course this makes sense given likely strong 73 | relationships between the various weight and length variables. Height shows two 74 | outlying points with values about 4 to 10 times greater than normal, suggesting 75 | they may be incorrectly entered values. The boxplots show an expected pattern, 76 | with infants having less rings than adults. 77 | 78 | ```r 79 | # format data for plotting 80 | gg_data <- gather(abalone, Rings) 81 | ``` 82 | 83 | ``` 84 | ## Warning: attributes are not identical across measure variables; they will 85 | ## be dropped 86 | ``` 87 | 88 | ```r 89 | names(gg_data) <- c("Rings", "variable", "value") 90 | gg_data <- subset(gg_data, gg_data$variable != "Type") 91 | gg_data$value <- as.numeric(gg_data$value) 92 | # scatter plots for continuous variables 93 | ggplot(aes(x = value, y = Rings), data = gg_data) + 94 | geom_point() + 95 | facet_wrap(~variable, scales = "free_x") + 96 | scale_x_continuous(breaks = pretty_breaks(n = 8)) 97 | ``` 98 | 99 | ![](abalone_data_183_files/figure-html/unnamed-chunk-2-1.png) 100 | 101 | 102 | ```r 103 | # boxplot for Type variable 104 | ggplot(aes(x = Type, y = Rings), data = abalone) + 105 | geom_boxplot() 106 | ``` 107 | 108 | ![](abalone_data_183_files/figure-html/unnamed-chunk-3-1.png) 109 | 110 | ### 2. How do the predictors relate to each other? 111 | The car packages amazing function car::scatterplotMatrix shows clear relationships 112 | between the variables. This further emphasised by the correlation plot. There are 113 | clearly near linear dependencies in the data. 
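Those near-linear dependencies can also be checked numerically: eigenvalues of the correlation matrix near zero flag near-redundant directions. A sketch, reusing the numeric matrix X that the next chunk constructs:

```r
# eigenvalues of the predictor correlation matrix; values near zero
# indicate directions that are almost linear combinations of the others
eigen(cor(X), only.values = TRUE)$values
```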
114 | 115 | ```r 116 | X <- abalone[ , sapply(abalone, is.numeric) ] 117 | X <- X[ ,-8] # remove Rings 118 | 119 | # matrix scatter plots 120 | scatterplotMatrix(X, smoother = FALSE, reg.line = FALSE) 121 | ``` 122 | 123 | ![](abalone_data_183_files/figure-html/unnamed-chunk-4-1.png) 124 | 125 | ```r 126 | # LOESS fit 127 | loess_results <- filterVarImp(x = X, y = abalone$Rings, nonpara = TRUE) 128 | loess_results 129 | ``` 130 | 131 | ``` 132 | ## Overall 133 | ## LongestShell 0.3099367 134 | ## Diameter 0.3302339 135 | ## Height 0.3107698 136 | ## WholeWeight 0.3454217 137 | ## ShuckedWeight 0.2691416 138 | ## VisceraWeight 0.3248622 139 | ## ShellWeight 0.4224143 140 | ``` 141 | 142 | ```r 143 | # correlations 144 | XX <- cor(X) 145 | corrplot(XX, "number", tl.cex = 0.7) 146 | ``` 147 | 148 | ![](abalone_data_183_files/figure-html/unnamed-chunk-4-2.png) 149 | 150 | ### 3. Predictor importance scores 151 | A downside of all the measure used in this section is that they soley reveal bivariate 152 | relationships. We cannot know for example if ther interaction of two predictors is 153 | an important term to include in any model. Regardless the various measures of linear, 154 | rank, and information provide a useful to gauge the likely importance of a variable in 155 | improving the predictive ability of a model (i.e. screening!). 156 | Pearson's *r* provides a measure of the linear relationships between two variables 157 | while Spearman's *rho* is the rank correlation between the variable and so is better 158 | suited to picking up non-linear relationships. All variable have a greater Spearman's 159 | *rho* than Pearson's *r* suggesting future model sshould take into account this non-linearity. 160 | The ANOVA and pariwise t-tests confirm what the boxplot showed: that infants are 161 | most different from the other groups in number of rings. 162 | 163 | ```r 164 | # linear correlations 165 | pearsonsR <- apply(X, MARGIN = 2, FUN = cor, y = abalone$Rings, method = "pearson") 166 | pearsonsR 167 | ``` 168 | 169 | ``` 170 | ## LongestShell Diameter Height WholeWeight ShuckedWeight 171 | ## 0.5567196 0.5746599 0.5574673 0.5403897 0.4208837 172 | ## VisceraWeight ShellWeight 173 | ## 0.5038192 0.6275740 174 | ``` 175 | 176 | ```r 177 | # rank correlations 178 | spearmansRho <- apply(X, MARGIN = 2, FUN = cor, y = abalone$Rings, method = "spearman") 179 | spearmansRho 180 | ``` 181 | 182 | ``` 183 | ## LongestShell Diameter Height WholeWeight ShuckedWeight 184 | ## 0.6043853 0.6228950 0.6577164 0.6308320 0.5394200 185 | ## VisceraWeight ShellWeight 186 | ## 0.6143438 0.6924746 187 | ``` 188 | 189 | ```r 190 | # ANOVA and t tests (Type variable) 191 | anova(lm(Rings ~ Type, data = abalone)) 192 | ``` 193 | 194 | ``` 195 | ## Analysis of Variance Table 196 | ## 197 | ## Response: Rings 198 | ## Df Sum Sq Mean Sq F value Pr(>F) 199 | ## Type 2 8381 4190.6 499.33 < 2.2e-16 *** 200 | ## Residuals 4174 35030 8.4 201 | ## --- 202 | ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 
0.1 ' ' 1 203 | ``` 204 | 205 | ```r 206 | pairwise.t.test(abalone$Rings, abalone$Type, pool.sd = FALSE) 207 | ``` 208 | 209 | ``` 210 | ## 211 | ## Pairwise comparisons using t tests with non-pooled SD 212 | ## 213 | ## data: abalone$Rings and abalone$Type 214 | ## 215 | ## F I 216 | ## I < 2e-16 - 217 | ## M 0.00025 < 2e-16 218 | ## 219 | ## P value adjustment method: holm 220 | ``` 221 | 222 | The maximal information coefficient (MIC) is an information theory based measure 223 | of the strength of linear and/or non-linear relationship between two variables. It 224 | bins continuous variables in such a way as to maximise the mutual information, the 225 | amount of information you gain about the likely value of one variable given the 226 | value of another. The results suggest that all variables are moderately related to 227 | thee number fo rings. MIC minus R^2 is suggested as a measure of the degree of 228 | non-linearity in the relationship, all values of this measure are close to zero 229 | implying non-linear relationships. 230 | 231 | ```r 232 | # MIC 233 | mic_values <- mine(x = X, y = abalone$Rings) 234 | mic_values$MIC 235 | ``` 236 | 237 | ``` 238 | ## Y 239 | ## LongestShell 0.3546951 240 | ## Diameter 0.3652849 241 | ## Height 0.3449086 242 | ## WholeWeight 0.3539886 243 | ## ShuckedWeight 0.3153369 244 | ## VisceraWeight 0.3510614 245 | ## ShellWeight 0.3866459 246 | ``` 247 | 248 | The RReliefF algorithm is an adaption of ReliefF to a regression setting 249 | (Robnik-Šikonja & Kononenko, 1997). It is a measure of how likely nearby instances 250 | of randomly selected observations are to give a similar prediction in the response. 251 | It can be combined with a permutation test to give an indication of how much the 252 | results differ from chance. 253 | 254 | ```r 255 | #RreliefF (optimistic!!) 256 | relief_values <- attrEval(abalone$Rings ~ ., data = X, 257 | estimator = "RReliefFbestK", # calculation method 258 | ReliefIterations = 50) # num iteration 259 | relief_values <- data.frame(Predictor = names(relief_values), 260 | value = relief_values, 261 | row.names = NULL) 262 | 263 | # RreliefF permutation test 264 | relief_perm <- permuteRelief(x = X, y = abalone$Rings, nperm = 500, 265 | estimator = "RReliefFbestK", 266 | ReliefIterations = 50) 267 | 268 | # standard deviations from permutation score distribution 269 | relief_perm$standardized[order(relief_perm$standardized)] 270 | ``` 271 | 272 | ``` 273 | ## ShuckedWeight WholeWeight VisceraWeight LongestShell Diameter 274 | ## 3.929672 5.283250 5.732581 5.795141 6.438950 275 | ## ShellWeight Height 276 | ## 9.404226 13.175797 277 | ``` 278 | 279 | ### 4. Filters redundant predictors and create a set of non-redundant Principal component analysis 280 | Given the relationships between the variables this function filters out highly 281 | correlated variable leaving a reduced set. It follows a heuristic algorithm in 282 | Kuhn and Johnson's book in removing from a pair that variable most related to the 283 | other variables. PCA is then performed, with the first two principal components 284 | accounting for 90% of the variance. The filter method at *r* = 0.75 also leads to 285 | the conclusion there are only two non-redundant "pieces of information" in this data set. 
286 | 287 | ```r 288 | # Filter predictors 289 | # returns name of predictors to keep, possibly still highly correlated if only 290 | # two, in this case examine how they relate to the response in making decisions 291 | filter_vars <- function(X, cor_level = 0.75,...) { 292 | XX <- cor(X) 293 | XX <- XX - diag(diag(XX)) 294 | while (any(XX > cor_level)) { 295 | if (ncol(XX) <= 2) { # prevent entering filtering loop 296 | return(colnames(XX)) 297 | } else { 298 | var_ind <- which(XX == max(XX), arr.ind = TRUE) 299 | var.1 <- row.names(var_ind)[1] 300 | var.2 <- row.names(var_ind)[2] 301 | var.1_av <- sum(XX[var.1, ])/(length(XX[var.1, ]) - 1) 302 | var.2_av <- sum(XX[var.2, ])/(length(XX[var.2, ]) - 1) 303 | if (var.1_av > var.2_av) { 304 | XX <- XX[!(row.names(XX) == var.1),!(colnames(XX) == var.1)] 305 | } else { 306 | XX <- XX[!(row.names(XX) == var.2),!(colnames(XX) == var.2)] 307 | } 308 | } 309 | } 310 | colnames(XX) 311 | } 312 | 313 | filter_vars(X) # works... 314 | ``` 315 | 316 | ``` 317 | ## [1] "Height" "ShuckedWeight" 318 | ``` 319 | 320 | ```r 321 | # PCA 322 | pca_object <- prcomp(X, center = TRUE, scale. = TRUE) 323 | percent_variance <- pca_object$sd^2/sum(pca_object$sd^2)*100 324 | percent_variance # in agreement with filtering method that there are really 325 | ``` 326 | 327 | ``` 328 | ## [1] 90.78731479 3.99189090 2.39063820 1.62959779 0.92362741 0.18182994 329 | ## [7] 0.09510098 330 | ``` 331 | 332 | 333 | -------------------------------------------------------------------------------- /classification-tree-and-rule-based-models/classification_trees_rules_142.R: -------------------------------------------------------------------------------- 1 | # Classification Trees and Rule Based Models 2 | # 3 | # Script based on Ex 14.2 of Applied Predictive Modeling (Kuhn & Johnson, 2013) 4 | # 5 | # AIM: predict whether a customer is likely to leave a telecomm company based 6 | # on 19 variables 7 | # 8 | # In particular answering: 9 | # a) Impact of grouped vs. binary dummy encoding of the state categorical 10 | # variable 11 | # b) Which classification technique works best of (gini fit) classificaiton 12 | # trees, PART and C50 rule based model and bagged trees 13 | # 14 | # Load data and packages ====================================================== 15 | library(C50) 16 | library(caret) 17 | library(rpart) 18 | library(partykit) 19 | library(tree) 20 | library(pROC) 21 | library(ROCR) 22 | library(randomForest) 23 | library(RWeka) 24 | 25 | data(churn) # load data 26 | 27 | str(churnTrain) 28 | head(churnTrain) 29 | table(churnTrain$churn) # outcome variable category frequencies 30 | prop.table(table(churnTrain$churn)) # outcome variable category probs 31 | 32 | # Basic tree models =========================================================== 33 | # Classification tree model 34 | tree_model <- rpart(churn ~ ., data = churnTrain) # classification tree 35 | print(tree_model) # decision nodes/splits 36 | 37 | # Visualise unpruned tree (default method) 38 | plot(tree_model, branch = 0.3, compress = TRUE, xpd = NA) # plot the decision tree 39 | text(tree_model, use.n = TRUE, cex = 0.7) # nodes info 40 | 41 | # Alternative visualisation no. 1 42 | party_tree <- as.party(tree_model) 43 | plot(party_tree, type = "simple", gp = gpar(cex = 0.5)) 44 | 45 | # Alternative visualisation no. 2 (not working - too many states?) 
# draw.tree(party_tree, cex = 0.5, nodeinfo = TRUE, col = (0:8 / 8))


# Prune tree model ------------------------------------------------------------
plotcp(tree_model)   # plot of complexity parameter info
printcp(tree_model)  # variables used and complexity parameter info
# from printcp the optimal tree size is 8 leaves (cp = 0.035)
tree_model_final <- prune(tree_model, cp = 0.035)

# Visualise final (pruned) model
party_tree_final <- as.party(tree_model_final)
plot(party_tree_final, type = "simple", gp = gpar(cex = 0.6))
plot(party_tree_final, gp = gpar(cex = 0.6))  # nice plot of the tree


# Evaluate on the test data ---------------------------------------------------
tree_model_pred <- predict(tree_model_final, churnTest)  # predicted class probabilities
temp <- ifelse(tree_model_pred[ ,1] < .5, "no", "yes")   # column 1 is P(churn = "yes")
tree_model_pred <- data.frame(tree_model_pred, prediction = as.factor(temp))
tree_confusion <- table(tree_model_pred$prediction, churnTest$churn)
tree_confusion  # print confusion matrix
prop.table(tree_confusion)
(141+1415)/1667  # 93.34% of observations are correctly classified

# ROC curve and AUC with the pROC package
roc_tree <- roc(response = churnTest$churn,
                predictor = tree_model_pred$yes, plot = TRUE)  # ROC curve (sens/spec)
plot(smooth(roc_tree), identity = FALSE)
abline(a = 1, b = -1, col = 'red', lty = 2)  # chance line: sens = 1 - spec on pROC's axes
ci.sp(roc_tree)  # CI on specificity
ci.se(roc_tree)  # CI on sensitivity

# ROC curve with the ROCR package
rocplot <- function(pred, truth, ...) {  # function to create an ROC curve
  predob <- prediction(pred, truth)
  perf <- performance(predob, "tpr", "fpr")
  plot(perf, ...)
}
par(pty = "s")  # ensure square graph
rocplot(tree_model_pred$yes, churnTest$churn)
abline(a = 0, b = 1, col = 'red', lty = 2)  # chance line

# AUC
auc(churnTest$churn, tree_model_pred$yes)


# Classification tree using caret::train ---------------------------------------
ctrl <- trainControl(method = "cv", number = 10, classProbs = TRUE)  # 10 fold cv

caret_tree <- train(x = churnTrain[ ,1:19], y = churnTrain$churn,  # note x, y entry
                    method = 'rpart',
                    tuneLength = 30,
                    trControl = ctrl)
caret_tree

# Visualise pruned caret tree
caret_tree_party <- as.party(caret_tree$finalModel)
plot(caret_tree_party, gp = gpar(cex = 0.5))  # nice plot of the tree


# Evaluate caret tree on test set ---------------------------------------------
caret_tree_preds <- predict(caret_tree, churnTest, type = "prob")
temp <- ifelse(caret_tree_preds[ ,1] < .5, "no", "yes")
caret_tree_preds <- data.frame(caret_tree_preds, prediction = as.factor(temp))
caret_tree_confusion <- table(caret_tree_preds$prediction, churnTest$churn)
caret_tree_confusion
prop.table(caret_tree_confusion)
(138+1426)/1667  # 93.82% of observations are correctly classified

# ROC curves for the caret and rpart predictions
rocplot(tree_model_pred$yes, churnTest$churn)
rocplot(caret_tree_preds$yes, churnTest$churn, add = TRUE, col = "blue")
abline(a = 0, b = 1, col = 'red', lty = 2)
legend("bottomright", c("caret model", "rpart model"),
       lty = 1,
       col = c("blue", "black"),
       cex = 0.75)

# Slight difference in predictive accuracy of the caret and rpart models.
# The caret model was pruned using accuracy, so this may be the reason.

# Category encoding ===========================================================
# A look at independent vs. grouped categories for the state variable
# i.e. create (levels - 1) dummy variables for state and refit the decision
# trees

# Create dummy variables and add to the (new) dataframe(s)
dmy <- dummyVars( ~ state, data = churnTrain, fullRank = TRUE)
churnTrain_dmy <- data.frame(churnTrain[ ,-1], (predict(dmy, churnTrain)))
churnTest_dmy <- data.frame(churnTest[ ,-1], (predict(dmy, churnTest)))

# Fit and prune a decision tree using caret
str(churnTrain_dmy)
caret_dmytree <- train(x = churnTrain_dmy[ ,-19], y = churnTrain_dmy$churn,  # col 19 is churn
                       method = 'rpart',
                       tuneLength = 30,
                       trControl = ctrl,
                       metric = "Accuracy")
caret_dmytree

# Visualise pruned dummy variable model
caret_dmytree_party <- as.party(caret_dmytree$finalModel)
plot(caret_dmytree_party, gp = gpar(cex = 0.5))  # nice plot of the tree


# Evaluate dummy encoded caret tree on test set --------------------------------
caret_dmytree_preds <- predict(caret_dmytree, churnTest_dmy, type = "prob")
temp <- ifelse(caret_dmytree_preds[ ,1] < .5, "no", "yes")
caret_dmytree_preds <- data.frame(caret_dmytree_preds, prediction = as.factor(temp))
caret_dmytree_confusion <- table(caret_dmytree_preds$prediction, churnTest$churn)
caret_dmytree_confusion
prop.table(caret_dmytree_confusion)
(142+1431)/1667  # 94.36% of observations are correctly classified

# ROC curves for the caret, rpart and dummy variable predictions
rocplot(tree_model_pred$yes, churnTest$churn)
rocplot(caret_tree_preds$yes, churnTest$churn, add = TRUE, col = "blue")
rocplot(caret_dmytree_preds$yes, churnTest$churn, add = TRUE, col = "orange")
abline(a = 0, b = 1, col = 'red', lty = 2)
legend("bottomright", c("caret model", "rpart model", "dummy var model"), lty = 1,
       col = c("blue", "black", "orange"), cex = 0.65)

# AUC
auc(churnTest$churn, caret_dmytree_preds$yes)


# Bagged tree model ============================================================
# Fit a bagged decision tree
# A random forest with mtry = p is equivalent to bagging
bagged_tree <- randomForest(x = churnTrain[ ,1:19], y = churnTrain$churn,
                            mtry = 19, importance = TRUE)
bagged_tree


# Using caret ------------------------------------------------------------------
ctrl2 <- trainControl("cv", number = 10)
bagged_tree2 <- train(x = churnTrain[ ,1:19], y = churnTrain$churn, method = "rf",
                      importance = TRUE, tuneGrid = expand.grid(mtry = 19),
                      trControl = ctrl2, do.trace = TRUE)
bagged_tree2


# Variable importance ----------------------------------------------------------
# MeanDecreaseAccuracy: mean decrease of accuracy in predictions on the out
# of bag samples when a given variable is excluded from the model.
# MeanDecreaseGini: measure of the total decrease in node impurity that results
# from splits over that variable, averaged over all trees
varImpPlot(bagged_tree)


# Evaluate bagged decision tree on test data -----------------------------------
bagged_tree2_preds <- predict(bagged_tree2, churnTest, type = "prob")
temp <- ifelse(bagged_tree2_preds[ ,1] < .5, "no", "yes")
bagged_tree2_preds <- data.frame(bagged_tree2_preds, prediction = as.factor(temp))
bagged_tree2_confusion <- table(bagged_tree2_preds$prediction, churnTest$churn)
bagged_tree2_confusion
prop.table(bagged_tree2_confusion)
(188+1320)/1667  # 90.46% accuracy

# ROC curves
par(pty = "s")  # ensure square graph
rocplot(tree_model_pred$yes, churnTest$churn)  # plot ROC curves
rocplot(caret_tree_preds$yes, churnTest$churn, add = TRUE, col = "blue")
rocplot(caret_dmytree_preds$yes, churnTest$churn, add = TRUE, col = "orange")
rocplot(bagged_tree2_preds$yes, churnTest$churn, add = TRUE, col = "green")
abline(a = 0, b = 1, col = 'red', lty = 2)
legend("bottomright", c("caret model", "rpart model", "dummy var model",
                        "bagged tree model"),
       lty = 1,
       col = c("blue", "black", "orange", "green"),
       cex = 0.65)

# AUC
auc(churnTest$churn, bagged_tree2_preds$yes)


# Why is the bagged model less accurate yet higher in AUC? ---------------------
# See the histograms...
histogram(tree_model_pred$yes)
histogram(caret_dmytree_preds$yes)
histogram(caret_tree_preds$yes)
histogram(bagged_tree2_preds$yes)


# Rule based models ============================================================
# Fit PART model
part_model <- PART(churn ~ ., data = churnTrain)
part_model


# caret method ------------------------------------------------------------------
part_model.c <- train(x = churnTrain[ ,-20], y = churnTrain$churn,
                      method = 'PART',
                      tuneLength = 30,
                      trControl = ctrl,
                      metric = "Accuracy")
part_model.c


# Evaluate PART model on test data ----------------------------------------------
part_model_preds <- predict(part_model.c, churnTest, type = "prob")
temp <- ifelse(part_model_preds[ ,1] < .5, "no", "yes")
part_model_preds <- data.frame(part_model_preds, prediction = as.factor(temp))
part_model_confusion <- table(part_model_preds$prediction, churnTest$churn)
part_model_confusion
round(prop.table(part_model_confusion), 3)
(130+1376)/1667  # 90.34% accuracy

# ROC curves
par(pty = "s")  # ensure square graph
rocplot(tree_model_pred$yes, churnTest$churn)  # plot ROC curves
rocplot(caret_dmytree_preds$yes, churnTest$churn, add = TRUE, col = "orange")
rocplot(bagged_tree2_preds$yes, churnTest$churn, add = TRUE, col = "green")
rocplot(part_model_preds$yes, churnTest$churn, add = TRUE, col = "blue")
abline(a = 0, b = 1, col = 'red', lty = 2)
legend("bottomright", c("PART rule model", "rpart model", "dummy var model",
                        "bagged tree model"),
       lty = 1,
       col = c("blue", "black", "orange", "green"),
       cex = 0.65)

# AUC
auc(churnTest$churn, part_model_preds$yes)


# Fit C5.0 rule model ----------------------------------------------------------
# information theory based approach, rule voting method
C5.0_model <- C5.0(x = churnTrain[ ,-20], y = churnTrain$churn, rules = TRUE)
C5.0_model
summary(C5.0_model)

# caret version
C5.0_model.c <- train(x = churnTrain[ ,-20], y = churnTrain$churn,
                      method = "C5.0Rules",
                      trControl = ctrl)
C5.0_model.c


# Evaluate C5.0 model on test data ----------------------------------------------
C5.0_model_preds <- predict(C5.0_model.c, churnTest, type = "prob")
temp <- ifelse(C5.0_model_preds[ ,1] < .5, "no", "yes")
C5.0_model_preds <- data.frame(C5.0_model_preds, prediction = as.factor(temp))
C5.0_model_confusion <- table(C5.0_model_preds$prediction, churnTest$churn)
C5.0_model_confusion
round(prop.table(C5.0_model_confusion), 3)
(149+1428)/1667  # 94.60% accuracy

# ROC curves
par(pty = "s")  # ensure square graph
rocplot(tree_model_pred$yes, churnTest$churn)  # plot ROC curves
rocplot(caret_dmytree_preds$yes, churnTest$churn, add = TRUE, col = "orange")
rocplot(bagged_tree2_preds$yes, churnTest$churn, add = TRUE, col = "green")
rocplot(part_model_preds$yes, churnTest$churn, add = TRUE, col = "blue")
rocplot(C5.0_model_preds$yes, churnTest$churn, add = TRUE, col = "purple")
abline(a = 0, b = 1, col = 'red', lty = 2)
legend("bottomright", c("PART rule model", "rpart model", "dummy var model",
                        "bagged tree model", "C5.0 rule model"),
       lty = 1,
       col = c("blue", "black", "orange", "green", "purple"),
       cex = 0.65)

# AUC
auc(churnTest$churn, C5.0_model_preds$yes)


# Lift chart ==================================================================
prediction_models <- data.frame(rpart_tree = tree_model_pred$yes,
                                dummy_tree = caret_dmytree_preds$yes,
                                bagged_tree = bagged_tree2_preds$yes,
                                part_rules = part_model_preds$yes,
                                C5.0_rules = C5.0_model_preds$yes)
labs <- c(rpart_tree = "Grouped Categories", dummy_tree = "Binary Categories",
          bagged_tree = "Bagged Tree",
          part_rules = "PART rules",
          C5.0_rules = "C5.0 rules")
liftCurve <- lift(churnTest$churn ~ rpart_tree + dummy_tree + bagged_tree +
                    part_rules + C5.0_rules,
                  data = prediction_models,
                  labels = labs)
xyplot(liftCurve, auto.key = list(columns = 2, lines = TRUE, points = FALSE))


# NB
# rpart, C5.0, and J48 use the formula method differently from most other
# functions: they respect the categorical nature of the data and treat these
# predictors as grouped sets of categories. The caret::train function follows
# the more common convention in R, which is to create dummy variables prior
# to modeling.


--------------------------------------------------------------------------------
/measuring-predictor-importance/churn_data_181.md:
--------------------------------------------------------------------------------
# Ex18.1 - Measuring predictor importance: churn data set
Oisin Fitzgerald

The "churn" data set was developed to predict telecom customer churn based on
information about their account. It contains 20 variables, with the 19
predictors including continuous and factor variables that describe an individual
account. The response, churn, has two levels: "yes" and "no". This script
demonstrates methods for examining categorical and continuous predictor
importance for classification problems.
(a) Examining the correlation between predictors
------------------------------------------------

```r
# load packages
library(AppliedPredictiveModeling)
library(C50)
library(corrplot)
library(caret)
```

```
## Loading required package: lattice
## Loading required package: ggplot2
```

```r
library(CORElearn)
library(ggplot2)
library(pROC)
```

```
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## 
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
```

```r
# load data
data(churn)
str(churnTrain)
```

```
## 'data.frame':	3333 obs. of  20 variables:
##  $ state                        : Factor w/ 51 levels "AK","AL","AR",..: 17 36 32 36 37 2 20 25 19 50 ...
##  $ account_length               : int  128 107 137 84 75 118 121 147 117 141 ...
##  $ area_code                    : Factor w/ 3 levels "area_code_408",..: 2 2 2 1 2 3 3 2 1 2 ...
##  $ international_plan           : Factor w/ 2 levels "no","yes": 1 1 1 2 2 2 1 2 1 2 ...
##  $ voice_mail_plan              : Factor w/ 2 levels "no","yes": 2 2 1 1 1 1 2 1 1 2 ...
##  $ number_vmail_messages        : int  25 26 0 0 0 0 24 0 0 37 ...
##  $ total_day_minutes            : num  265 162 243 299 167 ...
##  $ total_day_calls              : int  110 123 114 71 113 98 88 79 97 84 ...
##  $ total_day_charge             : num  45.1 27.5 41.4 50.9 28.3 ...
##  $ total_eve_minutes            : num  197.4 195.5 121.2 61.9 148.3 ...
##  $ total_eve_calls              : int  99 103 110 88 122 101 108 94 80 111 ...
##  $ total_eve_charge             : num  16.78 16.62 10.3 5.26 12.61 ...
##  $ total_night_minutes          : num  245 254 163 197 187 ...
##  $ total_night_calls            : int  91 103 104 89 121 118 118 96 90 97 ...
##  $ total_night_charge           : num  11.01 11.45 7.32 8.86 8.41 ...
##  $ total_intl_minutes           : num  10 13.7 12.2 6.6 10.1 6.3 7.5 7.1 8.7 11.2 ...
##  $ total_intl_calls             : int  3 3 5 7 3 6 7 6 4 5 ...
##  $ total_intl_charge            : num  2.7 3.7 3.29 1.78 2.73 1.7 2.03 1.92 2.35 3.02 ...
##  $ number_customer_service_calls: int  1 1 0 2 3 0 3 0 1 0 ...
##  $ churn                        : Factor w/ 2 levels "yes","no": 2 2 2 2 2 2 2 2 2 2 ...
```

```r
# correlation between the continuous variables
numeric_vars <- sapply(churnTrain, is.numeric)
corrs <- cor(churnTrain[numeric_vars])
corrplot(corrs, method = "number", tl.cex = 0.75)
```

![](churn_data_181_files/figure-html/unnamed-chunk-1-1.png)

Finding perfect collinearity between the four pairs of total charge and
total minutes variables is of course no surprise: phone charges are set per time
period! However, it is somewhat surprising that there is no linear relationship
between any other pair of variables. A scatterplot matrix of a subset of
the continuous variables reveals the extent of any pairwise relationships and
shows that certain variables contain numerous zeros.

![](churn_data_181_files/figure-html/unnamed-chunk-2-1.png)

(b) Assessing the importance of categorical predictors
-------------------------------------------------------------------------------
Odds ratios, Fisher's exact test and chi-square tests provide methods to examine
the extent of association between factor levels and the response categories.
Fisher's exact test is considered more reliable than the chi-square test, but it
is more computationally intensive.
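Before wrapping these tests in a function, the 2x2 case for a single predictor
shows what is being computed (`international_plan` is used purely for
illustration):

```r
# Association between one two-level factor and churn; for a 2x2 table
# fisher.test() also returns an odds ratio with a confidence interval
tab <- table(churnTrain$international_plan, churnTrain$churn)
tab
chisq.test(tab)
fisher.test(tab)
```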
```r
# A function to calculate chi-square, odds ratios and Fisher's exact test
association_tests <- function(x, y) {  # x is the predictors, y is the response
  
  x <- x[ ,sapply(x, is.factor)]
  n <- length(x)
  names <- colnames(x)
  
  out <- data.frame(
    chisq = rep(NA, n),
    chi.p.value = rep(NA, n),
    odds.ratio = rep(NA, n),
    fisher.p.value = rep(NA, n))
  
  for (i in 1:n) {
    row.names(out)[i] <- names[i]
    if (nlevels(x[ ,i]) > 7) {  # simulate p-values for many-level factors
      fish_res <- fisher.test(x = x[ ,i], y = y, simulate.p.value = TRUE)
      out$fisher.p.value[i] <- fish_res$p.value
    } else {
      fish_res <- fisher.test(x = x[ ,i], y = y)
      out$fisher.p.value[i] <- fish_res$p.value
      if (nlevels(x[ ,i]) <= 2) out$odds.ratio[i] <- fish_res$estimate
    }
    
    chi_res <- chisq.test(x = x[ ,i], y = y, simulate.p.value = TRUE)  # chi-square test
    out$chisq[i] <- chi_res$statistic
    out$chi.p.value[i] <- chi_res$p.value
  }
  out
}

res <- association_tests(x = churnTrain[ ,1:19], y = churnTrain$churn)
```

The results suggest international plan to be an important variable, while area code
shows little value as a predictor. The extremely low chi-square statistic and
high p-value suggest the data may have been purposely balanced by area code. Voice
mail plan and state seem to have value, though without the same strength of
association as international plan.

```r
ggplot(data = res, aes(x = chisq, y = -log(chi.p.value))) +
  geom_point(size = 3) +
  annotate("text", x = res$chisq + 10,
           y = -log(res$chi.p.value) - .3, label = c(row.names(res))) +
  labs(title = "Chi square vs. -log(p.values)") +
  xlim(NA, 275) + ylim(NA, 9)
```

![](churn_data_181_files/figure-html/unnamed-chunk-4-1.png)

Receiver operating characteristic (ROC) curves offer a method to examine the extent
to which a predictor variable distinguishes between the two levels of a response
factor, e.g. to what extent does the "account_length" variable allow us to
distinguish between customers likely to churn and those who are not. The area
under the ROC curve (AUC) quantifies the ability of a predictor variable to
separate the classes.
The AUC leads to different conclusions from the association tests, with
international plan now considered the least important variable. Area code is now
ranked the second most important variable.
```r
# Calculate the area under the ROC curve
factor_pred <- churnTrain[ ,sapply(churnTrain, is.factor)]  # subset the factors
factor_pred$churn <- NULL
auc_factors <- filterVarImp(y = churnTrain$churn, x = factor_pred)
# variables ranked by AUC
auc_factors[order(auc_factors$yes, decreasing = TRUE), ]
```

```
##                          yes        no
## voice_mail_plan    0.5649036 0.5649036
## area_code          0.4975435 0.4975435
## state              0.4947913 0.4947913
## international_plan 0.3908096 0.3908096
```

(c) Assessing the importance of continuous predictors
-------------------------------------------------------------------------------

```r
# create a subset of the continuous predictors
cont_pred <- churnTrain[ ,sapply(churnTrain, is.numeric)]
```

Where the response is a category with two outcomes, t-tests can be used to assess
the difference in the distributions of the continuous predictors across the
response categories. As a signal-to-noise ratio, the t-statistic quantifies the
separation in the distributions, with the associated p-value indicating how likely
such separation would be under an assumption of no difference.

```r
get_tstats <- function(x, y) {
  test <- t.test(x ~ y)  # Welch's t-test
  out <- c(t_stat = test$statistic, p = test$p.value)
  out
}

t_values <- apply(cont_pred, MARGIN = 2, FUN = get_tstats, y = churnTrain$churn)
t_values <- data.frame(t(t_values))  # transpose
round(t_values[order(t_values$p), ], 6)
```

```
##                                 t_stat.t        p
## total_day_minutes               9.684563 0.000000
## total_day_charge                9.684476 0.000000
## number_customer_service_calls   8.955141 0.000000
## number_vmail_messages          -5.821254 0.000000
## total_eve_minutes               5.272354 0.000000
## total_eve_charge                5.271986 0.000000
## total_intl_charge               3.939933 0.000090
## total_intl_minutes              3.938851 0.000091
## total_intl_calls               -2.960420 0.003186
## total_night_charge              2.171007 0.030272
## total_night_minutes             2.170889 0.030280
## total_day_calls                 1.002387 0.316543
## account_length                  0.961889 0.336458
## total_eve_calls                 0.537389 0.591180
## total_night_calls               0.348818 0.727339
```

The AUC and the t-tests for the continuous predictors agree to a large extent.
They share the same top 3 predictors, and the rankings show only slight
re-shuffling otherwise.
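As a sanity check, the AUC for a single continuous predictor can be computed
directly with pROC; the value should be close to the `filterVarImp` figure for
`total_day_minutes` in the table below (pROC's automatic direction choice may
flip values that fall below 0.5):

```r
# One-predictor cross-check of the area under the ROC curve
library(pROC)
auc(roc(churnTrain$churn, churnTrain$total_day_minutes))  # expect ~0.64
```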
```r
# Calculate the area under the ROC curve
auc_numeric <- filterVarImp(y = churnTrain$churn, x = cont_pred)
# continuous variables ranked by AUC
auc_numeric[order(auc_numeric$yes, decreasing = TRUE), ]
```

```
##                                     yes        no
## total_day_minutes             0.6399666 0.6399666
## total_day_charge              0.6399666 0.6399666
## number_customer_service_calls 0.6082071 0.6082071
## total_eve_minutes             0.5726508 0.5726508
## total_eve_charge              0.5726417 0.5726417
## number_vmail_messages         0.5616465 0.5616465
## total_intl_calls              0.5606302 0.5606302
## total_intl_minutes            0.5498979 0.5498979
## total_intl_charge             0.5498979 0.5498979
## total_night_charge            0.5281719 0.5281719
## total_night_minutes           0.5281632 0.5281632
## total_day_calls               0.5215742 0.5215742
## account_length                0.5127787 0.5127787
## total_eve_calls               0.5070339 0.5070339
## total_night_calls             0.4961509 0.4961509
```


(d) Use ReliefF to jointly assess the importance of predictors
-------------------------------------------------------------------------------
The Relief algorithm is another method to measure the importance of predictors
for a two-class response problem (although it can deal with other situations
as well). It begins by randomly selecting a set of observations, R, of size m.
The algorithm then evaluates each predictor in isolation by looping through each
point in the random set and, for each point, (1) finding the nearest observation
that is a hit (i.e. shares the same response class) and the nearest that is a
miss (i.e. does not share the same response class) and (2) updating the score
for that predictor: S = S - diff(R, Hit)^2/m + diff(R, Miss)^2/m. A minimal
sketch of this update appears after the output below.

```r
relief_values <- attrEval(churn ~ ., data = churnTrain,
                          estimator = "ReliefFequalK",  # calculation method
                          ReliefIterations = 50)        # number of iterations
relief_values[order(relief_values, decreasing = TRUE)]
```

```
##             total_day_minutes              total_day_charge 
##                    0.23617332                    0.23612810 
## number_customer_service_calls            international_plan 
##                    0.12600000                    0.09800000 
##               voice_mail_plan             total_night_calls 
##                    0.08800000                    0.08708920 
##               total_day_calls           total_night_minutes 
##                    0.06864646                    0.06307872 
##            total_night_charge             total_eve_minutes 
##                    0.06298665                    0.05228962 
##              total_eve_charge         number_vmail_messages 
##                    0.05226356                    0.04826144 
##              total_intl_calls            total_intl_minutes 
##                    0.04333333                    0.04216667 
##             total_intl_charge                     area_code 
##                    0.04213580                    0.03600000 
##                account_length               total_eve_calls 
##                    0.02119559                    0.01062745 
##                         state 
##                    0.00000000
```

An addition to the Relief algorithm is to permute the response observations so as
to gain an understanding of a predictor's score when it has no real relevance.
Repeating this many times gives an approximately normal distribution of scores
against which the true Relief score can be compared in terms of standard
deviations. This indicates how much greater the Relief score is than what could
be expected by chance alone.
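To make the update rule above concrete, here is a minimal sketch of the scoring
loop for one numeric predictor and a two-class response; this illustrates the
formula only, and is not the CORElearn implementation (which handles k nearest
hits/misses, ties and mixed predictor types):

```r
# Illustrative single-predictor Relief score (two-class y, numeric x)
relief_one <- function(x, y, m = 50) {
  x <- (x - min(x)) / (max(x) - min(x))   # scale so differences lie in [0, 1]
  s <- 0
  for (i in sample(seq_along(x), m)) {    # the random set R of size m
    d <- abs(x - x[i])
    d[i] <- Inf                           # exclude the point itself
    hit  <- which.min(ifelse(y == y[i], d, Inf))  # nearest same-class point
    miss <- which.min(ifelse(y != y[i], d, Inf))  # nearest other-class point
    s <- s - (x[i] - x[hit])^2 / m + (x[i] - x[miss])^2 / m
  }
  s
}

relief_one(churnTrain$total_day_minutes, churnTrain$churn)
```

The permutation-test version of this idea is computed next with `permuteRelief`.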
```r
relief_perm <- permuteRelief(x = churnTrain[ ,-20], y = churnTrain$churn, nperm = 500,
                             estimator = "ReliefFequalK",
                             ReliefIterations = 50)
```

The results suggest that total day charge (and therefore total day minutes -
one is a multiple of the other) and number of customer service calls are highly
important variables. International plan is also quite far from its permutation
distribution mean in terms of standard deviations, putting this method more in
agreement with the chi-square/Fisher tests than with the AUC. However, its Relief
score is still quite low. This may be a result of a heavy bias towards "no"
international plan and "no" churn but ambiguity otherwise (poor sensitivity ->
low AUC?). Several predictors appear to be without value, including state, area
code, total intl minutes (and charges), and total night minutes (and charges).

```r
# Histograms of the permuted Relief scores
relief_values <- data.frame(Predictor = names(relief_values),
                            value = relief_values,
                            row.names = NULL)
ggplot(data = relief_perm$permutations, aes(x = value)) +
  geom_histogram(binwidth = .01, colour = 1) +
  geom_vline(aes(xintercept = value), relief_values, colour = "red", linetype = 2) +
  facet_wrap(~ Predictor) +
  labs(title = "Relief Scores and Permutation Distributions", x = "Relief Scores")
```

![](churn_data_181_files/figure-html/unnamed-chunk-11-1.png)

```r
# Standard deviations of the observed scores from the permutation distributions
relief_perm$standardized[order(relief_perm$standardized)]
```

```
##                     area_code               total_eve_calls 
##                   -2.17837888                   -1.53075421 
##                account_length             total_night_calls 
##                   -0.53966612                   -0.28130565 
##           total_night_minutes            total_night_charge 
##                   -0.16435951                   -0.15880347 
##              total_intl_calls               total_day_calls 
##                    0.00910222                    0.76098712 
##              total_eve_charge             total_eve_minutes 
##                    0.90009256                    0.90072488 
##                         state            total_intl_minutes 
##                    1.16465807                    1.42100050 
##             total_intl_charge         number_vmail_messages 
##                    1.43765463                    2.44992637 
##            international_plan              total_day_charge 
##                    3.26771361                    3.43767734 
##             total_day_minutes               voice_mail_plan 
##                    3.43768610                    3.78254810 
## number_customer_service_calls 
##                    4.49641650
```



--------------------------------------------------------------------------------
/linear-regression-and-cousins/permeability_data_62.md:
--------------------------------------------------------------------------------
# Ex6.2 - Linear regression: molecule permeability
Oisin Fitzgerald
27 January 2016

### The data

This pharmaceutical data set was used to develop a model for predicting compounds'
permeability, a measure of a molecule's ability to cross a membrane. Permeability
affects a potential drug's usefulness, i.e. it needs to be able to cross
certain membranes to be effective. There exist assays to measure a compound's
permeability. The goal here is to develop a predictive model for permeability
in an attempt to potentially reduce the need for the assay.
(brief description, more on ?permeability)

### Outline

* Training/test split
* Pre-process the data
    + Skewness of response
    + Sparseness
* Fit and test PLS models
* Fit and test ridge regression, LASSO and elastic net models


```r
# Load data and packages
library(AppliedPredictiveModeling)
data(permeability)

suppressMessages(library(caret))
suppressMessages(library(pls))
suppressMessages(library(elasticnet))
suppressMessages(library(lars))
```

### Data splitting and pre-processing


```r
fingerprints <- data.frame(fingerprints)
permeability <- as.vector(permeability)
# Create training/test split index
Split <- createDataPartition(permeability, times = 1, p = 0.75)
Split <- Split$Resample1
# Create training and test splits
training <- fingerprints[Split, ]
test <- fingerprints[-Split, ]
response_train <- permeability[Split]
response_test <- permeability[-Split]

# Pre-processing
training_filter <- training[ ,-nearZeroVar(training, freqCut = 95/5)]  # near zero variances
# binary data...

# positively skewed
ggplot() + geom_histogram(aes(x = permeability), binwidth = 4, col = 1) +
  labs(title = "Histogram of permeability", x = "Molecule permeability") +
  theme_bw()
```

![](permeability_data_62_files/figure-html/unnamed-chunk-2-1.png)\

```r
preProcess(data.frame(permeability), method = c("YeoJohnson"))
```

```
## Created from 165 samples and 1 variables
## 
## Pre-processing:
##   - ignored (0)
##   - Yeo-Johnson transformation (1)
## 
## Lambda estimates for Yeo-Johnson transformation:
## -0.19
```

### Fit and test partial least squares models

Three variations on a PLS model were fit and tested:

1. A model fit to the full training set
2. A model fit to the training set with near-zero-variance predictors removed
3. A model fit to a log transformed response

None of the models created appear accurate enough to replace the assay. The
log(response) PLS model (and others) appears to show promise within a certain
range. Possibly the linear technique is too restrictive. Some models produce
negative predictions - multicollinearity?


```r
# Fit PLS models
ctrl = trainControl("repeatedcv", number = 5, repeats = 5)
pls_model.1 <- train(y = response_train,
                     x = training,
                     method = "pls",
                     metric = "Rsquared",
                     tuneLength = 10,
                     trControl = ctrl)
pls_model.1
```

```
## Partial Least Squares 
## 
## 125 samples
## 1107 predictors
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 5 times) 
## Summary of sample sizes: 101, 100, 100, 100, 99, 99, ... 
## Resampling results across tuning parameters:
## 
##   ncomp  RMSE      Rsquared   RMSE SD   Rsquared SD
##    1     14.12589  0.2801179  2.015635  0.1594071  
##    2     12.41503  0.4325951  2.089575  0.1719383  
##    3     11.86396  0.4798535  2.175933  0.1709627  
##    4     11.95253  0.4829617  2.599663  0.1876928  
##    5     11.90959  0.4916929  2.613462  0.1848358  
##    6     12.00479  0.4855347  2.450659  0.1790584  
##    7     11.99263  0.4870670  2.462163  0.1834921  
##    8     12.24073  0.4756755  2.435113  0.1808482  
##    9     12.62208  0.4533456  2.255274  0.1705803  
##   10     12.89147  0.4352161  2.126619  0.1643402  
## 
## Rsquared was used to select the optimal model using the largest value.
## The final value used for the model was ncomp = 5.
```

```r
pls_model.2 <- train(y = response_train,
                     x = training_filter,
                     method = "pls",
                     metric = "Rsquared",
                     tuneLength = 10,
                     trControl = ctrl)
pls_model.2
```

```
## Partial Least Squares 
## 
## 125 samples
## 370 predictors
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 5 times) 
## Summary of sample sizes: 101, 101, 100, 98, 100, 101, ... 
## Resampling results across tuning parameters:
## 
##   ncomp  RMSE      Rsquared   RMSE SD   Rsquared SD
##    1     14.13641  0.2874455  1.994562  0.16353002 
##    2     12.45425  0.4515863  2.294324  0.18511199 
##    3     12.02477  0.4866895  1.819542  0.15405348 
##    4     11.90942  0.5012018  1.518869  0.12878618 
##    5     11.49920  0.5388000  1.636276  0.12788898 
##    6     11.22734  0.5560475  1.573719  0.12308173 
##    7     10.98385  0.5743623  1.597932  0.11307569 
##    8     10.93646  0.5835801  1.423035  0.10149986 
##    9     11.27032  0.5639006  1.400459  0.09743804 
##   10     11.71468  0.5353356  1.389022  0.10426772 
## 
## Rsquared was used to select the optimal model using the largest value.
## The final value used for the model was ncomp = 8.
```

```r
pls_model.3 <- train(y = log(response_train),
                     x = training,
                     method = "pls",
                     metric = "Rsquared",
                     tuneLength = 10,
                     trControl = ctrl)
pls_model.3
```

```
## Partial Least Squares 
## 
## 125 samples
## 1107 predictors
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 5 times) 
## Summary of sample sizes: 100, 100, 99, 101, 100, 100, ... 
## Resampling results across tuning parameters:
## 
##   ncomp  RMSE      Rsquared   RMSE SD    Rsquared SD
##    1     1.394441  0.2209422  0.2093430  0.15466007 
##    2     1.263407  0.3538704  0.1414747  0.10271977 
##    3     1.184729  0.4310581  0.1491980  0.11057008 
##    4     1.146549  0.4644069  0.1344120  0.10952157 
##    5     1.129306  0.4825250  0.1235525  0.10382922 
##    6     1.105925  0.5051179  0.1484921  0.11964499 
##    7     1.088086  0.5202448  0.1387113  0.10900261 
##    8     1.058015  0.5470990  0.1483578  0.10895902 
##    9     1.049602  0.5577494  0.1392980  0.09936161 
##   10     1.062223  0.5519139  0.1482355  0.10572280 
## 
## Rsquared was used to select the optimal model using the largest value.
## The final value used for the model was ncomp = 9.
```

```r
# post hoc: remove multicollinearity and refit
remove <- findLinearCombos(training_filter)
training_filter2 <- training_filter[ ,-remove$remove]  # drop the flagged columns

pls_model.4 <- train(y = response_train,
                     x = training_filter2,
                     method = "pls",
                     metric = "Rsquared",
                     tuneLength = 10,
                     trControl = ctrl)
pls_model.4
```

```
## Partial Least Squares 
## 
## 125 samples
## 277 predictors
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 5 times) 
## Summary of sample sizes: 99, 100, 100, 100, 101, 100, ... 
## Resampling results across tuning parameters:
## 
##   ncomp  RMSE      Rsquared   RMSE SD   Rsquared SD
##    1     14.54842  0.2293263  1.877668  0.1760119  
##    2     12.80386  0.3991476  2.272226  0.1871358  
##    3     12.70781  0.4169945  1.743083  0.1609883  
##    4     12.63029  0.4255008  1.957180  0.1622419  
##    5     12.70340  0.4287808  2.043817  0.1557772  
##    6     12.59538  0.4455729  2.083568  0.1491574  
##    7     12.70171  0.4411228  2.268840  0.1523978  
##    8     12.85886  0.4320981  2.066638  0.1363235  
##    9     12.98160  0.4296388  2.023933  0.1315781  
##   10     13.29837  0.4131454  2.074848  0.1305968  
## 
## Rsquared was used to select the optimal model using the largest value.
## The final value used for the model was ncomp = 6.
```

```r
# Predict on test data
pls1_preds <- predict(pls_model.1, test)
RMSE(pls1_preds, response_test)
```

```
## [1] 14.74978
```

```r
cor(pls1_preds, response_test)^2  # Rsquared
```

```
## [1] 0.3746941
```

```r
ggplot() +
  geom_point(aes(x = pls1_preds, y = response_test)) +
  theme_bw() +
  labs(title = "PLS model 1 predictions vs. observed",
       x = "predicted permeability",
       y = "observed permeability")
```

![](permeability_data_62_files/figure-html/unnamed-chunk-3-1.png)\

```r
pls2_preds <- predict(pls_model.2, test)
RMSE(pls2_preds, response_test)
```

```
## [1] 13.8051
```

```r
cor(pls2_preds, response_test)^2  # Rsquared
```

```
## [1] 0.4424359
```

```r
ggplot() +
  geom_point(aes(x = pls2_preds, y = response_test)) +
  theme_bw() +
  labs(title = "PLS model 2 predictions vs. observed",
       x = "predicted permeability",
       y = "observed permeability")
```

![](permeability_data_62_files/figure-html/unnamed-chunk-3-2.png)\

```r
pls3_preds <- predict(pls_model.3, test)
RMSE(exp(pls3_preds), response_test)
```

```
## [1] 13.75083
```

```r
cor(exp(pls3_preds), response_test)^2  # Rsquared
```

```
## [1] 0.3905663
```

```r
ggplot() +
  geom_point(aes(x = pls3_preds, y = response_test)) +
  theme_bw() +
  labs(title = "PLS model 3 predictions vs. observed",
       x = "predicted permeability",
       y = "observed permeability")
```

![](permeability_data_62_files/figure-html/unnamed-chunk-3-3.png)\

```r
# can it predict well within a certain range?
RMSE(exp(pls3_preds[response_test < 20]), response_test[response_test < 20])
```

```
## [1] 12.19276
```

```r
pls4_preds <- predict(pls_model.4, test)
RMSE(pls4_preds, response_test)
```

```
## [1] 14.45052
```

```r
cor(pls4_preds, response_test)^2  # Rsquared
```

```
## [1] 0.3826303
```

```r
ggplot() +
  geom_point(aes(x = pls4_preds, y = response_test)) +
  theme_bw() +
  labs(title = "PLS model 4 predictions vs. observed",
       x = "predicted permeability",
       y = "observed permeability")
```

![](permeability_data_62_files/figure-html/unnamed-chunk-3-4.png)\

#### Fit and test ridge regression, LASSO and elastic net models

As with the PLS models, none of the models were strong predictors of the data,
suggesting that laboratory methods of measuring permeability are preferable (at
least) to the models fitted. The PLS models actually outperformed the shrinkage
methods. Further, the cross-validated estimates of RMSE and R2 were quite
inaccurate compared to the test fits. There appears to be residual instability in
the coefficients even with these shrinkage methods, with enet and ridge producing
hugely negative predictions and inflated RMSE until tuned over a predefined range
of shrinkage parameters.


```r
# Fit shrinkage models
ctrl <- trainControl("cv", number = 5)

ridge_grid <- expand.grid(.lambda = seq(0.05, 0.2, 0.01))
ridge_model <- train(y = response_train,
                     x = training_filter2,  # model fitting impacted by zero variances
                     method = "ridge",
                     tuneGrid = ridge_grid,
                     metric = "RMSE",
                     trControl = ctrl)
ridge_model
```

```
## Ridge Regression 
## 
## 125 samples
## 277 predictors
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 100, 100, 100, 99, 101 
## Resampling results across tuning parameters:
## 
##   lambda  RMSE      Rsquared   RMSE SD   Rsquared SD
##   0.05    14.26429  0.3469361  3.240907  0.1980152  
##   0.06    14.13896  0.3559712  3.218352  0.1956677  
##   0.07    14.04347  0.3635573  3.209867  0.1942977  
##   0.08    13.96978  0.3702479  3.205188  0.1934118  
##   0.09    13.90489  0.3761472  3.201440  0.1928339  
##   0.10    13.84637  0.3815359  3.198597  0.1925102  
##   0.11    13.81797  0.3856901  3.223695  0.1930815  
##   0.12    13.78728  0.3898212  3.235720  0.1934281  
##   0.13    13.76418  0.3935468  3.250413  0.1939387  
##   0.14    13.70465  0.3988747  3.194384  0.1925732  
##   0.15    13.72007  0.4007911  3.257135  0.1944723  
##   0.16    13.73433  0.4025027  3.312094  0.1964504  
##   0.17    13.65160  0.4088949  3.194072  0.1933448  
##   0.18    13.67577  0.4103241  3.253685  0.1952229  
##   0.19    13.63409  0.4146022  3.194897  0.1940248  
##   0.20    13.65158  0.4162339  3.233357  0.1954084  
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was lambda = 0.19.
```

```r
# remaining instability in the coefficients?
plot(ridge_model$finalModel)
title(main = "Ridge regression coefficient stability")
```

![](permeability_data_62_files/figure-html/unnamed-chunk-4-1.png)\

```r
lasso_model <- train(y = response_train,
                     x = training_filter2,
                     method = "lasso",
                     tuneLength = 10,
                     metric = "RMSE",
                     trControl = ctrl)
lasso_model
```

```
## The lasso 
## 
## 125 samples
## 277 predictors
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 98, 101, 101, 99, 101 
## Resampling results across tuning parameters:
## 
##   fraction   RMSE        Rsquared   RMSE SD    Rsquared SD
##   0.1000000    9110.973  0.3242448   20342.54  0.2305155  
##   0.1888889   17103.552  0.3248907   38214.02  0.2576174  
##   0.2777778   33554.929  0.3315620   75000.16  0.2734539  
##   0.3666667   51774.847  0.3286516  115740.61  0.2690170  
##   0.4555556   70602.749  0.3143725  157840.22  0.2537059  
##   0.5444444   90321.095  0.3037186  201931.22  0.2403576  
##   0.6333333  110101.793  0.2976369  246161.76  0.2304663  
##   0.7222222  129916.309  0.2949451  290468.13  0.2229269  
##   0.8111111  149318.484  0.2870511  333852.23  0.2160301  
##   0.9000000  168562.774  0.2763813  376883.05  0.2062604  
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was fraction = 0.1.
```

```r
enet_grid <- expand.grid(.lambda = c(0.01, 0.02, 0.03, 0.05),
                         .fraction = c(seq(0.00001, 0.2, 0.02)))
enet_model <- train(y = response_train,
                    x = training_filter,
                    method = "enet",
                    tuneGrid = enet_grid,
                    metric = "RMSE",
                    trControl = ctrl)
enet_model
```

```
## Elasticnet 
## 
## 125 samples
## 370 predictors
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 101, 100, 100, 101, 98 
## Resampling results across tuning parameters:
## 
##   lambda  fraction  RMSE      Rsquared   RMSE SD    Rsquared SD
##   0.01    0.00001   16.20387  0.4104844  1.0182184  0.18272955 
##   0.01    0.02001   13.87268  0.4150714  1.0692719  0.16420088 
##   0.01    0.04001   12.78141  0.4175494  1.1372225  0.10885746 
##   0.01    0.06001   12.54578  0.4085081  1.1839283  0.09903446 
##   0.01    0.08001   12.33540  0.4236548  0.9795106  0.08453264 
##   0.01    0.10001   12.21048  0.4354692  0.8230406  0.07702949 
##   0.01    0.12001   12.11723  0.4458469  0.8213061  0.07598583 
##   0.01    0.14001   12.09290  0.4501368  0.8129704  0.07798469 
##   0.01    0.16001   12.05235  0.4565517  0.8165102  0.08163804 
##   0.01    0.18001   12.01800  0.4625189  0.8127518  0.08684807 
##   0.02    0.00001   16.20410  0.4104844  1.0182928  0.18272955 
##   0.02    0.02001   14.09418  0.4143868  1.0478786  0.16432456 
##   0.02    0.04001   12.89163  0.4246374  1.1253347  0.11811265 
##   0.02    0.06001   12.59891  0.4094209  1.1983122  0.10544503 
##   0.02    0.08001   12.44036  0.4151066  1.1398766  0.09682953 
##   0.02    0.10001   12.31475  0.4257743  0.9451951  0.08501402 
##   0.02    0.12001   12.19302  0.4375956  0.8355392  0.07922614 
##   0.02    0.14001   12.11181  0.4463517  0.8308710  0.07818994 
##   0.02    0.16001   12.08293  0.4507623  0.8141316  0.07903433 
##   0.02    0.18001   12.04035  0.4570083  0.8130894  0.08193302 
##   0.03    0.00001   16.20421  0.4104844  1.0182682  0.18272955 
##   0.03    0.02001   14.21115  0.4124214  1.0029421  0.16402693 
##   0.03    0.04001   12.99712  0.4234639  1.1658812  0.13583843 
##   0.03    0.06001   12.59611  0.4141328  1.1642766  0.10568952 
##   0.03    0.08001   12.50335  0.4100662  1.2201652  0.10547035 
##   0.03    0.10001   12.36856  0.4208988  1.1034849  0.09941567 
##   0.03    0.12001   12.25816  0.4310796  0.9459415  0.09120199 
##   0.03    0.14001   12.15817  0.4413571  0.8579290  0.08405673 
##   0.03    0.16001   12.07518  0.4499026  0.8397519  0.08414664 
##   0.03    0.18001   12.03730  0.4549964  0.8274005  0.08638877 
##   0.05    0.00001   16.20440  0.4104844  1.0183573  0.18272955 
##   0.05    0.02001   14.41209  0.4114486  1.0296163  0.16568131 
##   0.05    0.04001   13.23826  0.4232309  1.2315575  0.15481758 
##   0.05    0.06001   12.68330  0.4206394  1.1718478  0.11273411 
##   0.05    0.08001   12.52746  0.4113844  1.1777974  0.10429839 
##   0.05    0.10001   12.46332  0.4135083  1.2256036  0.10605972 
##   0.05    0.12001   12.39655  0.4192155  1.1207993  0.09901656 
##   0.05    0.14001   12.32335  0.4264486  0.9972745  0.09118388 
##   0.05    0.16001   12.22329  0.4367257  0.9115938  0.08424315 
##   0.05    0.18001   12.13832  0.4456274  0.8794157  0.08362259 
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were fraction = 0.18001 and lambda
##  = 0.01.
```

```r
# Test shrinkage models
ridge_preds <- predict(ridge_model, test)
RMSE(ridge_preds, response_test)
```

```
## [1] 41.89106
```

```r
cor(ridge_preds, response_test)^2
```

```
## [1] 0.1116555
```

```r
lasso_preds <- predict(lasso_model, test)
RMSE(lasso_preds, response_test)
```

```
## [1] 2.368724e+15
```

```r
cor(lasso_preds, response_test)^2
```

```
## [1] 0.1439556
```

```r
enet_preds <- predict(enet_model, test)
RMSE(enet_preds, response_test)
```

```
## [1] 30.95827
```

```r
cor(enet_preds, response_test)^2
```

```
## [1] 0.2149163
```



--------------------------------------------------------------------------------
/linear-regression-and-cousins/IR_data_61.md:
--------------------------------------------------------------------------------
# Ex6.1 - Linear regression: IR spectrum of food
Oisin Fitzgerald
26 January 2016

### The data:
The data provide an infrared (IR) profile and analytical chemistry determined
percent content of water, fat, and protein for meat samples. If a predictive
relationship between the IR spectrum and fat content can be established, then
food scientists could predict a sample's fat content with IR instead of using
analytical chemistry.

### Outline:
1. What is the relationship between the predictors? Are they highly correlated
given the same food sample is measured at many IR wavelengths?
2. Create training/test split
3. Fit different models
    + Linear regression
    + Ridge regression, lasso and elastic net
    + PCR and PLS
4. Compare models' predictive ability


```r
# load data and packages
library(car)
library(lars)
```

```
## Loaded lars 1.2
```

```r
library(broom)
library(reshape2)
suppressMessages(library(elasticnet))
suppressMessages(library(pls))
suppressMessages(library(caret))

data(tecator)  # from caret
```

### 1. Relationship between predictors and distributions


```r
# correlation
XX <- cor(absorp)
XX[1:5, 1:5]  # everything is related to everything!
```

```
##           [,1]      [,2]      [,3]      [,4]      [,5]
## [1,] 1.0000000 0.9999908 0.9999649 0.9999243 0.9998715
## [2,] 0.9999908 1.0000000 0.9999916 0.9999678 0.9999309
## [3,] 0.9999649 0.9999916 1.0000000 0.9999923 0.9999707
## [4,] 0.9999243 0.9999678 0.9999923 1.0000000 0.9999930
## [5,] 0.9998715 0.9999309 0.9999707 0.9999930 1.0000000
```

```r
# PCA
pca_object <- prcomp(absorp)
percent_variance <- pca_object$sdev^2/sum(pca_object$sdev^2)*100
head(percent_variance)
```

```
## [1] 98.679162750  0.900926147  0.296292185  0.114005307  0.005754017
## [6]  0.002516023
```

```r
# Predictor distributions
ggplot(data = data.frame(absorp)) +
  geom_histogram(aes(x = X1), bins = 20, col = 1) +
  labs(title = "Histogram of IR wavelength no. 1",
       x = "Wavelength predictor 1")  # positive skew
```

![](IR_data_61_files/figure-html/unnamed-chunk-2-1.png)\

### 2. Create a training/test split

* 75% of the data to the training set
* The predictor variables show positive skew, which the Yeo-Johnson
transformation (estimated lambda of -1, i.e. a reciprocal) corrected


```r
length(endpoints[ ,1])  # how many observations?
```

```
## [1] 215
```

```r
# create partition index
data_split <- createDataPartition(endpoints[ ,1], p = .75)
data_split <- data_split$Resample1

# split data
training <- absorp[data_split, ]
test <- absorp[-data_split, ]
train_resp <- endpoints[data_split, 2]  # column 2 is fat content
test_resp <- endpoints[-data_split, 2]

# de-skew variables
training <- data.frame(training)
test <- data.frame(test)
proc_object <- preProcess(training,
                          method = c("YeoJohnson", "center", "scale"))
training <- predict(proc_object, training)
test <- predict(proc_object, test)
```

### 3. Model fitting
* Linear regression
    + Unsurprisingly, prior removal of highly correlated predictors resulted in
    a model with very few independent variables. The performance on
    cross-validation was poor.
* Ridge regression
    + The ridge model quickly highlighted the ability to improve on the linear
    regression model. However, subsequent fitting of a lasso model showed that
    the ability to drive coefficients to zero was an advantage in this highly
    correlated predictor environment.
* The lasso and elastic net
    + As noted, the lasso model outperformed the ridge model. The optimal
    solution resulted in a large number of the coefficients being shrunk to zero
    + Enet performed similarly to the lasso, with the best performing model
    having a low lambda for the ridge penalty
* Principal components and partial least squares regression
    + These both performed quite well.
    The similarity of the PCR model to the PLS models is likely related to the
    variance in the predictors (the IR absorbances) very much being a consequence
    of the variance in the response (food fat content), so the unsupervised
    nature of PCA causes little detriment.
    + The number of principal components was tuned rather than simply using the
    first two, or the first few that explained 90% of the variance, etc.


```r
ctrl <- trainControl(method = "cv", number = 5)
# Linear regression
mc <- findCorrelation(cor(training), cutoff = 0.95)
training_linear <- data.frame(training[ ,-mc])
# colnames(training_linear) <- "X1"
linear_model <- train(y = train_resp,
                      x = training_linear,
                      method = "lm",
                      trControl = ctrl)
linear_model
```

```
## Linear Regression 
## 
## 163 samples
##  18 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 131, 131, 130, 129, 131 
## Resampling results
## 
##   RMSE      Rsquared   RMSE SD    Rsquared SD
##   2.463749  0.9598019  0.5468199  0.01877194 
```

```r
# Ridge regression - penalise the squared size of the coefficients
ridge_model <- train(y = train_resp,
                     x = training,
                     method = "ridge",
                     trControl = ctrl,
                     tuneLength = 10)
ridge_model
```

```
## Ridge Regression 
## 
## 163 samples
## 100 predictors
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 131, 131, 131, 130, 129 
## Resampling results across tuning parameters:
## 
##   lambda        RMSE      Rsquared   RMSE SD    Rsquared SD
##   0.0000000000  3.739994  0.9119283  0.6066684  0.028597803
##   0.0001000000  2.629873  0.9578282  0.2248336  0.006184386
##   0.0002371374  2.757391  0.9531612  0.2228996  0.007462386
##   0.0005623413  2.845905  0.9496715  0.2116318  0.008515350
##   0.0013335214  2.929242  0.9462729  0.2067412  0.010149144
##   0.0031622777  3.097153  0.9395144  0.2453322  0.014306433
##   0.0074989421  3.459726  0.9240639  0.3541803  0.023381991
##   0.0177827941  3.993045  0.8987086  0.4894325  0.036862571
##   0.0421696503  4.607857  0.8662201  0.6188467  0.053116332
##   0.1000000000  5.414257  0.8148385  0.7583973  0.074649376
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was lambda = 1e-04.
```

```r
plot(ridge_model)
```

![](IR_data_61_files/figure-html/unnamed-chunk-4-1.png)\

```r
# Lasso - penalise the absolute value of the coefficients
lasso_grid <- expand.grid(.fraction = seq(0.001, 0.1, 0.01))
lasso_model <- train(y = train_resp,
                     x = training,
                     method = "lasso",
                     trControl = ctrl,
                     tuneGrid = lasso_grid)
lasso_model
```

```
## The lasso 
## 
## 163 samples
## 100 predictors
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 131, 130, 130, 130, 131 
## Resampling results across tuning parameters:
## 
##   fraction  RMSE      Rsquared   RMSE SD    Rsquared SD
##   0.001     2.498027  0.9586242  0.5517407  0.016702083
##   0.011     1.706257  0.9809880  0.4545091  0.007295174
##   0.021     1.616745  0.9828973  0.4571834  0.007445864
##   0.031     1.641162  0.9822315  0.4325699  0.007161921
##   0.041     1.690999  0.9809893  0.4493865  0.008285186
##   0.051     1.745590  0.9795241  0.4873115  0.009721832
##   0.061     1.812704  0.9777559  0.5213582  0.011084715
##   0.071     1.880056  0.9759682  0.5502445  0.012306083
##   0.081     1.949916  0.9740845  0.5786471  0.013559222
##   0.091     2.013504  0.9724054  0.5906854  0.014314542
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was fraction = 0.021.
```

```r
plot(lasso_model)
```

![](IR_data_61_files/figure-html/unnamed-chunk-4-2.png)\

```r
# Elastic net - a combination of the ridge and lasso penalties
enet_grid <- expand.grid(.fraction = seq(0.001, 0.1, 0.01),
                         .lambda = c(0, 0.0001, 0.001, 0.01))
enet_model <- train(y = train_resp,
                    x = training,
                    method = "enet",
                    trControl = ctrl,
                    tuneGrid = enet_grid)
enet_model
```

```
## Elasticnet 
## 
## 163 samples
## 100 predictors
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 131, 130, 130, 131, 130 
## Resampling results across tuning parameters:
## 
##   lambda  fraction  RMSE       Rsquared   RMSE SD    Rsquared SD
##   0e+00   0.001      2.456576  0.9577158  0.2733749  0.011822745
##   0e+00   0.011      1.625075  0.9823436  0.3002864  0.004490752
##   0e+00   0.021      1.542594  0.9840462  0.3650790  0.005845262
##   0e+00   0.031      1.587296  0.9828909  0.4460547  0.008348075
##   0e+00   0.041      1.597265  0.9825894  0.4840485  0.009325435
##   0e+00   0.051      1.634045  0.9816044  0.5281381  0.010634739
##   0e+00   0.061      1.700641  0.9799644  0.5649473  0.011780352
##   0e+00   0.071      1.767640  0.9783395  0.5926034  0.012665845
##   0e+00   0.081      1.824923  0.9770237  0.5923753  0.012675485
##   0e+00   0.091      1.883738  0.9755821  0.5988473  0.013059291
##   1e-04   0.001     11.596061  0.3239978  1.1803964  0.096503222
##   1e-04   0.011      9.927933  0.3579037  1.3335128  0.112225044
##   1e-04   0.021      9.356587  0.4486944  1.2675170  0.129496136
##   1e-04   0.031      8.798182  0.5326704  1.2055201  0.139155795
##   1e-04   0.041      8.252979  0.6063615  1.1468485  0.140999618
##   1e-04   0.051      7.725916  0.6679715  1.0930366  0.136610017
##   1e-04   0.061      7.216946  0.7182172  1.0439252  0.128483466
##   1e-04   0.071      6.731303  0.7581199  1.0003492  0.118015584
##   1e-04   0.081      6.276621  0.7886260  0.9627171  0.106885189
##   1e-04   0.091      5.856909  0.8118695  0.9291989  0.096011467

```r
# Elastic net - a combination of the ridge and lasso penalties
enet_grid <- expand.grid(.fraction = seq(0.001, 0.1, 0.01),
                         .lambda = c(0, 0.0001, 0.001, 0.01))
enet_model <- train(y = train_resp,
                    x = training,
                    method = "enet",
                    trControl = ctrl,
                    tuneGrid = enet_grid)
enet_model
```

```
## Elasticnet 
## 
## 163 samples
## 100 predictors
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 131, 130, 130, 131, 130 
## Resampling results across tuning parameters: 
## 
##   lambda  fraction  RMSE       Rsquared   RMSE SD    Rsquared SD
##   0e+00   0.001      2.456576  0.9577158  0.2733749  0.011822745
##   0e+00   0.011      1.625075  0.9823436  0.3002864  0.004490752
##   0e+00   0.021      1.542594  0.9840462  0.3650790  0.005845262
##   0e+00   0.031      1.587296  0.9828909  0.4460547  0.008348075
##   0e+00   0.041      1.597265  0.9825894  0.4840485  0.009325435
##   0e+00   0.051      1.634045  0.9816044  0.5281381  0.010634739
##   0e+00   0.061      1.700641  0.9799644  0.5649473  0.011780352
##   0e+00   0.071      1.767640  0.9783395  0.5926034  0.012665845
##   0e+00   0.081      1.824923  0.9770237  0.5923753  0.012675485
##   0e+00   0.091      1.883738  0.9755821  0.5988473  0.013059291
##   1e-04   0.001     11.596061  0.3239978  1.1803964  0.096503222
##   1e-04   0.011      9.927933  0.3579037  1.3335128  0.112225044
##   1e-04   0.021      9.356587  0.4486944  1.2675170  0.129496136
##   1e-04   0.031      8.798182  0.5326704  1.2055201  0.139155795
##   1e-04   0.041      8.252979  0.6063615  1.1468485  0.140999618
##   1e-04   0.051      7.725916  0.6679715  1.0930366  0.136610017
##   1e-04   0.061      7.216946  0.7182172  1.0439252  0.128483466
##   1e-04   0.071      6.731303  0.7581199  1.0003492  0.118015584
##   1e-04   0.081      6.276621  0.7886260  0.9627171  0.106885189
##   1e-04   0.091      5.856909  0.8118695  0.9291989  0.096011467
##   1e-03   0.001     11.829583  0.3244982  1.1872087  0.095754537
##   1e-03   0.011     10.485965  0.3258426  1.2318222  0.096750294
##   1e-03   0.021     10.055309  0.3374139  1.3522166  0.105781522
##   1e-03   0.031      9.814328  0.3759592  1.3257997  0.113461485
##   1e-03   0.041      9.575285  0.4142052  1.2999088  0.120175687
##   1e-03   0.051      9.338072  0.4517076  1.2740945  0.125751839
##   1e-03   0.061      9.103097  0.4880241  1.2482033  0.130111816
##   1e-03   0.071      8.871446  0.5226429  1.2223026  0.133169835
##   1e-03   0.081      8.642478  0.5554473  1.1968951  0.134970623
##   1e-03   0.091      8.416020  0.5863351  1.1717988  0.135674594
##   1e-02   0.001     11.899480  0.3246449  1.1857761  0.095527559
##   1e-02   0.011     10.983045  0.3247104  1.1955365  0.096268084
##   1e-02   0.021     10.357981  0.3259192  1.2344795  0.097581761
##   1e-02   0.031     10.083140  0.3294455  1.3325124  0.101116372
##   1e-02   0.041      9.970072  0.3504728  1.3452981  0.107880681
##   1e-02   0.051      9.827450  0.3731767  1.3282255  0.112164220
##   1e-02   0.061      9.685545  0.3957958  1.3113342  0.116141469
##   1e-02   0.071      9.544374  0.4182280  1.2944268  0.119774964
##   1e-02   0.081      9.404137  0.4403542  1.2778098  0.123037552
##   1e-02   0.091      9.264591  0.4621313  1.2612852  0.125902998
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were fraction = 0.021 and lambda = 0.
```

```r
plot(enet_model)
```

![](IR_data_61_files/figure-html/unnamed-chunk-4-3.png)\

```r
# PCR - principal components regression, tuning the number of components
pcr_results <- list(results = data.frame(RMSE = NA, RMSE_sd = NA), final = NA)
for (i in 1:20) {
  # fit a linear model on the first i principal component scores
  train_data <- princomp(training)$scores[ ,1:i]
  train_data <- data.frame(train_data)
  pcr_model <- train(y = train_resp,
                     x = train_data,
                     method = "lm",
                     trControl = ctrl)
  
  # extract results
  pcr_results$results[i, 1] <- pcr_model$results$RMSE
  pcr_results$results[i, 2] <- pcr_model$results$RMSESD
  
  # keep the model with the lowest cross-validated RMSE seen so far
  if (all(pcr_model$results$RMSE <= pcr_results$results$RMSE)) {
    pcr_results$final <- pcr_model
  }
}
pcr_results
```

```
## $results
##         RMSE   RMSE_sd
## 1  10.492353 1.0992732
## 2  10.068703 1.4639494
## 3   7.348469 0.3565198
## 4   4.853879 0.8780493
## 5   2.985895 0.3555118
## 6   2.920385 0.8442764
## 7   2.974529 0.5519718
## 8   3.008025 0.3828759
## 9   2.762710 0.1345710
## 10  2.656554 0.5481411
## 11  2.443768 0.4993940
## 12  2.448595 0.3380642
## 13  2.313984 0.6340779
## 14  2.073892 0.1928407
## 15  1.955401 0.1913254
## 16  1.998280 0.4672237
## 17  1.985154 0.4377664
## 18  1.721687 0.3300310
## 19  1.755865 0.3875057
## 20  1.849941 0.3611754
## 
## $final
## Linear Regression 
## 
## 163 samples
##  18 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 130, 131, 130, 131, 130 
## Resampling results
## 
##   RMSE      Rsquared   RMSE SD   Rsquared SD
##   1.721687  0.9800546  0.330031  0.004523028
## 
## 
```
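
For comparison, the simpler heuristic rejected earlier — keeping the first components
that explain, say, 90% of the predictor variance — would look like the sketch below
(assuming the same `training` matrix). Here it would pick far fewer components than the
18 chosen by cross-validation, which is exactly why tuning was preferred:

```r
# Sketch: smallest number of principal components whose cumulative
# proportion of variance exceeds 90%.
pca <- princomp(training)
var_explained <- cumsum(pca$sdev^2) / sum(pca$sdev^2)
k <- which(var_explained >= 0.90)[1]
k
```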

```r
# PLS
pls_grid <- expand.grid(.ncomp = seq(10, 20, 1))
pls_model <- train(y = train_resp,
                   x = training,
                   method = "pls",
                   trControl = ctrl,
                   preProcess = c("center", "scale"),
                   tuneGrid = pls_grid)
pls_model
```

```
## Partial Least Squares 
## 
## 163 samples
## 100 predictors
## 
## Pre-processing: centered (100), scaled (100) 
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 130, 131, 131, 129, 131 
## Resampling results across tuning parameters: 
## 
##   ncomp  RMSE      Rsquared   RMSE SD    Rsquared SD
##   10     2.198022  0.9652696  0.4323613  0.013561087
##   11     2.077700  0.9686279  0.2833400  0.012102779
##   12     1.941817  0.9743164  0.2371756  0.005813342
##   13     1.756872  0.9799257  0.4054312  0.005654873
##   14     1.565419  0.9838826  0.4455804  0.005821119
##   15     1.519931  0.9849073  0.3392128  0.003558747
##   16     1.500339  0.9853390  0.3783697  0.003795604
##   17     1.541301  0.9843025  0.3130718  0.002964976
##   18     1.521648  0.9846310  0.2903265  0.002670155
##   19     1.539470  0.9845291  0.2975242  0.002202640
##   20     1.612049  0.9828719  0.3038365  0.002147520
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 16.
```

### 4. Compare performance on test set
* The test-set results followed the pattern seen in cross-validation. Linear regression
did poorly, and ridge regression worse still; both trailed the group of lasso, elastic
net, PCR and PLS.
* In context the RMSE and the correlation between predicted and observed values are
superb, suggesting that any of these models could be used to measure the fat content of
food by infrared spectroscopy.
* Given the similarities in model performance I was interested in constructing
confidence intervals around the RMSE. A function to compute bootstrap estimates, and
its results, are shown below.
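
As a small aside (not part of the original workflow): the manual RMSE lines in the
chunks below compute `sqrt(mean((obs - pred)^2))`; caret's `postResample()` returns the
same RMSE together with the R-squared, so for any of the prediction vectors computed
below one could equivalently call:

```r
# Equivalent to the manual RMSE calculations below, plus R-squared.
# Run after a prediction vector such as pls_preds has been created.
postResample(pred = pls_preds, obs = test_resp)
```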


```r
# Linear regression
test_linear <- data.frame(test[ ,-mc])
colnames(test_linear) <- colnames(training_linear)
linear_pred <- predict(linear_model, test_linear)
ggplot() + geom_point(aes(x = linear_pred, y = test_resp))
```

![](IR_data_61_files/figure-html/unnamed-chunk-5-1.png)\

```r
n <- length(test_resp)
RMSE_lm <- sqrt(sum((test_resp - linear_pred)^2)/n); RMSE_lm
```

```
## [1] 2.474259
```

```r
# Ridge regression
ridge_preds <- predict(ridge_model, test)
ggplot() + geom_point(aes(x = ridge_preds, y = test_resp))
```

![](IR_data_61_files/figure-html/unnamed-chunk-5-2.png)\

```r
RMSE_ridge <- sqrt(sum((test_resp - ridge_preds)^2)/n); RMSE_ridge
```

```
## [1] 3.214482
```

```r
# Lasso
lasso_preds <- predict(lasso_model, test)
ggplot() + geom_point(aes(x = lasso_preds, y = test_resp))
```

![](IR_data_61_files/figure-html/unnamed-chunk-5-3.png)\

```r
RMSE_lasso <- sqrt(sum((test_resp - lasso_preds)^2)/n); RMSE_lasso
```

```
## [1] 1.638846
```

```r
# Elastic net (identical to the lasso here, since lambda = 0 was selected)
enet_preds <- predict(enet_model, test)
ggplot() + geom_point(aes(x = enet_preds, y = test_resp))
```

![](IR_data_61_files/figure-html/unnamed-chunk-5-4.png)\

```r
RMSE_enet <- sqrt(sum((test_resp - enet_preds)^2)/n); RMSE_enet
```

```
## [1] 1.638846
```

```r
# PCR: project the test set onto the training-set principal components
pca_train <- princomp(training)
test_pcs <- predict(pca_train, test)
pcr_preds <- predict(pcr_results$final, test_pcs)
ggplot() + geom_point(aes(x = pcr_preds, y = test_resp))
```

![](IR_data_61_files/figure-html/unnamed-chunk-5-5.png)\

```r
RMSE_pcr <- sqrt(sum((test_resp - pcr_preds)^2)/n); RMSE_pcr
```

```
## [1] 1.934021
```

```r
# PLS
pls_preds <- predict(pls_model, test)
ggplot() + geom_point(aes(x = pls_preds, y = test_resp))
```

![](IR_data_61_files/figure-html/unnamed-chunk-5-6.png)\

```r
RMSE_pls <- sqrt(sum((test_resp - pls_preds)^2)/n); RMSE_pls
```

```
## [1] 1.69883
```

```r
cor(pls_preds, test_resp)
```

```
## [1] 0.993344
```

### 4. Compare performance on test set (contd.)
* Bootstrap estimates of the RMSE confidence intervals
    + PLS appears to be the preferred model: it shows the least variation in its RMSE
scores across the bootstrap samples. PCR is likely similar, but an issue with variable
naming meant I excluded it.
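
Before the hand-rolled bootstrap, a caret-native alternative is worth noting:
`resamples()` collects the cross-validation results of several `train()` objects for a
side-by-side comparison. A sketch is below; a caveat is that each model above was
resampled independently, so the folds differ between models, and a properly paired
comparison would pass a shared `index` to `trainControl()`.

```r
# Sketch: compare the models on their cross-validation resamples. The
# lattice bwplot() method draws RMSE distributions much like the bootstrap
# boxplot produced below.
resamp <- resamples(list(lm = linear_model, ridge = ridge_model,
                         lasso = lasso_model, enet = enet_model,
                         pls = pls_model))
summary(resamp)
bwplot(resamp, metric = "RMSE")
```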


```r
bootstrap_RMSE <- function(model, data, obs, trials = 1000, CI = 0.95) {
  
  n <- nrow(data)
  out <- list(results = data.frame(RMSE = NA), lower = NA, upper = NA)
  
  for (i in 1:trials) {
    # create a bootstrap sample of the test set
    samp <- sample(n, size = n, replace = TRUE)
    boot_obs <- obs[samp]
    boot_data <- data.frame(data[samp, ])
    colnames(boot_data) <- colnames(data)
    # predict and record the RMSE on the bootstrap sample
    preds <- predict(model, newdata = boot_data)
    out$results[i, 1] <- sqrt(sum((boot_obs - preds)^2)/n)
  }
  
  # percentile confidence interval at the requested level
  alpha <- (1 - CI)/2
  temp <- quantile(out$results$RMSE, probs = c(alpha, 1 - alpha), na.rm = TRUE)
  
  out$lower <- temp[1]
  out$upper <- temp[2]
  
  out
}
```



```r
# The bootstrap results
bRMSE_lm <- bootstrap_RMSE(linear_model, test_linear, test_resp)
bRMSE_ridge <- bootstrap_RMSE(ridge_model, test, test_resp)
bRMSE_lasso <- bootstrap_RMSE(lasso_model, test, test_resp)
bRMSE_enet <- bootstrap_RMSE(enet_model, test, test_resp)
# bRMSE_pcr <- bootstrap_RMSE(pcr_model, test, test_resp)
bRMSE_pls <- bootstrap_RMSE(pls_model, test, test_resp)

model_results <- data.frame(bRMSE_lm$results, bRMSE_ridge$results,
                            bRMSE_lasso$results, bRMSE_enet$results,
                            bRMSE_pls$results)
colnames(model_results) <- c('lm', 'ridge', 'lasso', 'enet', 'pls')

temp <- melt(model_results)
```

```
## No id variables; using all as measure variables
```

```r
ggplot(data = temp, aes(x = variable, y = value)) +
  geom_boxplot(width = 0.5) +
  theme_bw() +
  labs(title = 'Bootstrap Estimates of Model Performance',
       x = 'Model',
       y = 'RMSE')
```

![](IR_data_61_files/figure-html/unnamed-chunk-7-1.png)\

### Conclusion

* The clear signal in the data meant that, despite multicollinearity issues, several
linear model fitting methods had no problem producing extremely predictive models.
* The predictors were highly correlated and likely carried variations on the same
information. Possibly as a result of their ability to extract a low-dimensional signal
from many correlated variables, PCR and PLS appear to have a slight performance
advantage over the other models.
--------------------------------------------------------------------------------