├── classification-tree-and-rule-based-models
    ├── test
    ├── random_forests_143.R
    └── classification_trees_rules_142.R
├── README.md
├── linear-regression-and-cousins
    ├── IR_data_61_files
    │   └── figure-html
    │   │   ├── unnamed-chunk-2-1.png
    │   │   ├── unnamed-chunk-2-2.png
    │   │   ├── unnamed-chunk-4-1.png
    │   │   ├── unnamed-chunk-4-2.png
    │   │   ├── unnamed-chunk-4-3.png
    │   │   ├── unnamed-chunk-5-1.png
    │   │   ├── unnamed-chunk-5-2.png
    │   │   ├── unnamed-chunk-5-3.png
    │   │   ├── unnamed-chunk-5-4.png
    │   │   ├── unnamed-chunk-5-5.png
    │   │   ├── unnamed-chunk-5-6.png
    │   │   └── unnamed-chunk-7-1.png
    ├── permeability_data_62_files
    │   └── figure-html
    │   │   ├── unnamed-chunk-2-1.png
    │   │   ├── unnamed-chunk-3-1.png
    │   │   ├── unnamed-chunk-3-2.png
    │   │   ├── unnamed-chunk-3-3.png
    │   │   ├── unnamed-chunk-3-4.png
    │   │   └── unnamed-chunk-4-1.png
    ├── permeability_data_62.Rmd
    ├── chem_manufact_data_63.R
    ├── IR_data_61.Rmd
    ├── permeability_data_62.md
    └── IR_data_61.md
├── measuring-predictor-importance
    ├── oil_data_182_files
    │   └── figure-html
    │   │   ├── unnamed-chunk-2-1.png
    │   │   ├── unnamed-chunk-2-2.png
    │   │   └── unnamed-chunk-6-1.png
    ├── abalone_data_183_files
    │   └── figure-html
    │   │   ├── unnamed-chunk-2-1.png
    │   │   ├── unnamed-chunk-2-2.png
    │   │   ├── unnamed-chunk-3-1.png
    │   │   ├── unnamed-chunk-3-2.png
    │   │   ├── unnamed-chunk-4-1.png
    │   │   └── unnamed-chunk-4-2.png
    ├── churn_data_181_files
    │   └── figure-html
    │   │   ├── unnamed-chunk-1-1.png
    │   │   ├── unnamed-chunk-10-1.png
    │   │   ├── unnamed-chunk-11-1.png
    │   │   ├── unnamed-chunk-2-1.png
    │   │   ├── unnamed-chunk-4-1.png
    │   │   └── unnamed-chunk-9-1.png
    ├── oil_data_182.Rmd
    ├── oil_data_182.md
    ├── abalone_data_183.Rmd
    ├── churn_data_181.Rmd
    ├── abalone_data_183.md
    └── churn_data_181.md
├── non-linear-regression-models
    ├── svm_simulation_71.R
    └── friedman_data_72.R
├── linear-classification-models
    └── linear_classification_hepatic_121.R
└── data-pre-processing
    └── data_pre_proc.R


/classification-tree-and-rule-based-models/test:
--------------------------------------------------------------------------------
1 | hello
2 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Applied-Predictive-Modeling
2 | Exercises and R code related to the book Applied Predictive Modeling by Max Kuhn and Kjell Johnson
3 | 


--------------------------------------------------------------------------------
/linear-regression-and-cousins/IR_data_61_files/figure-html/unnamed-chunk-2-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oizin/applied-predictive-modeling/HEAD/linear-regression-and-cousins/IR_data_61_files/figure-html/unnamed-chunk-2-1.png


--------------------------------------------------------------------------------
/linear-regression-and-cousins/IR_data_61_files/figure-html/unnamed-chunk-2-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oizin/applied-predictive-modeling/HEAD/linear-regression-and-cousins/IR_data_61_files/figure-html/unnamed-chunk-2-2.png


--------------------------------------------------------------------------------
/linear-regression-and-cousins/IR_data_61_files/figure-html/unnamed-chunk-4-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oizin/applied-predictive-modeling/HEAD/linear-regression-and-cousins/IR_data_61_files/figure-html/unnamed-chunk-4-1.png


--------------------------------------------------------------------------------
/linear-regression-and-cousins/IR_data_61_files/figure-html/unnamed-chunk-4-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oizin/applied-predictive-modeling/HEAD/linear-regression-and-cousins/IR_data_61_files/figure-html/unnamed-chunk-4-2.png


--------------------------------------------------------------------------------
/linear-regression-and-cousins/IR_data_61_files/figure-html/unnamed-chunk-4-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oizin/applied-predictive-modeling/HEAD/linear-regression-and-cousins/IR_data_61_files/figure-html/unnamed-chunk-4-3.png


--------------------------------------------------------------------------------
/linear-regression-and-cousins/IR_data_61_files/figure-html/unnamed-chunk-5-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oizin/applied-predictive-modeling/HEAD/linear-regression-and-cousins/IR_data_61_files/figure-html/unnamed-chunk-5-1.png


--------------------------------------------------------------------------------
/linear-regression-and-cousins/IR_data_61_files/figure-html/unnamed-chunk-5-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oizin/applied-predictive-modeling/HEAD/linear-regression-and-cousins/IR_data_61_files/figure-html/unnamed-chunk-5-2.png


--------------------------------------------------------------------------------
/linear-regression-and-cousins/IR_data_61_files/figure-html/unnamed-chunk-5-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oizin/applied-predictive-modeling/HEAD/linear-regression-and-cousins/IR_data_61_files/figure-html/unnamed-chunk-5-3.png


--------------------------------------------------------------------------------
/linear-regression-and-cousins/IR_data_61_files/figure-html/unnamed-chunk-5-4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oizin/applied-predictive-modeling/HEAD/linear-regression-and-cousins/IR_data_61_files/figure-html/unnamed-chunk-5-4.png


--------------------------------------------------------------------------------
/linear-regression-and-cousins/IR_data_61_files/figure-html/unnamed-chunk-5-5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oizin/applied-predictive-modeling/HEAD/linear-regression-and-cousins/IR_data_61_files/figure-html/unnamed-chunk-5-5.png


--------------------------------------------------------------------------------
/linear-regression-and-cousins/IR_data_61_files/figure-html/unnamed-chunk-5-6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oizin/applied-predictive-modeling/HEAD/linear-regression-and-cousins/IR_data_61_files/figure-html/unnamed-chunk-5-6.png


--------------------------------------------------------------------------------
/linear-regression-and-cousins/IR_data_61_files/figure-html/unnamed-chunk-7-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oizin/applied-predictive-modeling/HEAD/linear-regression-and-cousins/IR_data_61_files/figure-html/unnamed-chunk-7-1.png


--------------------------------------------------------------------------------
/measuring-predictor-importance/oil_data_182_files/figure-html/unnamed-chunk-2-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oizin/applied-predictive-modeling/HEAD/measuring-predictor-importance/oil_data_182_files/figure-html/unnamed-chunk-2-1.png


--------------------------------------------------------------------------------
/measuring-predictor-importance/oil_data_182_files/figure-html/unnamed-chunk-2-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oizin/applied-predictive-modeling/HEAD/measuring-predictor-importance/oil_data_182_files/figure-html/unnamed-chunk-2-2.png


--------------------------------------------------------------------------------
/measuring-predictor-importance/oil_data_182_files/figure-html/unnamed-chunk-6-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oizin/applied-predictive-modeling/HEAD/measuring-predictor-importance/oil_data_182_files/figure-html/unnamed-chunk-6-1.png


--------------------------------------------------------------------------------
/measuring-predictor-importance/abalone_data_183_files/figure-html/unnamed-chunk-2-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oizin/applied-predictive-modeling/HEAD/measuring-predictor-importance/abalone_data_183_files/figure-html/unnamed-chunk-2-1.png


--------------------------------------------------------------------------------
/measuring-predictor-importance/abalone_data_183_files/figure-html/unnamed-chunk-2-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oizin/applied-predictive-modeling/HEAD/measuring-predictor-importance/abalone_data_183_files/figure-html/unnamed-chunk-2-2.png


--------------------------------------------------------------------------------
/measuring-predictor-importance/abalone_data_183_files/figure-html/unnamed-chunk-3-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oizin/applied-predictive-modeling/HEAD/measuring-predictor-importance/abalone_data_183_files/figure-html/unnamed-chunk-3-1.png


--------------------------------------------------------------------------------
/measuring-predictor-importance/abalone_data_183_files/figure-html/unnamed-chunk-3-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oizin/applied-predictive-modeling/HEAD/measuring-predictor-importance/abalone_data_183_files/figure-html/unnamed-chunk-3-2.png


--------------------------------------------------------------------------------
/measuring-predictor-importance/abalone_data_183_files/figure-html/unnamed-chunk-4-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oizin/applied-predictive-modeling/HEAD/measuring-predictor-importance/abalone_data_183_files/figure-html/unnamed-chunk-4-1.png


--------------------------------------------------------------------------------
/measuring-predictor-importance/abalone_data_183_files/figure-html/unnamed-chunk-4-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oizin/applied-predictive-modeling/HEAD/measuring-predictor-importance/abalone_data_183_files/figure-html/unnamed-chunk-4-2.png


--------------------------------------------------------------------------------
/measuring-predictor-importance/churn_data_181_files/figure-html/unnamed-chunk-1-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oizin/applied-predictive-modeling/HEAD/measuring-predictor-importance/churn_data_181_files/figure-html/unnamed-chunk-1-1.png


--------------------------------------------------------------------------------
/measuring-predictor-importance/churn_data_181_files/figure-html/unnamed-chunk-10-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oizin/applied-predictive-modeling/HEAD/measuring-predictor-importance/churn_data_181_files/figure-html/unnamed-chunk-10-1.png


--------------------------------------------------------------------------------
/measuring-predictor-importance/churn_data_181_files/figure-html/unnamed-chunk-11-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oizin/applied-predictive-modeling/HEAD/measuring-predictor-importance/churn_data_181_files/figure-html/unnamed-chunk-11-1.png


--------------------------------------------------------------------------------
/measuring-predictor-importance/churn_data_181_files/figure-html/unnamed-chunk-2-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oizin/applied-predictive-modeling/HEAD/measuring-predictor-importance/churn_data_181_files/figure-html/unnamed-chunk-2-1.png


--------------------------------------------------------------------------------
/measuring-predictor-importance/churn_data_181_files/figure-html/unnamed-chunk-4-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oizin/applied-predictive-modeling/HEAD/measuring-predictor-importance/churn_data_181_files/figure-html/unnamed-chunk-4-1.png


--------------------------------------------------------------------------------
/measuring-predictor-importance/churn_data_181_files/figure-html/unnamed-chunk-9-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oizin/applied-predictive-modeling/HEAD/measuring-predictor-importance/churn_data_181_files/figure-html/unnamed-chunk-9-1.png


--------------------------------------------------------------------------------
/linear-regression-and-cousins/permeability_data_62_files/figure-html/unnamed-chunk-2-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oizin/applied-predictive-modeling/HEAD/linear-regression-and-cousins/permeability_data_62_files/figure-html/unnamed-chunk-2-1.png


--------------------------------------------------------------------------------
/linear-regression-and-cousins/permeability_data_62_files/figure-html/unnamed-chunk-3-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oizin/applied-predictive-modeling/HEAD/linear-regression-and-cousins/permeability_data_62_files/figure-html/unnamed-chunk-3-1.png


--------------------------------------------------------------------------------
/linear-regression-and-cousins/permeability_data_62_files/figure-html/unnamed-chunk-3-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oizin/applied-predictive-modeling/HEAD/linear-regression-and-cousins/permeability_data_62_files/figure-html/unnamed-chunk-3-2.png


--------------------------------------------------------------------------------
/linear-regression-and-cousins/permeability_data_62_files/figure-html/unnamed-chunk-3-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oizin/applied-predictive-modeling/HEAD/linear-regression-and-cousins/permeability_data_62_files/figure-html/unnamed-chunk-3-3.png


--------------------------------------------------------------------------------
/linear-regression-and-cousins/permeability_data_62_files/figure-html/unnamed-chunk-3-4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oizin/applied-predictive-modeling/HEAD/linear-regression-and-cousins/permeability_data_62_files/figure-html/unnamed-chunk-3-4.png


--------------------------------------------------------------------------------
/linear-regression-and-cousins/permeability_data_62_files/figure-html/unnamed-chunk-4-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oizin/applied-predictive-modeling/HEAD/linear-regression-and-cousins/permeability_data_62_files/figure-html/unnamed-chunk-4-1.png


--------------------------------------------------------------------------------
/non-linear-regression-models/svm_simulation_71.R:
--------------------------------------------------------------------------------
 1 | # Chapter 7: Non-linear Regression models
 2 | #
 3 | # Script based on Ex 7.1 of Applied Predictive Modeling (Kuhn & Johnson, 2013)
 4 | # Exercise covers on data simulation and the SVM tuning parameters
 5 | #
 6 | ## SVM Notes:
 7 | # Can be seen as a form of robust regression.
 8 | # e-insensitive regression: data points with residuals inside the e boundary do not 
 9 | # contribute to the model while those outside contribute a linear scaled amount.
10 | # MIN Cost*SUM[L(y - y_hat)] + SUM(beta^2); L is the e-insensitve function.
11 | # Over-parameterised model, as many beta as points, however these are zero for 
12 | # points inside the e boundary.
13 | # The support vectors are those points with non-zero beta (or alpha!).
14 | # New points enter as dot-product of the support vectors. An extension to non-linear 
15 | # situations is to use other Kernel function around the dot product (e.g. radial
16 | # basis). Non-linear kernels have scaling parameters requiring tuning.
17 | #
18 | #
19 | # Packages ====================================================================
20 | library(kernlab)
21 | library(RColorBrewer)
22 | 
23 | # Simulate the data ===========================================================
24 | set.seed(100)
25 | x <- runif(100, min = 2, max = 10)
26 | y <- sin(x) + rnorm(length(x)) * .25
27 | sin_data <- data.frame(x = x, y = y)
28 | plot(x, y)
29 | 
30 | # Create a grid of x values to use for prediction
31 | data_grid <- data.frame(x = seq(2, 10, length = 100))
32 | 
33 | # Fit different models using a radial basis function and different values of ===
34 | # the cost (the C parameter)
35 | 
36 | mypalette <- brewer.pal(9, "Greens")
37 | 
38 | par(mar=c(5.1, 4.1, 4.1, 8.1), xpd=TRUE)
39 | plot(x, y)
40 | for (i in 1:9) {
41 |   radial_SVM <- ksvm(x = x, y = y, data = sin_data,
42 |     kernel ="rbfdot", kpar = "automatic",
43 |     C = i/2, epsilon = 0.1)  # move the cost from 0.5 to 4.5
44 |   model_preds <- predict(radial_SVM, newdata = data_grid)
45 |   points(x = data_grid$x, y = model_preds[,1], type = "l", col = mypalette[i])
46 | }
47 | legend("topright", inset=c(-0.2,0), lty = 1, legend=c(seq(0.5, 4.5, by = 0.5)), 
48 |   col = mypalette[1:10], title="Cost", cex = 0.5)
49 | par(xpd = FALSE)
50 | points(seq(1, 11, by = 0.1), sin(seq(1, 11, by = 0.1)), type = "l", lwd = 2)
51 | 
52 | # Fit different models using a radial basis function and different values of ===
53 | # the epsilon boundary parameter
54 | 
55 | mypalette <- brewer.pal(9, "Blues")
56 | 
57 | par(mar=c(5.1, 4.1, 4.1, 8.1), xpd=TRUE)
58 | plot(x, y, bty='L')
59 | for (i in 1:9) {
60 |   radial_SVM <- ksvm(x = x, y = y, data = sin_data,
61 |     kernel ="rbfdot", kpar = "automatic",
62 |     C = 1, epsilon = i/10) # move epsilon from 0.1 to 0.9
63 |   model_preds <- predict(radial_SVM, newdata = data_grid)
64 |   points(x = data_grid$x, y = model_preds[,1], type = "l", 
65 |     col = mypalette[i], 
66 |     lwd = 2)
67 | }
68 | legend("topright", inset=c(-0.2,0), lty = 1, legend=c(seq(0.1, 0.9, by = 0.1)), 
69 |   col = mypalette[1:10], title="Epsilon", cex = 0.5)
70 | par(xpd = FALSE)
71 | points(seq(1, 11, by = 0.1), sin(seq(1, 11, by = 0.1)), type = "l", lwd = 1)
72 | 
73 | # Fit different models using a radial basis function and different values of ===
74 | # sigma (the scaling parameter)
75 | 
76 | mypalette <- brewer.pal(9, "Reds")
77 | 
78 | par(mar=c(5.1, 4.1, 4.1, 8.1), xpd=TRUE)
79 | plot(x, y)
80 | for (i in 1:9) {
81 |   radial_SVM <- ksvm(x = x, y = y, data = sin_data,
82 |     kernel ="rbfdot", kpar = list(sigma = i), # move epsilon from 1 to 9
83 |     C = 1, epsilon = 0.1) 
84 |   model_preds <- predict(radial_SVM, newdata = data_grid)
85 |   points(x = data_grid$x, y = model_preds[,1], type = "l", 
86 |     col = mypalette[i], 
87 |     lwd = 2)
88 | }
89 | legend("topright", inset=c(-0.2,0), lty = 1, legend=c(seq(1, 9, by = 1)), 
90 |   col = mypalette[1:10], title="Sigma", cex = 0.5)
91 | par(xpd = FALSE)
92 | points(seq(1, 11, by = 0.1), sin(seq(1, 11, by = 0.1)), type = "l", lwd = 1)
93 | 
94 | 
95 | # sigma determines the extent smoothing, if zero left only with beta_0...
96 | # Cost determines error impact on model, e determines number of support vectors
97 | # book suggests fixing e and tuning the cost (there is a relationship between the two)
98 | 


--------------------------------------------------------------------------------
/non-linear-regression-models/friedman_data_72.R:
--------------------------------------------------------------------------------
  1 | # Chapter 7: Non-linear Regression models
  2 | #
  3 | # Script based on Ex 7.2 of Applied Predictive Modeling (Kuhn & Johnson, 2013)
  4 | # Exercise covers on data simulation and model fitting
  5 | #
  6 | #
  7 | # Packages ====================================================================
  8 | library(mlbench)
  9 | library(caret)
 10 | library(reshape2)
 11 | 
 12 | # Simulate the data ===========================================================
 13 | set.seed(200)
 14 | training <- mlbench.friedman1(200, sd = 1)  # data is in list format
 15 | training$x <- data.frame(training$x)  # convert predictor matrix to dataframe
 16 | 
 17 | # slight pattern in som of the generating variables X1 through X5 compared to the 
 18 | # noise variables X6 through X10
 19 | featurePlot(training$x, training$y)
 20 | 
 21 | # simulate a large test set for greater precision in error estimation
 22 | test <- mlbench.friedman1(5000, sd = 1)  
 23 | test$x <- data.frame(test$x)
 24 | 
 25 | # Fit and test a KNN model ======================================================
 26 | ctrl <- trainControl(method = 'repeatedcv', number = 10, repeats = 5)
 27 | 
 28 | set.seed(100)
 29 | knn_model <- train(x = training$x,
 30 |   y = training$y,
 31 |   method = "knn",
 32 |   preProc = c("center", "scale"),
 33 |   trControl = ctrl,
 34 |   tuneLength = 10)
 35 | knn_model
 36 | 
 37 | knn_pred <- predict(knn_model, newdata = test$x)
 38 | postResample(pred = knn_pred, obs = test$y)
 39 | 
 40 | # the predictions are within a narrower range, tendecy to underpredict high values
 41 | # and overpredict low values
 42 | plot(x = knn_pred, y = test$y, xlab = 'prediction', ylab = 'observed')
 43 | abline(a = 0, b = 1, lty = 2, col = 2)
 44 | 
 45 | # Fit and test a SVM model ======================================================
 46 | set.seed(100)
 47 | svmR_model <- train(x = training$x,
 48 |   y = training$y,
 49 |   method = "svmRadial",
 50 |   preProc = c("center", "scale"),
 51 |   trControl = ctrl,
 52 |   tuneLength = 10)
 53 | svmR_model
 54 | svmR_model$finalModel  # epsilon = 0.1
 55 | 
 56 | svmR_pred <- predict(svmR_model, newdata = test$x)
 57 | postResample(pred = svmR_pred, obs = test$y)
 58 | 
 59 | # Much better fit than the knn model
 60 | plot(x = svmR_pred, y = test$y, xlab = 'prediction', ylab = 'observed',
 61 |   main = 'SVM model')
 62 | abline(a = 0, b = 1, lty = 2, col = 2)
 63 | 
 64 | # Fit and test a MARS model ====================================================
 65 | set.seed(100)
 66 | mars_grid <- expand.grid(.degree = 1:2, .nprune = seq(7, 20, by = 2))
 67 | mars_model <- train(x = training$x,
 68 |   y = training$y,
 69 |   method = "earth",
 70 |   preProc = c("center", "scale"),
 71 |   trControl = ctrl,
 72 |   tuneGrid = mars_grid)
 73 | mars_model
 74 | 
 75 | # Model differentiated signal from noise variables, only X6 was not pruned
 76 | # from the models (and then only for degree = 1)
 77 | plotmo(mars_model$finalModel)
 78 | varImp(mars_model)
 79 | 
 80 | mars_pred <- predict(mars_model, test$x)
 81 | postResample(pred = mars_pred, obs = test$y)
 82 | 
 83 | # Close fit...
 84 | plot(x = mars_pred, y = test$y, xlab = 'prediction', ylab = 'observed',
 85 |   main = 'MARS model')
 86 | abline(a = 0, b = 1, lty = 2, col = 2)
 87 | 
 88 | # Fit and test a Neural Networks model =========================================
 89 | cor(training$x)
 90 | 
 91 | nnet_grid <- expand.grid(.decay = seq(0.5, 1.5, by = 0.1), # book recommends decay is 
 92 |   .size = seq(1, 15, by = 2), .bag = FALSE)                # between 0 and 0.1
 93 | set.seed(100)
 94 | nnet_model <- train(x = training$x,
 95 |   y = training$y,
 96 |   method = "avNNet",  # avNNet performs model averaging
 97 |   preProc = c("center", "scale"),
 98 |   trControl = ctrl,
 99 |   tuneGrid = nnet_grid,
100 |   linout = TRUE,
101 |   maxit = 500)
102 | nnet_model
103 | nnet_model$finalModel  # there are 5 final models
104 | 
105 | nnet_pred <- predict(nnet_model, test$x)
106 | postResample(pred = nnet_pred, obs = test$y)
107 | 
108 | # Slight tendency, as with knn, towards a thinner tailed prediction distribution
109 | # than actually observed
110 | plot(x = nnet_pred, y = test$y, xlab = 'prediction', ylab = 'observed',
111 |   main = 'Neural Networks model')
112 | abline(a = 0, b = 1, lty = 2, col = 2)
113 | 
114 | nnet_pred_dist <- data.frame(training_data = training$y, predictions = nnet_pred,
115 |   population_data = test$y)
116 | nnet_pred_dist <- melt(nnet_pred_dist); str(nnet_pred_dist)
117 | ggplot(data = nnet_pred_dist, aes(value, ..density.., colour = variable)) + 
118 |   geom_density() +
119 |   theme_
120 | 
121 | 


--------------------------------------------------------------------------------
/classification-tree-and-rule-based-models/random_forests_143.R:
--------------------------------------------------------------------------------
  1 | # Classification Trees and Rule Based Models
  2 | #
  3 | # Script based on Ex 14.3 of Applied Predictive Modeling (Kuhn & Johnson, 2013)
  4 | #
  5 | # AIM: investigate difference between CART and conditional inference random
  6 | # forest models
  7 | #
  8 | # In particular:
  9 | # a) The resulting models - 10-fold cv performance, optimal mtry
 10 | # b) Fitting time
 11 | # c) Most important variables - any differences?
 12 | # d) Impact of preprocessing on CART most important variables
 13 | #
 14 | # Load data and packages =======================================================
 15 | library(caret)
 16 | library(AppliedPredictiveModeling)
 17 | library(party)
 18 | 
 19 | data(hepatic)
 20 | 
 21 | 
 22 | # Fit a random forest model using both CART trees and conditional inference trees
 23 | set.seed(714)
 24 | indx <- createFolds(injury, returnTrain = TRUE)
 25 | ctrl <- trainControl(method = "cv", index = indx)
 26 | mtryValues <- c(5, 10, 25, 50, 75, 100)
 27 | 
 28 | # CART random forest
 29 | rfCART <- train(chem, injury,  
 30 |   method = "rf",
 31 |   metric = "Kappa",
 32 |   ntree = 1000,
 33 |   tuneGrid = data.frame(.mtry = mtryValues), 
 34 |   do.trace = TRUE)
 35 | rfCART
 36 | 
 37 | # Conditional inference random forest
 38 | rfcForest <- train(chem, injury,  # conditional inference trees
 39 |   method = "cforest",
 40 |   metric = "Kappa",
 41 |   tuneGrid = data.frame(.mtry = mtryValues))
 42 | rfcForest
 43 | 
 44 | 
 45 | # Fitting times ---------------------------------------------------------------
 46 | rfCART$times$everything
 47 | rfcForest$times$everything
 48 | 
 49 | 
 50 | # Variable importance ---------------------------------------------------------
 51 | varImp(rfCART)
 52 | temp <- varImp(rfCART)$importance
 53 | imp_var_rfCART <- rownames(temp)[order(temp$Overall, decreasing = TRUE)][1:20]
 54 | round(sapply(chem[imp_var_rfCART], var), 2)  # variance of variables
 55 | sapply(chem[imp_var_rfCART], range)  # range of variables
 56 | 
 57 | 
 58 | varImp(rfcForest)
 59 | temp <- varImp(rfcForest)$importance
 60 | imp_var_rfcForest <- rownames(temp)[order(temp$Overall, decreasing = TRUE)][1:20]
 61 | round(sapply(chem[imp_var_rfcForest], var), 2)  # variance of variables
 62 | sapply(chem[imp_var_rfcForest], range)  # range of variables
 63 | 
 64 | 
 65 | # The preference of the CART model for continuous rather than binary or discrete
 66 | # variables that take on few values is clear from the variances of the top 20 
 67 | # variables of each model. 
 68 | #
 69 | # Does preprocessing CART models reduce this bias? ----------------------------
 70 | 
 71 | rfCART.2 <- train(chem, injury,  
 72 |   method = "rf",
 73 |   metric = "Kappa",
 74 |   ntree = 1000,
 75 |   tuneGrid = data.frame(.mtry = mtryValues), 
 76 |   do.trace = TRUE,
 77 |   preProcess = c("center", "scale"))
 78 | rfCART.2
 79 | 
 80 | varImp(rfCART.2)
 81 | temp <- varImp(rfCART.2)$importance
 82 | imp_var_rfCART.2 <- rownames(temp)[order(temp$Overall, decreasing = TRUE)][1:20]
 83 | round(sapply(chem[imp_var_rfCART.2], var), 2)  # variance of variables
 84 | sapply(chem[imp_var_rfCART.2], range)  # range of variables
 85 | 
 86 | # Pre-processing alters the variables considered most important although cross
 87 | # validation performance is reduced. The preference towards high variance
 88 | # variables taking on many values is reduced. A different mtry value is return.
 89 | 
 90 | 
 91 | # Random Forests (CART vs cforest) ============================================
 92 | # CART random forest: Trees are high variance, low bias. Bagging attempts to 
 93 | # reduce the variance by bootstrap aggregating a number of trees. However as 
 94 | # each tree uses the same set of predictors they are not independent of one 
 95 | # another. Random forest attempts to deal with this by selecting a subset m of 
 96 | # the full set predictors at each split. 
 97 | # For categorical response the m is typically set at sqrt(p), p being the full
 98 | # set of predictors. The trees are then fit as normal, i.e. using the Gini index
 99 | # or an information criterion for classification.
100 | #
101 | # One issue with CART random forest is a bias in variable selection, specifically
102 | # a preference for continuous over categorical variables. Conditional inference 
103 | # trees (Hothorn et al. 2006) are one technique to overcome this. These trees 
104 | # are fit by performing a hypothesis test at each candidate split (across an 
105 | # exhaustive list) and generating p-values. This allows p-values rather than 
106 | # raw differences to be compared which decreases bias as they are on the same 
107 | # scale. Multiple comparison correction is also used, as such trees are not pruned 
108 | # as increased splits result in reduced power of the test and less likelihood
109 | # of false positive splits
110 | 


--------------------------------------------------------------------------------
/measuring-predictor-importance/oil_data_182.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: 'Ex18.1 Measuring predictor importance: oil data set'
  3 | author: "Oisin Fitzgerald"
  4 | output: 
  5 |   html_document:
  6 |     keep_md: true 
  7 | ---
  8 | Assessing varaible importance (and model fitting) may be more difficult with
  9 | a response factor with multiple levels. This script outlines the quantification
 10 | of variable importance in a data set where the response is a factor with seven
 11 | levels. The data set describes the fatty acid composition of seven oil types (e.g.
 12 | pumpkin (A), sunflower (B)....) measured using gas chromatography. More can be 
 13 | found in ?caret::oil.
 14 | 
 15 | Four approaches are taken to evaluating predictor importance, answer the following questions:
 16 | 
 17 | 1. Graphical display of the predictors for each level of the response (boxplots!):
 18 | how well separated do the classes appear?
 19 | 2. Analysis of variance: for each predictor is there more variance between rather
 20 | than within the response categories?
 21 | 3. Area under the ROC curve: what is the ability of a predictor to separate classes?
 22 | 4. ReliefF (incl permutation test): what is the ability of a predictor to separate classes and is this greater than would occur by chance?
 23 | ```{r}
 24 | # load data
 25 | library(caret)
 26 | data(oil)
 27 | str(oilType)
 28 | str(fattyAcids)
 29 | 
 30 | # load packages
 31 | library(reshape2)
 32 | library(CORElearn)
 33 | library(AppliedPredictiveModeling)
 34 | suppressMessages(library(pROC))
 35 | ```
 36 | 
 37 | ### 1. Boxplots 
 38 | Although not particularly pretty the boxplots does show that 
 39 | oleic and linoleic have the most variation between and within classes.
 40 | ```{r}
 41 | summary(fattyAcids)
 42 | # prepare data for boxplots
 43 | df <- data.frame(fattyAcids, oilType)
 44 | df <- melt(df)
 45 | df1 <- df[df$variable %in% c("Linoleic", "Oleic", "Palmitic"), ]
 46 | df2 <- df[!(df$variable %in% c("Linoleic", "Oleic", "Palmitic")), ]
 47 | # for linoleic and oleic
 48 | ggplot(data = df1, aes(y = value, x = oilType)) + 
 49 |   geom_boxplot() +
 50 |     facet_grid(. ~variable)
 51 | # the remaining variables
 52 | ggplot(data = df2, aes(y = value, x = oilType)) + 
 53 |   geom_boxplot() +
 54 |     facet_grid(. ~variable)
 55 | ```
 56 | 
 57 | ### 2. ANOVA
 58 | By modelling the fatty acids as the dependent variables and the oil 
 59 | types as the independent variables (a realistic description of the original
 60 | experiment) we can gain an understanding of likely differences between the means of 
 61 | factor levels for each fatty acid. The results suggest there are differences, although
 62 | an ANOVA won't indicate where the difference lie. Computing pairwise t-tests would
 63 | be required for this, but the amount of numbers to compare in this case is prohibitive compared to other methods.
 64 | ```{r}
 65 | anova_res <- function(x, y) {
 66 |   test <- anova(lm(x ~ y))
 67 |   out <- c(F_value = test$`F value`[1], p_value = test$`Pr(>F)`[1])
 68 |   out
 69 | }
 70 | anova_data <- apply(fattyAcids, MARGIN = 2, FUN = anova_res, y = oilType)
 71 | anova_data <- data.frame(t(anova_data))
 72 | anova_data[order(anova_data$p_value, decreasing = FALSE), ]
 73 | 
 74 | # method of pairwise comparison
 75 | # pairwise.t.test(x = fattyAcids$Palmitic, g = oilType, pool.sd = FALSE)
 76 | ```
 77 | 
 78 | ### 3. ROC and AUC
 79 | When there are three or more classes, filterVarImp will compute ROC curves
 80 | for each class versus the others and then returns the largest area under the
 81 | curve. In this case, with AUCs of 1, there are clearly some predictors that
 82 | can perfectly separate one class from the rest (but we could have guessed this
 83 | looking at the boxplots!).
 84 | ```{r}
 85 | # AUC results
 86 | auc_res <- filterVarImp(y = oilType, x = fattyAcids)
 87 | ```
 88 | 
 89 | ### 4. ReliefF and Permutation Test
 90 | The ReliefF algorithm takes a random sample of the data and calculates how far each
 91 | observation is from a randomly chosen observation of the same and a different (response) class.
 92 | ```{r}
 93 | relief_values <- attrEval(oilType ~ ., data = fattyAcids,
 94 |   estimator = "ReliefFequalK",  # calculation method
 95 |   ReliefIterations = 50)  # num iteration
 96 | relief_values[order(relief_values, decreasing = TRUE)]
 97 | ```
 98 | 
 99 | A permutation test adds to the ReliefF algorithm by allowing us to observe how the ReliefF score compares to a distribution of scores calculated on permutated data
100 | (i.e. where no predictor has any *real* relationship to the response)
101 | ```{r}
102 | relief_perm <- permuteRelief(x = fattyAcids, y = oilType, nperm = 500,
103 |   estimator = "ReliefFequalK",
104 |   ReliefIterations = 50)
105 | # format data for plotting
106 | relief_values <- data.frame(Predictor = names(relief_values), 
107 |   value = relief_values, 
108 |   row.names = NULL)
109 | # ggplot: facet wrapped histograms of permutations
110 | ggplot(data = relief_perm$permutations, aes(x = value)) + 
111 |   geom_histogram(binwidth = .01, colour = 1) +
112 |   geom_vline(aes(xintercept = value), relief_values, colour = "red", linetype = 2) +
113 |   facet_wrap(~ Predictor, scales = "free_x") + 
114 |   labs(title = "Relief Scores and Permutation Distributions", 
115 |     x = "Relief Scores\n*note free axes")
116 | ```
117 | 
118 | ```{r}
119 | # Standard deviation of permutated distribution from non-permutated score
120 | relief_perm$standardized[order(relief_perm$standardized)]
121 | ```
122 | 
123 | 


--------------------------------------------------------------------------------
/linear-classification-models/linear_classification_hepatic_121.R:
--------------------------------------------------------------------------------
  1 | # Linear Classification Methods
  2 | #
  3 | # Script based on Ex 12.1 of Applied Predictive Modeling (Kuhn & Johnson, 2013)
  4 | #
  5 | # AIM: predict whether a compound is likely to cause liver damage based 
  6 | # on biological and chemical variables
  7 | #
  8 | # In particular answering:
  9 | # a) What is a good train/test split given an imbalanced response
 10 | # b) Which of the linear classification techniques works best and what are the
 11 | # most important variables
 12 | # c) How does performance compare based on the bio and chem sata sets separately
 13 | # and together
 14 | #
 15 | # Load data and packages ======================================================
 16 | 
 17 | library(caret)
 18 | library(AppliedPredictiveModeling)
 19 | data(hepatic)
 20 | ?AppliedPredictiveModeling::bio
 21 | 
 22 | library(MASS)
 23 | library(corrplot)
 24 | 
 25 | # Create a training and test set ==============================================
 26 | table(injury)  # table of response levels
 27 | length(injury)  # how many obs
 28 | 
 29 | # Stratified sample: i.e. sample by factor level
 30 | # Create stratified sampling function
 31 | stratified_sample <- function(x, p) {  
 32 |   
 33 |   out <- NULL  # initialise output
 34 |   
 35 |   for (i in 1:length(levels(x))) {
 36 |   
 37 |     strata <- levels(x)[i]  # factor level to sample
 38 |     
 39 |     temp <- sample(which(x %in% strata),  # take sample by current level
 40 |       size = length(x[x == strata])*p, 
 41 |       replace = FALSE)
 42 |     
 43 |     out <- c(out, temp)  # add sample to previous sample
 44 |     
 45 |   }
 46 |   out <- sort(out)
 47 |   out  # output vector
 48 | }
 49 | 
 50 | training_index <- stratified_sample(x = injury, p = 0.6)
 51 | 
 52 | # Create the training/test sets
 53 | training_bio <- bio[training_index, ]
 54 | test_bio <- bio[-training_index, ]
 55 | training_chem <- chem[training_index, ]
 56 | test_chem <- chem[-training_index, ]
 57 | 
 58 | rm(bio, chem)  # remove unnecessary data
 59 | 
 60 | 
 61 | # Pre-process the data ========================================================
 62 | # 1. Near Zero Variance
 63 | nearZeroVar(training_bio, freqCut = 99/1, saveMetrics = TRUE)
 64 | remove <- nearZeroVar(training_bio, freqCut = 95/5)
 65 | training_bio <- training_bio[ ,-remove]
 66 | 
 67 | nearZeroVar(training_chem, freqCut = 99/1, saveMetrics = TRUE)
 68 | remove <- nearZeroVar(training_chem, freqCut = 95/5)
 69 | training_chem <- training_chem[ ,-remove]
 70 | 
 71 | # 2. Correlations
 72 | corr_bio <- cor(training_bio)
 73 | corrplot(corr_bio, tl.cex = 0.5)
 74 | remove <- findCorrelation(corr_bio)
 75 | training_bio <- training_bio[ ,-remove]
 76 | 
 77 | corr_chem <- cor(training_chem)
 78 | corrplot(corr_chem, tl.cex = 0.5)
 79 | remove <- findCorrelation(corr_chem)
 80 | training_chem <- training_chem[ ,-remove]
 81 | 
 82 | # 3. Transform, center and scale
 83 | preProc_bio <- preProcess(training_bio, method = c("center", "scale", "BoxCox"))
 84 | training_bio <- predict(preProc_bio, training_bio)
 85 | 
 86 | preProc_chem <- preProcess(training_chem, method = c("center", "scale", "BoxCox"))
 87 | training_chem <- predict(preProc_chem, training_chem)
 88 | 
 89 | rm(corr_bio, corr_chem)  # keep clean workspace
 90 | 
 91 | # 5. Prep test data sets (following training alterations)
 92 | temp <- colnames(test_bio) %in% colnames(training_bio)
 93 | test_bio <- test_bio[ ,temp]
 94 | test_bio <- predict(preProc_bio, test_bio)  # same preProc object
 95 | 
 96 | temp <- colnames(test_chem) %in% colnames(training_chem)
 97 | test_chem <- test_chem[ ,temp]
 98 | test_chem <- predict(preProc_chem, test_chem)  # same preProc object
 99 | 
100 | rm(preProc_bio, preProc_chem) 
101 | 
102 | # Fit linear classification models to optimise AUC ============================
103 | 
104 | # LDA -------------------------------------------------------------------------
105 | # Briefly: background to linear discriminant analysis (LDA)...
106 | # LDA models the distribution of the predictors X given the response Y (with k
107 | # classes) and uses Bayes theorem to flip these around into estimates of 
108 | # Pr(y = k|X = x). Let p(k) represent the overall (prior) probability that any 
109 | # random observation is from the kth class of Y. Let f(x = X|Y = k) denote the
110 | # density function of X for an observation that comes from the kth class. f(x) will
111 | # be high when a given value of X is likely to be associated with class k of Y.
112 | # Then Bayes theorem states Pr(y = k|X = x) = pk(x) = p(k)*fk(x)/SUM[p(l)*fl(x)]. 
113 | # p(k), the posterior probability, can be estimated using sample fractions of 
114 | # classes, however f(x) is more difficult w/out assuming a probability distribution
115 | # for X.
116 | # By modelling these distributions as Gaussian, and assuming equal variances the
117 | # discriminant function can be derived.
118 | 
119 | # 1a. fit LDA - biological data
120 | lda_bio <- lda(injury[training_index] ~ ., 
121 |   data = training_bio)
122 | 
123 | # 1a. fit LDA - chem data
124 | # failed - collinearity exists in prediction matrix
125 | # remove collinearity
126 | remove <- findLinearCombos(training_chem)$remove
127 | training_chem <- training_chem[ ,-remove]
128 | temp <- colnames(test_chem) %in% colnames(training_chem)
129 | test_chem <- test_chem[ ,temp]
130 | 
131 | lda_chem <- lda(injury[training_index] ~ ., 
132 |   data = training_chem)
133 | 
134 | # 2a. predict on test data - bio
135 | lda_bio_preds <- predict(lda_bio, test_bio)
136 | confusionMatrix(lda_bio_preds$class, injury[-training_index])
137 | 
138 | # 2b. predict on test data - chem
139 | lda_chem_preds <- predict(lda_chem, test_chem)
140 | confusionMatrix(lda_chem_preds$class, injury[-training_index])
141 | 
142 | 
143 | 
144 | 
145 | 


--------------------------------------------------------------------------------
/data-pre-processing/data_pre_proc.R:
--------------------------------------------------------------------------------
  1 | # Chapter 3: Data Pre-Processing
  2 | #
  3 | # Script based on Ex 3.1 to 3.3 of Applied Predictive Modeling (Kuhn & Johnson, 2013)
  4 | # Exercises focus on data pre-processing incl. missingness, distributions
  5 | # (skewness/outliers), transformations, near zero variance and visualising above...
  6 | #
  7 | # ==============================================================================
  8 | # Exercise 3.1 
  9 | # AIM: Visual data exploration (distributions/outliers) and explore possible
 10 | # transformations
 11 | #
 12 | # Notes:
 13 | #
 14 | #
 15 | # The data ---------------------------------------------------------------------
 16 | # 6 types of glass; defined in terms of their oxide content (i.e. Na, Fe, K, etc)
 17 | library(mlbench)
 18 | data(Glass)
 19 | str(Glass)
 20 | 
 21 | library(corrplot)
 22 | library(reshape2)
 23 | library(ggplot2)
 24 | library(caret)
 25 | library(e1071)
 26 | 
 27 | # Correlation matrix of the variables ------------------------------------------
 28 | Glass.corr <- cor(Glass[ ,1:9])  # correlation matrix of predictors
 29 | corrplot(Glass.corr, method = "number", type = "upper")  # correlation matrix visualisation
 30 | plot(Glass$RI, Glass$Ca, 
 31 |   main = "Scatterplot of RI and Ca") # evidence of linear relationship
 32 | 
 33 | # Histogram of the variables ---------------------------------------------------
 34 | Glass.stack <- melt(Glass)  # 3 columns w/ Type and variable name as factor variables
 35 | str(Glass.stack)
 36 | p <- ggplot(Glass.stack, aes(x = value))
 37 | p + geom_histogram(col = "blue") + facet_wrap(~ variable, scales = "free")
 38 | # RI, Na, Si, AL, Ca are relatively normal though with outliers
 39 | # Ba, Fe, K and Mg have large proportions of zero/near zero values
 40 | table(Glass$Ba == 0)/length(Glass$Ba)
 41 | 
 42 | # Near zero variances ----------------------------------------------------------
 43 | nearZeroVar(Glass, saveMetrics = TRUE)  # saveMetrics returns useful info
 44 | # Fe, Ba, K have large numbers of zero values
 45 | 
 46 | # Transformations: Skewness and Box Cox lamba estimates -----------------------
 47 | Glass.pred <- Glass[ ,1:9]
 48 | skewValues <- apply(Glass.pred, 2, skewness)
 49 | skewValues  # K > Ba > Ca > Fe > RI (all over |1|)
 50 | apply(Glass.pred[abs(skewValues) > 1], 2, BoxCoxTrans)  # BoxCox estimates
 51 | 
 52 | # Principal Component Analysis ------------------------------------------------
 53 | pcaObject <- prcomp(Glass.pred, center = TRUE, scale. = TRUE)
 54 | percentVariance <- pcaObject$sd^2/sum(pcaObject$sd^2)*100  # variance each component a/c for
 55 | round(percentVariance, 2)
 56 | round(head(pcaObject$rotation[ ,1:5]),2)  # variable loadings
 57 | 
 58 | rm(list = ls())  # clear enviroment
 59 | 
 60 | # ==============================================================================
 61 | # Exercise 3.2 
 62 | # AIM: Exploratory analysis focusing on identifying missing values and 
 63 | # near zero variance
 64 | #
 65 | # Notes:
 66 | #
 67 | #
 68 | # The data ---------------------------------------------------------------------
 69 | library(mlbench)
 70 | data(Soybean) 
 71 | ?mlbench::Soybean  # dna = does not apply
 72 | str(Soybean)  # factors and ordinal factors
 73 | 
 74 | library(caret)
 75 | library(ggplot2)
 76 | library(mi)  # for visualising missing values
 77 | library(ipred)  # imputation
 78 | 
 79 | # Near Zero Variances ---------------------------------------------------------
 80 | SoyZeroVar <- nearZeroVar(Soybean, freqCut = 95/5); SoyZeroVar  # 3 variables
 81 | qplot(Soybean[ ,19])
 82 | qplot(Soybean[ ,26])
 83 | qplot(Soybean[ ,28])
 84 | nearZeroVar(Soybean, freqCut = 90/10)  # 8 variables
 85 | 
 86 | # Missing Values a) Predictors -----------------------------------------------
 87 | table(is.na(Soybean))/(683*36)  # how many missing values?
 88 | tab.na <- matrix(data = NA, nrow = 2, ncol = 35)  # init matrix for storing NA info
 89 | rownames(tab.na) <- c("NA", "not NA")
 90 | colnames(tab.na) <- colnames(Soybean[-1])  # [-1] to leave out Class
 91 | 
 92 | # Create a table with NA vs not NA values for each variable
 93 | for (i in 2:36) { 
 94 |   tab.na[ , i-1] <- table(is.na(Soybean[ , i]))/(length(Soybean[ , i]))
 95 | }
 96 | tab.na <- tab.na[ ,order(tab.na[1, ])]
 97 | 
 98 | # Visualise the results
 99 | barplot(tab.na[1, -12], cex.names = 0.8, las = 2, 
100 |   main = "Proportion of non-missing values")
101 | 
102 | # Alternate visualisation
103 | mSoybean <- missing_data.frame(Soybean)
104 | image(mSoybean)
105 | 
106 | # Missing Values b) response (incl. general data missingness by response level)
107 | table(is.na(Soybean[,1]))  # all the class obs are present
108 | 
109 | for (i in 1:683) {  # loop to count missing values per row
110 |   Soybean[i, 37] <- sum(is.na(Soybean)[i, ])
111 | }
112 | colnames(Soybean)[37] <- "missing"
113 | Soybean.na <- subset(Soybean, select = c(Class, missing))  # subset by Class
114 | Soybean[ , 37] <- NULL  # rm extra column
115 | missing.by.Class <- aggregate(missing ~ Class, data = Soybean.na, FUN = "sum")
116 | plot(missing.by.Class, 
117 |   cex.axis = 0.7, 
118 |   las = 2, 
119 |   xlab = "", 
120 |   main = "Missing Values per Class")
121 | # phytophthora-rot most impacted by missing values, followed by 2-4-d-injury and cyst-nematode
122 | 
123 | # Initial attempt to deal with the missing values / NZV -----------------------
124 | Soybean.filtered <- Soybean[ , -SoyZeroVar] # rm zero var variables
125 | dummies <- dummyVars(Class ~ ., data = Soybean.filtered, na.action = na.pass)
126 | Soybean.filtered <- predict(dummies, newdata = Soybean)  # dummy variables
127 | preProcess(Soybean.filtered, method="bagImpute")  # impute missing values using 
128 | # bagged tree method
129 | 
130 | rm(list = ls())  # clear enviroment
131 | 
132 | # ==============================================================================
133 | # Exercise 3.3
134 | # AIM: Data exploration with a focus on associated predictors / near zero variance
135 | #
136 | # Notes:
137 | #
138 | #
139 | # The data ---------------------------------------------------------------------
140 | library(caret)
141 | data(BloodBrain)
142 | ?BloodBrain  # each of the 208 compounds have multiple descriptors
143 | str(bbbDescr)
144 | 
145 | library(corrplot); ?corrplot
146 | library(e1071)
147 | 
148 | # Correlation matrix of the variables ------------------------------------------
149 | bbbDescr.corr <- cor(bbbDescr)
150 | corrplot(bbbDescr.corr, method = "shade", tl.cex = 0.4, order = "hclust")
151 | # corrplot indicates evidence of linear relationship between many variables
152 | 
153 | highCorr <- findCorrelation(bbbDescr.corr, cutoff = .75)  # which variables?
154 | length(highCorr)  # 66 variables included
155 | bbbDescr.filtered <- bbbDescr[, -highCorr]  # rm algorithm selected high correlations
156 | 
157 | # Skewness and transformations ------------------------------------------------
158 | SkewValues <- apply(bbbDescr, 2, skewness)
159 | SkewedVariables <- subset(SkewValues, abs(SkewValues) > 1)
160 | length(SkewedVariables)  # 71 variables
161 | 
162 | # BoxCox Transformations
163 | preProc <- preProcess(bbbDescr.filtered, method = "BoxCox")
164 | bbbDescr.proc <- predict(preProc, bbbDescr.filtered)
165 | 
166 | # Near Zero Variance ----------------------------------------------------------
167 | nearZeroVar(bbbDescr)  # 7 variabless
168 | bbbDescr.nearzero <- nearZeroVar(bbbDescr.proc) 
169 | bbbDescr.final <- bbbDescr.proc[ , -bbbDescr.nearzero]
170 | 
171 | # Leading to the final dataframe -----------------------------------------------
172 | # remaining correlations
173 | corrplot(cor(bbbDescr.final), method = "shade", tl.cex = 0.4, order = "hclust")
174 | # skewness following tranformations
175 | table(apply(bbbDescr.final, 2, skewness) > 1)  
176 | 
177 | rm(list = ls())  # clear enviroment
178 | 
179 | 
180 | 
181 | 


--------------------------------------------------------------------------------
/linear-regression-and-cousins/permeability_data_62.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: 'Ex6.2 - Linear regression: molecule permeability'
  3 | author: "Oisin Fitzgerald"
  4 | date: "27 January 2016"
  5 | output: 
  6 |   html_document:
  7 |     keep_md: true 
  8 | ---
  9 | 
 10 | ### The data 
 11 | 
 12 | This pharmaceutical data set was used to develop a model for predicting compounds' 
 13 | permeability, a measure of a molecule's ability to cross a membrane. Permeability
 14 | impacts on a potential drug's usefulness, i.e. it needs to be able to cross
 15 | certain membranes to be effective. There exist assays to measure a compund's
 16 | permeability. The gaol here is to develop a predictive model for permeability 
 17 | in an attempt to potentially reduce the need for the assay. 
 18 | (brief description, more on ?permeability)
 19 | 
 20 | ### Outline
 21 | 
 22 | * Training/test split
 23 | * Pre-process the data
 24 |     + Skewness of response
 25 |     + Sparseness
 26 | * Fit and test PLS models
 27 | * Fit and ridge regression, LASSO and elastic net models
 28 | 
 29 | ```{r}
 30 | # Load data and packages
 31 | library(AppliedPredictiveModeling)
 32 | data(permeability)
 33 | 
 34 | suppressMessages(library(caret))
 35 | suppressMessages(library(pls))
 36 | suppressMessages(library(elasticnet))
 37 | suppressMessages(library(lars))
 38 | ```
 39 | 
 40 | ### Data splitting and pre-processing
 41 | 
 42 | ```{r}
 43 | fingerprints <- data.frame(fingerprints)
 44 | permeability <- as.vector(permeability)
 45 | # Create training/test split index
 46 | Split <- createDataPartition(permeability, times = 1, p = 0.75)
 47 | Split <- Split$Resample1 
 48 | # Create training and test splits
 49 | training <- fingerprints[Split, ]
 50 | test <- fingerprints[-Split, ]
 51 | response_train <- permeability[Split]
 52 | response_test <- permeability[-Split]
 53 | 
 54 | # Pre-processing
 55 | training_filter <- training[ ,-nearZeroVar(training, freqCut = 95/5)]  # near zero variances
 56 | # binary data...
 57 | 
 58 | # positively skewed
 59 | ggplot() + geom_histogram(aes(x = permeability), binwidth = 4, col = 1) + 
 60 |   labs(title = "Histogram of permeability", x = "Molecule permeability") +
 61 |   theme_bw()
 62 | preProcess(data.frame(permeability), method = c("YeoJohnson"))
 63 | ```
 64 | 
 65 | ### Fit and test partial least squares models
 66 | 
 67 | Three variations on a PLS model were fit and tested:
 68 | 1. A model fit to the full training set
 69 | 2. A model fit to the near zero variance reduced training set
 70 | 3. A model fit to a log transformed response
 71 | 
 72 | None of the models created appear to have the predictive ability to replace the 
 73 | mentioned assay method, they lack accurate predictive ability. The log(response)
 74 | PLS model (and others) appears to shown promise within a certain range. Possibly 
 75 | the current linear technique is too restrictive. Some models are producing negative 
 76 | predictions - multicolinearity?. 
 77 | 
 78 | ```{r}
 79 | # Fit PLS models
 80 | ctrl = trainControl("repeatedcv", number = 5, repeats = 5)
 81 | pls_model.1 <- train(y = response_train,
 82 |   x = training,
 83 |   method = "pls",
 84 |   metric = "Rsquared",
 85 |   tuneLength = 10,
 86 |   trControl = ctrl)
 87 | pls_model.1
 88 | 
 89 | pls_model.2 <- train(y = response_train,
 90 |   x = training_filter,
 91 |   method = "pls",
 92 |   metric = "Rsquared",
 93 |   tuneLength = 10,
 94 |   trControl = ctrl)
 95 | pls_model.2
 96 | 
 97 | pls_model.3 <- train(y = log(response_train),
 98 |   x = training,
 99 |   method = "pls",
100 |   metric = "Rsquared",
101 |   tuneLength = 10,
102 |   trControl = ctrl)
103 | pls_model.3
104 | 
105 | # post hoc: remove multicollinearity and refit
106 | remove <- findLinearCombos(training_filter)
107 | training_filter2 <- training_filter[ ,remove$remove]
108 | 
109 | pls_model.4 <- train(y = response_train,
110 |   x = training_filter2,
111 |   method = "pls",
112 |   metric = "Rsquared",
113 |   tuneLength = 10,
114 |   trControl = ctrl)
115 | pls_model.4
116 | 
117 | 
118 | # Predict on test data
119 | pls1_preds <- predict(pls_model.1, test)
120 | RMSE(pls1_preds, response_test)
121 | cor(pls1_preds, response_test)^2  # Rsquared
122 | ggplot() + 
123 |   geom_point(aes(x = pls1_preds, y = response_test)) + 
124 |   theme_bw() +
125 |   labs(title = "PLS model 1 predictions vs. observed", 
126 |     x = "predicted permeability", 
127 |     y = "observed permeability")
128 | 
129 | pls2_preds <- predict(pls_model.2, test)
130 | RMSE(pls2_preds, response_test)
131 | cor(pls2_preds, response_test)^2  # Rsquared
132 | ggplot() + 
133 |   geom_point(aes(x = pls2_preds, y = response_test)) + 
134 |   theme_bw() +
135 |   labs(title = "PLS model 2 predictions vs. observed", 
136 |     x = "predicted permeability", 
137 |     y = "observed permeability")
138 | 
139 | pls3_preds <- predict(pls_model.3, test)
140 | RMSE(exp(pls3_preds), response_test)
141 | cor(exp(pls3_preds), response_test)^2  # Rsquared
142 | ggplot() + 
143 |   geom_point(aes(x = pls3_preds, y = response_test)) + 
144 |   theme_bw() +
145 |   labs(title = "PLS model 3 predictions vs. observed", 
146 |     x = "predicted permeability", 
147 |     y = "observed permeability")
148 | # can it predict well within a certain range??
149 | RMSE(exp(pls3_preds[response_test<20]), response_test[response_test<20])
150 | 
151 | pls4_preds <- predict(pls_model.4, test)
152 | RMSE(pls4_preds, response_test)
153 | cor(pls4_preds, response_test)^2  # Rsquared
154 | ggplot() + 
155 |   geom_point(aes(x = pls4_preds, y = response_test)) + 
156 |   theme_bw() +
157 |   labs(title = "PLS model 4 predictions vs. observed", 
158 |     x = "predicted permeability", 
159 |     y = "observed permeability")
160 | 
161 | ```
162 | 
163 | #### Fit and ridge regression, LASSO and elastic net models
164 | 
165 | As with the PLS models, none of the models were stong predictors of the data, suggesting
166 | that laboratory methods of measuring permeability are prefereable (at least) to the
167 | models fitted. The PLS models actually outperformed the shrinkage methods. Further the 
168 | cross validated estimates of RMSE and R2 were quite inaccurate compared to the test fits. 
169 | There appears to be residual instability in the cofficients even in these shrinkage methods, 
170 | with enet and ridge producing hugely negative predictions and RMSE before being tuned 
171 | over a predefined range of shrinkage coefficients. 
172 | 
173 | ```{r}
174 | # Fit shrinkage models
175 | ctrl <- trainControl("cv", number = 5)
176 | 
177 | ridge_grid <- expand.grid(.lambda = seq(0.05, 0.2, 0.01))
178 | ridge_model <- train(y = response_train, 
179 |   x = training_filter2,  # model fitting impacted by zero variance
180 |   method = "ridge",
181 |   tuneGrid = ridge_grid,
182 |   metric = "RMSE",
183 |   trControl = ctrl)
184 | ridge_model
185 | # remaining instability in the coefficients?
186 | plot(ridge_model$finalModel)
187 | title(main = "Ridge regression coefficient stability")  
188 | 
189 | lasso_model <- train(y = response_train, 
190 |   x = training_filter2,  
191 |   method = "lasso",
192 |   tuneLength = 10,
193 |   metric = "RMSE",
194 |   trControl = ctrl)
195 | lasso_model
196 | 
197 | enet_grid <- expand.grid(.lambda = c(0.01, 0.02, 0.03, 0.05), .fraction = c(seq(0.00001, 0.2, 0.02)))
198 | enet_model <- train(y = response_train, 
199 |   x = training_filter,
200 |   method = "enet",
201 |   tuneGrid = enet_grid,
202 |   metric = "RMSE",
203 |   trControl = ctrl)
204 | enet_model
205 | 
206 | 
207 | # Test shrinkage models
208 | ridge_preds <- predict(ridge_model, test)
209 | RMSE(ridge_preds, response_test)
210 | cor(ridge_preds, response_test)^2
211 | 
212 | lasso_preds <- predict(lasso_model, test)
213 | RMSE(lasso_preds, response_test)
214 | cor(lasso_preds, response_test)^2
215 | 
216 | enet_preds <- predict(enet_model, test)
217 | RMSE(enet_preds, response_test)
218 | cor(enet_preds, response_test)^2
219 | ```
220 | 
221 | 
222 | 
223 | 


--------------------------------------------------------------------------------
/measuring-predictor-importance/oil_data_182.md:
--------------------------------------------------------------------------------
  1 | # Ex18.1 Measuring predictor importance: oil data set
  2 | Oisin Fitzgerald  
  3 | Assessing varaible importance (and model fitting) may be more difficult with
  4 | a response factor with multiple levels. This script outlines the quantification
  5 | of variable importance in a data set where the response is a factor with seven
  6 | levels. The data set describes the fatty acid composition of seven oil types (e.g.
  7 | pumpkin (A), sunflower (B)....) measured using gas chromatography. More can be 
  8 | found in ?caret::oil.
  9 | 
 10 | Four approaches are taken to evaluating predictor importance, answer the following questions:
 11 | 
 12 | 1. Graphical display of the predictors for each level of the response (boxplots!):
 13 | how well separated do the classes appear?
 14 | 2. Analysis of variance: for each predictor is there more variance between rather
 15 | than within the response categories?
 16 | 3. Area under the ROC curve: what is the ability of a predictor to separate classes?
 17 | 4. ReliefF (incl permutation test): what is the ability of a predictor to separate classes and is this greater than would occur by chance?
 18 | 
 19 | ```r
 20 | # load data
 21 | library(caret)
 22 | ```
 23 | 
 24 | ```
 25 | ## Loading required package: lattice
 26 | ## Loading required package: ggplot2
 27 | ```
 28 | 
 29 | ```r
 30 | data(oil)
 31 | str(oilType)
 32 | ```
 33 | 
 34 | ```
 35 | ##  Factor w/ 7 levels "A","B","C","D",..: 1 1 1 1 1 1 1 1 1 1 ...
 36 | ```
 37 | 
 38 | ```r
 39 | str(fattyAcids)
 40 | ```
 41 | 
 42 | ```
 43 | ## 'data.frame':	96 obs. of  7 variables:
 44 | ##  $ Palmitic  : num  9.7 11.1 11.5 10 12.2 9.8 10.5 10.5 11.5 10 ...
 45 | ##  $ Stearic   : num  5.2 5 5.2 4.8 5 4.2 5 5 5.2 4.8 ...
 46 | ##  $ Oleic     : num  31 32.9 35 30.4 31.1 43 31.8 31.8 35 30.4 ...
 47 | ##  $ Linoleic  : num  52.7 49.8 47.2 53.5 50.5 39.2 51.3 51.3 47.2 53.5 ...
 48 | ##  $ Linolenic : num  0.4 0.3 0.2 0.3 0.3 2.4 0.4 0.4 0.2 0.3 ...
 49 | ##  $ Eicosanoic: num  0.4 0.4 0.4 0.4 0.4 0.4 0.4 0.4 0.4 0.4 ...
 50 | ##  $ Eicosenoic: num  0.1 0.1 0.1 0.1 0.1 0.5 0.1 0.1 0.1 0.1 ...
 51 | ```
 52 | 
 53 | ```r
 54 | # load packages
 55 | library(reshape2)
 56 | library(CORElearn)
 57 | library(AppliedPredictiveModeling)
 58 | suppressMessages(library(pROC))
 59 | ```
 60 | 
 61 | ### 1. Boxplots 
 62 | Although not particularly pretty the boxplots does show that 
 63 | oleic and linoleic have the most variation between and within classes.
 64 | 
 65 | ```r
 66 | summary(fattyAcids)
 67 | ```
 68 | 
 69 | ```
 70 | ##     Palmitic        Stearic          Oleic          Linoleic    
 71 | ##  Min.   : 4.50   Min.   :1.700   Min.   :22.80   Min.   : 7.90  
 72 | ##  1st Qu.: 6.20   1st Qu.:3.475   1st Qu.:26.30   1st Qu.:43.10  
 73 | ##  Median : 9.85   Median :4.200   Median :30.70   Median :50.80  
 74 | ##  Mean   : 9.04   Mean   :4.200   Mean   :36.73   Mean   :46.49  
 75 | ##  3rd Qu.:11.12   3rd Qu.:5.000   3rd Qu.:38.62   3rd Qu.:58.08  
 76 | ##  Max.   :14.90   Max.   :6.700   Max.   :76.70   Max.   :66.10  
 77 | ##    Linolenic       Eicosanoic      Eicosenoic    
 78 | ##  Min.   :0.100   Min.   :0.100   Min.   :0.1000  
 79 | ##  1st Qu.:0.375   1st Qu.:0.100   1st Qu.:0.1000  
 80 | ##  Median :0.800   Median :0.400   Median :0.1000  
 81 | ##  Mean   :2.272   Mean   :0.399   Mean   :0.3115  
 82 | ##  3rd Qu.:2.650   3rd Qu.:0.400   3rd Qu.:0.3000  
 83 | ##  Max.   :9.500   Max.   :2.800   Max.   :1.8000
 84 | ```
 85 | 
 86 | ```r
 87 | # prepare data for boxplots
 88 | df <- data.frame(fattyAcids, oilType)
 89 | df <- melt(df)
 90 | ```
 91 | 
 92 | ```
 93 | ## Using oilType as id variables
 94 | ```
 95 | 
 96 | ```r
 97 | df1 <- df[df$variable %in% c("Linoleic", "Oleic", "Palmitic"), ]
 98 | df2 <- df[!(df$variable %in% c("Linoleic", "Oleic", "Palmitic")), ]
 99 | # for linoleic and oleic
100 | ggplot(data = df1, aes(y = value, x = oilType)) + 
101 |   geom_boxplot() +
102 |     facet_grid(. ~variable)
103 | ```
104 | 
105 | ![](oil_data_182_files/figure-html/unnamed-chunk-2-1.png) 
106 | 
107 | ```r
108 | # the remaining variables
109 | ggplot(data = df2, aes(y = value, x = oilType)) + 
110 |   geom_boxplot() +
111 |     facet_grid(. ~variable)
112 | ```
113 | 
114 | ![](oil_data_182_files/figure-html/unnamed-chunk-2-2.png) 
115 | 
116 | ### 2. ANOVA
117 | By modelling the fatty acids as the dependent variables and the oil 
118 | types as the independent variables (a realistic description of the original
119 | experiment) we can gain an understanding of likely differences between the means of 
120 | factor levels for each fatty acid. The results suggest there are differences, although
121 | an ANOVA won't indicate where the difference lie. Computing pairwise t-tests would
122 | be required for this, but the amount of numbers to compare in this case is prohibitive compared to other methods.
123 | 
124 | ```r
125 | anova_res <- function(x, y) {
126 |   test <- anova(lm(x ~ y))
127 |   out <- c(F_value = test$`F value`[1], p_value = test$`Pr(>F)`[1])
128 |   out
129 | }
130 | anova_data <- apply(fattyAcids, MARGIN = 2, FUN = anova_res, y = oilType)
131 | anova_data <- data.frame(t(anova_data))
132 | anova_data[order(anova_data$p_value, decreasing = FALSE), ]
133 | ```
134 | 
135 | ```
136 | ##               F_value      p_value
137 | ## Linoleic   317.956365 7.400916e-58
138 | ## Oleic      283.022304 1.018427e-55
139 | ## Linolenic  186.689720 3.445215e-48
140 | ## Stearic     97.030804 7.197315e-37
141 | ## Palmitic    95.029863 1.598196e-36
142 | ## Eicosenoic  20.851957 4.033448e-15
143 | ## Eicosanoic   5.894469 3.263021e-05
144 | ```
145 | 
146 | ```r
147 | # method of pairwise comparison
148 | # pairwise.t.test(x = fattyAcids$Palmitic, g = oilType, pool.sd = FALSE)
149 | ```
150 | 
151 | ### 3. ROC and AUC
152 | When there are three or more classes, filterVarImp will compute ROC curves
153 | for each class versus the others and then returns the largest area under the
154 | curve. In this case, with AUCs of 1, there are clearly some predictors that
155 | can perfectly separate one class from the rest (but we could have guessed this
156 | looking at the boxplots!).
157 | 
158 | ```r
159 | # AUC results
160 | auc_res <- filterVarImp(y = oilType, x = fattyAcids)
161 | ```
162 | 
163 | ### 4. ReliefF and Permutation Test
164 | The ReliefF algorithm takes a random sample of the data and calculates how far each
165 | observation is from a randomly chosen observation of the same and a different (response) class.
166 | 
167 | ```r
168 | relief_values <- attrEval(oilType ~ ., data = fattyAcids,
169 |   estimator = "ReliefFequalK",  # calculation method
170 |   ReliefIterations = 50)  # num iteration
171 | relief_values[order(relief_values, decreasing = TRUE)]
172 | ```
173 | 
174 | ```
175 | ##   Linoleic    Stearic      Oleic   Palmitic  Linolenic Eicosanoic 
176 | ## 0.63730668 0.54491315 0.52274769 0.43404607 0.24963549 0.10965929 
177 | ## Eicosenoic 
178 | ## 0.08903402
179 | ```
180 | 
181 | A permutation test adds to the ReliefF algorithm by allowing us to observe how the ReliefF score compares to a distribution of scores calculated on permutated data
182 | (i.e. where no predictor has any *real* relationship to the response)
183 | 
184 | ```r
185 | relief_perm <- permuteRelief(x = fattyAcids, y = oilType, nperm = 500,
186 |   estimator = "ReliefFequalK",
187 |   ReliefIterations = 50)
188 | # format data for plotting
189 | relief_values <- data.frame(Predictor = names(relief_values), 
190 |   value = relief_values, 
191 |   row.names = NULL)
192 | # ggplot: facet wrapped histograms of permutations
193 | ggplot(data = relief_perm$permutations, aes(x = value)) + 
194 |   geom_histogram(binwidth = .01, colour = 1) +
195 |   geom_vline(aes(xintercept = value), relief_values, colour = "red", linetype = 2) +
196 |   facet_wrap(~ Predictor, scales = "free_x") + 
197 |   labs(title = "Relief Scores and Permutation Distributions", 
198 |     x = "Relief Scores\n*note free axes")
199 | ```
200 | 
201 | ![](oil_data_182_files/figure-html/unnamed-chunk-6-1.png) 
202 | 
203 | 
204 | ```r
205 | # Standard deviation of permutated distribution from non-permutated score
206 | relief_perm$standardized[order(relief_perm$standardized)]
207 | ```
208 | 
209 | ```
210 | ## Eicosanoic Eicosenoic  Linolenic   Palmitic      Oleic    Stearic 
211 | ##   2.989642   5.748575   9.965823  12.783484  15.531254  16.228189 
212 | ##   Linoleic 
213 | ##  18.022441
214 | ```
215 | 
216 | 


--------------------------------------------------------------------------------
/linear-regression-and-cousins/chem_manufact_data_63.R:
--------------------------------------------------------------------------------
  1 | # Chapter 6: Linear Regression and its Cousins
  2 | #
  3 | # Script based on Ex 6.3 of Applied Predictive Modeling (Kuhn & Johnson, 2013)
  4 | # Exercise covers on data pre-processing, train/test splitting, missing value 
  5 | # imputation, model fitting and assessing variable importance scores
  6 | #
  7 | # Notes:
  8 | #
  9 | #
 10 | # The data and packages =======================================================
 11 | # Response: % yield from a pharmaceutical product's manufacture
 12 | # Predictors: biological and manufacturing process variables
 13 | 
 14 | library(AppliedPredictiveModeling)
 15 | data(ChemicalManufacturingProcess)
 16 | str(ChemicalManufacturingProcess)
 17 | 
 18 | library(caret)
 19 | library(fBasics)
 20 | library(corrplot)
 21 | library(car)
 22 | 
 23 | # Split into training and test data ===========================================
 24 | yield <- ChemicalManufacturingProcess$Yield
 25 | Split <- createDataPartition(yield, times = 1, p = 0.75)
 26 | # predictors...
 27 | training <- ChemicalManufacturingProcess[Split$Resample1, -1]
 28 | test <- ChemicalManufacturingProcess[-Split$Resample1, -1]
 29 | # response ...
 30 | resp_train <- yield[Split$Resample1]
 31 | resp_test <- yield[-Split$Resample1]
 32 | 
 33 | # Preprocess (inlc. missing data) =============================================
 34 | preproc_1 <- preProcess(training, 
 35 |   method = c("knnImpute", "center", "scale"), k = 5)
 36 | # centering and scaling are important when using e.g PLS/PCR as the methods
 37 | # seek directions of maximal variation
 38 | 
 39 | training <- predict(preproc_1, training)
 40 | test <- predict(preproc_1, test)
 41 | 
 42 | XX <- cor(train)
 43 | corrplot(XX, tl.cex = 0.3)  # some strong correlations
 44 | rk(XX)  # rank less than num vars; linear combinations
 45 | remove <- findLinearCombos(XX)
 46 | training <- training[ ,-remove$remove]
 47 | test <- test[ ,-remove$remove]
 48 | 
 49 | # what columns should be removed to reduce the correlations?
 50 | remove <- findCorrelation(XX, cutoff = .90) 
 51 | training_filter <- training[ ,-remove]
 52 | test_filter <- test[ ,-remove]
 53 | 
 54 | # Transform data - estimate lambda with Yeo Johnson
 55 | preproc_3 <- preProcess(training, method = "YeoJohnson")
 56 | training <- predict(preproc_3, training)
 57 | test <- predict(preproc_3, test)
 58 | 
 59 | # distribution of the response
 60 | hist(resp_train, xlab = "% yield", main = "Histogram of % yield")
 61 | 
 62 | # Model fitting ===============================================================
 63 | ctrl <- trainControl(method = "repeatedcv", number = 5, repeats = 5)
 64 | 
 65 | # 1. A linear regression model -------------------------------------------------
 66 | linear_model <- train(y = resp_train, x = training_filter,
 67 |   method = "lm",
 68 |   trControl = ctrl)
 69 | linear_model
 70 | # plots allow search for ill fit or potential quadratic effects
 71 | plot(linear_model$finalModel$fitted.values, 
 72 |   linear_model$finalModel$residuals, xlab = "fitted", ylab = "residuals")
 73 | avPlots(linear_model$finalModel)
 74 | 
 75 | # how does the final model fit look vs. the data?
 76 | plot(linear_model$finalModel$fitted.values, resp_train, xlab = "observed",
 77 |   ylab = "predicted")
 78 | abline(a = 0, b = 1, col = 2)
 79 | cor(linear_model$finalModel$fitted.values, resp_train)^2  # R2 fit
 80 | 
 81 | # 2. Principal components regression ------------------------------------------
 82 | pcr_results <- list(results = data.frame(RMSE = NA, RMSE_sd = NA), final = NA)
 83 | for (i in 1:20) {
 84 |   # fit model
 85 |   train_data <- princomp(training)$scores[ ,1:i]
 86 |   train_data <- data.frame(train_data)
 87 |   pcr_model <- train(y = resp_train,
 88 |     x = train_data,
 89 |     method = "lm",
 90 |     trControl = ctrl)
 91 |   
 92 |   # extract results
 93 |   pcr_results$results[i, 1] <- pcr_model$results$RMSE
 94 |   pcr_results$results[i, 2] <- pcr_model$results$RMSESD
 95 |   
 96 |   # extract model
 97 |   if (all(pcr_model$results$RMSE <= pcr_results$results$RMSE)) {
 98 |     pcr_results$final <- pcr_model
 99 |   }
100 | }
101 | pcr_results
102 | xyplot(pcr_results$results$RMSE ~ 1:20, xlab = "ncomp", ylab = "RMSE")
103 | 
104 | # 3. Partial least squares -----------------------------------------------------
105 | pls_model <- train(y = resp_train, x = training,
106 |   method = "pls",
107 |   trControl = ctrl,
108 |   tuneLength = 10)
109 | pls_model
110 | plot(pls_model)
111 | 
112 | # 4. Ridge regression ----------------------------------------------------------
113 | ridge_grid <- expand.grid(.lambda = seq(0.05, 0.2, 0.01))
114 | ridge_model <- train(y = resp_train, x = training,
115 |   method = "ridge",
116 |   trControl = ctrl,
117 |   tuneGrid = ridge_grid)
118 | ridge_model
119 | plot(ridge_model)
120 | 
121 | # y-axis: coefficient, x-axis: shrinkage proportion
122 | plot(ridge_model$finalModel)
123 | 
124 | # 5. LASSO ----------------------------------------------------------------------
125 | lasso_grid <- expand.grid(.fraction = seq(0.01, 0.20, 0.01))
126 | lasso_model <- train(y = resp_train, x = training,
127 |   method = "lasso",
128 |   trControl = ctrl,
129 |   tuneGrid = lasso_grid)
130 | lasso_model
131 | plot(lasso_model)
132 | 
133 | # 6. Elastic Net ---------------------------------------------------------------
134 | enet_model <- train(y = resp_train, x = training,
135 |   method = "enet",
136 |   trControl = ctrl,
137 |   tuneLength = 10)
138 | enet_model
139 | plot(enet_model)
140 | 
141 | # Make predictions ============================================================
142 | # 1. linear regression model
143 | lm_preds <- predict(linear_model, test_filter)
144 | RMSE(lm_preds, resp_test)
145 | plot(lm_preds, resp_test, xlab = "prediction", ylab = "observed", pch = 20,
146 |   main = "Linear Regression: RMSE = 1.172")
147 | abline(a = 0, b = 1, col = 2, lty = 2)
148 | 
149 | # 2. PCR model 
150 | pca_object <- princomp(training)
151 | test_pc <- predict(pca_object, test)
152 | pcr_preds <- predict(pcr_results$final, test_pc)
153 | RMSE(pcr_preds, resp_test)
154 | plot(pcr_preds, resp_test, xlab = "prediction", ylab = "observed", pch = 20,
155 |   main = "PCR: RMSE = 1.144")
156 | abline(a = 0, b = 1, col = 2, lty = 2)
157 | 
158 | # 3. PLS model 
159 | pls_preds <- predict(pls_model, test)
160 | RMSE(pls_preds, resp_test, na.rm = TRUE)
161 | plot(pls_preds, resp_test, xlab = "prediction", ylab = "observed", 
162 |   pch = 20, main = "Partial Least Squares: RMSE = 1.079")
163 | abline(a = 0, b = 1, col = 2, lty = 2)
164 | 
165 | # 4. Ridge model
166 | ridge_preds <- predict(ridge_model, test)
167 | RMSE(ridge_preds, resp_test)
168 | plot(ridge_preds, resp_test, xlab = "prediction", ylab = "observed", 
169 |   pch = 20, main = "Ridge Reg: RMSE = 1.086")
170 | abline(a = 0, b = 1, col = 2, lty = 2)
171 | 
172 | # 4. LASSO model
173 | lasso_preds <- predict(lasso_model, test)
174 | RMSE(lasso_preds, resp_test)
175 | plot(lasso_preds, resp_test, xlab = "prediction", ylab = "observed", 
176 |   pch = 20, main = "LASSO: RMSE = 1.112")
177 | abline(a = 0, b = 1, col = 2, lty = 2)
178 | 
179 | # 5. enet model
180 | enet_preds <- predict(enet_model, test)
181 | RMSE(enet_preds, resp_test)
182 | plot(enet_preds, resp_test, xlab = "prediction", ylab = "observed", pch = 20,
183 |   main = "Elastic Net: RMSE = 1.040")
184 | abline(a = 0, b = 1, col = 2, lty = 2)
185 | 
186 | # Variable Importance =========================================================
187 | varImp(pls_model)
188 | varImp(ridge_model)
189 | varImp(enet_model)
190 | 
191 | plot(training$ManufacturingProcess13, resp_train, pch = 20, 
192 |   xlab = "ManufacturingProcess13", ylab = "% yield")
193 | lines(loess.smooth(training$ManufacturingProcess13, resp_train), col = 2)
194 | 
195 | plot(training$ManufacturingProcess32, resp_train, pch = 20, 
196 |   xlab = "ManufacturingProcess32", ylab = "% yield")
197 | lines(loess.smooth(training$ManufacturingProcess32, resp_train), col = 2)
198 | 
199 | plot(training$ManufacturingProcess36, resp_train, pch = 20, 
200 |   xlab = "ManufacturingProcess36", ylab = "% yield")
201 | lines(loess.smooth(training$ManufacturingProcess36, resp_train), col = 2)
202 | 
203 | plot(training$ManufacturingProcess36, resp_train, pch = 20, 
204 |   xlab = "ManufacturingProcess36", ylab = "% yield")
205 | lines(loess.smooth(training$ManufacturingProcess36, resp_train), col = 2)
206 | 
207 | # The variable importance scores reveal several relationships worth examining in 
208 | # any experiments aimed at improving the manufacturing process.


--------------------------------------------------------------------------------
/measuring-predictor-importance/abalone_data_183.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Ex 18.3 Measuring predictor importance: abalone data set"
  3 | author: "Oisin Fitzgerald"
  4 | output: 
  5 |   html_document:
  6 |     keep_md: true 
  7 | ---
  8 | The UCI Abalone data (http://archive.ics.uci.edu/ml/datasets/Abalone)
  9 | consist of data from 4,177 abalones (sea snails). The data contain measurements 
 10 | of the type (male, female, and infant), the longest shell measurement, the diameter,
 11 | height, and several weights (whole, shucked, viscera, and shell). The outcome
 12 | is the number of rings.
 13 | This script:
 14 | 1. Visualises how the predictors relate to the reponse 
 15 | 2. Visualises how the predictors relate to each other
 16 | 3. Evaluates predictor importance based on several methods
 17 | 4. Filters redundant predictors and uses PCA to create a set of orthogonal predictors
 18 | 
 19 | ```{r}
 20 | library(AppliedPredictiveModeling)
 21 | data(abalone)
 22 | str(abalone)
 23 | head(abalone)
 24 | 
 25 | library(ggplot2)
 26 | library(tidyr)
 27 | library(scales)
 28 | library(corrplot)
 29 | library(CORElearn)
 30 | library(car)
 31 | library(minerva)
 32 | suppressMessages(library(caret))
 33 | suppressMessages(library(pROC))
 34 | ```
 35 | 
 36 | ### 1. How do the predictors relate to the number of rings?
 37 | Visually displaying the data shows some clear relationships and also outlying
 38 | values. In the plots of rings vs. the continouos variables there are both linear
 39 | (e.g. diameter) and non-linear (e.g. the weight variables) patterns. The similar 
 40 | shapes of some of the point clouds are suggestive that some of the variables likely 
 41 | contain the same information. Of course this makes sense given likely strong 
 42 | relationships between the various weight and length variables. Height shows two 
 43 | outlying points with values about 4 to 10 times greater than normal, suggesting
 44 | they may be incorrectly entered values. The boxplots show an expected pattern, 
 45 | with infants having less rings than adults.
 46 | ```{r, fig.width = 10, fig.height = 10}
 47 | # format data for plotting
 48 | gg_data <- gather(abalone, Rings)
 49 | names(gg_data) <- c("Rings", "variable", "value")
 50 | gg_data <- subset(gg_data, gg_data$variable != "Type")
 51 | gg_data$value <- as.numeric(gg_data$value)
 52 | # scatter plots for continuous variables
 53 | ggplot(aes(x = value, y = Rings), data = gg_data) + 
 54 |   geom_point() +
 55 |   facet_wrap(~variable, scales = "free_x") +
 56 |   scale_x_continuous(breaks = pretty_breaks(n = 8)) 
 57 | ```
 58 | 
 59 | ```{r}
 60 | # boxplot for Type variable
 61 | ggplot(aes(x = Type, y = Rings), data = abalone) + 
 62 |   geom_boxplot()
 63 | ```
 64 | 
 65 | ### 2. How do the predictors relate to each other?
 66 | The car packages amazing function car::scatterplotMatrix shows clear relationships
 67 | between the variables. This further emphasised by the correlation plot. There are
 68 | clearly near linear dependencies in the data.
 69 | ```{r, fig.width = 10, fig.height = 10}
 70 | X <- abalone[ , sapply(abalone, is.numeric) ]
 71 | X <- X[ ,-8]  # remove Rings
 72 | 
 73 | # matrix scatter plots
 74 | scatterplotMatrix(X, smoother = FALSE, reg.line = FALSE)
 75 | 
 76 | # LOESS fit
 77 | loess_results <- filterVarImp(x = X, y = abalone$Rings, nonpara = TRUE)
 78 | loess_results
 79 | 
 80 | # correlations
 81 | XX <- cor(X)
 82 | corrplot(XX, "number", tl.cex = 0.7)
 83 | ```
 84 | 
 85 | ### 3. Predictor importance scores
 86 | A downside of all the measure used in this section is that they soley reveal bivariate
 87 | relationships. We cannot know for example if ther interaction of two predictors is 
 88 | an important term to include in any model. Regardless the various measures of linear, 
 89 | rank, and information provide a useful to gauge the likely importance of a variable in
 90 | improving the predictive ability of a model (i.e. screening!). 
 91 | Pearson's *r* provides a measure of the linear relationships between two variables
 92 | while Spearman's *rho* is the rank correlation between the variable and so is better
 93 | suited to picking up non-linear relationships. All variable have a greater Spearman's
 94 | *rho* than Pearson's *r* suggesting future model sshould take into account this non-linearity.
 95 | The ANOVA and pariwise t-tests confirm what the boxplot showed: that infants are 
 96 | most different from the other groups in number of rings.
 97 | ```{r}
 98 | # linear correlations
 99 | pearsonsR <- apply(X, MARGIN = 2, FUN = cor, y = abalone$Rings, method = "pearson")
100 | pearsonsR
101 | 
102 | # rank correlations
103 | spearmansRho <- apply(X, MARGIN = 2, FUN = cor, y = abalone$Rings, method = "spearman")
104 | spearmansRho
105 | 
106 | # ANOVA and t tests (Type variable)
107 | anova(lm(Rings ~ Type, data = abalone)) 
108 | pairwise.t.test(abalone$Rings, abalone$Type, pool.sd = FALSE)
109 | ```
110 | 
111 | The maximal information coefficient (MIC) is an information theory based measure
112 | of the strength of linear and/or non-linear relationship between two variables. It
113 | bins continuous variables in such a way as to maximise the mutual information, the
114 | amount of information you gain about the likely value of one variable given the 
115 | value of another. The results suggest that all variables are moderately related to 
116 | thee number fo rings. MIC minus R^2 is suggested as a measure of the degree of 
117 | non-linearity in the relationship, all values of this measure are close to zero 
118 | implying non-linear relationships.
119 | ```{r}
120 | # MIC
121 | mic_values <- mine(x = X, y = abalone$Rings)
122 | mic_values$MIC
123 | ```
124 | 
125 | The RReliefF algorithm is an adaption of ReliefF to a regression setting 
126 | (Robnik-Šikonja & Kononenko, 1997). It is a measure of how likely nearby instances
127 | of randomly selected observations are to give a similar prediction in the response.
128 | It can be combined with a permutation test to give an indication of how much the
129 | results differ from chance.
130 | ```{r}
131 | #RreliefF (optimistic!!)
132 | relief_values <- attrEval(abalone$Rings ~ ., data = X,
133 |   estimator = "RReliefFbestK",  # calculation method
134 |   ReliefIterations = 50)  # num iteration
135 | relief_values <- data.frame(Predictor = names(relief_values), 
136 |   value = relief_values, 
137 |   row.names = NULL)
138 | 
139 | # RreliefF permutation test
140 | relief_perm <- permuteRelief(x = X, y = abalone$Rings, nperm = 500,
141 |   estimator = "RReliefFbestK",
142 |   ReliefIterations = 50)
143 | 
144 | # standard deviations from permutation score distribution
145 | relief_perm$standardized[order(relief_perm$standardized)]
146 | ```
147 | 
148 | ### 4. Filters redundant predictors and create a set of non-redundant Principal component analysis
149 | Given the relationships between the variables this function filters out highly
150 | correlated variable leaving a reduced set. It follows a heuristic algorithm in 
151 | Kuhn and Johnson's book in removing from a pair that variable most related to the
152 | other variables. PCA is then performed, with the first two principal components
153 | accounting for 90% of the variance. The filter method at *r* = 0.75 also leads to
154 | the conclusion there are only two non-redundant "pieces of information" in this data set.
155 | ```{r}
156 | # Filter predictors
157 | # returns name of predictors to keep, possibly still highly correlated if only
158 | # two, in this case examine how they relate to the response in making decisions
159 | filter_vars <- function(X, cor_level = 0.75,...) {
160 |   XX <- cor(X)
161 |   XX <- XX - diag(diag(XX))
162 |   while (any(XX > cor_level)) {
163 |         if (ncol(XX) <= 2) { # prevent entering filtering loop
164 |           return(colnames(XX))
165 |     } else {
166 |       var_ind <- which(XX == max(XX), arr.ind = TRUE)
167 |       var.1 <- row.names(var_ind)[1]
168 |       var.2 <- row.names(var_ind)[2]
169 |       var.1_av <- sum(XX[var.1, ])/(length(XX[var.1, ]) - 1)
170 |       var.2_av <- sum(XX[var.2, ])/(length(XX[var.2, ]) - 1)
171 |       if (var.1_av > var.2_av) {
172 |         XX <- XX[!(row.names(XX) == var.1),!(colnames(XX) == var.1)]
173 |         } else {
174 |           XX <- XX[!(row.names(XX) == var.2),!(colnames(XX) == var.2)]
175 |         }
176 |     }
177 |   }
178 |   colnames(XX)
179 | }
180 | 
181 | filter_vars(X)  # works...
182 | 
183 | # PCA
184 | pca_object <- prcomp(X, center = TRUE, scale. = TRUE)
185 | percent_variance <- pca_object$sd^2/sum(pca_object$sd^2)*100
186 | percent_variance # in agreement with filtering method that there are really 
187 | ```
188 | 
189 | 
190 | 


--------------------------------------------------------------------------------
/measuring-predictor-importance/churn_data_181.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Ex18.1 - Measuring predictor importance: churn data set"
  3 | author: "Oisin Fitzgerald"
  4 | output: 
  5 |   html_document:
  6 |     keep_md: true 
  7 | ---
  8 | 
  9 | The “churn” data set was developed to predict telecom customer churn based on 
 10 | information about their account. It contains 20 variables, with the 19
 11 | predictors including continuous and factor variables that describe an individual
 12 | account. The response churn has two levels "yes" and "no". This script demonstrates
 13 | methods for examining categorical and continuous predictor importance for 
 14 | classification problems.
 15 | 
 16 | (a) Examining the correlation between predictors
 17 | ------------------------------------------------
 18 | ```{r}
 19 | # load packages
 20 | library(AppliedPredictiveModeling)
 21 | library(C50)
 22 | library(corrplot)
 23 | library(caret)
 24 | library(CORElearn)
 25 | library(ggplot2)
 26 | library(pROC)
 27 | 
 28 | # load data
 29 | data(churn)
 30 | str(churnTrain)
 31 | 
 32 | # correlation between the continuous variables the the 
 33 | numeric_vars <- sapply(churnTrain, is.numeric)
 34 | corrs <- cor(churnTrain[numeric_vars])
 35 | corrplot(corrs, method = "number", tl.cex = 0.75)
 36 | ```
 37 | 
 38 | Finding a perfect colinearity between the of four pairs of total charge and 
 39 | total minutes variables is of course no surprise, phone charges are set per time 
 40 | period! However it is somewhat surprising that there is no linear relation 
 41 | between any other pairs of variables. A scatterplot matrix of a subset of 
 42 | the continuous variables reveals the extent of any pairwise relationship and 
 43 | how certain variables contain numerous of zeros.
 44 | 
 45 | ```{r, echo=FALSE}
 46 | pairs(~ ., data = churnTrain[numeric_vars][1:4], main="Scatterplot Matrix")
 47 | # make conditional on churn
 48 | ```
 49 | 
 50 | (b) Assessing the importance of categorical predictors
 51 | -------------------------------------------------------------------------------
 52 | Odds ratios, Fisher's exact test and chi-square tests provide methods to examine 
 53 | the extent of association between factor levels and the response categories. 
 54 | Fisher's exact test is considred more reliable than chi-square, however it is more
 55 | computationally intensive.
 56 | ```{r}
 57 | # A function to calculate chi-square, odds ratios and fisher's exact test
 58 | association_tests <- function(x, y) { # x is predictors, y is response
 59 |   
 60 |   x <- x[ ,sapply(x, is.factor)]
 61 |   n <- length(x)
 62 |   names <- colnames(x)
 63 | 
 64 |   out <- data.frame(
 65 |     chisq = rep(NA, n), 
 66 |     chi.p.value = rep(NA, n), 
 67 |     odds.ratio = rep(NA, n), 
 68 |     fisher.p.value = rep(NA, n))
 69 | 
 70 |   for (i in 1:n) {
 71 |     row.names(out)[i] <- names[i]
 72 |     if (nlevels(x[ ,i]) > 7) {
 73 |       fish_res <- fisher.test(x = x[ ,i], y = y, simulate.p.value = TRUE)
 74 |       out$fisher.p.value[i] <- fish_res$p.value
 75 |     } else {
 76 |       fish_res <- fisher.test(x = x[ ,i], y = y)
 77 |       out$fisher.p.value[i] <- fish_res$p.value
 78 |       if (nlevels(x[ ,i]) <= 2) out$odds.ratio[i] <- fish_res$estimate
 79 |       }
 80 |     
 81 |     chi_res <- chisq.test(x = x[ ,i], y = y, simulate.p.value = TRUE) # chisq test
 82 |     out$chisq[i] <- chi_res$statistic
 83 |     out$chi.p.value[i] <- chi_res$p.value
 84 |   }
 85 |   out
 86 | }
 87 | 
 88 | res <- association_tests(x = churnTrain[ ,1:19], y = churnTrain$churn)
 89 | ```
 90 | 
 91 | The results suggest international plan to be an important variable, while area code 
 92 | shows little value as a predictor. The extremely low chi-square statistic and 
 93 | high p-value suggests data may have been purposely balanced by area code. Voice 
 94 | mail plan and state seems to have value, without the same strngth of association
 95 | as international plan.
 96 | ```{r}
 97 | ggplot(data = res, aes(x = chisq, y = -log(chi.p.value))) + 
 98 |   geom_point(size = 3) +
 99 |   annotate("text", x = res$chisq + 10, 
100 |     y = -log(res$chi.p.value) -.3, label = c(row.names(res))) +
101 |   labs(title = "Chi square vs. -log(p.values)") +
102 |   xlim(NA, 275) + ylim(NA, 9)
103 | ```
104 | 
105 | Receiver operating characteristic (ROC) curves offer a method to examine the extent
106 | to which a predictor variable distinguishes between the two levels of a response
107 | factor, e.g. to what extent does the "account_length" variable allow us to 
108 | distinguish between customers likely to churn, and those who are not. The area 
109 | under the ROC curve (AUC) quantifies the ability of a predictor variable to separate
110 | between classes. 
111 | The AUC leads to different conclusions to the association tests, with international
112 | plan now considered the least important variable. Area code is now ranked the second
113 | most important variable.  
114 | ```{r}
115 | # Calculate the area under the ROC curve
116 | factor_pred <- churnTrain[ ,sapply(churnTrain, is.factor)]  # subset the factors
117 | factor_pred$churn <- NULL
118 | auc_factors <- filterVarImp(y = churnTrain$churn, x = factor_pred)
119 | # variables ranked by auc
120 | auc_factors[order(auc_factors$yes, decreasing = TRUE), ]  
121 | ```
122 | 
123 | (b) Assessing the importance of continuous predictors
124 | -------------------------------------------------------------------------------
125 | ```{r}
126 | # create a subset of the continuous predictors 
127 | cont_pred <- churnTrain[ ,sapply(churnTrain, is.numeric)]
128 | ```
129 | 
130 | Where the response is a category with two outcomes, t-tests can be used to assess
131 | the difference in the distributions of the continuous predictors by the response
132 | categories. As a signal/noise ratio the t-statistic quantifies the separation in
133 | the distributions, with the associated p value indicating the extent to which this
134 | would occur based on an assumption of no differnce.
135 | ```{r}
136 | get_tstats <- function(x, y) {
137 |   test <- t.test(x ~ y)  # Welch's t test
138 |   out <- c(t_stat = test$statistic, p = test$p.value)
139 |   out
140 | }
141 | 
142 | t_values <- apply(cont_pred, MARGIN = 2, FUN = get_tstats, y = churnTrain$churn)
143 | t_values <- data.frame(t(t_values))  # transpose
144 | round(t_values[order(t_values$p), ], 6)
145 | ```
146 | 
147 | The AUC and t-test for the continuous predictors both agree to a large extent.
148 | They share the same top 3 predictors, and only seem to have slight re-shuffling 
149 | otherwise.
150 | ```{r}
151 | # Calculate the area under the ROC curve
152 | auc_numeric <- filterVarImp(y = churnTrain$churn, x = cont_pred)
153 | # continuous variables ranked by AUC
154 | auc_numeric[order(auc_numeric$yes, decreasing = TRUE), ]  
155 | ```
156 | 
157 | 
158 | (d) Use RefliefF to jointly assess the importance of predictors
159 | -------------------------------------------------------------------------------
160 | The Relief algorithm is another method to measure the importance of predictors
161 | for a two class response problem (although it can deal with other situations
162 | as well). It begins by randomly selecting a set of observations, R, of size m. 
163 | The algorithm then evaluates each predictor in isolation by looping through each 
164 | point in the random set and for each point (1) finding the two nearest scores 
165 | that are a hit(i.e. share same class in response) and a miss (i.e. does not share 
166 | same class in reponse) and (2) updating the score for that predictor, 
167 | S = S - diff(R, Hit)^2/m + diff(R, Miss)^2/m.
168 | ```{r}
169 | relief_values <- attrEval(churn ~ ., data = churnTrain,
170 |   estimator = "ReliefFequalK",  # calculation method
171 |   ReliefIterations = 50)  # num iteration
172 | relief_values[order(relief_values, decreasing = TRUE)]
173 | 
174 | ```
175 | 
176 | An addition to the Relief algorithm is to permutate the response observations so as to 
177 | gain an understanding of the predictors score when it has no relevance. This method can be iterated several times giving a somewhat normal distribution of scores that can then be compared to the true Relief score in terms of standard
178 | deviations. This indicates how much greater the Relief score is than what could 
179 | be expected by chance alone. 
180 | ```{r}
181 | relief_perm <- permuteRelief(x = churnTrain[ ,-20], y = churnTrain$churn, nperm = 500,
182 |   estimator = "ReliefFequalK",
183 |   ReliefIterations = 50)
184 | ```
185 | 
186 | The results suggest that total day charge (and therefore total day minutes - 
187 | one is a multiple of the other) and number of customer service calls are highly
188 | important variables. International plan is also quite far from its permutation
189 | distribution mean in terms of standard deviations, putting this method more in
190 | agreement with chisq/fishers than the AUC. However its relief score still quite
191 | low. This may be a result of a heavy bias towards "no" internation plan and "no" 
192 | churn but ambiguity otherwise (poor sensitivity -> low AUC curve?). There are several predictors that appear without value including state, area code, total intl minutes (and charges), and total night minutes (and charges).
193 | ```{r, fig.width = 10}
194 | # Histograms of the permutated relief scores
195 | relief_values <- data.frame(Predictor = names(relief_values), 
196 |   value = relief_values, 
197 |   row.names = NULL)
198 | ggplot(data = relief_perm$permutations, aes(x = value)) + 
199 |   geom_histogram(binwidth = .01, colour = 1) +
200 |   geom_vline(aes(xintercept = value), relief_values, colour = "red", linetype = 2) +
201 |   facet_wrap(~ Predictor) + 
202 |   labs(title = "Relief Scores and Permutation Distributions", xlab = "Relief Scores")
203 | 
204 | # Standard deviation of permutated distribution from non-permutated score
205 | relief_perm$standardized[order(relief_perm$standardized)]
206 | ```
207 | 
208 | 
209 | 
210 | 


--------------------------------------------------------------------------------
/linear-regression-and-cousins/IR_data_61.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Ex6.1 - Linear regression: IR spectrum of food"
  3 | author: "Oisin Fitzgerald"
  4 | date: "26 January 2016"
  5 | output: 
  6 |   html_document:
  7 |     keep_md: true 
  8 | ---
  9 | ### The data:
 10 | The data provides an infrared (IR) profile and analytical chemistry determined 
 11 | percent content of water, fat, and protein for meat samples. If there can be establish 
 12 | a predictive relationship between IR spectrum and fat content, then food scientists 
 13 | could predict a sample’s fat content with IR instead of using analytical chemistry
 14 | 
 15 | ### Outline:
 16 | 1. What is the relationship between the predictors? Are they highly correlated given 
 17 | the same food sample is measured at many IR wavelengths?
 18 | 2. Create training/test split
 19 | 3. Fit different models
 20 |   + Linear regression
 21 |   + Ridge regression, lasso and elastic net
 22 |   + PCR and PLS
 23 | 4. Compare models predictive ability
 24 | 
 25 | ```{r}
 26 | # load data and packages
 27 | library(car)
 28 | library(lars)
 29 | library(broom)
 30 | library(reshape2)
 31 | suppressMessages(library(elasticnet))
 32 | suppressMessages(library(pls))
 33 | suppressMessages(library(caret))
 34 | 
 35 | data(tecator) # from caret
 36 | ```
 37 | 
 38 | ### 1. Relationship between predictors and distributions 
 39 | 
 40 | ```{r}
 41 | # correlation
 42 | XX <- cor(absorp)
 43 | XX[1:5, 1:5]  # everything is related to everything!
 44 | 
 45 | # PCA
 46 | pca_object <- prcomp(absorp)
 47 | percent_variance <- pca_object$sdev^2/sum(pca_object$sd^2)*100
 48 | head(percent_variance)
 49 | 
 50 | # Predictor distributions
 51 | ggplot(data = data.frame(absorp)) + 
 52 |   geom_histogram(aes(x = X1), bins = 20, col = 1) +
 53 |   labs(title = "Histogram of IR wavelength no. 1",
 54 |     x = "Wavelength predictor 1")  # positive skew
 55 | ```
 56 | 
 57 | ### 2. Create a training/test split
 58 | 
 59 | * 75% of the data to the training set
 60 | * The predictor variables show positive skew which Yeo-Johnson estimated lambda
 61 | (of -1 i.e. reciprocal) altered
 62 | 
 63 | ```{r}
 64 | length(endpoints[ ,1])  # how many observations?
 65 | # create partition index
 66 | data_split <- createDataPartition(endpoints[ ,1], p = .75)
 67 | data_split <- data_split$Resample1
 68 | 
 69 | # split data
 70 | training <- absorp[data_split, ]
 71 | test <- absorp[-data_split, ]
 72 | train_resp <- endpoints[data_split, 2]  # column 2 is fat content
 73 | test_resp <- endpoints[-data_split, 2]
 74 | 
 75 | # de-skew variables
 76 | training <- data.frame(training)
 77 | test <- data.frame(test)
 78 | proc_object <- preProcess(training, 
 79 |   method = c("YeoJohnson", "center", "scale"))
 80 | training <- predict(proc_object, training)
 81 | test <- predict(proc_object, test)
 82 | ```
 83 | 
 84 | ### 3. Model fitting
 85 | * Linear regression  
 86 |     + Unsurprisingly prior removing of highly correlated predictors resulted in a model
 87 |     with only one independent variable. The performance on cross-validation was poor.
 88 | * Ridge regression
 89 |     + The ridge model quickly highlighted the ability to improve on the linear regression
 90 |     model. However, subsequent fitting of a lasso model showed that an ability to drive
 91 |     the coefficients to zero was an advantage in the highly correlated predictor environment.
 92 | * The lasso and elastic net
 93 |     + As noted the lasso model outperformed the ridge model. The optimal solution resulted
 94 |     in a large number of the coefficient being shrunk to zero
 95 |     + Enet performed similar to the lasso, with the best performing model having a
 96 |     low lambda for the ridge function  
 97 | * Principal components and partial least squares regression
 98 |     + These both performed quite well. The similarity of the PCR model to the PLS
 99 |     models is likely related to the variance in the predictors (IR response) very much 
100 |     being a consequence of the variance in the response (food fat content), thus the 
101 |     unsupervised nature of PCA causing little detriment.
102 |     + The number of principal components was tuned rather than using the first two,
103 |     or fist few that explained 90% of variance etc.
104 | 
105 | ```{r}
106 | ctrl <- trainControl(method = "cv", number = 5, repeats = 5)
107 | # Linear regression
108 | mc <- findCorrelation(training, cutoff = 0.95)
109 | training_linear <- data.frame(training[ ,-mc])
110 | # colnames(training_linear) <- "X1"
111 | linear_model <- train(y = train_resp,
112 |   x = training_linear,
113 |   method = "lm",
114 |   trControl = ctrl)
115 | linear_model
116 | 
117 | # Ridge Regression - penalise square of coefficient
118 | ridge_model <- train(y = train_resp,
119 |   x = training,
120 |   method = "ridge",
121 |   trControl = ctrl,
122 |   tuneLength = 10)
123 | ridge_model
124 | plot(ridge_model)
125 | 
126 | # Lasso - penalise absolute value of coeffienct
127 | lasso_grid <- expand.grid(.fraction = seq(0.001, 0.1, 0.01))
128 | lasso_model <- train(y = train_resp,
129 |   x = training,
130 |   method = "lasso",
131 |   trControl = ctrl,
132 |   tuneGrid = lasso_grid)
133 | lasso_model
134 | plot(lasso_model)
135 | 
136 | # Elastic Net - combination of ridge and lasso
137 | enet_grid <- expand.grid(.fraction = seq(0.001, 0.1, 0.01), .lambda = c(0, 0.0001, 0.001, 0.01))
138 | enet_model <- train(y = train_resp,
139 |   x = training,
140 |   method = "enet",
141 |   trControl = ctrl,
142 |   tuneGrid = enet_grid)
143 | enet_model
144 | plot(enet_model)
145 | 
146 | # PCR - 
147 | pcr_results <- list(results = data.frame(RMSE = NA, RMSE_sd = NA), final = NA)
148 | for (i in 1:20) {
149 |   # fit model
150 |   train_data <- princomp(training)$scores[ ,1:i]
151 |   train_data <- data.frame(train_data)
152 |   pcr_model <- train(y = train_resp,
153 |   x = train_data,
154 |   method = "lm",
155 |   trControl = ctrl)
156 |   
157 |   # extract results
158 |   pcr_results$results[i, 1] <- pcr_model$results$RMSE
159 |   pcr_results$results[i, 2] <- pcr_model$results$RMSESD
160 |   
161 |   # extract model
162 |   if (all(pcr_model$results$RMSE <= pcr_results$results$RMSE)) {
163 |     pcr_results$final <- pcr_model
164 |     }
165 | }
166 | pcr_results
167 | 
168 | 
169 | # PLS
170 | pls_grid <- expand.grid(.ncomp = seq(10, 20, 1))
171 | pls_model <- train(y = train_resp,
172 |   x = training,
173 |   method = "pls",
174 |   trControl = ctrl,
175 |   preProcess = c("center", "scale"),
176 |   tuneGrid = pls_grid)
177 | pls_model
178 | ```
179 | 
180 | ### 4. Compare performance on test set
181 | * The results from fitting on the test set followed from the cross-validation 
182 | included in model fitting. Linear regression did very poorly with ridge regression 
183 | slightly worse off than the group of lasso, elastic net, PCR and PLS. 
184 | * In context the RMSE and correlation between predicted and observed results are 
185 | superb, and surely suggest that any of these models could be used in measuring the
186 | fat content of food using infrared.
187 | * Given the similarities in the model performances I was interested in constructing
188 | confidence intervals around the RMSE. A function to calculate bootstrap 
189 | estimation and its results are shown below.
190 | 
191 | ```{r}
192 | # Linear regression
193 | test_linear <- data.frame(test[ ,-mc])
194 | colnames(test_linear) <- colnames(training_linear)
195 | linear_pred <- predict(linear_model, test_linear)
196 | ggplot() + geom_point(aes(x = linear_pred, y = test_resp))
197 | 
198 | n <- length(test_resp)
199 | RMSE_lm <- sqrt(sum((test_resp - linear_pred)^2)/n); RMSE_lm
200 | 
201 | # Ridge regression
202 | ridge_preds <- predict(ridge_model, test)
203 | ggplot() + geom_point(aes(x = ridge_preds, y = test_resp))
204 | 
205 | RMSE_ridge <- sqrt(sum((test_resp - ridge_preds)^2)/n); RMSE_ridge
206 | 
207 | # Lasso
208 | lasso_preds <- predict(lasso_model, test)
209 | ggplot() + geom_point(aes(x = lasso_preds, y = test_resp))
210 | 
211 | RMSE_lasso <- sqrt(sum((test_resp - lasso_preds)^2)/n); RMSE_lasso
212 | 
213 | # Elastic net
214 | enet_preds <- predict(enet_model, test)
215 | ggplot() + geom_point(aes(x = enet_preds, y = test_resp))
216 | 
217 | RMSE_enet <- sqrt(sum((test_resp - enet_preds)^2)/n); RMSE_enet
218 | 
219 | # PCR
220 | pca_train <- princomp(training)
221 | test_pcs <- predict(pca_train, test)
222 | pcr_preds <- predict(pcr_results$final, test_pcs)
223 | ggplot() + geom_point(aes(x = pcr_preds, y = test_resp))
224 | 
225 | RMSE_pcr <- sqrt(sum((test_resp - pcr_preds)^2)/n); RMSE_pcr
226 | 
227 | # PLS
228 | pls_preds <- predict(pls_model, test)
229 | ggplot() + geom_point(aes(x = pls_preds, y = test_resp))
230 | 
231 | RMSE_pls <- sqrt(sum((test_resp - pls_preds)^2)/n); RMSE_pls
232 | cor(pls_preds, test_resp)
233 | ```
234 | 
235 | ### 4. Compare performance on test set contd. 
236 | * Bootstrap estimate of RMSE confidence interval
237 |     + PLS appears to be the prefered model, it shows the least variation in its RMSE scores
238 |     across the bootstrap samples. PCR is likely similar, an issue with variable naming meant
239 |     I excluded it.
240 | 
241 | ```{r}
242 | boostrap_RMSE <- function(model, data, obs, trials = 1000, CI = 0.95) {
243 |   
244 |   n <- nrow(data)
245 |   out <- list(results = data.frame(RMSE = NA), lower = NA, upper = NA)
246 |   
247 |   for (i in 1:trials) {
248 |     # create bootstrap sample
249 |     samp <- sample(n, size = n, replace = TRUE)
250 |     boot_obs <- obs[samp]
251 |     boot_data <- data.frame(data[samp, ])
252 |     colnames(boot_data) <- colnames(data)
253 |     # predict
254 |     preds <- predict(model, newdata = boot_data)
255 |     RMSE <- sqrt(sum((boot_obs - preds)^2)/n)
256 |     
257 |     out$results[i ,1] <- RMSE
258 |   }
259 |   
260 |   temp <- out$results$RMSE
261 |   temp <- quantile(temp, probs = c(0.025, 0.975), na.rm = TRUE)
262 |   
263 |   out$lower <- temp[1]
264 |   out$upper <- temp[2]
265 |   
266 |   out
267 | }
268 | ```
269 | 
270 | 
271 | ```{r}
272 | # The bootstrap results
273 | bRMSE_lm <- boostrap_RMSE(linear_model, test_linear, test_resp)
274 | bRMSE_ridge <- boostrap_RMSE(ridge_model, test, test_resp)
275 | bRMSE_lasso <- boostrap_RMSE(lasso_model, test, test_resp)
276 | bRMSE_enet <- boostrap_RMSE(enet_model, test, test_resp)
277 | # bRMSE_pcr <- boostrap_RMSE(pcr_model, test, test_resp)
278 | bRMSE_pls <- boostrap_RMSE(pls_model, test, test_resp)
279 | 
280 | model_results <- data.frame(bRMSE_lm$results, bRMSE_ridge$results,
281 |   bRMSE_lasso$results, bRMSE_enet$results, bRMSE_pls$results)
282 | colnames(model_results) <- c('lm', 'ridge', 'lasso', 'enet', 'pls')
283 | 
284 | temp <- melt(model_results)
285 | 
286 | ggplot(data = temp, aes(x = variable, y = value)) + 
287 |   geom_boxplot(width = 0.5) + 
288 |   theme_bw() + 
289 |   labs(title = 'Bootstrap Estimates of Model Performance',
290 |     x = 'Model',
291 |     y = 'RMSE')
292 | ```
293 | 
294 | ### Conclusion
295 | 
296 | * The clear signal in the data meant that despite multicollinarity issues several 
297 | linear model fitting methods had no problem producing extremely predictive models. 
298 | * The predictors were highly correlated and likely possessed variations on same 
299 | information. Therefore possibly as a result of their ability to extract the minimal 
300 | dimension signal from several correlated variables PCR and PLS appear to have a slight 
301 | performance advantage over other models.
302 | 


--------------------------------------------------------------------------------
/measuring-predictor-importance/abalone_data_183.md:
--------------------------------------------------------------------------------
  1 | # Ex 18.3 Measuring predictor importance: abalone data set
  2 | Oisin Fitzgerald  
  3 | The UCI Abalone data (http://archive.ics.uci.edu/ml/datasets/Abalone)
  4 | consist of data from 4,177 abalones (sea snails). The data contain measurements 
  5 | of the type (male, female, and infant), the longest shell measurement, the diameter,
  6 | height, and several weights (whole, shucked, viscera, and shell). The outcome
  7 | is the number of rings.
  8 | This script:
  9 | 1. Visualises how the predictors relate to the reponse 
 10 | 2. Visualises how the predictors relate to each other
 11 | 3. Evaluates predictor importance based on several methods
 12 | 4. Filters redundant predictors and uses PCA to create a set of orthogonal predictors
 13 | 
 14 | 
 15 | ```r
 16 | library(AppliedPredictiveModeling)
 17 | data(abalone)
 18 | str(abalone)
 19 | ```
 20 | 
 21 | ```
 22 | ## 'data.frame':	4177 obs. of  9 variables:
 23 | ##  $ Type         : Factor w/ 3 levels "F","I","M": 3 3 1 3 2 2 1 1 3 1 ...
 24 | ##  $ LongestShell : num  0.455 0.35 0.53 0.44 0.33 0.425 0.53 0.545 0.475 0.55 ...
 25 | ##  $ Diameter     : num  0.365 0.265 0.42 0.365 0.255 0.3 0.415 0.425 0.37 0.44 ...
 26 | ##  $ Height       : num  0.095 0.09 0.135 0.125 0.08 0.095 0.15 0.125 0.125 0.15 ...
 27 | ##  $ WholeWeight  : num  0.514 0.226 0.677 0.516 0.205 ...
 28 | ##  $ ShuckedWeight: num  0.2245 0.0995 0.2565 0.2155 0.0895 ...
 29 | ##  $ VisceraWeight: num  0.101 0.0485 0.1415 0.114 0.0395 ...
 30 | ##  $ ShellWeight  : num  0.15 0.07 0.21 0.155 0.055 0.12 0.33 0.26 0.165 0.32 ...
 31 | ##  $ Rings        : int  15 7 9 10 7 8 20 16 9 19 ...
 32 | ```
 33 | 
 34 | ```r
 35 | head(abalone)
 36 | ```
 37 | 
 38 | ```
 39 | ##   Type LongestShell Diameter Height WholeWeight ShuckedWeight
 40 | ## 1    M        0.455    0.365  0.095      0.5140        0.2245
 41 | ## 2    M        0.350    0.265  0.090      0.2255        0.0995
 42 | ## 3    F        0.530    0.420  0.135      0.6770        0.2565
 43 | ## 4    M        0.440    0.365  0.125      0.5160        0.2155
 44 | ## 5    I        0.330    0.255  0.080      0.2050        0.0895
 45 | ## 6    I        0.425    0.300  0.095      0.3515        0.1410
 46 | ##   VisceraWeight ShellWeight Rings
 47 | ## 1        0.1010       0.150    15
 48 | ## 2        0.0485       0.070     7
 49 | ## 3        0.1415       0.210     9
 50 | ## 4        0.1140       0.155    10
 51 | ## 5        0.0395       0.055     7
 52 | ## 6        0.0775       0.120     8
 53 | ```
 54 | 
 55 | ```r
 56 | library(ggplot2)
 57 | library(tidyr)
 58 | library(scales)
 59 | library(corrplot)
 60 | library(CORElearn)
 61 | library(car)
 62 | library(minerva)
 63 | suppressMessages(library(caret))
 64 | suppressMessages(library(pROC))
 65 | ```
 66 | 
 67 | ### 1. How do the predictors relate to the number of rings?
 68 | Visually displaying the data shows some clear relationships and also outlying
 69 | values. In the plots of rings vs. the continouos variables there are both linear
 70 | (e.g. diameter) and non-linear (e.g. the weight variables) patterns. The similar 
 71 | shapes of some of the point clouds are suggestive that some of the variables likely 
 72 | contain the same information. Of course this makes sense given likely strong 
 73 | relationships between the various weight and length variables. Height shows two 
 74 | outlying points with values about 4 to 10 times greater than normal, suggesting
 75 | they may be incorrectly entered values. The boxplots show an expected pattern, 
 76 | with infants having less rings than adults.
 77 | 
 78 | ```r
 79 | # format data for plotting
 80 | gg_data <- gather(abalone, Rings)
 81 | ```
 82 | 
 83 | ```
 84 | ## Warning: attributes are not identical across measure variables; they will
 85 | ## be dropped
 86 | ```
 87 | 
 88 | ```r
 89 | names(gg_data) <- c("Rings", "variable", "value")
 90 | gg_data <- subset(gg_data, gg_data$variable != "Type")
 91 | gg_data$value <- as.numeric(gg_data$value)
 92 | # scatter plots for continuous variables
 93 | ggplot(aes(x = value, y = Rings), data = gg_data) + 
 94 |   geom_point() +
 95 |   facet_wrap(~variable, scales = "free_x") +
 96 |   scale_x_continuous(breaks = pretty_breaks(n = 8)) 
 97 | ```
 98 | 
 99 | ![](abalone_data_183_files/figure-html/unnamed-chunk-2-1.png) 
100 | 
101 | 
102 | ```r
103 | # boxplot for Type variable
104 | ggplot(aes(x = Type, y = Rings), data = abalone) + 
105 |   geom_boxplot()
106 | ```
107 | 
108 | ![](abalone_data_183_files/figure-html/unnamed-chunk-3-1.png) 
109 | 
110 | ### 2. How do the predictors relate to each other?
111 | The car packages amazing function car::scatterplotMatrix shows clear relationships
112 | between the variables. This further emphasised by the correlation plot. There are
113 | clearly near linear dependencies in the data.
114 | 
115 | ```r
116 | X <- abalone[ , sapply(abalone, is.numeric) ]
117 | X <- X[ ,-8]  # remove Rings
118 | 
119 | # matrix scatter plots
120 | scatterplotMatrix(X, smoother = FALSE, reg.line = FALSE)
121 | ```
122 | 
123 | ![](abalone_data_183_files/figure-html/unnamed-chunk-4-1.png) 
124 | 
125 | ```r
126 | # LOESS fit
127 | loess_results <- filterVarImp(x = X, y = abalone$Rings, nonpara = TRUE)
128 | loess_results
129 | ```
130 | 
131 | ```
132 | ##                 Overall
133 | ## LongestShell  0.3099367
134 | ## Diameter      0.3302339
135 | ## Height        0.3107698
136 | ## WholeWeight   0.3454217
137 | ## ShuckedWeight 0.2691416
138 | ## VisceraWeight 0.3248622
139 | ## ShellWeight   0.4224143
140 | ```
141 | 
142 | ```r
143 | # correlations
144 | XX <- cor(X)
145 | corrplot(XX, "number", tl.cex = 0.7)
146 | ```
147 | 
148 | ![](abalone_data_183_files/figure-html/unnamed-chunk-4-2.png) 
149 | 
150 | ### 3. Predictor importance scores
151 | A downside of all the measure used in this section is that they soley reveal bivariate
152 | relationships. We cannot know for example if ther interaction of two predictors is 
153 | an important term to include in any model. Regardless the various measures of linear, 
154 | rank, and information provide a useful to gauge the likely importance of a variable in
155 | improving the predictive ability of a model (i.e. screening!). 
156 | Pearson's *r* provides a measure of the linear relationships between two variables
157 | while Spearman's *rho* is the rank correlation between the variable and so is better
158 | suited to picking up non-linear relationships. All variable have a greater Spearman's
159 | *rho* than Pearson's *r* suggesting future model sshould take into account this non-linearity.
160 | The ANOVA and pariwise t-tests confirm what the boxplot showed: that infants are 
161 | most different from the other groups in number of rings.
162 | 
163 | ```r
164 | # linear correlations
165 | pearsonsR <- apply(X, MARGIN = 2, FUN = cor, y = abalone$Rings, method = "pearson")
166 | pearsonsR
167 | ```
168 | 
169 | ```
170 | ##  LongestShell      Diameter        Height   WholeWeight ShuckedWeight 
171 | ##     0.5567196     0.5746599     0.5574673     0.5403897     0.4208837 
172 | ## VisceraWeight   ShellWeight 
173 | ##     0.5038192     0.6275740
174 | ```
175 | 
176 | ```r
177 | # rank correlations
178 | spearmansRho <- apply(X, MARGIN = 2, FUN = cor, y = abalone$Rings, method = "spearman")
179 | spearmansRho
180 | ```
181 | 
182 | ```
183 | ##  LongestShell      Diameter        Height   WholeWeight ShuckedWeight 
184 | ##     0.6043853     0.6228950     0.6577164     0.6308320     0.5394200 
185 | ## VisceraWeight   ShellWeight 
186 | ##     0.6143438     0.6924746
187 | ```
188 | 
189 | ```r
190 | # ANOVA and t tests (Type variable)
191 | anova(lm(Rings ~ Type, data = abalone)) 
192 | ```
193 | 
194 | ```
195 | ## Analysis of Variance Table
196 | ## 
197 | ## Response: Rings
198 | ##             Df Sum Sq Mean Sq F value    Pr(>F)    
199 | ## Type         2   8381  4190.6  499.33 < 2.2e-16 ***
200 | ## Residuals 4174  35030     8.4                      
201 | ## ---
202 | ## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
203 | ```
204 | 
205 | ```r
206 | pairwise.t.test(abalone$Rings, abalone$Type, pool.sd = FALSE)
207 | ```
208 | 
209 | ```
210 | ## 
211 | ## 	Pairwise comparisons using t tests with non-pooled SD 
212 | ## 
213 | ## data:  abalone$Rings and abalone$Type 
214 | ## 
215 | ##   F       I      
216 | ## I < 2e-16 -      
217 | ## M 0.00025 < 2e-16
218 | ## 
219 | ## P value adjustment method: holm
220 | ```
221 | 
222 | The maximal information coefficient (MIC) is an information theory based measure
223 | of the strength of linear and/or non-linear relationship between two variables. It
224 | bins continuous variables in such a way as to maximise the mutual information, the
225 | amount of information you gain about the likely value of one variable given the 
226 | value of another. The results suggest that all variables are moderately related to 
227 | thee number fo rings. MIC minus R^2 is suggested as a measure of the degree of 
228 | non-linearity in the relationship, all values of this measure are close to zero 
229 | implying non-linear relationships.
230 | 
231 | ```r
232 | # MIC
233 | mic_values <- mine(x = X, y = abalone$Rings)
234 | mic_values$MIC
235 | ```
236 | 
237 | ```
238 | ##                       Y
239 | ## LongestShell  0.3546951
240 | ## Diameter      0.3652849
241 | ## Height        0.3449086
242 | ## WholeWeight   0.3539886
243 | ## ShuckedWeight 0.3153369
244 | ## VisceraWeight 0.3510614
245 | ## ShellWeight   0.3866459
246 | ```
247 | 
248 | The RReliefF algorithm is an adaption of ReliefF to a regression setting 
249 | (Robnik-Šikonja & Kononenko, 1997). It is a measure of how likely nearby instances
250 | of randomly selected observations are to give a similar prediction in the response.
251 | It can be combined with a permutation test to give an indication of how much the
252 | results differ from chance.
253 | 
254 | ```r
255 | #RreliefF (optimistic!!)
256 | relief_values <- attrEval(abalone$Rings ~ ., data = X,
257 |   estimator = "RReliefFbestK",  # calculation method
258 |   ReliefIterations = 50)  # num iteration
259 | relief_values <- data.frame(Predictor = names(relief_values), 
260 |   value = relief_values, 
261 |   row.names = NULL)
262 | 
263 | # RreliefF permutation test
264 | relief_perm <- permuteRelief(x = X, y = abalone$Rings, nperm = 500,
265 |   estimator = "RReliefFbestK",
266 |   ReliefIterations = 50)
267 | 
268 | # standard deviations from permutation score distribution
269 | relief_perm$standardized[order(relief_perm$standardized)]
270 | ```
271 | 
272 | ```
273 | ## ShuckedWeight   WholeWeight VisceraWeight  LongestShell      Diameter 
274 | ##      3.929672      5.283250      5.732581      5.795141      6.438950 
275 | ##   ShellWeight        Height 
276 | ##      9.404226     13.175797
277 | ```
278 | 
279 | ### 4. Filters redundant predictors and create a set of non-redundant Principal component analysis
280 | Given the relationships between the variables this function filters out highly
281 | correlated variable leaving a reduced set. It follows a heuristic algorithm in 
282 | Kuhn and Johnson's book in removing from a pair that variable most related to the
283 | other variables. PCA is then performed, with the first two principal components
284 | accounting for 90% of the variance. The filter method at *r* = 0.75 also leads to
285 | the conclusion there are only two non-redundant "pieces of information" in this data set.
286 | 
287 | ```r
288 | # Filter predictors
289 | # returns name of predictors to keep, possibly still highly correlated if only
290 | # two, in this case examine how they relate to the response in making decisions
291 | filter_vars <- function(X, cor_level = 0.75,...) {
292 |   XX <- cor(X)
293 |   XX <- XX - diag(diag(XX))
294 |   while (any(XX > cor_level)) {
295 |         if (ncol(XX) <= 2) { # prevent entering filtering loop
296 |           return(colnames(XX))
297 |     } else {
298 |       var_ind <- which(XX == max(XX), arr.ind = TRUE)
299 |       var.1 <- row.names(var_ind)[1]
300 |       var.2 <- row.names(var_ind)[2]
301 |       var.1_av <- sum(XX[var.1, ])/(length(XX[var.1, ]) - 1)
302 |       var.2_av <- sum(XX[var.2, ])/(length(XX[var.2, ]) - 1)
303 |       if (var.1_av > var.2_av) {
304 |         XX <- XX[!(row.names(XX) == var.1),!(colnames(XX) == var.1)]
305 |         } else {
306 |           XX <- XX[!(row.names(XX) == var.2),!(colnames(XX) == var.2)]
307 |         }
308 |     }
309 |   }
310 |   colnames(XX)
311 | }
312 | 
313 | filter_vars(X)  # works...
314 | ```
315 | 
316 | ```
317 | ## [1] "Height"        "ShuckedWeight"
318 | ```
319 | 
320 | ```r
321 | # PCA
322 | pca_object <- prcomp(X, center = TRUE, scale. = TRUE)
323 | percent_variance <- pca_object$sd^2/sum(pca_object$sd^2)*100
324 | percent_variance # in agreement with filtering method that there are really 
325 | ```
326 | 
327 | ```
328 | ## [1] 90.78731479  3.99189090  2.39063820  1.62959779  0.92362741  0.18182994
329 | ## [7]  0.09510098
330 | ```
331 | 
332 | 
333 | 


--------------------------------------------------------------------------------
/classification-tree-and-rule-based-models/classification_trees_rules_142.R:
--------------------------------------------------------------------------------
  1 | # Classification Trees and Rule Based Models
  2 | #
  3 | # Script based on Ex 14.2 of Applied Predictive Modeling (Kuhn & Johnson, 2013)
  4 | #
  5 | # AIM: predict whether a customer is likely to leave a telecomm company based 
  6 | # on 19 variables
  7 | #
  8 | # In particular answering:
  9 | # a) Impact of grouped vs. binary dummy encoding of the state categorical 
 10 | # variable
 11 | # b) Which classification technique works best of (gini fit) classificaiton 
 12 | # trees, PART and C50 rule based model and bagged trees
 13 | #
 14 | # Load data and packages ======================================================
 15 | library(C50)
 16 | library(caret)
 17 | library(rpart)
 18 | library(partykit)
 19 | library(tree)
 20 | library(pROC)
 21 | library(ROCR)
 22 | library(randomForest)
 23 | library(RWeka)
 24 | 
 25 | data(churn)  # load data
 26 | 
 27 | str(churnTrain)
 28 | head(churnTrain)
 29 | table(churnTrain$churn)  # outcome variable category frequencies
 30 | prop.table(table(churnTrain$churn)) # outcome variable category probs
 31 | 
 32 | # Basic tree models ===========================================================
 33 | # Classification tree model
 34 | tree_model <- rpart(churn ~ ., data = churnTrain)  # classification tree
 35 | print(tree_model)  # decision nodes/splits
 36 | 
 37 | # Visualise unpruned tree (default method)
 38 | plot(tree_model, branch = 0.3, compress = TRUE, xpd = NA)  # plot the decision tree
 39 | text(tree_model, use.n = TRUE, cex = 0.7)  # nodes info
 40 | 
 41 | # Alternative visualisation no. 1
 42 | party_tree <- as.party(tree_model)
 43 | plot(party_tree, type = "simple", gp = gpar(cex = 0.5))
 44 | 
 45 | # Alternative visualisation no. 2 (not working - too many states?)
 46 | # draw.tree(party_tree, cex = 0.5, nodeinfo = TRUE, col = (0:8 / 8))
 47 | 
 48 | 
 49 | # Prune tree model ------------------------------------------------------------
 50 | plotcp(tree_model)  # plot of complexity parameter info
 51 | printcp(tree_model)  # variables used and complexity parameter info
 52 | # from printcp the optimal tree size is 8 leaves (cp = 0.035)
 53 | tree_model_final <- prune(tree_model, cp = 0.035)
 54 | 
 55 | # Visualise final (pruned) model
 56 | party_tree_final <- as.party(tree_model_final)
 57 | plot(party_tree_final, type = "simple", gp = gpar(cex = 0.6))
 58 | plot(party_tree_final, gp = gpar(cex = 0.6))  # nice plot of the tree
 59 | 
 60 | 
 61 | # Evaluate on the test data ---------------------------------------------------
 62 | tree_model_pred <- predict(tree_model_final, churnTest)  # make predictions
 63 | temp <- ifelse(tree_model_pred[ ,1] < .5, "no", "yes")
 64 | tree_model_pred <- data.frame(tree_model_pred, prediction = as.factor(temp))
 65 | tree_confusion <- table(tree_model_pred$prediction, churnTest$churn)
 66 | tree_confusion  # print confusion matrix
 67 | prop.table(tree_confusion)
 68 | (141+1415)/1667  # 93.34% of observations are correctly classified
 69 | 
 70 | # ROC curve and AUC with pROC package
 71 | roc_tree <- roc(response = churnTest$churn, 
 72 |   predictor = tree_model_pred$yes, plot = TRUE)  # ROC curve (sens/spec)
 73 | plot(smooth(roc_tree), identity = FALSE)
 74 | abline(a = c(1, 1), b = -1, col = 'red', lty = 2)  # add 45' line (confusing param)
 75 | ci.sp(roc_tree)  # CI on specificity
 76 | ci.se(roc_tree)  # CI on sensitivity
 77 | 
 78 | # ROC curve with the ROCR package
 79 | rocplot <- function(pred, truth, ...) { # function to create ROC curve
 80 |   predob <- prediction(pred, truth )
 81 |   perf <- performance(predob, "tpr", "fpr")
 82 |   plot(perf, ...)
 83 | }
 84 | par(pty="s")  # ensure square graph
 85 | rocplot(tree_model_pred$yes, churnTest$churn)
 86 | abline(a = c(0,0), b = 1, col = 'red', lty = 2)
 87 | 
 88 | # AUC
 89 | auc(churnTest$churn, tree_model_pred$yes)
 90 | 
 91 | 
 92 | # Classification tree using caret:::train --------------------------------------
 93 | ctrl <- trainControl(method = "cv", number = 10, classProbs = TRUE)  # 10 fold cv
 94 | 
 95 | caret_tree <- train(x = churnTrain[ ,1:19], y = churnTrain$churn,  # note x, y entry
 96 |   method = 'rpart', 
 97 |   tuneLength = 30, 
 98 |   trControl = ctrl)
 99 | caret_tree
100 | 
101 | # Visualise pruned caret tree
102 | caret_tree_party <- as.party(caret_tree$finalModel)
103 | plot(caret_tree_party, gp = gpar(cex = 0.5))  # nice plot of the tree
104 | 
105 | 
106 | # Evaluate caret tree on test set ---------------------------------------------
107 | caret_tree_preds <- predict(caret_tree, churnTest, type = "prob")
108 | temp <- ifelse(caret_tree_preds[ ,1] < .5, "no", "yes")
109 | caret_tree_preds <- data.frame(caret_tree_preds, prediction = as.factor(temp))
110 | caret_tree_confusion <- table(caret_tree_preds$prediction, churnTest$churn)
111 | caret_tree_confusion
112 | prop.table(caret_tree_confusion)
113 | (138+1426)/1667  # 93.38% of observations are correctly classified
114 | 
115 | # ROC curve for caret and tree predictions
116 | rocplot(tree_model_pred$yes, churnTest$churn)
117 | rocplot(caret_tree_preds$yes, churnTest$churn, add = TRUE, col = "blue")
118 | abline(a = c(0,0), b = 1, col = 'red', lty = 2)
119 | legend("bottomright", c("caret model", "rpart model"), 
120 |   lty = 1,
121 |   col = c("blue", "black"), 
122 |   cex = 0.75)
123 | 
124 | # Slight difference in predictive accuracy of the caret and rpart models.
125 | # The caret model was pruned using accuracy, so this may be the reason.
126 | 
127 | # Category encoding ===========================================================
128 | # A look at independent vs. grouped categories for the state variable
129 | # i.e. create (levels - 1) dummy variables for state and refit the decision
130 | # trees
131 | 
132 | # Create dummy variables and add to the (new) dataframe(s)
133 | dmy <- dummyVars( ~ state , data = churnTrain, fullRank = TRUE)
134 | churnTrain_dmy <- data.frame(churnTrain[ ,-1], (predict(dmy, churnTrain)))
135 | churnTest_dmy <- data.frame(churnTest[ ,-1], (predict(dmy, churnTest)))
136 | 
137 | # Fit and prune a decision tree using caret
138 | str(churnTrain_dmy)
139 | caret_dmytree <- train(x = churnTrain_dmy[ ,-19], y = churnTrain_dmy$churn, 
140 |   method = 'rpart', 
141 |   tuneLength = 30, 
142 |   trControl = ctrl, 
143 |   metric = "Accuracy")
144 | caret_dmytree
145 | 
146 | # Visualise pruned dummy variable model
147 | caret_dmytree_party <- as.party(caret_dmytree$finalModel)
148 | plot(caret_dmytree_party, gp = gpar(cex = 0.5))  # nice plot of the tree
149 | 
150 | 
151 | # Evaluate dummy encoded caret tree on test set --------------------------------
152 | caret_dmytree_preds <- predict(caret_dmytree, churnTest_dmy, type = "prob")
153 | temp <- ifelse(caret_dmytree_preds[ ,1] < .5, "no", "yes")
154 | caret_dmytree_preds <- data.frame(caret_dmytree_preds, prediction = as.factor(temp))
155 | caret_dmytree_confusion <- table(caret_dmytree_preds$prediction, churnTest$churn)
156 | caret_dmytree_confusion
157 | prop.table(caret_tree_confusion)
158 | (142+1431)/1667  # 94.36% of observations are correctly classified
159 | 
160 | # ROC curve for caret, rpart, dmy var predictions
161 | # Predict probs for the dummy variable model
162 | rocplot(tree_model_pred$yes, churnTest$churn)  # Plot ROC curves
163 | rocplot(caret_tree_preds$yes, churnTest$churn, add = TRUE, col = "blue")
164 | rocplot(caret_dmytree_preds$yes, churnTest$churn, add = TRUE, col = "orange")
165 | abline(a = c(0,0), b = 1, col = 'red', lty = 2)
166 | legend("bottomright", c("caret model", "rpart model", "dummy var model"), lty = 1, 
167 |   col = c("blue", "black", "orange"), cex = 0.65)
168 | 
169 | # AUC
170 | auc(churnTest$churn, caret_dmytree_preds$yes)
171 | 
172 | 
173 | # Bagged tree model ============================================================
174 | # Fit bagged decision tree
175 | # Random forest with n = p is equivalent to bagging
176 | bagged_tree <- randomForest(x = churnTrain[ ,1:19], y = churnTrain$churn,
177 |   importance = TRUE)
178 | bagged_tree
179 | 
180 | 
181 | # Using caret ------------------------------------------------------------------
182 | ctrl2 <- trainControl("cv", number = 10)
183 | bagged_tree2 <- train(x = churnTrain[ ,1:19], y = churnTrain$churn, method = "rf",
184 |   importance = TRUE, tuneGrid = expand.grid(mtry = 19), do.trace = TRUE)
185 | bagged_tree2
186 | 
187 | 
188 | # Variable importance ----------------------------------------------------------
189 | # MeanDecreaseAccuracy: mean decrease of accuracy in predictions on the out 
190 | # of bag samples when a given variable is excluded from the model.
191 | # MeanDecreaseGini: measure of the total decrease in node impurity that results 
192 | # from splits over that variable, averaged over all trees
193 | varImpPlot(bagged_tree)
194 | 
195 | 
196 | # Evaluate bagged decision tree on test data -----------------------------------
197 | bagged_tree2_preds <- predict(bagged_tree2, churnTest, type = "prob")
198 | temp <- ifelse(bagged_tree2_preds[ ,1] < .5, "no", "yes")
199 | bagged_tree2_preds <- data.frame(bagged_tree2_preds, prediction = as.factor(temp))
200 | bagged_tree2_confusion <- table(bagged_tree2_preds$prediction, churnTest$churn)
201 | bagged_tree2_confusion
202 | prop.table(bagged_tree2_confusion)
203 | (188+1320)/1667  # 90.46% accuracy
204 | 
205 | # ROC curve
206 | par(pty="s")  # ensure square graph
207 | rocplot(tree_model_pred$yes, churnTest$churn)  # Plot ROC curves
208 | rocplot(caret_tree_preds$yes, churnTest$churn, add = TRUE, col = "blue")
209 | rocplot(caret_dmytree_preds$yes, churnTest$churn, add = TRUE, col = "orange")
210 | rocplot(bagged_tree2_preds$yes, churnTest$churn, add = TRUE, col = "green")
211 | abline(a = c(0,0), b = 1, col = 'red', lty = 2)
212 | legend("bottomright", c("caret model", "rpart model", "dummy var model", 
213 |   "bagged tree model"), 
214 |   lty = 1, 
215 |   col = c("blue", "black", "orange", "green"), 
216 |   cex = 0.65)
217 | 
218 | # AUC
219 | auc(churnTest$churn, bagged_tree2_preds$yes)
220 | 
221 | 
222 | # Why is the bagged model less accurate but has a higher AUC ?? ----------------
223 | # See the histograms...
224 | histogram(tree_model_pred$yes)
225 | histogram(caret_dmytree_preds$yes)
226 | histogram(caret_tree_preds$yes)
227 | histogram(bagged_tree2_preds$yes)
228 | 
229 | 
230 | # Rule based models ============================================================
231 | # Fit PART model
232 | part_model <- PART(churn ~ ., data = churnTrain)
233 | part_model
234 | 
235 | 
236 | # caret method -----------------------------------------------------------------
237 | part_model.c <- train(x = churnTrain[ ,-20], y = churnTrain$churn, 
238 |   method = 'PART', 
239 |   tuneLength = 30, 
240 |   trControl = ctrl, 
241 |   metric = "Accuracy")
242 | part_model.c
243 | 
244 | 
245 | # Evaluate bagged classification tree on test data ----------------------------
246 | part_model_preds <- predict(part_model.c, churnTest, type = "prob")
247 | temp <- ifelse(part_model_preds[ ,1] < .5, "no", "yes")
248 | part_model_preds <- data.frame(part_model_preds, prediction = as.factor(temp))
249 | part_model_confusion <- table(part_model_preds$prediction, churnTest$churn)
250 | part_model_confusion
251 | round(prop.table(part_model_confusion), 3)
252 | (130+1376)/1667  # 90.34% accuracy
253 | 
254 | # ROC curve
255 | par(pty="s")  # ensure square graph
256 | rocplot(tree_model_pred$yes, churnTest$churn)  # Plot ROC curves
257 | rocplot(caret_dmytree_preds$yes, churnTest$churn, add = TRUE, col = "orange")
258 | rocplot(bagged_tree2_preds$yes, churnTest$churn, add = TRUE, col = "green")
259 | rocplot(part_model_preds$yes, churnTest$churn, add = TRUE, col = "blue")
260 | abline(a = c(0,0), b = 1, col = 'red', lty = 2)
261 | legend("bottomright", c("PART rule model", "rpart model", "dummy var model", 
262 |   "bagged tree model"), 
263 |   lty = 1, 
264 |   col = c("blue", "black", "orange", "green"), 
265 |   cex = 0.65)
266 | 
267 | # AUC
268 | auc(churnTest$churn, part_model_preds$yes)
269 | 
270 | 
271 | # Fit C5.0 rule model ---------------------------------------------------------
272 | # information theory based approach, rule voting method
273 | C5.0_model <- C5.0(x = churnTrain[ ,-20], y = churnTrain$churn, rules = TRUE)
274 | C5.0_model
275 | summary(C5.0_model)
276 | 
277 | # caret version
278 | C5.0_model.c <- train(x = churnTrain[ ,-20], y = churnTrain$churn, 
279 |   method = "C5.0Rules", 
280 |   trControl = ctrl)
281 | C5.0_model.c
282 | 
283 | 
284 | # Evaulate C5.0 model on test data --------------------------------------------
285 | C5.0_model_preds <- predict(C5.0_model.c, churnTest, type = "prob")
286 | temp <- ifelse(C5.0_model_preds[ ,1] < .5, "no", "yes")
287 | C5.0_model_preds <- data.frame(C5.0_model_preds, prediction = as.factor(temp))
288 | C5.0_model_confusion <- table(C5.0_model_preds$prediction, churnTest$churn)
289 | C5.0_model_confusion
290 | round(prop.table(C5.0_model_confusion), 3)
291 | (149+1428)/1667  # 94.60% accuracy
292 | 
293 | # ROC curve
294 | par(pty="s")  # ensure square graph
295 | rocplot(tree_model_pred$yes, churnTest$churn)  # Plot ROC curves
296 | rocplot(caret_dmytree_preds$yes, churnTest$churn, add = TRUE, col = "orange")
297 | rocplot(bagged_tree2_preds$yes, churnTest$churn, add = TRUE, col = "green")
298 | rocplot(part_model_preds$yes, churnTest$churn, add = TRUE, col = "blue")
299 | rocplot(C5.0_model_preds$yes, churnTest$churn, add = TRUE, col = "purple")
300 | abline(a = c(0,0), b = 1, col = 'red', lty = 2)
301 | legend("bottomright", c("PART rule model", "rpart model", "dummy var model", 
302 |   "bagged tree model", "C5.0 rule model"), 
303 |   lty = 1, 
304 |   col = c("blue", "black", "orange", "green", "purple"), 
305 |   cex = 0.65)
306 | 
307 | # AUC
308 | auc(churnTest$churn, C5.0_model_preds$yes)
309 | 
310 | 
311 | # Lift chart ==================================================================
312 | prediction_models <- data.frame(rpart_tree = tree_model_pred$yes, 
313 |   dummy_tree = caret_dmytree_preds$yes,
314 |   bagged_tree = bagged_tree2_preds$yes,
315 |   part_rules = part_model_preds$yes,
316 |   C5.0_rules = C5.0_model_preds$yes)
317 | labs <- c(rpart_tree = "Grouped Categories", dummy_tree = "Binary Categories",
318 |   bagged_tree = "Bagged Tree", 
319 |   part_rules = "PART rules",
320 |   C5.0_rules = "C5.0 rules")
321 | liftCurve <- lift(churnTest$churn ~ rpart_tree + dummy_tree + bagged_tree + 
322 |   part_rules + C5.0_rules, 
323 |   data = prediction_models,
324 |   labels = labs)
325 | xyplot(liftCurve, auto.key = list(columns = 2, lines = TRUE, points = FALSE))
326 | 
327 | 
328 | # NB
329 | # rpart, C5.0, and J48 use the formula method differently than
330 | # most other functions by respecting the categorical nature of the data 
331 | # and treating these predictors as grouped sets of categories
332 | # caret:::train function follows the more common convention in R,
333 | # which is to create dummy variables prior to modeling
334 | 
335 | 
336 | 
337 | 
338 | 


--------------------------------------------------------------------------------
/measuring-predictor-importance/churn_data_181.md:
--------------------------------------------------------------------------------
  1 | # Ex18.1 - Measuring predictor importance: churn data set
  2 | Oisin Fitzgerald  
  3 | 
  4 | The “churn” data set was developed to predict telecom customer churn based on 
  5 | information about their account. It contains 20 variables, with the 19
  6 | predictors including continuous and factor variables that describe an individual
  7 | account. The response churn has two levels "yes" and "no". This script demonstrates
  8 | methods for examining categorical and continuous predictor importance for 
  9 | classification problems.
 10 | 
 11 | (a) Examining the correlation between predictors
 12 | ------------------------------------------------
 13 | 
 14 | ```r
 15 | # load packages
 16 | library(AppliedPredictiveModeling)
 17 | library(C50)
 18 | library(corrplot)
 19 | library(caret)
 20 | ```
 21 | 
 22 | ```
 23 | ## Loading required package: lattice
 24 | ## Loading required package: ggplot2
 25 | ```
 26 | 
 27 | ```r
 28 | library(CORElearn)
 29 | library(ggplot2)
 30 | library(pROC)
 31 | ```
 32 | 
 33 | ```
 34 | ## Type 'citation("pROC")' for a citation.
 35 | ## 
 36 | ## Attaching package: 'pROC'
 37 | ## 
 38 | ## The following objects are masked from 'package:stats':
 39 | ## 
 40 | ##     cov, smooth, var
 41 | ```
 42 | 
 43 | ```r
 44 | # load data
 45 | data(churn)
 46 | str(churnTrain)
 47 | ```
 48 | 
 49 | ```
 50 | ## 'data.frame':	3333 obs. of  20 variables:
 51 | ##  $ state                        : Factor w/ 51 levels "AK","AL","AR",..: 17 36 32 36 37 2 20 25 19 50 ...
 52 | ##  $ account_length               : int  128 107 137 84 75 118 121 147 117 141 ...
 53 | ##  $ area_code                    : Factor w/ 3 levels "area_code_408",..: 2 2 2 1 2 3 3 2 1 2 ...
 54 | ##  $ international_plan           : Factor w/ 2 levels "no","yes": 1 1 1 2 2 2 1 2 1 2 ...
 55 | ##  $ voice_mail_plan              : Factor w/ 2 levels "no","yes": 2 2 1 1 1 1 2 1 1 2 ...
 56 | ##  $ number_vmail_messages        : int  25 26 0 0 0 0 24 0 0 37 ...
 57 | ##  $ total_day_minutes            : num  265 162 243 299 167 ...
 58 | ##  $ total_day_calls              : int  110 123 114 71 113 98 88 79 97 84 ...
 59 | ##  $ total_day_charge             : num  45.1 27.5 41.4 50.9 28.3 ...
 60 | ##  $ total_eve_minutes            : num  197.4 195.5 121.2 61.9 148.3 ...
 61 | ##  $ total_eve_calls              : int  99 103 110 88 122 101 108 94 80 111 ...
 62 | ##  $ total_eve_charge             : num  16.78 16.62 10.3 5.26 12.61 ...
 63 | ##  $ total_night_minutes          : num  245 254 163 197 187 ...
 64 | ##  $ total_night_calls            : int  91 103 104 89 121 118 118 96 90 97 ...
 65 | ##  $ total_night_charge           : num  11.01 11.45 7.32 8.86 8.41 ...
 66 | ##  $ total_intl_minutes           : num  10 13.7 12.2 6.6 10.1 6.3 7.5 7.1 8.7 11.2 ...
 67 | ##  $ total_intl_calls             : int  3 3 5 7 3 6 7 6 4 5 ...
 68 | ##  $ total_intl_charge            : num  2.7 3.7 3.29 1.78 2.73 1.7 2.03 1.92 2.35 3.02 ...
 69 | ##  $ number_customer_service_calls: int  1 1 0 2 3 0 3 0 1 0 ...
 70 | ##  $ churn                        : Factor w/ 2 levels "yes","no": 2 2 2 2 2 2 2 2 2 2 ...
 71 | ```
 72 | 
 73 | ```r
 74 | # correlation between the continuous variables the the 
 75 | numeric_vars <- sapply(churnTrain, is.numeric)
 76 | corrs <- cor(churnTrain[numeric_vars])
 77 | corrplot(corrs, method = "number", tl.cex = 0.75)
 78 | ```
 79 | 
 80 | ![](churn_data_181_files/figure-html/unnamed-chunk-1-1.png) 
 81 | 
 82 | Finding a perfect colinearity between the of four pairs of total charge and 
 83 | total minutes variables is of course no surprise, phone charges are set per time 
 84 | period! However it is somewhat surprising that there is no linear relation 
 85 | between any other pairs of variables. A scatterplot matrix of a subset of 
 86 | the continuous variables reveals the extent of any pairwise relationship and 
 87 | how certain variables contain numerous of zeros.
 88 | 
 89 | ![](churn_data_181_files/figure-html/unnamed-chunk-2-1.png) 
 90 | 
 91 | (b) Assessing the importance of categorical predictors
 92 | -------------------------------------------------------------------------------
 93 | Odds ratios, Fisher's exact test and chi-square tests provide methods to examine 
 94 | the extent of association between factor levels and the response categories. 
 95 | Fisher's exact test is considred more reliable than chi-square, however it is more
 96 | computationally intensive.
 97 | 
 98 | ```r
 99 | # A function to calculate chi-square, odds ratios and fisher's exact test
100 | association_tests <- function(x, y) { # x is predictors, y is response
101 |   
102 |   x <- x[ ,sapply(x, is.factor)]
103 |   n <- length(x)
104 |   names <- colnames(x)
105 | 
106 |   out <- data.frame(
107 |     chisq = rep(NA, n), 
108 |     chi.p.value = rep(NA, n), 
109 |     odds.ratio = rep(NA, n), 
110 |     fisher.p.value = rep(NA, n))
111 | 
112 |   for (i in 1:n) {
113 |     row.names(out)[i] <- names[i]
114 |     if (nlevels(x[ ,i]) > 7) {
115 |       fish_res <- fisher.test(x = x[ ,i], y = y, simulate.p.value = TRUE)
116 |       out$fisher.p.value[i] <- fish_res$p.value
117 |     } else {
118 |       fish_res <- fisher.test(x = x[ ,i], y = y)
119 |       out$fisher.p.value[i] <- fish_res$p.value
120 |       if (nlevels(x[ ,i]) <= 2) out$odds.ratio[i] <- fish_res$estimate
121 |       }
122 |     
123 |     chi_res <- chisq.test(x = x[ ,i], y = y, simulate.p.value = TRUE) # chisq test
124 |     out$chisq[i] <- chi_res$statistic
125 |     out$chi.p.value[i] <- chi_res$p.value
126 |   }
127 |   out
128 | }
129 | 
130 | res <- association_tests(x = churnTrain[ ,1:19], y = churnTrain$churn)
131 | ```
132 | 
133 | The results suggest international plan to be an important variable, while area code 
134 | shows little value as a predictor. The extremely low chi-square statistic and 
135 | high p-value suggests data may have been purposely balanced by area code. Voice 
136 | mail plan and state seems to have value, without the same strngth of association
137 | as international plan.
138 | 
139 | ```r
140 | ggplot(data = res, aes(x = chisq, y = -log(chi.p.value))) + 
141 |   geom_point(size = 3) +
142 |   annotate("text", x = res$chisq + 10, 
143 |     y = -log(res$chi.p.value) -.3, label = c(row.names(res))) +
144 |   labs(title = "Chi square vs. -log(p.values)") +
145 |   xlim(NA, 275) + ylim(NA, 9)
146 | ```
147 | 
148 | ![](churn_data_181_files/figure-html/unnamed-chunk-4-1.png) 
149 | 
150 | Receiver operating characteristic (ROC) curves offer a method to examine the extent
151 | to which a predictor variable distinguishes between the two levels of a response
152 | factor, e.g. to what extent does the "account_length" variable allow us to 
153 | distinguish between customers likely to churn, and those who are not. The area 
154 | under the ROC curve (AUC) quantifies the ability of a predictor variable to separate
155 | between classes. 
156 | The AUC leads to different conclusions to the association tests, with international
157 | plan now considered the least important variable. Area code is now ranked the second
158 | most important variable.  
159 | 
160 | ```r
161 | # Calculate the area under the ROC curve
162 | factor_pred <- churnTrain[ ,sapply(churnTrain, is.factor)]  # subset the factors
163 | factor_pred$churn <- NULL
164 | auc_factors <- filterVarImp(y = churnTrain$churn, x = factor_pred)
165 | # variables ranked by auc
166 | auc_factors[order(auc_factors$yes, decreasing = TRUE), ]  
167 | ```
168 | 
169 | ```
170 | ##                          yes        no
171 | ## voice_mail_plan    0.5649036 0.5649036
172 | ## area_code          0.4975435 0.4975435
173 | ## state              0.4947913 0.4947913
174 | ## international_plan 0.3908096 0.3908096
175 | ```
176 | 
177 | (b) Assessing the importance of continuous predictors
178 | -------------------------------------------------------------------------------
179 | 
180 | ```r
181 | # create a subset of the continuous predictors 
182 | cont_pred <- churnTrain[ ,sapply(churnTrain, is.numeric)]
183 | ```
184 | 
185 | Where the response is a category with two outcomes, t-tests can be used to assess
186 | the difference in the distributions of the continuous predictors by the response
187 | categories. As a signal/noise ratio the t-statistic quantifies the separation in
188 | the distributions, with the associated p value indicating the extent to which this
189 | would occur based on an assumption of no differnce.
190 | 
191 | ```r
192 | get_tstats <- function(x, y) {
193 |   test <- t.test(x ~ y)  # Welch's t test
194 |   out <- c(t_stat = test$statistic, p = test$p.value)
195 |   out
196 | }
197 | 
198 | t_values <- apply(cont_pred, MARGIN = 2, FUN = get_tstats, y = churnTrain$churn)
199 | t_values <- data.frame(t(t_values))  # transpose
200 | round(t_values[order(t_values$p), ], 6)
201 | ```
202 | 
203 | ```
204 | ##                                t_stat.t        p
205 | ## total_day_minutes              9.684563 0.000000
206 | ## total_day_charge               9.684476 0.000000
207 | ## number_customer_service_calls  8.955141 0.000000
208 | ## number_vmail_messages         -5.821254 0.000000
209 | ## total_eve_minutes              5.272354 0.000000
210 | ## total_eve_charge               5.271986 0.000000
211 | ## total_intl_charge              3.939933 0.000090
212 | ## total_intl_minutes             3.938851 0.000091
213 | ## total_intl_calls              -2.960420 0.003186
214 | ## total_night_charge             2.171007 0.030272
215 | ## total_night_minutes            2.170889 0.030280
216 | ## total_day_calls                1.002387 0.316543
217 | ## account_length                 0.961889 0.336458
218 | ## total_eve_calls                0.537389 0.591180
219 | ## total_night_calls              0.348818 0.727339
220 | ```
221 | 
222 | The AUC and t-test for the continuous predictors both agree to a large extent.
223 | They share the same top 3 predictors, and only seem to have slight re-shuffling 
224 | otherwise.
225 | 
226 | ```r
227 | # Calculate the area under the ROC curve
228 | auc_numeric <- filterVarImp(y = churnTrain$churn, x = cont_pred)
229 | # continuous variables ranked by AUC
230 | auc_numeric[order(auc_numeric$yes, decreasing = TRUE), ]  
231 | ```
232 | 
233 | ```
234 | ##                                     yes        no
235 | ## total_day_minutes             0.6399666 0.6399666
236 | ## total_day_charge              0.6399666 0.6399666
237 | ## number_customer_service_calls 0.6082071 0.6082071
238 | ## total_eve_minutes             0.5726508 0.5726508
239 | ## total_eve_charge              0.5726417 0.5726417
240 | ## number_vmail_messages         0.5616465 0.5616465
241 | ## total_intl_calls              0.5606302 0.5606302
242 | ## total_intl_minutes            0.5498979 0.5498979
243 | ## total_intl_charge             0.5498979 0.5498979
244 | ## total_night_charge            0.5281719 0.5281719
245 | ## total_night_minutes           0.5281632 0.5281632
246 | ## total_day_calls               0.5215742 0.5215742
247 | ## account_length                0.5127787 0.5127787
248 | ## total_eve_calls               0.5070339 0.5070339
249 | ## total_night_calls             0.4961509 0.4961509
250 | ```
251 | 
252 | 
253 | (d) Use RefliefF to jointly assess the importance of predictors
254 | -------------------------------------------------------------------------------
255 | The Relief algorithm is another method to measure the importance of predictors
256 | for a two class response problem (although it can deal with other situations
257 | as well). It begins by randomly selecting a set of observations, R, of size m. 
258 | The algorithm then evaluates each predictor in isolation by looping through each 
259 | point in the random set and for each point (1) finding the two nearest scores 
260 | that are a hit(i.e. share same class in response) and a miss (i.e. does not share 
261 | same class in reponse) and (2) updating the score for that predictor, 
262 | S = S - diff(R, Hit)^2/m + diff(R, Miss)^2/m.
263 | 
264 | ```r
265 | relief_values <- attrEval(churn ~ ., data = churnTrain,
266 |   estimator = "ReliefFequalK",  # calculation method
267 |   ReliefIterations = 50)  # num iteration
268 | relief_values[order(relief_values, decreasing = TRUE)]
269 | ```
270 | 
271 | ```
272 | ##             total_day_minutes              total_day_charge 
273 | ##                    0.23617332                    0.23612810 
274 | ## number_customer_service_calls            international_plan 
275 | ##                    0.12600000                    0.09800000 
276 | ##               voice_mail_plan             total_night_calls 
277 | ##                    0.08800000                    0.08708920 
278 | ##               total_day_calls           total_night_minutes 
279 | ##                    0.06864646                    0.06307872 
280 | ##            total_night_charge             total_eve_minutes 
281 | ##                    0.06298665                    0.05228962 
282 | ##              total_eve_charge         number_vmail_messages 
283 | ##                    0.05226356                    0.04826144 
284 | ##              total_intl_calls            total_intl_minutes 
285 | ##                    0.04333333                    0.04216667 
286 | ##             total_intl_charge                     area_code 
287 | ##                    0.04213580                    0.03600000 
288 | ##                account_length               total_eve_calls 
289 | ##                    0.02119559                    0.01062745 
290 | ##                         state 
291 | ##                    0.00000000
292 | ```
293 | 
294 | An addition to the Relief algorithm is to permutate the response observations so as to 
295 | gain an understanding of the predictors score when it has no relevance. This method can be iterated several times giving a somewhat normal distribution of scores that can then be compared to the true Relief score in terms of standard
296 | deviations. This indicates how much greater the Relief score is than what could 
297 | be expected by chance alone. 
298 | 
299 | ```r
300 | relief_perm <- permuteRelief(x = churnTrain[ ,-20], y = churnTrain$churn, nperm = 500,
301 |   estimator = "ReliefFequalK",
302 |   ReliefIterations = 50)
303 | ```
304 | 
305 | The results suggest that total day charge (and therefore total day minutes - 
306 | one is a multiple of the other) and number of customer service calls are highly
307 | important variables. International plan is also quite far from its permutation
308 | distribution mean in terms of standard deviations, putting this method more in
309 | agreement with chisq/fishers than the AUC. However its relief score still quite
310 | low. This may be a result of a heavy bias towards "no" internation plan and "no" 
311 | churn but ambiguity otherwise (poor sensitivity -> low AUC curve?). There are several predictors that appear without value including state, area code, total intl minutes (and charges), and total night minutes (and charges).
312 | 
313 | ```r
314 | # Histograms of the permutated relief scores
315 | relief_values <- data.frame(Predictor = names(relief_values), 
316 |   value = relief_values, 
317 |   row.names = NULL)
318 | ggplot(data = relief_perm$permutations, aes(x = value)) + 
319 |   geom_histogram(binwidth = .01, colour = 1) +
320 |   geom_vline(aes(xintercept = value), relief_values, colour = "red", linetype = 2) +
321 |   facet_wrap(~ Predictor) + 
322 |   labs(title = "Relief Scores and Permutation Distributions", xlab = "Relief Scores")
323 | ```
324 | 
325 | ![](churn_data_181_files/figure-html/unnamed-chunk-11-1.png) 
326 | 
327 | ```r
328 | # Standard deviation of permutated distribution from non-permutated score
329 | relief_perm$standardized[order(relief_perm$standardized)]
330 | ```
331 | 
332 | ```
333 | ##                     area_code               total_eve_calls 
334 | ##                   -2.17837888                   -1.53075421 
335 | ##                account_length             total_night_calls 
336 | ##                   -0.53966612                   -0.28130565 
337 | ##           total_night_minutes            total_night_charge 
338 | ##                   -0.16435951                   -0.15880347 
339 | ##              total_intl_calls               total_day_calls 
340 | ##                    0.00910222                    0.76098712 
341 | ##              total_eve_charge             total_eve_minutes 
342 | ##                    0.90009256                    0.90072488 
343 | ##                         state            total_intl_minutes 
344 | ##                    1.16465807                    1.42100050 
345 | ##             total_intl_charge         number_vmail_messages 
346 | ##                    1.43765463                    2.44992637 
347 | ##            international_plan              total_day_charge 
348 | ##                    3.26771361                    3.43767734 
349 | ##             total_day_minutes               voice_mail_plan 
350 | ##                    3.43768610                    3.78254810 
351 | ## number_customer_service_calls 
352 | ##                    4.49641650
353 | ```
354 | 
355 | 
356 | 
357 | 


--------------------------------------------------------------------------------
/linear-regression-and-cousins/permeability_data_62.md:
--------------------------------------------------------------------------------
  1 | # Ex6.2 - Linear regression: molecule permeability
  2 | Oisin Fitzgerald  
  3 | 27 January 2016  
  4 | 
  5 | ### The data 
  6 | 
  7 | This pharmaceutical data set was used to develop a model for predicting compounds' 
  8 | permeability, a measure of a molecule's ability to cross a membrane. Permeability
  9 | impacts on a potential drug's usefulness, i.e. it needs to be able to cross
 10 | certain membranes to be effective. There exist assays to measure a compund's
 11 | permeability. The gaol here is to develop a predictive model for permeability 
 12 | in an attempt to potentially reduce the need for the assay. 
 13 | (brief description, more on ?permeability)
 14 | 
 15 | ### Outline
 16 | 
 17 | * Training/test split
 18 | * Pre-process the data
 19 |     + Skewness of response
 20 |     + Sparseness
 21 | * Fit and test PLS models
 22 | * Fit and ridge regression, LASSO and elastic net models
 23 | 
 24 | 
 25 | ```r
 26 | # Load data and packages
 27 | library(AppliedPredictiveModeling)
 28 | data(permeability)
 29 | 
 30 | suppressMessages(library(caret))
 31 | suppressMessages(library(pls))
 32 | suppressMessages(library(elasticnet))
 33 | suppressMessages(library(lars))
 34 | ```
 35 | 
 36 | ### Data splitting and pre-processing
 37 | 
 38 | 
 39 | ```r
 40 | fingerprints <- data.frame(fingerprints)
 41 | permeability <- as.vector(permeability)
 42 | # Create training/test split index
 43 | Split <- createDataPartition(permeability, times = 1, p = 0.75)
 44 | Split <- Split$Resample1 
 45 | # Create training and test splits
 46 | training <- fingerprints[Split, ]
 47 | test <- fingerprints[-Split, ]
 48 | response_train <- permeability[Split]
 49 | response_test <- permeability[-Split]
 50 | 
 51 | # Pre-processing
 52 | training_filter <- training[ ,-nearZeroVar(training, freqCut = 95/5)]  # near zero variances
 53 | # binary data...
 54 | 
 55 | # positively skewed
 56 | ggplot() + geom_histogram(aes(x = permeability), binwidth = 4, col = 1) + 
 57 |   labs(title = "Histogram of permeability", x = "Molecule permeability") +
 58 |   theme_bw()
 59 | ```
 60 | 
 61 | ![](permeability_data_62_files/figure-html/unnamed-chunk-2-1.png)\
 62 | 
 63 | ```r
 64 | preProcess(data.frame(permeability), method = c("YeoJohnson"))
 65 | ```
 66 | 
 67 | ```
 68 | ## Created from 165 samples and 1 variables
 69 | ## 
 70 | ## Pre-processing:
 71 | ##   - ignored (0)
 72 | ##   - Yeo-Johnson transformation (1)
 73 | ## 
 74 | ## Lambda estimates for Yeo-Johnson transformation:
 75 | ## -0.19
 76 | ```
 77 | 
 78 | ### Fit and test partial least squares models
 79 | 
 80 | Three variations on a PLS model were fit and tested:
 81 | 1. A model fit to the full training set
 82 | 2. A model fit to the near zero variance reduced training set
 83 | 3. A model fit to a log transformed response
 84 | 
 85 | None of the models created appear to have the predictive ability to replace the 
 86 | mentioned assay method, they lack accurate predictive ability. The log(response)
 87 | PLS model (and others) appears to shown promise within a certain range. Possibly 
 88 | the current linear technique is too restrictive. Some models are producing negative 
 89 | predictions - multicolinearity?. 
 90 | 
 91 | 
 92 | ```r
 93 | # Fit PLS models
 94 | ctrl = trainControl("repeatedcv", number = 5, repeats = 5)
 95 | pls_model.1 <- train(y = response_train,
 96 |   x = training,
 97 |   method = "pls",
 98 |   metric = "Rsquared",
 99 |   tuneLength = 10,
100 |   trControl = ctrl)
101 | pls_model.1
102 | ```
103 | 
104 | ```
105 | ## Partial Least Squares 
106 | ## 
107 | ##  125 samples
108 | ## 1107 predictors
109 | ## 
110 | ## No pre-processing
111 | ## Resampling: Cross-Validated (5 fold, repeated 5 times) 
112 | ## Summary of sample sizes: 101, 100, 100, 100, 99, 99, ... 
113 | ## Resampling results across tuning parameters:
114 | ## 
115 | ##   ncomp  RMSE      Rsquared   RMSE SD   Rsquared SD
116 | ##    1     14.12589  0.2801179  2.015635  0.1594071  
117 | ##    2     12.41503  0.4325951  2.089575  0.1719383  
118 | ##    3     11.86396  0.4798535  2.175933  0.1709627  
119 | ##    4     11.95253  0.4829617  2.599663  0.1876928  
120 | ##    5     11.90959  0.4916929  2.613462  0.1848358  
121 | ##    6     12.00479  0.4855347  2.450659  0.1790584  
122 | ##    7     11.99263  0.4870670  2.462163  0.1834921  
123 | ##    8     12.24073  0.4756755  2.435113  0.1808482  
124 | ##    9     12.62208  0.4533456  2.255274  0.1705803  
125 | ##   10     12.89147  0.4352161  2.126619  0.1643402  
126 | ## 
127 | ## Rsquared was used to select the optimal model using  the largest value.
128 | ## The final value used for the model was ncomp = 5.
129 | ```
130 | 
131 | ```r
132 | pls_model.2 <- train(y = response_train,
133 |   x = training_filter,
134 |   method = "pls",
135 |   metric = "Rsquared",
136 |   tuneLength = 10,
137 |   trControl = ctrl)
138 | pls_model.2
139 | ```
140 | 
141 | ```
142 | ## Partial Least Squares 
143 | ## 
144 | ## 125 samples
145 | ## 370 predictors
146 | ## 
147 | ## No pre-processing
148 | ## Resampling: Cross-Validated (5 fold, repeated 5 times) 
149 | ## Summary of sample sizes: 101, 101, 100, 98, 100, 101, ... 
150 | ## Resampling results across tuning parameters:
151 | ## 
152 | ##   ncomp  RMSE      Rsquared   RMSE SD   Rsquared SD
153 | ##    1     14.13641  0.2874455  1.994562  0.16353002 
154 | ##    2     12.45425  0.4515863  2.294324  0.18511199 
155 | ##    3     12.02477  0.4866895  1.819542  0.15405348 
156 | ##    4     11.90942  0.5012018  1.518869  0.12878618 
157 | ##    5     11.49920  0.5388000  1.636276  0.12788898 
158 | ##    6     11.22734  0.5560475  1.573719  0.12308173 
159 | ##    7     10.98385  0.5743623  1.597932  0.11307569 
160 | ##    8     10.93646  0.5835801  1.423035  0.10149986 
161 | ##    9     11.27032  0.5639006  1.400459  0.09743804 
162 | ##   10     11.71468  0.5353356  1.389022  0.10426772 
163 | ## 
164 | ## Rsquared was used to select the optimal model using  the largest value.
165 | ## The final value used for the model was ncomp = 8.
166 | ```
167 | 
168 | ```r
169 | pls_model.3 <- train(y = log(response_train),
170 |   x = training,
171 |   method = "pls",
172 |   metric = "Rsquared",
173 |   tuneLength = 10,
174 |   trControl = ctrl)
175 | pls_model.3
176 | ```
177 | 
178 | ```
179 | ## Partial Least Squares 
180 | ## 
181 | ##  125 samples
182 | ## 1107 predictors
183 | ## 
184 | ## No pre-processing
185 | ## Resampling: Cross-Validated (5 fold, repeated 5 times) 
186 | ## Summary of sample sizes: 100, 100, 99, 101, 100, 100, ... 
187 | ## Resampling results across tuning parameters:
188 | ## 
189 | ##   ncomp  RMSE      Rsquared   RMSE SD    Rsquared SD
190 | ##    1     1.394441  0.2209422  0.2093430  0.15466007 
191 | ##    2     1.263407  0.3538704  0.1414747  0.10271977 
192 | ##    3     1.184729  0.4310581  0.1491980  0.11057008 
193 | ##    4     1.146549  0.4644069  0.1344120  0.10952157 
194 | ##    5     1.129306  0.4825250  0.1235525  0.10382922 
195 | ##    6     1.105925  0.5051179  0.1484921  0.11964499 
196 | ##    7     1.088086  0.5202448  0.1387113  0.10900261 
197 | ##    8     1.058015  0.5470990  0.1483578  0.10895902 
198 | ##    9     1.049602  0.5577494  0.1392980  0.09936161 
199 | ##   10     1.062223  0.5519139  0.1482355  0.10572280 
200 | ## 
201 | ## Rsquared was used to select the optimal model using  the largest value.
202 | ## The final value used for the model was ncomp = 9.
203 | ```
204 | 
205 | ```r
206 | # post hoc: remove multicollinearity and refit
207 | remove <- findLinearCombos(training_filter)
208 | training_filter2 <- training_filter[ ,remove$remove]
209 | 
210 | pls_model.4 <- train(y = response_train,
211 |   x = training_filter2,
212 |   method = "pls",
213 |   metric = "Rsquared",
214 |   tuneLength = 10,
215 |   trControl = ctrl)
216 | pls_model.4
217 | ```
218 | 
219 | ```
220 | ## Partial Least Squares 
221 | ## 
222 | ## 125 samples
223 | ## 277 predictors
224 | ## 
225 | ## No pre-processing
226 | ## Resampling: Cross-Validated (5 fold, repeated 5 times) 
227 | ## Summary of sample sizes: 99, 100, 100, 100, 101, 100, ... 
228 | ## Resampling results across tuning parameters:
229 | ## 
230 | ##   ncomp  RMSE      Rsquared   RMSE SD   Rsquared SD
231 | ##    1     14.54842  0.2293263  1.877668  0.1760119  
232 | ##    2     12.80386  0.3991476  2.272226  0.1871358  
233 | ##    3     12.70781  0.4169945  1.743083  0.1609883  
234 | ##    4     12.63029  0.4255008  1.957180  0.1622419  
235 | ##    5     12.70340  0.4287808  2.043817  0.1557772  
236 | ##    6     12.59538  0.4455729  2.083568  0.1491574  
237 | ##    7     12.70171  0.4411228  2.268840  0.1523978  
238 | ##    8     12.85886  0.4320981  2.066638  0.1363235  
239 | ##    9     12.98160  0.4296388  2.023933  0.1315781  
240 | ##   10     13.29837  0.4131454  2.074848  0.1305968  
241 | ## 
242 | ## Rsquared was used to select the optimal model using  the largest value.
243 | ## The final value used for the model was ncomp = 6.
244 | ```
245 | 
246 | ```r
247 | # Predict on test data
248 | pls1_preds <- predict(pls_model.1, test)
249 | RMSE(pls1_preds, response_test)
250 | ```
251 | 
252 | ```
253 | ## [1] 14.74978
254 | ```
255 | 
256 | ```r
257 | cor(pls1_preds, response_test)^2  # Rsquared
258 | ```
259 | 
260 | ```
261 | ## [1] 0.3746941
262 | ```
263 | 
264 | ```r
265 | ggplot() + 
266 |   geom_point(aes(x = pls1_preds, y = response_test)) + 
267 |   theme_bw() +
268 |   labs(title = "PLS model 1 predictions vs. observed", 
269 |     x = "predicted permeability", 
270 |     y = "observed permeability")
271 | ```
272 | 
273 | ![](permeability_data_62_files/figure-html/unnamed-chunk-3-1.png)\
274 | 
275 | ```r
276 | pls2_preds <- predict(pls_model.2, test)
277 | RMSE(pls2_preds, response_test)
278 | ```
279 | 
280 | ```
281 | ## [1] 13.8051
282 | ```
283 | 
284 | ```r
285 | cor(pls2_preds, response_test)^2  # Rsquared
286 | ```
287 | 
288 | ```
289 | ## [1] 0.4424359
290 | ```
291 | 
292 | ```r
293 | ggplot() + 
294 |   geom_point(aes(x = pls2_preds, y = response_test)) + 
295 |   theme_bw() +
296 |   labs(title = "PLS model 2 predictions vs. observed", 
297 |     x = "predicted permeability", 
298 |     y = "observed permeability")
299 | ```
300 | 
301 | ![](permeability_data_62_files/figure-html/unnamed-chunk-3-2.png)\
302 | 
303 | ```r
304 | pls3_preds <- predict(pls_model.3, test)
305 | RMSE(exp(pls3_preds), response_test)
306 | ```
307 | 
308 | ```
309 | ## [1] 13.75083
310 | ```
311 | 
312 | ```r
313 | cor(exp(pls3_preds), response_test)^2  # Rsquared
314 | ```
315 | 
316 | ```
317 | ## [1] 0.3905663
318 | ```
319 | 
320 | ```r
321 | ggplot() + 
322 |   geom_point(aes(x = pls3_preds, y = response_test)) + 
323 |   theme_bw() +
324 |   labs(title = "PLS model 3 predictions vs. observed", 
325 |     x = "predicted permeability", 
326 |     y = "observed permeability")
327 | ```
328 | 
329 | ![](permeability_data_62_files/figure-html/unnamed-chunk-3-3.png)\
330 | 
331 | ```r
332 | # can it predict well within a certain range??
333 | RMSE(exp(pls3_preds[response_test<20]), response_test[response_test<20])
334 | ```
335 | 
336 | ```
337 | ## [1] 12.19276
338 | ```
339 | 
340 | ```r
341 | pls4_preds <- predict(pls_model.4, test)
342 | RMSE(pls4_preds, response_test)
343 | ```
344 | 
345 | ```
346 | ## [1] 14.45052
347 | ```
348 | 
349 | ```r
350 | cor(pls4_preds, response_test)^2  # Rsquared
351 | ```
352 | 
353 | ```
354 | ## [1] 0.3826303
355 | ```
356 | 
357 | ```r
358 | ggplot() + 
359 |   geom_point(aes(x = pls4_preds, y = response_test)) + 
360 |   theme_bw() +
361 |   labs(title = "PLS model 4 predictions vs. observed", 
362 |     x = "predicted permeability", 
363 |     y = "observed permeability")
364 | ```
365 | 
366 | ![](permeability_data_62_files/figure-html/unnamed-chunk-3-4.png)\
367 | 
368 | #### Fit and ridge regression, LASSO and elastic net models
369 | 
370 | As with the PLS models, none of the models were stong predictors of the data, suggesting
371 | that laboratory methods of measuring permeability are prefereable (at least) to the
372 | models fitted. The PLS models actually outperformed the shrinkage methods. Further the 
373 | cross validated estimates of RMSE and R2 were quite inaccurate compared to the test fits. 
374 | There appears to be residual instability in the cofficients even in these shrinkage methods, 
375 | with enet and ridge producing hugely negative predictions and RMSE before being tuned 
376 | over a predefined range of shrinkage coefficients. 
377 | 
378 | 
379 | ```r
380 | # Fit shrinkage models
381 | ctrl <- trainControl("cv", number = 5)
382 | 
383 | ridge_grid <- expand.grid(.lambda = seq(0.05, 0.2, 0.01))
384 | ridge_model <- train(y = response_train, 
385 |   x = training_filter2,  # model fitting impacted by zero variance
386 |   method = "ridge",
387 |   tuneGrid = ridge_grid,
388 |   metric = "RMSE",
389 |   trControl = ctrl)
390 | ridge_model
391 | ```
392 | 
393 | ```
394 | ## Ridge Regression 
395 | ## 
396 | ## 125 samples
397 | ## 277 predictors
398 | ## 
399 | ## No pre-processing
400 | ## Resampling: Cross-Validated (5 fold) 
401 | ## Summary of sample sizes: 100, 100, 100, 99, 101 
402 | ## Resampling results across tuning parameters:
403 | ## 
404 | ##   lambda  RMSE      Rsquared   RMSE SD   Rsquared SD
405 | ##   0.05    14.26429  0.3469361  3.240907  0.1980152  
406 | ##   0.06    14.13896  0.3559712  3.218352  0.1956677  
407 | ##   0.07    14.04347  0.3635573  3.209867  0.1942977  
408 | ##   0.08    13.96978  0.3702479  3.205188  0.1934118  
409 | ##   0.09    13.90489  0.3761472  3.201440  0.1928339  
410 | ##   0.10    13.84637  0.3815359  3.198597  0.1925102  
411 | ##   0.11    13.81797  0.3856901  3.223695  0.1930815  
412 | ##   0.12    13.78728  0.3898212  3.235720  0.1934281  
413 | ##   0.13    13.76418  0.3935468  3.250413  0.1939387  
414 | ##   0.14    13.70465  0.3988747  3.194384  0.1925732  
415 | ##   0.15    13.72007  0.4007911  3.257135  0.1944723  
416 | ##   0.16    13.73433  0.4025027  3.312094  0.1964504  
417 | ##   0.17    13.65160  0.4088949  3.194072  0.1933448  
418 | ##   0.18    13.67577  0.4103241  3.253685  0.1952229  
419 | ##   0.19    13.63409  0.4146022  3.194897  0.1940248  
420 | ##   0.20    13.65158  0.4162339  3.233357  0.1954084  
421 | ## 
422 | ## RMSE was used to select the optimal model using  the smallest value.
423 | ## The final value used for the model was lambda = 0.19.
424 | ```
425 | 
426 | ```r
427 | # remaining instability in the coefficients?
428 | plot(ridge_model$finalModel)
429 | title(main = "Ridge regression coefficient stability")  
430 | ```
431 | 
432 | ![](permeability_data_62_files/figure-html/unnamed-chunk-4-1.png)\
433 | 
434 | ```r
435 | lasso_model <- train(y = response_train, 
436 |   x = training_filter2,  
437 |   method = "lasso",
438 |   tuneLength = 10,
439 |   metric = "RMSE",
440 |   trControl = ctrl)
441 | lasso_model
442 | ```
443 | 
444 | ```
445 | ## The lasso 
446 | ## 
447 | ## 125 samples
448 | ## 277 predictors
449 | ## 
450 | ## No pre-processing
451 | ## Resampling: Cross-Validated (5 fold) 
452 | ## Summary of sample sizes: 98, 101, 101, 99, 101 
453 | ## Resampling results across tuning parameters:
454 | ## 
455 | ##   fraction   RMSE        Rsquared   RMSE SD    Rsquared SD
456 | ##   0.1000000    9110.973  0.3242448   20342.54  0.2305155  
457 | ##   0.1888889   17103.552  0.3248907   38214.02  0.2576174  
458 | ##   0.2777778   33554.929  0.3315620   75000.16  0.2734539  
459 | ##   0.3666667   51774.847  0.3286516  115740.61  0.2690170  
460 | ##   0.4555556   70602.749  0.3143725  157840.22  0.2537059  
461 | ##   0.5444444   90321.095  0.3037186  201931.22  0.2403576  
462 | ##   0.6333333  110101.793  0.2976369  246161.76  0.2304663  
463 | ##   0.7222222  129916.309  0.2949451  290468.13  0.2229269  
464 | ##   0.8111111  149318.484  0.2870511  333852.23  0.2160301  
465 | ##   0.9000000  168562.774  0.2763813  376883.05  0.2062604  
466 | ## 
467 | ## RMSE was used to select the optimal model using  the smallest value.
468 | ## The final value used for the model was fraction = 0.1.
469 | ```
470 | 
471 | ```r
472 | enet_grid <- expand.grid(.lambda = c(0.01, 0.02, 0.03, 0.05), .fraction = c(seq(0.00001, 0.2, 0.02)))
473 | enet_model <- train(y = response_train, 
474 |   x = training_filter,
475 |   method = "enet",
476 |   tuneGrid = enet_grid,
477 |   metric = "RMSE",
478 |   trControl = ctrl)
479 | enet_model
480 | ```
481 | 
482 | ```
483 | ## Elasticnet 
484 | ## 
485 | ## 125 samples
486 | ## 370 predictors
487 | ## 
488 | ## No pre-processing
489 | ## Resampling: Cross-Validated (5 fold) 
490 | ## Summary of sample sizes: 101, 100, 100, 101, 98 
491 | ## Resampling results across tuning parameters:
492 | ## 
493 | ##   lambda  fraction  RMSE      Rsquared   RMSE SD    Rsquared SD
494 | ##   0.01    0.00001   16.20387  0.4104844  1.0182184  0.18272955 
495 | ##   0.01    0.02001   13.87268  0.4150714  1.0692719  0.16420088 
496 | ##   0.01    0.04001   12.78141  0.4175494  1.1372225  0.10885746 
497 | ##   0.01    0.06001   12.54578  0.4085081  1.1839283  0.09903446 
498 | ##   0.01    0.08001   12.33540  0.4236548  0.9795106  0.08453264 
499 | ##   0.01    0.10001   12.21048  0.4354692  0.8230406  0.07702949 
500 | ##   0.01    0.12001   12.11723  0.4458469  0.8213061  0.07598583 
501 | ##   0.01    0.14001   12.09290  0.4501368  0.8129704  0.07798469 
502 | ##   0.01    0.16001   12.05235  0.4565517  0.8165102  0.08163804 
503 | ##   0.01    0.18001   12.01800  0.4625189  0.8127518  0.08684807 
504 | ##   0.02    0.00001   16.20410  0.4104844  1.0182928  0.18272955 
505 | ##   0.02    0.02001   14.09418  0.4143868  1.0478786  0.16432456 
506 | ##   0.02    0.04001   12.89163  0.4246374  1.1253347  0.11811265 
507 | ##   0.02    0.06001   12.59891  0.4094209  1.1983122  0.10544503 
508 | ##   0.02    0.08001   12.44036  0.4151066  1.1398766  0.09682953 
509 | ##   0.02    0.10001   12.31475  0.4257743  0.9451951  0.08501402 
510 | ##   0.02    0.12001   12.19302  0.4375956  0.8355392  0.07922614 
511 | ##   0.02    0.14001   12.11181  0.4463517  0.8308710  0.07818994 
512 | ##   0.02    0.16001   12.08293  0.4507623  0.8141316  0.07903433 
513 | ##   0.02    0.18001   12.04035  0.4570083  0.8130894  0.08193302 
514 | ##   0.03    0.00001   16.20421  0.4104844  1.0182682  0.18272955 
515 | ##   0.03    0.02001   14.21115  0.4124214  1.0029421  0.16402693 
516 | ##   0.03    0.04001   12.99712  0.4234639  1.1658812  0.13583843 
517 | ##   0.03    0.06001   12.59611  0.4141328  1.1642766  0.10568952 
518 | ##   0.03    0.08001   12.50335  0.4100662  1.2201652  0.10547035 
519 | ##   0.03    0.10001   12.36856  0.4208988  1.1034849  0.09941567 
520 | ##   0.03    0.12001   12.25816  0.4310796  0.9459415  0.09120199 
521 | ##   0.03    0.14001   12.15817  0.4413571  0.8579290  0.08405673 
522 | ##   0.03    0.16001   12.07518  0.4499026  0.8397519  0.08414664 
523 | ##   0.03    0.18001   12.03730  0.4549964  0.8274005  0.08638877 
524 | ##   0.05    0.00001   16.20440  0.4104844  1.0183573  0.18272955 
525 | ##   0.05    0.02001   14.41209  0.4114486  1.0296163  0.16568131 
526 | ##   0.05    0.04001   13.23826  0.4232309  1.2315575  0.15481758 
527 | ##   0.05    0.06001   12.68330  0.4206394  1.1718478  0.11273411 
528 | ##   0.05    0.08001   12.52746  0.4113844  1.1777974  0.10429839 
529 | ##   0.05    0.10001   12.46332  0.4135083  1.2256036  0.10605972 
530 | ##   0.05    0.12001   12.39655  0.4192155  1.1207993  0.09901656 
531 | ##   0.05    0.14001   12.32335  0.4264486  0.9972745  0.09118388 
532 | ##   0.05    0.16001   12.22329  0.4367257  0.9115938  0.08424315 
533 | ##   0.05    0.18001   12.13832  0.4456274  0.8794157  0.08362259 
534 | ## 
535 | ## RMSE was used to select the optimal model using  the smallest value.
536 | ## The final values used for the model were fraction = 0.18001 and lambda
537 | ##  = 0.01.
538 | ```
539 | 
540 | ```r
541 | # Test shrinkage models
542 | ridge_preds <- predict(ridge_model, test)
543 | RMSE(ridge_preds, response_test)
544 | ```
545 | 
546 | ```
547 | ## [1] 41.89106
548 | ```
549 | 
550 | ```r
551 | cor(ridge_preds, response_test)^2
552 | ```
553 | 
554 | ```
555 | ## [1] 0.1116555
556 | ```
557 | 
558 | ```r
559 | lasso_preds <- predict(lasso_model, test)
560 | RMSE(lasso_preds, response_test)
561 | ```
562 | 
563 | ```
564 | ## [1] 2.368724e+15
565 | ```
566 | 
567 | ```r
568 | cor(lasso_preds, response_test)^2
569 | ```
570 | 
571 | ```
572 | ## [1] 0.1439556
573 | ```
574 | 
575 | ```r
576 | enet_preds <- predict(enet_model, test)
577 | RMSE(enet_preds, response_test)
578 | ```
579 | 
580 | ```
581 | ## [1] 30.95827
582 | ```
583 | 
584 | ```r
585 | cor(enet_preds, response_test)^2
586 | ```
587 | 
588 | ```
589 | ## [1] 0.2149163
590 | ```
591 | 
592 | 
593 | 
594 | 


--------------------------------------------------------------------------------
/linear-regression-and-cousins/IR_data_61.md:
--------------------------------------------------------------------------------
  1 | # Ex6.1 - Linear regression: IR spectrum of food
  2 | Oisin Fitzgerald  
  3 | 26 January 2016  
  4 | ### The data:
  5 | The data provides an infrared (IR) profile and analytical chemistry determined 
  6 | percent content of water, fat, and protein for meat samples. If there can be establish 
  7 | a predictive relationship between IR spectrum and fat content, then food scientists 
  8 | could predict a sample’s fat content with IR instead of using analytical chemistry
  9 | 
 10 | ### Outline:
 11 | 1. What is the relationship between the predictors? Are they highly correlated given 
 12 | the same food sample is measured at many IR wavelengths?
 13 | 2. Create training/test split
 14 | 3. Fit different models
 15 |   + Linear regression
 16 |   + Ridge regression, lasso and elastic net
 17 |   + PCR and PLS
 18 | 4. Compare models predictive ability
 19 | 
 20 | 
 21 | ```r
 22 | # load data and packages
 23 | library(car)
 24 | library(lars)
 25 | ```
 26 | 
 27 | ```
 28 | ## Loaded lars 1.2
 29 | ```
 30 | 
 31 | ```r
 32 | library(broom)
 33 | library(reshape2)
 34 | suppressMessages(library(elasticnet))
 35 | suppressMessages(library(pls))
 36 | suppressMessages(library(caret))
 37 | 
 38 | data(tecator) # from caret
 39 | ```
 40 | 
 41 | ### 1. Relationship between predictors and distributions 
 42 | 
 43 | 
 44 | ```r
 45 | # correlation
 46 | XX <- cor(absorp)
 47 | XX[1:5, 1:5]  # everything is related to everything!
 48 | ```
 49 | 
 50 | ```
 51 | ##           [,1]      [,2]      [,3]      [,4]      [,5]
 52 | ## [1,] 1.0000000 0.9999908 0.9999649 0.9999243 0.9998715
 53 | ## [2,] 0.9999908 1.0000000 0.9999916 0.9999678 0.9999309
 54 | ## [3,] 0.9999649 0.9999916 1.0000000 0.9999923 0.9999707
 55 | ## [4,] 0.9999243 0.9999678 0.9999923 1.0000000 0.9999930
 56 | ## [5,] 0.9998715 0.9999309 0.9999707 0.9999930 1.0000000
 57 | ```
 58 | 
 59 | ```r
 60 | # PCA
 61 | pca_object <- prcomp(absorp)
 62 | percent_variance <- pca_object$sdev^2/sum(pca_object$sd^2)*100
 63 | head(percent_variance)
 64 | ```
 65 | 
 66 | ```
 67 | ## [1] 98.679162750  0.900926147  0.296292185  0.114005307  0.005754017
 68 | ## [6]  0.002516023
 69 | ```
 70 | 
 71 | ```r
 72 | # Predictor distributions
 73 | ggplot(data = data.frame(absorp)) + 
 74 |   geom_histogram(aes(x = X1), bins = 20, col = 1) +
 75 |   labs(title = "Histogram of IR wavelength no. 1",
 76 |     x = "Wavelength predictor 1")  # positive skew
 77 | ```
 78 | 
 79 | ![](IR_data_61_files/figure-html/unnamed-chunk-2-1.png)\
 80 | 
 81 | ### 2. Create a training/test split
 82 | 
 83 | * 75% of the data to the training set
 84 | * The predictor variables show positive skew which Yeo-Johnson estimated lambda
 85 | (of -1 i.e. reciprocal) altered
 86 | 
 87 | 
 88 | ```r
 89 | length(endpoints[ ,1])  # how many observations?
 90 | ```
 91 | 
 92 | ```
 93 | ## [1] 215
 94 | ```
 95 | 
 96 | ```r
 97 | # create partition index
 98 | data_split <- createDataPartition(endpoints[ ,1], p = .75)
 99 | data_split <- data_split$Resample1
100 | 
101 | # split data
102 | training <- absorp[data_split, ]
103 | test <- absorp[-data_split, ]
104 | train_resp <- endpoints[data_split, 2]  # column 2 is fat content
105 | test_resp <- endpoints[-data_split, 2]
106 | 
107 | # de-skew variables
108 | training <- data.frame(training)
109 | test <- data.frame(test)
110 | proc_object <- preProcess(training, 
111 |   method = c("YeoJohnson", "center", "scale"))
112 | training <- predict(proc_object, training)
113 | test <- predict(proc_object, test)
114 | ```
115 | 
116 | ### 3. Model fitting
117 | * Linear regression  
118 |     + Unsurprisingly prior removing of highly correlated predictors resulted in a model
119 |     with only one independent variable. The performance on cross-validation was poor.
120 | * Ridge regression
121 |     + The ridge model quickly highlighted the ability to improve on the linear regression
122 |     model. However, subsequent fitting of a lasso model showed that an ability to drive
123 |     the coefficients to zero was an advantage in the highly correlated predictor environment.
124 | * The lasso and elastic net
125 |     + As noted the lasso model outperformed the ridge model. The optimal solution resulted
126 |     in a large number of the coefficient being shrunk to zero
127 |     + Enet performed similar to the lasso, with the best performing model having a
128 |     low lambda for the ridge function  
129 | * Principal components and partial least squares regression
130 |     + These both performed quite well. The similarity of the PCR model to the PLS
131 |     models is likely related to the variance in the predictors (IR response) very much 
132 |     being a consequence of the variance in the response (food fat content), thus the 
133 |     unsupervised nature of PCA causing little detriment.
134 |     + The number of principal components was tuned rather than using the first two,
135 |     or fist few that explained 90% of variance etc.
136 | 
137 | 
138 | ```r
139 | ctrl <- trainControl(method = "cv", number = 5, repeats = 5)
140 | # Linear regression
141 | mc <- findCorrelation(training, cutoff = 0.95)
142 | training_linear <- data.frame(training[ ,-mc])
143 | # colnames(training_linear) <- "X1"
144 | linear_model <- train(y = train_resp,
145 |   x = training_linear,
146 |   method = "lm",
147 |   trControl = ctrl)
148 | linear_model
149 | ```
150 | 
151 | ```
152 | ## Linear Regression 
153 | ## 
154 | ## 163 samples
155 | ##  18 predictor
156 | ## 
157 | ## No pre-processing
158 | ## Resampling: Cross-Validated (5 fold) 
159 | ## Summary of sample sizes: 131, 131, 130, 129, 131 
160 | ## Resampling results
161 | ## 
162 | ##   RMSE      Rsquared   RMSE SD    Rsquared SD
163 | ##   2.463749  0.9598019  0.5468199  0.01877194 
164 | ## 
165 | ## 
166 | ```
167 | 
168 | ```r
169 | # Ridge Regression - penalise square of coefficient
170 | ridge_model <- train(y = train_resp,
171 |   x = training,
172 |   method = "ridge",
173 |   trControl = ctrl,
174 |   tuneLength = 10)
175 | ridge_model
176 | ```
177 | 
178 | ```
179 | ## Ridge Regression 
180 | ## 
181 | ## 163 samples
182 | ## 100 predictors
183 | ## 
184 | ## No pre-processing
185 | ## Resampling: Cross-Validated (5 fold) 
186 | ## Summary of sample sizes: 131, 131, 131, 130, 129 
187 | ## Resampling results across tuning parameters:
188 | ## 
189 | ##   lambda        RMSE      Rsquared   RMSE SD    Rsquared SD
190 | ##   0.0000000000  3.739994  0.9119283  0.6066684  0.028597803
191 | ##   0.0001000000  2.629873  0.9578282  0.2248336  0.006184386
192 | ##   0.0002371374  2.757391  0.9531612  0.2228996  0.007462386
193 | ##   0.0005623413  2.845905  0.9496715  0.2116318  0.008515350
194 | ##   0.0013335214  2.929242  0.9462729  0.2067412  0.010149144
195 | ##   0.0031622777  3.097153  0.9395144  0.2453322  0.014306433
196 | ##   0.0074989421  3.459726  0.9240639  0.3541803  0.023381991
197 | ##   0.0177827941  3.993045  0.8987086  0.4894325  0.036862571
198 | ##   0.0421696503  4.607857  0.8662201  0.6188467  0.053116332
199 | ##   0.1000000000  5.414257  0.8148385  0.7583973  0.074649376
200 | ## 
201 | ## RMSE was used to select the optimal model using  the smallest value.
202 | ## The final value used for the model was lambda = 1e-04.
203 | ```
204 | 
205 | ```r
206 | plot(ridge_model)
207 | ```
208 | 
209 | ![](IR_data_61_files/figure-html/unnamed-chunk-4-1.png)\
210 | 
211 | ```r
212 | # Lasso - penalise absolute value of coeffienct
213 | lasso_grid <- expand.grid(.fraction = seq(0.001, 0.1, 0.01))
214 | lasso_model <- train(y = train_resp,
215 |   x = training,
216 |   method = "lasso",
217 |   trControl = ctrl,
218 |   tuneGrid = lasso_grid)
219 | lasso_model
220 | ```
221 | 
222 | ```
223 | ## The lasso 
224 | ## 
225 | ## 163 samples
226 | ## 100 predictors
227 | ## 
228 | ## No pre-processing
229 | ## Resampling: Cross-Validated (5 fold) 
230 | ## Summary of sample sizes: 131, 130, 130, 130, 131 
231 | ## Resampling results across tuning parameters:
232 | ## 
233 | ##   fraction  RMSE      Rsquared   RMSE SD    Rsquared SD
234 | ##   0.001     2.498027  0.9586242  0.5517407  0.016702083
235 | ##   0.011     1.706257  0.9809880  0.4545091  0.007295174
236 | ##   0.021     1.616745  0.9828973  0.4571834  0.007445864
237 | ##   0.031     1.641162  0.9822315  0.4325699  0.007161921
238 | ##   0.041     1.690999  0.9809893  0.4493865  0.008285186
239 | ##   0.051     1.745590  0.9795241  0.4873115  0.009721832
240 | ##   0.061     1.812704  0.9777559  0.5213582  0.011084715
241 | ##   0.071     1.880056  0.9759682  0.5502445  0.012306083
242 | ##   0.081     1.949916  0.9740845  0.5786471  0.013559222
243 | ##   0.091     2.013504  0.9724054  0.5906854  0.014314542
244 | ## 
245 | ## RMSE was used to select the optimal model using  the smallest value.
246 | ## The final value used for the model was fraction = 0.021.
247 | ```
248 | 
249 | ```r
250 | plot(lasso_model)
251 | ```
252 | 
253 | ![](IR_data_61_files/figure-html/unnamed-chunk-4-2.png)\
254 | 
255 | ```r
256 | # Elastic Net - combination of ridge and lasso
257 | enet_grid <- expand.grid(.fraction = seq(0.001, 0.1, 0.01), .lambda = c(0, 0.0001, 0.001, 0.01))
258 | enet_model <- train(y = train_resp,
259 |   x = training,
260 |   method = "enet",
261 |   trControl = ctrl,
262 |   tuneGrid = enet_grid)
263 | enet_model
264 | ```
265 | 
266 | ```
267 | ## Elasticnet 
268 | ## 
269 | ## 163 samples
270 | ## 100 predictors
271 | ## 
272 | ## No pre-processing
273 | ## Resampling: Cross-Validated (5 fold) 
274 | ## Summary of sample sizes: 131, 130, 130, 131, 130 
275 | ## Resampling results across tuning parameters:
276 | ## 
277 | ##   lambda  fraction  RMSE       Rsquared   RMSE SD    Rsquared SD
278 | ##   0e+00   0.001      2.456576  0.9577158  0.2733749  0.011822745
279 | ##   0e+00   0.011      1.625075  0.9823436  0.3002864  0.004490752
280 | ##   0e+00   0.021      1.542594  0.9840462  0.3650790  0.005845262
281 | ##   0e+00   0.031      1.587296  0.9828909  0.4460547  0.008348075
282 | ##   0e+00   0.041      1.597265  0.9825894  0.4840485  0.009325435
283 | ##   0e+00   0.051      1.634045  0.9816044  0.5281381  0.010634739
284 | ##   0e+00   0.061      1.700641  0.9799644  0.5649473  0.011780352
285 | ##   0e+00   0.071      1.767640  0.9783395  0.5926034  0.012665845
286 | ##   0e+00   0.081      1.824923  0.9770237  0.5923753  0.012675485
287 | ##   0e+00   0.091      1.883738  0.9755821  0.5988473  0.013059291
288 | ##   1e-04   0.001     11.596061  0.3239978  1.1803964  0.096503222
289 | ##   1e-04   0.011      9.927933  0.3579037  1.3335128  0.112225044
290 | ##   1e-04   0.021      9.356587  0.4486944  1.2675170  0.129496136
291 | ##   1e-04   0.031      8.798182  0.5326704  1.2055201  0.139155795
292 | ##   1e-04   0.041      8.252979  0.6063615  1.1468485  0.140999618
293 | ##   1e-04   0.051      7.725916  0.6679715  1.0930366  0.136610017
294 | ##   1e-04   0.061      7.216946  0.7182172  1.0439252  0.128483466
295 | ##   1e-04   0.071      6.731303  0.7581199  1.0003492  0.118015584
296 | ##   1e-04   0.081      6.276621  0.7886260  0.9627171  0.106885189
297 | ##   1e-04   0.091      5.856909  0.8118695  0.9291989  0.096011467
298 | ##   1e-03   0.001     11.829583  0.3244982  1.1872087  0.095754537
299 | ##   1e-03   0.011     10.485965  0.3258426  1.2318222  0.096750294
300 | ##   1e-03   0.021     10.055309  0.3374139  1.3522166  0.105781522
301 | ##   1e-03   0.031      9.814328  0.3759592  1.3257997  0.113461485
302 | ##   1e-03   0.041      9.575285  0.4142052  1.2999088  0.120175687
303 | ##   1e-03   0.051      9.338072  0.4517076  1.2740945  0.125751839
304 | ##   1e-03   0.061      9.103097  0.4880241  1.2482033  0.130111816
305 | ##   1e-03   0.071      8.871446  0.5226429  1.2223026  0.133169835
306 | ##   1e-03   0.081      8.642478  0.5554473  1.1968951  0.134970623
307 | ##   1e-03   0.091      8.416020  0.5863351  1.1717988  0.135674594
308 | ##   1e-02   0.001     11.899480  0.3246449  1.1857761  0.095527559
309 | ##   1e-02   0.011     10.983045  0.3247104  1.1955365  0.096268084
310 | ##   1e-02   0.021     10.357981  0.3259192  1.2344795  0.097581761
311 | ##   1e-02   0.031     10.083140  0.3294455  1.3325124  0.101116372
312 | ##   1e-02   0.041      9.970072  0.3504728  1.3452981  0.107880681
313 | ##   1e-02   0.051      9.827450  0.3731767  1.3282255  0.112164220
314 | ##   1e-02   0.061      9.685545  0.3957958  1.3113342  0.116141469
315 | ##   1e-02   0.071      9.544374  0.4182280  1.2944268  0.119774964
316 | ##   1e-02   0.081      9.404137  0.4403542  1.2778098  0.123037552
317 | ##   1e-02   0.091      9.264591  0.4621313  1.2612852  0.125902998
318 | ## 
319 | ## RMSE was used to select the optimal model using  the smallest value.
320 | ## The final values used for the model were fraction = 0.021 and lambda = 0.
321 | ```
322 | 
323 | ```r
324 | plot(enet_model)
325 | ```
326 | 
327 | ![](IR_data_61_files/figure-html/unnamed-chunk-4-3.png)\
328 | 
329 | ```r
330 | # PCR - 
331 | pcr_results <- list(results = data.frame(RMSE = NA, RMSE_sd = NA), final = NA)
332 | for (i in 1:20) {
333 |   # fit model
334 |   train_data <- princomp(training)$scores[ ,1:i]
335 |   train_data <- data.frame(train_data)
336 |   pcr_model <- train(y = train_resp,
337 |   x = train_data,
338 |   method = "lm",
339 |   trControl = ctrl)
340 |   
341 |   # extract results
342 |   pcr_results$results[i, 1] <- pcr_model$results$RMSE
343 |   pcr_results$results[i, 2] <- pcr_model$results$RMSESD
344 |   
345 |   # extract model
346 |   if (all(pcr_model$results$RMSE <= pcr_results$results$RMSE)) {
347 |     pcr_results$final <- pcr_model
348 |     }
349 | }
350 | pcr_results
351 | ```
352 | 
353 | ```
354 | ## $results
355 | ##         RMSE   RMSE_sd
356 | ## 1  10.492353 1.0992732
357 | ## 2  10.068703 1.4639494
358 | ## 3   7.348469 0.3565198
359 | ## 4   4.853879 0.8780493
360 | ## 5   2.985895 0.3555118
361 | ## 6   2.920385 0.8442764
362 | ## 7   2.974529 0.5519718
363 | ## 8   3.008025 0.3828759
364 | ## 9   2.762710 0.1345710
365 | ## 10  2.656554 0.5481411
366 | ## 11  2.443768 0.4993940
367 | ## 12  2.448595 0.3380642
368 | ## 13  2.313984 0.6340779
369 | ## 14  2.073892 0.1928407
370 | ## 15  1.955401 0.1913254
371 | ## 16  1.998280 0.4672237
372 | ## 17  1.985154 0.4377664
373 | ## 18  1.721687 0.3300310
374 | ## 19  1.755865 0.3875057
375 | ## 20  1.849941 0.3611754
376 | ## 
377 | ## $final
378 | ## Linear Regression 
379 | ## 
380 | ## 163 samples
381 | ##  18 predictor
382 | ## 
383 | ## No pre-processing
384 | ## Resampling: Cross-Validated (5 fold) 
385 | ## Summary of sample sizes: 130, 131, 130, 131, 130 
386 | ## Resampling results
387 | ## 
388 | ##   RMSE      Rsquared   RMSE SD   Rsquared SD
389 | ##   1.721687  0.9800546  0.330031  0.004523028
390 | ## 
391 | ## 
392 | ```
393 | 
394 | ```r
395 | # PLS
396 | pls_grid <- expand.grid(.ncomp = seq(10, 20, 1))
397 | pls_model <- train(y = train_resp,
398 |   x = training,
399 |   method = "pls",
400 |   trControl = ctrl,
401 |   preProcess = c("center", "scale"),
402 |   tuneGrid = pls_grid)
403 | pls_model
404 | ```
405 | 
406 | ```
407 | ## Partial Least Squares 
408 | ## 
409 | ## 163 samples
410 | ## 100 predictors
411 | ## 
412 | ## Pre-processing: centered (100), scaled (100) 
413 | ## Resampling: Cross-Validated (5 fold) 
414 | ## Summary of sample sizes: 130, 131, 131, 129, 131 
415 | ## Resampling results across tuning parameters:
416 | ## 
417 | ##   ncomp  RMSE      Rsquared   RMSE SD    Rsquared SD
418 | ##   10     2.198022  0.9652696  0.4323613  0.013561087
419 | ##   11     2.077700  0.9686279  0.2833400  0.012102779
420 | ##   12     1.941817  0.9743164  0.2371756  0.005813342
421 | ##   13     1.756872  0.9799257  0.4054312  0.005654873
422 | ##   14     1.565419  0.9838826  0.4455804  0.005821119
423 | ##   15     1.519931  0.9849073  0.3392128  0.003558747
424 | ##   16     1.500339  0.9853390  0.3783697  0.003795604
425 | ##   17     1.541301  0.9843025  0.3130718  0.002964976
426 | ##   18     1.521648  0.9846310  0.2903265  0.002670155
427 | ##   19     1.539470  0.9845291  0.2975242  0.002202640
428 | ##   20     1.612049  0.9828719  0.3038365  0.002147520
429 | ## 
430 | ## RMSE was used to select the optimal model using  the smallest value.
431 | ## The final value used for the model was ncomp = 16.
432 | ```
433 | 
434 | ### 4. Compare performance on test set
435 | * The results from fitting on the test set followed from the cross-validation 
436 | included in model fitting. Linear regression did very poorly with ridge regression 
437 | slightly worse off than the group of lasso, elastic net, PCR and PLS. 
438 | * In context the RMSE and correlation between predicted and observed results are 
439 | superb, and surely suggest that any of these models could be used in measuring the
440 | fat content of food using infrared.
441 | * Given the similarities in the model performances I was interested in constructing
442 | confidence intervals around the RMSE. A function to calculate bootstrap 
443 | estimation and its results are shown below.
444 | 
445 | 
446 | ```r
447 | # Linear regression
448 | test_linear <- data.frame(test[ ,-mc])
449 | colnames(test_linear) <- colnames(training_linear)
450 | linear_pred <- predict(linear_model, test_linear)
451 | ggplot() + geom_point(aes(x = linear_pred, y = test_resp))
452 | ```
453 | 
454 | ![](IR_data_61_files/figure-html/unnamed-chunk-5-1.png)\
455 | 
456 | ```r
457 | n <- length(test_resp)
458 | RMSE_lm <- sqrt(sum((test_resp - linear_pred)^2)/n); RMSE_lm
459 | ```
460 | 
461 | ```
462 | ## [1] 2.474259
463 | ```
464 | 
465 | ```r
466 | # Ridge regression
467 | ridge_preds <- predict(ridge_model, test)
468 | ggplot() + geom_point(aes(x = ridge_preds, y = test_resp))
469 | ```
470 | 
471 | ![](IR_data_61_files/figure-html/unnamed-chunk-5-2.png)\
472 | 
473 | ```r
474 | RMSE_ridge <- sqrt(sum((test_resp - ridge_preds)^2)/n); RMSE_ridge
475 | ```
476 | 
477 | ```
478 | ## [1] 3.214482
479 | ```
480 | 
481 | ```r
482 | # Lasso
483 | lasso_preds <- predict(lasso_model, test)
484 | ggplot() + geom_point(aes(x = lasso_preds, y = test_resp))
485 | ```
486 | 
487 | ![](IR_data_61_files/figure-html/unnamed-chunk-5-3.png)\
488 | 
489 | ```r
490 | RMSE_lasso <- sqrt(sum((test_resp - lasso_preds)^2)/n); RMSE_lasso
491 | ```
492 | 
493 | ```
494 | ## [1] 1.638846
495 | ```
496 | 
497 | ```r
498 | # Elastic net
499 | enet_preds <- predict(enet_model, test)
500 | ggplot() + geom_point(aes(x = enet_preds, y = test_resp))
501 | ```
502 | 
503 | ![](IR_data_61_files/figure-html/unnamed-chunk-5-4.png)\
504 | 
505 | ```r
506 | RMSE_enet <- sqrt(sum((test_resp - enet_preds)^2)/n); RMSE_enet
507 | ```
508 | 
509 | ```
510 | ## [1] 1.638846
511 | ```
512 | 
513 | ```r
514 | # PCR
515 | pca_train <- princomp(training)
516 | test_pcs <- predict(pca_train, test)
517 | pcr_preds <- predict(pcr_results$final, test_pcs)
518 | ggplot() + geom_point(aes(x = pcr_preds, y = test_resp))
519 | ```
520 | 
521 | ![](IR_data_61_files/figure-html/unnamed-chunk-5-5.png)\
522 | 
523 | ```r
524 | RMSE_pcr <- sqrt(sum((test_resp - pcr_preds)^2)/n); RMSE_pcr
525 | ```
526 | 
527 | ```
528 | ## [1] 1.934021
529 | ```
530 | 
531 | ```r
532 | # PLS
533 | pls_preds <- predict(pls_model, test)
534 | ggplot() + geom_point(aes(x = pls_preds, y = test_resp))
535 | ```
536 | 
537 | ![](IR_data_61_files/figure-html/unnamed-chunk-5-6.png)\
538 | 
539 | ```r
540 | RMSE_pls <- sqrt(sum((test_resp - pls_preds)^2)/n); RMSE_pls
541 | ```
542 | 
543 | ```
544 | ## [1] 1.69883
545 | ```
546 | 
547 | ```r
548 | cor(pls_preds, test_resp)
549 | ```
550 | 
551 | ```
552 | ## [1] 0.993344
553 | ```
554 | 
555 | ### 4. Compare performance on test set contd. 
556 | * Bootstrap estimate of RMSE confidence interval
557 |     + PLS appears to be the prefered model, it shows the least variation in its RMSE scores
558 |     across the bootstrap samples. PCR is likely similar, an issue with variable naming meant
559 |     I excluded it.
560 | 
561 | 
562 | ```r
563 | boostrap_RMSE <- function(model, data, obs, trials = 1000, CI = 0.95) {
564 |   
565 |   n <- nrow(data)
566 |   out <- list(results = data.frame(RMSE = NA), lower = NA, upper = NA)
567 |   
568 |   for (i in 1:trials) {
569 |     # create bootstrap sample
570 |     samp <- sample(n, size = n, replace = TRUE)
571 |     boot_obs <- obs[samp]
572 |     boot_data <- data.frame(data[samp, ])
573 |     colnames(boot_data) <- colnames(data)
574 |     # predict
575 |     preds <- predict(model, newdata = boot_data)
576 |     RMSE <- sqrt(sum((boot_obs - preds)^2)/n)
577 |     
578 |     out$results[i ,1] <- RMSE
579 |   }
580 |   
581 |   temp <- out$results$RMSE
582 |   temp <- quantile(temp, probs = c(0.025, 0.975), na.rm = TRUE)
583 |   
584 |   out$lower <- temp[1]
585 |   out$upper <- temp[2]
586 |   
587 |   out
588 | }
589 | ```
590 | 
591 | 
592 | 
593 | ```r
594 | # The bootstrap results
595 | bRMSE_lm <- boostrap_RMSE(linear_model, test_linear, test_resp)
596 | bRMSE_ridge <- boostrap_RMSE(ridge_model, test, test_resp)
597 | bRMSE_lasso <- boostrap_RMSE(lasso_model, test, test_resp)
598 | bRMSE_enet <- boostrap_RMSE(enet_model, test, test_resp)
599 | # bRMSE_pcr <- boostrap_RMSE(pcr_model, test, test_resp)
600 | bRMSE_pls <- boostrap_RMSE(pls_model, test, test_resp)
601 | 
602 | model_results <- data.frame(bRMSE_lm$results, bRMSE_ridge$results,
603 |   bRMSE_lasso$results, bRMSE_enet$results, bRMSE_pls$results)
604 | colnames(model_results) <- c('lm', 'ridge', 'lasso', 'enet', 'pls')
605 | 
606 | temp <- melt(model_results)
607 | ```
608 | 
609 | ```
610 | ## No id variables; using all as measure variables
611 | ```
612 | 
613 | ```r
614 | ggplot(data = temp, aes(x = variable, y = value)) + 
615 |   geom_boxplot(width = 0.5) + 
616 |   theme_bw() + 
617 |   labs(title = 'Bootstrap Estimates of Model Performance',
618 |     x = 'Model',
619 |     y = 'RMSE')
620 | ```
621 | 
622 | ![](IR_data_61_files/figure-html/unnamed-chunk-7-1.png)\
623 | 
624 | ### Conclusion
625 | 
626 | * The clear signal in the data meant that despite multicollinarity issues several 
627 | linear model fitting methods had no problem producing extremely predictive models. 
628 | * The predictors were highly correlated and likely possessed variations on same 
629 | information. Therefore possibly as a result of their ability to extract the minimal 
630 | dimension signal from several correlated variables PCR and PLS appear to have a slight 
631 | performance advantage over other models.
632 | 


--------------------------------------------------------------------------------