├── .Rbuildignore ├── .gitignore ├── AppliedPredictiveModeling.Rproj ├── DESCRIPTION ├── NAMESPACE ├── R ├── bookTheme.R ├── data.R ├── getPackages.R ├── panels.R ├── permuteRelief.R ├── quadBoundaryFunc.R └── scriptLocation.R ├── README.md ├── data ├── AlzheimerDisease.RData ├── ChemicalManufacturingProcess.RData ├── FuelEconomy.RData ├── abalone.RData ├── concrete.RData ├── hepatic.RData ├── logisticCreditPredictions.RData ├── permeability.RData ├── schedulingData.RData ├── segmentationOriginal.RData ├── solubility.RData └── twoClassData.RData ├── inst ├── NEWS.Rd └── chapters │ ├── 02_A_Short_Tour.R │ ├── 02_A_Short_Tour.Rout │ ├── 03_Data_Pre_Processing.R │ ├── 03_Data_Pre_Processing.Rout │ ├── 04_Over_Fitting.R │ ├── 04_Over_Fitting.Rout │ ├── 06_Linear_Regression.R │ ├── 06_Linear_Regression.Rout │ ├── 07_Non-Linear_Reg.R │ ├── 07_Non-Linear_Reg.Rout │ ├── 08_Regression_Trees.R │ ├── 08_Regression_Trees.Rout │ ├── 10_Case_Study_Concrete.R │ ├── 10_Case_Study_Concrete.Rout │ ├── 11_Class_Performance.R │ ├── 11_Class_Performance.Rout │ ├── 12_Discriminant_Analysis.R │ ├── 12_Discriminant_Analysis.Rout │ ├── 13_Non-Linear_Class.R │ ├── 13_Non-Linear_Class.Rout │ ├── 14_Class_Trees.R │ ├── 14_Class_Trees.Rout │ ├── 16_Class_Imbalance.R │ ├── 16_Class_Imbalance.Rout │ ├── 17_Job_Scheduling.R │ ├── 17_Job_Scheduling.Rout │ ├── 18_Importance.R │ ├── 18_Importance.Rout │ ├── 19_Feature_Select.R │ ├── 19_Feature_Select.Rout │ ├── CreateGrantData.R │ └── CreateGrantData.Rout └── man ├── AlzheimerDisease.Rd ├── ChemicalManufacturingProcess.Rd ├── FuelEconomy.Rd ├── Hepatic.Rd ├── abalone.Rd ├── apm-internal.Rd ├── bookTheme.Rd ├── concrete.Rd ├── getPackages.Rd ├── logisticCreditPredictions.Rd ├── permeability.Rd ├── permuteRelief.Rd ├── quadBoundaryFunc.Rd ├── schedulingData.Rd ├── scriptLocation.Rd ├── segmentationOriginal.Rd ├── solubility.Rd └── twoClassData.Rd /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # History files 2 | .Rhistory 3 | .Rapp.history 4 | 5 | # Session Data files 6 | .RData 7 | 8 | # Example code in package build process 9 | *-Ex.R 10 | 11 | # Output files from R CMD build 12 | /*.tar.gz 13 | 14 | # Output files from R CMD check 15 | /*.Rcheck/ 16 | 17 | # RStudio files 18 | .Rproj.user/ 19 | 20 | # produced vignettes 21 | vignettes/*.html 22 | vignettes/*.pdf 23 | 24 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 25 | .httr-oauth 26 | 27 | # knitr and R markdown default cache directories 28 | /*_cache/ 29 | /cache/ 30 | 31 | # Temporary files created by R markdown 32 | *.utf8.md 33 | *.knit.md 34 | 35 | # Shiny token, see https://shiny.rstudio.com/articles/shinyapps.html 36 | rsconnect/ 37 | inst/.DS_Store 38 | -------------------------------------------------------------------------------- /AppliedPredictiveModeling.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | BuildType: Package 16 | PackageUseDevtools: Yes 17 | PackageInstallArgs: --no-multiarch --with-keep.source 18 | 
PackageRoxygenize: rd,collate,namespace 19 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: AppliedPredictiveModeling 2 | Type: Package 3 | Title: Functions and Data Sets for 'Applied Predictive Modeling' 4 | Version: 1.1.9000 5 | Authors@R: c( 6 | person("Max", "Kuhn", , "mxkuhn@gmail.com", role = c("aut", "cre")), 7 | person("Kjell", "Johnson", role = c("aut")) 8 | ) 9 | Description: A few functions and several data sets for the Springer book 10 | 'Applied Predictive Modeling'. 11 | URL: http://appliedpredictivemodeling.com/ 12 | Depends: R (>= 2.10) 13 | Imports: CORElearn, MASS, lattice, ellipse, reshape2 14 | Suggests: caret (>= 6.0-22) 15 | License: GPL-2 16 | RoxygenNote: 7.2.3 17 | Encoding: UTF-8 18 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(bookTheme) 4 | export(easyBoundaryFunc) 5 | export(getPackages) 6 | export(permuteRelief) 7 | export(quadBoundaryFunc) 8 | export(scriptLocation) 9 | export(transparentTheme) 10 | -------------------------------------------------------------------------------- /R/bookTheme.R: -------------------------------------------------------------------------------- 1 | #' Lattice Themes 2 | #' 3 | #' Two \pkg{lattice} themes used throughout the book. 4 | #' 5 | #' When using these functions to save a plot, make sure to invoke them after 6 | #' the device has been opened (e.g. after calls such as \code{pdf()}). 7 | #' 8 | #' @aliases bookTheme transparentTheme 9 | #' @param set a logical: should these settings be applied to the current 10 | #' device? 11 | #' @param pchSize the size of the plot symbols 12 | #' @param trans the amount of transparency (via the alpha channel). Note that 13 | #' transparency is not supported by all graphics devices. 14 | #' @return Each function returns a list of theme parameters. See Sarkar (2008) 15 | #' or \code{\link[lattice]{trellis.par.get}} for specific details. 16 | #' @author Max Kuhn 17 | #' @references Some of the colors are based on values from ColorBrewer 18 | #' \url{http://www.colorbrewer.org}. 19 | #' 20 | #' Sarkar, D. (2008). Lattice: Multivariate Data Visualization with R. UseR! 21 | #' (1st ed. p. 286). Springer.
22 | #' @keywords hplot 23 | #' @examples 24 | #' 25 | #' library(lattice) 26 | #' 27 | #' example <- quadBoundaryFunc(100) 28 | #' 29 | #' bookTheme(set = TRUE) 30 | #' xyplot(X2 ~ X1, data = example, groups = class, auto.key = TRUE) 31 | #' 32 | #' transparentTheme(set = TRUE, trans = .6) 33 | #' xyplot(X2 ~ X1, data = example, groups = class, auto.key = TRUE) 34 | #' 35 | #' @export bookTheme 36 | bookTheme <- function(set = TRUE){ 37 | theme <- list( 38 | plot.polygon = list(alpha = 1, col = "aliceblue", border = "black", lty = 1, lwd = 1), 39 | background = list(col = "transparent"), 40 | bar.fill = list(col = "#cce6ff"), 41 | box.rectangle = list(col = "black"), 42 | box.umbrella = list(col = "black"), 43 | dot.line = list(col = "#e8e8e8"), 44 | dot.symbol = list(col = "black"), 45 | plot.line = list(col = "black", lwd = 1, lty = 1), 46 | plot.symbol = list(col = "black", pch = 16), 47 | regions = list(col = 48 | c("#FEF8FA", "#FDF6F9", "#FBF5F9", "#FAF3F8", 49 | "#F8F2F7", "#F7F0F7", "#F5EEF6", "#F4EDF5", 50 | "#F2EBF5", "#F1EAF4", "#EFE8F3", "#EDE7F2", 51 | "#ECE5F1", "#EAE4F1", "#E8E2F0", "#E6E1EF", 52 | "#E4DFEE", "#E2DEED", "#E0DCEC", "#DEDAEB", 53 | "#DCD9EA", "#D9D7E9", "#D7D6E8", "#D4D4E7", 54 | "#D1D2E6", "#CED1E5", "#CCCFE4", "#C8CEE3", 55 | "#C5CCE2", "#C2CAE1", "#BFC9E0", "#BBC7DF", 56 | "#B8C5DF", "#B4C4DE", "#B1C2DD", "#ADC0DC", 57 | "#A9BFDB", "#A6BDDA", "#A2BBD9", "#9EB9D9", 58 | "#9BB8D8", "#97B6D7", "#93B4D6", "#8FB2D5", 59 | "#8BB0D4", "#87AFD3", "#83ADD2", "#7FABD1", 60 | "#7AA9D0", "#76A7CF", "#71A5CE", "#6CA3CC", 61 | "#68A1CB", "#63A0CA", "#5D9EC9", "#589CC8", 62 | "#539AC6", "#4E98C5", "#4996C4", "#4493C3", 63 | "#3F91C1", "#3A8FC0", "#358DBF", "#308BBE", 64 | "#2C89BD", "#2887BC", "#2385BB", "#1F83BA", 65 | "#1C80B9", "#187EB7", "#157CB6", "#127AB5", 66 | "#0F78B3", "#0D76B2", "#0A73B0", "#0971AE", 67 | "#076FAC", "#066DAA", "#056AA7", "#0568A5") 68 | ), 69 | strip.shingle = list(col = c( 70 | "#ff7f00", "#00ff00", "#00ffff", 71 | "#ff00ff", "#ff0000", "#ffff00", "#0080ff") 72 | ), 73 | strip.background = list(col = c( 74 | "#ffe5cc", "#ccffcc", "#ccffff", 75 | "#ffccff", "#ffcccc", "#ffffcc", "#cce6ff") 76 | ), 77 | reference.line = list(col = "#e8e8e8"), 78 | superpose.line = list( 79 | col = c( 80 | "#053061", "#B2182B", "#F46D43", "#5E4FA2", "#66C2A5", "black", 81 | "#053061", "#B2182B", "#F46D43", "#5E4FA2", "#66C2A5", "black", 82 | "#053061", "#B2182B", "#F46D43", "#5E4FA2", "#66C2A5", "black", 83 | "#053061", "#B2182B", "#F46D43", "#5E4FA2", "#66C2A5", "black", 84 | "#053061", "#B2182B", "#F46D43", "#5E4FA2", "#66C2A5", "black", 85 | "#053061", "#B2182B", "#F46D43", "#5E4FA2", "#66C2A5", "black" 86 | ), 87 | lty = rep(1:6, each = 6)), 88 | superpose.symbol = list( 89 | pch = c( 90 | 1, 4, 6, 0, 5, 17, 91 | 4, 6, 0, 5, 17, 1, 92 | 6, 0, 5, 17, 1, 4, 93 | 0, 5, 17, 1, 4, 6, 94 | 5, 17, 1, 4, 6, 0 , 95 | 17, 1, 4, 6, 0, 5), 96 | cex = rep(0.7, 6 * 6), 97 | col = c( 98 | "#053061", "#B2182B", "#F46D43", "#5E4FA2", "#66C2A5", "black", 99 | "#053061", "#B2182B", "#F46D43", "#5E4FA2", "#66C2A5", "black", 100 | "#053061", "#B2182B", "#F46D43", "#5E4FA2", "#66C2A5", "black", 101 | "#053061", "#B2182B", "#F46D43", "#5E4FA2", "#66C2A5", "black", 102 | "#053061", "#B2182B", "#F46D43", "#5E4FA2", "#66C2A5", "black", 103 | "#053061", "#B2182B", "#F46D43", "#5E4FA2", "#66C2A5", "black" 104 | ) 105 | ) 106 | ) 107 | 108 | if(set) lattice::trellis.par.set(theme) 109 | invisible(theme) 110 | } 111 | 112 | #' @rdname bookTheme 113 | #' @export 114 | transparentTheme <- 
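## Same layout and region palette as bookTheme(), but the superpose line and symbol colors use alpha transparency (the level is set by the 'trans' argument) so that overplotted points remain visible.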
115 | function(set = TRUE, pchSize = 1, trans = .2) { 116 | theme <- list( 117 | plot.polygon = list(alpha = 1, col = "aliceblue", border = "black", lty = 1, lwd = 1), 118 | background = list(col = "transparent"), 119 | bar.fill = list(col = "#cce6ff"), 120 | box.rectangle = list(col = "black"), 121 | box.umbrella = list(col = "black"), 122 | dot.line = list(col = "#e8e8e8"), 123 | dot.symbol = list(col = "black"), 124 | plot.line = list(col = "black"), 125 | plot.symbol = list(col = "black"), 126 | regions = list(col = 127 | c("#FEF8FA", "#FDF6F9", "#FBF5F9", "#FAF3F8", 128 | "#F8F2F7", "#F7F0F7", "#F5EEF6", "#F4EDF5", 129 | "#F2EBF5", "#F1EAF4", "#EFE8F3", "#EDE7F2", 130 | "#ECE5F1", "#EAE4F1", "#E8E2F0", "#E6E1EF", 131 | "#E4DFEE", "#E2DEED", "#E0DCEC", "#DEDAEB", 132 | "#DCD9EA", "#D9D7E9", "#D7D6E8", "#D4D4E7", 133 | "#D1D2E6", "#CED1E5", "#CCCFE4", "#C8CEE3", 134 | "#C5CCE2", "#C2CAE1", "#BFC9E0", "#BBC7DF", 135 | "#B8C5DF", "#B4C4DE", "#B1C2DD", "#ADC0DC", 136 | "#A9BFDB", "#A6BDDA", "#A2BBD9", "#9EB9D9", 137 | "#9BB8D8", "#97B6D7", "#93B4D6", "#8FB2D5", 138 | "#8BB0D4", "#87AFD3", "#83ADD2", "#7FABD1", 139 | "#7AA9D0", "#76A7CF", "#71A5CE", "#6CA3CC", 140 | "#68A1CB", "#63A0CA", "#5D9EC9", "#589CC8", 141 | "#539AC6", "#4E98C5", "#4996C4", "#4493C3", 142 | "#3F91C1", "#3A8FC0", "#358DBF", "#308BBE", 143 | "#2C89BD", "#2887BC", "#2385BB", "#1F83BA", 144 | "#1C80B9", "#187EB7", "#157CB6", "#127AB5", 145 | "#0F78B3", "#0D76B2", "#0A73B0", "#0971AE", 146 | "#076FAC", "#066DAA", "#056AA7", "#0568A5") 147 | ), 148 | strip.shingle = list(col = c( 149 | "#ff7f00", "#00ff00", "#00ffff", 150 | "#ff00ff", "#ff0000", "#ffff00", "#0080ff")), 151 | strip.background = list(col = c( 152 | "#ffe5cc", "#ccffcc", "#ccffff", 153 | "#ffccff", "#ffcccc", "#ffffcc", "#cce6ff")), 154 | reference.line = list(col = "#e8e8e8"), 155 | superpose.line = list( 156 | col = c( 157 | grDevices::rgb(1, 0, 0, trans), grDevices::rgb(0, 0, 1, trans), 158 | grDevices::rgb(0.3984375, 0.7578125, 0.6445312, max(.6, trans)), 159 | grDevices::rgb(0, 0, 0, trans)), 160 | lty = rep(1:2, 6)), 161 | superpose.symbol = list( 162 | pch = c(16, 15, 17, 18, 16), 163 | cex = rep(pchSize, 5), 164 | col = c( 165 | grDevices::rgb(1, 0, 0, trans), grDevices::rgb(0, 0, 1, trans), 166 | grDevices::rgb(0.3984375, 0.7578125, 0.6445312, max(.6, trans)), 167 | grDevices::rgb(0, 0, 0, trans))) 168 | ) 169 | 170 | if(set) lattice::trellis.par.set(theme, warn = FALSE) 171 | invisible(theme) 172 | } 173 | -------------------------------------------------------------------------------- /R/getPackages.R: -------------------------------------------------------------------------------- 1 | #' Install Packages for Each Chapter 2 | #' 3 | #' This function identifies the physical location on the user's computer where 4 | #' the chapter R scripts are located. 5 | #' 6 | #' Chapter names and packages. about dependencies. 7 | #' 8 | #' @param chapter an integer vector (or character versions of the integer) for 9 | #' the chapter number. See Details below: 10 | #' @param ... options to pass to \code{\link[utils]{install.packages}} 11 | #' @author Max Kuhn 12 | #' @keywords utilities 13 | #' @examples 14 | #' 15 | #' \dontrun{ 16 | #' getPackages(2) 17 | #' getPackages(2:3) 18 | #' getPackages("4") 19 | #' } 20 | #' 21 | #' @export getPackages 22 | getPackages <- function(chapter, ...) 
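## Installs the CRAN packages needed to run the chapter scripts: the requested chapter number(s) are matched against the per-chapter lists below and the combined set is passed to utils::install.packages().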
{ 23 | if(is.numeric(chapter)) 24 | chapter <- paste(chapter) 25 | pkg <- list() 26 | pkg[["2"]] <- c("earth", "caret", "lattice") 27 | pkg[["3"]] <- c("e1071", "caret", "corrplot") 28 | pkg[["4"]] <- c("kernlab", "caret") 29 | pkg[["6"]] <- c("lattice", "corrplot", "pls", "elasticnet") 30 | pkg[["7"]] <- c("caret", "earth", "kernlab","lattice", "nnet") 31 | pkg[["8"]] <- c("caret", "Cubist", "gbm", "lattice", "party", "partykit", 32 | "randomForest", "rpart", "RWeka") 33 | pkg[["10"]] <- c("caret", "Cubist", "earth", "elasticnet", "gbm", "ipred", 34 | "lattice", "nnet", "party","pls", "randomForests", "rpart", 35 | "RWeka") 36 | pkg[["11"]] <- c("caret", "MASS", "randomForest", "pROC", "klaR") 37 | pkg[["12"]] <- c("caret", "glmnet", "lattice", 38 | "MASS", "pamr", "pls", "pROC", "sparseLDA") 39 | pkg[["13"]] <- c("caret", "kernlab", "klaR", "lattice", "latticeExtra", 40 | "MASS", "mda", "nnet", "pROC") 41 | pkg[["14"]] <- c("C50", "caret", "gbm", "lattice", "partykit", "pROC", 42 | "randomForest", "reshape2", 43 | "rpart", "RWeka") 44 | pkg[["16"]] <- c("caret", "C50", "earth", "DMwR", "DWD", " kernlab", "mda", 45 | "pROC", "randomForest", "rpart") 46 | pkg[["17"]] <- c("C50", "caret", "earth", "Hmisc", "ipred", "tabplot", 47 | "kernlab", "lattice", "MASS", "mda", "nnet", "pls", 48 | "randomForest", "rpart", "sparseLDA") 49 | pkg[["18"]] <- c("caret", "CORElearn", "corrplot", "pROC", "minerva") 50 | pkg[["19"]] <- c("caret", "MASS", "corrplot", "RColorBrewer", "randomForest", 51 | "kernlab", "klaR") 52 | plist <- 53 | paste(paste("'", names(pkg), "'", sep = ""), collapse = ", ") 54 | if (!any(chapter %in% names(pkg))) 55 | stop(paste("'chapter' must be: ", 56 | paste(plist, collapse = ", "))) 57 | 58 | pkg <- unlist(pkg[chapter]) 59 | pkg <- pkg[!is.na(pkg)] 60 | pkg <- pkg[pkg != ""] 61 | pkg <- pkg[order(tolower(pkg))] 62 | 63 | utils::install.packages(pkg, ...) 64 | } 65 | -------------------------------------------------------------------------------- /R/panels.R: -------------------------------------------------------------------------------- 1 | upperp <- function(...) { 2 | args <- list(...) 3 | circ1 <- ellipse::ellipse(diag(rep(1, 2)), t = .1) 4 | lattice::panel.xyplot( 5 | circ1[, 1], 6 | circ1[, 2], 7 | type = "l", 8 | lty = lattice::trellis.par.get("reference.line")$lty, 9 | col = lattice::trellis.par.get("reference.line")$col, 10 | lwd = lattice::trellis.par.get("reference.line")$lwd 11 | ) 12 | circ2 <- ellipse::ellipse(diag(rep(1, 2)), t = .2) 13 | lattice::panel.xyplot( 14 | circ2[, 1], 15 | circ2[, 2], 16 | type = "l", 17 | lty = lattice::trellis.par.get("reference.line")$lty, 18 | col = lattice::trellis.par.get("reference.line")$col, 19 | lwd = lattice::trellis.par.get("reference.line")$lwd 20 | ) 21 | circ3 <- ellipse::ellipse(diag(rep(1, 2)), t = .3) 22 | lattice::panel.xyplot( 23 | circ3[, 1], 24 | circ3[, 2], 25 | type = "l", 26 | lty = lattice::trellis.par.get("reference.line")$lty, 27 | col = lattice::trellis.par.get("reference.line")$col, 28 | lwd = lattice::trellis.par.get("reference.line")$lwd 29 | ) 30 | lattice::panel.xyplot(args$x, 31 | args$y, 32 | groups = args$groups, 33 | subscripts = args$subscripts) 34 | } 35 | 36 | lowerp <- function(...) 
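## Intentionally empty: meant to be paired with upperp() (e.g. as the lower.panel argument of lattice::splom()) so that only the upper triangle of panels displays data.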
{ 37 | 38 | } 39 | -------------------------------------------------------------------------------- /R/permuteRelief.R: -------------------------------------------------------------------------------- 1 | #' Permutation Statistics for the Relief Algorithm 2 | #' 3 | #' This function uses a permutation approach to determining the relative 4 | #' magnitude of Relief scores (Kira and Rendell, 1992 and Kononenko, 1994). 5 | #' 6 | #' The scores for each predictor are computed using the original data and after 7 | #' outcome data are randomly scrambled (\code{nprem} times). The mean and 8 | #' standard deviation of the permuted values are determined and a standardized 9 | #' version of the observed scores are determined by subtracting the permuted 10 | #' means from the original values, then dividing each by the corresponding 11 | #' standard deviation. 12 | #' 13 | #' @param x a data frame of predictor data 14 | #' @param y a vector of outcomes 15 | #' @param nperm the number of random permutations of the data 16 | #' @param \dots options to pass to \code{\link[CORElearn]{attrEval}}, such as 17 | #' the exact Relief algorithm, to use 18 | #' @return a list with elements \item{standardized }{a vector of standardized 19 | #' predictor scores} \item{permutations }{the values of the permuted scores, 20 | #' for plotting to assess the permutation distribution} \item{observed}{the 21 | #' observed scores} \item{options}{a list of options passed using \ldots{}} 22 | #' @author Max Kuhn 23 | #' @seealso \code{\link[CORElearn]{attrEval}} 24 | #' @references Kira, K., & Rendell, L. (1992). The feature selection problem: 25 | #' Traditional methods and a new algorithm. \emph{Proceedings of the Eleventh 26 | #' International Conference on Machine Learning}, 129-129. 27 | #' 28 | #' Kononenko, I. (1994). Estimating attributes: analysis and extensions of 29 | #' RELIEF. Machine Learning: ECML-94, 171-182. 30 | #' @keywords htest 31 | #' @examples 32 | #' 33 | #' set.seed(874) 34 | #' reliefEx3 <- easyBoundaryFunc(500) 35 | #' reliefEx3$X1 <- scale(reliefEx3$X1) 36 | #' reliefEx3$X2 <- scale(reliefEx3$X2) 37 | #' reliefEx3$prob <- NULL 38 | #' 39 | #' standardized <- permuteRelief(reliefEx3[, 1:2], reliefEx3$class, 40 | #' ## For efficiency, a small number of 41 | #' ## permutations are used here. 42 | #' nperm = 50, 43 | #' estimator="ReliefFequalK", 44 | #' ReliefIterations= 50) 45 | #' 46 | #' 47 | #' @export permuteRelief 48 | permuteRelief <- 49 | function(x, y, nperm = 100, ...) { 50 | dat <- x 51 | dat$y <- y 52 | 53 | obs <- CORElearn::attrEval(y ~ ., data = dat, ...) 54 | permuted <- matrix(NA, ncol = length(obs), nrow = nperm) 55 | colnames(permuted) <- names(obs) 56 | for (i in 1:nperm) { 57 | dat$y <- sample(y) 58 | permuted[i,] <- CORElearn::attrEval(y ~ ., data = dat, ...) 59 | } 60 | means <- colMeans(permuted) 61 | sds <- apply(permuted, 2, stats::sd) 62 | permuted <- reshape2::melt(permuted) 63 | names(permuted)[2] <- "Predictor" 64 | permuted$X1 <- NULL 65 | list( 66 | standardized = (obs - means) / sds, 67 | permutations = permuted, 68 | observed = obs, 69 | options = list(...) 70 | ) 71 | } 72 | -------------------------------------------------------------------------------- /R/quadBoundaryFunc.R: -------------------------------------------------------------------------------- 1 | #' Functions for Simulating Data 2 | #' 3 | #' These functions simulate data that are used in the text. 
4 | #' 5 | #' The \code{quadBoundaryFunc} function creates a class boundary that is a 6 | #' function of both predictors. The probability values are based on a logistic 7 | #' regression model with model equation: \eqn{-1-2X_1 -0.2X_1^2 + 8 | #' 2X_2^2}{-1-2*X1 -0.2*X1^2 + 2*X2^2}. The predictors here are multivariate 9 | #' normal with mean (1, 0) and a moderate degree of positive correlation. 10 | #' 11 | #' Similarly, the \code{easyBoundaryFunc} uses a logistic regression model with 12 | #' model equation: \eqn{intercept -4X_1 + 4X_2 + interaction \times X_1 \times 13 | #' X_2}{intercept -4*X1 + 4*X2 + interaction*X1*X2}. The predictors here are 14 | #' multivariate normal with mean (1, 0) and a strong positive correlation. 15 | #' 16 | #' @aliases quadBoundaryFunc easyBoundaryFunc 17 | #' @param n the sample size 18 | #' @param intercept the coefficient for the logistic regression intercept term 19 | #' @param interaction the coefficient for the logistic regression interaction 20 | #' term 21 | #' @return Both functions return data frames with columns \item{X1}{numeric 22 | #' predictor value} \item{X2}{numeric predictor value} \item{prob }{numeric 23 | #' value reflecting the true probability of the first class} \item{class }{a 24 | #' factor variable with levels 'Class1' and 'Class2'} 25 | #' @author Max Kuhn 26 | #' @keywords utilities 27 | #' @examples 28 | #' 29 | #' ## in Chapter 11, 'Measuring Performance in Classification Model' 30 | #' set.seed(975) 31 | #' training <- quadBoundaryFunc(500) 32 | #' testing <- quadBoundaryFunc(1000) 33 | #' 34 | #' 35 | #' ## in Chapter 20, 'Factors That Can Affect Model Performance' 36 | #' set.seed(615) 37 | #' dat <- easyBoundaryFunc(200, interaction = 3, intercept = 3) 38 | #' dat$X1 <- scale(dat$X1) 39 | #' dat$X2 <- scale(dat$X2) 40 | #' dat$Data <- "Original" 41 | #' dat$prob <- NULL 42 | #' 43 | #' ## in Chapter X, 'An Introduction to Feature Selection' 44 | #' 45 | #' set.seed(874) 46 | #' reliefEx3 <- easyBoundaryFunc(500) 47 | #' reliefEx3$X1 <- scale(reliefEx3$X1) 48 | #' reliefEx3$X2 <- scale(reliefEx3$X2) 49 | #' reliefEx3$prob <- NULL 50 | #' 51 | #' 52 | #' @export quadBoundaryFunc 53 | quadBoundaryFunc <- function(n) { 54 | sigma <- matrix(c(1, .7, .7, 2), 2, 2) 55 | 56 | tmpData <- data.frame(MASS::mvrnorm(n = n, c(1, 0), sigma)) 57 | xSeq <- seq(-4, 4, length = 40) 58 | plotGrid <- expand.grid(x = xSeq, y = xSeq) 59 | zFoo <- function(x, y) 60 | - 1 - 2 * x - 0 * y - .2 * x ^ 2 + 2 * y ^ 2 61 | z2p <- function(x) 62 | 1 / (1 + exp(-x)) 63 | 64 | tmpData$prob <- z2p(zFoo(tmpData$X1, tmpData$X2)) 65 | tmpData$class <- 66 | factor(ifelse(stats::runif(length(tmpData$prob)) <= tmpData$prob, "Class1", "Class2")) 67 | tmpData 68 | } 69 | 70 | 71 | #' @rdname quadBoundaryFunc 72 | #' @export 73 | easyBoundaryFunc <- function(n, 74 | intercept = 0, 75 | interaction = 2) { 76 | sigma <- matrix(c(2, 1.3, 1.3, 2), 2, 2) 77 | 78 | tmpData <- data.frame(MASS::mvrnorm(n = n, c(0, 0), sigma)) 79 | xSeq <- seq(-4, 4, length = 40) 80 | plotGrid <- expand.grid(x = xSeq, y = xSeq) 81 | zFoo <- function(x, y) 82 | intercept - 4 * x + 4 * y + interaction * x * y 83 | z2p <- function(x) 84 | 1 / (1 + exp(-x)) 85 | 86 | tmpData$prob <- z2p(zFoo(tmpData$X1, tmpData$X2)) 87 | tmpData$class <- 88 | factor(ifelse(stats::runif(length(tmpData$prob)) <= tmpData$prob, "Class1", "Class2")) 89 | tmpData 90 | } 91 | -------------------------------------------------------------------------------- /R/scriptLocation.R: 
-------------------------------------------------------------------------------- 1 | #' Find Chapter Script Files 2 | #' 3 | #' This function identifies the physical location on the user's computer where 4 | #' the chapter R scripts are located. 5 | #' 6 | #' 7 | #' @author Max Kuhn 8 | #' @keywords utilities 9 | #' @examples 10 | #' 11 | #' scriptLocation() 12 | #' 13 | #' @export scriptLocation 14 | scriptLocation <- 15 | function() 16 | system.file("chapters", package = "AppliedPredictiveModeling") 17 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # AppliedPredictiveModeling 2 | 3 | Data and code from Applied Predictive Modeling (2013) 4 | -------------------------------------------------------------------------------- /data/AlzheimerDisease.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/topepo/AppliedPredictiveModeling/9a3717b2dcd040af8a674b57f4fe90e7995d5002/data/AlzheimerDisease.RData -------------------------------------------------------------------------------- /data/ChemicalManufacturingProcess.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/topepo/AppliedPredictiveModeling/9a3717b2dcd040af8a674b57f4fe90e7995d5002/data/ChemicalManufacturingProcess.RData -------------------------------------------------------------------------------- /data/FuelEconomy.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/topepo/AppliedPredictiveModeling/9a3717b2dcd040af8a674b57f4fe90e7995d5002/data/FuelEconomy.RData -------------------------------------------------------------------------------- /data/abalone.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/topepo/AppliedPredictiveModeling/9a3717b2dcd040af8a674b57f4fe90e7995d5002/data/abalone.RData -------------------------------------------------------------------------------- /data/concrete.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/topepo/AppliedPredictiveModeling/9a3717b2dcd040af8a674b57f4fe90e7995d5002/data/concrete.RData -------------------------------------------------------------------------------- /data/hepatic.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/topepo/AppliedPredictiveModeling/9a3717b2dcd040af8a674b57f4fe90e7995d5002/data/hepatic.RData -------------------------------------------------------------------------------- /data/logisticCreditPredictions.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/topepo/AppliedPredictiveModeling/9a3717b2dcd040af8a674b57f4fe90e7995d5002/data/logisticCreditPredictions.RData -------------------------------------------------------------------------------- /data/permeability.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/topepo/AppliedPredictiveModeling/9a3717b2dcd040af8a674b57f4fe90e7995d5002/data/permeability.RData -------------------------------------------------------------------------------- /data/schedulingData.RData: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/topepo/AppliedPredictiveModeling/9a3717b2dcd040af8a674b57f4fe90e7995d5002/data/schedulingData.RData -------------------------------------------------------------------------------- /data/segmentationOriginal.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/topepo/AppliedPredictiveModeling/9a3717b2dcd040af8a674b57f4fe90e7995d5002/data/segmentationOriginal.RData -------------------------------------------------------------------------------- /data/solubility.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/topepo/AppliedPredictiveModeling/9a3717b2dcd040af8a674b57f4fe90e7995d5002/data/solubility.RData -------------------------------------------------------------------------------- /data/twoClassData.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/topepo/AppliedPredictiveModeling/9a3717b2dcd040af8a674b57f4fe90e7995d5002/data/twoClassData.RData -------------------------------------------------------------------------------- /inst/NEWS.Rd: -------------------------------------------------------------------------------- 1 | \name{NEWS} 2 | \title{News for Package \pkg{AppliedPredictiveModeling}} 3 | \newcommand{\cpkg}{\href{https://cran.r-project.org/package=#1}{\pkg{#1}}} 4 | 5 | 6 | \section{Changes in version 1.1-6}{ 7 | 8 | \itemize{ 9 | \item The file \code{CreateGrantData.R} was updated to include code to create the objects \code{factorPredictors} and \code{factorForm}. 10 | 11 | } 12 | } 13 | 14 | \section{Changes in version 1.1-5}{ 15 | 16 | The package dependencies were updated. Some were moved to 'Imports' 17 | 18 | The chapter scripts were re-run with the latest versions of the \cpkg{AppliedPredictiveModeling} and\cpkg{caret} packages. For \cpkg{caret}, the names of the tuning parameter columns were changed to remove the dot. These were made to be consistent with the newer version of \cpkg{caret} and are not required. For example, \code{.sigma} was changed to \code{sigma} and so on. 19 | 20 | Additional changes are: 21 | 22 | \itemize{ 23 | \item \code{03_Data_Pre_Processing.R} was changed to include code from the Computing section on creating dummy variables. 24 | 25 | \item \code{04_Over_Fitting.R} was modified. Changes were made to: 26 | \itemize{ 27 | \item avoid a warning message when the SVM grid was created 28 | \item \code{classProbs = TRUE} was added to the control function for the object \code{svmFit} 29 | \item some notes were made in the potential differences in SVM results between versions of \cpkg{caret} 30 | } 31 | \item \code{06_Linear_Regression.R} was updated to use the newer "ridge" model in \cpkg{caret}. 32 | 33 | \item In \code{07_Non-Linear_Reg.R}, some notes were made in the potential differences in SVM results between versions of \cpkg{caret} 34 | 35 | \item In \code{16_Class_Imbalance.R}, verboseness (verbosity?) was turned off to make the results cleaner. Also, the code for \code{svmWtFit} was using the wrong tuning grid (\code{svmGrid2} instead of \code{svmGrid1}). 36 | 37 | \item \code{19_Feature_Select.R} was changed so that the resampling values were only saved for the final model. 
38 | 39 | } 40 | } 41 | 42 | \section{Changes in version 1.1-4}{ 43 | \itemize{ 44 | \item The data set \code{ChemicalManufacturingProcess} did not contain 45 | the rows with missing data. They were added back in. 46 | 47 | \item Small changes to conform to R CMD check. 48 | }} 49 | 50 | \section{Changes in version 1.1-2}{ 51 | \itemize{ 52 | \item Code to create the \code{carsSubset} object in Seciton 3.8 was added 53 | to 03_Data_Pre_Processing.R 54 | }} 55 | 56 | \section{Changes in version 1.1-1}{ 57 | \itemize{ 58 | \item Initial Version 59 | }} 60 | 61 | -------------------------------------------------------------------------------- /inst/chapters/02_A_Short_Tour.R: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | ### R code from Applied Predictive Modeling (2013) by Kuhn and Johnson. 3 | ### Copyright 2013 Kuhn and Johnson 4 | ### Web Page: http://www.appliedpredictivemodeling.com 5 | ### Contact: Max Kuhn (mxkuhn@gmail.com) 6 | ### 7 | ### Chapter 2: A Short Tour of the Predictive Modeling Process 8 | ### 9 | ### Required packages: AppliedPredictiveModeling, earth, caret, lattice 10 | ### 11 | ### Data used: The FuelEconomy data in the AppliedPredictiveModeling package 12 | ### 13 | ### Notes: 14 | ### 1) This code is provided without warranty. 15 | ### 16 | ### 2) This code should help the user reproduce the results in the 17 | ### text. There will be differences between this code and what is is 18 | ### the computing section. For example, the computing sections show 19 | ### how the source functions work (e.g. randomForest() or plsr()), 20 | ### which were not directly used when creating the book. Also, there may be 21 | ### syntax differences that occur over time as packages evolve. These files 22 | ### will reflect those changes. 23 | ### 24 | ### 3) In some cases, the calculations in the book were run in 25 | ### parallel. The sub-processes may reset the random number seed. 26 | ### Your results may slightly vary. 
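### (The add-on packages used below can also be installed with getPackages(2)
### from this package, which passes the chapter's package list to install.packages().)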
27 | ### 28 | ################################################################################ 29 | 30 | ################################################################################ 31 | ### Section 2.1 Case Study: Predicting Fuel Economy 32 | 33 | library(AppliedPredictiveModeling) 34 | data(FuelEconomy) 35 | 36 | ## Format data for plotting against engine displacement 37 | 38 | ## Sort by engine displacement 39 | cars2010 <- cars2010[order(cars2010$EngDispl),] 40 | cars2011 <- cars2011[order(cars2011$EngDispl),] 41 | 42 | ## Combine data into one data frame 43 | cars2010a <- cars2010 44 | cars2010a$Year <- "2010 Model Year" 45 | cars2011a <- cars2011 46 | cars2011a$Year <- "2011 Model Year" 47 | 48 | plotData <- rbind(cars2010a, cars2011a) 49 | 50 | library(lattice) 51 | xyplot(FE ~ EngDispl|Year, plotData, 52 | xlab = "Engine Displacement", 53 | ylab = "Fuel Efficiency (MPG)", 54 | between = list(x = 1.2)) 55 | 56 | ## Fit a single linear model and conduct 10-fold CV to estimate the error 57 | library(caret) 58 | set.seed(1) 59 | lm1Fit <- train(FE ~ EngDispl, 60 | data = cars2010, 61 | method = "lm", 62 | trControl = trainControl(method= "cv")) 63 | lm1Fit 64 | 65 | 66 | ## Fit a quadratic model too 67 | 68 | ## Create squared terms 69 | cars2010$ED2 <- cars2010$EngDispl^2 70 | cars2011$ED2 <- cars2011$EngDispl^2 71 | 72 | set.seed(1) 73 | lm2Fit <- train(FE ~ EngDispl + ED2, 74 | data = cars2010, 75 | method = "lm", 76 | trControl = trainControl(method= "cv")) 77 | lm2Fit 78 | 79 | ## Finally a MARS model (via the earth package) 80 | 81 | library(earth) 82 | set.seed(1) 83 | marsFit <- train(FE ~ EngDispl, 84 | data = cars2010, 85 | method = "earth", 86 | tuneLength = 15, 87 | trControl = trainControl(method= "cv")) 88 | marsFit 89 | 90 | plot(marsFit) 91 | 92 | ## Predict the test set data 93 | cars2011$lm1 <- predict(lm1Fit, cars2011) 94 | cars2011$lm2 <- predict(lm2Fit, cars2011) 95 | cars2011$mars <- predict(marsFit, cars2011) 96 | 97 | ## Get test set performance values via caret's postResample function 98 | 99 | postResample(pred = cars2011$lm1, obs = cars2011$FE) 100 | postResample(pred = cars2011$lm2, obs = cars2011$FE) 101 | postResample(pred = cars2011$mars, obs = cars2011$FE) 102 | 103 | ################################################################################ 104 | ### Session Information 105 | 106 | sessionInfo() 107 | 108 | q("no") 109 | 110 | 111 | -------------------------------------------------------------------------------- /inst/chapters/02_A_Short_Tour.Rout: -------------------------------------------------------------------------------- 1 | 2 | R version 3.0.1 (2013-05-16) -- "Good Sport" 3 | Copyright (C) 2013 The R Foundation for Statistical Computing 4 | Platform: x86_64-apple-darwin10.8.0 (64-bit) 5 | 6 | R is free software and comes with ABSOLUTELY NO WARRANTY. 7 | You are welcome to redistribute it under certain conditions. 8 | Type 'license()' or 'licence()' for distribution details. 9 | 10 | Natural language support but running in an English locale 11 | 12 | R is a collaborative project with many contributors. 13 | Type 'contributors()' for more information and 14 | 'citation()' on how to cite R or R packages in publications. 15 | 16 | Type 'demo()' for some demos, 'help()' for on-line help, or 17 | 'help.start()' for an HTML browser interface to help. 18 | Type 'q()' to quit R. 
19 | 20 | > ################################################################################ 21 | > ### R code from Applied Predictive Modeling (2013) by Kuhn and Johnson. 22 | > ### Copyright 2013 Kuhn and Johnson 23 | > ### Web Page: http://www.appliedpredictivemodeling.com 24 | > ### Contact: Max Kuhn (mxkuhn@gmail.com) 25 | > ### 26 | > ### Chapter 2: A Short Tour of the Predictive Modeling Process 27 | > ### 28 | > ### Required packages: AppliedPredictiveModeling, earth, caret, lattice 29 | > ### 30 | > ### Data used: The FuelEconomy data in the AppliedPredictiveModeling package 31 | > ### 32 | > ### Notes: 33 | > ### 1) This code is provided without warranty. 34 | > ### 35 | > ### 2) This code should help the user reproduce the results in the 36 | > ### text. There will be differences between this code and what is is 37 | > ### the computing section. For example, the computing sections show 38 | > ### how the source functions work (e.g. randomForest() or plsr()), 39 | > ### which were not directly used when creating the book. Also, there may be 40 | > ### syntax differences that occur over time as packages evolve. These files 41 | > ### will reflect those changes. 42 | > ### 43 | > ### 3) In some cases, the calculations in the book were run in 44 | > ### parallel. The sub-processes may reset the random number seed. 45 | > ### Your results may slightly vary. 46 | > ### 47 | > ################################################################################ 48 | > 49 | > ################################################################################ 50 | > ### Section 2.1 Case Study: Predicting Fuel Economy 51 | > 52 | > library(AppliedPredictiveModeling) 53 | > data(FuelEconomy) 54 | > 55 | > ## Format data for plotting against engine displacement 56 | > 57 | > ## Sort by engine displacement 58 | > cars2010 <- cars2010[order(cars2010$EngDispl),] 59 | > cars2011 <- cars2011[order(cars2011$EngDispl),] 60 | > 61 | > ## Combine data into one data frame 62 | > cars2010a <- cars2010 63 | > cars2010a$Year <- "2010 Model Year" 64 | > cars2011a <- cars2011 65 | > cars2011a$Year <- "2011 Model Year" 66 | > 67 | > plotData <- rbind(cars2010a, cars2011a) 68 | > 69 | > library(lattice) 70 | > xyplot(FE ~ EngDispl|Year, plotData, 71 | + xlab = "Engine Displacement", 72 | + ylab = "Fuel Efficiency (MPG)", 73 | + between = list(x = 1.2)) 74 | > 75 | > ## Fit a single linear model and conduct 10-fold CV to estimate the error 76 | > library(caret) 77 | Loading required package: ggplot2 78 | > set.seed(1) 79 | > lm1Fit <- train(FE ~ EngDispl, 80 | + data = cars2010, 81 | + method = "lm", 82 | + trControl = trainControl(method= "cv")) 83 | > lm1Fit 84 | Linear Regression 85 | 86 | 1107 samples 87 | 13 predictors 88 | 89 | No pre-processing 90 | Resampling: Cross-Validated (10 fold) 91 | 92 | Summary of sample sizes: 997, 996, 995, 996, 997, 996, ... 
93 | 94 | Resampling results 95 | 96 | RMSE Rsquared RMSE SD Rsquared SD 97 | 4.6 0.628 0.493 0.0442 98 | 99 | 100 | > 101 | > 102 | > ## Fit a quadratic model too 103 | > 104 | > ## Create squared terms 105 | > cars2010$ED2 <- cars2010$EngDispl^2 106 | > cars2011$ED2 <- cars2011$EngDispl^2 107 | > 108 | > set.seed(1) 109 | > lm2Fit <- train(FE ~ EngDispl + ED2, 110 | + data = cars2010, 111 | + method = "lm", 112 | + trControl = trainControl(method= "cv")) 113 | > lm2Fit 114 | Linear Regression 115 | 116 | 1107 samples 117 | 14 predictors 118 | 119 | No pre-processing 120 | Resampling: Cross-Validated (10 fold) 121 | 122 | Summary of sample sizes: 997, 996, 995, 996, 997, 996, ... 123 | 124 | Resampling results 125 | 126 | RMSE Rsquared RMSE SD Rsquared SD 127 | 4.23 0.684 0.419 0.0421 128 | 129 | 130 | > 131 | > ## Finally a MARS model (via the earth package) 132 | > 133 | > library(earth) 134 | Loading required package: plotmo 135 | Loading required package: plotrix 136 | > set.seed(1) 137 | > marsFit <- train(FE ~ EngDispl, 138 | + data = cars2010, 139 | + method = "earth", 140 | + tuneLength = 15, 141 | + trControl = trainControl(method= "cv")) 142 | > marsFit 143 | Multivariate Adaptive Regression Spline 144 | 145 | 1107 samples 146 | 14 predictors 147 | 148 | No pre-processing 149 | Resampling: Cross-Validated (10 fold) 150 | 151 | Summary of sample sizes: 997, 996, 995, 996, 997, 996, ... 152 | 153 | Resampling results across tuning parameters: 154 | 155 | nprune RMSE Rsquared RMSE SD Rsquared SD 156 | 2 4.3 0.673 0.441 0.0429 157 | 3 4.26 0.68 0.44 0.0395 158 | 4 4.23 0.685 0.449 0.0428 159 | 5 4.25 0.682 0.489 0.0432 160 | 161 | Tuning parameter 'degree' was held constant at a value of 1 162 | RMSE was used to select the optimal model using the smallest value. 163 | The final values used for the model were nprune = 4 and degree = 1. 
164 | > 165 | > plot(marsFit) 166 | > 167 | > ## Predict the test set data 168 | > cars2011$lm1 <- predict(lm1Fit, cars2011) 169 | > cars2011$lm2 <- predict(lm2Fit, cars2011) 170 | > cars2011$mars <- predict(marsFit, cars2011) 171 | > 172 | > ## Get test set performance values via caret's postResample function 173 | > 174 | > postResample(pred = cars2011$lm1, obs = cars2011$FE) 175 | RMSE Rsquared 176 | 5.1625309 0.7018642 177 | > postResample(pred = cars2011$lm2, obs = cars2011$FE) 178 | RMSE Rsquared 179 | 4.7162853 0.7486074 180 | > postResample(pred = cars2011$mars, obs = cars2011$FE) 181 | RMSE Rsquared 182 | 4.6855501 0.7499953 183 | > 184 | > ################################################################################ 185 | > ### Session Information 186 | > 187 | > sessionInfo() 188 | R version 3.0.1 (2013-05-16) 189 | Platform: x86_64-apple-darwin10.8.0 (64-bit) 190 | 191 | locale: 192 | [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8 193 | 194 | attached base packages: 195 | [1] stats graphics grDevices utils datasets methods base 196 | 197 | other attached packages: 198 | [1] earth_3.2-6 plotrix_3.4-7 199 | [3] plotmo_1.3-2 caret_6.0-22 200 | [5] ggplot2_0.9.3.1 lattice_0.20-15 201 | [7] AppliedPredictiveModeling_1.1-5 202 | 203 | loaded via a namespace (and not attached): 204 | [1] car_2.0-17 codetools_0.2-8 colorspace_1.2-2 compiler_3.0.1 205 | [5] CORElearn_0.9.41 dichromat_2.0-0 digest_0.6.3 foreach_1.4.0 206 | [9] grid_3.0.1 gtable_0.1.2 iterators_1.0.6 labeling_0.1 207 | [13] MASS_7.3-26 munsell_0.4 plyr_1.8 proto_0.3-10 208 | [17] RColorBrewer_1.0-5 reshape2_1.2.2 scales_0.2.3 stringr_0.6.2 209 | [21] tools_3.0.1 210 | > 211 | > q("no") 212 | > proc.time() 213 | user system elapsed 214 | 4.971 0.114 5.292 215 | -------------------------------------------------------------------------------- /inst/chapters/03_Data_Pre_Processing.R: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | ### R code from Applied Predictive Modeling (2013) by Kuhn and Johnson. 3 | ### Copyright 2013 Kuhn and Johnson 4 | ### Web Page: http://www.appliedpredictivemodeling.com 5 | ### Contact: Max Kuhn (mxkuhn@gmail.com) 6 | ### 7 | ### Chapter 3: Data Pre-Processing 8 | ### 9 | ### Required packages: AppliedPredictiveModeling, e1071, caret, corrplot 10 | ### 11 | ### Data used: The (unprocessed) cell segmentation data from the 12 | ### AppliedPredictiveModeling package. 13 | ### 14 | ### Notes: 15 | ### 1) This code is provided without warranty. 16 | ### 17 | ### 2) This code should help the user reproduce the results in the 18 | ### text. There will be differences between this code and what is is 19 | ### the computing section. For example, the computing sections show 20 | ### how the source functions work (e.g. randomForest() or plsr()), 21 | ### which were not directly used when creating the book. Also, there may be 22 | ### syntax differences that occur over time as packages evolve. These files 23 | ### will reflect those changes. 24 | ### 25 | ### 3) In some cases, the calculations in the book were run in 26 | ### parallel. The sub-processes may reset the random number seed. 27 | ### Your results may slightly vary. 
28 | ### 29 | ################################################################################ 30 | 31 | ################################################################################ 32 | ### Section 3.1 Case Study: Cell Segmentation in High-Content Screening 33 | 34 | library(AppliedPredictiveModeling) 35 | data(segmentationOriginal) 36 | 37 | ## Retain the original training set 38 | segTrain <- subset(segmentationOriginal, Case == "Train") 39 | 40 | ## Remove the first three columns (identifier columns) 41 | segTrainX <- segTrain[, -(1:3)] 42 | segTrainClass <- segTrain$Class 43 | 44 | ################################################################################ 45 | ### Section 3.2 Data Transformations for Individual Predictors 46 | 47 | ## The column VarIntenCh3 measures the standard deviation of the intensity 48 | ## of the pixels in the actin filaments 49 | 50 | max(segTrainX$VarIntenCh3)/min(segTrainX$VarIntenCh3) 51 | 52 | library(e1071) 53 | skewness(segTrainX$VarIntenCh3) 54 | 55 | library(caret) 56 | 57 | ## Use caret's preProcess function to transform for skewness 58 | segPP <- preProcess(segTrainX, method = "BoxCox") 59 | 60 | ## Apply the transformations 61 | segTrainTrans <- predict(segPP, segTrainX) 62 | 63 | ## Results for a single predictor 64 | segPP$bc$VarIntenCh3 65 | 66 | histogram(~segTrainX$VarIntenCh3, 67 | xlab = "Natural Units", 68 | type = "count") 69 | 70 | histogram(~log(segTrainX$VarIntenCh3), 71 | xlab = "Log Units", 72 | ylab = " ", 73 | type = "count") 74 | 75 | segPP$bc$PerimCh1 76 | 77 | histogram(~segTrainX$PerimCh1, 78 | xlab = "Natural Units", 79 | type = "count") 80 | 81 | histogram(~segTrainTrans$PerimCh1, 82 | xlab = "Transformed Data", 83 | ylab = " ", 84 | type = "count") 85 | 86 | ################################################################################ 87 | ### Section 3.3 Data Transformations for Multiple Predictors 88 | 89 | ## R's prcomp is used to conduct PCA 90 | pr <- prcomp(~ AvgIntenCh1 + EntropyIntenCh1, 91 | data = segTrainTrans, 92 | scale. = TRUE) 93 | 94 | transparentTheme(pchSize = .7, trans = .3) 95 | 96 | xyplot(AvgIntenCh1 ~ EntropyIntenCh1, 97 | data = segTrainTrans, 98 | groups = segTrain$Class, 99 | xlab = "Channel 1 Fiber Width", 100 | ylab = "Intensity Entropy Channel 1", 101 | auto.key = list(columns = 2), 102 | type = c("p", "g"), 103 | main = "Original Data", 104 | aspect = 1) 105 | 106 | xyplot(PC2 ~ PC1, 107 | data = as.data.frame(pr$x), 108 | groups = segTrain$Class, 109 | xlab = "Principal Component #1", 110 | ylab = "Principal Component #2", 111 | main = "Transformed", 112 | xlim = extendrange(pr$x), 113 | ylim = extendrange(pr$x), 114 | type = c("p", "g"), 115 | aspect = 1) 116 | 117 | 118 | ## Apply PCA to the entire set of predictors. 119 | 120 | ## There are a few predictors with only a single value, so we remove these first 121 | ## (since PCA uses variances, which would be zero) 122 | 123 | isZV <- apply(segTrainX, 2, function(x) length(unique(x)) == 1) 124 | segTrainX <- segTrainX[, !isZV] 125 | 126 | segPP <- preProcess(segTrainX, c("BoxCox", "center", "scale")) 127 | segTrainTrans <- predict(segPP, segTrainX) 128 | 129 | segPCA <- prcomp(segTrainTrans, center = TRUE, scale. 
= TRUE) 130 | 131 | ## Plot a scatterplot matrix of the first three components 132 | transparentTheme(pchSize = .8, trans = .3) 133 | 134 | panelRange <- extendrange(segPCA$x[, 1:3]) 135 | splom(as.data.frame(segPCA$x[, 1:3]), 136 | groups = segTrainClass, 137 | type = c("p", "g"), 138 | as.table = TRUE, 139 | auto.key = list(columns = 2), 140 | prepanel.limits = function(x) panelRange) 141 | 142 | ## Format the rotation values for plotting 143 | segRot <- as.data.frame(segPCA$rotation[, 1:3]) 144 | 145 | ## Derive the channel variable 146 | vars <- rownames(segPCA$rotation) 147 | channel <- rep(NA, length(vars)) 148 | channel[grepl("Ch1$", vars)] <- "Channel 1" 149 | channel[grepl("Ch2$", vars)] <- "Channel 2" 150 | channel[grepl("Ch3$", vars)] <- "Channel 3" 151 | channel[grepl("Ch4$", vars)] <- "Channel 4" 152 | 153 | segRot$Channel <- channel 154 | segRot <- segRot[complete.cases(segRot),] 155 | segRot$Channel <- factor(as.character(segRot$Channel)) 156 | 157 | ## Plot a scatterplot matrix of the first three rotation variables 158 | 159 | transparentTheme(pchSize = .8, trans = .7) 160 | panelRange <- extendrange(segRot[, 1:3]) 161 | library(ellipse) 162 | upperp <- function(...) 163 | { 164 | args <- list(...) 165 | circ1 <- ellipse(diag(rep(1, 2)), t = .1) 166 | panel.xyplot(circ1[,1], circ1[,2], 167 | type = "l", 168 | lty = trellis.par.get("reference.line")$lty, 169 | col = trellis.par.get("reference.line")$col, 170 | lwd = trellis.par.get("reference.line")$lwd) 171 | circ2 <- ellipse(diag(rep(1, 2)), t = .2) 172 | panel.xyplot(circ2[,1], circ2[,2], 173 | type = "l", 174 | lty = trellis.par.get("reference.line")$lty, 175 | col = trellis.par.get("reference.line")$col, 176 | lwd = trellis.par.get("reference.line")$lwd) 177 | circ3 <- ellipse(diag(rep(1, 2)), t = .3) 178 | panel.xyplot(circ3[,1], circ3[,2], 179 | type = "l", 180 | lty = trellis.par.get("reference.line")$lty, 181 | col = trellis.par.get("reference.line")$col, 182 | lwd = trellis.par.get("reference.line")$lwd) 183 | panel.xyplot(args$x, args$y, groups = args$groups, subscripts = args$subscripts) 184 | } 185 | splom(~segRot[, 1:3], 186 | groups = segRot$Channel, 187 | lower.panel = function(...){}, upper.panel = upperp, 188 | prepanel.limits = function(x) panelRange, 189 | auto.key = list(columns = 2)) 190 | 191 | ################################################################################ 192 | ### Section 3.5 Removing Variables 193 | 194 | ## To filter on correlations, we first get the correlation matrix for the 195 | ## predictor set 196 | 197 | segCorr <- cor(segTrainTrans) 198 | 199 | library(corrplot) 200 | corrplot(segCorr, order = "hclust", tl.cex = .35) 201 | 202 | ## caret's findCorrelation function is used to identify columns to remove. 
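## findCorrelation() returns the indices of the columns to drop so that no remaining
## pair of predictors has an absolute correlation above the cutoff (0.75 here).
## The script does not use the reduced predictor set, but it could be created with, e.g.:
##   segTrainFiltered <- segTrainTrans[, -highCorr]
## ('segTrainFiltered' is only an illustrative name, not an object used elsewhere.)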
203 | highCorr <- findCorrelation(segCorr, .75) 204 | 205 | ################################################################################ 206 | ### Section 3.8 Computing (Creating Dummy Variables) 207 | 208 | data(cars) 209 | type <- c("convertible", "coupe", "hatchback", "sedan", "wagon") 210 | cars$Type <- factor(apply(cars[, 14:18], 1, function(x) type[which(x == 1)])) 211 | 212 | carSubset <- cars[sample(1:nrow(cars), 20), c(1, 2, 19)] 213 | 214 | head(carSubset) 215 | levels(carSubset$Type) 216 | 217 | simpleMod <- dummyVars(~Mileage + Type, 218 | data = carSubset, 219 | ## Remove the variable name from the 220 | ## column name 221 | levelsOnly = TRUE) 222 | simpleMod 223 | 224 | withInteraction <- dummyVars(~Mileage + Type + Mileage:Type, 225 | data = carSubset, 226 | levelsOnly = TRUE) 227 | withInteraction 228 | predict(withInteraction, head(carSubset)) 229 | 230 | 231 | 232 | ################################################################################ 233 | ### Session Information 234 | 235 | sessionInfo() 236 | 237 | q("no") 238 | 239 | 240 | -------------------------------------------------------------------------------- /inst/chapters/03_Data_Pre_Processing.Rout: -------------------------------------------------------------------------------- 1 | 2 | R version 3.0.1 (2013-05-16) -- "Good Sport" 3 | Copyright (C) 2013 The R Foundation for Statistical Computing 4 | Platform: x86_64-apple-darwin10.8.0 (64-bit) 5 | 6 | R is free software and comes with ABSOLUTELY NO WARRANTY. 7 | You are welcome to redistribute it under certain conditions. 8 | Type 'license()' or 'licence()' for distribution details. 9 | 10 | Natural language support but running in an English locale 11 | 12 | R is a collaborative project with many contributors. 13 | Type 'contributors()' for more information and 14 | 'citation()' on how to cite R or R packages in publications. 15 | 16 | Type 'demo()' for some demos, 'help()' for on-line help, or 17 | 'help.start()' for an HTML browser interface to help. 18 | Type 'q()' to quit R. 19 | 20 | > ################################################################################ 21 | > ### R code from Applied Predictive Modeling (2013) by Kuhn and Johnson. 22 | > ### Copyright 2013 Kuhn and Johnson 23 | > ### Web Page: http://www.appliedpredictivemodeling.com 24 | > ### Contact: Max Kuhn (mxkuhn@gmail.com) 25 | > ### 26 | > ### Chapter 3: Data Pre-Processing 27 | > ### 28 | > ### Required packages: AppliedPredictiveModeling, e1071, caret, corrplot 29 | > ### 30 | > ### Data used: The (unprocessed) cell segmentation data from the 31 | > ### AppliedPredictiveModeling package. 32 | > ### 33 | > ### Notes: 34 | > ### 1) This code is provided without warranty. 35 | > ### 36 | > ### 2) This code should help the user reproduce the results in the 37 | > ### text. There will be differences between this code and what is is 38 | > ### the computing section. For example, the computing sections show 39 | > ### how the source functions work (e.g. randomForest() or plsr()), 40 | > ### which were not directly used when creating the book. Also, there may be 41 | > ### syntax differences that occur over time as packages evolve. These files 42 | > ### will reflect those changes. 43 | > ### 44 | > ### 3) In some cases, the calculations in the book were run in 45 | > ### parallel. The sub-processes may reset the random number seed. 46 | > ### Your results may slightly vary. 
47 | > ### 48 | > ################################################################################ 49 | > 50 | > ################################################################################ 51 | > ### Section 3.1 Case Study: Cell Segmentation in High-Content Screening 52 | > 53 | > library(AppliedPredictiveModeling) 54 | > data(segmentationOriginal) 55 | > 56 | > ## Retain the original training set 57 | > segTrain <- subset(segmentationOriginal, Case == "Train") 58 | > 59 | > ## Remove the first three columns (identifier columns) 60 | > segTrainX <- segTrain[, -(1:3)] 61 | > segTrainClass <- segTrain$Class 62 | > 63 | > ################################################################################ 64 | > ### Section 3.2 Data Transformations for Individual Predictors 65 | > 66 | > ## The column VarIntenCh3 measures the standard deviation of the intensity 67 | > ## of the pixels in the actin filaments 68 | > 69 | > max(segTrainX$VarIntenCh3)/min(segTrainX$VarIntenCh3) 70 | [1] 870.8872 71 | > 72 | > library(e1071) 73 | Loading required package: class 74 | > skewness(segTrainX$VarIntenCh3) 75 | [1] 2.391624 76 | > 77 | > library(caret) 78 | Loading required package: lattice 79 | Loading required package: ggplot2 80 | > 81 | > ## Use caret's preProcess function to transform for skewness 82 | > segPP <- preProcess(segTrainX, method = "BoxCox") 83 | > 84 | > ## Apply the transformations 85 | > segTrainTrans <- predict(segPP, segTrainX) 86 | > 87 | > ## Results for a single predictor 88 | > segPP$bc$VarIntenCh3 89 | Box-Cox Transformation 90 | 91 | 1009 data points used to estimate Lambda 92 | 93 | Input data summary: 94 | Min. 1st Qu. Median Mean 3rd Qu. Max. 95 | 0.8693 37.0600 68.1300 101.7000 125.0000 757.0000 96 | 97 | Largest/Smallest: 871 98 | Sample Skewness: 2.39 99 | 100 | Estimated Lambda: 0.1 101 | With fudge factor, Lambda = 0 will be used for transformations 102 | 103 | > 104 | > histogram(~segTrainX$VarIntenCh3, 105 | + xlab = "Natural Units", 106 | + type = "count") 107 | > 108 | > histogram(~log(segTrainX$VarIntenCh3), 109 | + xlab = "Log Units", 110 | + ylab = " ", 111 | + type = "count") 112 | > 113 | > segPP$bc$PerimCh1 114 | Box-Cox Transformation 115 | 116 | 1009 data points used to estimate Lambda 117 | 118 | Input data summary: 119 | Min. 1st Qu. Median Mean 3rd Qu. Max. 120 | 47.74 64.37 79.02 91.61 103.20 459.80 121 | 122 | Largest/Smallest: 9.63 123 | Sample Skewness: 2.59 124 | 125 | Estimated Lambda: -1.1 126 | 127 | > 128 | > histogram(~segTrainX$PerimCh1, 129 | + xlab = "Natural Units", 130 | + type = "count") 131 | > 132 | > histogram(~segTrainTrans$PerimCh1, 133 | + xlab = "Transformed Data", 134 | + ylab = " ", 135 | + type = "count") 136 | > 137 | > ################################################################################ 138 | > ### Section 3.3 Data Transformations for Multiple Predictors 139 | > 140 | > ## R's prcomp is used to conduct PCA 141 | > pr <- prcomp(~ AvgIntenCh1 + EntropyIntenCh1, 142 | + data = segTrainTrans, 143 | + scale. 
= TRUE) 144 | > 145 | > transparentTheme(pchSize = .7, trans = .3) 146 | > 147 | > xyplot(AvgIntenCh1 ~ EntropyIntenCh1, 148 | + data = segTrainTrans, 149 | + groups = segTrain$Class, 150 | + xlab = "Channel 1 Fiber Width", 151 | + ylab = "Intensity Entropy Channel 1", 152 | + auto.key = list(columns = 2), 153 | + type = c("p", "g"), 154 | + main = "Original Data", 155 | + aspect = 1) 156 | > 157 | > xyplot(PC2 ~ PC1, 158 | + data = as.data.frame(pr$x), 159 | + groups = segTrain$Class, 160 | + xlab = "Principal Component #1", 161 | + ylab = "Principal Component #2", 162 | + main = "Transformed", 163 | + xlim = extendrange(pr$x), 164 | + ylim = extendrange(pr$x), 165 | + type = c("p", "g"), 166 | + aspect = 1) 167 | > 168 | > 169 | > ## Apply PCA to the entire set of predictors. 170 | > 171 | > ## There are a few predictors with only a single value, so we remove these first 172 | > ## (since PCA uses variances, which would be zero) 173 | > 174 | > isZV <- apply(segTrainX, 2, function(x) length(unique(x)) == 1) 175 | > segTrainX <- segTrainX[, !isZV] 176 | > 177 | > segPP <- preProcess(segTrainX, c("BoxCox", "center", "scale")) 178 | > segTrainTrans <- predict(segPP, segTrainX) 179 | > 180 | > segPCA <- prcomp(segTrainTrans, center = TRUE, scale. = TRUE) 181 | > 182 | > ## Plot a scatterplot matrix of the first three components 183 | > transparentTheme(pchSize = .8, trans = .3) 184 | > 185 | > panelRange <- extendrange(segPCA$x[, 1:3]) 186 | > splom(as.data.frame(segPCA$x[, 1:3]), 187 | + groups = segTrainClass, 188 | + type = c("p", "g"), 189 | + as.table = TRUE, 190 | + auto.key = list(columns = 2), 191 | + prepanel.limits = function(x) panelRange) 192 | > 193 | > ## Format the rotation values for plotting 194 | > segRot <- as.data.frame(segPCA$rotation[, 1:3]) 195 | > 196 | > ## Derive the channel variable 197 | > vars <- rownames(segPCA$rotation) 198 | > channel <- rep(NA, length(vars)) 199 | > channel[grepl("Ch1$", vars)] <- "Channel 1" 200 | > channel[grepl("Ch2$", vars)] <- "Channel 2" 201 | > channel[grepl("Ch3$", vars)] <- "Channel 3" 202 | > channel[grepl("Ch4$", vars)] <- "Channel 4" 203 | > 204 | > segRot$Channel <- channel 205 | > segRot <- segRot[complete.cases(segRot),] 206 | > segRot$Channel <- factor(as.character(segRot$Channel)) 207 | > 208 | > ## Plot a scatterplot matrix of the first three rotation variables 209 | > 210 | > transparentTheme(pchSize = .8, trans = .7) 211 | > panelRange <- extendrange(segRot[, 1:3]) 212 | > library(ellipse) 213 | > upperp <- function(...) 214 | + { 215 | + args <- list(...) 
216 | + circ1 <- ellipse(diag(rep(1, 2)), t = .1) 217 | + panel.xyplot(circ1[,1], circ1[,2], 218 | + type = "l", 219 | + lty = trellis.par.get("reference.line")$lty, 220 | + col = trellis.par.get("reference.line")$col, 221 | + lwd = trellis.par.get("reference.line")$lwd) 222 | + circ2 <- ellipse(diag(rep(1, 2)), t = .2) 223 | + panel.xyplot(circ2[,1], circ2[,2], 224 | + type = "l", 225 | + lty = trellis.par.get("reference.line")$lty, 226 | + col = trellis.par.get("reference.line")$col, 227 | + lwd = trellis.par.get("reference.line")$lwd) 228 | + circ3 <- ellipse(diag(rep(1, 2)), t = .3) 229 | + panel.xyplot(circ3[,1], circ3[,2], 230 | + type = "l", 231 | + lty = trellis.par.get("reference.line")$lty, 232 | + col = trellis.par.get("reference.line")$col, 233 | + lwd = trellis.par.get("reference.line")$lwd) 234 | + panel.xyplot(args$x, args$y, groups = args$groups, subscripts = args$subscripts) 235 | + } 236 | > splom(~segRot[, 1:3], 237 | + groups = segRot$Channel, 238 | + lower.panel = function(...){}, upper.panel = upperp, 239 | + prepanel.limits = function(x) panelRange, 240 | + auto.key = list(columns = 2)) 241 | > 242 | > ################################################################################ 243 | > ### Section 3.5 Removing Variables 244 | > 245 | > ## To filter on correlations, we first get the correlation matrix for the 246 | > ## predictor set 247 | > 248 | > segCorr <- cor(segTrainTrans) 249 | > 250 | > library(corrplot) 251 | > corrplot(segCorr, order = "hclust", tl.cex = .35) 252 | > 253 | > ## caret's findCorrelation function is used to identify columns to remove. 254 | > highCorr <- findCorrelation(segCorr, .75) 255 | > 256 | > ################################################################################ 257 | > ### Section 3.8 Computing (Creating Dummy Variables) 258 | > 259 | > data(cars) 260 | > type <- c("convertible", "coupe", "hatchback", "sedan", "wagon") 261 | > cars$Type <- factor(apply(cars[, 14:18], 1, function(x) type[which(x == 1)])) 262 | > 263 | > carSubset <- cars[sample(1:nrow(cars), 20), c(1, 2, 19)] 264 | > 265 | > head(carSubset) 266 | Price Mileage Type 267 | 415 51154.05 2202 sedan 268 | 503 14116.92 12878 sedan 269 | 484 18620.87 25516 sedan 270 | 642 19423.17 25557 sedan 271 | 337 11391.21 21421 hatchback 272 | 121 20538.09 15066 sedan 273 | > levels(carSubset$Type) 274 | [1] "convertible" "coupe" "hatchback" "sedan" "wagon" 275 | > 276 | > simpleMod <- dummyVars(~Mileage + Type, 277 | + data = carSubset, 278 | + ## Remove the variable name from the 279 | + ## column name 280 | + levelsOnly = TRUE) 281 | > simpleMod 282 | Dummy Variable Object 283 | 284 | Formula: ~Mileage + Type 285 | 2 variables, 1 factors 286 | Factor variable names will be removed 287 | A less than full rank encoding is used 288 | > 289 | > withInteraction <- dummyVars(~Mileage + Type + Mileage:Type, 290 | + data = carSubset, 291 | + levelsOnly = TRUE) 292 | > withInteraction 293 | Dummy Variable Object 294 | 295 | Formula: ~Mileage + Type + Mileage:Type 296 | 2 variables, 1 factors 297 | Factor variable names will be removed 298 | A less than full rank encoding is used 299 | > predict(withInteraction, head(carSubset)) 300 | Mileage convertible coupe hatchback sedan wagon Mileage:convertible 301 | 415 2202 0 0 0 1 0 0 302 | 503 12878 0 0 0 1 0 0 303 | 484 25516 0 0 0 1 0 0 304 | 642 25557 0 0 0 1 0 0 305 | 337 21421 0 0 1 0 0 0 306 | 121 15066 0 0 0 1 0 0 307 | Mileage:coupe Mileage:hatchback Mileage:sedan Mileage:wagon 308 | 415 0 0 2202 0 309 | 503 0 0 12878 0 310 | 484 
0 0 25516 0 311 | 642 0 0 25557 0 312 | 337 0 21421 0 0 313 | 121 0 0 15066 0 314 | > 315 | > 316 | > 317 | > ################################################################################ 318 | > ### Session Information 319 | > 320 | > sessionInfo() 321 | R version 3.0.1 (2013-05-16) 322 | Platform: x86_64-apple-darwin10.8.0 (64-bit) 323 | 324 | locale: 325 | [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8 326 | 327 | attached base packages: 328 | [1] stats graphics grDevices utils datasets methods base 329 | 330 | other attached packages: 331 | [1] corrplot_0.71 ellipse_0.3-8 332 | [3] MASS_7.3-26 caret_6.0-22 333 | [5] ggplot2_0.9.3.1 lattice_0.20-15 334 | [7] e1071_1.6-1 class_7.3-7 335 | [9] AppliedPredictiveModeling_1.1-5 336 | 337 | loaded via a namespace (and not attached): 338 | [1] car_2.0-17 codetools_0.2-8 colorspace_1.2-2 CORElearn_0.9.41 339 | [5] dichromat_2.0-0 digest_0.6.3 foreach_1.4.0 grid_3.0.1 340 | [9] gtable_0.1.2 iterators_1.0.6 labeling_0.1 munsell_0.4 341 | [13] plyr_1.8 proto_0.3-10 RColorBrewer_1.0-5 reshape2_1.2.2 342 | [17] scales_0.2.3 stringr_0.6.2 tools_3.0.1 343 | > 344 | > q("no") 345 | > proc.time() 346 | user system elapsed 347 | 5.791 0.147 6.146 348 | -------------------------------------------------------------------------------- /inst/chapters/04_Over_Fitting.R: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | ### R code from Applied Predictive Modeling (2013) by Kuhn and Johnson. 3 | ### Copyright 2013 Kuhn and Johnson 4 | ### Web Page: http://www.appliedpredictivemodeling.com 5 | ### Contact: Max Kuhn (mxkuhn@gmail.com) 6 | ### 7 | ### Chapter 4: Over-Fitting and Model Tuning 8 | ### 9 | ### Required packages: caret, doMC (optional), kernlab 10 | ### 11 | ### Data used: 12 | ### 13 | ### Notes: 14 | ### 1) This code is provided without warranty. 15 | ### 16 | ### 2) This code should help the user reproduce the results in the 17 | ### text. There will be differences between this code and what is is 18 | ### the computing section. For example, the computing sections show 19 | ### how the source functions work (e.g. randomForest() or plsr()), 20 | ### which were not directly used when creating the book. Also, there may be 21 | ### syntax differences that occur over time as packages evolve. These files 22 | ### will reflect those changes. 23 | ### 24 | ### 3) In some cases, the calculations in the book were run in 25 | ### parallel. The sub-processes may reset the random number seed. 26 | ### Your results may slightly vary. 27 | ### 28 | ################################################################################ 29 | 30 | ################################################################################ 31 | ### Section 4.6 Choosing Final Tuning Parameters 32 | 33 | library(caret) 34 | data(GermanCredit) 35 | 36 | ## First, remove near-zero variance predictors then get rid of a few predictors 37 | ## that duplicate values. For example, there are two possible values for the 38 | ## housing variable: "Rent", "Own" and "ForFree". So that we don't have linear 39 | ## dependencies, we get rid of one of the levels (e.g. 
"ForFree") 40 | 41 | GermanCredit <- GermanCredit[, -nearZeroVar(GermanCredit)] 42 | GermanCredit$CheckingAccountStatus.lt.0 <- NULL 43 | GermanCredit$SavingsAccountBonds.lt.100 <- NULL 44 | GermanCredit$EmploymentDuration.lt.1 <- NULL 45 | GermanCredit$EmploymentDuration.Unemployed <- NULL 46 | GermanCredit$Personal.Male.Married.Widowed <- NULL 47 | GermanCredit$Property.Unknown <- NULL 48 | GermanCredit$Housing.ForFree <- NULL 49 | 50 | ## Split the data into training (80%) and test sets (20%) 51 | set.seed(100) 52 | inTrain <- createDataPartition(GermanCredit$Class, p = .8)[[1]] 53 | GermanCreditTrain <- GermanCredit[ inTrain, ] 54 | GermanCreditTest <- GermanCredit[-inTrain, ] 55 | 56 | ## The model fitting code shown in the computing section is fairly 57 | ## simplistic. For the text we estimate the tuning parameter grid 58 | ## up-front and pass it in explicitly. This generally is not needed, 59 | ## but was used here so that we could trim the cost values to a 60 | ## presentable range and to re-use later with different resampling 61 | ## methods. 62 | 63 | library(kernlab) 64 | set.seed(231) 65 | sigDist <- sigest(Class ~ ., data = GermanCreditTrain, frac = 1) 66 | svmTuneGrid <- data.frame(sigma = as.vector(sigDist)[1], C = 2^(-2:7)) 67 | 68 | ### Optional: parallel processing can be used via the 'do' packages, 69 | ### such as doMC, doMPI etc. We used doMC (not on Windows) to speed 70 | ### up the computations. 71 | 72 | ### WARNING: Be aware of how much memory is needed to parallel 73 | ### process. It can very quickly overwhelm the available hardware. We 74 | ### estimate the memory usage (VSIZE = total memory size) to be 75 | ### 2566M/core. 76 | 77 | library(doMC) 78 | registerDoMC(4) 79 | 80 | set.seed(1056) 81 | svmFit <- train(Class ~ ., 82 | data = GermanCreditTrain, 83 | method = "svmRadial", 84 | preProc = c("center", "scale"), 85 | tuneGrid = svmTuneGrid, 86 | trControl = trainControl(method = "repeatedcv", 87 | repeats = 5, 88 | classProbs = TRUE)) 89 | ## classProbs = TRUE was added since the text was written 90 | 91 | ## Print the results 92 | svmFit 93 | 94 | ## A line plot of the average performance. The 'scales' argument is actually an 95 | ## argument to xyplot that converts the x-axis to log-2 units. 96 | 97 | plot(svmFit, scales = list(x = list(log = 2))) 98 | 99 | ## Test set predictions 100 | 101 | predictedClasses <- predict(svmFit, GermanCreditTest) 102 | str(predictedClasses) 103 | 104 | ## Use the "type" option to get class probabilities 105 | 106 | predictedProbs <- predict(svmFit, newdata = GermanCreditTest, type = "prob") 107 | head(predictedProbs) 108 | 109 | 110 | ## Fit the same model using different resampling methods. The main syntax change 111 | ## is the control object. 
112 | 113 | set.seed(1056) 114 | svmFit10CV <- train(Class ~ ., 115 | data = GermanCreditTrain, 116 | method = "svmRadial", 117 | preProc = c("center", "scale"), 118 | tuneGrid = svmTuneGrid, 119 | trControl = trainControl(method = "cv", number = 10)) 120 | svmFit10CV 121 | 122 | set.seed(1056) 123 | svmFitLOO <- train(Class ~ ., 124 | data = GermanCreditTrain, 125 | method = "svmRadial", 126 | preProc = c("center", "scale"), 127 | tuneGrid = svmTuneGrid, 128 | trControl = trainControl(method = "LOOCV")) 129 | svmFitLOO 130 | 131 | set.seed(1056) 132 | svmFitLGO <- train(Class ~ ., 133 | data = GermanCreditTrain, 134 | method = "svmRadial", 135 | preProc = c("center", "scale"), 136 | tuneGrid = svmTuneGrid, 137 | trControl = trainControl(method = "LGOCV", 138 | number = 50, 139 | p = .8)) 140 | svmFitLGO 141 | 142 | set.seed(1056) 143 | svmFitBoot <- train(Class ~ ., 144 | data = GermanCreditTrain, 145 | method = "svmRadial", 146 | preProc = c("center", "scale"), 147 | tuneGrid = svmTuneGrid, 148 | trControl = trainControl(method = "boot", number = 50)) 149 | svmFitBoot 150 | 151 | set.seed(1056) 152 | svmFitBoot632 <- train(Class ~ ., 153 | data = GermanCreditTrain, 154 | method = "svmRadial", 155 | preProc = c("center", "scale"), 156 | tuneGrid = svmTuneGrid, 157 | trControl = trainControl(method = "boot632", 158 | number = 50)) 159 | svmFitBoot632 160 | 161 | ################################################################################ 162 | ### Section 4.8 Choosing Between Models 163 | 164 | set.seed(1056) 165 | glmProfile <- train(Class ~ ., 166 | data = GermanCreditTrain, 167 | method = "glm", 168 | trControl = trainControl(method = "repeatedcv", 169 | repeats = 5)) 170 | glmProfile 171 | 172 | resamp <- resamples(list(SVM = svmFit, Logistic = glmProfile)) 173 | summary(resamp) 174 | 175 | ## These results are slightly different from those shown in the text. 176 | ## There are some differences in the train() function since the 177 | ## original results were produced. This is due to a difference in 178 | ## predictions from the ksvm() function when class probs are requested 179 | ## and when they are not. See, for example, 180 | ## https://stat.ethz.ch/pipermail/r-help/2013-November/363188.html 181 | 182 | modelDifferences <- diff(resamp) 183 | summary(modelDifferences) 184 | 185 | ## The actual paired t-test: 186 | modelDifferences$statistics$Accuracy 187 | 188 | ################################################################################ 189 | ### Session Information 190 | 191 | sessionInfo() 192 | 193 | q("no") 194 | 195 | 196 | 197 | -------------------------------------------------------------------------------- /inst/chapters/06_Linear_Regression.R: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | ### R code from Applied Predictive Modeling (2013) by Kuhn and Johnson. 3 | ### Copyright 2013 Kuhn and Johnson 4 | ### Web Page: http://www.appliedpredictivemodeling.com 5 | ### Contact: Max Kuhn (mxkuhn@gmail.com) 6 | ### 7 | ### Chapter 6: Linear Regression and Its Cousins 8 | ### 9 | ### Required packages: AppliedPredictiveModeling, lattice, corrplot, pls, 10 | ### elasticnet, 11 | ### 12 | ### Data used: The solubility from the AppliedPredictiveModeling package 13 | ### 14 | ### Notes: 15 | ### 1) This code is provided without warranty. 16 | ### 17 | ### 2) This code should help the user reproduce the results in the 18 | ### text. 
There will be differences between this code and what is is 19 | ### the computing section. For example, the computing sections show 20 | ### how the source functions work (e.g. randomForest() or plsr()), 21 | ### which were not directly used when creating the book. Also, there may be 22 | ### syntax differences that occur over time as packages evolve. These files 23 | ### will reflect those changes. 24 | ### 25 | ### 3) In some cases, the calculations in the book were run in 26 | ### parallel. The sub-processes may reset the random number seed. 27 | ### Your results may slightly vary. 28 | ### 29 | ################################################################################ 30 | 31 | ################################################################################ 32 | ### Section 6.1 Case Study: Quantitative Structure- Activity 33 | ### Relationship Modeling 34 | 35 | library(AppliedPredictiveModeling) 36 | data(solubility) 37 | 38 | library(lattice) 39 | 40 | ### Some initial plots of the data 41 | 42 | xyplot(solTrainY ~ solTrainX$MolWeight, type = c("p", "g"), 43 | ylab = "Solubility (log)", 44 | main = "(a)", 45 | xlab = "Molecular Weight") 46 | xyplot(solTrainY ~ solTrainX$NumRotBonds, type = c("p", "g"), 47 | ylab = "Solubility (log)", 48 | xlab = "Number of Rotatable Bonds") 49 | bwplot(solTrainY ~ ifelse(solTrainX[,100] == 1, 50 | "structure present", 51 | "structure absent"), 52 | ylab = "Solubility (log)", 53 | main = "(b)", 54 | horizontal = FALSE) 55 | 56 | ### Find the columns that are not fingerprints (i.e. the continuous 57 | ### predictors). grep will return a list of integers corresponding to 58 | ### column names that contain the pattern "FP". 59 | 60 | notFingerprints <- grep("FP", names(solTrainXtrans)) 61 | 62 | library(caret) 63 | featurePlot(solTrainXtrans[, -notFingerprints], 64 | solTrainY, 65 | between = list(x = 1, y = 1), 66 | type = c("g", "p", "smooth"), 67 | labels = rep("", 2)) 68 | 69 | library(corrplot) 70 | 71 | ### We used the full namespace to call this function because the pls 72 | ### package (also used in this chapter) has a function with the same 73 | ### name. 74 | 75 | corrplot::corrplot(cor(solTrainXtrans[, -notFingerprints]), 76 | order = "hclust", 77 | tl.cex = .8) 78 | 79 | ################################################################################ 80 | ### Section 6.2 Linear Regression 81 | 82 | ### Create a control function that will be used across models. We 83 | ### create the fold assignments explicitly instead of relying on the 84 | ### random number seed being set to identical values. 85 | 86 | set.seed(100) 87 | indx <- createFolds(solTrainY, returnTrain = TRUE) 88 | ctrl <- trainControl(method = "cv", index = indx) 89 | 90 | ### Linear regression model with all of the predictors. This will 91 | ### produce some warnings that a 'rank-deficient fit may be 92 | ### misleading'. This is related to the predictors being so highly 93 | ### correlated that some of the math has broken down. 94 | 95 | set.seed(100) 96 | lmTune0 <- train(x = solTrainXtrans, y = solTrainY, 97 | method = "lm", 98 | trControl = ctrl) 99 | 100 | lmTune0 101 | 102 | ### And another using a set of predictors reduced by unsupervised 103 | ### filtering. We apply a filter to reduce extreme between-predictor 104 | ### correlations. Note the lack of warnings. 
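### Optional toy illustration (not from the book text): for each pair of
### predictors whose correlation exceeds the cutoff, findCorrelation() flags
### the one with the larger mean absolute correlation with everything else.

toyCor <- matrix(c(1.00, 0.95, 0.20,
                   0.95, 1.00, 0.10,
                   0.20, 0.10, 1.00),
                 ncol = 3,
                 dimnames = list(paste0("x", 1:3), paste0("x", 1:3)))
findCorrelation(toyCor, cutoff = .9)  # should flag column 1 (x1) here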
105 | 106 | tooHigh <- findCorrelation(cor(solTrainXtrans), .9) 107 | trainXfiltered <- solTrainXtrans[, -tooHigh] 108 | testXfiltered <- solTestXtrans[, -tooHigh] 109 | 110 | set.seed(100) 111 | lmTune <- train(x = trainXfiltered, y = solTrainY, 112 | method = "lm", 113 | trControl = ctrl) 114 | 115 | lmTune 116 | 117 | ### Save the test set results in a data frame 118 | testResults <- data.frame(obs = solTestY, 119 | Linear_Regression = predict(lmTune, testXfiltered)) 120 | 121 | 122 | ################################################################################ 123 | ### Section 6.3 Partial Least Squares 124 | 125 | ## Run PLS and PCR on solubility data and compare results 126 | set.seed(100) 127 | plsTune <- train(x = solTrainXtrans, y = solTrainY, 128 | method = "pls", 129 | tuneGrid = expand.grid(ncomp = 1:20), 130 | trControl = ctrl) 131 | plsTune 132 | 133 | testResults$PLS <- predict(plsTune, solTestXtrans) 134 | 135 | set.seed(100) 136 | pcrTune <- train(x = solTrainXtrans, y = solTrainY, 137 | method = "pcr", 138 | tuneGrid = expand.grid(ncomp = 1:35), 139 | trControl = ctrl) 140 | pcrTune 141 | 142 | plsResamples <- plsTune$results 143 | plsResamples$Model <- "PLS" 144 | pcrResamples <- pcrTune$results 145 | pcrResamples$Model <- "PCR" 146 | plsPlotData <- rbind(plsResamples, pcrResamples) 147 | 148 | xyplot(RMSE ~ ncomp, 149 | data = plsPlotData, 150 | #aspect = 1, 151 | xlab = "# Components", 152 | ylab = "RMSE (Cross-Validation)", 153 | auto.key = list(columns = 2), 154 | groups = Model, 155 | type = c("o", "g")) 156 | 157 | plsImp <- varImp(plsTune, scale = FALSE) 158 | plot(plsImp, top = 25, scales = list(y = list(cex = .95))) 159 | 160 | ################################################################################ 161 | ### Section 6.4 Penalized Models 162 | 163 | ## The text used the elasticnet to obtain a ridge regression model. 164 | ## There is now a simple ridge regression method. 165 | 166 | ridgeGrid <- expand.grid(lambda = seq(0, .1, length = 15)) 167 | 168 | set.seed(100) 169 | ridgeTune <- train(x = solTrainXtrans, y = solTrainY, 170 | method = "ridge", 171 | tuneGrid = ridgeGrid, 172 | trControl = ctrl, 173 | preProc = c("center", "scale")) 174 | ridgeTune 175 | 176 | print(update(plot(ridgeTune), xlab = "Penalty")) 177 | 178 | 179 | enetGrid <- expand.grid(lambda = c(0, 0.01, .1), 180 | fraction = seq(.05, 1, length = 20)) 181 | set.seed(100) 182 | enetTune <- train(x = solTrainXtrans, y = solTrainY, 183 | method = "enet", 184 | tuneGrid = enetGrid, 185 | trControl = ctrl, 186 | preProc = c("center", "scale")) 187 | enetTune 188 | 189 | plot(enetTune) 190 | 191 | testResults$Enet <- predict(enetTune, solTestXtrans) 192 | 193 | ################################################################################ 194 | ### Session Information 195 | 196 | sessionInfo() 197 | 198 | q("no") 199 | 200 | 201 | 202 | -------------------------------------------------------------------------------- /inst/chapters/07_Non-Linear_Reg.R: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | ### R code from Applied Predictive Modeling (2013) by Kuhn and Johnson. 
3 | ### Copyright 2013 Kuhn and Johnson
4 | ### Web Page: http://www.appliedpredictivemodeling.com
5 | ### Contact: Max Kuhn (mxkuhn@gmail.com)
6 | ###
7 | ### Chapter 7: Non-Linear Regression Models
8 | ###
9 | ### Required packages: AppliedPredictiveModeling, caret, doMC (optional), earth,
10 | ###                    kernlab, lattice, nnet
11 | ###
12 | ### Data used: The solubility data from the AppliedPredictiveModeling package
13 | ###
14 | ### Notes:
15 | ### 1) This code is provided without warranty.
16 | ###
17 | ### 2) This code should help the user reproduce the results in the
18 | ### text. There will be differences between this code and what is in
19 | ### the computing section. For example, the computing sections show
20 | ### how the source functions work (e.g. randomForest() or plsr()),
21 | ### which were not directly used when creating the book. Also, there may be
22 | ### syntax differences that occur over time as packages evolve. These files
23 | ### will reflect those changes.
24 | ###
25 | ### 3) In some cases, the calculations in the book were run in
26 | ### parallel. The sub-processes may reset the random number seed.
27 | ### Your results may slightly vary.
28 | ###
29 | ################################################################################
30 | 
31 | ################################################################################
32 | ### Load the data
33 | 
34 | library(AppliedPredictiveModeling)
35 | data(solubility)
36 | 
37 | ### Create a control function that will be used across models. We
38 | ### create the fold assignments explicitly instead of relying on the
39 | ### random number seed being set to identical values.
40 | 
41 | library(caret)
42 | set.seed(100)
43 | indx <- createFolds(solTrainY, returnTrain = TRUE)
44 | ctrl <- trainControl(method = "cv", index = indx)
45 | 
46 | ################################################################################
47 | ### Section 7.1 Neural Networks
48 | 
49 | ### Optional: parallel processing can be used via the 'do' packages,
50 | ### such as doMC, doMPI etc. We used doMC (not on Windows) to speed
51 | ### up the computations.
52 | 
53 | ### WARNING: Be aware of how much memory is needed to parallel
54 | ### process. It can very quickly overwhelm the available hardware. We
55 | ### estimate the memory usage (VSIZE = total memory size) to be
56 | ### 2677M/core.
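### A hedged sketch (not from the book) for sizing the worker pool: cap it by
### both the physical core count and the ~2.7 GB/worker estimate above. The
### cap of 4 workers below is an arbitrary illustration, not a recommendation.
### workers <- min(parallel::detectCores(logical = FALSE), 4)
### registerDoMC(workers)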
57 | 58 | library(doMC) 59 | registerDoMC(10) 60 | 61 | 62 | library(caret) 63 | 64 | nnetGrid <- expand.grid(decay = c(0, 0.01, .1), 65 | size = c(1, 3, 5, 7, 9, 11, 13), 66 | bag = FALSE) 67 | 68 | set.seed(100) 69 | nnetTune <- train(x = solTrainXtrans, y = solTrainY, 70 | method = "avNNet", 71 | tuneGrid = nnetGrid, 72 | trControl = ctrl, 73 | preProc = c("center", "scale"), 74 | linout = TRUE, 75 | trace = FALSE, 76 | MaxNWts = 13 * (ncol(solTrainXtrans) + 1) + 13 + 1, 77 | maxit = 1000, 78 | allowParallel = FALSE) 79 | nnetTune 80 | 81 | plot(nnetTune) 82 | 83 | testResults <- data.frame(obs = solTestY, 84 | NNet = predict(nnetTune, solTestXtrans)) 85 | 86 | ################################################################################ 87 | ### Section 7.2 Multivariate Adaptive Regression Splines 88 | 89 | set.seed(100) 90 | marsTune <- train(x = solTrainXtrans, y = solTrainY, 91 | method = "earth", 92 | tuneGrid = expand.grid(degree = 1, nprune = 2:38), 93 | trControl = ctrl) 94 | marsTune 95 | 96 | plot(marsTune) 97 | 98 | testResults$MARS <- predict(marsTune, solTestXtrans) 99 | 100 | marsImp <- varImp(marsTune, scale = FALSE) 101 | plot(marsImp, top = 25) 102 | 103 | ################################################################################ 104 | ### Section 7.3 Support Vector Machines 105 | 106 | ## In a recent update to caret, the method to estimate the 107 | ## sigma parameter was slightly changed. These results will 108 | ## slightly differ from the text for that reason. 109 | 110 | set.seed(100) 111 | svmRTune <- train(x = solTrainXtrans, y = solTrainY, 112 | method = "svmRadial", 113 | preProc = c("center", "scale"), 114 | tuneLength = 14, 115 | trControl = ctrl) 116 | svmRTune 117 | plot(svmRTune, scales = list(x = list(log = 2))) 118 | 119 | svmGrid <- expand.grid(degree = 1:2, 120 | scale = c(0.01, 0.005, 0.001), 121 | C = 2^(-2:5)) 122 | set.seed(100) 123 | svmPTune <- train(x = solTrainXtrans, y = solTrainY, 124 | method = "svmPoly", 125 | preProc = c("center", "scale"), 126 | tuneGrid = svmGrid, 127 | trControl = ctrl) 128 | 129 | svmPTune 130 | plot(svmPTune, 131 | scales = list(x = list(log = 2), 132 | between = list(x = .5, y = 1))) 133 | 134 | testResults$SVMr <- predict(svmRTune, solTestXtrans) 135 | testResults$SVMp <- predict(svmPTune, solTestXtrans) 136 | 137 | ################################################################################ 138 | ### Section 7.4 K-Nearest Neighbors 139 | 140 | ### First we remove near-zero variance predictors 141 | knnDescr <- solTrainXtrans[, -nearZeroVar(solTrainXtrans)] 142 | 143 | set.seed(100) 144 | knnTune <- train(x = knnDescr, y = solTrainY, 145 | method = "knn", 146 | preProc = c("center", "scale"), 147 | tuneGrid = data.frame(k = 1:20), 148 | trControl = ctrl) 149 | 150 | knnTune 151 | 152 | plot(knnTune) 153 | 154 | testResults$Knn <- predict(svmRTune, solTestXtrans[, names(knnDescr)]) 155 | 156 | ################################################################################ 157 | ### Session Information 158 | 159 | sessionInfo() 160 | 161 | q("no") 162 | 163 | 164 | -------------------------------------------------------------------------------- /inst/chapters/08_Regression_Trees.R: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | ### R code from Applied Predictive Modeling (2013) by Kuhn and Johnson. 
3 | ### Copyright 2013 Kuhn and Johnson
4 | ### Web Page: http://www.appliedpredictivemodeling.com
5 | ### Contact: Max Kuhn (mxkuhn@gmail.com)
6 | ###
7 | ### Chapter 8: Regression Trees and Rule-Based Models
8 | ###
9 | ### Required packages: AppliedPredictiveModeling, caret, Cubist, doMC (optional),
10 | ###                    gbm, lattice, party, partykit, randomForest, rpart, RWeka
11 | ###
12 | ### Data used: The solubility data from the AppliedPredictiveModeling package
13 | ###
14 | ### Notes:
15 | ### 1) This code is provided without warranty.
16 | ###
17 | ### 2) This code should help the user reproduce the results in the
18 | ### text. There will be differences between this code and what is in
19 | ### the computing section. For example, the computing sections show
20 | ### how the source functions work (e.g. randomForest() or plsr()),
21 | ### which were not directly used when creating the book. Also, there may be
22 | ### syntax differences that occur over time as packages evolve. These files
23 | ### will reflect those changes.
24 | ###
25 | ### 3) In some cases, the calculations in the book were run in
26 | ### parallel. The sub-processes may reset the random number seed.
27 | ### Your results may slightly vary.
28 | ###
29 | ################################################################################
30 | 
31 | ################################################################################
32 | ### Load the data
33 | 
34 | library(AppliedPredictiveModeling)
35 | data(solubility)
36 | 
37 | ### Create a control function that will be used across models. We
38 | ### create the fold assignments explicitly instead of relying on the
39 | ### random number seed being set to identical values.
40 | 
41 | library(caret)
42 | set.seed(100)
43 | indx <- createFolds(solTrainY, returnTrain = TRUE)
44 | ctrl <- trainControl(method = "cv", index = indx)
45 | 
46 | ################################################################################
47 | ### Section 8.1 Basic Regression Trees
48 | 
49 | library(rpart)
50 | 
51 | ### Fit two CART models to show the initial splitting process. rpart
52 | ### only uses formulas, so we put the predictors and outcome into
53 | ### a common data frame first.
54 | 
55 | trainData <- solTrainXtrans
56 | trainData$y <- solTrainY
57 | 
58 | rpStump <- rpart(y ~ ., data = trainData,
59 |                  control = rpart.control(maxdepth = 1))
60 | rpSmall <- rpart(y ~ ., data = trainData,
61 |                  control = rpart.control(maxdepth = 2))
62 | 
63 | ### Tune the model
64 | library(caret)
65 | 
66 | set.seed(100)
67 | cartTune <- train(x = solTrainXtrans, y = solTrainY,
68 |                   method = "rpart",
69 |                   tuneLength = 25,
70 |                   trControl = ctrl)
71 | cartTune
72 | ## cartTune$finalModel
73 | 
74 | 
75 | ### Plot the tuning results
76 | plot(cartTune, scales = list(x = list(log = 10)))
77 | 
78 | ### Use the partykit package to make some nice plots. First, convert
79 | ### the rpart objects to party objects.
80 | 
81 | # library(partykit)
82 | #
83 | # cartTree <- as.party(cartTune$finalModel)
84 | # plot(cartTree)
85 | 
86 | ### Get the variable importance. 'competes' is an argument that
87 | ### controls whether splits not used in the tree should be included
88 | ### in the importance calculations.
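### Optional comparison (not from the book text): setting competes = TRUE also
### credits predictors that appear in competing (unchosen) splits; contrast it
### with the competes = FALSE importance computed just below.
### varImp(cartTune, scale = FALSE, competes = TRUE)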
89 | 90 | cartImp <- varImp(cartTune, scale = FALSE, competes = FALSE) 91 | cartImp 92 | 93 | ### Save the test set results in a data frame 94 | testResults <- data.frame(obs = solTestY, 95 | CART = predict(cartTune, solTestXtrans)) 96 | 97 | ### Tune the conditional inference tree 98 | 99 | cGrid <- data.frame(mincriterion = sort(c(.95, seq(.75, .99, length = 2)))) 100 | 101 | set.seed(100) 102 | ctreeTune <- train(x = solTrainXtrans, y = solTrainY, 103 | method = "ctree", 104 | tuneGrid = cGrid, 105 | trControl = ctrl) 106 | ctreeTune 107 | plot(ctreeTune) 108 | 109 | ##ctreeTune$finalModel 110 | plot(ctreeTune$finalModel) 111 | 112 | testResults$cTree <- predict(ctreeTune, solTestXtrans) 113 | 114 | ################################################################################ 115 | ### Section 8.2 Regression Model Trees and 8.3 Rule-Based Models 116 | 117 | ### Tune the model tree. Using method = "M5" actually tunes over the 118 | ### tree- and rule-based versions of the model. M = 10 is also passed 119 | ### in to make sure that there are larger terminal nodes for the 120 | ### regression models. 121 | 122 | set.seed(100) 123 | m5Tune <- train(x = solTrainXtrans, y = solTrainY, 124 | method = "M5", 125 | trControl = ctrl, 126 | control = Weka_control(M = 10)) 127 | m5Tune 128 | 129 | plot(m5Tune) 130 | 131 | ## m5Tune$finalModel 132 | 133 | ## plot(m5Tune$finalModel) 134 | 135 | ### Show the rule-based model too 136 | 137 | ruleFit <- M5Rules(y~., data = trainData, control = Weka_control(M = 10)) 138 | ruleFit 139 | 140 | ################################################################################ 141 | ### Section 8.4 Bagged Trees 142 | 143 | ### Optional: parallel processing can be used via the 'do' packages, 144 | ### such as doMC, doMPI etc. We used doMC (not on Windows) to speed 145 | ### up the computations. 146 | 147 | ### WARNING: Be aware of how much memory is needed to parallel 148 | ### process. It can very quickly overwhelm the available hardware. The 149 | ### estimate of the median memory usage (VSIZE = total memory size) 150 | ### was 9706M for a core, but could range up to 9706M. This becomes 151 | ### severe when parallelizing randomForest() and (especially) calls 152 | ### to cforest(). 153 | 154 | ### WARNING 2: The RWeka package does not work well with some forms of 155 | ### parallel processing, such as mutlicore (i.e. doMC). 
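### Hedged note (not from the book text): if the RWeka-based fits here
### (method = "M5", M5Rules()) misbehave under a forked backend, one option is
### to run them after dropping back to sequential execution via
### foreach::registerDoSEQ(), as the concrete case study in Chapter 10 does.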
156 | 157 | library(doMC) 158 | registerDoMC(5) 159 | 160 | set.seed(100) 161 | 162 | treebagTune <- train(x = solTrainXtrans, y = solTrainY, 163 | method = "treebag", 164 | nbagg = 50, 165 | trControl = ctrl) 166 | 167 | treebagTune 168 | 169 | ################################################################################ 170 | ### Section 8.5 Random Forests 171 | 172 | mtryGrid <- data.frame(mtry = floor(seq(10, ncol(solTrainXtrans), length = 10))) 173 | 174 | 175 | ### Tune the model using cross-validation 176 | set.seed(100) 177 | rfTune <- train(x = solTrainXtrans, y = solTrainY, 178 | method = "rf", 179 | tuneGrid = mtryGrid, 180 | ntree = 1000, 181 | importance = TRUE, 182 | trControl = ctrl) 183 | rfTune 184 | 185 | plot(rfTune) 186 | 187 | rfImp <- varImp(rfTune, scale = FALSE) 188 | rfImp 189 | 190 | ### Tune the model using the OOB estimates 191 | ctrlOOB <- trainControl(method = "oob") 192 | set.seed(100) 193 | rfTuneOOB <- train(x = solTrainXtrans, y = solTrainY, 194 | method = "rf", 195 | tuneGrid = mtryGrid, 196 | ntree = 1000, 197 | importance = TRUE, 198 | trControl = ctrlOOB) 199 | rfTuneOOB 200 | 201 | plot(rfTuneOOB) 202 | 203 | ### Tune the conditional inference forests 204 | set.seed(100) 205 | condrfTune <- train(x = solTrainXtrans, y = solTrainY, 206 | method = "cforest", 207 | tuneGrid = mtryGrid, 208 | controls = cforest_unbiased(ntree = 1000), 209 | trControl = ctrl) 210 | condrfTune 211 | 212 | plot(condrfTune) 213 | 214 | set.seed(100) 215 | condrfTuneOOB <- train(x = solTrainXtrans, y = solTrainY, 216 | method = "cforest", 217 | tuneGrid = mtryGrid, 218 | controls = cforest_unbiased(ntree = 1000), 219 | trControl = trainControl(method = "oob")) 220 | condrfTuneOOB 221 | 222 | plot(condrfTuneOOB) 223 | 224 | ################################################################################ 225 | ### Section 8.6 Boosting 226 | 227 | gbmGrid <- expand.grid(interaction.depth = seq(1, 7, by = 2), 228 | n.trees = seq(100, 1000, by = 50), 229 | shrinkage = c(0.01, 0.1)) 230 | set.seed(100) 231 | gbmTune <- train(x = solTrainXtrans, y = solTrainY, 232 | method = "gbm", 233 | tuneGrid = gbmGrid, 234 | trControl = ctrl, 235 | verbose = FALSE) 236 | gbmTune 237 | 238 | plot(gbmTune, auto.key = list(columns = 4, lines = TRUE)) 239 | 240 | gbmImp <- varImp(gbmTune, scale = FALSE) 241 | gbmImp 242 | 243 | ################################################################################ 244 | ### Section 8.7 Cubist 245 | 246 | cbGrid <- expand.grid(committees = c(1:10, 20, 50, 75, 100), 247 | neighbors = c(0, 1, 5, 9)) 248 | 249 | set.seed(100) 250 | cubistTune <- train(solTrainXtrans, solTrainY, 251 | "cubist", 252 | tuneGrid = cbGrid, 253 | trControl = ctrl) 254 | cubistTune 255 | 256 | plot(cubistTune, auto.key = list(columns = 4, lines = TRUE)) 257 | 258 | cbImp <- varImp(cubistTune, scale = FALSE) 259 | cbImp 260 | 261 | ################################################################################ 262 | ### Session Information 263 | 264 | sessionInfo() 265 | 266 | q("no") 267 | -------------------------------------------------------------------------------- /inst/chapters/10_Case_Study_Concrete.R: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | ### R code from Applied Predictive Modeling (2013) by Kuhn and Johnson. 
3 | ### Copyright 2013 Kuhn and Johnson
4 | ### Web Page: http://www.appliedpredictivemodeling.com
5 | ### Contact: Max Kuhn (mxkuhn@gmail.com)
6 | ###
7 | ### Chapter 10: Case Study: Compressive Strength of Concrete Mixtures
8 | ###
9 | ### Required packages: AppliedPredictiveModeling, caret, Cubist, doMC (optional),
10 | ###                    earth, elasticnet, gbm, ipred, lattice, nnet, party, pls,
11 | ###                    randomForest, rpart, RWeka
12 | ###
13 | ### Data used: The concrete data from the AppliedPredictiveModeling package
14 | ###
15 | ### Notes:
16 | ### 1) This code is provided without warranty.
17 | ###
18 | ### 2) This code should help the user reproduce the results in the
19 | ### text. There will be differences between this code and what is in
20 | ### the computing section. For example, the computing sections show
21 | ### how the source functions work (e.g. randomForest() or plsr()),
22 | ### which were not directly used when creating the book. Also, there may be
23 | ### syntax differences that occur over time as packages evolve. These files
24 | ### will reflect those changes.
25 | ###
26 | ### 3) In some cases, the calculations in the book were run in
27 | ### parallel. The sub-processes may reset the random number seed.
28 | ### Your results may slightly vary.
29 | ###
30 | ################################################################################
31 | 
32 | ################################################################################
33 | ### Load and plot the data
34 | 
35 | library(AppliedPredictiveModeling)
36 | data(concrete)
37 | 
38 | library(caret)
39 | library(plyr)
40 | 
41 | featurePlot(concrete[, -9], concrete$CompressiveStrength,
42 |             between = list(x = 1, y = 1),
43 |             type = c("g", "p", "smooth"))
44 | 
45 | 
46 | ################################################################################
47 | ### Section 10.1 Model Building Strategy
48 | ### There are replicated mixtures, so take the average per mixture
49 | 
50 | averaged <- ddply(mixtures,
51 |                   .(Cement, BlastFurnaceSlag, FlyAsh, Water,
52 |                     Superplasticizer, CoarseAggregate,
53 |                     FineAggregate, Age),
54 |                   function(x) c(CompressiveStrength =
55 |                                 mean(x$CompressiveStrength)))
56 | 
57 | ### Split the data and create a control object for train()
58 | 
59 | set.seed(975)
60 | inTrain <- createDataPartition(averaged$CompressiveStrength, p = 3/4)[[1]]
61 | training <- averaged[ inTrain,]
62 | testing <- averaged[-inTrain,]
63 | 
64 | ctrl <- trainControl(method = "repeatedcv", repeats = 5, number = 10)
65 | 
66 | ### Create a model formula that can be used repeatedly
67 | 
68 | modForm <- paste("CompressiveStrength ~ (.)^2 + I(Cement^2) + I(BlastFurnaceSlag^2) +",
69 |                  "I(FlyAsh^2) + I(Water^2) + I(Superplasticizer^2) +",
70 |                  "I(CoarseAggregate^2) + I(FineAggregate^2) + I(Age^2)")
71 | modForm <- as.formula(modForm)
72 | 
73 | ### Fit the various models
74 | 
75 | ### Optional: parallel processing can be used via the 'do' packages,
76 | ### such as doMC, doMPI etc. We used doMC (not on Windows) to speed
77 | ### up the computations.
78 | 
79 | ### WARNING: Be aware of how much memory is needed to parallel
80 | ### process. It can very quickly overwhelm the available hardware. The
81 | ### estimate of the median memory usage (VSIZE = total memory size)
82 | ### was 2800M for a core although the M5 calculations require about
83 | ### 3700M without parallel processing.
84 | 
85 | ### WARNING 2: The RWeka package does not work well with some forms of
86 | ### parallel processing, such as multicore (i.e. doMC).
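### Optional sanity check (a hedged sketch, not from the book): before the long
### model fits start, confirm how many model terms the quadratic-plus-interaction
### formula above expands to.
### ncol(model.matrix(modForm, data = training))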
87 | 
88 | library(doMC)
89 | registerDoMC(14)
90 | 
91 | set.seed(669)
92 | lmFit <- train(modForm, data = training,
93 |                method = "lm",
94 |                trControl = ctrl)
95 | 
96 | set.seed(669)
97 | plsFit <- train(modForm, data = training,
98 |                 method = "pls",
99 |                 preProc = c("center", "scale"),
100 |                 tuneLength = 15,
101 |                 trControl = ctrl)
102 | 
103 | lassoGrid <- expand.grid(lambda = c(0, .001, .01, .1),
104 |                          fraction = seq(0.05, 1, length = 20))
105 | set.seed(669)
106 | lassoFit <- train(modForm, data = training,
107 |                   method = "enet",
108 |                   preProc = c("center", "scale"),
109 |                   tuneGrid = lassoGrid,
110 |                   trControl = ctrl)
111 | 
112 | set.seed(669)
113 | earthFit <- train(CompressiveStrength ~ ., data = training,
114 |                   method = "earth",
115 |                   tuneGrid = expand.grid(degree = 1,
116 |                                          nprune = 2:25),
117 |                   trControl = ctrl)
118 | 
119 | set.seed(669)
120 | svmRFit <- train(CompressiveStrength ~ ., data = training,
121 |                  method = "svmRadial",
122 |                  tuneLength = 15,
123 |                  preProc = c("center", "scale"),
124 |                  trControl = ctrl)
125 | 
126 | 
127 | nnetGrid <- expand.grid(decay = c(0.001, .01, .1),
128 |                         size = seq(1, 27, by = 2),
129 |                         bag = FALSE)
130 | set.seed(669)
131 | nnetFit <- train(CompressiveStrength ~ .,
132 |                  data = training,
133 |                  method = "avNNet",
134 |                  tuneGrid = nnetGrid,
135 |                  preProc = c("center", "scale"),
136 |                  linout = TRUE,
137 |                  trace = FALSE,
138 |                  maxit = 1000,
139 |                  allowParallel = FALSE,
140 |                  trControl = ctrl)
141 | 
142 | set.seed(669)
143 | rpartFit <- train(CompressiveStrength ~ .,
144 |                   data = training,
145 |                   method = "rpart",
146 |                   tuneLength = 30,
147 |                   trControl = ctrl)
148 | 
149 | set.seed(669)
150 | treebagFit <- train(CompressiveStrength ~ .,
151 |                     data = training,
152 |                     method = "treebag",
153 |                     trControl = ctrl)
154 | 
155 | set.seed(669)
156 | ctreeFit <- train(CompressiveStrength ~ .,
157 |                   data = training,
158 |                   method = "ctree",
159 |                   tuneLength = 10,
160 |                   trControl = ctrl)
161 | 
162 | set.seed(669)
163 | rfFit <- train(CompressiveStrength ~ .,
164 |                data = training,
165 |                method = "rf",
166 |                tuneLength = 10,
167 |                ntree = 1000,
168 |                importance = TRUE,
169 |                trControl = ctrl)
170 | 
171 | 
172 | gbmGrid <- expand.grid(interaction.depth = seq(1, 7, by = 2),
173 |                        n.trees = seq(100, 1000, by = 50),
174 |                        shrinkage = c(0.01, 0.1))
175 | set.seed(669)
176 | gbmFit <- train(CompressiveStrength ~ .,
177 |                 data = training,
178 |                 method = "gbm",
179 |                 tuneGrid = gbmGrid,
180 |                 verbose = FALSE,
181 |                 trControl = ctrl)
182 | 
183 | 
184 | cbGrid <- expand.grid(committees = c(1, 5, 10, 50, 75, 100),
185 |                       neighbors = c(0, 1, 3, 5, 7, 9))
186 | set.seed(669)
187 | cbFit <- train(CompressiveStrength ~ .,
188 |                data = training,
189 |                method = "cubist",
190 |                tuneGrid = cbGrid,
191 |                trControl = ctrl)
192 | 
193 | ### Turn off the parallel processing to use RWeka.
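### Hedged note (not from the book text): registerDoSEQ() below re-registers
### foreach's default sequential backend, so the M5 fit runs in a single
### process and avoids the RWeka/multicore issue flagged above.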
194 | registerDoSEQ() 195 | 196 | 197 | set.seed(669) 198 | mtFit <- train(CompressiveStrength ~ ., 199 | data = training, 200 | method = "M5", 201 | trControl = ctrl) 202 | 203 | ################################################################################ 204 | ### Section 10.2 Model Performance 205 | 206 | ### Collect the resampling statistics across all the models 207 | 208 | rs <- resamples(list("Linear Reg" = lmFit, " 209 | PLS" = plsFit, 210 | "Elastic Net" = lassoFit, 211 | MARS = earthFit, 212 | SVM = svmRFit, 213 | "Neural Networks" = nnetFit, 214 | CART = rpartFit, 215 | "Cond Inf Tree" = ctreeFit, 216 | "Bagged Tree" = treebagFit, 217 | "Boosted Tree" = gbmFit, 218 | "Random Forest" = rfFit, 219 | Cubist = cbFit)) 220 | 221 | #parallelPlot(rs) 222 | #parallelPlot(rs, metric = "Rsquared") 223 | 224 | ### Get the test set results across several models 225 | 226 | nnetPred <- predict(nnetFit, testing) 227 | gbmPred <- predict(gbmFit, testing) 228 | cbPred <- predict(cbFit, testing) 229 | 230 | testResults <- rbind(postResample(nnetPred, testing$CompressiveStrength), 231 | postResample(gbmPred, testing$CompressiveStrength), 232 | postResample(cbPred, testing$CompressiveStrength)) 233 | testResults <- as.data.frame(testResults) 234 | testResults$Model <- c("Neural Networks", "Boosted Tree", "Cubist") 235 | testResults <- testResults[order(testResults$RMSE),] 236 | 237 | ################################################################################ 238 | ### Section 10.3 Optimizing Compressive Strength 239 | 240 | library(proxy) 241 | 242 | ### Create a function to maximize compressive strength* while keeping 243 | ### the predictor values as mixtures. Water (in x[7]) is used as the 244 | ### 'slack variable'. 245 | 246 | ### * We are actually minimizing the negative compressive strength 247 | 248 | modelPrediction <- function(x, mod, limit = 2500) 249 | { 250 | if(x[1] < 0 | x[1] > 1) return(10^38) 251 | if(x[2] < 0 | x[2] > 1) return(10^38) 252 | if(x[3] < 0 | x[3] > 1) return(10^38) 253 | if(x[4] < 0 | x[4] > 1) return(10^38) 254 | if(x[5] < 0 | x[5] > 1) return(10^38) 255 | if(x[6] < 0 | x[6] > 1) return(10^38) 256 | 257 | x <- c(x, 1 - sum(x)) 258 | 259 | if(x[7] < 0.05) return(10^38) 260 | 261 | tmp <- as.data.frame(t(x)) 262 | names(tmp) <- c('Cement','BlastFurnaceSlag','FlyAsh', 263 | 'Superplasticizer','CoarseAggregate', 264 | 'FineAggregate', 'Water') 265 | tmp$Age <- 28 266 | -predict(mod, tmp) 267 | } 268 | 269 | ### Get mixtures at 28 days 270 | subTrain <- subset(training, Age == 28) 271 | 272 | ### Center and scale the data to use dissimilarity sampling 273 | pp1 <- preProcess(subTrain[, -(8:9)], c("center", "scale")) 274 | scaledTrain <- predict(pp1, subTrain[, 1:7]) 275 | 276 | ### Randomly select a few mixtures as a starting pool 277 | 278 | set.seed(91) 279 | startMixture <- sample(1:nrow(subTrain), 1) 280 | starters <- scaledTrain[startMixture, 1:7] 281 | pool <- scaledTrain 282 | index <- maxDissim(starters, pool, 14) 283 | startPoints <- c(startMixture, index) 284 | 285 | starters <- subTrain[startPoints,1:7] 286 | startingValues <- starters[, -4] 287 | 288 | ### For each starting mixture, optimize the Cubist model using 289 | ### a simplex search routine 290 | 291 | cbResults <- startingValues 292 | cbResults$Water <- NA 293 | cbResults$Prediction <- NA 294 | 295 | for(i in 1:nrow(cbResults)) 296 | { 297 | results <- optim(unlist(cbResults[i,1:6]), 298 | modelPrediction, 299 | method = "Nelder-Mead", 300 | control=list(maxit=5000), 301 | mod = cbFit) 302 | 
cbResults$Prediction[i] <- -results$value 303 | cbResults[i,1:6] <- results$par 304 | } 305 | cbResults$Water <- 1 - apply(cbResults[,1:6], 1, sum) 306 | cbResults <- subset(cbResults, Prediction > 0 & Water > .02) 307 | cbResults <- cbResults[order(-cbResults$Prediction),][1:3,] 308 | cbResults$Model <- "Cubist" 309 | 310 | ### Do the same for the neural network model 311 | 312 | nnetResults <- startingValues 313 | nnetResults$Water <- NA 314 | nnetResults$Prediction <- NA 315 | 316 | for(i in 1:nrow(nnetResults)) 317 | { 318 | results <- optim(unlist(nnetResults[i, 1:6,]), 319 | modelPrediction, 320 | method = "Nelder-Mead", 321 | control=list(maxit=5000), 322 | mod = nnetFit) 323 | nnetResults$Prediction[i] <- -results$value 324 | nnetResults[i,1:6] <- results$par 325 | } 326 | nnetResults$Water <- 1 - apply(nnetResults[,1:6], 1, sum) 327 | nnetResults <- subset(nnetResults, Prediction > 0 & Water > .02) 328 | nnetResults <- nnetResults[order(-nnetResults$Prediction),][1:3,] 329 | nnetResults$Model <- "NNet" 330 | 331 | ### Convert the predicted mixtures to PCA space and plot 332 | 333 | pp2 <- preProcess(subTrain[, 1:7], "pca") 334 | pca1 <- predict(pp2, subTrain[, 1:7]) 335 | pca1$Data <- "Training Set" 336 | pca1$Data[startPoints] <- "Starting Values" 337 | pca3 <- predict(pp2, cbResults[, names(subTrain[, 1:7])]) 338 | pca3$Data <- "Cubist" 339 | pca4 <- predict(pp2, nnetResults[, names(subTrain[, 1:7])]) 340 | pca4$Data <- "Neural Network" 341 | 342 | pcaData <- rbind(pca1, pca3, pca4) 343 | pcaData$Data <- factor(pcaData$Data, 344 | levels = c("Training Set","Starting Values", 345 | "Cubist","Neural Network")) 346 | 347 | lim <- extendrange(pcaData[, 1:2]) 348 | 349 | xyplot(PC2 ~ PC1, 350 | data = pcaData, 351 | groups = Data, 352 | auto.key = list(columns = 2), 353 | xlim = lim, 354 | ylim = lim, 355 | type = c("g", "p")) 356 | 357 | 358 | ################################################################################ 359 | ### Session Information 360 | 361 | sessionInfo() 362 | 363 | q("no") 364 | -------------------------------------------------------------------------------- /inst/chapters/11_Class_Performance.R: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | ### R code from Applied Predictive Modeling (2013) by Kuhn and Johnson. 3 | ### Copyright 2013 Kuhn and Johnson 4 | ### Web Page: http://www.appliedpredictivemodeling.com 5 | ### Contact: Max Kuhn (mxkuhn@gmail.com) 6 | ### 7 | ### Chapter 11: Measuring Performance in Classification Models 8 | ### 9 | ### Required packages: AppliedPredictiveModeling, caret, MASS, randomForest, 10 | ### pROC, klaR 11 | ### 12 | ### Data used: The solubility from the AppliedPredictiveModeling package 13 | ### 14 | ### Notes: 15 | ### 1) This code is provided without warranty. 16 | ### 17 | ### 2) This code should help the user reproduce the results in the 18 | ### text. There will be differences between this code and what is is 19 | ### the computing section. For example, the computing sections show 20 | ### how the source functions work (e.g. randomForest() or plsr()), 21 | ### which were not directly used when creating the book. Also, there may be 22 | ### syntax differences that occur over time as packages evolve. These files 23 | ### will reflect those changes. 24 | ### 25 | ### 3) In some cases, the calculations in the book were run in 26 | ### parallel. The sub-processes may reset the random number seed. 
27 | ### Your results may slightly vary. 28 | ### 29 | ################################################################################ 30 | 31 | ################################################################################ 32 | ### Section 11.1 Class Predictions 33 | 34 | library(AppliedPredictiveModeling) 35 | 36 | ### Simulate some two class data with two predictors 37 | set.seed(975) 38 | training <- quadBoundaryFunc(500) 39 | testing <- quadBoundaryFunc(1000) 40 | testing$class2 <- ifelse(testing$class == "Class1", 1, 0) 41 | testing$ID <- 1:nrow(testing) 42 | 43 | ### Fit models 44 | library(MASS) 45 | qdaFit <- qda(class ~ X1 + X2, data = training) 46 | library(randomForest) 47 | rfFit <- randomForest(class ~ X1 + X2, data = training, ntree = 2000) 48 | 49 | ### Predict the test set 50 | testing$qda <- predict(qdaFit, testing)$posterior[,1] 51 | testing$rf <- predict(rfFit, testing, type = "prob")[,1] 52 | 53 | 54 | ### Generate the calibration analysis 55 | library(caret) 56 | calData1 <- calibration(class ~ qda + rf, data = testing, cuts = 10) 57 | 58 | ### Plot the curve 59 | xyplot(calData1, auto.key = list(columns = 2)) 60 | 61 | ### To calibrate the data, treat the probabilities as inputs into the 62 | ### model 63 | 64 | trainProbs <- training 65 | trainProbs$qda <- predict(qdaFit)$posterior[,1] 66 | 67 | ### These models take the probabilities as inputs and, based on the 68 | ### true class, re-calibrate them. 69 | library(klaR) 70 | nbCal <- NaiveBayes(class ~ qda, data = trainProbs, usekernel = TRUE) 71 | 72 | ### We use relevel() here because glm() models the probability of the 73 | ### second factor level. 74 | lrCal <- glm(relevel(class, "Class2") ~ qda, data = trainProbs, family = binomial) 75 | 76 | ### Now re-predict the test set using the modified class probability 77 | ### estimates 78 | testing$qda2 <- predict(nbCal, testing[, "qda", drop = FALSE])$posterior[,1] 79 | testing$qda3 <- predict(lrCal, testing[, "qda", drop = FALSE], type = "response") 80 | 81 | 82 | ### Manipulate the data a bit for pretty plotting 83 | simulatedProbs <- testing[, c("class", "rf", "qda3")] 84 | names(simulatedProbs) <- c("TrueClass", "RandomForestProb", "QDACalibrated") 85 | simulatedProbs$RandomForestClass <- predict(rfFit, testing) 86 | 87 | calData2 <- calibration(class ~ qda + qda2 + qda3, data = testing) 88 | calData2$data$calibModelVar <- as.character(calData2$data$calibModelVar) 89 | calData2$data$calibModelVar <- ifelse(calData2$data$calibModelVar == "qda", 90 | "QDA", 91 | calData2$data$calibModelVar) 92 | calData2$data$calibModelVar <- ifelse(calData2$data$calibModelVar == "qda2", 93 | "Bayesian Calibration", 94 | calData2$data$calibModelVar) 95 | 96 | calData2$data$calibModelVar <- ifelse(calData2$data$calibModelVar == "qda3", 97 | "Sigmoidal Calibration", 98 | calData2$data$calibModelVar) 99 | 100 | calData2$data$calibModelVar <- factor(calData2$data$calibModelVar, 101 | levels = c("QDA", 102 | "Bayesian Calibration", 103 | "Sigmoidal Calibration")) 104 | 105 | xyplot(calData2, auto.key = list(columns = 1)) 106 | 107 | ### Recreate the model used in the over-fitting chapter 108 | 109 | library(caret) 110 | data(GermanCredit) 111 | 112 | ## First, remove near-zero variance predictors then get rid of a few predictors 113 | ## that duplicate values. For example, there are two possible values for the 114 | ## housing variable: "Rent", "Own" and "ForFree". So that we don't have linear 115 | ## dependencies, we get rid of one of the levels (e.g. 
"ForFree") 116 | 117 | GermanCredit <- GermanCredit[, -nearZeroVar(GermanCredit)] 118 | GermanCredit$CheckingAccountStatus.lt.0 <- NULL 119 | GermanCredit$SavingsAccountBonds.lt.100 <- NULL 120 | GermanCredit$EmploymentDuration.lt.1 <- NULL 121 | GermanCredit$EmploymentDuration.Unemployed <- NULL 122 | GermanCredit$Personal.Male.Married.Widowed <- NULL 123 | GermanCredit$Property.Unknown <- NULL 124 | GermanCredit$Housing.ForFree <- NULL 125 | 126 | ## Split the data into training (80%) and test sets (20%) 127 | set.seed(100) 128 | inTrain <- createDataPartition(GermanCredit$Class, p = .8)[[1]] 129 | GermanCreditTrain <- GermanCredit[ inTrain, ] 130 | GermanCreditTest <- GermanCredit[-inTrain, ] 131 | 132 | set.seed(1056) 133 | logisticReg <- train(Class ~ ., 134 | data = GermanCreditTrain, 135 | method = "glm", 136 | trControl = trainControl(method = "repeatedcv", 137 | repeats = 5)) 138 | logisticReg 139 | 140 | ### Predict the test set 141 | creditResults <- data.frame(obs = GermanCreditTest$Class) 142 | creditResults$prob <- predict(logisticReg, GermanCreditTest, type = "prob")[, "Bad"] 143 | creditResults$pred <- predict(logisticReg, GermanCreditTest) 144 | creditResults$Label <- ifelse(creditResults$obs == "Bad", 145 | "True Outcome: Bad Credit", 146 | "True Outcome: Good Credit") 147 | 148 | ### Plot the probability of bad credit 149 | histogram(~prob|Label, 150 | data = creditResults, 151 | layout = c(2, 1), 152 | nint = 20, 153 | xlab = "Probability of Bad Credit", 154 | type = "count") 155 | 156 | ### Calculate and plot the calibration curve 157 | creditCalib <- calibration(obs ~ prob, data = creditResults) 158 | xyplot(creditCalib) 159 | 160 | ### Create the confusion matrix from the test set. 161 | confusionMatrix(data = creditResults$pred, 162 | reference = creditResults$obs) 163 | 164 | ### ROC curves: 165 | 166 | ### Like glm(), roc() treats the last level of the factor as the event 167 | ### of interest so we use relevel() to change the observed class data 168 | 169 | library(pROC) 170 | creditROC <- roc(relevel(creditResults$obs, "Good"), creditResults$prob) 171 | 172 | coords(creditROC, "all")[,1:3] 173 | 174 | auc(creditROC) 175 | ci.auc(creditROC) 176 | 177 | ### Note the x-axis is reversed 178 | plot(creditROC) 179 | 180 | ### Old-school: 181 | plot(creditROC, legacy.axes = TRUE) 182 | 183 | ### Lift charts 184 | 185 | creditLift <- lift(obs ~ prob, data = creditResults) 186 | xyplot(creditLift) 187 | 188 | 189 | ################################################################################ 190 | ### Session Information 191 | 192 | sessionInfo() 193 | 194 | q("no") 195 | -------------------------------------------------------------------------------- /inst/chapters/11_Class_Performance.Rout: -------------------------------------------------------------------------------- 1 | 2 | R version 3.0.1 (2013-05-16) -- "Good Sport" 3 | Copyright (C) 2013 The R Foundation for Statistical Computing 4 | Platform: x86_64-apple-darwin10.8.0 (64-bit) 5 | 6 | R is free software and comes with ABSOLUTELY NO WARRANTY. 7 | You are welcome to redistribute it under certain conditions. 8 | Type 'license()' or 'licence()' for distribution details. 9 | 10 | Natural language support but running in an English locale 11 | 12 | R is a collaborative project with many contributors. 13 | Type 'contributors()' for more information and 14 | 'citation()' on how to cite R or R packages in publications. 
15 | 16 | Type 'demo()' for some demos, 'help()' for on-line help, or 17 | 'help.start()' for an HTML browser interface to help. 18 | Type 'q()' to quit R. 19 | 20 | > ################################################################################ 21 | > ### R code from Applied Predictive Modeling (2013) by Kuhn and Johnson. 22 | > ### Copyright 2013 Kuhn and Johnson 23 | > ### Web Page: http://www.appliedpredictivemodeling.com 24 | > ### Contact: Max Kuhn (mxkuhn@gmail.com) 25 | > ### 26 | > ### Chapter 11: Measuring Performance in Classification Models 27 | > ### 28 | > ### Required packages: AppliedPredictiveModeling, caret, MASS, randomForest, 29 | > ### pROC, klaR 30 | > ### 31 | > ### Data used: The solubility from the AppliedPredictiveModeling package 32 | > ### 33 | > ### Notes: 34 | > ### 1) This code is provided without warranty. 35 | > ### 36 | > ### 2) This code should help the user reproduce the results in the 37 | > ### text. There will be differences between this code and what is is 38 | > ### the computing section. For example, the computing sections show 39 | > ### how the source functions work (e.g. randomForest() or plsr()), 40 | > ### which were not directly used when creating the book. Also, there may be 41 | > ### syntax differences that occur over time as packages evolve. These files 42 | > ### will reflect those changes. 43 | > ### 44 | > ### 3) In some cases, the calculations in the book were run in 45 | > ### parallel. The sub-processes may reset the random number seed. 46 | > ### Your results may slightly vary. 47 | > ### 48 | > ################################################################################ 49 | > 50 | > ################################################################################ 51 | > ### Section 11.1 Class Predictions 52 | > 53 | > library(AppliedPredictiveModeling) 54 | > 55 | > ### Simulate some two class data with two predictors 56 | > set.seed(975) 57 | > training <- quadBoundaryFunc(500) 58 | > testing <- quadBoundaryFunc(1000) 59 | > testing$class2 <- ifelse(testing$class == "Class1", 1, 0) 60 | > testing$ID <- 1:nrow(testing) 61 | > 62 | > ### Fit models 63 | > library(MASS) 64 | > qdaFit <- qda(class ~ X1 + X2, data = training) 65 | > library(randomForest) 66 | randomForest 4.6-7 67 | Type rfNews() to see new features/changes/bug fixes. 68 | > rfFit <- randomForest(class ~ X1 + X2, data = training, ntree = 2000) 69 | > 70 | > ### Predict the test set 71 | > testing$qda <- predict(qdaFit, testing)$posterior[,1] 72 | > testing$rf <- predict(rfFit, testing, type = "prob")[,1] 73 | > 74 | > 75 | > ### Generate the calibration analysis 76 | > library(caret) 77 | Loading required package: lattice 78 | Loading required package: ggplot2 79 | > calData1 <- calibration(class ~ qda + rf, data = testing, cuts = 10) 80 | > 81 | > ### Plot the curve 82 | > xyplot(calData1, auto.key = list(columns = 2)) 83 | > 84 | > ### To calibrate the data, treat the probabilities as inputs into the 85 | > ### model 86 | > 87 | > trainProbs <- training 88 | > trainProbs$qda <- predict(qdaFit)$posterior[,1] 89 | > 90 | > ### These models take the probabilities as inputs and, based on the 91 | > ### true class, re-calibrate them. 92 | > library(klaR) 93 | > nbCal <- NaiveBayes(class ~ qda, data = trainProbs, usekernel = TRUE) 94 | > 95 | > ### We use relevel() here because glm() models the probability of the 96 | > ### second factor level. 
97 | > lrCal <- glm(relevel(class, "Class2") ~ qda, data = trainProbs, family = binomial) 98 | > 99 | > ### Now re-predict the test set using the modified class probability 100 | > ### estimates 101 | > testing$qda2 <- predict(nbCal, testing[, "qda", drop = FALSE])$posterior[,1] 102 | > testing$qda3 <- predict(lrCal, testing[, "qda", drop = FALSE], type = "response") 103 | > 104 | > 105 | > ### Manipulate the data a bit for pretty plotting 106 | > simulatedProbs <- testing[, c("class", "rf", "qda3")] 107 | > names(simulatedProbs) <- c("TrueClass", "RandomForestProb", "QDACalibrated") 108 | > simulatedProbs$RandomForestClass <- predict(rfFit, testing) 109 | > 110 | > calData2 <- calibration(class ~ qda + qda2 + qda3, data = testing) 111 | > calData2$data$calibModelVar <- as.character(calData2$data$calibModelVar) 112 | > calData2$data$calibModelVar <- ifelse(calData2$data$calibModelVar == "qda", 113 | + "QDA", 114 | + calData2$data$calibModelVar) 115 | > calData2$data$calibModelVar <- ifelse(calData2$data$calibModelVar == "qda2", 116 | + "Bayesian Calibration", 117 | + calData2$data$calibModelVar) 118 | > 119 | > calData2$data$calibModelVar <- ifelse(calData2$data$calibModelVar == "qda3", 120 | + "Sigmoidal Calibration", 121 | + calData2$data$calibModelVar) 122 | > 123 | > calData2$data$calibModelVar <- factor(calData2$data$calibModelVar, 124 | + levels = c("QDA", 125 | + "Bayesian Calibration", 126 | + "Sigmoidal Calibration")) 127 | > 128 | > xyplot(calData2, auto.key = list(columns = 1)) 129 | > 130 | > ### Recreate the model used in the over-fitting chapter 131 | > 132 | > library(caret) 133 | > data(GermanCredit) 134 | > 135 | > ## First, remove near-zero variance predictors then get rid of a few predictors 136 | > ## that duplicate values. For example, there are two possible values for the 137 | > ## housing variable: "Rent", "Own" and "ForFree". So that we don't have linear 138 | > ## dependencies, we get rid of one of the levels (e.g. "ForFree") 139 | > 140 | > GermanCredit <- GermanCredit[, -nearZeroVar(GermanCredit)] 141 | > GermanCredit$CheckingAccountStatus.lt.0 <- NULL 142 | > GermanCredit$SavingsAccountBonds.lt.100 <- NULL 143 | > GermanCredit$EmploymentDuration.lt.1 <- NULL 144 | > GermanCredit$EmploymentDuration.Unemployed <- NULL 145 | > GermanCredit$Personal.Male.Married.Widowed <- NULL 146 | > GermanCredit$Property.Unknown <- NULL 147 | > GermanCredit$Housing.ForFree <- NULL 148 | > 149 | > ## Split the data into training (80%) and test sets (20%) 150 | > set.seed(100) 151 | > inTrain <- createDataPartition(GermanCredit$Class, p = .8)[[1]] 152 | > GermanCreditTrain <- GermanCredit[ inTrain, ] 153 | > GermanCreditTest <- GermanCredit[-inTrain, ] 154 | > 155 | > set.seed(1056) 156 | > logisticReg <- train(Class ~ ., 157 | + data = GermanCreditTrain, 158 | + method = "glm", 159 | + trControl = trainControl(method = "repeatedcv", 160 | + repeats = 5)) 161 | Loading required package: class 162 | > logisticReg 163 | Generalized Linear Model 164 | 165 | 800 samples 166 | 41 predictors 167 | 2 classes: 'Bad', 'Good' 168 | 169 | No pre-processing 170 | Resampling: Cross-Validated (10 fold, repeated 5 times) 171 | 172 | Summary of sample sizes: 720, 720, 720, 720, 720, 720, ... 
173 | 174 | Resampling results 175 | 176 | Accuracy Kappa Accuracy SD Kappa SD 177 | 0.749 0.365 0.0516 0.122 178 | 179 | 180 | > 181 | > ### Predict the test set 182 | > creditResults <- data.frame(obs = GermanCreditTest$Class) 183 | > creditResults$prob <- predict(logisticReg, GermanCreditTest, type = "prob")[, "Bad"] 184 | > creditResults$pred <- predict(logisticReg, GermanCreditTest) 185 | > creditResults$Label <- ifelse(creditResults$obs == "Bad", 186 | + "True Outcome: Bad Credit", 187 | + "True Outcome: Good Credit") 188 | > 189 | > ### Plot the probability of bad credit 190 | > histogram(~prob|Label, 191 | + data = creditResults, 192 | + layout = c(2, 1), 193 | + nint = 20, 194 | + xlab = "Probability of Bad Credit", 195 | + type = "count") 196 | > 197 | > ### Calculate and plot the calibration curve 198 | > creditCalib <- calibration(obs ~ prob, data = creditResults) 199 | > xyplot(creditCalib) 200 | > 201 | > ### Create the confusion matrix from the test set. 202 | > confusionMatrix(data = creditResults$pred, 203 | + reference = creditResults$obs) 204 | Confusion Matrix and Statistics 205 | 206 | Reference 207 | Prediction Bad Good 208 | Bad 24 10 209 | Good 36 130 210 | 211 | Accuracy : 0.77 212 | 95% CI : (0.7054, 0.8264) 213 | No Information Rate : 0.7 214 | P-Value [Acc > NIR] : 0.0168694 215 | 216 | Kappa : 0.375 217 | Mcnemar's Test P-Value : 0.0002278 218 | 219 | Sensitivity : 0.4000 220 | Specificity : 0.9286 221 | Pos Pred Value : 0.7059 222 | Neg Pred Value : 0.7831 223 | Prevalence : 0.3000 224 | Detection Rate : 0.1200 225 | Detection Prevalence : 0.1700 226 | Balanced Accuracy : 0.6643 227 | 228 | 'Positive' Class : Bad 229 | 230 | > 231 | > ### ROC curves: 232 | > 233 | > ### Like glm(), roc() treats the last level of the factor as the event 234 | > ### of interest so we use relevel() to change the observed class data 235 | > 236 | > library(pROC) 237 | Loading required package: plyr 238 | Type 'citation("pROC")' for a citation. 239 | 240 | Attaching package: ‘pROC’ 241 | 242 | The following object is masked from ‘package:stats’: 243 | 244 | cov, smooth, var 245 | 246 | > creditROC <- roc(relevel(creditResults$obs, "Good"), creditResults$prob) 247 | > 248 | > coords(creditROC, "all")[,1:3] 249 | all all all 250 | threshold -Inf 0.006199758 0.009708574 251 | specificity 0 0.007142857 0.014285714 252 | sensitivity 1 1.000000000 1.000000000 253 | > 254 | > auc(creditROC) 255 | Area under the curve: 0.775 256 | > ci.auc(creditROC) 257 | 95% CI: 0.7032-0.8468 (DeLong) 258 | > 259 | > ### Note the x-axis is reversed 260 | > plot(creditROC) 261 | 262 | Call: 263 | roc.default(response = relevel(creditResults$obs, "Good"), predictor = creditResults$prob) 264 | 265 | Data: creditResults$prob in 140 controls (relevel(creditResults$obs, "Good") Good) < 60 cases (relevel(creditResults$obs, "Good") Bad). 266 | Area under the curve: 0.775 267 | > 268 | > ### Old-school: 269 | > plot(creditROC, legacy.axes = TRUE) 270 | 271 | Call: 272 | roc.default(response = relevel(creditResults$obs, "Good"), predictor = creditResults$prob) 273 | 274 | Data: creditResults$prob in 140 controls (relevel(creditResults$obs, "Good") Good) < 60 cases (relevel(creditResults$obs, "Good") Bad). 
275 | Area under the curve: 0.775 276 | > 277 | > ### Lift charts 278 | > 279 | > creditLift <- lift(obs ~ prob, data = creditResults) 280 | > xyplot(creditLift) 281 | > 282 | > 283 | > ################################################################################ 284 | > ### Session Information 285 | > 286 | > sessionInfo() 287 | R version 3.0.1 (2013-05-16) 288 | Platform: x86_64-apple-darwin10.8.0 (64-bit) 289 | 290 | locale: 291 | [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8 292 | 293 | attached base packages: 294 | [1] stats graphics grDevices utils datasets methods base 295 | 296 | other attached packages: 297 | [1] pROC_1.5.4 plyr_1.8 298 | [3] e1071_1.6-1 class_7.3-7 299 | [5] klaR_0.6-7 caret_6.0-22 300 | [7] ggplot2_0.9.3.1 lattice_0.20-15 301 | [9] randomForest_4.6-7 MASS_7.3-26 302 | [11] AppliedPredictiveModeling_1.1-5 303 | 304 | loaded via a namespace (and not attached): 305 | [1] car_2.0-16 codetools_0.2-8 colorspace_1.2-1 compiler_3.0.1 306 | [5] CORElearn_0.9.41 dichromat_2.0-0 digest_0.6.3 foreach_1.4.0 307 | [9] grid_3.0.1 gtable_0.1.2 iterators_1.0.6 labeling_0.1 308 | [13] munsell_0.4 proto_0.3-10 RColorBrewer_1.0-5 reshape2_1.2.2 309 | [17] scales_0.2.3 stringr_0.6.2 tools_3.0.1 310 | > 311 | > q("no") 312 | > proc.time() 313 | user system elapsed 314 | 11.120 0.526 11.698 315 | -------------------------------------------------------------------------------- /inst/chapters/12_Discriminant_Analysis.R: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | ### R code from Applied Predictive Modeling (2013) by Kuhn and Johnson. 3 | ### Copyright 2013 Kuhn and Johnson 4 | ### Web Page: http://www.appliedpredictivemodeling.com 5 | ### Contact: Max Kuhn (mxkuhn@gmail.com) 6 | ### 7 | ### Chapter 12 Discriminant Analysis and Other Linear Classification Models 8 | ### 9 | ### Required packages: AppliedPredictiveModeling, caret, doMC (optional), 10 | ### glmnet, lattice, MASS, pamr, pls, pROC, sparseLDA 11 | ### 12 | ### Data used: The grant application data. See the file 'CreateGrantData.R' 13 | ### 14 | ### Notes: 15 | ### 1) This code is provided without warranty. 16 | ### 17 | ### 2) This code should help the user reproduce the results in the 18 | ### text. There will be differences between this code and what is is 19 | ### the computing section. For example, the computing sections show 20 | ### how the source functions work (e.g. randomForest() or plsr()), 21 | ### which were not directly used when creating the book. Also, there may be 22 | ### syntax differences that occur over time as packages evolve. These files 23 | ### will reflect those changes. 24 | ### 25 | ### 3) In some cases, the calculations in the book were run in 26 | ### parallel. The sub-processes may reset the random number seed. 27 | ### Your results may slightly vary. 28 | ### 29 | ################################################################################ 30 | 31 | ################################################################################ 32 | ### Section 12.1 Case Study: Predicting Successful Grant Applications 33 | 34 | load("grantData.RData") 35 | 36 | library(caret) 37 | library(doMC) 38 | registerDoMC(12) 39 | library(plyr) 40 | library(reshape2) 41 | 42 | ## Look at two different ways to split and resample the data. A support vector 43 | ## machine is used to illustrate the differences. The full set of predictors 44 | ## is used. 
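### A minimal, self-contained sketch of the two resampling schemes being
### compared (illustrative only: it uses caret's built-in iris data rather
### than the grant data, and the object names below are arbitrary). The key
### difference is the 'index' argument of trainControl(): with "LGOCV" and a
### single set of row numbers, those rows are used for fitting and the
### remaining rows form one fixed hold-out set.

set.seed(1)
illusRows <- createDataPartition(iris$Species, p = .8)[[1]]
illusCV   <- trainControl(method = "cv", number = 10)
illusHold <- trainControl(method = "LGOCV",
                          index = list(TrainSet = illusRows))
illusFit1 <- train(Species ~ ., data = iris, method = "lda", trControl = illusCV)
illusFit2 <- train(Species ~ ., data = iris, method = "lda", trControl = illusHold)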
45 | 46 | pre2008Data <- training[pre2008,] 47 | year2008Data <- rbind(training[-pre2008,], testing) 48 | 49 | set.seed(552) 50 | test2008 <- createDataPartition(year2008Data$Class, p = .25)[[1]] 51 | 52 | allData <- rbind(pre2008Data, year2008Data[-test2008,]) 53 | holdout2008 <- year2008Data[test2008,] 54 | 55 | ## Use a common tuning grid for both approaches. 56 | svmrGrid <- expand.grid(sigma = c(.00007, .00009, .0001, .0002), 57 | C = 2^(-3:8)) 58 | 59 | ## Evaluate the model using overall 10-fold cross-validation 60 | ctrl0 <- trainControl(method = "cv", 61 | summaryFunction = twoClassSummary, 62 | classProbs = TRUE) 63 | set.seed(477) 64 | svmFit0 <- train(pre2008Data[,fullSet], pre2008Data$Class, 65 | method = "svmRadial", 66 | tuneGrid = svmrGrid, 67 | preProc = c("center", "scale"), 68 | metric = "ROC", 69 | trControl = ctrl0) 70 | svmFit0 71 | 72 | ### Now fit the single 2008 test set 73 | ctrl00 <- trainControl(method = "LGOCV", 74 | summaryFunction = twoClassSummary, 75 | classProbs = TRUE, 76 | index = list(TestSet = 1:nrow(pre2008Data))) 77 | 78 | 79 | set.seed(476) 80 | svmFit00 <- train(allData[,fullSet], allData$Class, 81 | method = "svmRadial", 82 | tuneGrid = svmrGrid, 83 | preProc = c("center", "scale"), 84 | metric = "ROC", 85 | trControl = ctrl00) 86 | svmFit00 87 | 88 | ## Combine the two sets of results and plot 89 | 90 | grid0 <- subset(svmFit0$results, sigma == svmFit0$bestTune$sigma) 91 | grid0$Model <- "10-Fold Cross-Validation" 92 | 93 | grid00 <- subset(svmFit00$results, sigma == svmFit00$bestTune$sigma) 94 | grid00$Model <- "Single 2008 Test Set" 95 | 96 | plotData <- rbind(grid00, grid0) 97 | 98 | plotData <- plotData[!is.na(plotData$ROC),] 99 | xyplot(ROC ~ C, data = plotData, 100 | groups = Model, 101 | type = c("g", "o"), 102 | scales = list(x = list(log = 2)), 103 | auto.key = list(columns = 1)) 104 | 105 | ################################################################################ 106 | ### Section 12.2 Logistic Regression 107 | 108 | modelFit <- glm(Class ~ Day, data = training[pre2008,], family = binomial) 109 | dataGrid <- data.frame(Day = seq(0, 365, length = 500)) 110 | dataGrid$Linear <- 1 - predict(modelFit, dataGrid, type = "response") 111 | linear2008 <- auc(roc(response = training[-pre2008, "Class"], 112 | predictor = 1 - predict(modelFit, 113 | training[-pre2008,], 114 | type = "response"), 115 | levels = rev(levels(training[-pre2008, "Class"])))) 116 | 117 | 118 | modelFit2 <- glm(Class ~ Day + I(Day^2), 119 | data = training[pre2008,], 120 | family = binomial) 121 | dataGrid$Quadratic <- 1 - predict(modelFit2, dataGrid, type = "response") 122 | quad2008 <- auc(roc(response = training[-pre2008, "Class"], 123 | predictor = 1 - predict(modelFit2, 124 | training[-pre2008,], 125 | type = "response"), 126 | levels = rev(levels(training[-pre2008, "Class"])))) 127 | 128 | dataGrid <- plyr::melt(dataGrid, id.vars = "Day") 129 | 130 | byDay <- training[pre2008, c("Day", "Class")] 131 | byDay$Binned <- cut(byDay$Day, seq(0, 360, by = 5)) 132 | 133 | observedProps <- ddply(byDay, .(Binned), 134 | function(x) c(n = nrow(x), mean = mean(x$Class == "successful"))) 135 | observedProps$midpoint <- seq(2.5, 357.5, by = 5) 136 | 137 | xyplot(value ~ Day|variable, data = dataGrid, 138 | ylab = "Probability of A Successful Grant", 139 | ylim = extendrange(0:1), 140 | between = list(x = 1), 141 | panel = function(...) 
142 | { 143 | panel.xyplot(x = observedProps$midpoint, observedProps$mean, 144 | pch = 16., col = rgb(.2, .2, .2, .5)) 145 | panel.xyplot(..., type = "l", col = "black", lwd = 2) 146 | }) 147 | 148 | ## For the reduced set of factors, fit the logistic regression model (linear and 149 | ## quadratic) and evaluate on the 150 | training$Day2 <- training$Day^2 151 | testing$Day2 <- testing$Day^2 152 | fullSet <- c(fullSet, "Day2") 153 | reducedSet <- c(reducedSet, "Day2") 154 | 155 | ## This control object will be used across multiple models so that the 156 | ## data splitting is consistent 157 | 158 | ctrl <- trainControl(method = "LGOCV", 159 | summaryFunction = twoClassSummary, 160 | classProbs = TRUE, 161 | index = list(TrainSet = pre2008), 162 | savePredictions = TRUE) 163 | 164 | set.seed(476) 165 | lrFit <- train(x = training[,reducedSet], 166 | y = training$Class, 167 | method = "glm", 168 | metric = "ROC", 169 | trControl = ctrl) 170 | lrFit 171 | set.seed(476) 172 | lrFit2 <- train(x = training[,c(fullSet, "Day2")], 173 | y = training$Class, 174 | method = "glm", 175 | metric = "ROC", 176 | trControl = ctrl) 177 | lrFit2 178 | 179 | lrFit$pred <- merge(lrFit$pred, lrFit$bestTune) 180 | 181 | ## Get the confusion matrices for the hold-out set 182 | lrCM <- confusionMatrix(lrFit, norm = "none") 183 | lrCM 184 | lrCM2 <- confusionMatrix(lrFit2, norm = "none") 185 | lrCM2 186 | 187 | ## Get the area under the ROC curve for the hold-out set 188 | lrRoc <- roc(response = lrFit$pred$obs, 189 | predictor = lrFit$pred$successful, 190 | levels = rev(levels(lrFit$pred$obs))) 191 | lrRoc2 <- roc(response = lrFit2$pred$obs, 192 | predictor = lrFit2$pred$successful, 193 | levels = rev(levels(lrFit2$pred$obs))) 194 | lrImp <- varImp(lrFit, scale = FALSE) 195 | 196 | plot(lrRoc, legacy.axes = TRUE) 197 | 198 | ################################################################################ 199 | ### Section 12.3 Linear Discriminant Analysis 200 | 201 | ## Fit the model to the reduced set 202 | set.seed(476) 203 | ldaFit <- train(x = training[,reducedSet], 204 | y = training$Class, 205 | method = "lda", 206 | preProc = c("center","scale"), 207 | metric = "ROC", 208 | trControl = ctrl) 209 | ldaFit 210 | 211 | ldaFit$pred <- merge(ldaFit$pred, ldaFit$bestTune) 212 | ldaCM <- confusionMatrix(ldaFit, norm = "none") 213 | ldaCM 214 | ldaRoc <- roc(response = ldaFit$pred$obs, 215 | predictor = ldaFit$pred$successful, 216 | levels = rev(levels(ldaFit$pred$obs))) 217 | plot(lrRoc, type = "s", col = rgb(.2, .2, .2, .2), legacy.axes = TRUE) 218 | plot(ldaRoc, add = TRUE, type = "s", legacy.axes = TRUE) 219 | 220 | ################################################################################ 221 | ### Section 12.4 Partial Least Squares Discriminant Analysis 222 | 223 | ## This model uses all of the predictors 224 | set.seed(476) 225 | plsFit <- train(x = training[,fullSet], 226 | y = training$Class, 227 | method = "pls", 228 | tuneGrid = expand.grid(ncomp = 1:10), 229 | preProc = c("center","scale"), 230 | metric = "ROC", 231 | probMethod = "Bayes", 232 | trControl = ctrl) 233 | plsFit 234 | 235 | plsImpGrant <- varImp(plsFit, scale = FALSE) 236 | 237 | bestPlsNcomp <- plsFit$results[best(plsFit$results, "ROC", maximize = TRUE), "ncomp"] 238 | bestPlsROC <- plsFit$results[best(plsFit$results, "ROC", maximize = TRUE), "ROC"] 239 | 240 | ## Only keep the final tuning parameter data 241 | plsFit$pred <- merge(plsFit$pred, plsFit$bestTune) 242 | 243 | plsRoc <- roc(response = plsFit$pred$obs, 244 | predictor = 
plsFit$pred$successful, 245 | levels = rev(levels(plsFit$pred$obs))) 246 | 247 | ### PLS confusion matrix information 248 | plsCM <- confusionMatrix(plsFit, norm = "none") 249 | plsCM 250 | 251 | ## Now fit a model that uses a smaller set of predictors chosen by unsupervised 252 | ## filtering. 253 | 254 | set.seed(476) 255 | plsFit2 <- train(x = training[,reducedSet], 256 | y = training$Class, 257 | method = "pls", 258 | tuneGrid = expand.grid(ncomp = 1:10), 259 | preProc = c("center","scale"), 260 | metric = "ROC", 261 | probMethod = "Bayes", 262 | trControl = ctrl) 263 | plsFit2 264 | 265 | bestPlsNcomp2 <- plsFit2$results[best(plsFit2$results, "ROC", maximize = TRUE), "ncomp"] 266 | bestPlsROC2 <- plsFit2$results[best(plsFit2$results, "ROC", maximize = TRUE), "ROC"] 267 | 268 | plsFit2$pred <- merge(plsFit2$pred, plsFit2$bestTune) 269 | 270 | plsRoc2 <- roc(response = plsFit2$pred$obs, 271 | predictor = plsFit2$pred$successful, 272 | levels = rev(levels(plsFit2$pred$obs))) 273 | plsCM2 <- confusionMatrix(plsFit2, norm = "none") 274 | plsCM2 275 | 276 | pls.ROC <- cbind(plsFit$results,Descriptors="Full Set") 277 | pls2.ROC <- cbind(plsFit2$results,Descriptors="Reduced Set") 278 | 279 | plsCompareROC <- data.frame(rbind(pls.ROC,pls2.ROC)) 280 | 281 | xyplot(ROC ~ ncomp, 282 | data = plsCompareROC, 283 | xlab = "# Components", 284 | ylab = "ROC (2008 Hold-Out Data)", 285 | auto.key = list(columns = 2), 286 | groups = Descriptors, 287 | type = c("o", "g")) 288 | 289 | ## Plot ROC curves and variable importance scores 290 | plot(ldaRoc, type = "s", col = rgb(.2, .2, .2, .2), legacy.axes = TRUE) 291 | plot(lrRoc, type = "s", col = rgb(.2, .2, .2, .2), add = TRUE, legacy.axes = TRUE) 292 | plot(plsRoc2, type = "s", add = TRUE, legacy.axes = TRUE) 293 | 294 | plot(plsImpGrant, top=20, scales = list(y = list(cex = .95))) 295 | 296 | ################################################################################ 297 | ### Section 12.5 Penalized Models 298 | 299 | ## The glmnet model 300 | glmnGrid <- expand.grid(alpha = c(0, .1, .2, .4, .6, .8, 1), 301 | lambda = seq(.01, .2, length = 40)) 302 | set.seed(476) 303 | glmnFit <- train(x = training[,fullSet], 304 | y = training$Class, 305 | method = "glmnet", 306 | tuneGrid = glmnGrid, 307 | preProc = c("center", "scale"), 308 | metric = "ROC", 309 | trControl = ctrl) 310 | glmnFit 311 | 312 | glmnet2008 <- merge(glmnFit$pred, glmnFit$bestTune) 313 | glmnetCM <- confusionMatrix(glmnFit, norm = "none") 314 | glmnetCM 315 | 316 | glmnetRoc <- roc(response = glmnet2008$obs, 317 | predictor = glmnet2008$successful, 318 | levels = rev(levels(glmnet2008$obs))) 319 | 320 | glmnFit0 <- glmnFit 321 | glmnFit0$results$lambda <- format(round(glmnFit0$results$lambda, 3)) 322 | 323 | glmnPlot <- plot(glmnFit0, 324 | plotType = "level", 325 | cuts = 15, 326 | scales = list(x = list(rot = 90, cex = .65))) 327 | 328 | update(glmnPlot, 329 | ylab = "Mixing Percentage\nRidge <---------> Lasso", 330 | sub = "", 331 | main = "Area Under the ROC Curve", 332 | xlab = "Amount of Regularization") 333 | 334 | plot(plsRoc2, type = "s", col = rgb(.2, .2, .2, .2), legacy.axes = TRUE) 335 | plot(ldaRoc, type = "s", add = TRUE, col = rgb(.2, .2, .2, .2), legacy.axes = TRUE) 336 | plot(lrRoc, type = "s", col = rgb(.2, .2, .2, .2), add = TRUE, legacy.axes = TRUE) 337 | plot(glmnetRoc, type = "s", add = TRUE, legacy.axes = TRUE) 338 | 339 | ## Sparse logistic regression 340 | 341 | set.seed(476) 342 | spLDAFit <- train(x = training[,fullSet], 343 | y = training$Class, 344 | 
"sparseLDA", 345 | tuneGrid = expand.grid(lambda = c(.1), 346 | NumVars = c(1:20, 50, 75, 100, 250, 500, 750, 1000)), 347 | preProc = c("center", "scale"), 348 | metric = "ROC", 349 | trControl = ctrl) 350 | spLDAFit 351 | 352 | spLDA2008 <- merge(spLDAFit$pred, spLDAFit$bestTune) 353 | spLDACM <- confusionMatrix(spLDAFit, norm = "none") 354 | spLDACM 355 | 356 | spLDARoc <- roc(response = spLDA2008$obs, 357 | predictor = spLDA2008$successful, 358 | levels = rev(levels(spLDA2008$obs))) 359 | 360 | update(plot(spLDAFit, scales = list(x = list(log = 10))), 361 | ylab = "ROC AUC (2008 Hold-Out Data)") 362 | 363 | plot(plsRoc2, type = "s", col = rgb(.2, .2, .2, .2), legacy.axes = TRUE) 364 | plot(glmnetRoc, type = "s", add = TRUE, col = rgb(.2, .2, .2, .2), legacy.axes = TRUE) 365 | plot(ldaRoc, type = "s", add = TRUE, col = rgb(.2, .2, .2, .2), legacy.axes = TRUE) 366 | plot(lrRoc, type = "s", col = rgb(.2, .2, .2, .2), add = TRUE, legacy.axes = TRUE) 367 | plot(spLDARoc, type = "s", add = TRUE, legacy.axes = TRUE) 368 | 369 | ################################################################################ 370 | ### Section 12.6 Nearest Shrunken Centroids 371 | 372 | set.seed(476) 373 | nscFit <- train(x = training[,fullSet], 374 | y = training$Class, 375 | method = "pam", 376 | preProc = c("center", "scale"), 377 | tuneGrid = data.frame(threshold = seq(0, 25, length = 30)), 378 | metric = "ROC", 379 | trControl = ctrl) 380 | nscFit 381 | 382 | nsc2008 <- merge(nscFit$pred, nscFit$bestTune) 383 | nscCM <- confusionMatrix(nscFit, norm = "none") 384 | nscCM 385 | nscRoc <- roc(response = nsc2008$obs, 386 | predictor = nsc2008$successful, 387 | levels = rev(levels(nsc2008$obs))) 388 | update(plot(nscFit), ylab = "ROC AUC (2008 Hold-Out Data)") 389 | 390 | 391 | plot(plsRoc2, type = "s", col = rgb(.2, .2, .2, .2), legacy.axes = TRUE) 392 | plot(glmnetRoc, type = "s", add = TRUE, col = rgb(.2, .2, .2, .2), legacy.axes = TRUE) 393 | plot(ldaRoc, type = "s", add = TRUE, col = rgb(.2, .2, .2, .2), legacy.axes = TRUE) 394 | plot(lrRoc, type = "s", col = rgb(.2, .2, .2, .2), add = TRUE, legacy.axes = TRUE) 395 | plot(spLDARoc, type = "s", col = rgb(.2, .2, .2, .2), add = TRUE, legacy.axes = TRUE) 396 | plot(nscRoc, type = "s", add = TRUE, legacy.axes = TRUE) 397 | 398 | sessionInfo() 399 | 400 | q("no") 401 | 402 | -------------------------------------------------------------------------------- /inst/chapters/13_Non-Linear_Class.R: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | ### R code from Applied Predictive Modeling (2013) by Kuhn and Johnson. 3 | ### Copyright 2013 Kuhn and Johnson 4 | ### Web Page: http://www.appliedpredictivemodeling.com 5 | ### Contact: Max Kuhn (mxkuhn@gmail.com) 6 | ### 7 | ### Chapter 13 Non-Linear Classification Models 8 | ### 9 | ### Required packages: AppliedPredictiveModeling, caret, doMC (optional) 10 | ### kernlab, klaR, lattice, latticeExtra, MASS, mda, nnet, 11 | ### pROC 12 | ### 13 | ### Data used: The grant application data. See the file 'CreateGrantData.R' 14 | ### 15 | ### Notes: 16 | ### 1) This code is provided without warranty. 17 | ### 18 | ### 2) This code should help the user reproduce the results in the 19 | ### text. There will be differences between this code and what is is 20 | ### the computing section. For example, the computing sections show 21 | ### how the source functions work (e.g. 
randomForest() or plsr()), 22 | ### which were not directly used when creating the book. Also, there may be 23 | ### syntax differences that occur over time as packages evolve. These files 24 | ### will reflect those changes. 25 | ### 26 | ### 3) In some cases, the calculations in the book were run in 27 | ### parallel. The sub-processes may reset the random number seed. 28 | ### Your results may slightly vary. 29 | ### 30 | ################################################################################ 31 | 32 | ################################################################################ 33 | ### Section 13.1 Nonlinear Discriminant Analysis 34 | 35 | 36 | load("grantData.RData") 37 | 38 | library(caret) 39 | 40 | ### Optional: parallel processing can be used via the 'do' packages, 41 | ### such as doMC, doMPI etc. We used doMC (not on Windows) to speed 42 | ### up the computations. 43 | 44 | ### WARNING: Be aware of how much memory is needed to parallel 45 | ### process. It can very quickly overwhelm the available hardware. We 46 | ### estimate the memory usage (VSIZE = total memory size) to be 47 | ### 2700M/core. 48 | 49 | library(doMC) 50 | registerDoMC(12) 51 | 52 | ## This control object will be used across multiple models so that the 53 | ## data splitting is consistent 54 | 55 | ctrl <- trainControl(method = "LGOCV", 56 | summaryFunction = twoClassSummary, 57 | classProbs = TRUE, 58 | index = list(TrainSet = pre2008), 59 | savePredictions = TRUE) 60 | 61 | set.seed(476) 62 | mdaFit <- train(x = training[,reducedSet], 63 | y = training$Class, 64 | method = "mda", 65 | metric = "ROC", 66 | tries = 40, 67 | tuneGrid = expand.grid(subclasses = 1:8), 68 | trControl = ctrl) 69 | mdaFit 70 | 71 | mdaFit$results <- mdaFit$results[!is.na(mdaFit$results$ROC),] 72 | mdaFit$pred <- merge(mdaFit$pred, mdaFit$bestTune) 73 | mdaCM <- confusionMatrix(mdaFit, norm = "none") 74 | mdaCM 75 | 76 | mdaRoc <- roc(response = mdaFit$pred$obs, 77 | predictor = mdaFit$pred$successful, 78 | levels = rev(levels(mdaFit$pred$obs))) 79 | mdaRoc 80 | 81 | update(plot(mdaFit, 82 | ylab = "ROC AUC (2008 Hold-Out Data)")) 83 | 84 | ################################################################################ 85 | ### Section 13.2 Neural Networks 86 | 87 | nnetGrid <- expand.grid(size = 1:10, decay = c(0, .1, 1, 2)) 88 | maxSize <- max(nnetGrid$size) 89 | 90 | 91 | ## Four different models are evaluate based on the data pre-processing and 92 | ## whethera single or multiple models are used 93 | 94 | set.seed(476) 95 | nnetFit <- train(x = training[,reducedSet], 96 | y = training$Class, 97 | method = "nnet", 98 | metric = "ROC", 99 | preProc = c("center", "scale"), 100 | tuneGrid = nnetGrid, 101 | trace = FALSE, 102 | maxit = 2000, 103 | MaxNWts = 1*(maxSize * (length(reducedSet) + 1) + maxSize + 1), 104 | trControl = ctrl) 105 | nnetFit 106 | 107 | set.seed(476) 108 | nnetFit2 <- train(x = training[,reducedSet], 109 | y = training$Class, 110 | method = "nnet", 111 | metric = "ROC", 112 | preProc = c("center", "scale", "spatialSign"), 113 | tuneGrid = nnetGrid, 114 | trace = FALSE, 115 | maxit = 2000, 116 | MaxNWts = 1*(maxSize * (length(reducedSet) + 1) + maxSize + 1), 117 | trControl = ctrl) 118 | nnetFit2 119 | 120 | nnetGrid$bag <- FALSE 121 | 122 | set.seed(476) 123 | nnetFit3 <- train(x = training[,reducedSet], 124 | y = training$Class, 125 | method = "avNNet", 126 | metric = "ROC", 127 | preProc = c("center", "scale"), 128 | tuneGrid = nnetGrid, 129 | repeats = 10, 130 | trace = FALSE, 131 | maxit = 
2000, 132 | MaxNWts = 10*(maxSize * (length(reducedSet) + 1) + maxSize + 1), 133 | allowParallel = FALSE, ## this will cause to many workers to be launched. 134 | trControl = ctrl) 135 | nnetFit3 136 | 137 | set.seed(476) 138 | nnetFit4 <- train(x = training[,reducedSet], 139 | y = training$Class, 140 | method = "avNNet", 141 | metric = "ROC", 142 | preProc = c("center", "scale", "spatialSign"), 143 | tuneGrid = nnetGrid, 144 | trace = FALSE, 145 | maxit = 2000, 146 | repeats = 10, 147 | MaxNWts = 10*(maxSize * (length(reducedSet) + 1) + maxSize + 1), 148 | allowParallel = FALSE, 149 | trControl = ctrl) 150 | nnetFit4 151 | 152 | nnetFit4$pred <- merge(nnetFit4$pred, nnetFit4$bestTune) 153 | nnetCM <- confusionMatrix(nnetFit4, norm = "none") 154 | nnetCM 155 | 156 | nnetRoc <- roc(response = nnetFit4$pred$obs, 157 | predictor = nnetFit4$pred$successful, 158 | levels = rev(levels(nnetFit4$pred$obs))) 159 | 160 | 161 | nnet1 <- nnetFit$results 162 | nnet1$Transform <- "No Transformation" 163 | nnet1$Model <- "Single Model" 164 | 165 | nnet2 <- nnetFit2$results 166 | nnet2$Transform <- "Spatial Sign" 167 | nnet2$Model <- "Single Model" 168 | 169 | nnet3 <- nnetFit3$results 170 | nnet3$Transform <- "No Transformation" 171 | nnet3$Model <- "Model Averaging" 172 | nnet3$bag <- NULL 173 | 174 | nnet4 <- nnetFit4$results 175 | nnet4$Transform <- "Spatial Sign" 176 | nnet4$Model <- "Model Averaging" 177 | nnet4$bag <- NULL 178 | 179 | nnetResults <- rbind(nnet1, nnet2, nnet3, nnet4) 180 | nnetResults$Model <- factor(as.character(nnetResults$Model), 181 | levels = c("Single Model", "Model Averaging")) 182 | library(latticeExtra) 183 | useOuterStrips( 184 | xyplot(ROC ~ size|Model*Transform, 185 | data = nnetResults, 186 | groups = decay, 187 | as.table = TRUE, 188 | type = c("p", "l", "g"), 189 | lty = 1, 190 | ylab = "ROC AUC (2008 Hold-Out Data)", 191 | xlab = "Number of Hidden Units", 192 | auto.key = list(columns = 4, 193 | title = "Weight Decay", 194 | cex.title = 1))) 195 | 196 | plot(nnetRoc, type = "s", legacy.axes = TRUE) 197 | 198 | ################################################################################ 199 | ### Section 13.3 Flexible Discriminant Analysis 200 | 201 | set.seed(476) 202 | fdaFit <- train(x = training[,reducedSet], 203 | y = training$Class, 204 | method = "fda", 205 | metric = "ROC", 206 | tuneGrid = expand.grid(degree = 1, nprune = 2:25), 207 | trControl = ctrl) 208 | fdaFit 209 | 210 | fdaFit$pred <- merge(fdaFit$pred, fdaFit$bestTune) 211 | fdaCM <- confusionMatrix(fdaFit, norm = "none") 212 | fdaCM 213 | 214 | fdaRoc <- roc(response = fdaFit$pred$obs, 215 | predictor = fdaFit$pred$successful, 216 | levels = rev(levels(fdaFit$pred$obs))) 217 | 218 | update(plot(fdaFit), ylab = "ROC AUC (2008 Hold-Out Data)") 219 | 220 | plot(nnetRoc, type = "s", col = rgb(.2, .2, .2, .2), legacy.axes = TRUE) 221 | plot(fdaRoc, type = "s", add = TRUE, legacy.axes = TRUE) 222 | 223 | 224 | ################################################################################ 225 | ### Section 13.4 Support Vector Machines 226 | 227 | library(kernlab) 228 | 229 | set.seed(201) 230 | sigmaRangeFull <- sigest(as.matrix(training[,fullSet])) 231 | svmRGridFull <- expand.grid(sigma = as.vector(sigmaRangeFull)[1], 232 | C = 2^(-3:4)) 233 | set.seed(476) 234 | svmRFitFull <- train(x = training[,fullSet], 235 | y = training$Class, 236 | method = "svmRadial", 237 | metric = "ROC", 238 | preProc = c("center", "scale"), 239 | tuneGrid = svmRGridFull, 240 | trControl = ctrl) 241 | svmRFitFull 242 | 
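### The sigest() call above suggests plausible values for sigma, the scale
### parameter of kernlab's radial basis function kernel,
###   k(x, x') = exp(-sigma * ||x - x'||^2),
### while the cost C is tuned over a log-2 grid. A tiny hand-rolled sketch of
### the kernel (illustrative only; rbfIllus is not part of the original code)
### shows that larger sigma values make similarity decay faster with distance,
### so each support vector influences a smaller neighborhood:

rbfIllus <- function(u, v, sigma) exp(-sigma * sum((u - v)^2))
## kernel value between two fixed points for increasing sigma
sapply(c(1e-4, 1e-2, 1), rbfIllus, u = c(0, 0), v = c(1, 1))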
243 | set.seed(202) 244 | sigmaRangeReduced <- sigest(as.matrix(training[,reducedSet])) 245 | svmRGridReduced <- expand.grid(sigma = sigmaRangeReduced[1], 246 | C = 2^(seq(-4, 4))) 247 | set.seed(476) 248 | svmRFitReduced <- train(x = training[,reducedSet], 249 | y = training$Class, 250 | method = "svmRadial", 251 | metric = "ROC", 252 | preProc = c("center", "scale"), 253 | tuneGrid = svmRGridReduced, 254 | trControl = ctrl) 255 | svmRFitReduced 256 | 257 | svmPGrid <- expand.grid(degree = 1:2, 258 | scale = c(0.01, .005), 259 | C = 2^(seq(-6, -2, length = 10))) 260 | 261 | set.seed(476) 262 | svmPFitFull <- train(x = training[,fullSet], 263 | y = training$Class, 264 | method = "svmPoly", 265 | metric = "ROC", 266 | preProc = c("center", "scale"), 267 | tuneGrid = svmPGrid, 268 | trControl = ctrl) 269 | svmPFitFull 270 | 271 | svmPGrid2 <- expand.grid(degree = 1:2, 272 | scale = c(0.01, .005), 273 | C = 2^(seq(-6, -2, length = 10))) 274 | set.seed(476) 275 | svmPFitReduced <- train(x = training[,reducedSet], 276 | y = training$Class, 277 | method = "svmPoly", 278 | metric = "ROC", 279 | preProc = c("center", "scale"), 280 | tuneGrid = svmPGrid2, 281 | fit = FALSE, 282 | trControl = ctrl) 283 | svmPFitReduced 284 | 285 | svmPFitReduced$pred <- merge(svmPFitReduced$pred, svmPFitReduced$bestTune) 286 | svmPCM <- confusionMatrix(svmPFitReduced, norm = "none") 287 | svmPRoc <- roc(response = svmPFitReduced$pred$obs, 288 | predictor = svmPFitReduced$pred$successful, 289 | levels = rev(levels(svmPFitReduced$pred$obs))) 290 | 291 | 292 | svmRadialResults <- rbind(svmRFitReduced$results, 293 | svmRFitFull$results) 294 | svmRadialResults$Set <- c(rep("Reduced Set", nrow(svmRFitReduced$result)), 295 | rep("Full Set", nrow(svmRFitFull$result))) 296 | svmRadialResults$Sigma <- paste("sigma = ", 297 | format(svmRadialResults$sigma, 298 | scientific = FALSE, digits= 5)) 299 | svmRadialResults <- svmRadialResults[!is.na(svmRadialResults$ROC),] 300 | xyplot(ROC ~ C|Set, data = svmRadialResults, 301 | groups = Sigma, type = c("g", "o"), 302 | xlab = "Cost", 303 | ylab = "ROC (2008 Hold-Out Data)", 304 | auto.key = list(columns = 2), 305 | scales = list(x = list(log = 2))) 306 | 307 | svmPolyResults <- rbind(svmPFitReduced$results, 308 | svmPFitFull$results) 309 | svmPolyResults$Set <- c(rep("Reduced Set", nrow(svmPFitReduced$result)), 310 | rep("Full Set", nrow(svmPFitFull$result))) 311 | svmPolyResults <- svmPolyResults[!is.na(svmPolyResults$ROC),] 312 | svmPolyResults$scale <- paste("scale = ", 313 | format(svmPolyResults$scale, 314 | scientific = FALSE)) 315 | svmPolyResults$Degree <- "Linear" 316 | svmPolyResults$Degree[svmPolyResults$degree == 2] <- "Quadratic" 317 | useOuterStrips(xyplot(ROC ~ C|Degree*Set, data = svmPolyResults, 318 | groups = scale, type = c("g", "o"), 319 | xlab = "Cost", 320 | ylab = "ROC (2008 Hold-Out Data)", 321 | auto.key = list(columns = 2), 322 | scales = list(x = list(log = 2)))) 323 | 324 | plot(nnetRoc, type = "s", col = rgb(.2, .2, .2, .2), legacy.axes = TRUE) 325 | plot(fdaRoc, type = "s", add = TRUE, col = rgb(.2, .2, .2, .2), legacy.axes = TRUE) 326 | plot(svmPRoc, type = "s", add = TRUE, legacy.axes = TRUE) 327 | 328 | ################################################################################ 329 | ### Section 13.5 K-Nearest Neighbors 330 | 331 | 332 | set.seed(476) 333 | knnFit <- train(x = training[,reducedSet], 334 | y = training$Class, 335 | method = "knn", 336 | metric = "ROC", 337 | preProc = c("center", "scale"), 338 | tuneGrid = data.frame(k = 
c(4*(0:5)+1,20*(1:5)+1,50*(2:9)+1)), 339 | trControl = ctrl) 340 | knnFit 341 | 342 | knnFit$pred <- merge(knnFit$pred, knnFit$bestTune) 343 | knnCM <- confusionMatrix(knnFit, norm = "none") 344 | knnCM 345 | knnRoc <- roc(response = knnFit$pred$obs, 346 | predictor = knnFit$pred$successful, 347 | levels = rev(levels(knnFit$pred$obs))) 348 | 349 | update(plot(knnFit, ylab = "ROC (2008 Hold-Out Data)")) 350 | 351 | plot(fdaRoc, type = "s", col = rgb(.2, .2, .2, .2), legacy.axes = TRUE) 352 | plot(nnetRoc, type = "s", add = TRUE, col = rgb(.2, .2, .2, .2), legacy.axes = TRUE) 353 | plot(svmPRoc, type = "s", add = TRUE, col = rgb(.2, .2, .2, .2), legacy.axes = TRUE) 354 | plot(knnRoc, type = "s", add = TRUE, legacy.axes = TRUE) 355 | 356 | ################################################################################ 357 | ### Section 13.6 Naive Bayes 358 | 359 | ## Create factor versions of some of the predictors so that they are treated 360 | ## as categories and not dummy variables 361 | 362 | factors <- c("SponsorCode", "ContractValueBand", "Month", "Weekday") 363 | nbPredictors <- factorPredictors[factorPredictors %in% reducedSet] 364 | nbPredictors <- c(nbPredictors, factors) 365 | nbPredictors <- nbPredictors[nbPredictors != "SponsorUnk"] 366 | 367 | nbTraining <- training[, c("Class", nbPredictors)] 368 | nbTesting <- testing[, c("Class", nbPredictors)] 369 | 370 | for(i in nbPredictors) 371 | { 372 | if(length(unique(training[,i])) <= 15) 373 | { 374 | nbTraining[, i] <- factor(nbTraining[,i], levels = paste(sort(unique(training[,i])))) 375 | nbTesting[, i] <- factor(nbTesting[,i], levels = paste(sort(unique(training[,i])))) 376 | } 377 | } 378 | 379 | set.seed(476) 380 | nBayesFit <- train(x = nbTraining[,nbPredictors], 381 | y = nbTraining$Class, 382 | method = "nb", 383 | metric = "ROC", 384 | tuneGrid = data.frame(usekernel = c(TRUE, FALSE), fL = 2), 385 | trControl = ctrl) 386 | nBayesFit 387 | 388 | nBayesFit$pred <- merge(nBayesFit$pred, nBayesFit$bestTune) 389 | nBayesCM <- confusionMatrix(nBayesFit, norm = "none") 390 | nBayesCM 391 | nBayesRoc <- roc(response = nBayesFit$pred$obs, 392 | predictor = nBayesFit$pred$successful, 393 | levels = rev(levels(nBayesFit$pred$obs))) 394 | nBayesRoc 395 | 396 | 397 | sessionInfo() 398 | 399 | q("no") 400 | -------------------------------------------------------------------------------- /inst/chapters/17_Job_Scheduling.R: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | ### R code from Applied Predictive Modeling (2013) by Kuhn and Johnson. 3 | ### Copyright 2013 Kuhn and Johnson 4 | ### Web Page: http://www.appliedpredictivemodeling.com 5 | ### Contact: Max Kuhn (mxkuhn@gmail.com) 6 | ### 7 | ### Chapter 17: Case Study: Job Scheduling 8 | ### 9 | ### Required packages: AppliedPredictiveModeling, C50, caret, doMC (optional), 10 | ### earth, Hmisc, ipred, tabplot, kernlab, lattice, MASS, 11 | ### mda, nnet, pls, randomForest, rpart, sparseLDA, 12 | ### 13 | ### Data used: The HPC job scheduling data in the AppliedPredictiveModeling 14 | ### package. 15 | ### 16 | ### Notes: 17 | ### 1) This code is provided without warranty. 18 | ### 19 | ### 2) This code should help the user reproduce the results in the 20 | ### text. There will be differences between this code and what is is 21 | ### the computing section. For example, the computing sections show 22 | ### how the source functions work (e.g. 
randomForest() or plsr()), 23 | ### which were not directly used when creating the book. Also, there may be 24 | ### syntax differences that occur over time as packages evolve. These files 25 | ### will reflect those changes. 26 | ### 27 | ### 3) In some cases, the calculations in the book were run in 28 | ### parallel. The sub-processes may reset the random number seed. 29 | ### Your results may slightly vary. 30 | ### 31 | ################################################################################ 32 | 33 | library(AppliedPredictiveModeling) 34 | data(schedulingData) 35 | 36 | ### Make a vector of predictor names 37 | predictors <- names(schedulingData)[!(names(schedulingData) %in% c("Class"))] 38 | 39 | ### A few summaries and plots of the data 40 | library(Hmisc) 41 | describe(schedulingData) 42 | 43 | library(tabplot) 44 | tableplot(schedulingData[, c( "Class", predictors)]) 45 | 46 | mosaicplot(table(schedulingData$Protocol, 47 | schedulingData$Class), 48 | main = "") 49 | 50 | library(lattice) 51 | xyplot(Compounds ~ InputFields|Protocol, 52 | data = schedulingData, 53 | scales = list(x = list(log = 10), y = list(log = 10)), 54 | groups = Class, 55 | xlab = "Input Fields", 56 | auto.key = list(columns = 4), 57 | aspect = 1, 58 | as.table = TRUE) 59 | 60 | 61 | ################################################################################ 62 | ### Section 17.1 Data Splitting and Model Strategy 63 | 64 | ## Split the data 65 | 66 | library(caret) 67 | set.seed(1104) 68 | inTrain <- createDataPartition(schedulingData$Class, p = .8, list = FALSE) 69 | 70 | ### There are a lot of zeros and the distribution is skewed. We add 71 | ### one so that we can log transform the data 72 | schedulingData$NumPending <- schedulingData$NumPending + 1 73 | 74 | trainData <- schedulingData[ inTrain,] 75 | testData <- schedulingData[-inTrain,] 76 | 77 | ### Create a main effects only model formula to use 78 | ### repeatedly. Another formula with nonlinear effects is created 79 | ### below. 80 | modForm <- as.formula(Class ~ Protocol + log10(Compounds) + 81 | log10(InputFields)+ log10(Iterations) + 82 | log10(NumPending) + Hour + Day) 83 | 84 | ### Create an expanded set of predictors with interactions. 85 | 86 | modForm2 <- as.formula(Class ~ (Protocol + log10(Compounds) + 87 | log10(InputFields)+ log10(Iterations) + 88 | log10(NumPending) + Hour + Day)^2) 89 | 90 | 91 | ### Some of these terms will not be estimable. For example, if there 92 | ### are no data points were a particular protocol was run on a 93 | ### particular day, the full interaction cannot be computed. 
We use 94 | ### model.matrix() to create the whole set of predictor columns, then 95 | ### remove those that are zero variance 96 | 97 | expandedTrain <- model.matrix(modForm2, data = trainData) 98 | expandedTest <- model.matrix(modForm2, data = testData) 99 | expandedTrain <- as.data.frame(expandedTrain) 100 | expandedTest <- as.data.frame(expandedTest) 101 | 102 | ### Some models have issues when there is a zero variance predictor 103 | ### within the data of a particular class, so we used caret's 104 | ### checkConditionalX() function to find the offending columns and 105 | ### remove them 106 | 107 | zv <- checkConditionalX(expandedTrain, trainData$Class) 108 | 109 | ### Keep the expanded set to use for models where we must manually add 110 | ### more complex terms (such as logistic regression) 111 | 112 | expandedTrain <- expandedTrain[,-zv] 113 | expandedTest <- expandedTest[, -zv] 114 | 115 | ### Create the cost matrix 116 | costMatrix <- ifelse(diag(4) == 1, 0, 1) 117 | costMatrix[4, 1] <- 10 118 | costMatrix[3, 1] <- 5 119 | costMatrix[4, 2] <- 5 120 | costMatrix[3, 2] <- 5 121 | rownames(costMatrix) <- colnames(costMatrix) <- levels(trainData$Class) 122 | 123 | ### Create a cost function 124 | cost <- function(pred, obs) 125 | { 126 | isNA <- is.na(pred) 127 | if(!all(isNA)) 128 | { 129 | pred <- pred[!isNA] 130 | obs <- obs[!isNA] 131 | 132 | cost <- ifelse(pred == obs, 0, 1) 133 | if(any(pred == "VF" & obs == "L")) cost[pred == "VF" & obs == "L"] <- 10 134 | if(any(pred == "F" & obs == "L")) cost[pred == "F" & obs == "L"] <- 5 135 | if(any(pred == "F" & obs == "M")) cost[pred == "F" & obs == "M"] <- 5 136 | if(any(pred == "VF" & obs == "M")) cost[pred == "VF" & obs == "M"] <- 5 137 | out <- mean(cost) 138 | } else out <- NA 139 | out 140 | } 141 | 142 | ### Make a summary function that can be used with caret's train() function 143 | costSummary <- function (data, lev = NULL, model = NULL) 144 | { 145 | if (is.character(data$obs)) data$obs <- factor(data$obs, levels = lev) 146 | c(postResample(data[, "pred"], data[, "obs"]), 147 | Cost = cost(data[, "pred"], data[, "obs"])) 148 | } 149 | 150 | ### Create a control object for the models 151 | ctrl <- trainControl(method = "repeatedcv", 152 | repeats = 5, 153 | summaryFunction = costSummary) 154 | 155 | ### Optional: parallel processing can be used via the 'do' packages, 156 | ### such as doMC, doMPI etc. We used doMC (not on Windows) to speed 157 | ### up the computations. 158 | 159 | ### WARNING: Be aware of how much memory is needed to parallel 160 | ### process. It can very quickly overwhelm the available hardware. The 161 | ### estimate of the median memory usage (VSIZE = total memory size) 162 | ### was 3300-4100M per core although some calculations require as 163 | ### much as 3400M without parallel processing.
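### A quick sanity check of the cost() function defined above, on a small set
### of hypothetical predictions (illustrative only, not real model output).
### Exact matches cost 0, most errors cost 1, and the optimistic errors singled
### out above (e.g. calling a moderate or long job fast) are penalized by 5:

obsIllus  <- factor(c("VF", "F", "M", "L"), levels = c("VF", "F", "M", "L"))
predIllus <- factor(c("VF", "F", "F", "F"), levels = c("VF", "F", "M", "L"))
cost(predIllus, obsIllus)  ## (0 + 0 + 5 + 5) / 4 = 2.5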
164 | 165 | library(doMC) 166 | registerDoMC(14) 167 | 168 | ### Fit the CART model with and without costs 169 | 170 | set.seed(857) 171 | rpFit <- train(x = trainData[, predictors], 172 | y = trainData$Class, 173 | method = "rpart", 174 | metric = "Cost", 175 | maximize = FALSE, 176 | tuneLength = 20, 177 | trControl = ctrl) 178 | rpFit 179 | 180 | set.seed(857) 181 | rpFitCost <- train(x = trainData[, predictors], 182 | y = trainData$Class, 183 | method = "rpart", 184 | metric = "Cost", 185 | maximize = FALSE, 186 | tuneLength = 20, 187 | parms =list(loss = costMatrix), 188 | trControl = ctrl) 189 | rpFitCost 190 | 191 | set.seed(857) 192 | ldaFit <- train(x = expandedTrain, 193 | y = trainData$Class, 194 | method = "lda", 195 | metric = "Cost", 196 | maximize = FALSE, 197 | trControl = ctrl) 198 | ldaFit 199 | 200 | sldaGrid <- expand.grid(NumVars = seq(2, 112, by = 5), 201 | lambda = c(0, 0.01, .1, 1, 10)) 202 | set.seed(857) 203 | sldaFit <- train(x = expandedTrain, 204 | y = trainData$Class, 205 | method = "sparseLDA", 206 | tuneGrid = sldaGrid, 207 | preProc = c("center", "scale"), 208 | metric = "Cost", 209 | maximize = FALSE, 210 | trControl = ctrl) 211 | sldaFit 212 | 213 | set.seed(857) 214 | nnetGrid <- expand.grid(decay = c(0, 0.001, 0.01, .1, .5), 215 | size = (1:10)*2 - 1) 216 | nnetFit <- train(modForm, 217 | data = trainData, 218 | method = "nnet", 219 | metric = "Cost", 220 | maximize = FALSE, 221 | tuneGrid = nnetGrid, 222 | trace = FALSE, 223 | MaxNWts = 2000, 224 | maxit = 1000, 225 | preProc = c("center", "scale"), 226 | trControl = ctrl) 227 | nnetFit 228 | 229 | set.seed(857) 230 | plsFit <- train(x = expandedTrain, 231 | y = trainData$Class, 232 | method = "pls", 233 | metric = "Cost", 234 | maximize = FALSE, 235 | tuneLength = 100, 236 | preProc = c("center", "scale"), 237 | trControl = ctrl) 238 | plsFit 239 | 240 | set.seed(857) 241 | fdaFit <- train(modForm, data = trainData, 242 | method = "fda", 243 | metric = "Cost", 244 | maximize = FALSE, 245 | tuneLength = 25, 246 | trControl = ctrl) 247 | fdaFit 248 | 249 | set.seed(857) 250 | rfFit <- train(x = trainData[, predictors], 251 | y = trainData$Class, 252 | method = "rf", 253 | metric = "Cost", 254 | maximize = FALSE, 255 | tuneLength = 10, 256 | ntree = 2000, 257 | importance = TRUE, 258 | trControl = ctrl) 259 | rfFit 260 | 261 | set.seed(857) 262 | rfFitCost <- train(x = trainData[, predictors], 263 | y = trainData$Class, 264 | method = "rf", 265 | metric = "Cost", 266 | maximize = FALSE, 267 | tuneLength = 10, 268 | ntree = 2000, 269 | classwt = c(VF = 1, F = 1, M = 5, L = 10), 270 | importance = TRUE, 271 | trControl = ctrl) 272 | rfFitCost 273 | 274 | c5Grid <- expand.grid(trials = c(1, (1:10)*10), 275 | model = "tree", 276 | winnow = c(TRUE, FALSE)) 277 | set.seed(857) 278 | c50Fit <- train(x = trainData[, predictors], 279 | y = trainData$Class, 280 | method = "C5.0", 281 | metric = "Cost", 282 | maximize = FALSE, 283 | tuneGrid = c5Grid, 284 | trControl = ctrl) 285 | c50Fit 286 | 287 | set.seed(857) 288 | c50Cost <- train(x = trainData[, predictors], 289 | y = trainData$Class, 290 | method = "C5.0", 291 | metric = "Cost", 292 | maximize = FALSE, 293 | costs = costMatrix, 294 | tuneGrid = c5Grid, 295 | trControl = ctrl) 296 | c50Cost 297 | 298 | set.seed(857) 299 | bagFit <- train(x = trainData[, predictors], 300 | y = trainData$Class, 301 | method = "treebag", 302 | metric = "Cost", 303 | maximize = FALSE, 304 | nbagg = 50, 305 | trControl = ctrl) 306 | bagFit 307 | 308 | ### Use the caret bag() 
function to bag the cost-sensitive CART model 309 | rpCost <- function(x, y) 310 | { 311 | costMatrix <- ifelse(diag(4) == 1, 0, 1) 312 | costMatrix[4, 1] <- 10 313 | costMatrix[3, 1] <- 5 314 | costMatrix[4, 2] <- 5 315 | costMatrix[3, 2] <- 5 316 | library(rpart) 317 | tmp <- x 318 | tmp$y <- y 319 | rpart(y~., data = tmp, control = rpart.control(cp = 0), 320 | parms =list(loss = costMatrix)) 321 | } 322 | rpPredict <- function(object, x) predict(object, x) 323 | 324 | rpAgg <- function (x, type = "class") 325 | { 326 | pooled <- x[[1]] * NA 327 | n <- nrow(pooled) 328 | classes <- colnames(pooled) 329 | for (i in 1:ncol(pooled)) 330 | { 331 | tmp <- lapply(x, function(y, col) y[, col], col = i) 332 | tmp <- do.call("rbind", tmp) 333 | pooled[, i] <- apply(tmp, 2, median) 334 | } 335 | pooled <- apply(pooled, 1, function(x) x/sum(x)) 336 | if (n != nrow(pooled)) pooled <- t(pooled) 337 | out <- factor(classes[apply(pooled, 1, which.max)], levels = classes) 338 | out 339 | } 340 | 341 | 342 | set.seed(857) 343 | rpCostBag <- train(trainData[, predictors], 344 | trainData$Class, 345 | "bag", 346 | B = 50, 347 | bagControl = bagControl(fit = rpCost, 348 | predict = rpPredict, 349 | aggregate = rpAgg, 350 | downSample = FALSE, 351 | allowParallel = FALSE), 352 | trControl = ctrl) 353 | rpCostBag 354 | 355 | set.seed(857) 356 | svmRFit <- train(modForm , 357 | data = trainData, 358 | method = "svmRadial", 359 | metric = "Cost", 360 | maximize = FALSE, 361 | preProc = c("center", "scale"), 362 | tuneLength = 15, 363 | trControl = ctrl) 364 | svmRFit 365 | 366 | set.seed(857) 367 | svmRFitCost <- train(modForm, data = trainData, 368 | method = "svmRadial", 369 | metric = "Cost", 370 | maximize = FALSE, 371 | preProc = c("center", "scale"), 372 | class.weights = c(VF = 1, F = 1, M = 5, L = 10), 373 | tuneLength = 15, 374 | trControl = ctrl) 375 | svmRFitCost 376 | 377 | modelList <- list(C5.0 = c50Fit, 378 | "C5.0 (Costs)" = c50Cost, 379 | CART =rpFit, 380 | "CART (Costs)" = rpFitCost, 381 | "Bagging (Costs)" = rpCostBag, 382 | FDA = fdaFit, 383 | SVM = svmRFit, 384 | "SVM (Weights)" = svmRFitCost, 385 | PLS = plsFit, 386 | "Random Forests" = rfFit, 387 | LDA = ldaFit, 388 | "LDA (Sparse)" = sldaFit, 389 | "Neural Networks" = nnetFit, 390 | Bagging = bagFit) 391 | 392 | 393 | ################################################################################ 394 | ### Section 17.2 Results 395 | 396 | rs <- resamples(modelList) 397 | summary(rs) 398 | 399 | confusionMatrix(rpFitCost, "none") 400 | confusionMatrix(rfFit, "none") 401 | 402 | plot(bwplot(rs, metric = "Cost")) 403 | 404 | rfPred <- predict(rfFit, testData) 405 | rpPred <- predict(rpFitCost, testData) 406 | 407 | confusionMatrix(rfPred, testData$Class) 408 | confusionMatrix(rpPred, testData$Class) 409 | 410 | 411 | ################################################################################ 412 | ### Session Information 413 | 414 | sessionInfo() 415 | 416 | q("no") 417 | -------------------------------------------------------------------------------- /inst/chapters/18_Importance.R: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | ### R code from Applied Predictive Modeling (2013) by Kuhn and Johnson. 
3 | ### Copyright 2013 Kuhn and Johnson 4 | ### Web Page: http://www.appliedpredictivemodeling.com 5 | ### Contact: Max Kuhn (mxkuhn@gmail.com) 6 | ### 7 | ### Chapter 18: Measuring Predictor Importance 8 | ### 9 | ### Required packages: AppliedPredictiveModeling, caret, CORElearn, corrplot, 10 | ### pROC, minerva 11 | ### 12 | ### 13 | ### Data used: The solubility data from the AppliedPredictiveModeling 14 | ### package, the segmentation data in the caret package and the 15 | ### grant data (created using "CreateGrantData.R" in the same 16 | ### directory as this file). 17 | ### 18 | ### Notes: 19 | ### 1) This code is provided without warranty. 20 | ### 21 | ### 2) This code should help the user reproduce the results in the 22 | ### text. There will be differences between this code and what is is 23 | ### the computing section. For example, the computing sections show 24 | ### how the source functions work (e.g. randomForest() or plsr()), 25 | ### which were not directly used when creating the book. Also, there may be 26 | ### syntax differences that occur over time as packages evolve. These files 27 | ### will reflect those changes. 28 | ### 29 | ### 3) In some cases, the calculations in the book were run in 30 | ### parallel. The sub-processes may reset the random number seed. 31 | ### Your results may slightly vary. 32 | ### 33 | ################################################################################ 34 | 35 | 36 | 37 | ################################################################################ 38 | ### Section 18.1 Numeric Outcomes 39 | 40 | ## Load the solubility data 41 | 42 | library(AppliedPredictiveModeling) 43 | data(solubility) 44 | 45 | trainData <- solTrainXtrans 46 | trainData$y <- solTrainY 47 | 48 | 49 | ## keep the continuous predictors and append the outcome to the data frame 50 | SolContPred <- solTrainXtrans[, !grepl("FP", names(solTrainXtrans))] 51 | numSolPred <- ncol(SolContPred) 52 | SolContPred$Sol <- solTrainY 53 | 54 | ## Get the LOESS smoother and the summary measure 55 | library(caret) 56 | smoother <- filterVarImp(x = SolContPred[, -ncol(SolContPred)], 57 | y = solTrainY, 58 | nonpara = TRUE) 59 | smoother$Predictor <- rownames(smoother) 60 | names(smoother)[1] <- "Smoother" 61 | 62 | ## Calculate the correlation matrices and keep the columns with the correlations 63 | ## between the predictors and the outcome 64 | 65 | correlations <- cor(SolContPred)[-(numSolPred+1),(numSolPred+1)] 66 | rankCorrelations <- cor(SolContPred, method = "spearman")[-(numSolPred+1),(numSolPred+1)] 67 | corrs <- data.frame(Predictor = names(SolContPred)[1:numSolPred], 68 | Correlation = correlations, 69 | RankCorrelation = rankCorrelations) 70 | 71 | ## The maximal information coefficient (MIC) values can be obtained from the 72 | ### minerva package: 73 | 74 | library(minerva) 75 | MIC <- mine(x = SolContPred[, 1:numSolPred], y = solTrainY)$MIC 76 | MIC <- data.frame(Predictor = rownames(MIC), 77 | MIC = MIC[,1]) 78 | 79 | 80 | ## The Relief values for regression can be computed using the CORElearn 81 | ## package: 82 | 83 | library(CORElearn) 84 | ReliefF <- attrEval(Sol ~ ., data = SolContPred, 85 | estimator = "RReliefFequalK") 86 | ReliefF <- data.frame(Predictor = names(ReliefF), 87 | Relief = ReliefF) 88 | 89 | ## Combine them all together for a plot 90 | contDescrScores <- merge(smoother, corrs) 91 | contDescrScores <- merge(contDescrScores, MIC) 92 | contDescrScores <- merge(contDescrScores, ReliefF) 93 | 94 | rownames(contDescrScores) <- 
contDescrScores$Predictor 95 | 96 | contDescrScores 97 | 98 | contDescrSplomData <- contDescrScores 99 | contDescrSplomData$Correlation <- abs(contDescrSplomData$Correlation) 100 | contDescrSplomData$RankCorrelation <- abs(contDescrSplomData$RankCorrelation) 101 | contDescrSplomData$Group <- "Other" 102 | contDescrSplomData$Group[grepl("Surface", contDescrSplomData$Predictor)] <- "SA" 103 | 104 | featurePlot(solTrainXtrans[, c("NumCarbon", "SurfaceArea2")], 105 | solTrainY, 106 | between = list(x = 1), 107 | type = c("g", "p", "smooth"), 108 | df = 3, 109 | aspect = 1, 110 | labels = c("", "Solubility")) 111 | 112 | 113 | splom(~contDescrSplomData[,c(3, 4, 2, 5)], 114 | groups = contDescrSplomData$Group, 115 | varnames = c("Correlation", "Rank\nCorrelation", "LOESS", "MIC")) 116 | 117 | 118 | ## Now look at the categorical (i.e. binary) predictors 119 | SolCatPred <- solTrainXtrans[, grepl("FP", names(solTrainXtrans))] 120 | SolCatPred$Sol <- solTrainY 121 | numSolCatPred <- ncol(SolCatPred) - 1 122 | 123 | tests <- apply(SolCatPred[, 1:numSolCatPred], 2, 124 | function(x, y) 125 | { 126 | tStats <- t.test(y ~ x)[c("statistic", "p.value", "estimate")] 127 | unlist(tStats) 128 | }, 129 | y = solTrainY) 130 | ## The results are a matrix with predictors in columns. We reverse this 131 | tests <- as.data.frame(t(tests)) 132 | names(tests) <- c("t.Statistic", "t.test_p.value", "mean0", "mean1") 133 | tests$difference <- tests$mean1 - tests$mean0 134 | tests 135 | 136 | ## Create a volcano plot 137 | 138 | xyplot(-log10(t.test_p.value) ~ difference, 139 | data = tests, 140 | xlab = "Mean With Structure - Mean Without Structure", 141 | ylab = "-log(p-Value)", 142 | type = "p") 143 | 144 | ################################################################################ 145 | ### Section 18.2 Categorical Outcomes 146 | 147 | ## Load the segmentation data 148 | 149 | data(segmentationData) 150 | segTrain <- subset(segmentationData, Case == "Train") 151 | segTrain$Case <- segTrain$Cell <- NULL 152 | 153 | segTest <- subset(segmentationData, Case != "Train") 154 | segTest$Case <- segTest$Cell <- NULL 155 | 156 | ## Compute the areas under the ROC curve 157 | aucVals <- filterVarImp(x = segTrain[, -1], y = segTrain$Class) 158 | aucVals$Predictor <- rownames(aucVals) 159 | 160 | ## Cacluate the t-tests as before but with x and y switched 161 | segTests <- apply(segTrain[, -1], 2, 162 | function(x, y) 163 | { 164 | tStats <- t.test(x ~ y)[c("statistic", "p.value", "estimate")] 165 | unlist(tStats) 166 | }, 167 | y = segTrain$Class) 168 | segTests <- as.data.frame(t(segTests)) 169 | names(segTests) <- c("t.Statistic", "t.test_p.value", "mean0", "mean1") 170 | segTests$Predictor <- rownames(segTests) 171 | 172 | ## Fit a random forest model and get the importance scores 173 | library(randomForest) 174 | set.seed(791) 175 | rfImp <- randomForest(Class ~ ., data = segTrain, 176 | ntree = 2000, 177 | importance = TRUE) 178 | rfValues <- data.frame(RF = importance(rfImp)[, "MeanDecreaseGini"], 179 | Predictor = rownames(importance(rfImp))) 180 | 181 | ## Now compute the Relief scores 182 | set.seed(791) 183 | 184 | ReliefValues <- attrEval(Class ~ ., data = segTrain, 185 | estimator="ReliefFequalK", ReliefIterations = 50) 186 | ReliefValues <- data.frame(Relief = ReliefValues, 187 | Predictor = names(ReliefValues)) 188 | 189 | ## and the MIC statistics 190 | set.seed(791) 191 | segMIC <- mine(x = segTrain[, -1], 192 | ## Pass the outcome as 0/1 193 | y = ifelse(segTrain$Class == "PS", 1, 0))$MIC 194 | segMIC 
<- data.frame(Predictor = rownames(segMIC), 195 | MIC = segMIC[,1]) 196 | 197 | 198 | rankings <- merge(segMIC, ReliefValues) 199 | rankings <- merge(rankings, rfValues) 200 | rankings <- merge(rankings, segTests) 201 | rankings <- merge(rankings, aucVals) 202 | rankings 203 | 204 | rankings$channel <- "Channel 1" 205 | rankings$channel[grepl("Ch2$", rankings$Predictor)] <- "Channel 2" 206 | rankings$channel[grepl("Ch3$", rankings$Predictor)] <- "Channel 3" 207 | rankings$channel[grepl("Ch4$", rankings$Predictor)] <- "Channel 4" 208 | rankings$t.Statistic <- abs(rankings$t.Statistic) 209 | 210 | splom(~rankings[, c("PS", "t.Statistic", "RF", "Relief", "MIC")], 211 | groups = rankings$channel, 212 | varnames = c("ROC\nAUC", "Abs\nt-Stat", "Random\nForest", "Relief", "MIC"), 213 | auto.key = list(columns = 2)) 214 | 215 | 216 | ## Load the grant data. A script to create and save these data is contained 217 | ## in the same directory as this file. 218 | 219 | load("grantData.RData") 220 | 221 | dataSubset <- training[pre2008, c("Sponsor62B", "ContractValueBandUnk", "RFCD240302")] 222 | 223 | ## This is a simple function to compute several statistics for binary predictors 224 | tableCalcs <- function(x, y) 225 | { 226 | tab <- table(x, y) 227 | fet <- fisher.test(tab) 228 | out <- c(OR = fet$estimate, 229 | P = fet$p.value, 230 | Gain = attrEval(y ~ x, estimator = "GainRatio")) 231 | } 232 | 233 | ## lapply() is used to execute the function on each column 234 | tableResults <- lapply(dataSubset, tableCalcs, y = training[pre2008, "Class"]) 235 | 236 | ## The results come back as a list of vectors, and "rbind" is used to join 237 | ## them together as rows of a table 238 | tableResults <- do.call("rbind", tableResults) 239 | tableResults 240 | 241 | ## The permuted Relief scores can be computed using a function from the 242 | ## AppliedPredictiveModeling package. 243 | 244 | permuted <- permuteRelief(x = training[pre2008, c("Sponsor62B", "Day", "NumCI")], 245 | y = training[pre2008, "Class"], 246 | nperm = 500, 247 | ### the remaining options are passed to attrEval() 248 | estimator = "ReliefFequalK", 249 | ReliefIterations = 50) 250 | 251 | ## The original Relief scores: 252 | permuted$observed 253 | 254 | ## The number of standard deviations away from the permuted mean: 255 | permuted$standardized 256 | 257 | ## The distributions of the scores if there were no relationship between the 258 | ## predictors and outcomes 259 | 260 | histogram(~value|Predictor, 261 | data = permuted$permutations, 262 | xlim = extendrange(permuted$permutations$value), 263 | xlab = "Relief Score") 264 | 265 | 266 | ################################################################################ 267 | ### Session Information 268 | 269 | sessionInfo() 270 | 271 | q("no") 272 | -------------------------------------------------------------------------------- /man/AlzheimerDisease.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{AlzheimerDisease} 5 | \alias{AlzheimerDisease} 6 | \alias{diagnosis} 7 | \alias{predictors} 8 | \title{Alzheimer's Disease CSF Data} 9 | \source{ 10 | Craig-Schapiro, R., Kuhn, M., Xiong, C., Pickering, E. H., Liu, J., 11 | Misko, T. P., Perrin, R. J., et al. (2011). Multiplexed Immunoassay Panel 12 | Identifies Novel CSF Biomarkers for Alzheimer's Disease Diagnosis and 13 | Prognosis. PLoS ONE, 6(4), e18850.
14 | } 15 | \value{ 16 | \item{diagnosis}{labels for the patients, either "Impaired" or 17 | "Control". } \item{predictors}{predictors for demographic data (e.g. age, 18 | gender), genotype and assay results.} 19 | } 20 | \description{ 21 | Washington University conducted a clinical study to determine if biological 22 | measurements made from cerebrospinal fluid (CSF) can be used to diagnose or 23 | predict Alzheimer's disease (Craig-Schapiro et al. 2011). These data are a 24 | modified version of the values used for the publication. 25 | } 26 | \details{ 27 | The R factor vector \code{diagnosis} contains the outcome data for 333 of 28 | the subjects. The demographic and laboratory results are collected in the 29 | data frame \code{predictors}. 30 | 31 | One important indicator of Alzheimer's disease is the genetic background of 32 | a subject. In particular, the versions of the Apolipoprotein E gene 33 | inherited from one's parents have an association with the disease. There are 34 | three variants of the gene: E2, E3 and E4. Since a child inherits a version 35 | of the gene from each parent, there are six possible combinations (e.g. 36 | E2/E2, E2/E3, and so on). This information is contained in the predictor column 37 | named \code{Genotype}. 38 | } 39 | \examples{ 40 | 41 | data(AlzheimerDisease) 42 | 43 | } 44 | \keyword{datasets} 45 | -------------------------------------------------------------------------------- /man/ChemicalManufacturingProcess.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{ChemicalManufacturingProcess} 5 | \alias{ChemicalManufacturingProcess} 6 | \title{Chemical Manufacturing Process Data} 7 | \value{ 8 | \code{ChemicalManufacturingProcess}: a data frame with columns for 9 | the outcome (\code{Yield}) and the predictors (\code{BiologicalMaterial01} 10 | through \code{BiologicalMaterial12} and \code{ManufacturingProcess01} through 11 | \code{ManufacturingProcess45}). 12 | } 13 | \description{ 14 | This data set contains information about a chemical manufacturing process, 15 | in which the goal is to understand the relationship between the process and 16 | the resulting final product yield. Raw material in this process is put 17 | through a sequence of 27 steps to generate the final pharmaceutical product. 18 | The starting material is generated from a biological unit and has a range of 19 | quality and characteristics. The objective in this project was to develop a 20 | model to predict percent yield of the manufacturing process. The data set 21 | consisted of 177 samples of biological material for which 57 characteristics 22 | were measured. Of the 57 characteristics, there were 12 measurements of the 23 | biological starting material, and 45 measurements of the manufacturing 24 | process. The process variables included measurements such as temperature, 25 | drying time, washing time, and concentrations of by-products at various 26 | steps. Some of the process measurements can be controlled, while others are 27 | observed. Predictors are continuous, count, or categorical; some are 28 | correlated, and some contain missing values. Samples are not independent 29 | because sets of samples come from the same batch of biological starting 30 | material.
31 | } 32 | \examples{ 33 | 34 | data(ChemicalManufacturingProcess) 35 | 36 | } 37 | \keyword{datasets} 38 | -------------------------------------------------------------------------------- /man/FuelEconomy.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{FuelEconomy} 5 | \alias{FuelEconomy} 6 | \alias{cars2010} 7 | \alias{cars2011} 8 | \alias{cars2012} 9 | \title{Fuel Economy Data} 10 | \value{ 11 | \item{cars2010}{data on cars from model year 2010. } 12 | \item{cars2011}{cars introduced in 2011 that were not in the model year 2010 13 | data.} \item{cars2012}{cars introduced in 2012 that were not in the model 14 | year 2010 or 2011 data. } 15 | } 16 | \description{ 17 | The \url{http://fueleconomy.gov} website, run by the U.S. Department of 18 | Energy's Office of Energy Efficiency and Renewable Energy and the U.S. 19 | Environmental Protection Agency, lists different estimates of fuel economy 20 | for passenger cars and trucks. For each vehicle, various characteristics are 21 | recorded such as the engine displacement or number of cylinders. Along with 22 | these values, laboratory measurements are made for the city and highway 23 | miles per gallon (MPG) of the car. 24 | } 25 | \details{ 26 | Predictors extracted from the website include: \code{EngDispl}, 27 | \code{NumCyl}, \code{Transmission}, \code{AirAspirationMethod}, 28 | \code{NumGears}, \code{TransLockup}, \code{TransCreeperGear}, 29 | \code{DriveDesc}, \code{IntakeValvePerCyl}, \code{ExhaustValvesPerCyl}, 30 | \code{CarlineClassDesc}, \code{VarValveTiming} and \code{VarValveLift}. The 31 | outcome used in the book is in column \code{FE} and is the unadjusted 32 | highway data. 33 | } 34 | \examples{ 35 | 36 | data(FuelEconomy) 37 | 38 | library(lattice) 39 | 40 | ### Plot shown in the text: 41 | 42 | cars2010 <- cars2010[order(cars2010$EngDispl),] 43 | cars2011 <- cars2011[order(cars2011$EngDispl),] 44 | 45 | cars2010a <- cars2010 46 | cars2010a$Year <- "2010 Model Year" 47 | cars2011a <- cars2011 48 | cars2011a$Year <- "2011 Model Year" 49 | 50 | plotData <- rbind(cars2010a, cars2011a) 51 | 52 | plotTheme <- bookTheme(FALSE) 53 | plotTheme$plot.symbol$col <- rgb(.2, .2, .2, .5) 54 | plotTheme$plot.symbol$cex <- 0.7 55 | trellis.par.set(plotTheme) 56 | 57 | xyplot(FE ~ EngDispl|Year, plotData, 58 | xlab = "Engine Displacement", 59 | ylab = "Fuel Efficiency (MPG)", 60 | between = list(x = 1.2)) 61 | 62 | 63 | } 64 | \keyword{datasets} 65 | -------------------------------------------------------------------------------- /man/Hepatic.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{hepatic} 5 | \alias{hepatic} 6 | \alias{bio} 7 | \alias{chem} 8 | \alias{injury} 9 | \title{Hepatic Injury Data} 10 | \value{ 11 | \item{bio}{Biological screen results. } \item{chem}{Chemical 12 | fingerprints for sub-structures.} \item{injury}{A factor vector of 13 | outcomes.} 14 | } 15 | \description{ 16 | This data set was used to develop a model for predicting compounds' 17 | probability of causing hepatic injury (i.e. liver damage). This data set 18 | consisted of 281 unique compounds; 376 predictors were measured or computed 19 | for each. The response was categorical (either "None", "Mild" or "Severe" 20 | ), and was highly unbalanced.
21 | } 22 | \details{ 23 | This kind of response often occurs in pharmaceutical data because companies 24 | steer away from creating molecules that have undesirable characteristics. 25 | Therefore, well-behaved molecules often greatly outnumber undesirable 26 | molecules. The predictors consisted of measurements from 184 biological 27 | screens and 192 chemical feature predictors. The biological predictors 28 | represent activity for each screen and take values between 0 and 10 with a 29 | mode of 4. The chemical feature predictors represent counts of important 30 | sub-structures as well as measures of physical properties that are thought 31 | to be associated with hepatic injury. 32 | } 33 | \examples{ 34 | 35 | data(hepatic) 36 | 37 | } 38 | \keyword{datasets} 39 | -------------------------------------------------------------------------------- /man/abalone.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{abalone} 5 | \alias{abalone} 6 | \title{Abalone Data} 7 | \value{ 8 | \item{abalone}{a data frame with 4177 rows and 9 columns} 9 | } 10 | \description{ 11 | The Abalone data consist of data from 4177 abalones. The data consist of 12 | measurements of the type (male, female and infant), the longest shell 13 | measurement, the diameter, height and several weights (whole, shucked, 14 | viscera and shell). The outcome is the number of rings. The age of the 15 | abalone is the number of rings plus 1.5. 16 | } 17 | \details{ 18 | The data are taken from the UCI database 19 | (\url{http://archive.ics.uci.edu/ml/datasets/Abalone}). 20 | } 21 | \examples{ 22 | 23 | data(abalone) 24 | 25 | } 26 | \keyword{datasets} 27 | -------------------------------------------------------------------------------- /man/apm-internal.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \name{apm-internal} 4 | \alias{apm-internal} 5 | \alias{lowerp} 6 | \alias{upperp} 7 | \title{Internal Functions} 8 | \arguments{ 9 | \item{\dots}{optional arguments to pass to internal functions} 10 | } 11 | \description{ 12 | Internal functions 13 | } 14 | \author{ 15 | Max Kuhn 16 | } 17 | \keyword{internal} 18 | -------------------------------------------------------------------------------- /man/bookTheme.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/bookTheme.R 3 | \name{bookTheme} 4 | \alias{bookTheme} 5 | \alias{transparentTheme} 6 | \title{Lattice Themes} 7 | \usage{ 8 | bookTheme(set = TRUE) 9 | 10 | transparentTheme(set = TRUE, pchSize = 1, trans = 0.2) 11 | } 12 | \arguments{ 13 | \item{set}{a logical: should these settings be applied to the current 14 | device?} 15 | 16 | \item{pchSize}{the size of the plot symbols} 17 | 18 | \item{trans}{the amount of transparency (via the alpha channel). Note that 19 | transparency is not supported by all graphics devices.} 20 | } 21 | \value{ 22 | Each function returns a list of theme parameters. See Sarkar (2008) 23 | or \code{\link[lattice]{trellis.par.get}} for specific details. 24 | } 25 | \description{ 26 | Two \pkg{lattice} themes used throughout the book. 
27 | } 28 | \details{ 29 | When using these functions to save a plot, make sure to invoke them after 30 | the device has been opened (e.g. after calls such as \code{pdf()}). 31 | } 32 | \examples{ 33 | 34 | library(lattice) 35 | 36 | example <- quadBoundaryFunc(100) 37 | 38 | bookTheme(set = TRUE) 39 | xyplot(X2 ~ X1, data = example, groups = class, auto.key = TRUE) 40 | 41 | transparentTheme(set = TRUE, trans = .6) 42 | xyplot(X2 ~ X1, data = example, groups = class, auto.key = TRUE) 43 | 44 | } 45 | \references{ 46 | Some of the colors are based on values from ColorBrewer 47 | \url{http://www.colorbrewer.org}. 48 | 49 | Sarkar, D. (2008). Lattice: Multivariate Data Visualization with R. UseR! 50 | (1st ed. p. 286). Springer. 51 | } 52 | \author{ 53 | Max Kuhn 54 | } 55 | \keyword{hplot} 56 | -------------------------------------------------------------------------------- /man/concrete.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{concrete} 5 | \alias{concrete} 6 | \alias{mixtures} 7 | \title{Compressive Strength of Concrete from Yeh (1998)} 8 | \source{ 9 | Yeh, I. C. (1998). Modeling of strength of high-performance concrete 10 | using artificial neural networks. \emph{Cement and Concrete Research}, 11 | 28(12), 1797-1808. Elsevier. 12 | } 13 | \value{ 14 | \item{concrete}{data frame with predictor columns 15 | \code{Cement}, \code{BlastFurnaceSlag}, \code{FlyAsh}, \code{Water}, 16 | \code{Superplasticizer}, \code{CoarseAggregate}, \code{FineAggregate} and 17 | \code{Age} with response column \code{CompressiveStrength}. These are the 18 | amounts.} \item{mixtures}{The same data where all the ingredients have been 19 | converted to proportions of the total amounts.} 20 | } 21 | \description{ 22 | Yeh (1998) describes a collection of data sets from different sources that 23 | can be used for modeling the compressive strength of concrete formulations 24 | as a function of their ingredients and age. 25 | } 26 | \details{ 27 | The data are from Yeh (1998) and taken from the UCI ML website 28 | \url{http://archive.ics.uci.edu/ml/datasets/Concrete+Compressive+Strength}. 29 | 30 | There are 1030 data points on the UCI website; the paper states that 31 | approximately 1,000 samples were made, but only 727 were analyzed in the 32 | source material. It is unclear which samples were excluded. 33 | } 34 | \examples{ 35 | 36 | if (require("caret")) { 37 | data(concrete) 38 | 39 | library(caret) 40 | 41 | ### Split used in the book: 42 | set.seed(975) 43 | inTrain <- createDataPartition(mixtures$CompressiveStrength, p = 3/4)[[1]] 44 | training <- mixtures[ inTrain,] 45 | testing <- mixtures[-inTrain,] 46 | } 47 | 48 | 49 | } 50 | \keyword{datasets} 51 | -------------------------------------------------------------------------------- /man/getPackages.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/getPackages.R 3 | \name{getPackages} 4 | \alias{getPackages} 5 | \title{Install Packages for Each Chapter} 6 | \usage{ 7 | getPackages(chapter, ...) 8 | } 9 | \arguments{ 10 | \item{chapter}{an integer vector (or character versions of the integer) for 11 | the chapter number.
See Details below.} 12 | 13 | \item{...}{options to pass to \code{\link[utils]{install.packages}}} 14 | } 15 | \description{ 16 | This function installs the R packages needed to reproduce the analyses in the 17 | specified chapter(s) of the book. 18 | } 19 | \details{ 20 | The chapter number(s) are matched to the packages used in those chapters, which are then installed via \code{\link[utils]{install.packages}} along with their dependencies. 21 | } 22 | \examples{ 23 | 24 | \dontrun{ 25 | getPackages(2) 26 | getPackages(2:3) 27 | getPackages("4") 28 | } 29 | 30 | } 31 | \author{ 32 | Max Kuhn 33 | } 34 | \keyword{utilities} 35 | -------------------------------------------------------------------------------- /man/logisticCreditPredictions.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{logisticCreditPredictions} 5 | \alias{logisticCreditPredictions} 6 | \title{Logistic Regression Predictions for the Credit Data} 7 | \value{ 8 | A data frame with columns \item{Bad}{The predicted class probability 9 | for bad credit. } \item{Good}{The predicted class probability for good 10 | credit.} \item{pred}{The predicted class. } \item{obs}{The observed class } 11 | } 12 | \description{ 13 | Class probability predictions, predicted classes and observed classes from a logistic regression model fit to the credit data used in the text. 14 | } 15 | \examples{ 16 | 17 | ## show code to make the predictions 18 | 19 | } 20 | \keyword{datasets} 21 | -------------------------------------------------------------------------------- /man/permeability.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{permeability} 5 | \alias{permeability} 6 | \alias{fingerprints} 7 | \title{Permeability Data} 8 | \source{ 9 | Kansy, M., Senner, F., and Gubernator, K. (1998). Physicochemical 10 | High Throughput Screening: Parallel Artificial Membrane Permeation Assay in 11 | the Description of Passive Absorption Processes. J. Med. Chem, 41(7), 12 | 1007-1010. 13 | } 14 | \value{ 15 | \item{permeability}{permeability values for each compound. } 16 | \item{fingerprints}{a matrix of binary fingerprint indicator variables.} 17 | } 18 | \description{ 19 | This pharmaceutical data set was used to develop a model for predicting 20 | compounds' permeability. In short, permeability is the measure of a 21 | molecule's ability to cross a membrane. The body, for example, has notable 22 | membranes between the body and brain, known as the blood-brain barrier, and 23 | between the gut and body in the intestines. These membranes help the body 24 | guard critical regions from receiving undesirable or detrimental substances. 25 | For an orally taken drug to be effective in the brain, it first must pass 26 | through the intestinal wall and then must pass through the blood-brain 27 | barrier in order to be present for the desired neurological target. 28 | Therefore, a compound's ability to permeate relevant biological membranes is 29 | critically important to understand early in the drug discovery process. 30 | Compounds that appear to be effective for a particular disease in research 31 | screening experiments but appear to be poorly permeable may need to be 32 | altered in order to improve permeability, and thus the compound's ability to 33 | reach the desired target. Identifying permeability problems can help guide 34 | chemists towards better molecules.
35 | } 36 | \details{ 37 | Permeability assays such as PAMPA and Caco-2 have been developed to help 38 | measure compounds' permeability (Kansy et al, 1998). These screens are 39 | effective at quantifying a compound's permeability, but the assay is 40 | expensive and labor intensive. Given a sufficient number of compounds that have 41 | been screened, we could develop a predictive model for permeability in an 42 | attempt to reduce the need for the assay. In this project there 43 | were 165 unique compounds; 1107 molecular fingerprints were determined for 44 | each. A molecular fingerprint is a binary sequence of numbers that 45 | represents the presence or absence of a specific molecular sub-structure. 46 | The response is highly skewed, the predictors are sparse (15.5 percent are 47 | present), and many predictors are strongly associated. 48 | } 49 | \examples{ 50 | 51 | data(permeability) 52 | 53 | hist(permeability) 54 | 55 | summary(apply(fingerprints, 2, mean)) 56 | 57 | } 58 | \keyword{datasets} 59 | -------------------------------------------------------------------------------- /man/permuteRelief.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/permuteRelief.R 3 | \name{permuteRelief} 4 | \alias{permuteRelief} 5 | \title{Permutation Statistics for the Relief Algorithm} 6 | \usage{ 7 | permuteRelief(x, y, nperm = 100, ...) 8 | } 9 | \arguments{ 10 | \item{x}{a data frame of predictor data} 11 | 12 | \item{y}{a vector of outcomes} 13 | 14 | \item{nperm}{the number of random permutations of the data} 15 | 16 | \item{\dots}{options to pass to \code{\link[CORElearn]{attrEval}}, such as 17 | the exact Relief algorithm to use} 18 | } 19 | \value{ 20 | a list with elements \item{standardized }{a vector of standardized 21 | predictor scores} \item{permutations }{the values of the permuted scores, 22 | for plotting to assess the permutation distribution} \item{observed}{the 23 | observed scores} \item{options}{a list of options passed using \ldots{}} 24 | } 25 | \description{ 26 | This function uses a permutation approach to determining the relative 27 | magnitude of Relief scores (Kira and Rendell, 1992; Kononenko, 1994). 28 | } 29 | \details{ 30 | The scores for each predictor are computed using the original data and after 31 | outcome data are randomly scrambled (\code{nperm} times). The mean and 32 | standard deviation of the permuted values are determined, and a standardized 33 | version of the observed scores is computed by subtracting the permuted 34 | means from the original values, then dividing each by the corresponding 35 | standard deviation. 36 | } 37 | \examples{ 38 | 39 | set.seed(874) 40 | reliefEx3 <- easyBoundaryFunc(500) 41 | reliefEx3$X1 <- scale(reliefEx3$X1) 42 | reliefEx3$X2 <- scale(reliefEx3$X2) 43 | reliefEx3$prob <- NULL 44 | 45 | standardized <- permuteRelief(reliefEx3[, 1:2], reliefEx3$class, 46 | ## For efficiency, a small number of 47 | ## permutations are used here. 48 | nperm = 50, 49 | estimator = "ReliefFequalK", 50 | ReliefIterations = 50) 51 | 52 | 53 | } 54 | \references{ 55 | Kira, K., & Rendell, L. (1992). The feature selection problem: 56 | Traditional methods and a new algorithm. \emph{Proceedings of the Eleventh 57 | International Conference on Machine Learning}, 129-129. 58 | 59 | Kononenko, I. (1994). Estimating attributes: analysis and extensions of 60 | RELIEF. \emph{Machine Learning: ECML-94}, 171-182.
61 | } 62 | \seealso{ 63 | \code{\link[CORElearn]{attrEval}} 64 | } 65 | \author{ 66 | Max Kuhn 67 | } 68 | \keyword{htest} 69 | -------------------------------------------------------------------------------- /man/quadBoundaryFunc.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/quadBoundaryFunc.R 3 | \name{quadBoundaryFunc} 4 | \alias{quadBoundaryFunc} 5 | \alias{easyBoundaryFunc} 6 | \title{Functions for Simulating Data} 7 | \usage{ 8 | quadBoundaryFunc(n) 9 | 10 | easyBoundaryFunc(n, intercept = 0, interaction = 2) 11 | } 12 | \arguments{ 13 | \item{n}{the sample size} 14 | 15 | \item{intercept}{the coefficient for the logistic regression intercept term} 16 | 17 | \item{interaction}{the coefficient for the logistic regression interaction 18 | term} 19 | } 20 | \value{ 21 | Both functions return data frames with columns \item{X1}{numeric 22 | predictor value} \item{X2}{numeric predictor value} \item{prob }{numeric 23 | value reflecting the true probability of the first class} \item{class }{a 24 | factor variable with levels 'Class1' and 'Class2'} 25 | } 26 | \description{ 27 | These functions simulate data that are used in the text. 28 | } 29 | \details{ 30 | The \code{quadBoundaryFunc} function creates a class boundary that is a 31 | function of both predictors. The probability values are based on a logistic 32 | regression model with model equation: \eqn{-1-2X_1 -0.2X_1^2 + 33 | 2X_2^2}{-1-2*X1 -0.2*X1^2 + 2*X2^2}. The predictors here are multivariate 34 | normal with mean (1, 0) and a moderate degree of positive correlation. 35 | 36 | Similarly, the \code{easyBoundaryFunc} uses a logistic regression model with 37 | model equation: \eqn{intercept -4X_1 + 4X_2 + interaction \times X_1 \times 38 | X_2}{intercept -4*X1 + 4*X2 + interaction*X1*X2}. The predictors here are 39 | multivariate normal with mean (1, 0) and a strong positive correlation. 40 | } 41 | \examples{ 42 | 43 | ## in Chapter 11, 'Measuring Performance in Classification Model' 44 | set.seed(975) 45 | training <- quadBoundaryFunc(500) 46 | testing <- quadBoundaryFunc(1000) 47 | 48 | 49 | ## in Chapter 20, 'Factors That Can Affect Model Performance' 50 | set.seed(615) 51 | dat <- easyBoundaryFunc(200, interaction = 3, intercept = 3) 52 | dat$X1 <- scale(dat$X1) 53 | dat$X2 <- scale(dat$X2) 54 | dat$Data <- "Original" 55 | dat$prob <- NULL 56 | 57 | ## in Chapter X, 'An Introduction to Feature Selection' 58 | 59 | set.seed(874) 60 | reliefEx3 <- easyBoundaryFunc(500) 61 | reliefEx3$X1 <- scale(reliefEx3$X1) 62 | reliefEx3$X2 <- scale(reliefEx3$X2) 63 | reliefEx3$prob <- NULL 64 | 65 | 66 | } 67 | \author{ 68 | Max Kuhn 69 | } 70 | \keyword{utilities} 71 | -------------------------------------------------------------------------------- /man/schedulingData.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{schedulingData} 5 | \alias{schedulingData} 6 | \title{HPC Job Scheduling Data} 7 | \value{ 8 | \item{schedulingData}{a data frame with 4331 rows and 8 columns} 9 | } 10 | \description{ 11 | These data consist of information on 4331 jobs in a high performance 12 | computing environment. Seven attributes were recorded for each job along 13 | with a discrete class describing the execution time. 
14 | } 15 | \details{ 16 | The predictors are: \code{Protocol} (the type of computation), 17 | \code{Compounds} (the number of data points for each job), 18 | \code{InputFields} (the number of characteristics being estimated), 19 | \code{Iterations} (maximum number of iterations for the computations), 20 | \code{NumPending} (the number of other jobs pending at the time of launch), 21 | \code{Hour} (decimal hour of day for launch time) and \code{Day} (of launch 22 | time). 23 | 24 | The classes are: \code{VF} (very fast), \code{F} (fast), \code{M} (moderate) 25 | and \code{L} (long). 26 | } 27 | \examples{ 28 | 29 | data(schedulingData) 30 | 31 | library(caret) 32 | 33 | set.seed(1104) 34 | inTrain <- createDataPartition(schedulingData$Class, p = .8, list = FALSE) 35 | 36 | schedulingData$NumPending <- schedulingData$NumPending + 1 37 | 38 | trainData <- schedulingData[ inTrain,] 39 | testData <- schedulingData[-inTrain,] 40 | 41 | modForm <- as.formula(Class ~ Protocol + log10(Compounds) + 42 | log10(InputFields) + log10(Iterations) + 43 | log10(NumPending) + Hour + Day) 44 | 45 | 46 | 47 | } 48 | \keyword{datasets} 49 | -------------------------------------------------------------------------------- /man/scriptLocation.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/scriptLocation.R 3 | \name{scriptLocation} 4 | \alias{scriptLocation} 5 | \title{Find Chapter Script Files} 6 | \usage{ 7 | scriptLocation() 8 | } 9 | \description{ 10 | This function identifies the physical location on the user's computer where 11 | the chapter R scripts are located. 12 | } 13 | \examples{ 14 | 15 | scriptLocation() 16 | 17 | } 18 | \author{ 19 | Max Kuhn 20 | } 21 | \keyword{utilities} 22 | -------------------------------------------------------------------------------- /man/segmentationOriginal.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{segmentationOriginal} 5 | \alias{segmentationOriginal} 6 | \title{Cell Body Segmentation} 7 | \source{ 8 | Hill, LaPan, Li and Haney (2007). Impact of image segmentation on 9 | high-content screening data quality for SK-BR-3 cells, \emph{BMC 10 | Bioinformatics}, Vol. 8, pg. 340, 11 | \url{http://www.biomedcentral.com/1471-2105/8/340}. 12 | } 13 | \value{ 14 | \item{segmentationOriginal}{data frame of cells} 15 | } 16 | \description{ 17 | Hill, LaPan, Li and Haney (2007) develop models to predict which cells in a 18 | high content screen were well segmented. The data consists of 119 imaging 19 | measurements on 2019 cells. The original analysis used 1009 for training and 1010 20 | as a test set (see the column called \code{Case}). 21 | } 22 | \details{ 23 | The outcome class is contained in a factor variable called \code{Class} with 24 | levels "PS" for poorly segmented and "WS" for well segmented. 25 | 26 | A pre-processed version of these data can be found in the \pkg{caret} 27 | package.
28 | } 29 | \keyword{datasets} 30 | -------------------------------------------------------------------------------- /man/solubility.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{solubility} 5 | \alias{solubility} 6 | \alias{trainX} 7 | \alias{solTestXtrans} 8 | \alias{solTrainY} 9 | \alias{solTestX} 10 | \alias{solTrainX} 11 | \alias{solTrainXtrans} 12 | \alias{solTestY} 13 | \title{Solubility Data} 14 | \source{ 15 | Tetko, I., Tanchuk, V., Kasheva, T., and Villa, A. (2001). 16 | Estimation of aqueous solubility of chemical compounds using E-state 17 | indices. \emph{Journal of Chemical Information and Computer Sciences}, 18 | 41(6), 1488-1493. 19 | 20 | Huuskonen, J. (2000). Estimation of aqueous solubility for a diverse set of 21 | organic compounds based on molecular topology. \emph{Journal of Chemical 22 | Information and Computer Sciences}, 40(3), 773-777. 23 | } 24 | \value{ 25 | \item{solTrainX}{training set predictors in their natural units. } 26 | \item{solTrainXtrans}{training set predictors after transformations for 27 | skewness and centering/scaling.} \item{solTrainY}{a vector of log10 28 | solubility values for the training set. } \item{solTestX}{test set 29 | predictors in their natural units. } \item{solTestXtrans}{test set 30 | predictors after the same transformations used on the training set are 31 | applied.} \item{solTestY}{a vector of log10 solubility values for the 32 | test set. } 33 | } 34 | \description{ 35 | Tetko et al. (2001) and Huuskonen (2000) investigated a set of compounds 36 | with corresponding experimental solubility values using complex sets of 37 | descriptors. They used linear regression and neural network models to 38 | estimate the relationship between chemical structure and solubility. For our 39 | analyses, we will use 1267 compounds and a set of more understandable 40 | descriptors that fall into one of three groups: 208 binary "fingerprints" 41 | that indicate the presence or absence of a particular chemical 42 | sub-structure, 16 count descriptors (such as the number of bonds or the 43 | number of Bromine atoms) and 4 continuous descriptors (such as molecular 44 | weight or surface area). 45 | } 46 | \examples{ 47 | 48 | data(solubility) 49 | 50 | library(caret) 51 | 52 | ### Cross-validation splits used in the book: 53 | set.seed(100) 54 | indx <- createFolds(solTrainY, returnTrain = TRUE) 55 | 56 | ### To re-create the transformed version of the data: 57 | \dontrun{ 58 | ## Find the predictors that are not fingerprints 59 | contVars <- names(solTrainX)[!grepl("FP", names(solTrainX))] 60 | ## Some have zero values, so we need to add one to them so that 61 | ## we can use the Box-Cox transformation. Alternatively, we could 62 | ## use the Yeo-Johnson transformation without altering the data. 63 | contPredTrain <- solTrainX[,contVars] + 1 64 | contPredTest <- solTestX[,contVars] + 1 65 | 66 | pp <- preProcess(contPredTrain, method = "BoxCox") 67 | contPredTrain <- predict(pp, contPredTrain) 68 | contPredTest <- predict(pp, contPredTest) 69 | 70 | ## Reassemble the fingerprint data with the transformed values.
71 | trainXtrans <- cbind(solTrainX[,grep("FP", names(solTrainX))], contPredTrain) 72 | testXtrans <- cbind( solTestX[,grep("FP", names(solTestX))], contPredTest) 73 | 74 | all.equal(trainXtrans, solTrainXtrans) 75 | all.equal(testXtrans, solTestXtrans) 76 | } 77 | 78 | 79 | } 80 | \keyword{datasets} 81 | -------------------------------------------------------------------------------- /man/twoClassData.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{twoClassData} 5 | \alias{twoClassData} 6 | \alias{classes} 7 | \title{Two Class Example Data} 8 | \value{ 9 | \item{predictors}{data frame of two predictors} \item{classes}{a 10 | factor vector of class labels} 11 | } 12 | \description{ 13 | These data contain two predictors measured for 208 samples. Of these, 111 14 | samples are labeled as \code{Class1} and the remaining 97 are \code{Class2}. 15 | } 16 | \examples{ 17 | 18 | data(twoClassData) 19 | 20 | library(lattice) 21 | xyplot(PredictorB ~ PredictorA, 22 | data = predictors, 23 | groups = classes, 24 | auto.key = TRUE) 25 | 26 | 27 | } 28 | \keyword{datasets} 29 | --------------------------------------------------------------------------------