├── .Rbuildignore ├── .gitignore ├── AppliedPredictiveModeling.Rproj ├── DESCRIPTION ├── NAMESPACE ├── R ├── bookTheme.R ├── data.R ├── getPackages.R ├── panels.R ├── permuteRelief.R ├── quadBoundaryFunc.R └── scriptLocation.R ├── README.md ├── data ├── AlzheimerDisease.RData ├── ChemicalManufacturingProcess.RData ├── FuelEconomy.RData ├── abalone.RData ├── concrete.RData ├── hepatic.RData ├── logisticCreditPredictions.RData ├── permeability.RData ├── schedulingData.RData ├── segmentationOriginal.RData ├── solubility.RData └── twoClassData.RData ├── inst ├── NEWS.Rd └── chapters │ ├── 02_A_Short_Tour.R │ ├── 02_A_Short_Tour.Rout │ ├── 03_Data_Pre_Processing.R │ ├── 03_Data_Pre_Processing.Rout │ ├── 04_Over_Fitting.R │ ├── 04_Over_Fitting.Rout │ ├── 06_Linear_Regression.R │ ├── 06_Linear_Regression.Rout │ ├── 07_Non-Linear_Reg.R │ ├── 07_Non-Linear_Reg.Rout │ ├── 08_Regression_Trees.R │ ├── 08_Regression_Trees.Rout │ ├── 10_Case_Study_Concrete.R │ ├── 10_Case_Study_Concrete.Rout │ ├── 11_Class_Performance.R │ ├── 11_Class_Performance.Rout │ ├── 12_Discriminant_Analysis.R │ ├── 12_Discriminant_Analysis.Rout │ ├── 13_Non-Linear_Class.R │ ├── 13_Non-Linear_Class.Rout │ ├── 14_Class_Trees.R │ ├── 14_Class_Trees.Rout │ ├── 16_Class_Imbalance.R │ ├── 16_Class_Imbalance.Rout │ ├── 17_Job_Scheduling.R │ ├── 17_Job_Scheduling.Rout │ ├── 18_Importance.R │ ├── 18_Importance.Rout │ ├── 19_Feature_Select.R │ ├── 19_Feature_Select.Rout │ ├── CreateGrantData.R │ └── CreateGrantData.Rout └── man ├── AlzheimerDisease.Rd ├── ChemicalManufacturingProcess.Rd ├── FuelEconomy.Rd ├── Hepatic.Rd ├── abalone.Rd ├── apm-internal.Rd ├── bookTheme.Rd ├── concrete.Rd ├── getPackages.Rd ├── logisticCreditPredictions.Rd ├── permeability.Rd ├── permuteRelief.Rd ├── quadBoundaryFunc.Rd ├── schedulingData.Rd ├── scriptLocation.Rd ├── segmentationOriginal.Rd ├── solubility.Rd └── twoClassData.Rd /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # History files 2 | .Rhistory 3 | .Rapp.history 4 | 5 | # Session Data files 6 | .RData 7 | 8 | # Example code in package build process 9 | *-Ex.R 10 | 11 | # Output files from R CMD build 12 | /*.tar.gz 13 | 14 | # Output files from R CMD check 15 | /*.Rcheck/ 16 | 17 | # RStudio files 18 | .Rproj.user/ 19 | 20 | # produced vignettes 21 | vignettes/*.html 22 | vignettes/*.pdf 23 | 24 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 25 | .httr-oauth 26 | 27 | # knitr and R markdown default cache directories 28 | /*_cache/ 29 | /cache/ 30 | 31 | # Temporary files created by R markdown 32 | *.utf8.md 33 | *.knit.md 34 | 35 | # Shiny token, see https://shiny.rstudio.com/articles/shinyapps.html 36 | rsconnect/ 37 | inst/.DS_Store 38 | -------------------------------------------------------------------------------- /AppliedPredictiveModeling.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | BuildType: Package 16 | PackageUseDevtools: Yes 17 | PackageInstallArgs: --no-multiarch --with-keep.source 18 | 
PackageRoxygenize: rd,collate,namespace 19 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: AppliedPredictiveModeling 2 | Type: Package 3 | Title: Functions and Data Sets for 'Applied Predictive Modeling' 4 | Version: 1.1.9000 5 | Authors@R: c( 6 | person("Max", "Kuhn", , "mxkuhn@gmail.com", role = c("aut", "cre")), 7 | person("Kjell", "Johnson", role = c("aut")) 8 | ) 9 | Description: A few functions and several data sets for the Springer book 10 | 'Applied Predictive Modeling'. 11 | URL: http://appliedpredictivemodeling.com/ 12 | Depends: R (>= 2.10) 13 | Imports: CORElearn, MASS, lattice, ellipse, reshape2 14 | Suggests: caret (>= 6.0-22) 15 | License: GPL-2 16 | RoxygenNote: 7.2.3 17 | Encoding: UTF-8 18 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(bookTheme) 4 | export(easyBoundaryFunc) 5 | export(getPackages) 6 | export(permuteRelief) 7 | export(quadBoundaryFunc) 8 | export(scriptLocation) 9 | export(transparentTheme) 10 | -------------------------------------------------------------------------------- /R/bookTheme.R: -------------------------------------------------------------------------------- 1 | #' Lattice Themes 2 | #' 3 | #' Two \pkg{lattice} themes used throughout the book. 4 | #' 5 | #' When using these functions to save a plot, make sure to invoke them after 6 | #' the device has been opened (e.g. after calls such as \code{pdf()}). 7 | #' 8 | #' @aliases bookTheme transparentTheme 9 | #' @param set a logical: should these settings be applied to the current 10 | #' device? 11 | #' @param pchSize the size of the plot symbols 12 | #' @param trans the amount of transparency (via the alpha channel). Note that 13 | #' transparency is not supported by all graphics devices. 14 | #' @return Each function returns a list of theme parameters. See Sarkar (2008) 15 | #' or \code{\link[lattice]{trellis.par.get}} for specific details. 16 | #' @author Max Kuhn 17 | #' @references Some of the colors are based on values from ColorBrewer 18 | #' \url{http://www.colorbrewer.org}. 19 | #' 20 | #' Sarkar, D. (2008). Lattice: Multivariate Data Visualization with R. UseR! 21 | #' (1st ed. p. 286). Springer.
22 | #' @keywords hplot 23 | #' @examples 24 | #' 25 | #' library(lattice) 26 | #' 27 | #' example <- quadBoundaryFunc(100) 28 | #' 29 | #' bookTheme(set = TRUE) 30 | #' xyplot(X2 ~ X1, data = example, groups = class, auto.key = TRUE) 31 | #' 32 | #' transparentTheme(set = TRUE, trans = .6) 33 | #' xyplot(X2 ~ X1, data = example, groups = class, auto.key = TRUE) 34 | #' 35 | #' @export bookTheme 36 | bookTheme <- function(set = TRUE){ 37 | theme <- list( 38 | plot.polygon = list(alpha = 1, col = "aliceblue", border = "black", lty = 1, lwd = 1), 39 | background = list(col = "transparent"), 40 | bar.fill = list(col = "#cce6ff"), 41 | box.rectangle = list(col = "black"), 42 | box.umbrella = list(col = "black"), 43 | dot.line = list(col = "#e8e8e8"), 44 | dot.symbol = list(col = "black"), 45 | plot.line = list(col = "black", lwd = 1, lty = 1), 46 | plot.symbol = list(col = "black", pch = 16), 47 | regions = list(col = 48 | c("#FEF8FA", "#FDF6F9", "#FBF5F9", "#FAF3F8", 49 | "#F8F2F7", "#F7F0F7", "#F5EEF6", "#F4EDF5", 50 | "#F2EBF5", "#F1EAF4", "#EFE8F3", "#EDE7F2", 51 | "#ECE5F1", "#EAE4F1", "#E8E2F0", "#E6E1EF", 52 | "#E4DFEE", "#E2DEED", "#E0DCEC", "#DEDAEB", 53 | "#DCD9EA", "#D9D7E9", "#D7D6E8", "#D4D4E7", 54 | "#D1D2E6", "#CED1E5", "#CCCFE4", "#C8CEE3", 55 | "#C5CCE2", "#C2CAE1", "#BFC9E0", "#BBC7DF", 56 | "#B8C5DF", "#B4C4DE", "#B1C2DD", "#ADC0DC", 57 | "#A9BFDB", "#A6BDDA", "#A2BBD9", "#9EB9D9", 58 | "#9BB8D8", "#97B6D7", "#93B4D6", "#8FB2D5", 59 | "#8BB0D4", "#87AFD3", "#83ADD2", "#7FABD1", 60 | "#7AA9D0", "#76A7CF", "#71A5CE", "#6CA3CC", 61 | "#68A1CB", "#63A0CA", "#5D9EC9", "#589CC8", 62 | "#539AC6", "#4E98C5", "#4996C4", "#4493C3", 63 | "#3F91C1", "#3A8FC0", "#358DBF", "#308BBE", 64 | "#2C89BD", "#2887BC", "#2385BB", "#1F83BA", 65 | "#1C80B9", "#187EB7", "#157CB6", "#127AB5", 66 | "#0F78B3", "#0D76B2", "#0A73B0", "#0971AE", 67 | "#076FAC", "#066DAA", "#056AA7", "#0568A5") 68 | ), 69 | strip.shingle = list(col = c( 70 | "#ff7f00", "#00ff00", "#00ffff", 71 | "#ff00ff", "#ff0000", "#ffff00", "#0080ff") 72 | ), 73 | strip.background = list(col = c( 74 | "#ffe5cc", "#ccffcc", "#ccffff", 75 | "#ffccff", "#ffcccc", "#ffffcc", "#cce6ff") 76 | ), 77 | reference.line = list(col = "#e8e8e8"), 78 | superpose.line = list( 79 | col = c( 80 | "#053061", "#B2182B", "#F46D43", "#5E4FA2", "#66C2A5", "black", 81 | "#053061", "#B2182B", "#F46D43", "#5E4FA2", "#66C2A5", "black", 82 | "#053061", "#B2182B", "#F46D43", "#5E4FA2", "#66C2A5", "black", 83 | "#053061", "#B2182B", "#F46D43", "#5E4FA2", "#66C2A5", "black", 84 | "#053061", "#B2182B", "#F46D43", "#5E4FA2", "#66C2A5", "black", 85 | "#053061", "#B2182B", "#F46D43", "#5E4FA2", "#66C2A5", "black" 86 | ), 87 | lty = rep(1:6, each = 6)), 88 | superpose.symbol = list( 89 | pch = c( 90 | 1, 4, 6, 0, 5, 17, 91 | 4, 6, 0, 5, 17, 1, 92 | 6, 0, 5, 17, 1, 4, 93 | 0, 5, 17, 1, 4, 6, 94 | 5, 17, 1, 4, 6, 0 , 95 | 17, 1, 4, 6, 0, 5), 96 | cex = rep(0.7, 6 * 6), 97 | col = c( 98 | "#053061", "#B2182B", "#F46D43", "#5E4FA2", "#66C2A5", "black", 99 | "#053061", "#B2182B", "#F46D43", "#5E4FA2", "#66C2A5", "black", 100 | "#053061", "#B2182B", "#F46D43", "#5E4FA2", "#66C2A5", "black", 101 | "#053061", "#B2182B", "#F46D43", "#5E4FA2", "#66C2A5", "black", 102 | "#053061", "#B2182B", "#F46D43", "#5E4FA2", "#66C2A5", "black", 103 | "#053061", "#B2182B", "#F46D43", "#5E4FA2", "#66C2A5", "black" 104 | ) 105 | ) 106 | ) 107 | 108 | if(set) lattice::trellis.par.set(theme) 109 | invisible(theme) 110 | } 111 | 112 | #' @rdname bookTheme 113 | #' @export 114 | transparentTheme <- 
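## Same layout and region palette as bookTheme(), but the superpose line and symbol colors use alpha transparency (the level is set by the 'trans' argument) so that overplotted points remain visible.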
115 | function(set = TRUE, pchSize = 1, trans = .2) { 116 | theme <- list( 117 | plot.polygon = list(alpha = 1, col = "aliceblue", border = "black", lty = 1, lwd = 1), 118 | background = list(col = "transparent"), 119 | bar.fill = list(col = "#cce6ff"), 120 | box.rectangle = list(col = "black"), 121 | box.umbrella = list(col = "black"), 122 | dot.line = list(col = "#e8e8e8"), 123 | dot.symbol = list(col = "black"), 124 | plot.line = list(col = "black"), 125 | plot.symbol = list(col = "black"), 126 | regions = list(col = 127 | c("#FEF8FA", "#FDF6F9", "#FBF5F9", "#FAF3F8", 128 | "#F8F2F7", "#F7F0F7", "#F5EEF6", "#F4EDF5", 129 | "#F2EBF5", "#F1EAF4", "#EFE8F3", "#EDE7F2", 130 | "#ECE5F1", "#EAE4F1", "#E8E2F0", "#E6E1EF", 131 | "#E4DFEE", "#E2DEED", "#E0DCEC", "#DEDAEB", 132 | "#DCD9EA", "#D9D7E9", "#D7D6E8", "#D4D4E7", 133 | "#D1D2E6", "#CED1E5", "#CCCFE4", "#C8CEE3", 134 | "#C5CCE2", "#C2CAE1", "#BFC9E0", "#BBC7DF", 135 | "#B8C5DF", "#B4C4DE", "#B1C2DD", "#ADC0DC", 136 | "#A9BFDB", "#A6BDDA", "#A2BBD9", "#9EB9D9", 137 | "#9BB8D8", "#97B6D7", "#93B4D6", "#8FB2D5", 138 | "#8BB0D4", "#87AFD3", "#83ADD2", "#7FABD1", 139 | "#7AA9D0", "#76A7CF", "#71A5CE", "#6CA3CC", 140 | "#68A1CB", "#63A0CA", "#5D9EC9", "#589CC8", 141 | "#539AC6", "#4E98C5", "#4996C4", "#4493C3", 142 | "#3F91C1", "#3A8FC0", "#358DBF", "#308BBE", 143 | "#2C89BD", "#2887BC", "#2385BB", "#1F83BA", 144 | "#1C80B9", "#187EB7", "#157CB6", "#127AB5", 145 | "#0F78B3", "#0D76B2", "#0A73B0", "#0971AE", 146 | "#076FAC", "#066DAA", "#056AA7", "#0568A5") 147 | ), 148 | strip.shingle = list(col = c( 149 | "#ff7f00", "#00ff00", "#00ffff", 150 | "#ff00ff", "#ff0000", "#ffff00", "#0080ff")), 151 | strip.background = list(col = c( 152 | "#ffe5cc", "#ccffcc", "#ccffff", 153 | "#ffccff", "#ffcccc", "#ffffcc", "#cce6ff")), 154 | reference.line = list(col = "#e8e8e8"), 155 | superpose.line = list( 156 | col = c( 157 | grDevices::rgb(1, 0, 0, trans), grDevices::rgb(0, 0, 1, trans), 158 | grDevices::rgb(0.3984375, 0.7578125, 0.6445312, max(.6, trans)), 159 | grDevices::rgb(0, 0, 0, trans)), 160 | lty = rep(1:2, 6)), 161 | superpose.symbol = list( 162 | pch = c(16, 15, 17, 18, 16), 163 | cex = rep(pchSize, 5), 164 | col = c( 165 | grDevices::rgb(1, 0, 0, trans), grDevices::rgb(0, 0, 1, trans), 166 | grDevices::rgb(0.3984375, 0.7578125, 0.6445312, max(.6, trans)), 167 | grDevices::rgb(0, 0, 0, trans))) 168 | ) 169 | 170 | if(set) lattice::trellis.par.set(theme, warn = FALSE) 171 | invisible(theme) 172 | } 173 | -------------------------------------------------------------------------------- /R/getPackages.R: -------------------------------------------------------------------------------- 1 | #' Install Packages for Each Chapter 2 | #' 3 | #' This function identifies the physical location on the user's computer where 4 | #' the chapter R scripts are located. 5 | #' 6 | #' Chapter names and packages. about dependencies. 7 | #' 8 | #' @param chapter an integer vector (or character versions of the integer) for 9 | #' the chapter number. See Details below: 10 | #' @param ... options to pass to \code{\link[utils]{install.packages}} 11 | #' @author Max Kuhn 12 | #' @keywords utilities 13 | #' @examples 14 | #' 15 | #' \dontrun{ 16 | #' getPackages(2) 17 | #' getPackages(2:3) 18 | #' getPackages("4") 19 | #' } 20 | #' 21 | #' @export getPackages 22 | getPackages <- function(chapter, ...) 
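## Installs the CRAN packages needed to run the chapter scripts: the requested chapter number(s) are matched against the per-chapter lists below and the combined set is passed to utils::install.packages().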
{ 23 | if(is.numeric(chapter)) 24 | chapter <- paste(chapter) 25 | pkg <- list() 26 | pkg[["2"]] <- c("earth", "caret", "lattice") 27 | pkg[["3"]] <- c("e1071", "caret", "corrplot") 28 | pkg[["4"]] <- c("kernlab", "caret") 29 | pkg[["6"]] <- c("lattice", "corrplot", "pls", "elasticnet") 30 | pkg[["7"]] <- c("caret", "earth", "kernlab","lattice", "nnet") 31 | pkg[["8"]] <- c("caret", "Cubist", "gbm", "lattice", "party", "partykit", 32 | "randomForest", "rpart", "RWeka") 33 | pkg[["10"]] <- c("caret", "Cubist", "earth", "elasticnet", "gbm", "ipred", 34 | "lattice", "nnet", "party","pls", "randomForests", "rpart", 35 | "RWeka") 36 | pkg[["11"]] <- c("caret", "MASS", "randomForest", "pROC", "klaR") 37 | pkg[["12"]] <- c("caret", "glmnet", "lattice", 38 | "MASS", "pamr", "pls", "pROC", "sparseLDA") 39 | pkg[["13"]] <- c("caret", "kernlab", "klaR", "lattice", "latticeExtra", 40 | "MASS", "mda", "nnet", "pROC") 41 | pkg[["14"]] <- c("C50", "caret", "gbm", "lattice", "partykit", "pROC", 42 | "randomForest", "reshape2", 43 | "rpart", "RWeka") 44 | pkg[["16"]] <- c("caret", "C50", "earth", "DMwR", "DWD", " kernlab", "mda", 45 | "pROC", "randomForest", "rpart") 46 | pkg[["17"]] <- c("C50", "caret", "earth", "Hmisc", "ipred", "tabplot", 47 | "kernlab", "lattice", "MASS", "mda", "nnet", "pls", 48 | "randomForest", "rpart", "sparseLDA") 49 | pkg[["18"]] <- c("caret", "CORElearn", "corrplot", "pROC", "minerva") 50 | pkg[["19"]] <- c("caret", "MASS", "corrplot", "RColorBrewer", "randomForest", 51 | "kernlab", "klaR") 52 | plist <- 53 | paste(paste("'", names(pkg), "'", sep = ""), collapse = ", ") 54 | if (!any(chapter %in% names(pkg))) 55 | stop(paste("'chapter' must be: ", 56 | paste(plist, collapse = ", "))) 57 | 58 | pkg <- unlist(pkg[chapter]) 59 | pkg <- pkg[!is.na(pkg)] 60 | pkg <- pkg[pkg != ""] 61 | pkg <- pkg[order(tolower(pkg))] 62 | 63 | utils::install.packages(pkg, ...) 64 | } 65 | -------------------------------------------------------------------------------- /R/panels.R: -------------------------------------------------------------------------------- 1 | upperp <- function(...) { 2 | args <- list(...) 3 | circ1 <- ellipse::ellipse(diag(rep(1, 2)), t = .1) 4 | lattice::panel.xyplot( 5 | circ1[, 1], 6 | circ1[, 2], 7 | type = "l", 8 | lty = lattice::trellis.par.get("reference.line")$lty, 9 | col = lattice::trellis.par.get("reference.line")$col, 10 | lwd = lattice::trellis.par.get("reference.line")$lwd 11 | ) 12 | circ2 <- ellipse::ellipse(diag(rep(1, 2)), t = .2) 13 | lattice::panel.xyplot( 14 | circ2[, 1], 15 | circ2[, 2], 16 | type = "l", 17 | lty = lattice::trellis.par.get("reference.line")$lty, 18 | col = lattice::trellis.par.get("reference.line")$col, 19 | lwd = lattice::trellis.par.get("reference.line")$lwd 20 | ) 21 | circ3 <- ellipse::ellipse(diag(rep(1, 2)), t = .3) 22 | lattice::panel.xyplot( 23 | circ3[, 1], 24 | circ3[, 2], 25 | type = "l", 26 | lty = lattice::trellis.par.get("reference.line")$lty, 27 | col = lattice::trellis.par.get("reference.line")$col, 28 | lwd = lattice::trellis.par.get("reference.line")$lwd 29 | ) 30 | lattice::panel.xyplot(args$x, 31 | args$y, 32 | groups = args$groups, 33 | subscripts = args$subscripts) 34 | } 35 | 36 | lowerp <- function(...) 
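## Intentionally empty: meant to be paired with upperp() (e.g. as the lower.panel argument of lattice::splom()) so that only the upper triangle of panels displays data.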
{ 37 | 38 | } 39 | -------------------------------------------------------------------------------- /R/permuteRelief.R: -------------------------------------------------------------------------------- 1 | #' Permutation Statistics for the Relief Algorithm 2 | #' 3 | #' This function uses a permutation approach to determining the relative 4 | #' magnitude of Relief scores (Kira and Rendell, 1992 and Kononenko, 1994). 5 | #' 6 | #' The scores for each predictor are computed using the original data and after 7 | #' outcome data are randomly scrambled (\code{nprem} times). The mean and 8 | #' standard deviation of the permuted values are determined and a standardized 9 | #' version of the observed scores are determined by subtracting the permuted 10 | #' means from the original values, then dividing each by the corresponding 11 | #' standard deviation. 12 | #' 13 | #' @param x a data frame of predictor data 14 | #' @param y a vector of outcomes 15 | #' @param nperm the number of random permutations of the data 16 | #' @param \dots options to pass to \code{\link[CORElearn]{attrEval}}, such as 17 | #' the exact Relief algorithm, to use 18 | #' @return a list with elements \item{standardized }{a vector of standardized 19 | #' predictor scores} \item{permutations }{the values of the permuted scores, 20 | #' for plotting to assess the permutation distribution} \item{observed}{the 21 | #' observed scores} \item{options}{a list of options passed using \ldots{}} 22 | #' @author Max Kuhn 23 | #' @seealso \code{\link[CORElearn]{attrEval}} 24 | #' @references Kira, K., & Rendell, L. (1992). The feature selection problem: 25 | #' Traditional methods and a new algorithm. \emph{Proceedings of the Eleventh 26 | #' International Conference on Machine Learning}, 129-129. 27 | #' 28 | #' Kononenko, I. (1994). Estimating attributes: analysis and extensions of 29 | #' RELIEF. Machine Learning: ECML-94, 171-182. 30 | #' @keywords htest 31 | #' @examples 32 | #' 33 | #' set.seed(874) 34 | #' reliefEx3 <- easyBoundaryFunc(500) 35 | #' reliefEx3$X1 <- scale(reliefEx3$X1) 36 | #' reliefEx3$X2 <- scale(reliefEx3$X2) 37 | #' reliefEx3$prob <- NULL 38 | #' 39 | #' standardized <- permuteRelief(reliefEx3[, 1:2], reliefEx3$class, 40 | #' ## For efficiency, a small number of 41 | #' ## permutations are used here. 42 | #' nperm = 50, 43 | #' estimator="ReliefFequalK", 44 | #' ReliefIterations= 50) 45 | #' 46 | #' 47 | #' @export permuteRelief 48 | permuteRelief <- 49 | function(x, y, nperm = 100, ...) { 50 | dat <- x 51 | dat$y <- y 52 | 53 | obs <- CORElearn::attrEval(y ~ ., data = dat, ...) 54 | permuted <- matrix(NA, ncol = length(obs), nrow = nperm) 55 | colnames(permuted) <- names(obs) 56 | for (i in 1:nperm) { 57 | dat$y <- sample(y) 58 | permuted[i,] <- CORElearn::attrEval(y ~ ., data = dat, ...) 59 | } 60 | means <- colMeans(permuted) 61 | sds <- apply(permuted, 2, stats::sd) 62 | permuted <- reshape2::melt(permuted) 63 | names(permuted)[2] <- "Predictor" 64 | permuted$X1 <- NULL 65 | list( 66 | standardized = (obs - means) / sds, 67 | permutations = permuted, 68 | observed = obs, 69 | options = list(...) 70 | ) 71 | } 72 | -------------------------------------------------------------------------------- /R/quadBoundaryFunc.R: -------------------------------------------------------------------------------- 1 | #' Functions for Simulating Data 2 | #' 3 | #' These functions simulate data that are used in the text. 
4 | #' 5 | #' The \code{quadBoundaryFunc} function creates a class boundary that is a 6 | #' function of both predictors. The probability values are based on a logistic 7 | #' regression model with model equation: \eqn{-1-2X_1 -0.2X_1^2 + 8 | #' 2X_2^2}{-1-2*X1 -0.2*X1^2 + 2*X2^2}. The predictors here are multivariate 9 | #' normal with mean (1, 0) and a moderate degree of positive correlation. 10 | #' 11 | #' Similarly, the \code{easyBoundaryFunc} uses a logistic regression model with 12 | #' model equation: \eqn{intercept -4X_1 + 4X_2 + interaction \times X_1 \times 13 | #' X_2}{intercept -4*X1 + 4*X2 + interaction*X1*X2}. The predictors here are 14 | #' multivariate normal with mean (1, 0) and a strong positive correlation. 15 | #' 16 | #' @aliases quadBoundaryFunc easyBoundaryFunc 17 | #' @param n the sample size 18 | #' @param intercept the coefficient for the logistic regression intercept term 19 | #' @param interaction the coefficient for the logistic regression interaction 20 | #' term 21 | #' @return Both functions return data frames with columns \item{X1}{numeric 22 | #' predictor value} \item{X2}{numeric predictor value} \item{prob }{numeric 23 | #' value reflecting the true probability of the first class} \item{class }{a 24 | #' factor variable with levels 'Class1' and 'Class2'} 25 | #' @author Max Kuhn 26 | #' @keywords utilities 27 | #' @examples 28 | #' 29 | #' ## in Chapter 11, 'Measuring Performance in Classification Model' 30 | #' set.seed(975) 31 | #' training <- quadBoundaryFunc(500) 32 | #' testing <- quadBoundaryFunc(1000) 33 | #' 34 | #' 35 | #' ## in Chapter 20, 'Factors That Can Affect Model Performance' 36 | #' set.seed(615) 37 | #' dat <- easyBoundaryFunc(200, interaction = 3, intercept = 3) 38 | #' dat$X1 <- scale(dat$X1) 39 | #' dat$X2 <- scale(dat$X2) 40 | #' dat$Data <- "Original" 41 | #' dat$prob <- NULL 42 | #' 43 | #' ## in Chapter X, 'An Introduction to Feature Selection' 44 | #' 45 | #' set.seed(874) 46 | #' reliefEx3 <- easyBoundaryFunc(500) 47 | #' reliefEx3$X1 <- scale(reliefEx3$X1) 48 | #' reliefEx3$X2 <- scale(reliefEx3$X2) 49 | #' reliefEx3$prob <- NULL 50 | #' 51 | #' 52 | #' @export quadBoundaryFunc 53 | quadBoundaryFunc <- function(n) { 54 | sigma <- matrix(c(1, .7, .7, 2), 2, 2) 55 | 56 | tmpData <- data.frame(MASS::mvrnorm(n = n, c(1, 0), sigma)) 57 | xSeq <- seq(-4, 4, length = 40) 58 | plotGrid <- expand.grid(x = xSeq, y = xSeq) 59 | zFoo <- function(x, y) 60 | - 1 - 2 * x - 0 * y - .2 * x ^ 2 + 2 * y ^ 2 61 | z2p <- function(x) 62 | 1 / (1 + exp(-x)) 63 | 64 | tmpData$prob <- z2p(zFoo(tmpData$X1, tmpData$X2)) 65 | tmpData$class <- 66 | factor(ifelse(stats::runif(length(tmpData$prob)) <= tmpData$prob, "Class1", "Class2")) 67 | tmpData 68 | } 69 | 70 | 71 | #' @rdname quadBoundaryFunc 72 | #' @export 73 | easyBoundaryFunc <- function(n, 74 | intercept = 0, 75 | interaction = 2) { 76 | sigma <- matrix(c(2, 1.3, 1.3, 2), 2, 2) 77 | 78 | tmpData <- data.frame(MASS::mvrnorm(n = n, c(0, 0), sigma)) 79 | xSeq <- seq(-4, 4, length = 40) 80 | plotGrid <- expand.grid(x = xSeq, y = xSeq) 81 | zFoo <- function(x, y) 82 | intercept - 4 * x + 4 * y + interaction * x * y 83 | z2p <- function(x) 84 | 1 / (1 + exp(-x)) 85 | 86 | tmpData$prob <- z2p(zFoo(tmpData$X1, tmpData$X2)) 87 | tmpData$class <- 88 | factor(ifelse(stats::runif(length(tmpData$prob)) <= tmpData$prob, "Class1", "Class2")) 89 | tmpData 90 | } 91 | -------------------------------------------------------------------------------- /R/scriptLocation.R: 
-------------------------------------------------------------------------------- 1 | #' Find Chapter Script Files 2 | #' 3 | #' This function identifies the physical location on the user's computer where 4 | #' the chapter R scripts are located. 5 | #' 6 | #' 7 | #' @author Max Kuhn 8 | #' @keywords utilities 9 | #' @examples 10 | #' 11 | #' scriptLocation() 12 | #' 13 | #' @export scriptLocation 14 | scriptLocation <- 15 | function() 16 | system.file("chapters", package = "AppliedPredictiveModeling") 17 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # AppliedPredictiveModeling 2 | 3 | Data and code from Applied Predictive Modeling (2013) 4 | -------------------------------------------------------------------------------- /data/AlzheimerDisease.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/topepo/AppliedPredictiveModeling/9a3717b2dcd040af8a674b57f4fe90e7995d5002/data/AlzheimerDisease.RData -------------------------------------------------------------------------------- /data/ChemicalManufacturingProcess.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/topepo/AppliedPredictiveModeling/9a3717b2dcd040af8a674b57f4fe90e7995d5002/data/ChemicalManufacturingProcess.RData -------------------------------------------------------------------------------- /data/FuelEconomy.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/topepo/AppliedPredictiveModeling/9a3717b2dcd040af8a674b57f4fe90e7995d5002/data/FuelEconomy.RData -------------------------------------------------------------------------------- /data/abalone.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/topepo/AppliedPredictiveModeling/9a3717b2dcd040af8a674b57f4fe90e7995d5002/data/abalone.RData -------------------------------------------------------------------------------- /data/concrete.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/topepo/AppliedPredictiveModeling/9a3717b2dcd040af8a674b57f4fe90e7995d5002/data/concrete.RData -------------------------------------------------------------------------------- /data/hepatic.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/topepo/AppliedPredictiveModeling/9a3717b2dcd040af8a674b57f4fe90e7995d5002/data/hepatic.RData -------------------------------------------------------------------------------- /data/logisticCreditPredictions.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/topepo/AppliedPredictiveModeling/9a3717b2dcd040af8a674b57f4fe90e7995d5002/data/logisticCreditPredictions.RData -------------------------------------------------------------------------------- /data/permeability.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/topepo/AppliedPredictiveModeling/9a3717b2dcd040af8a674b57f4fe90e7995d5002/data/permeability.RData -------------------------------------------------------------------------------- /data/schedulingData.RData: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/topepo/AppliedPredictiveModeling/9a3717b2dcd040af8a674b57f4fe90e7995d5002/data/schedulingData.RData -------------------------------------------------------------------------------- /data/segmentationOriginal.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/topepo/AppliedPredictiveModeling/9a3717b2dcd040af8a674b57f4fe90e7995d5002/data/segmentationOriginal.RData -------------------------------------------------------------------------------- /data/solubility.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/topepo/AppliedPredictiveModeling/9a3717b2dcd040af8a674b57f4fe90e7995d5002/data/solubility.RData -------------------------------------------------------------------------------- /data/twoClassData.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/topepo/AppliedPredictiveModeling/9a3717b2dcd040af8a674b57f4fe90e7995d5002/data/twoClassData.RData -------------------------------------------------------------------------------- /inst/NEWS.Rd: -------------------------------------------------------------------------------- 1 | \name{NEWS} 2 | \title{News for Package \pkg{AppliedPredictiveModeling}} 3 | \newcommand{\cpkg}{\href{https://cran.r-project.org/package=#1}{\pkg{#1}}} 4 | 5 | 6 | \section{Changes in version 1.1-6}{ 7 | 8 | \itemize{ 9 | \item The file \code{CreateGrantData.R} was updated to include code to create the objects \code{factorPredictors} and \code{factorForm}. 10 | 11 | } 12 | } 13 | 14 | \section{Changes in version 1.1-5}{ 15 | 16 | The package dependencies were updated. Some were moved to 'Imports' 17 | 18 | The chapter scripts were re-run with the latest versions of the \cpkg{AppliedPredictiveModeling} and\cpkg{caret} packages. For \cpkg{caret}, the names of the tuning parameter columns were changed to remove the dot. These were made to be consistent with the newer version of \cpkg{caret} and are not required. For example, \code{.sigma} was changed to \code{sigma} and so on. 19 | 20 | Additional changes are: 21 | 22 | \itemize{ 23 | \item \code{03_Data_Pre_Processing.R} was changed to include code from the Computing section on creating dummy variables. 24 | 25 | \item \code{04_Over_Fitting.R} was modified. Changes were made to: 26 | \itemize{ 27 | \item avoid a warning message when the SVM grid was created 28 | \item \code{classProbs = TRUE} was added to the control function for the object \code{svmFit} 29 | \item some notes were made in the potential differences in SVM results between versions of \cpkg{caret} 30 | } 31 | \item \code{06_Linear_Regression.R} was updated to use the newer "ridge" model in \cpkg{caret}. 32 | 33 | \item In \code{07_Non-Linear_Reg.R}, some notes were made in the potential differences in SVM results between versions of \cpkg{caret} 34 | 35 | \item In \code{16_Class_Imbalance.R}, verboseness (verbosity?) was turned off to make the results cleaner. Also, the code for \code{svmWtFit} was using the wrong tuning grid (\code{svmGrid2} instead of \code{svmGrid1}). 36 | 37 | \item \code{19_Feature_Select.R} was changed so that the resampling values were only saved for the final model. 
38 | 39 | } 40 | } 41 | 42 | \section{Changes in version 1.1-4}{ 43 | \itemize{ 44 | \item The data set \code{ChemicalManufacturingProcess} did not contain 45 | the rows with missing data. They were added back in. 46 | 47 | \item Small changes to conform to R CMD check. 48 | }} 49 | 50 | \section{Changes in version 1.1-2}{ 51 | \itemize{ 52 | \item Code to create the \code{carsSubset} object in Seciton 3.8 was added 53 | to 03_Data_Pre_Processing.R 54 | }} 55 | 56 | \section{Changes in version 1.1-1}{ 57 | \itemize{ 58 | \item Initial Version 59 | }} 60 | 61 | -------------------------------------------------------------------------------- /inst/chapters/02_A_Short_Tour.R: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | ### R code from Applied Predictive Modeling (2013) by Kuhn and Johnson. 3 | ### Copyright 2013 Kuhn and Johnson 4 | ### Web Page: http://www.appliedpredictivemodeling.com 5 | ### Contact: Max Kuhn (mxkuhn@gmail.com) 6 | ### 7 | ### Chapter 2: A Short Tour of the Predictive Modeling Process 8 | ### 9 | ### Required packages: AppliedPredictiveModeling, earth, caret, lattice 10 | ### 11 | ### Data used: The FuelEconomy data in the AppliedPredictiveModeling package 12 | ### 13 | ### Notes: 14 | ### 1) This code is provided without warranty. 15 | ### 16 | ### 2) This code should help the user reproduce the results in the 17 | ### text. There will be differences between this code and what is is 18 | ### the computing section. For example, the computing sections show 19 | ### how the source functions work (e.g. randomForest() or plsr()), 20 | ### which were not directly used when creating the book. Also, there may be 21 | ### syntax differences that occur over time as packages evolve. These files 22 | ### will reflect those changes. 23 | ### 24 | ### 3) In some cases, the calculations in the book were run in 25 | ### parallel. The sub-processes may reset the random number seed. 26 | ### Your results may slightly vary. 
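### (The add-on packages used below can also be installed with getPackages(2)
### from this package, which passes the chapter's package list to install.packages().)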
27 | ### 28 | ################################################################################ 29 | 30 | ################################################################################ 31 | ### Section 2.1 Case Study: Predicting Fuel Economy 32 | 33 | library(AppliedPredictiveModeling) 34 | data(FuelEconomy) 35 | 36 | ## Format data for plotting against engine displacement 37 | 38 | ## Sort by engine displacement 39 | cars2010 <- cars2010[order(cars2010$EngDispl),] 40 | cars2011 <- cars2011[order(cars2011$EngDispl),] 41 | 42 | ## Combine data into one data frame 43 | cars2010a <- cars2010 44 | cars2010a$Year <- "2010 Model Year" 45 | cars2011a <- cars2011 46 | cars2011a$Year <- "2011 Model Year" 47 | 48 | plotData <- rbind(cars2010a, cars2011a) 49 | 50 | library(lattice) 51 | xyplot(FE ~ EngDispl|Year, plotData, 52 | xlab = "Engine Displacement", 53 | ylab = "Fuel Efficiency (MPG)", 54 | between = list(x = 1.2)) 55 | 56 | ## Fit a single linear model and conduct 10-fold CV to estimate the error 57 | library(caret) 58 | set.seed(1) 59 | lm1Fit <- train(FE ~ EngDispl, 60 | data = cars2010, 61 | method = "lm", 62 | trControl = trainControl(method= "cv")) 63 | lm1Fit 64 | 65 | 66 | ## Fit a quadratic model too 67 | 68 | ## Create squared terms 69 | cars2010$ED2 <- cars2010$EngDispl^2 70 | cars2011$ED2 <- cars2011$EngDispl^2 71 | 72 | set.seed(1) 73 | lm2Fit <- train(FE ~ EngDispl + ED2, 74 | data = cars2010, 75 | method = "lm", 76 | trControl = trainControl(method= "cv")) 77 | lm2Fit 78 | 79 | ## Finally a MARS model (via the earth package) 80 | 81 | library(earth) 82 | set.seed(1) 83 | marsFit <- train(FE ~ EngDispl, 84 | data = cars2010, 85 | method = "earth", 86 | tuneLength = 15, 87 | trControl = trainControl(method= "cv")) 88 | marsFit 89 | 90 | plot(marsFit) 91 | 92 | ## Predict the test set data 93 | cars2011$lm1 <- predict(lm1Fit, cars2011) 94 | cars2011$lm2 <- predict(lm2Fit, cars2011) 95 | cars2011$mars <- predict(marsFit, cars2011) 96 | 97 | ## Get test set performance values via caret's postResample function 98 | 99 | postResample(pred = cars2011$lm1, obs = cars2011$FE) 100 | postResample(pred = cars2011$lm2, obs = cars2011$FE) 101 | postResample(pred = cars2011$mars, obs = cars2011$FE) 102 | 103 | ################################################################################ 104 | ### Session Information 105 | 106 | sessionInfo() 107 | 108 | q("no") 109 | 110 | 111 | -------------------------------------------------------------------------------- /inst/chapters/02_A_Short_Tour.Rout: -------------------------------------------------------------------------------- 1 | 2 | R version 3.0.1 (2013-05-16) -- "Good Sport" 3 | Copyright (C) 2013 The R Foundation for Statistical Computing 4 | Platform: x86_64-apple-darwin10.8.0 (64-bit) 5 | 6 | R is free software and comes with ABSOLUTELY NO WARRANTY. 7 | You are welcome to redistribute it under certain conditions. 8 | Type 'license()' or 'licence()' for distribution details. 9 | 10 | Natural language support but running in an English locale 11 | 12 | R is a collaborative project with many contributors. 13 | Type 'contributors()' for more information and 14 | 'citation()' on how to cite R or R packages in publications. 15 | 16 | Type 'demo()' for some demos, 'help()' for on-line help, or 17 | 'help.start()' for an HTML browser interface to help. 18 | Type 'q()' to quit R. 
19 | 20 | > ################################################################################ 21 | > ### R code from Applied Predictive Modeling (2013) by Kuhn and Johnson. 22 | > ### Copyright 2013 Kuhn and Johnson 23 | > ### Web Page: http://www.appliedpredictivemodeling.com 24 | > ### Contact: Max Kuhn (mxkuhn@gmail.com) 25 | > ### 26 | > ### Chapter 2: A Short Tour of the Predictive Modeling Process 27 | > ### 28 | > ### Required packages: AppliedPredictiveModeling, earth, caret, lattice 29 | > ### 30 | > ### Data used: The FuelEconomy data in the AppliedPredictiveModeling package 31 | > ### 32 | > ### Notes: 33 | > ### 1) This code is provided without warranty. 34 | > ### 35 | > ### 2) This code should help the user reproduce the results in the 36 | > ### text. There will be differences between this code and what is is 37 | > ### the computing section. For example, the computing sections show 38 | > ### how the source functions work (e.g. randomForest() or plsr()), 39 | > ### which were not directly used when creating the book. Also, there may be 40 | > ### syntax differences that occur over time as packages evolve. These files 41 | > ### will reflect those changes. 42 | > ### 43 | > ### 3) In some cases, the calculations in the book were run in 44 | > ### parallel. The sub-processes may reset the random number seed. 45 | > ### Your results may slightly vary. 46 | > ### 47 | > ################################################################################ 48 | > 49 | > ################################################################################ 50 | > ### Section 2.1 Case Study: Predicting Fuel Economy 51 | > 52 | > library(AppliedPredictiveModeling) 53 | > data(FuelEconomy) 54 | > 55 | > ## Format data for plotting against engine displacement 56 | > 57 | > ## Sort by engine displacement 58 | > cars2010 <- cars2010[order(cars2010$EngDispl),] 59 | > cars2011 <- cars2011[order(cars2011$EngDispl),] 60 | > 61 | > ## Combine data into one data frame 62 | > cars2010a <- cars2010 63 | > cars2010a$Year <- "2010 Model Year" 64 | > cars2011a <- cars2011 65 | > cars2011a$Year <- "2011 Model Year" 66 | > 67 | > plotData <- rbind(cars2010a, cars2011a) 68 | > 69 | > library(lattice) 70 | > xyplot(FE ~ EngDispl|Year, plotData, 71 | + xlab = "Engine Displacement", 72 | + ylab = "Fuel Efficiency (MPG)", 73 | + between = list(x = 1.2)) 74 | > 75 | > ## Fit a single linear model and conduct 10-fold CV to estimate the error 76 | > library(caret) 77 | Loading required package: ggplot2 78 | > set.seed(1) 79 | > lm1Fit <- train(FE ~ EngDispl, 80 | + data = cars2010, 81 | + method = "lm", 82 | + trControl = trainControl(method= "cv")) 83 | > lm1Fit 84 | Linear Regression 85 | 86 | 1107 samples 87 | 13 predictors 88 | 89 | No pre-processing 90 | Resampling: Cross-Validated (10 fold) 91 | 92 | Summary of sample sizes: 997, 996, 995, 996, 997, 996, ... 
93 | 94 | Resampling results 95 | 96 | RMSE Rsquared RMSE SD Rsquared SD 97 | 4.6 0.628 0.493 0.0442 98 | 99 | 100 | > 101 | > 102 | > ## Fit a quadratic model too 103 | > 104 | > ## Create squared terms 105 | > cars2010$ED2 <- cars2010$EngDispl^2 106 | > cars2011$ED2 <- cars2011$EngDispl^2 107 | > 108 | > set.seed(1) 109 | > lm2Fit <- train(FE ~ EngDispl + ED2, 110 | + data = cars2010, 111 | + method = "lm", 112 | + trControl = trainControl(method= "cv")) 113 | > lm2Fit 114 | Linear Regression 115 | 116 | 1107 samples 117 | 14 predictors 118 | 119 | No pre-processing 120 | Resampling: Cross-Validated (10 fold) 121 | 122 | Summary of sample sizes: 997, 996, 995, 996, 997, 996, ... 123 | 124 | Resampling results 125 | 126 | RMSE Rsquared RMSE SD Rsquared SD 127 | 4.23 0.684 0.419 0.0421 128 | 129 | 130 | > 131 | > ## Finally a MARS model (via the earth package) 132 | > 133 | > library(earth) 134 | Loading required package: plotmo 135 | Loading required package: plotrix 136 | > set.seed(1) 137 | > marsFit <- train(FE ~ EngDispl, 138 | + data = cars2010, 139 | + method = "earth", 140 | + tuneLength = 15, 141 | + trControl = trainControl(method= "cv")) 142 | > marsFit 143 | Multivariate Adaptive Regression Spline 144 | 145 | 1107 samples 146 | 14 predictors 147 | 148 | No pre-processing 149 | Resampling: Cross-Validated (10 fold) 150 | 151 | Summary of sample sizes: 997, 996, 995, 996, 997, 996, ... 152 | 153 | Resampling results across tuning parameters: 154 | 155 | nprune RMSE Rsquared RMSE SD Rsquared SD 156 | 2 4.3 0.673 0.441 0.0429 157 | 3 4.26 0.68 0.44 0.0395 158 | 4 4.23 0.685 0.449 0.0428 159 | 5 4.25 0.682 0.489 0.0432 160 | 161 | Tuning parameter 'degree' was held constant at a value of 1 162 | RMSE was used to select the optimal model using the smallest value. 163 | The final values used for the model were nprune = 4 and degree = 1. 
164 | > 165 | > plot(marsFit) 166 | > 167 | > ## Predict the test set data 168 | > cars2011$lm1 <- predict(lm1Fit, cars2011) 169 | > cars2011$lm2 <- predict(lm2Fit, cars2011) 170 | > cars2011$mars <- predict(marsFit, cars2011) 171 | > 172 | > ## Get test set performance values via caret's postResample function 173 | > 174 | > postResample(pred = cars2011$lm1, obs = cars2011$FE) 175 | RMSE Rsquared 176 | 5.1625309 0.7018642 177 | > postResample(pred = cars2011$lm2, obs = cars2011$FE) 178 | RMSE Rsquared 179 | 4.7162853 0.7486074 180 | > postResample(pred = cars2011$mars, obs = cars2011$FE) 181 | RMSE Rsquared 182 | 4.6855501 0.7499953 183 | > 184 | > ################################################################################ 185 | > ### Session Information 186 | > 187 | > sessionInfo() 188 | R version 3.0.1 (2013-05-16) 189 | Platform: x86_64-apple-darwin10.8.0 (64-bit) 190 | 191 | locale: 192 | [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8 193 | 194 | attached base packages: 195 | [1] stats graphics grDevices utils datasets methods base 196 | 197 | other attached packages: 198 | [1] earth_3.2-6 plotrix_3.4-7 199 | [3] plotmo_1.3-2 caret_6.0-22 200 | [5] ggplot2_0.9.3.1 lattice_0.20-15 201 | [7] AppliedPredictiveModeling_1.1-5 202 | 203 | loaded via a namespace (and not attached): 204 | [1] car_2.0-17 codetools_0.2-8 colorspace_1.2-2 compiler_3.0.1 205 | [5] CORElearn_0.9.41 dichromat_2.0-0 digest_0.6.3 foreach_1.4.0 206 | [9] grid_3.0.1 gtable_0.1.2 iterators_1.0.6 labeling_0.1 207 | [13] MASS_7.3-26 munsell_0.4 plyr_1.8 proto_0.3-10 208 | [17] RColorBrewer_1.0-5 reshape2_1.2.2 scales_0.2.3 stringr_0.6.2 209 | [21] tools_3.0.1 210 | > 211 | > q("no") 212 | > proc.time() 213 | user system elapsed 214 | 4.971 0.114 5.292 215 | -------------------------------------------------------------------------------- /inst/chapters/03_Data_Pre_Processing.R: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | ### R code from Applied Predictive Modeling (2013) by Kuhn and Johnson. 3 | ### Copyright 2013 Kuhn and Johnson 4 | ### Web Page: http://www.appliedpredictivemodeling.com 5 | ### Contact: Max Kuhn (mxkuhn@gmail.com) 6 | ### 7 | ### Chapter 3: Data Pre-Processing 8 | ### 9 | ### Required packages: AppliedPredictiveModeling, e1071, caret, corrplot 10 | ### 11 | ### Data used: The (unprocessed) cell segmentation data from the 12 | ### AppliedPredictiveModeling package. 13 | ### 14 | ### Notes: 15 | ### 1) This code is provided without warranty. 16 | ### 17 | ### 2) This code should help the user reproduce the results in the 18 | ### text. There will be differences between this code and what is is 19 | ### the computing section. For example, the computing sections show 20 | ### how the source functions work (e.g. randomForest() or plsr()), 21 | ### which were not directly used when creating the book. Also, there may be 22 | ### syntax differences that occur over time as packages evolve. These files 23 | ### will reflect those changes. 24 | ### 25 | ### 3) In some cases, the calculations in the book were run in 26 | ### parallel. The sub-processes may reset the random number seed. 27 | ### Your results may slightly vary. 
28 | ### 29 | ################################################################################ 30 | 31 | ################################################################################ 32 | ### Section 3.1 Case Study: Cell Segmentation in High-Content Screening 33 | 34 | library(AppliedPredictiveModeling) 35 | data(segmentationOriginal) 36 | 37 | ## Retain the original training set 38 | segTrain <- subset(segmentationOriginal, Case == "Train") 39 | 40 | ## Remove the first three columns (identifier columns) 41 | segTrainX <- segTrain[, -(1:3)] 42 | segTrainClass <- segTrain$Class 43 | 44 | ################################################################################ 45 | ### Section 3.2 Data Transformations for Individual Predictors 46 | 47 | ## The column VarIntenCh3 measures the standard deviation of the intensity 48 | ## of the pixels in the actin filaments 49 | 50 | max(segTrainX$VarIntenCh3)/min(segTrainX$VarIntenCh3) 51 | 52 | library(e1071) 53 | skewness(segTrainX$VarIntenCh3) 54 | 55 | library(caret) 56 | 57 | ## Use caret's preProcess function to transform for skewness 58 | segPP <- preProcess(segTrainX, method = "BoxCox") 59 | 60 | ## Apply the transformations 61 | segTrainTrans <- predict(segPP, segTrainX) 62 | 63 | ## Results for a single predictor 64 | segPP$bc$VarIntenCh3 65 | 66 | histogram(~segTrainX$VarIntenCh3, 67 | xlab = "Natural Units", 68 | type = "count") 69 | 70 | histogram(~log(segTrainX$VarIntenCh3), 71 | xlab = "Log Units", 72 | ylab = " ", 73 | type = "count") 74 | 75 | segPP$bc$PerimCh1 76 | 77 | histogram(~segTrainX$PerimCh1, 78 | xlab = "Natural Units", 79 | type = "count") 80 | 81 | histogram(~segTrainTrans$PerimCh1, 82 | xlab = "Transformed Data", 83 | ylab = " ", 84 | type = "count") 85 | 86 | ################################################################################ 87 | ### Section 3.3 Data Transformations for Multiple Predictors 88 | 89 | ## R's prcomp is used to conduct PCA 90 | pr <- prcomp(~ AvgIntenCh1 + EntropyIntenCh1, 91 | data = segTrainTrans, 92 | scale. = TRUE) 93 | 94 | transparentTheme(pchSize = .7, trans = .3) 95 | 96 | xyplot(AvgIntenCh1 ~ EntropyIntenCh1, 97 | data = segTrainTrans, 98 | groups = segTrain$Class, 99 | xlab = "Channel 1 Fiber Width", 100 | ylab = "Intensity Entropy Channel 1", 101 | auto.key = list(columns = 2), 102 | type = c("p", "g"), 103 | main = "Original Data", 104 | aspect = 1) 105 | 106 | xyplot(PC2 ~ PC1, 107 | data = as.data.frame(pr$x), 108 | groups = segTrain$Class, 109 | xlab = "Principal Component #1", 110 | ylab = "Principal Component #2", 111 | main = "Transformed", 112 | xlim = extendrange(pr$x), 113 | ylim = extendrange(pr$x), 114 | type = c("p", "g"), 115 | aspect = 1) 116 | 117 | 118 | ## Apply PCA to the entire set of predictors. 119 | 120 | ## There are a few predictors with only a single value, so we remove these first 121 | ## (since PCA uses variances, which would be zero) 122 | 123 | isZV <- apply(segTrainX, 2, function(x) length(unique(x)) == 1) 124 | segTrainX <- segTrainX[, !isZV] 125 | 126 | segPP <- preProcess(segTrainX, c("BoxCox", "center", "scale")) 127 | segTrainTrans <- predict(segPP, segTrainX) 128 | 129 | segPCA <- prcomp(segTrainTrans, center = TRUE, scale. 
= TRUE) 130 | 131 | ## Plot a scatterplot matrix of the first three components 132 | transparentTheme(pchSize = .8, trans = .3) 133 | 134 | panelRange <- extendrange(segPCA$x[, 1:3]) 135 | splom(as.data.frame(segPCA$x[, 1:3]), 136 | groups = segTrainClass, 137 | type = c("p", "g"), 138 | as.table = TRUE, 139 | auto.key = list(columns = 2), 140 | prepanel.limits = function(x) panelRange) 141 | 142 | ## Format the rotation values for plotting 143 | segRot <- as.data.frame(segPCA$rotation[, 1:3]) 144 | 145 | ## Derive the channel variable 146 | vars <- rownames(segPCA$rotation) 147 | channel <- rep(NA, length(vars)) 148 | channel[grepl("Ch1$", vars)] <- "Channel 1" 149 | channel[grepl("Ch2$", vars)] <- "Channel 2" 150 | channel[grepl("Ch3$", vars)] <- "Channel 3" 151 | channel[grepl("Ch4$", vars)] <- "Channel 4" 152 | 153 | segRot$Channel <- channel 154 | segRot <- segRot[complete.cases(segRot),] 155 | segRot$Channel <- factor(as.character(segRot$Channel)) 156 | 157 | ## Plot a scatterplot matrix of the first three rotation variables 158 | 159 | transparentTheme(pchSize = .8, trans = .7) 160 | panelRange <- extendrange(segRot[, 1:3]) 161 | library(ellipse) 162 | upperp <- function(...) 163 | { 164 | args <- list(...) 165 | circ1 <- ellipse(diag(rep(1, 2)), t = .1) 166 | panel.xyplot(circ1[,1], circ1[,2], 167 | type = "l", 168 | lty = trellis.par.get("reference.line")$lty, 169 | col = trellis.par.get("reference.line")$col, 170 | lwd = trellis.par.get("reference.line")$lwd) 171 | circ2 <- ellipse(diag(rep(1, 2)), t = .2) 172 | panel.xyplot(circ2[,1], circ2[,2], 173 | type = "l", 174 | lty = trellis.par.get("reference.line")$lty, 175 | col = trellis.par.get("reference.line")$col, 176 | lwd = trellis.par.get("reference.line")$lwd) 177 | circ3 <- ellipse(diag(rep(1, 2)), t = .3) 178 | panel.xyplot(circ3[,1], circ3[,2], 179 | type = "l", 180 | lty = trellis.par.get("reference.line")$lty, 181 | col = trellis.par.get("reference.line")$col, 182 | lwd = trellis.par.get("reference.line")$lwd) 183 | panel.xyplot(args$x, args$y, groups = args$groups, subscripts = args$subscripts) 184 | } 185 | splom(~segRot[, 1:3], 186 | groups = segRot$Channel, 187 | lower.panel = function(...){}, upper.panel = upperp, 188 | prepanel.limits = function(x) panelRange, 189 | auto.key = list(columns = 2)) 190 | 191 | ################################################################################ 192 | ### Section 3.5 Removing Variables 193 | 194 | ## To filter on correlations, we first get the correlation matrix for the 195 | ## predictor set 196 | 197 | segCorr <- cor(segTrainTrans) 198 | 199 | library(corrplot) 200 | corrplot(segCorr, order = "hclust", tl.cex = .35) 201 | 202 | ## caret's findCorrelation function is used to identify columns to remove. 
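## findCorrelation() returns the indices of the columns to drop so that no remaining
## pair of predictors has an absolute correlation above the cutoff (0.75 here).
## The script does not use the reduced predictor set, but it could be created with, e.g.:
##   segTrainFiltered <- segTrainTrans[, -highCorr]
## ('segTrainFiltered' is only an illustrative name, not an object used elsewhere.)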
203 | highCorr <- findCorrelation(segCorr, .75) 204 | 205 | ################################################################################ 206 | ### Section 3.8 Computing (Creating Dummy Variables) 207 | 208 | data(cars) 209 | type <- c("convertible", "coupe", "hatchback", "sedan", "wagon") 210 | cars$Type <- factor(apply(cars[, 14:18], 1, function(x) type[which(x == 1)])) 211 | 212 | carSubset <- cars[sample(1:nrow(cars), 20), c(1, 2, 19)] 213 | 214 | head(carSubset) 215 | levels(carSubset$Type) 216 | 217 | simpleMod <- dummyVars(~Mileage + Type, 218 | data = carSubset, 219 | ## Remove the variable name from the 220 | ## column name 221 | levelsOnly = TRUE) 222 | simpleMod 223 | 224 | withInteraction <- dummyVars(~Mileage + Type + Mileage:Type, 225 | data = carSubset, 226 | levelsOnly = TRUE) 227 | withInteraction 228 | predict(withInteraction, head(carSubset)) 229 | 230 | 231 | 232 | ################################################################################ 233 | ### Session Information 234 | 235 | sessionInfo() 236 | 237 | q("no") 238 | 239 | 240 | -------------------------------------------------------------------------------- /inst/chapters/03_Data_Pre_Processing.Rout: -------------------------------------------------------------------------------- 1 | 2 | R version 3.0.1 (2013-05-16) -- "Good Sport" 3 | Copyright (C) 2013 The R Foundation for Statistical Computing 4 | Platform: x86_64-apple-darwin10.8.0 (64-bit) 5 | 6 | R is free software and comes with ABSOLUTELY NO WARRANTY. 7 | You are welcome to redistribute it under certain conditions. 8 | Type 'license()' or 'licence()' for distribution details. 9 | 10 | Natural language support but running in an English locale 11 | 12 | R is a collaborative project with many contributors. 13 | Type 'contributors()' for more information and 14 | 'citation()' on how to cite R or R packages in publications. 15 | 16 | Type 'demo()' for some demos, 'help()' for on-line help, or 17 | 'help.start()' for an HTML browser interface to help. 18 | Type 'q()' to quit R. 19 | 20 | > ################################################################################ 21 | > ### R code from Applied Predictive Modeling (2013) by Kuhn and Johnson. 22 | > ### Copyright 2013 Kuhn and Johnson 23 | > ### Web Page: http://www.appliedpredictivemodeling.com 24 | > ### Contact: Max Kuhn (mxkuhn@gmail.com) 25 | > ### 26 | > ### Chapter 3: Data Pre-Processing 27 | > ### 28 | > ### Required packages: AppliedPredictiveModeling, e1071, caret, corrplot 29 | > ### 30 | > ### Data used: The (unprocessed) cell segmentation data from the 31 | > ### AppliedPredictiveModeling package. 32 | > ### 33 | > ### Notes: 34 | > ### 1) This code is provided without warranty. 35 | > ### 36 | > ### 2) This code should help the user reproduce the results in the 37 | > ### text. There will be differences between this code and what is is 38 | > ### the computing section. For example, the computing sections show 39 | > ### how the source functions work (e.g. randomForest() or plsr()), 40 | > ### which were not directly used when creating the book. Also, there may be 41 | > ### syntax differences that occur over time as packages evolve. These files 42 | > ### will reflect those changes. 43 | > ### 44 | > ### 3) In some cases, the calculations in the book were run in 45 | > ### parallel. The sub-processes may reset the random number seed. 46 | > ### Your results may slightly vary. 
47 | > ### 48 | > ################################################################################ 49 | > 50 | > ################################################################################ 51 | > ### Section 3.1 Case Study: Cell Segmentation in High-Content Screening 52 | > 53 | > library(AppliedPredictiveModeling) 54 | > data(segmentationOriginal) 55 | > 56 | > ## Retain the original training set 57 | > segTrain <- subset(segmentationOriginal, Case == "Train") 58 | > 59 | > ## Remove the first three columns (identifier columns) 60 | > segTrainX <- segTrain[, -(1:3)] 61 | > segTrainClass <- segTrain$Class 62 | > 63 | > ################################################################################ 64 | > ### Section 3.2 Data Transformations for Individual Predictors 65 | > 66 | > ## The column VarIntenCh3 measures the standard deviation of the intensity 67 | > ## of the pixels in the actin filaments 68 | > 69 | > max(segTrainX$VarIntenCh3)/min(segTrainX$VarIntenCh3) 70 | [1] 870.8872 71 | > 72 | > library(e1071) 73 | Loading required package: class 74 | > skewness(segTrainX$VarIntenCh3) 75 | [1] 2.391624 76 | > 77 | > library(caret) 78 | Loading required package: lattice 79 | Loading required package: ggplot2 80 | > 81 | > ## Use caret's preProcess function to transform for skewness 82 | > segPP <- preProcess(segTrainX, method = "BoxCox") 83 | > 84 | > ## Apply the transformations 85 | > segTrainTrans <- predict(segPP, segTrainX) 86 | > 87 | > ## Results for a single predictor 88 | > segPP$bc$VarIntenCh3 89 | Box-Cox Transformation 90 | 91 | 1009 data points used to estimate Lambda 92 | 93 | Input data summary: 94 | Min. 1st Qu. Median Mean 3rd Qu. Max. 95 | 0.8693 37.0600 68.1300 101.7000 125.0000 757.0000 96 | 97 | Largest/Smallest: 871 98 | Sample Skewness: 2.39 99 | 100 | Estimated Lambda: 0.1 101 | With fudge factor, Lambda = 0 will be used for transformations 102 | 103 | > 104 | > histogram(~segTrainX$VarIntenCh3, 105 | + xlab = "Natural Units", 106 | + type = "count") 107 | > 108 | > histogram(~log(segTrainX$VarIntenCh3), 109 | + xlab = "Log Units", 110 | + ylab = " ", 111 | + type = "count") 112 | > 113 | > segPP$bc$PerimCh1 114 | Box-Cox Transformation 115 | 116 | 1009 data points used to estimate Lambda 117 | 118 | Input data summary: 119 | Min. 1st Qu. Median Mean 3rd Qu. Max. 120 | 47.74 64.37 79.02 91.61 103.20 459.80 121 | 122 | Largest/Smallest: 9.63 123 | Sample Skewness: 2.59 124 | 125 | Estimated Lambda: -1.1 126 | 127 | > 128 | > histogram(~segTrainX$PerimCh1, 129 | + xlab = "Natural Units", 130 | + type = "count") 131 | > 132 | > histogram(~segTrainTrans$PerimCh1, 133 | + xlab = "Transformed Data", 134 | + ylab = " ", 135 | + type = "count") 136 | > 137 | > ################################################################################ 138 | > ### Section 3.3 Data Transformations for Multiple Predictors 139 | > 140 | > ## R's prcomp is used to conduct PCA 141 | > pr <- prcomp(~ AvgIntenCh1 + EntropyIntenCh1, 142 | + data = segTrainTrans, 143 | + scale. 
= TRUE) 144 | > 145 | > transparentTheme(pchSize = .7, trans = .3) 146 | > 147 | > xyplot(AvgIntenCh1 ~ EntropyIntenCh1, 148 | + data = segTrainTrans, 149 | + groups = segTrain$Class, 150 | + xlab = "Channel 1 Fiber Width", 151 | + ylab = "Intensity Entropy Channel 1", 152 | + auto.key = list(columns = 2), 153 | + type = c("p", "g"), 154 | + main = "Original Data", 155 | + aspect = 1) 156 | > 157 | > xyplot(PC2 ~ PC1, 158 | + data = as.data.frame(pr$x), 159 | + groups = segTrain$Class, 160 | + xlab = "Principal Component #1", 161 | + ylab = "Principal Component #2", 162 | + main = "Transformed", 163 | + xlim = extendrange(pr$x), 164 | + ylim = extendrange(pr$x), 165 | + type = c("p", "g"), 166 | + aspect = 1) 167 | > 168 | > 169 | > ## Apply PCA to the entire set of predictors. 170 | > 171 | > ## There are a few predictors with only a single value, so we remove these first 172 | > ## (since PCA uses variances, which would be zero) 173 | > 174 | > isZV <- apply(segTrainX, 2, function(x) length(unique(x)) == 1) 175 | > segTrainX <- segTrainX[, !isZV] 176 | > 177 | > segPP <- preProcess(segTrainX, c("BoxCox", "center", "scale")) 178 | > segTrainTrans <- predict(segPP, segTrainX) 179 | > 180 | > segPCA <- prcomp(segTrainTrans, center = TRUE, scale. = TRUE) 181 | > 182 | > ## Plot a scatterplot matrix of the first three components 183 | > transparentTheme(pchSize = .8, trans = .3) 184 | > 185 | > panelRange <- extendrange(segPCA$x[, 1:3]) 186 | > splom(as.data.frame(segPCA$x[, 1:3]), 187 | + groups = segTrainClass, 188 | + type = c("p", "g"), 189 | + as.table = TRUE, 190 | + auto.key = list(columns = 2), 191 | + prepanel.limits = function(x) panelRange) 192 | > 193 | > ## Format the rotation values for plotting 194 | > segRot <- as.data.frame(segPCA$rotation[, 1:3]) 195 | > 196 | > ## Derive the channel variable 197 | > vars <- rownames(segPCA$rotation) 198 | > channel <- rep(NA, length(vars)) 199 | > channel[grepl("Ch1$", vars)] <- "Channel 1" 200 | > channel[grepl("Ch2$", vars)] <- "Channel 2" 201 | > channel[grepl("Ch3$", vars)] <- "Channel 3" 202 | > channel[grepl("Ch4$", vars)] <- "Channel 4" 203 | > 204 | > segRot$Channel <- channel 205 | > segRot <- segRot[complete.cases(segRot),] 206 | > segRot$Channel <- factor(as.character(segRot$Channel)) 207 | > 208 | > ## Plot a scatterplot matrix of the first three rotation variables 209 | > 210 | > transparentTheme(pchSize = .8, trans = .7) 211 | > panelRange <- extendrange(segRot[, 1:3]) 212 | > library(ellipse) 213 | > upperp <- function(...) 214 | + { 215 | + args <- list(...) 
216 | + circ1 <- ellipse(diag(rep(1, 2)), t = .1) 217 | + panel.xyplot(circ1[,1], circ1[,2], 218 | + type = "l", 219 | + lty = trellis.par.get("reference.line")$lty, 220 | + col = trellis.par.get("reference.line")$col, 221 | + lwd = trellis.par.get("reference.line")$lwd) 222 | + circ2 <- ellipse(diag(rep(1, 2)), t = .2) 223 | + panel.xyplot(circ2[,1], circ2[,2], 224 | + type = "l", 225 | + lty = trellis.par.get("reference.line")$lty, 226 | + col = trellis.par.get("reference.line")$col, 227 | + lwd = trellis.par.get("reference.line")$lwd) 228 | + circ3 <- ellipse(diag(rep(1, 2)), t = .3) 229 | + panel.xyplot(circ3[,1], circ3[,2], 230 | + type = "l", 231 | + lty = trellis.par.get("reference.line")$lty, 232 | + col = trellis.par.get("reference.line")$col, 233 | + lwd = trellis.par.get("reference.line")$lwd) 234 | + panel.xyplot(args$x, args$y, groups = args$groups, subscripts = args$subscripts) 235 | + } 236 | > splom(~segRot[, 1:3], 237 | + groups = segRot$Channel, 238 | + lower.panel = function(...){}, upper.panel = upperp, 239 | + prepanel.limits = function(x) panelRange, 240 | + auto.key = list(columns = 2)) 241 | > 242 | > ################################################################################ 243 | > ### Section 3.5 Removing Variables 244 | > 245 | > ## To filter on correlations, we first get the correlation matrix for the 246 | > ## predictor set 247 | > 248 | > segCorr <- cor(segTrainTrans) 249 | > 250 | > library(corrplot) 251 | > corrplot(segCorr, order = "hclust", tl.cex = .35) 252 | > 253 | > ## caret's findCorrelation function is used to identify columns to remove. 254 | > highCorr <- findCorrelation(segCorr, .75) 255 | > 256 | > ################################################################################ 257 | > ### Section 3.8 Computing (Creating Dummy Variables) 258 | > 259 | > data(cars) 260 | > type <- c("convertible", "coupe", "hatchback", "sedan", "wagon") 261 | > cars$Type <- factor(apply(cars[, 14:18], 1, function(x) type[which(x == 1)])) 262 | > 263 | > carSubset <- cars[sample(1:nrow(cars), 20), c(1, 2, 19)] 264 | > 265 | > head(carSubset) 266 | Price Mileage Type 267 | 415 51154.05 2202 sedan 268 | 503 14116.92 12878 sedan 269 | 484 18620.87 25516 sedan 270 | 642 19423.17 25557 sedan 271 | 337 11391.21 21421 hatchback 272 | 121 20538.09 15066 sedan 273 | > levels(carSubset$Type) 274 | [1] "convertible" "coupe" "hatchback" "sedan" "wagon" 275 | > 276 | > simpleMod <- dummyVars(~Mileage + Type, 277 | + data = carSubset, 278 | + ## Remove the variable name from the 279 | + ## column name 280 | + levelsOnly = TRUE) 281 | > simpleMod 282 | Dummy Variable Object 283 | 284 | Formula: ~Mileage + Type 285 | 2 variables, 1 factors 286 | Factor variable names will be removed 287 | A less than full rank encoding is used 288 | > 289 | > withInteraction <- dummyVars(~Mileage + Type + Mileage:Type, 290 | + data = carSubset, 291 | + levelsOnly = TRUE) 292 | > withInteraction 293 | Dummy Variable Object 294 | 295 | Formula: ~Mileage + Type + Mileage:Type 296 | 2 variables, 1 factors 297 | Factor variable names will be removed 298 | A less than full rank encoding is used 299 | > predict(withInteraction, head(carSubset)) 300 | Mileage convertible coupe hatchback sedan wagon Mileage:convertible 301 | 415 2202 0 0 0 1 0 0 302 | 503 12878 0 0 0 1 0 0 303 | 484 25516 0 0 0 1 0 0 304 | 642 25557 0 0 0 1 0 0 305 | 337 21421 0 0 1 0 0 0 306 | 121 15066 0 0 0 1 0 0 307 | Mileage:coupe Mileage:hatchback Mileage:sedan Mileage:wagon 308 | 415 0 0 2202 0 309 | 503 0 0 12878 0 310 | 484 
0 0 25516 0 311 | 642 0 0 25557 0 312 | 337 0 21421 0 0 313 | 121 0 0 15066 0 314 | > 315 | > 316 | > 317 | > ################################################################################ 318 | > ### Session Information 319 | > 320 | > sessionInfo() 321 | R version 3.0.1 (2013-05-16) 322 | Platform: x86_64-apple-darwin10.8.0 (64-bit) 323 | 324 | locale: 325 | [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8 326 | 327 | attached base packages: 328 | [1] stats graphics grDevices utils datasets methods base 329 | 330 | other attached packages: 331 | [1] corrplot_0.71 ellipse_0.3-8 332 | [3] MASS_7.3-26 caret_6.0-22 333 | [5] ggplot2_0.9.3.1 lattice_0.20-15 334 | [7] e1071_1.6-1 class_7.3-7 335 | [9] AppliedPredictiveModeling_1.1-5 336 | 337 | loaded via a namespace (and not attached): 338 | [1] car_2.0-17 codetools_0.2-8 colorspace_1.2-2 CORElearn_0.9.41 339 | [5] dichromat_2.0-0 digest_0.6.3 foreach_1.4.0 grid_3.0.1 340 | [9] gtable_0.1.2 iterators_1.0.6 labeling_0.1 munsell_0.4 341 | [13] plyr_1.8 proto_0.3-10 RColorBrewer_1.0-5 reshape2_1.2.2 342 | [17] scales_0.2.3 stringr_0.6.2 tools_3.0.1 343 | > 344 | > q("no") 345 | > proc.time() 346 | user system elapsed 347 | 5.791 0.147 6.146 348 | -------------------------------------------------------------------------------- /inst/chapters/04_Over_Fitting.R: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | ### R code from Applied Predictive Modeling (2013) by Kuhn and Johnson. 3 | ### Copyright 2013 Kuhn and Johnson 4 | ### Web Page: http://www.appliedpredictivemodeling.com 5 | ### Contact: Max Kuhn (mxkuhn@gmail.com) 6 | ### 7 | ### Chapter 4: Over-Fitting and Model Tuning 8 | ### 9 | ### Required packages: caret, doMC (optional), kernlab 10 | ### 11 | ### Data used: 12 | ### 13 | ### Notes: 14 | ### 1) This code is provided without warranty. 15 | ### 16 | ### 2) This code should help the user reproduce the results in the 17 | ### text. There will be differences between this code and what is is 18 | ### the computing section. For example, the computing sections show 19 | ### how the source functions work (e.g. randomForest() or plsr()), 20 | ### which were not directly used when creating the book. Also, there may be 21 | ### syntax differences that occur over time as packages evolve. These files 22 | ### will reflect those changes. 23 | ### 24 | ### 3) In some cases, the calculations in the book were run in 25 | ### parallel. The sub-processes may reset the random number seed. 26 | ### Your results may slightly vary. 27 | ### 28 | ################################################################################ 29 | 30 | ################################################################################ 31 | ### Section 4.6 Choosing Final Tuning Parameters 32 | 33 | library(caret) 34 | data(GermanCredit) 35 | 36 | ## First, remove near-zero variance predictors then get rid of a few predictors 37 | ## that duplicate values. For example, there are two possible values for the 38 | ## housing variable: "Rent", "Own" and "ForFree". So that we don't have linear 39 | ## dependencies, we get rid of one of the levels (e.g. 
"ForFree") 40 | 41 | GermanCredit <- GermanCredit[, -nearZeroVar(GermanCredit)] 42 | GermanCredit$CheckingAccountStatus.lt.0 <- NULL 43 | GermanCredit$SavingsAccountBonds.lt.100 <- NULL 44 | GermanCredit$EmploymentDuration.lt.1 <- NULL 45 | GermanCredit$EmploymentDuration.Unemployed <- NULL 46 | GermanCredit$Personal.Male.Married.Widowed <- NULL 47 | GermanCredit$Property.Unknown <- NULL 48 | GermanCredit$Housing.ForFree <- NULL 49 | 50 | ## Split the data into training (80%) and test sets (20%) 51 | set.seed(100) 52 | inTrain <- createDataPartition(GermanCredit$Class, p = .8)[[1]] 53 | GermanCreditTrain <- GermanCredit[ inTrain, ] 54 | GermanCreditTest <- GermanCredit[-inTrain, ] 55 | 56 | ## The model fitting code shown in the computing section is fairly 57 | ## simplistic. For the text we estimate the tuning parameter grid 58 | ## up-front and pass it in explicitly. This generally is not needed, 59 | ## but was used here so that we could trim the cost values to a 60 | ## presentable range and to re-use later with different resampling 61 | ## methods. 62 | 63 | library(kernlab) 64 | set.seed(231) 65 | sigDist <- sigest(Class ~ ., data = GermanCreditTrain, frac = 1) 66 | svmTuneGrid <- data.frame(sigma = as.vector(sigDist)[1], C = 2^(-2:7)) 67 | 68 | ### Optional: parallel processing can be used via the 'do' packages, 69 | ### such as doMC, doMPI etc. We used doMC (not on Windows) to speed 70 | ### up the computations. 71 | 72 | ### WARNING: Be aware of how much memory is needed to parallel 73 | ### process. It can very quickly overwhelm the available hardware. We 74 | ### estimate the memory usage (VSIZE = total memory size) to be 75 | ### 2566M/core. 76 | 77 | library(doMC) 78 | registerDoMC(4) 79 | 80 | set.seed(1056) 81 | svmFit <- train(Class ~ ., 82 | data = GermanCreditTrain, 83 | method = "svmRadial", 84 | preProc = c("center", "scale"), 85 | tuneGrid = svmTuneGrid, 86 | trControl = trainControl(method = "repeatedcv", 87 | repeats = 5, 88 | classProbs = TRUE)) 89 | ## classProbs = TRUE was added since the text was written 90 | 91 | ## Print the results 92 | svmFit 93 | 94 | ## A line plot of the average performance. The 'scales' argument is actually an 95 | ## argument to xyplot that converts the x-axis to log-2 units. 96 | 97 | plot(svmFit, scales = list(x = list(log = 2))) 98 | 99 | ## Test set predictions 100 | 101 | predictedClasses <- predict(svmFit, GermanCreditTest) 102 | str(predictedClasses) 103 | 104 | ## Use the "type" option to get class probabilities 105 | 106 | predictedProbs <- predict(svmFit, newdata = GermanCreditTest, type = "prob") 107 | head(predictedProbs) 108 | 109 | 110 | ## Fit the same model using different resampling methods. The main syntax change 111 | ## is the control object. 
112 | 113 | set.seed(1056) 114 | svmFit10CV <- train(Class ~ ., 115 | data = GermanCreditTrain, 116 | method = "svmRadial", 117 | preProc = c("center", "scale"), 118 | tuneGrid = svmTuneGrid, 119 | trControl = trainControl(method = "cv", number = 10)) 120 | svmFit10CV 121 | 122 | set.seed(1056) 123 | svmFitLOO <- train(Class ~ ., 124 | data = GermanCreditTrain, 125 | method = "svmRadial", 126 | preProc = c("center", "scale"), 127 | tuneGrid = svmTuneGrid, 128 | trControl = trainControl(method = "LOOCV")) 129 | svmFitLOO 130 | 131 | set.seed(1056) 132 | svmFitLGO <- train(Class ~ ., 133 | data = GermanCreditTrain, 134 | method = "svmRadial", 135 | preProc = c("center", "scale"), 136 | tuneGrid = svmTuneGrid, 137 | trControl = trainControl(method = "LGOCV", 138 | number = 50, 139 | p = .8)) 140 | svmFitLGO 141 | 142 | set.seed(1056) 143 | svmFitBoot <- train(Class ~ ., 144 | data = GermanCreditTrain, 145 | method = "svmRadial", 146 | preProc = c("center", "scale"), 147 | tuneGrid = svmTuneGrid, 148 | trControl = trainControl(method = "boot", number = 50)) 149 | svmFitBoot 150 | 151 | set.seed(1056) 152 | svmFitBoot632 <- train(Class ~ ., 153 | data = GermanCreditTrain, 154 | method = "svmRadial", 155 | preProc = c("center", "scale"), 156 | tuneGrid = svmTuneGrid, 157 | trControl = trainControl(method = "boot632", 158 | number = 50)) 159 | svmFitBoot632 160 | 161 | ################################################################################ 162 | ### Section 4.8 Choosing Between Models 163 | 164 | set.seed(1056) 165 | glmProfile <- train(Class ~ ., 166 | data = GermanCreditTrain, 167 | method = "glm", 168 | trControl = trainControl(method = "repeatedcv", 169 | repeats = 5)) 170 | glmProfile 171 | 172 | resamp <- resamples(list(SVM = svmFit, Logistic = glmProfile)) 173 | summary(resamp) 174 | 175 | ## These results are slightly different from those shown in the text. 176 | ## There are some differences in the train() function since the 177 | ## original results were produced. This is due to a difference in 178 | ## predictions from the ksvm() function when class probs are requested 179 | ## and when they are not. See, for example, 180 | ## https://stat.ethz.ch/pipermail/r-help/2013-November/363188.html 181 | 182 | modelDifferences <- diff(resamp) 183 | summary(modelDifferences) 184 | 185 | ## The actual paired t-test: 186 | modelDifferences$statistics$Accuracy 187 | 188 | ################################################################################ 189 | ### Session Information 190 | 191 | sessionInfo() 192 | 193 | q("no") 194 | 195 | 196 | 197 | -------------------------------------------------------------------------------- /inst/chapters/06_Linear_Regression.R: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | ### R code from Applied Predictive Modeling (2013) by Kuhn and Johnson. 3 | ### Copyright 2013 Kuhn and Johnson 4 | ### Web Page: http://www.appliedpredictivemodeling.com 5 | ### Contact: Max Kuhn (mxkuhn@gmail.com) 6 | ### 7 | ### Chapter 6: Linear Regression and Its Cousins 8 | ### 9 | ### Required packages: AppliedPredictiveModeling, lattice, corrplot, pls, 10 | ### elasticnet, 11 | ### 12 | ### Data used: The solubility from the AppliedPredictiveModeling package 13 | ### 14 | ### Notes: 15 | ### 1) This code is provided without warranty. 16 | ### 17 | ### 2) This code should help the user reproduce the results in the 18 | ### text. 
There will be differences between this code and what is is 19 | ### the computing section. For example, the computing sections show 20 | ### how the source functions work (e.g. randomForest() or plsr()), 21 | ### which were not directly used when creating the book. Also, there may be 22 | ### syntax differences that occur over time as packages evolve. These files 23 | ### will reflect those changes. 24 | ### 25 | ### 3) In some cases, the calculations in the book were run in 26 | ### parallel. The sub-processes may reset the random number seed. 27 | ### Your results may slightly vary. 28 | ### 29 | ################################################################################ 30 | 31 | ################################################################################ 32 | ### Section 6.1 Case Study: Quantitative Structure- Activity 33 | ### Relationship Modeling 34 | 35 | library(AppliedPredictiveModeling) 36 | data(solubility) 37 | 38 | library(lattice) 39 | 40 | ### Some initial plots of the data 41 | 42 | xyplot(solTrainY ~ solTrainX$MolWeight, type = c("p", "g"), 43 | ylab = "Solubility (log)", 44 | main = "(a)", 45 | xlab = "Molecular Weight") 46 | xyplot(solTrainY ~ solTrainX$NumRotBonds, type = c("p", "g"), 47 | ylab = "Solubility (log)", 48 | xlab = "Number of Rotatable Bonds") 49 | bwplot(solTrainY ~ ifelse(solTrainX[,100] == 1, 50 | "structure present", 51 | "structure absent"), 52 | ylab = "Solubility (log)", 53 | main = "(b)", 54 | horizontal = FALSE) 55 | 56 | ### Find the columns that are not fingerprints (i.e. the continuous 57 | ### predictors). grep will return a list of integers corresponding to 58 | ### column names that contain the pattern "FP". 59 | 60 | notFingerprints <- grep("FP", names(solTrainXtrans)) 61 | 62 | library(caret) 63 | featurePlot(solTrainXtrans[, -notFingerprints], 64 | solTrainY, 65 | between = list(x = 1, y = 1), 66 | type = c("g", "p", "smooth"), 67 | labels = rep("", 2)) 68 | 69 | library(corrplot) 70 | 71 | ### We used the full namespace to call this function because the pls 72 | ### package (also used in this chapter) has a function with the same 73 | ### name. 74 | 75 | corrplot::corrplot(cor(solTrainXtrans[, -notFingerprints]), 76 | order = "hclust", 77 | tl.cex = .8) 78 | 79 | ################################################################################ 80 | ### Section 6.2 Linear Regression 81 | 82 | ### Create a control function that will be used across models. We 83 | ### create the fold assignments explicitly instead of relying on the 84 | ### random number seed being set to identical values. 85 | 86 | set.seed(100) 87 | indx <- createFolds(solTrainY, returnTrain = TRUE) 88 | ctrl <- trainControl(method = "cv", index = indx) 89 | 90 | ### Linear regression model with all of the predictors. This will 91 | ### produce some warnings that a 'rank-deficient fit may be 92 | ### misleading'. This is related to the predictors being so highly 93 | ### correlated that some of the math has broken down. 94 | 95 | set.seed(100) 96 | lmTune0 <- train(x = solTrainXtrans, y = solTrainY, 97 | method = "lm", 98 | trControl = ctrl) 99 | 100 | lmTune0 101 | 102 | ### And another using a set of predictors reduced by unsupervised 103 | ### filtering. We apply a filter to reduce extreme between-predictor 104 | ### correlations. Note the lack of warnings. 
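### Optional toy illustration (not from the book text): for each pair of
### predictors whose correlation exceeds the cutoff, findCorrelation() flags
### the one with the larger mean absolute correlation with everything else.

toyCor <- matrix(c(1.00, 0.95, 0.20,
                   0.95, 1.00, 0.10,
                   0.20, 0.10, 1.00),
                 ncol = 3,
                 dimnames = list(paste0("x", 1:3), paste0("x", 1:3)))
findCorrelation(toyCor, cutoff = .9)  # should flag column 1 (x1) here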
105 | 106 | tooHigh <- findCorrelation(cor(solTrainXtrans), .9) 107 | trainXfiltered <- solTrainXtrans[, -tooHigh] 108 | testXfiltered <- solTestXtrans[, -tooHigh] 109 | 110 | set.seed(100) 111 | lmTune <- train(x = trainXfiltered, y = solTrainY, 112 | method = "lm", 113 | trControl = ctrl) 114 | 115 | lmTune 116 | 117 | ### Save the test set results in a data frame 118 | testResults <- data.frame(obs = solTestY, 119 | Linear_Regression = predict(lmTune, testXfiltered)) 120 | 121 | 122 | ################################################################################ 123 | ### Section 6.3 Partial Least Squares 124 | 125 | ## Run PLS and PCR on solubility data and compare results 126 | set.seed(100) 127 | plsTune <- train(x = solTrainXtrans, y = solTrainY, 128 | method = "pls", 129 | tuneGrid = expand.grid(ncomp = 1:20), 130 | trControl = ctrl) 131 | plsTune 132 | 133 | testResults$PLS <- predict(plsTune, solTestXtrans) 134 | 135 | set.seed(100) 136 | pcrTune <- train(x = solTrainXtrans, y = solTrainY, 137 | method = "pcr", 138 | tuneGrid = expand.grid(ncomp = 1:35), 139 | trControl = ctrl) 140 | pcrTune 141 | 142 | plsResamples <- plsTune$results 143 | plsResamples$Model <- "PLS" 144 | pcrResamples <- pcrTune$results 145 | pcrResamples$Model <- "PCR" 146 | plsPlotData <- rbind(plsResamples, pcrResamples) 147 | 148 | xyplot(RMSE ~ ncomp, 149 | data = plsPlotData, 150 | #aspect = 1, 151 | xlab = "# Components", 152 | ylab = "RMSE (Cross-Validation)", 153 | auto.key = list(columns = 2), 154 | groups = Model, 155 | type = c("o", "g")) 156 | 157 | plsImp <- varImp(plsTune, scale = FALSE) 158 | plot(plsImp, top = 25, scales = list(y = list(cex = .95))) 159 | 160 | ################################################################################ 161 | ### Section 6.4 Penalized Models 162 | 163 | ## The text used the elasticnet to obtain a ridge regression model. 164 | ## There is now a simple ridge regression method. 165 | 166 | ridgeGrid <- expand.grid(lambda = seq(0, .1, length = 15)) 167 | 168 | set.seed(100) 169 | ridgeTune <- train(x = solTrainXtrans, y = solTrainY, 170 | method = "ridge", 171 | tuneGrid = ridgeGrid, 172 | trControl = ctrl, 173 | preProc = c("center", "scale")) 174 | ridgeTune 175 | 176 | print(update(plot(ridgeTune), xlab = "Penalty")) 177 | 178 | 179 | enetGrid <- expand.grid(lambda = c(0, 0.01, .1), 180 | fraction = seq(.05, 1, length = 20)) 181 | set.seed(100) 182 | enetTune <- train(x = solTrainXtrans, y = solTrainY, 183 | method = "enet", 184 | tuneGrid = enetGrid, 185 | trControl = ctrl, 186 | preProc = c("center", "scale")) 187 | enetTune 188 | 189 | plot(enetTune) 190 | 191 | testResults$Enet <- predict(enetTune, solTestXtrans) 192 | 193 | ################################################################################ 194 | ### Session Information 195 | 196 | sessionInfo() 197 | 198 | q("no") 199 | 200 | 201 | 202 | -------------------------------------------------------------------------------- /inst/chapters/07_Non-Linear_Reg.R: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | ### R code from Applied Predictive Modeling (2013) by Kuhn and Johnson. 
3 | ### Copyright 2013 Kuhn and Johnson
4 | ### Web Page: http://www.appliedpredictivemodeling.com
5 | ### Contact: Max Kuhn (mxkuhn@gmail.com)
6 | ###
7 | ### Chapter 7: Non-Linear Regression Models
8 | ###
9 | ### Required packages: AppliedPredictiveModeling, caret, doMC (optional), earth,
10 | ###                    kernlab, lattice, nnet
11 | ###
12 | ### Data used: The solubility data from the AppliedPredictiveModeling package
13 | ###
14 | ### Notes:
15 | ### 1) This code is provided without warranty.
16 | ###
17 | ### 2) This code should help the user reproduce the results in the
18 | ### text. There will be differences between this code and what is in
19 | ### the computing section. For example, the computing sections show
20 | ### how the source functions work (e.g. randomForest() or plsr()),
21 | ### which were not directly used when creating the book. Also, there may be
22 | ### syntax differences that occur over time as packages evolve. These files
23 | ### will reflect those changes.
24 | ###
25 | ### 3) In some cases, the calculations in the book were run in
26 | ### parallel. The sub-processes may reset the random number seed.
27 | ### Your results may slightly vary.
28 | ###
29 | ################################################################################
30 | 
31 | ################################################################################
32 | ### Load the data
33 | 
34 | library(AppliedPredictiveModeling)
35 | data(solubility)
36 | 
37 | ### Create a control function that will be used across models. We
38 | ### create the fold assignments explicitly instead of relying on the
39 | ### random number seed being set to identical values.
40 | 
41 | library(caret)
42 | set.seed(100)
43 | indx <- createFolds(solTrainY, returnTrain = TRUE)
44 | ctrl <- trainControl(method = "cv", index = indx)
45 | 
46 | ################################################################################
47 | ### Section 7.1 Neural Networks
48 | 
49 | ### Optional: parallel processing can be used via the 'do' packages,
50 | ### such as doMC, doMPI etc. We used doMC (not on Windows) to speed
51 | ### up the computations.
52 | 
53 | ### WARNING: Be aware of how much memory is needed to parallel
54 | ### process. It can very quickly overwhelm the available hardware. We
55 | ### estimate the memory usage (VSIZE = total memory size) to be
56 | ### 2677M/core.
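### A hedged sketch (not from the book) for sizing the worker pool: cap it by
### both the physical core count and the ~2.7 GB/worker estimate above. The
### cap of 4 workers below is an arbitrary illustration, not a recommendation.
### workers <- min(parallel::detectCores(logical = FALSE), 4)
### registerDoMC(workers)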
57 | 58 | library(doMC) 59 | registerDoMC(10) 60 | 61 | 62 | library(caret) 63 | 64 | nnetGrid <- expand.grid(decay = c(0, 0.01, .1), 65 | size = c(1, 3, 5, 7, 9, 11, 13), 66 | bag = FALSE) 67 | 68 | set.seed(100) 69 | nnetTune <- train(x = solTrainXtrans, y = solTrainY, 70 | method = "avNNet", 71 | tuneGrid = nnetGrid, 72 | trControl = ctrl, 73 | preProc = c("center", "scale"), 74 | linout = TRUE, 75 | trace = FALSE, 76 | MaxNWts = 13 * (ncol(solTrainXtrans) + 1) + 13 + 1, 77 | maxit = 1000, 78 | allowParallel = FALSE) 79 | nnetTune 80 | 81 | plot(nnetTune) 82 | 83 | testResults <- data.frame(obs = solTestY, 84 | NNet = predict(nnetTune, solTestXtrans)) 85 | 86 | ################################################################################ 87 | ### Section 7.2 Multivariate Adaptive Regression Splines 88 | 89 | set.seed(100) 90 | marsTune <- train(x = solTrainXtrans, y = solTrainY, 91 | method = "earth", 92 | tuneGrid = expand.grid(degree = 1, nprune = 2:38), 93 | trControl = ctrl) 94 | marsTune 95 | 96 | plot(marsTune) 97 | 98 | testResults$MARS <- predict(marsTune, solTestXtrans) 99 | 100 | marsImp <- varImp(marsTune, scale = FALSE) 101 | plot(marsImp, top = 25) 102 | 103 | ################################################################################ 104 | ### Section 7.3 Support Vector Machines 105 | 106 | ## In a recent update to caret, the method to estimate the 107 | ## sigma parameter was slightly changed. These results will 108 | ## slightly differ from the text for that reason. 109 | 110 | set.seed(100) 111 | svmRTune <- train(x = solTrainXtrans, y = solTrainY, 112 | method = "svmRadial", 113 | preProc = c("center", "scale"), 114 | tuneLength = 14, 115 | trControl = ctrl) 116 | svmRTune 117 | plot(svmRTune, scales = list(x = list(log = 2))) 118 | 119 | svmGrid <- expand.grid(degree = 1:2, 120 | scale = c(0.01, 0.005, 0.001), 121 | C = 2^(-2:5)) 122 | set.seed(100) 123 | svmPTune <- train(x = solTrainXtrans, y = solTrainY, 124 | method = "svmPoly", 125 | preProc = c("center", "scale"), 126 | tuneGrid = svmGrid, 127 | trControl = ctrl) 128 | 129 | svmPTune 130 | plot(svmPTune, 131 | scales = list(x = list(log = 2), 132 | between = list(x = .5, y = 1))) 133 | 134 | testResults$SVMr <- predict(svmRTune, solTestXtrans) 135 | testResults$SVMp <- predict(svmPTune, solTestXtrans) 136 | 137 | ################################################################################ 138 | ### Section 7.4 K-Nearest Neighbors 139 | 140 | ### First we remove near-zero variance predictors 141 | knnDescr <- solTrainXtrans[, -nearZeroVar(solTrainXtrans)] 142 | 143 | set.seed(100) 144 | knnTune <- train(x = knnDescr, y = solTrainY, 145 | method = "knn", 146 | preProc = c("center", "scale"), 147 | tuneGrid = data.frame(k = 1:20), 148 | trControl = ctrl) 149 | 150 | knnTune 151 | 152 | plot(knnTune) 153 | 154 | testResults$Knn <- predict(svmRTune, solTestXtrans[, names(knnDescr)]) 155 | 156 | ################################################################################ 157 | ### Session Information 158 | 159 | sessionInfo() 160 | 161 | q("no") 162 | 163 | 164 | -------------------------------------------------------------------------------- /inst/chapters/08_Regression_Trees.R: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | ### R code from Applied Predictive Modeling (2013) by Kuhn and Johnson. 
3 | ### Copyright 2013 Kuhn and Johnson
4 | ### Web Page: http://www.appliedpredictivemodeling.com
5 | ### Contact: Max Kuhn (mxkuhn@gmail.com)
6 | ###
7 | ### Chapter 8: Regression Trees and Rule-Based Models
8 | ###
9 | ### Required packages: AppliedPredictiveModeling, caret, Cubist, doMC (optional),
10 | ###                    gbm, lattice, party, partykit, randomForest, rpart, RWeka
11 | ###
12 | ### Data used: The solubility data from the AppliedPredictiveModeling package
13 | ###
14 | ### Notes:
15 | ### 1) This code is provided without warranty.
16 | ###
17 | ### 2) This code should help the user reproduce the results in the
18 | ### text. There will be differences between this code and what is in
19 | ### the computing section. For example, the computing sections show
20 | ### how the source functions work (e.g. randomForest() or plsr()),
21 | ### which were not directly used when creating the book. Also, there may be
22 | ### syntax differences that occur over time as packages evolve. These files
23 | ### will reflect those changes.
24 | ###
25 | ### 3) In some cases, the calculations in the book were run in
26 | ### parallel. The sub-processes may reset the random number seed.
27 | ### Your results may slightly vary.
28 | ###
29 | ################################################################################
30 | 
31 | ################################################################################
32 | ### Load the data
33 | 
34 | library(AppliedPredictiveModeling)
35 | data(solubility)
36 | 
37 | ### Create a control function that will be used across models. We
38 | ### create the fold assignments explicitly instead of relying on the
39 | ### random number seed being set to identical values.
40 | 
41 | library(caret)
42 | set.seed(100)
43 | indx <- createFolds(solTrainY, returnTrain = TRUE)
44 | ctrl <- trainControl(method = "cv", index = indx)
45 | 
46 | ################################################################################
47 | ### Section 8.1 Basic Regression Trees
48 | 
49 | library(rpart)
50 | 
51 | ### Fit two CART models to show the initial splitting process. rpart
52 | ### only uses formulas, so we put the predictors and outcome into
53 | ### a common data frame first.
54 | 
55 | trainData <- solTrainXtrans
56 | trainData$y <- solTrainY
57 | 
58 | rpStump <- rpart(y ~ ., data = trainData,
59 |                  control = rpart.control(maxdepth = 1))
60 | rpSmall <- rpart(y ~ ., data = trainData,
61 |                  control = rpart.control(maxdepth = 2))
62 | 
63 | ### Tune the model
64 | library(caret)
65 | 
66 | set.seed(100)
67 | cartTune <- train(x = solTrainXtrans, y = solTrainY,
68 |                   method = "rpart",
69 |                   tuneLength = 25,
70 |                   trControl = ctrl)
71 | cartTune
72 | ## cartTune$finalModel
73 | 
74 | 
75 | ### Plot the tuning results
76 | plot(cartTune, scales = list(x = list(log = 10)))
77 | 
78 | ### Use the partykit package to make some nice plots. First, convert
79 | ### the rpart objects to party objects.
80 | 
81 | # library(partykit)
82 | #
83 | # cartTree <- as.party(cartTune$finalModel)
84 | # plot(cartTree)
85 | 
86 | ### Get the variable importance. 'competes' is an argument that
87 | ### controls whether splits not used in the tree should be included
88 | ### in the importance calculations.
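### Optional comparison (not from the book text): setting competes = TRUE also
### credits predictors that appear in competing (unchosen) splits; contrast it
### with the competes = FALSE importance computed just below.
### varImp(cartTune, scale = FALSE, competes = TRUE)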
89 | 90 | cartImp <- varImp(cartTune, scale = FALSE, competes = FALSE) 91 | cartImp 92 | 93 | ### Save the test set results in a data frame 94 | testResults <- data.frame(obs = solTestY, 95 | CART = predict(cartTune, solTestXtrans)) 96 | 97 | ### Tune the conditional inference tree 98 | 99 | cGrid <- data.frame(mincriterion = sort(c(.95, seq(.75, .99, length = 2)))) 100 | 101 | set.seed(100) 102 | ctreeTune <- train(x = solTrainXtrans, y = solTrainY, 103 | method = "ctree", 104 | tuneGrid = cGrid, 105 | trControl = ctrl) 106 | ctreeTune 107 | plot(ctreeTune) 108 | 109 | ##ctreeTune$finalModel 110 | plot(ctreeTune$finalModel) 111 | 112 | testResults$cTree <- predict(ctreeTune, solTestXtrans) 113 | 114 | ################################################################################ 115 | ### Section 8.2 Regression Model Trees and 8.3 Rule-Based Models 116 | 117 | ### Tune the model tree. Using method = "M5" actually tunes over the 118 | ### tree- and rule-based versions of the model. M = 10 is also passed 119 | ### in to make sure that there are larger terminal nodes for the 120 | ### regression models. 121 | 122 | set.seed(100) 123 | m5Tune <- train(x = solTrainXtrans, y = solTrainY, 124 | method = "M5", 125 | trControl = ctrl, 126 | control = Weka_control(M = 10)) 127 | m5Tune 128 | 129 | plot(m5Tune) 130 | 131 | ## m5Tune$finalModel 132 | 133 | ## plot(m5Tune$finalModel) 134 | 135 | ### Show the rule-based model too 136 | 137 | ruleFit <- M5Rules(y~., data = trainData, control = Weka_control(M = 10)) 138 | ruleFit 139 | 140 | ################################################################################ 141 | ### Section 8.4 Bagged Trees 142 | 143 | ### Optional: parallel processing can be used via the 'do' packages, 144 | ### such as doMC, doMPI etc. We used doMC (not on Windows) to speed 145 | ### up the computations. 146 | 147 | ### WARNING: Be aware of how much memory is needed to parallel 148 | ### process. It can very quickly overwhelm the available hardware. The 149 | ### estimate of the median memory usage (VSIZE = total memory size) 150 | ### was 9706M for a core, but could range up to 9706M. This becomes 151 | ### severe when parallelizing randomForest() and (especially) calls 152 | ### to cforest(). 153 | 154 | ### WARNING 2: The RWeka package does not work well with some forms of 155 | ### parallel processing, such as mutlicore (i.e. doMC). 
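### Hedged note (not from the book text): if the RWeka-based fits here
### (method = "M5", M5Rules()) misbehave under a forked backend, one option is
### to run them after dropping back to sequential execution via
### foreach::registerDoSEQ(), as the concrete case study in Chapter 10 does.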
156 | 157 | library(doMC) 158 | registerDoMC(5) 159 | 160 | set.seed(100) 161 | 162 | treebagTune <- train(x = solTrainXtrans, y = solTrainY, 163 | method = "treebag", 164 | nbagg = 50, 165 | trControl = ctrl) 166 | 167 | treebagTune 168 | 169 | ################################################################################ 170 | ### Section 8.5 Random Forests 171 | 172 | mtryGrid <- data.frame(mtry = floor(seq(10, ncol(solTrainXtrans), length = 10))) 173 | 174 | 175 | ### Tune the model using cross-validation 176 | set.seed(100) 177 | rfTune <- train(x = solTrainXtrans, y = solTrainY, 178 | method = "rf", 179 | tuneGrid = mtryGrid, 180 | ntree = 1000, 181 | importance = TRUE, 182 | trControl = ctrl) 183 | rfTune 184 | 185 | plot(rfTune) 186 | 187 | rfImp <- varImp(rfTune, scale = FALSE) 188 | rfImp 189 | 190 | ### Tune the model using the OOB estimates 191 | ctrlOOB <- trainControl(method = "oob") 192 | set.seed(100) 193 | rfTuneOOB <- train(x = solTrainXtrans, y = solTrainY, 194 | method = "rf", 195 | tuneGrid = mtryGrid, 196 | ntree = 1000, 197 | importance = TRUE, 198 | trControl = ctrlOOB) 199 | rfTuneOOB 200 | 201 | plot(rfTuneOOB) 202 | 203 | ### Tune the conditional inference forests 204 | set.seed(100) 205 | condrfTune <- train(x = solTrainXtrans, y = solTrainY, 206 | method = "cforest", 207 | tuneGrid = mtryGrid, 208 | controls = cforest_unbiased(ntree = 1000), 209 | trControl = ctrl) 210 | condrfTune 211 | 212 | plot(condrfTune) 213 | 214 | set.seed(100) 215 | condrfTuneOOB <- train(x = solTrainXtrans, y = solTrainY, 216 | method = "cforest", 217 | tuneGrid = mtryGrid, 218 | controls = cforest_unbiased(ntree = 1000), 219 | trControl = trainControl(method = "oob")) 220 | condrfTuneOOB 221 | 222 | plot(condrfTuneOOB) 223 | 224 | ################################################################################ 225 | ### Section 8.6 Boosting 226 | 227 | gbmGrid <- expand.grid(interaction.depth = seq(1, 7, by = 2), 228 | n.trees = seq(100, 1000, by = 50), 229 | shrinkage = c(0.01, 0.1)) 230 | set.seed(100) 231 | gbmTune <- train(x = solTrainXtrans, y = solTrainY, 232 | method = "gbm", 233 | tuneGrid = gbmGrid, 234 | trControl = ctrl, 235 | verbose = FALSE) 236 | gbmTune 237 | 238 | plot(gbmTune, auto.key = list(columns = 4, lines = TRUE)) 239 | 240 | gbmImp <- varImp(gbmTune, scale = FALSE) 241 | gbmImp 242 | 243 | ################################################################################ 244 | ### Section 8.7 Cubist 245 | 246 | cbGrid <- expand.grid(committees = c(1:10, 20, 50, 75, 100), 247 | neighbors = c(0, 1, 5, 9)) 248 | 249 | set.seed(100) 250 | cubistTune <- train(solTrainXtrans, solTrainY, 251 | "cubist", 252 | tuneGrid = cbGrid, 253 | trControl = ctrl) 254 | cubistTune 255 | 256 | plot(cubistTune, auto.key = list(columns = 4, lines = TRUE)) 257 | 258 | cbImp <- varImp(cubistTune, scale = FALSE) 259 | cbImp 260 | 261 | ################################################################################ 262 | ### Session Information 263 | 264 | sessionInfo() 265 | 266 | q("no") 267 | -------------------------------------------------------------------------------- /inst/chapters/10_Case_Study_Concrete.R: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | ### R code from Applied Predictive Modeling (2013) by Kuhn and Johnson. 
3 | ### Copyright 2013 Kuhn and Johnson
4 | ### Web Page: http://www.appliedpredictivemodeling.com
5 | ### Contact: Max Kuhn (mxkuhn@gmail.com)
6 | ###
7 | ### Chapter 10: Case Study: Compressive Strength of Concrete Mixtures
8 | ###
9 | ### Required packages: AppliedPredictiveModeling, caret, Cubist, doMC (optional),
10 | ###                    earth, elasticnet, gbm, ipred, lattice, nnet, party, pls,
11 | ###                    randomForest, rpart, RWeka
12 | ###
13 | ### Data used: The concrete data from the AppliedPredictiveModeling package
14 | ###
15 | ### Notes:
16 | ### 1) This code is provided without warranty.
17 | ###
18 | ### 2) This code should help the user reproduce the results in the
19 | ### text. There will be differences between this code and what is in
20 | ### the computing section. For example, the computing sections show
21 | ### how the source functions work (e.g. randomForest() or plsr()),
22 | ### which were not directly used when creating the book. Also, there may be
23 | ### syntax differences that occur over time as packages evolve. These files
24 | ### will reflect those changes.
25 | ###
26 | ### 3) In some cases, the calculations in the book were run in
27 | ### parallel. The sub-processes may reset the random number seed.
28 | ### Your results may slightly vary.
29 | ###
30 | ################################################################################
31 | 
32 | ################################################################################
33 | ### Load and plot the data
34 | 
35 | library(AppliedPredictiveModeling)
36 | data(concrete)
37 | 
38 | library(caret)
39 | library(plyr)
40 | 
41 | featurePlot(concrete[, -9], concrete$CompressiveStrength,
42 |             between = list(x = 1, y = 1),
43 |             type = c("g", "p", "smooth"))
44 | 
45 | 
46 | ################################################################################
47 | ### Section 10.1 Model Building Strategy
48 | ### There are replicated mixtures, so take the average per mixture
49 | 
50 | averaged <- ddply(mixtures,
51 |                   .(Cement, BlastFurnaceSlag, FlyAsh, Water,
52 |                     Superplasticizer, CoarseAggregate,
53 |                     FineAggregate, Age),
54 |                   function(x) c(CompressiveStrength =
55 |                                 mean(x$CompressiveStrength)))
56 | 
57 | ### Split the data and create a control object for train()
58 | 
59 | set.seed(975)
60 | inTrain <- createDataPartition(averaged$CompressiveStrength, p = 3/4)[[1]]
61 | training <- averaged[ inTrain,]
62 | testing <- averaged[-inTrain,]
63 | 
64 | ctrl <- trainControl(method = "repeatedcv", repeats = 5, number = 10)
65 | 
66 | ### Create a model formula that can be used repeatedly
67 | 
68 | modForm <- paste("CompressiveStrength ~ (.)^2 + I(Cement^2) + I(BlastFurnaceSlag^2) +",
69 |                  "I(FlyAsh^2) + I(Water^2) + I(Superplasticizer^2) +",
70 |                  "I(CoarseAggregate^2) + I(FineAggregate^2) + I(Age^2)")
71 | modForm <- as.formula(modForm)
72 | 
73 | ### Fit the various models
74 | 
75 | ### Optional: parallel processing can be used via the 'do' packages,
76 | ### such as doMC, doMPI etc. We used doMC (not on Windows) to speed
77 | ### up the computations.
78 | 
79 | ### WARNING: Be aware of how much memory is needed to parallel
80 | ### process. It can very quickly overwhelm the available hardware. The
81 | ### estimate of the median memory usage (VSIZE = total memory size)
82 | ### was 2800M for a core although the M5 calculations require about
83 | ### 3700M without parallel processing.
84 | 
85 | ### WARNING 2: The RWeka package does not work well with some forms of
86 | ### parallel processing, such as multicore (i.e. doMC).
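### Optional sanity check (a hedged sketch, not from the book): before the long
### model fits start, confirm how many model terms the quadratic-plus-interaction
### formula above expands to.
### ncol(model.matrix(modForm, data = training))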
87 | 
88 | library(doMC)
89 | registerDoMC(14)
90 | 
91 | set.seed(669)
92 | lmFit <- train(modForm, data = training,
93 |                method = "lm",
94 |                trControl = ctrl)
95 | 
96 | set.seed(669)
97 | plsFit <- train(modForm, data = training,
98 |                 method = "pls",
99 |                 preProc = c("center", "scale"),
100 |                 tuneLength = 15,
101 |                 trControl = ctrl)
102 | 
103 | lassoGrid <- expand.grid(lambda = c(0, .001, .01, .1),
104 |                          fraction = seq(0.05, 1, length = 20))
105 | set.seed(669)
106 | lassoFit <- train(modForm, data = training,
107 |                   method = "enet",
108 |                   preProc = c("center", "scale"),
109 |                   tuneGrid = lassoGrid,
110 |                   trControl = ctrl)
111 | 
112 | set.seed(669)
113 | earthFit <- train(CompressiveStrength ~ ., data = training,
114 |                   method = "earth",
115 |                   tuneGrid = expand.grid(degree = 1,
116 |                                          nprune = 2:25),
117 |                   trControl = ctrl)
118 | 
119 | set.seed(669)
120 | svmRFit <- train(CompressiveStrength ~ ., data = training,
121 |                  method = "svmRadial",
122 |                  tuneLength = 15,
123 |                  preProc = c("center", "scale"),
124 |                  trControl = ctrl)
125 | 
126 | 
127 | nnetGrid <- expand.grid(decay = c(0.001, .01, .1),
128 |                         size = seq(1, 27, by = 2),
129 |                         bag = FALSE)
130 | set.seed(669)
131 | nnetFit <- train(CompressiveStrength ~ .,
132 |                  data = training,
133 |                  method = "avNNet",
134 |                  tuneGrid = nnetGrid,
135 |                  preProc = c("center", "scale"),
136 |                  linout = TRUE,
137 |                  trace = FALSE,
138 |                  maxit = 1000,
139 |                  allowParallel = FALSE,
140 |                  trControl = ctrl)
141 | 
142 | set.seed(669)
143 | rpartFit <- train(CompressiveStrength ~ .,
144 |                   data = training,
145 |                   method = "rpart",
146 |                   tuneLength = 30,
147 |                   trControl = ctrl)
148 | 
149 | set.seed(669)
150 | treebagFit <- train(CompressiveStrength ~ .,
151 |                     data = training,
152 |                     method = "treebag",
153 |                     trControl = ctrl)
154 | 
155 | set.seed(669)
156 | ctreeFit <- train(CompressiveStrength ~ .,
157 |                   data = training,
158 |                   method = "ctree",
159 |                   tuneLength = 10,
160 |                   trControl = ctrl)
161 | 
162 | set.seed(669)
163 | rfFit <- train(CompressiveStrength ~ .,
164 |                data = training,
165 |                method = "rf",
166 |                tuneLength = 10,
167 |                ntree = 1000,
168 |                importance = TRUE,
169 |                trControl = ctrl)
170 | 
171 | 
172 | gbmGrid <- expand.grid(interaction.depth = seq(1, 7, by = 2),
173 |                        n.trees = seq(100, 1000, by = 50),
174 |                        shrinkage = c(0.01, 0.1))
175 | set.seed(669)
176 | gbmFit <- train(CompressiveStrength ~ .,
177 |                 data = training,
178 |                 method = "gbm",
179 |                 tuneGrid = gbmGrid,
180 |                 verbose = FALSE,
181 |                 trControl = ctrl)
182 | 
183 | 
184 | cbGrid <- expand.grid(committees = c(1, 5, 10, 50, 75, 100),
185 |                       neighbors = c(0, 1, 3, 5, 7, 9))
186 | set.seed(669)
187 | cbFit <- train(CompressiveStrength ~ .,
188 |                data = training,
189 |                method = "cubist",
190 |                tuneGrid = cbGrid,
191 |                trControl = ctrl)
192 | 
193 | ### Turn off the parallel processing to use RWeka.
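### Hedged note (not from the book text): registerDoSEQ() below re-registers
### foreach's default sequential backend, so the M5 fit runs in a single
### process and avoids the RWeka/multicore issue flagged above.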
194 | registerDoSEQ() 195 | 196 | 197 | set.seed(669) 198 | mtFit <- train(CompressiveStrength ~ ., 199 | data = training, 200 | method = "M5", 201 | trControl = ctrl) 202 | 203 | ################################################################################ 204 | ### Section 10.2 Model Performance 205 | 206 | ### Collect the resampling statistics across all the models 207 | 208 | rs <- resamples(list("Linear Reg" = lmFit, " 209 | PLS" = plsFit, 210 | "Elastic Net" = lassoFit, 211 | MARS = earthFit, 212 | SVM = svmRFit, 213 | "Neural Networks" = nnetFit, 214 | CART = rpartFit, 215 | "Cond Inf Tree" = ctreeFit, 216 | "Bagged Tree" = treebagFit, 217 | "Boosted Tree" = gbmFit, 218 | "Random Forest" = rfFit, 219 | Cubist = cbFit)) 220 | 221 | #parallelPlot(rs) 222 | #parallelPlot(rs, metric = "Rsquared") 223 | 224 | ### Get the test set results across several models 225 | 226 | nnetPred <- predict(nnetFit, testing) 227 | gbmPred <- predict(gbmFit, testing) 228 | cbPred <- predict(cbFit, testing) 229 | 230 | testResults <- rbind(postResample(nnetPred, testing$CompressiveStrength), 231 | postResample(gbmPred, testing$CompressiveStrength), 232 | postResample(cbPred, testing$CompressiveStrength)) 233 | testResults <- as.data.frame(testResults) 234 | testResults$Model <- c("Neural Networks", "Boosted Tree", "Cubist") 235 | testResults <- testResults[order(testResults$RMSE),] 236 | 237 | ################################################################################ 238 | ### Section 10.3 Optimizing Compressive Strength 239 | 240 | library(proxy) 241 | 242 | ### Create a function to maximize compressive strength* while keeping 243 | ### the predictor values as mixtures. Water (in x[7]) is used as the 244 | ### 'slack variable'. 245 | 246 | ### * We are actually minimizing the negative compressive strength 247 | 248 | modelPrediction <- function(x, mod, limit = 2500) 249 | { 250 | if(x[1] < 0 | x[1] > 1) return(10^38) 251 | if(x[2] < 0 | x[2] > 1) return(10^38) 252 | if(x[3] < 0 | x[3] > 1) return(10^38) 253 | if(x[4] < 0 | x[4] > 1) return(10^38) 254 | if(x[5] < 0 | x[5] > 1) return(10^38) 255 | if(x[6] < 0 | x[6] > 1) return(10^38) 256 | 257 | x <- c(x, 1 - sum(x)) 258 | 259 | if(x[7] < 0.05) return(10^38) 260 | 261 | tmp <- as.data.frame(t(x)) 262 | names(tmp) <- c('Cement','BlastFurnaceSlag','FlyAsh', 263 | 'Superplasticizer','CoarseAggregate', 264 | 'FineAggregate', 'Water') 265 | tmp$Age <- 28 266 | -predict(mod, tmp) 267 | } 268 | 269 | ### Get mixtures at 28 days 270 | subTrain <- subset(training, Age == 28) 271 | 272 | ### Center and scale the data to use dissimilarity sampling 273 | pp1 <- preProcess(subTrain[, -(8:9)], c("center", "scale")) 274 | scaledTrain <- predict(pp1, subTrain[, 1:7]) 275 | 276 | ### Randomly select a few mixtures as a starting pool 277 | 278 | set.seed(91) 279 | startMixture <- sample(1:nrow(subTrain), 1) 280 | starters <- scaledTrain[startMixture, 1:7] 281 | pool <- scaledTrain 282 | index <- maxDissim(starters, pool, 14) 283 | startPoints <- c(startMixture, index) 284 | 285 | starters <- subTrain[startPoints,1:7] 286 | startingValues <- starters[, -4] 287 | 288 | ### For each starting mixture, optimize the Cubist model using 289 | ### a simplex search routine 290 | 291 | cbResults <- startingValues 292 | cbResults$Water <- NA 293 | cbResults$Prediction <- NA 294 | 295 | for(i in 1:nrow(cbResults)) 296 | { 297 | results <- optim(unlist(cbResults[i,1:6]), 298 | modelPrediction, 299 | method = "Nelder-Mead", 300 | control=list(maxit=5000), 301 | mod = cbFit) 302 | 
cbResults$Prediction[i] <- -results$value 303 | cbResults[i,1:6] <- results$par 304 | } 305 | cbResults$Water <- 1 - apply(cbResults[,1:6], 1, sum) 306 | cbResults <- subset(cbResults, Prediction > 0 & Water > .02) 307 | cbResults <- cbResults[order(-cbResults$Prediction),][1:3,] 308 | cbResults$Model <- "Cubist" 309 | 310 | ### Do the same for the neural network model 311 | 312 | nnetResults <- startingValues 313 | nnetResults$Water <- NA 314 | nnetResults$Prediction <- NA 315 | 316 | for(i in 1:nrow(nnetResults)) 317 | { 318 | results <- optim(unlist(nnetResults[i, 1:6,]), 319 | modelPrediction, 320 | method = "Nelder-Mead", 321 | control=list(maxit=5000), 322 | mod = nnetFit) 323 | nnetResults$Prediction[i] <- -results$value 324 | nnetResults[i,1:6] <- results$par 325 | } 326 | nnetResults$Water <- 1 - apply(nnetResults[,1:6], 1, sum) 327 | nnetResults <- subset(nnetResults, Prediction > 0 & Water > .02) 328 | nnetResults <- nnetResults[order(-nnetResults$Prediction),][1:3,] 329 | nnetResults$Model <- "NNet" 330 | 331 | ### Convert the predicted mixtures to PCA space and plot 332 | 333 | pp2 <- preProcess(subTrain[, 1:7], "pca") 334 | pca1 <- predict(pp2, subTrain[, 1:7]) 335 | pca1$Data <- "Training Set" 336 | pca1$Data[startPoints] <- "Starting Values" 337 | pca3 <- predict(pp2, cbResults[, names(subTrain[, 1:7])]) 338 | pca3$Data <- "Cubist" 339 | pca4 <- predict(pp2, nnetResults[, names(subTrain[, 1:7])]) 340 | pca4$Data <- "Neural Network" 341 | 342 | pcaData <- rbind(pca1, pca3, pca4) 343 | pcaData$Data <- factor(pcaData$Data, 344 | levels = c("Training Set","Starting Values", 345 | "Cubist","Neural Network")) 346 | 347 | lim <- extendrange(pcaData[, 1:2]) 348 | 349 | xyplot(PC2 ~ PC1, 350 | data = pcaData, 351 | groups = Data, 352 | auto.key = list(columns = 2), 353 | xlim = lim, 354 | ylim = lim, 355 | type = c("g", "p")) 356 | 357 | 358 | ################################################################################ 359 | ### Session Information 360 | 361 | sessionInfo() 362 | 363 | q("no") 364 | -------------------------------------------------------------------------------- /inst/chapters/11_Class_Performance.R: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | ### R code from Applied Predictive Modeling (2013) by Kuhn and Johnson. 3 | ### Copyright 2013 Kuhn and Johnson 4 | ### Web Page: http://www.appliedpredictivemodeling.com 5 | ### Contact: Max Kuhn (mxkuhn@gmail.com) 6 | ### 7 | ### Chapter 11: Measuring Performance in Classification Models 8 | ### 9 | ### Required packages: AppliedPredictiveModeling, caret, MASS, randomForest, 10 | ### pROC, klaR 11 | ### 12 | ### Data used: The solubility from the AppliedPredictiveModeling package 13 | ### 14 | ### Notes: 15 | ### 1) This code is provided without warranty. 16 | ### 17 | ### 2) This code should help the user reproduce the results in the 18 | ### text. There will be differences between this code and what is is 19 | ### the computing section. For example, the computing sections show 20 | ### how the source functions work (e.g. randomForest() or plsr()), 21 | ### which were not directly used when creating the book. Also, there may be 22 | ### syntax differences that occur over time as packages evolve. These files 23 | ### will reflect those changes. 24 | ### 25 | ### 3) In some cases, the calculations in the book were run in 26 | ### parallel. The sub-processes may reset the random number seed. 
27 | ### Your results may slightly vary. 28 | ### 29 | ################################################################################ 30 | 31 | ################################################################################ 32 | ### Section 11.1 Class Predictions 33 | 34 | library(AppliedPredictiveModeling) 35 | 36 | ### Simulate some two class data with two predictors 37 | set.seed(975) 38 | training <- quadBoundaryFunc(500) 39 | testing <- quadBoundaryFunc(1000) 40 | testing$class2 <- ifelse(testing$class == "Class1", 1, 0) 41 | testing$ID <- 1:nrow(testing) 42 | 43 | ### Fit models 44 | library(MASS) 45 | qdaFit <- qda(class ~ X1 + X2, data = training) 46 | library(randomForest) 47 | rfFit <- randomForest(class ~ X1 + X2, data = training, ntree = 2000) 48 | 49 | ### Predict the test set 50 | testing$qda <- predict(qdaFit, testing)$posterior[,1] 51 | testing$rf <- predict(rfFit, testing, type = "prob")[,1] 52 | 53 | 54 | ### Generate the calibration analysis 55 | library(caret) 56 | calData1 <- calibration(class ~ qda + rf, data = testing, cuts = 10) 57 | 58 | ### Plot the curve 59 | xyplot(calData1, auto.key = list(columns = 2)) 60 | 61 | ### To calibrate the data, treat the probabilities as inputs into the 62 | ### model 63 | 64 | trainProbs <- training 65 | trainProbs$qda <- predict(qdaFit)$posterior[,1] 66 | 67 | ### These models take the probabilities as inputs and, based on the 68 | ### true class, re-calibrate them. 69 | library(klaR) 70 | nbCal <- NaiveBayes(class ~ qda, data = trainProbs, usekernel = TRUE) 71 | 72 | ### We use relevel() here because glm() models the probability of the 73 | ### second factor level. 74 | lrCal <- glm(relevel(class, "Class2") ~ qda, data = trainProbs, family = binomial) 75 | 76 | ### Now re-predict the test set using the modified class probability 77 | ### estimates 78 | testing$qda2 <- predict(nbCal, testing[, "qda", drop = FALSE])$posterior[,1] 79 | testing$qda3 <- predict(lrCal, testing[, "qda", drop = FALSE], type = "response") 80 | 81 | 82 | ### Manipulate the data a bit for pretty plotting 83 | simulatedProbs <- testing[, c("class", "rf", "qda3")] 84 | names(simulatedProbs) <- c("TrueClass", "RandomForestProb", "QDACalibrated") 85 | simulatedProbs$RandomForestClass <- predict(rfFit, testing) 86 | 87 | calData2 <- calibration(class ~ qda + qda2 + qda3, data = testing) 88 | calData2$data$calibModelVar <- as.character(calData2$data$calibModelVar) 89 | calData2$data$calibModelVar <- ifelse(calData2$data$calibModelVar == "qda", 90 | "QDA", 91 | calData2$data$calibModelVar) 92 | calData2$data$calibModelVar <- ifelse(calData2$data$calibModelVar == "qda2", 93 | "Bayesian Calibration", 94 | calData2$data$calibModelVar) 95 | 96 | calData2$data$calibModelVar <- ifelse(calData2$data$calibModelVar == "qda3", 97 | "Sigmoidal Calibration", 98 | calData2$data$calibModelVar) 99 | 100 | calData2$data$calibModelVar <- factor(calData2$data$calibModelVar, 101 | levels = c("QDA", 102 | "Bayesian Calibration", 103 | "Sigmoidal Calibration")) 104 | 105 | xyplot(calData2, auto.key = list(columns = 1)) 106 | 107 | ### Recreate the model used in the over-fitting chapter 108 | 109 | library(caret) 110 | data(GermanCredit) 111 | 112 | ## First, remove near-zero variance predictors then get rid of a few predictors 113 | ## that duplicate values. For example, there are two possible values for the 114 | ## housing variable: "Rent", "Own" and "ForFree". So that we don't have linear 115 | ## dependencies, we get rid of one of the levels (e.g. 
"ForFree") 116 | 117 | GermanCredit <- GermanCredit[, -nearZeroVar(GermanCredit)] 118 | GermanCredit$CheckingAccountStatus.lt.0 <- NULL 119 | GermanCredit$SavingsAccountBonds.lt.100 <- NULL 120 | GermanCredit$EmploymentDuration.lt.1 <- NULL 121 | GermanCredit$EmploymentDuration.Unemployed <- NULL 122 | GermanCredit$Personal.Male.Married.Widowed <- NULL 123 | GermanCredit$Property.Unknown <- NULL 124 | GermanCredit$Housing.ForFree <- NULL 125 | 126 | ## Split the data into training (80%) and test sets (20%) 127 | set.seed(100) 128 | inTrain <- createDataPartition(GermanCredit$Class, p = .8)[[1]] 129 | GermanCreditTrain <- GermanCredit[ inTrain, ] 130 | GermanCreditTest <- GermanCredit[-inTrain, ] 131 | 132 | set.seed(1056) 133 | logisticReg <- train(Class ~ ., 134 | data = GermanCreditTrain, 135 | method = "glm", 136 | trControl = trainControl(method = "repeatedcv", 137 | repeats = 5)) 138 | logisticReg 139 | 140 | ### Predict the test set 141 | creditResults <- data.frame(obs = GermanCreditTest$Class) 142 | creditResults$prob <- predict(logisticReg, GermanCreditTest, type = "prob")[, "Bad"] 143 | creditResults$pred <- predict(logisticReg, GermanCreditTest) 144 | creditResults$Label <- ifelse(creditResults$obs == "Bad", 145 | "True Outcome: Bad Credit", 146 | "True Outcome: Good Credit") 147 | 148 | ### Plot the probability of bad credit 149 | histogram(~prob|Label, 150 | data = creditResults, 151 | layout = c(2, 1), 152 | nint = 20, 153 | xlab = "Probability of Bad Credit", 154 | type = "count") 155 | 156 | ### Calculate and plot the calibration curve 157 | creditCalib <- calibration(obs ~ prob, data = creditResults) 158 | xyplot(creditCalib) 159 | 160 | ### Create the confusion matrix from the test set. 161 | confusionMatrix(data = creditResults$pred, 162 | reference = creditResults$obs) 163 | 164 | ### ROC curves: 165 | 166 | ### Like glm(), roc() treats the last level of the factor as the event 167 | ### of interest so we use relevel() to change the observed class data 168 | 169 | library(pROC) 170 | creditROC <- roc(relevel(creditResults$obs, "Good"), creditResults$prob) 171 | 172 | coords(creditROC, "all")[,1:3] 173 | 174 | auc(creditROC) 175 | ci.auc(creditROC) 176 | 177 | ### Note the x-axis is reversed 178 | plot(creditROC) 179 | 180 | ### Old-school: 181 | plot(creditROC, legacy.axes = TRUE) 182 | 183 | ### Lift charts 184 | 185 | creditLift <- lift(obs ~ prob, data = creditResults) 186 | xyplot(creditLift) 187 | 188 | 189 | ################################################################################ 190 | ### Session Information 191 | 192 | sessionInfo() 193 | 194 | q("no") 195 | -------------------------------------------------------------------------------- /inst/chapters/11_Class_Performance.Rout: -------------------------------------------------------------------------------- 1 | 2 | R version 3.0.1 (2013-05-16) -- "Good Sport" 3 | Copyright (C) 2013 The R Foundation for Statistical Computing 4 | Platform: x86_64-apple-darwin10.8.0 (64-bit) 5 | 6 | R is free software and comes with ABSOLUTELY NO WARRANTY. 7 | You are welcome to redistribute it under certain conditions. 8 | Type 'license()' or 'licence()' for distribution details. 9 | 10 | Natural language support but running in an English locale 11 | 12 | R is a collaborative project with many contributors. 13 | Type 'contributors()' for more information and 14 | 'citation()' on how to cite R or R packages in publications. 
15 | 16 | Type 'demo()' for some demos, 'help()' for on-line help, or 17 | 'help.start()' for an HTML browser interface to help. 18 | Type 'q()' to quit R. 19 | 20 | > ################################################################################ 21 | > ### R code from Applied Predictive Modeling (2013) by Kuhn and Johnson. 22 | > ### Copyright 2013 Kuhn and Johnson 23 | > ### Web Page: http://www.appliedpredictivemodeling.com 24 | > ### Contact: Max Kuhn (mxkuhn@gmail.com) 25 | > ### 26 | > ### Chapter 11: Measuring Performance in Classification Models 27 | > ### 28 | > ### Required packages: AppliedPredictiveModeling, caret, MASS, randomForest, 29 | > ### pROC, klaR 30 | > ### 31 | > ### Data used: The solubility from the AppliedPredictiveModeling package 32 | > ### 33 | > ### Notes: 34 | > ### 1) This code is provided without warranty. 35 | > ### 36 | > ### 2) This code should help the user reproduce the results in the 37 | > ### text. There will be differences between this code and what is is 38 | > ### the computing section. For example, the computing sections show 39 | > ### how the source functions work (e.g. randomForest() or plsr()), 40 | > ### which were not directly used when creating the book. Also, there may be 41 | > ### syntax differences that occur over time as packages evolve. These files 42 | > ### will reflect those changes. 43 | > ### 44 | > ### 3) In some cases, the calculations in the book were run in 45 | > ### parallel. The sub-processes may reset the random number seed. 46 | > ### Your results may slightly vary. 47 | > ### 48 | > ################################################################################ 49 | > 50 | > ################################################################################ 51 | > ### Section 11.1 Class Predictions 52 | > 53 | > library(AppliedPredictiveModeling) 54 | > 55 | > ### Simulate some two class data with two predictors 56 | > set.seed(975) 57 | > training <- quadBoundaryFunc(500) 58 | > testing <- quadBoundaryFunc(1000) 59 | > testing$class2 <- ifelse(testing$class == "Class1", 1, 0) 60 | > testing$ID <- 1:nrow(testing) 61 | > 62 | > ### Fit models 63 | > library(MASS) 64 | > qdaFit <- qda(class ~ X1 + X2, data = training) 65 | > library(randomForest) 66 | randomForest 4.6-7 67 | Type rfNews() to see new features/changes/bug fixes. 68 | > rfFit <- randomForest(class ~ X1 + X2, data = training, ntree = 2000) 69 | > 70 | > ### Predict the test set 71 | > testing$qda <- predict(qdaFit, testing)$posterior[,1] 72 | > testing$rf <- predict(rfFit, testing, type = "prob")[,1] 73 | > 74 | > 75 | > ### Generate the calibration analysis 76 | > library(caret) 77 | Loading required package: lattice 78 | Loading required package: ggplot2 79 | > calData1 <- calibration(class ~ qda + rf, data = testing, cuts = 10) 80 | > 81 | > ### Plot the curve 82 | > xyplot(calData1, auto.key = list(columns = 2)) 83 | > 84 | > ### To calibrate the data, treat the probabilities as inputs into the 85 | > ### model 86 | > 87 | > trainProbs <- training 88 | > trainProbs$qda <- predict(qdaFit)$posterior[,1] 89 | > 90 | > ### These models take the probabilities as inputs and, based on the 91 | > ### true class, re-calibrate them. 92 | > library(klaR) 93 | > nbCal <- NaiveBayes(class ~ qda, data = trainProbs, usekernel = TRUE) 94 | > 95 | > ### We use relevel() here because glm() models the probability of the 96 | > ### second factor level. 
97 | > lrCal <- glm(relevel(class, "Class2") ~ qda, data = trainProbs, family = binomial) 98 | > 99 | > ### Now re-predict the test set using the modified class probability 100 | > ### estimates 101 | > testing$qda2 <- predict(nbCal, testing[, "qda", drop = FALSE])$posterior[,1] 102 | > testing$qda3 <- predict(lrCal, testing[, "qda", drop = FALSE], type = "response") 103 | > 104 | > 105 | > ### Manipulate the data a bit for pretty plotting 106 | > simulatedProbs <- testing[, c("class", "rf", "qda3")] 107 | > names(simulatedProbs) <- c("TrueClass", "RandomForestProb", "QDACalibrated") 108 | > simulatedProbs$RandomForestClass <- predict(rfFit, testing) 109 | > 110 | > calData2 <- calibration(class ~ qda + qda2 + qda3, data = testing) 111 | > calData2$data$calibModelVar <- as.character(calData2$data$calibModelVar) 112 | > calData2$data$calibModelVar <- ifelse(calData2$data$calibModelVar == "qda", 113 | + "QDA", 114 | + calData2$data$calibModelVar) 115 | > calData2$data$calibModelVar <- ifelse(calData2$data$calibModelVar == "qda2", 116 | + "Bayesian Calibration", 117 | + calData2$data$calibModelVar) 118 | > 119 | > calData2$data$calibModelVar <- ifelse(calData2$data$calibModelVar == "qda3", 120 | + "Sigmoidal Calibration", 121 | + calData2$data$calibModelVar) 122 | > 123 | > calData2$data$calibModelVar <- factor(calData2$data$calibModelVar, 124 | + levels = c("QDA", 125 | + "Bayesian Calibration", 126 | + "Sigmoidal Calibration")) 127 | > 128 | > xyplot(calData2, auto.key = list(columns = 1)) 129 | > 130 | > ### Recreate the model used in the over-fitting chapter 131 | > 132 | > library(caret) 133 | > data(GermanCredit) 134 | > 135 | > ## First, remove near-zero variance predictors then get rid of a few predictors 136 | > ## that duplicate values. For example, there are two possible values for the 137 | > ## housing variable: "Rent", "Own" and "ForFree". So that we don't have linear 138 | > ## dependencies, we get rid of one of the levels (e.g. "ForFree") 139 | > 140 | > GermanCredit <- GermanCredit[, -nearZeroVar(GermanCredit)] 141 | > GermanCredit$CheckingAccountStatus.lt.0 <- NULL 142 | > GermanCredit$SavingsAccountBonds.lt.100 <- NULL 143 | > GermanCredit$EmploymentDuration.lt.1 <- NULL 144 | > GermanCredit$EmploymentDuration.Unemployed <- NULL 145 | > GermanCredit$Personal.Male.Married.Widowed <- NULL 146 | > GermanCredit$Property.Unknown <- NULL 147 | > GermanCredit$Housing.ForFree <- NULL 148 | > 149 | > ## Split the data into training (80%) and test sets (20%) 150 | > set.seed(100) 151 | > inTrain <- createDataPartition(GermanCredit$Class, p = .8)[[1]] 152 | > GermanCreditTrain <- GermanCredit[ inTrain, ] 153 | > GermanCreditTest <- GermanCredit[-inTrain, ] 154 | > 155 | > set.seed(1056) 156 | > logisticReg <- train(Class ~ ., 157 | + data = GermanCreditTrain, 158 | + method = "glm", 159 | + trControl = trainControl(method = "repeatedcv", 160 | + repeats = 5)) 161 | Loading required package: class 162 | > logisticReg 163 | Generalized Linear Model 164 | 165 | 800 samples 166 | 41 predictors 167 | 2 classes: 'Bad', 'Good' 168 | 169 | No pre-processing 170 | Resampling: Cross-Validated (10 fold, repeated 5 times) 171 | 172 | Summary of sample sizes: 720, 720, 720, 720, 720, 720, ... 
173 | 174 | Resampling results 175 | 176 | Accuracy Kappa Accuracy SD Kappa SD 177 | 0.749 0.365 0.0516 0.122 178 | 179 | 180 | > 181 | > ### Predict the test set 182 | > creditResults <- data.frame(obs = GermanCreditTest$Class) 183 | > creditResults$prob <- predict(logisticReg, GermanCreditTest, type = "prob")[, "Bad"] 184 | > creditResults$pred <- predict(logisticReg, GermanCreditTest) 185 | > creditResults$Label <- ifelse(creditResults$obs == "Bad", 186 | + "True Outcome: Bad Credit", 187 | + "True Outcome: Good Credit") 188 | > 189 | > ### Plot the probability of bad credit 190 | > histogram(~prob|Label, 191 | + data = creditResults, 192 | + layout = c(2, 1), 193 | + nint = 20, 194 | + xlab = "Probability of Bad Credit", 195 | + type = "count") 196 | > 197 | > ### Calculate and plot the calibration curve 198 | > creditCalib <- calibration(obs ~ prob, data = creditResults) 199 | > xyplot(creditCalib) 200 | > 201 | > ### Create the confusion matrix from the test set. 202 | > confusionMatrix(data = creditResults$pred, 203 | + reference = creditResults$obs) 204 | Confusion Matrix and Statistics 205 | 206 | Reference 207 | Prediction Bad Good 208 | Bad 24 10 209 | Good 36 130 210 | 211 | Accuracy : 0.77 212 | 95% CI : (0.7054, 0.8264) 213 | No Information Rate : 0.7 214 | P-Value [Acc > NIR] : 0.0168694 215 | 216 | Kappa : 0.375 217 | Mcnemar's Test P-Value : 0.0002278 218 | 219 | Sensitivity : 0.4000 220 | Specificity : 0.9286 221 | Pos Pred Value : 0.7059 222 | Neg Pred Value : 0.7831 223 | Prevalence : 0.3000 224 | Detection Rate : 0.1200 225 | Detection Prevalence : 0.1700 226 | Balanced Accuracy : 0.6643 227 | 228 | 'Positive' Class : Bad 229 | 230 | > 231 | > ### ROC curves: 232 | > 233 | > ### Like glm(), roc() treats the last level of the factor as the event 234 | > ### of interest so we use relevel() to change the observed class data 235 | > 236 | > library(pROC) 237 | Loading required package: plyr 238 | Type 'citation("pROC")' for a citation. 239 | 240 | Attaching package: ‘pROC’ 241 | 242 | The following object is masked from ‘package:stats’: 243 | 244 | cov, smooth, var 245 | 246 | > creditROC <- roc(relevel(creditResults$obs, "Good"), creditResults$prob) 247 | > 248 | > coords(creditROC, "all")[,1:3] 249 | all all all 250 | threshold -Inf 0.006199758 0.009708574 251 | specificity 0 0.007142857 0.014285714 252 | sensitivity 1 1.000000000 1.000000000 253 | > 254 | > auc(creditROC) 255 | Area under the curve: 0.775 256 | > ci.auc(creditROC) 257 | 95% CI: 0.7032-0.8468 (DeLong) 258 | > 259 | > ### Note the x-axis is reversed 260 | > plot(creditROC) 261 | 262 | Call: 263 | roc.default(response = relevel(creditResults$obs, "Good"), predictor = creditResults$prob) 264 | 265 | Data: creditResults$prob in 140 controls (relevel(creditResults$obs, "Good") Good) < 60 cases (relevel(creditResults$obs, "Good") Bad). 266 | Area under the curve: 0.775 267 | > 268 | > ### Old-school: 269 | > plot(creditROC, legacy.axes = TRUE) 270 | 271 | Call: 272 | roc.default(response = relevel(creditResults$obs, "Good"), predictor = creditResults$prob) 273 | 274 | Data: creditResults$prob in 140 controls (relevel(creditResults$obs, "Good") Good) < 60 cases (relevel(creditResults$obs, "Good") Bad). 
275 | Area under the curve: 0.775 276 | > 277 | > ### Lift charts 278 | > 279 | > creditLift <- lift(obs ~ prob, data = creditResults) 280 | > xyplot(creditLift) 281 | > 282 | > 283 | > ################################################################################ 284 | > ### Session Information 285 | > 286 | > sessionInfo() 287 | R version 3.0.1 (2013-05-16) 288 | Platform: x86_64-apple-darwin10.8.0 (64-bit) 289 | 290 | locale: 291 | [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8 292 | 293 | attached base packages: 294 | [1] stats graphics grDevices utils datasets methods base 295 | 296 | other attached packages: 297 | [1] pROC_1.5.4 plyr_1.8 298 | [3] e1071_1.6-1 class_7.3-7 299 | [5] klaR_0.6-7 caret_6.0-22 300 | [7] ggplot2_0.9.3.1 lattice_0.20-15 301 | [9] randomForest_4.6-7 MASS_7.3-26 302 | [11] AppliedPredictiveModeling_1.1-5 303 | 304 | loaded via a namespace (and not attached): 305 | [1] car_2.0-16 codetools_0.2-8 colorspace_1.2-1 compiler_3.0.1 306 | [5] CORElearn_0.9.41 dichromat_2.0-0 digest_0.6.3 foreach_1.4.0 307 | [9] grid_3.0.1 gtable_0.1.2 iterators_1.0.6 labeling_0.1 308 | [13] munsell_0.4 proto_0.3-10 RColorBrewer_1.0-5 reshape2_1.2.2 309 | [17] scales_0.2.3 stringr_0.6.2 tools_3.0.1 310 | > 311 | > q("no") 312 | > proc.time() 313 | user system elapsed 314 | 11.120 0.526 11.698 315 | -------------------------------------------------------------------------------- /inst/chapters/12_Discriminant_Analysis.R: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | ### R code from Applied Predictive Modeling (2013) by Kuhn and Johnson. 3 | ### Copyright 2013 Kuhn and Johnson 4 | ### Web Page: http://www.appliedpredictivemodeling.com 5 | ### Contact: Max Kuhn (mxkuhn@gmail.com) 6 | ### 7 | ### Chapter 12 Discriminant Analysis and Other Linear Classification Models 8 | ### 9 | ### Required packages: AppliedPredictiveModeling, caret, doMC (optional), 10 | ### glmnet, lattice, MASS, pamr, pls, pROC, sparseLDA 11 | ### 12 | ### Data used: The grant application data. See the file 'CreateGrantData.R' 13 | ### 14 | ### Notes: 15 | ### 1) This code is provided without warranty. 16 | ### 17 | ### 2) This code should help the user reproduce the results in the 18 | ### text. There will be differences between this code and what is is 19 | ### the computing section. For example, the computing sections show 20 | ### how the source functions work (e.g. randomForest() or plsr()), 21 | ### which were not directly used when creating the book. Also, there may be 22 | ### syntax differences that occur over time as packages evolve. These files 23 | ### will reflect those changes. 24 | ### 25 | ### 3) In some cases, the calculations in the book were run in 26 | ### parallel. The sub-processes may reset the random number seed. 27 | ### Your results may slightly vary. 28 | ### 29 | ################################################################################ 30 | 31 | ################################################################################ 32 | ### Section 12.1 Case Study: Predicting Successful Grant Applications 33 | 34 | load("grantData.RData") 35 | 36 | library(caret) 37 | library(doMC) 38 | registerDoMC(12) 39 | library(plyr) 40 | library(reshape2) 41 | 42 | ## Look at two different ways to split and resample the data. A support vector 43 | ## machine is used to illustrate the differences. The full set of predictors 44 | ## is used. 
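### A minimal, self-contained sketch of the two resampling schemes being
### compared (illustrative only: it uses caret's built-in iris data rather
### than the grant data, and the object names below are arbitrary). The key
### difference is the 'index' argument of trainControl(): with "LGOCV" and a
### single set of row numbers, those rows are used for fitting and the
### remaining rows form one fixed hold-out set.

set.seed(1)
illusRows <- createDataPartition(iris$Species, p = .8)[[1]]
illusCV   <- trainControl(method = "cv", number = 10)
illusHold <- trainControl(method = "LGOCV",
                          index = list(TrainSet = illusRows))
illusFit1 <- train(Species ~ ., data = iris, method = "lda", trControl = illusCV)
illusFit2 <- train(Species ~ ., data = iris, method = "lda", trControl = illusHold)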
45 | 46 | pre2008Data <- training[pre2008,] 47 | year2008Data <- rbind(training[-pre2008,], testing) 48 | 49 | set.seed(552) 50 | test2008 <- createDataPartition(year2008Data$Class, p = .25)[[1]] 51 | 52 | allData <- rbind(pre2008Data, year2008Data[-test2008,]) 53 | holdout2008 <- year2008Data[test2008,] 54 | 55 | ## Use a common tuning grid for both approaches. 56 | svmrGrid <- expand.grid(sigma = c(.00007, .00009, .0001, .0002), 57 | C = 2^(-3:8)) 58 | 59 | ## Evaluate the model using overall 10-fold cross-validation 60 | ctrl0 <- trainControl(method = "cv", 61 | summaryFunction = twoClassSummary, 62 | classProbs = TRUE) 63 | set.seed(477) 64 | svmFit0 <- train(pre2008Data[,fullSet], pre2008Data$Class, 65 | method = "svmRadial", 66 | tuneGrid = svmrGrid, 67 | preProc = c("center", "scale"), 68 | metric = "ROC", 69 | trControl = ctrl0) 70 | svmFit0 71 | 72 | ### Now fit the single 2008 test set 73 | ctrl00 <- trainControl(method = "LGOCV", 74 | summaryFunction = twoClassSummary, 75 | classProbs = TRUE, 76 | index = list(TestSet = 1:nrow(pre2008Data))) 77 | 78 | 79 | set.seed(476) 80 | svmFit00 <- train(allData[,fullSet], allData$Class, 81 | method = "svmRadial", 82 | tuneGrid = svmrGrid, 83 | preProc = c("center", "scale"), 84 | metric = "ROC", 85 | trControl = ctrl00) 86 | svmFit00 87 | 88 | ## Combine the two sets of results and plot 89 | 90 | grid0 <- subset(svmFit0$results, sigma == svmFit0$bestTune$sigma) 91 | grid0$Model <- "10-Fold Cross-Validation" 92 | 93 | grid00 <- subset(svmFit00$results, sigma == svmFit00$bestTune$sigma) 94 | grid00$Model <- "Single 2008 Test Set" 95 | 96 | plotData <- rbind(grid00, grid0) 97 | 98 | plotData <- plotData[!is.na(plotData$ROC),] 99 | xyplot(ROC ~ C, data = plotData, 100 | groups = Model, 101 | type = c("g", "o"), 102 | scales = list(x = list(log = 2)), 103 | auto.key = list(columns = 1)) 104 | 105 | ################################################################################ 106 | ### Section 12.2 Logistic Regression 107 | 108 | modelFit <- glm(Class ~ Day, data = training[pre2008,], family = binomial) 109 | dataGrid <- data.frame(Day = seq(0, 365, length = 500)) 110 | dataGrid$Linear <- 1 - predict(modelFit, dataGrid, type = "response") 111 | linear2008 <- auc(roc(response = training[-pre2008, "Class"], 112 | predictor = 1 - predict(modelFit, 113 | training[-pre2008,], 114 | type = "response"), 115 | levels = rev(levels(training[-pre2008, "Class"])))) 116 | 117 | 118 | modelFit2 <- glm(Class ~ Day + I(Day^2), 119 | data = training[pre2008,], 120 | family = binomial) 121 | dataGrid$Quadratic <- 1 - predict(modelFit2, dataGrid, type = "response") 122 | quad2008 <- auc(roc(response = training[-pre2008, "Class"], 123 | predictor = 1 - predict(modelFit2, 124 | training[-pre2008,], 125 | type = "response"), 126 | levels = rev(levels(training[-pre2008, "Class"])))) 127 | 128 | dataGrid <- plyr::melt(dataGrid, id.vars = "Day") 129 | 130 | byDay <- training[pre2008, c("Day", "Class")] 131 | byDay$Binned <- cut(byDay$Day, seq(0, 360, by = 5)) 132 | 133 | observedProps <- ddply(byDay, .(Binned), 134 | function(x) c(n = nrow(x), mean = mean(x$Class == "successful"))) 135 | observedProps$midpoint <- seq(2.5, 357.5, by = 5) 136 | 137 | xyplot(value ~ Day|variable, data = dataGrid, 138 | ylab = "Probability of A Successful Grant", 139 | ylim = extendrange(0:1), 140 | between = list(x = 1), 141 | panel = function(...) 
142 | { 143 | panel.xyplot(x = observedProps$midpoint, observedProps$mean, 144 | pch = 16., col = rgb(.2, .2, .2, .5)) 145 | panel.xyplot(..., type = "l", col = "black", lwd = 2) 146 | }) 147 | 148 | ## For the reduced set of factors, fit the logistic regression model (linear and 149 | ## quadratic) and evaluate on the 150 | training$Day2 <- training$Day^2 151 | testing$Day2 <- testing$Day^2 152 | fullSet <- c(fullSet, "Day2") 153 | reducedSet <- c(reducedSet, "Day2") 154 | 155 | ## This control object will be used across multiple models so that the 156 | ## data splitting is consistent 157 | 158 | ctrl <- trainControl(method = "LGOCV", 159 | summaryFunction = twoClassSummary, 160 | classProbs = TRUE, 161 | index = list(TrainSet = pre2008), 162 | savePredictions = TRUE) 163 | 164 | set.seed(476) 165 | lrFit <- train(x = training[,reducedSet], 166 | y = training$Class, 167 | method = "glm", 168 | metric = "ROC", 169 | trControl = ctrl) 170 | lrFit 171 | set.seed(476) 172 | lrFit2 <- train(x = training[,c(fullSet, "Day2")], 173 | y = training$Class, 174 | method = "glm", 175 | metric = "ROC", 176 | trControl = ctrl) 177 | lrFit2 178 | 179 | lrFit$pred <- merge(lrFit$pred, lrFit$bestTune) 180 | 181 | ## Get the confusion matrices for the hold-out set 182 | lrCM <- confusionMatrix(lrFit, norm = "none") 183 | lrCM 184 | lrCM2 <- confusionMatrix(lrFit2, norm = "none") 185 | lrCM2 186 | 187 | ## Get the area under the ROC curve for the hold-out set 188 | lrRoc <- roc(response = lrFit$pred$obs, 189 | predictor = lrFit$pred$successful, 190 | levels = rev(levels(lrFit$pred$obs))) 191 | lrRoc2 <- roc(response = lrFit2$pred$obs, 192 | predictor = lrFit2$pred$successful, 193 | levels = rev(levels(lrFit2$pred$obs))) 194 | lrImp <- varImp(lrFit, scale = FALSE) 195 | 196 | plot(lrRoc, legacy.axes = TRUE) 197 | 198 | ################################################################################ 199 | ### Section 12.3 Linear Discriminant Analysis 200 | 201 | ## Fit the model to the reduced set 202 | set.seed(476) 203 | ldaFit <- train(x = training[,reducedSet], 204 | y = training$Class, 205 | method = "lda", 206 | preProc = c("center","scale"), 207 | metric = "ROC", 208 | trControl = ctrl) 209 | ldaFit 210 | 211 | ldaFit$pred <- merge(ldaFit$pred, ldaFit$bestTune) 212 | ldaCM <- confusionMatrix(ldaFit, norm = "none") 213 | ldaCM 214 | ldaRoc <- roc(response = ldaFit$pred$obs, 215 | predictor = ldaFit$pred$successful, 216 | levels = rev(levels(ldaFit$pred$obs))) 217 | plot(lrRoc, type = "s", col = rgb(.2, .2, .2, .2), legacy.axes = TRUE) 218 | plot(ldaRoc, add = TRUE, type = "s", legacy.axes = TRUE) 219 | 220 | ################################################################################ 221 | ### Section 12.4 Partial Least Squares Discriminant Analysis 222 | 223 | ## This model uses all of the predictors 224 | set.seed(476) 225 | plsFit <- train(x = training[,fullSet], 226 | y = training$Class, 227 | method = "pls", 228 | tuneGrid = expand.grid(ncomp = 1:10), 229 | preProc = c("center","scale"), 230 | metric = "ROC", 231 | probMethod = "Bayes", 232 | trControl = ctrl) 233 | plsFit 234 | 235 | plsImpGrant <- varImp(plsFit, scale = FALSE) 236 | 237 | bestPlsNcomp <- plsFit$results[best(plsFit$results, "ROC", maximize = TRUE), "ncomp"] 238 | bestPlsROC <- plsFit$results[best(plsFit$results, "ROC", maximize = TRUE), "ROC"] 239 | 240 | ## Only keep the final tuning parameter data 241 | plsFit$pred <- merge(plsFit$pred, plsFit$bestTune) 242 | 243 | plsRoc <- roc(response = plsFit$pred$obs, 244 | predictor = 
plsFit$pred$successful, 245 | levels = rev(levels(plsFit$pred$obs))) 246 | 247 | ### PLS confusion matrix information 248 | plsCM <- confusionMatrix(plsFit, norm = "none") 249 | plsCM 250 | 251 | ## Now fit a model that uses a smaller set of predictors chosen by unsupervised 252 | ## filtering. 253 | 254 | set.seed(476) 255 | plsFit2 <- train(x = training[,reducedSet], 256 | y = training$Class, 257 | method = "pls", 258 | tuneGrid = expand.grid(ncomp = 1:10), 259 | preProc = c("center","scale"), 260 | metric = "ROC", 261 | probMethod = "Bayes", 262 | trControl = ctrl) 263 | plsFit2 264 | 265 | bestPlsNcomp2 <- plsFit2$results[best(plsFit2$results, "ROC", maximize = TRUE), "ncomp"] 266 | bestPlsROC2 <- plsFit2$results[best(plsFit2$results, "ROC", maximize = TRUE), "ROC"] 267 | 268 | plsFit2$pred <- merge(plsFit2$pred, plsFit2$bestTune) 269 | 270 | plsRoc2 <- roc(response = plsFit2$pred$obs, 271 | predictor = plsFit2$pred$successful, 272 | levels = rev(levels(plsFit2$pred$obs))) 273 | plsCM2 <- confusionMatrix(plsFit2, norm = "none") 274 | plsCM2 275 | 276 | pls.ROC <- cbind(plsFit$results,Descriptors="Full Set") 277 | pls2.ROC <- cbind(plsFit2$results,Descriptors="Reduced Set") 278 | 279 | plsCompareROC <- data.frame(rbind(pls.ROC,pls2.ROC)) 280 | 281 | xyplot(ROC ~ ncomp, 282 | data = plsCompareROC, 283 | xlab = "# Components", 284 | ylab = "ROC (2008 Hold-Out Data)", 285 | auto.key = list(columns = 2), 286 | groups = Descriptors, 287 | type = c("o", "g")) 288 | 289 | ## Plot ROC curves and variable importance scores 290 | plot(ldaRoc, type = "s", col = rgb(.2, .2, .2, .2), legacy.axes = TRUE) 291 | plot(lrRoc, type = "s", col = rgb(.2, .2, .2, .2), add = TRUE, legacy.axes = TRUE) 292 | plot(plsRoc2, type = "s", add = TRUE, legacy.axes = TRUE) 293 | 294 | plot(plsImpGrant, top=20, scales = list(y = list(cex = .95))) 295 | 296 | ################################################################################ 297 | ### Section 12.5 Penalized Models 298 | 299 | ## The glmnet model 300 | glmnGrid <- expand.grid(alpha = c(0, .1, .2, .4, .6, .8, 1), 301 | lambda = seq(.01, .2, length = 40)) 302 | set.seed(476) 303 | glmnFit <- train(x = training[,fullSet], 304 | y = training$Class, 305 | method = "glmnet", 306 | tuneGrid = glmnGrid, 307 | preProc = c("center", "scale"), 308 | metric = "ROC", 309 | trControl = ctrl) 310 | glmnFit 311 | 312 | glmnet2008 <- merge(glmnFit$pred, glmnFit$bestTune) 313 | glmnetCM <- confusionMatrix(glmnFit, norm = "none") 314 | glmnetCM 315 | 316 | glmnetRoc <- roc(response = glmnet2008$obs, 317 | predictor = glmnet2008$successful, 318 | levels = rev(levels(glmnet2008$obs))) 319 | 320 | glmnFit0 <- glmnFit 321 | glmnFit0$results$lambda <- format(round(glmnFit0$results$lambda, 3)) 322 | 323 | glmnPlot <- plot(glmnFit0, 324 | plotType = "level", 325 | cuts = 15, 326 | scales = list(x = list(rot = 90, cex = .65))) 327 | 328 | update(glmnPlot, 329 | ylab = "Mixing Percentage\nRidge <---------> Lasso", 330 | sub = "", 331 | main = "Area Under the ROC Curve", 332 | xlab = "Amount of Regularization") 333 | 334 | plot(plsRoc2, type = "s", col = rgb(.2, .2, .2, .2), legacy.axes = TRUE) 335 | plot(ldaRoc, type = "s", add = TRUE, col = rgb(.2, .2, .2, .2), legacy.axes = TRUE) 336 | plot(lrRoc, type = "s", col = rgb(.2, .2, .2, .2), add = TRUE, legacy.axes = TRUE) 337 | plot(glmnetRoc, type = "s", add = TRUE, legacy.axes = TRUE) 338 | 339 | ## Sparse logistic regression 340 | 341 | set.seed(476) 342 | spLDAFit <- train(x = training[,fullSet], 343 | y = training$Class, 344 | 
"sparseLDA", 345 | tuneGrid = expand.grid(lambda = c(.1), 346 | NumVars = c(1:20, 50, 75, 100, 250, 500, 750, 1000)), 347 | preProc = c("center", "scale"), 348 | metric = "ROC", 349 | trControl = ctrl) 350 | spLDAFit 351 | 352 | spLDA2008 <- merge(spLDAFit$pred, spLDAFit$bestTune) 353 | spLDACM <- confusionMatrix(spLDAFit, norm = "none") 354 | spLDACM 355 | 356 | spLDARoc <- roc(response = spLDA2008$obs, 357 | predictor = spLDA2008$successful, 358 | levels = rev(levels(spLDA2008$obs))) 359 | 360 | update(plot(spLDAFit, scales = list(x = list(log = 10))), 361 | ylab = "ROC AUC (2008 Hold-Out Data)") 362 | 363 | plot(plsRoc2, type = "s", col = rgb(.2, .2, .2, .2), legacy.axes = TRUE) 364 | plot(glmnetRoc, type = "s", add = TRUE, col = rgb(.2, .2, .2, .2), legacy.axes = TRUE) 365 | plot(ldaRoc, type = "s", add = TRUE, col = rgb(.2, .2, .2, .2), legacy.axes = TRUE) 366 | plot(lrRoc, type = "s", col = rgb(.2, .2, .2, .2), add = TRUE, legacy.axes = TRUE) 367 | plot(spLDARoc, type = "s", add = TRUE, legacy.axes = TRUE) 368 | 369 | ################################################################################ 370 | ### Section 12.6 Nearest Shrunken Centroids 371 | 372 | set.seed(476) 373 | nscFit <- train(x = training[,fullSet], 374 | y = training$Class, 375 | method = "pam", 376 | preProc = c("center", "scale"), 377 | tuneGrid = data.frame(threshold = seq(0, 25, length = 30)), 378 | metric = "ROC", 379 | trControl = ctrl) 380 | nscFit 381 | 382 | nsc2008 <- merge(nscFit$pred, nscFit$bestTune) 383 | nscCM <- confusionMatrix(nscFit, norm = "none") 384 | nscCM 385 | nscRoc <- roc(response = nsc2008$obs, 386 | predictor = nsc2008$successful, 387 | levels = rev(levels(nsc2008$obs))) 388 | update(plot(nscFit), ylab = "ROC AUC (2008 Hold-Out Data)") 389 | 390 | 391 | plot(plsRoc2, type = "s", col = rgb(.2, .2, .2, .2), legacy.axes = TRUE) 392 | plot(glmnetRoc, type = "s", add = TRUE, col = rgb(.2, .2, .2, .2), legacy.axes = TRUE) 393 | plot(ldaRoc, type = "s", add = TRUE, col = rgb(.2, .2, .2, .2), legacy.axes = TRUE) 394 | plot(lrRoc, type = "s", col = rgb(.2, .2, .2, .2), add = TRUE, legacy.axes = TRUE) 395 | plot(spLDARoc, type = "s", col = rgb(.2, .2, .2, .2), add = TRUE, legacy.axes = TRUE) 396 | plot(nscRoc, type = "s", add = TRUE, legacy.axes = TRUE) 397 | 398 | sessionInfo() 399 | 400 | q("no") 401 | 402 | -------------------------------------------------------------------------------- /inst/chapters/13_Non-Linear_Class.R: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | ### R code from Applied Predictive Modeling (2013) by Kuhn and Johnson. 3 | ### Copyright 2013 Kuhn and Johnson 4 | ### Web Page: http://www.appliedpredictivemodeling.com 5 | ### Contact: Max Kuhn (mxkuhn@gmail.com) 6 | ### 7 | ### Chapter 13 Non-Linear Classification Models 8 | ### 9 | ### Required packages: AppliedPredictiveModeling, caret, doMC (optional) 10 | ### kernlab, klaR, lattice, latticeExtra, MASS, mda, nnet, 11 | ### pROC 12 | ### 13 | ### Data used: The grant application data. See the file 'CreateGrantData.R' 14 | ### 15 | ### Notes: 16 | ### 1) This code is provided without warranty. 17 | ### 18 | ### 2) This code should help the user reproduce the results in the 19 | ### text. There will be differences between this code and what is is 20 | ### the computing section. For example, the computing sections show 21 | ### how the source functions work (e.g. 
randomForest() or plsr()), 22 | ### which were not directly used when creating the book. Also, there may be 23 | ### syntax differences that occur over time as packages evolve. These files 24 | ### will reflect those changes. 25 | ### 26 | ### 3) In some cases, the calculations in the book were run in 27 | ### parallel. The sub-processes may reset the random number seed. 28 | ### Your results may slightly vary. 29 | ### 30 | ################################################################################ 31 | 32 | ################################################################################ 33 | ### Section 13.1 Nonlinear Discriminant Analysis 34 | 35 | 36 | load("grantData.RData") 37 | 38 | library(caret) 39 | 40 | ### Optional: parallel processing can be used via the 'do' packages, 41 | ### such as doMC, doMPI etc. We used doMC (not on Windows) to speed 42 | ### up the computations. 43 | 44 | ### WARNING: Be aware of how much memory is needed to parallel 45 | ### process. It can very quickly overwhelm the available hardware. We 46 | ### estimate the memory usage (VSIZE = total memory size) to be 47 | ### 2700M/core. 48 | 49 | library(doMC) 50 | registerDoMC(12) 51 | 52 | ## This control object will be used across multiple models so that the 53 | ## data splitting is consistent 54 | 55 | ctrl <- trainControl(method = "LGOCV", 56 | summaryFunction = twoClassSummary, 57 | classProbs = TRUE, 58 | index = list(TrainSet = pre2008), 59 | savePredictions = TRUE) 60 | 61 | set.seed(476) 62 | mdaFit <- train(x = training[,reducedSet], 63 | y = training$Class, 64 | method = "mda", 65 | metric = "ROC", 66 | tries = 40, 67 | tuneGrid = expand.grid(subclasses = 1:8), 68 | trControl = ctrl) 69 | mdaFit 70 | 71 | mdaFit$results <- mdaFit$results[!is.na(mdaFit$results$ROC),] 72 | mdaFit$pred <- merge(mdaFit$pred, mdaFit$bestTune) 73 | mdaCM <- confusionMatrix(mdaFit, norm = "none") 74 | mdaCM 75 | 76 | mdaRoc <- roc(response = mdaFit$pred$obs, 77 | predictor = mdaFit$pred$successful, 78 | levels = rev(levels(mdaFit$pred$obs))) 79 | mdaRoc 80 | 81 | update(plot(mdaFit, 82 | ylab = "ROC AUC (2008 Hold-Out Data)")) 83 | 84 | ################################################################################ 85 | ### Section 13.2 Neural Networks 86 | 87 | nnetGrid <- expand.grid(size = 1:10, decay = c(0, .1, 1, 2)) 88 | maxSize <- max(nnetGrid$size) 89 | 90 | 91 | ## Four different models are evaluate based on the data pre-processing and 92 | ## whethera single or multiple models are used 93 | 94 | set.seed(476) 95 | nnetFit <- train(x = training[,reducedSet], 96 | y = training$Class, 97 | method = "nnet", 98 | metric = "ROC", 99 | preProc = c("center", "scale"), 100 | tuneGrid = nnetGrid, 101 | trace = FALSE, 102 | maxit = 2000, 103 | MaxNWts = 1*(maxSize * (length(reducedSet) + 1) + maxSize + 1), 104 | trControl = ctrl) 105 | nnetFit 106 | 107 | set.seed(476) 108 | nnetFit2 <- train(x = training[,reducedSet], 109 | y = training$Class, 110 | method = "nnet", 111 | metric = "ROC", 112 | preProc = c("center", "scale", "spatialSign"), 113 | tuneGrid = nnetGrid, 114 | trace = FALSE, 115 | maxit = 2000, 116 | MaxNWts = 1*(maxSize * (length(reducedSet) + 1) + maxSize + 1), 117 | trControl = ctrl) 118 | nnetFit2 119 | 120 | nnetGrid$bag <- FALSE 121 | 122 | set.seed(476) 123 | nnetFit3 <- train(x = training[,reducedSet], 124 | y = training$Class, 125 | method = "avNNet", 126 | metric = "ROC", 127 | preProc = c("center", "scale"), 128 | tuneGrid = nnetGrid, 129 | repeats = 10, 130 | trace = FALSE, 131 | maxit = 
2000, 132 | MaxNWts = 10*(maxSize * (length(reducedSet) + 1) + maxSize + 1), 133 | allowParallel = FALSE, ## this will cause to many workers to be launched. 134 | trControl = ctrl) 135 | nnetFit3 136 | 137 | set.seed(476) 138 | nnetFit4 <- train(x = training[,reducedSet], 139 | y = training$Class, 140 | method = "avNNet", 141 | metric = "ROC", 142 | preProc = c("center", "scale", "spatialSign"), 143 | tuneGrid = nnetGrid, 144 | trace = FALSE, 145 | maxit = 2000, 146 | repeats = 10, 147 | MaxNWts = 10*(maxSize * (length(reducedSet) + 1) + maxSize + 1), 148 | allowParallel = FALSE, 149 | trControl = ctrl) 150 | nnetFit4 151 | 152 | nnetFit4$pred <- merge(nnetFit4$pred, nnetFit4$bestTune) 153 | nnetCM <- confusionMatrix(nnetFit4, norm = "none") 154 | nnetCM 155 | 156 | nnetRoc <- roc(response = nnetFit4$pred$obs, 157 | predictor = nnetFit4$pred$successful, 158 | levels = rev(levels(nnetFit4$pred$obs))) 159 | 160 | 161 | nnet1 <- nnetFit$results 162 | nnet1$Transform <- "No Transformation" 163 | nnet1$Model <- "Single Model" 164 | 165 | nnet2 <- nnetFit2$results 166 | nnet2$Transform <- "Spatial Sign" 167 | nnet2$Model <- "Single Model" 168 | 169 | nnet3 <- nnetFit3$results 170 | nnet3$Transform <- "No Transformation" 171 | nnet3$Model <- "Model Averaging" 172 | nnet3$bag <- NULL 173 | 174 | nnet4 <- nnetFit4$results 175 | nnet4$Transform <- "Spatial Sign" 176 | nnet4$Model <- "Model Averaging" 177 | nnet4$bag <- NULL 178 | 179 | nnetResults <- rbind(nnet1, nnet2, nnet3, nnet4) 180 | nnetResults$Model <- factor(as.character(nnetResults$Model), 181 | levels = c("Single Model", "Model Averaging")) 182 | library(latticeExtra) 183 | useOuterStrips( 184 | xyplot(ROC ~ size|Model*Transform, 185 | data = nnetResults, 186 | groups = decay, 187 | as.table = TRUE, 188 | type = c("p", "l", "g"), 189 | lty = 1, 190 | ylab = "ROC AUC (2008 Hold-Out Data)", 191 | xlab = "Number of Hidden Units", 192 | auto.key = list(columns = 4, 193 | title = "Weight Decay", 194 | cex.title = 1))) 195 | 196 | plot(nnetRoc, type = "s", legacy.axes = TRUE) 197 | 198 | ################################################################################ 199 | ### Section 13.3 Flexible Discriminant Analysis 200 | 201 | set.seed(476) 202 | fdaFit <- train(x = training[,reducedSet], 203 | y = training$Class, 204 | method = "fda", 205 | metric = "ROC", 206 | tuneGrid = expand.grid(degree = 1, nprune = 2:25), 207 | trControl = ctrl) 208 | fdaFit 209 | 210 | fdaFit$pred <- merge(fdaFit$pred, fdaFit$bestTune) 211 | fdaCM <- confusionMatrix(fdaFit, norm = "none") 212 | fdaCM 213 | 214 | fdaRoc <- roc(response = fdaFit$pred$obs, 215 | predictor = fdaFit$pred$successful, 216 | levels = rev(levels(fdaFit$pred$obs))) 217 | 218 | update(plot(fdaFit), ylab = "ROC AUC (2008 Hold-Out Data)") 219 | 220 | plot(nnetRoc, type = "s", col = rgb(.2, .2, .2, .2), legacy.axes = TRUE) 221 | plot(fdaRoc, type = "s", add = TRUE, legacy.axes = TRUE) 222 | 223 | 224 | ################################################################################ 225 | ### Section 13.4 Support Vector Machines 226 | 227 | library(kernlab) 228 | 229 | set.seed(201) 230 | sigmaRangeFull <- sigest(as.matrix(training[,fullSet])) 231 | svmRGridFull <- expand.grid(sigma = as.vector(sigmaRangeFull)[1], 232 | C = 2^(-3:4)) 233 | set.seed(476) 234 | svmRFitFull <- train(x = training[,fullSet], 235 | y = training$Class, 236 | method = "svmRadial", 237 | metric = "ROC", 238 | preProc = c("center", "scale"), 239 | tuneGrid = svmRGridFull, 240 | trControl = ctrl) 241 | svmRFitFull 242 | 
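### The sigest() call above suggests plausible values for sigma, the scale
### parameter of kernlab's radial basis function kernel,
###   k(x, x') = exp(-sigma * ||x - x'||^2),
### while the cost C is tuned over a log-2 grid. A tiny hand-rolled sketch of
### the kernel (illustrative only; rbfIllus is not part of the original code)
### shows that larger sigma values make similarity decay faster with distance,
### so each support vector influences a smaller neighborhood:

rbfIllus <- function(u, v, sigma) exp(-sigma * sum((u - v)^2))
## kernel value between two fixed points for increasing sigma
sapply(c(1e-4, 1e-2, 1), rbfIllus, u = c(0, 0), v = c(1, 1))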
243 | set.seed(202) 244 | sigmaRangeReduced <- sigest(as.matrix(training[,reducedSet])) 245 | svmRGridReduced <- expand.grid(sigma = sigmaRangeReduced[1], 246 | C = 2^(seq(-4, 4))) 247 | set.seed(476) 248 | svmRFitReduced <- train(x = training[,reducedSet], 249 | y = training$Class, 250 | method = "svmRadial", 251 | metric = "ROC", 252 | preProc = c("center", "scale"), 253 | tuneGrid = svmRGridReduced, 254 | trControl = ctrl) 255 | svmRFitReduced 256 | 257 | svmPGrid <- expand.grid(degree = 1:2, 258 | scale = c(0.01, .005), 259 | C = 2^(seq(-6, -2, length = 10))) 260 | 261 | set.seed(476) 262 | svmPFitFull <- train(x = training[,fullSet], 263 | y = training$Class, 264 | method = "svmPoly", 265 | metric = "ROC", 266 | preProc = c("center", "scale"), 267 | tuneGrid = svmPGrid, 268 | trControl = ctrl) 269 | svmPFitFull 270 | 271 | svmPGrid2 <- expand.grid(degree = 1:2, 272 | scale = c(0.01, .005), 273 | C = 2^(seq(-6, -2, length = 10))) 274 | set.seed(476) 275 | svmPFitReduced <- train(x = training[,reducedSet], 276 | y = training$Class, 277 | method = "svmPoly", 278 | metric = "ROC", 279 | preProc = c("center", "scale"), 280 | tuneGrid = svmPGrid2, 281 | fit = FALSE, 282 | trControl = ctrl) 283 | svmPFitReduced 284 | 285 | svmPFitReduced$pred <- merge(svmPFitReduced$pred, svmPFitReduced$bestTune) 286 | svmPCM <- confusionMatrix(svmPFitReduced, norm = "none") 287 | svmPRoc <- roc(response = svmPFitReduced$pred$obs, 288 | predictor = svmPFitReduced$pred$successful, 289 | levels = rev(levels(svmPFitReduced$pred$obs))) 290 | 291 | 292 | svmRadialResults <- rbind(svmRFitReduced$results, 293 | svmRFitFull$results) 294 | svmRadialResults$Set <- c(rep("Reduced Set", nrow(svmRFitReduced$result)), 295 | rep("Full Set", nrow(svmRFitFull$result))) 296 | svmRadialResults$Sigma <- paste("sigma = ", 297 | format(svmRadialResults$sigma, 298 | scientific = FALSE, digits= 5)) 299 | svmRadialResults <- svmRadialResults[!is.na(svmRadialResults$ROC),] 300 | xyplot(ROC ~ C|Set, data = svmRadialResults, 301 | groups = Sigma, type = c("g", "o"), 302 | xlab = "Cost", 303 | ylab = "ROC (2008 Hold-Out Data)", 304 | auto.key = list(columns = 2), 305 | scales = list(x = list(log = 2))) 306 | 307 | svmPolyResults <- rbind(svmPFitReduced$results, 308 | svmPFitFull$results) 309 | svmPolyResults$Set <- c(rep("Reduced Set", nrow(svmPFitReduced$result)), 310 | rep("Full Set", nrow(svmPFitFull$result))) 311 | svmPolyResults <- svmPolyResults[!is.na(svmPolyResults$ROC),] 312 | svmPolyResults$scale <- paste("scale = ", 313 | format(svmPolyResults$scale, 314 | scientific = FALSE)) 315 | svmPolyResults$Degree <- "Linear" 316 | svmPolyResults$Degree[svmPolyResults$degree == 2] <- "Quadratic" 317 | useOuterStrips(xyplot(ROC ~ C|Degree*Set, data = svmPolyResults, 318 | groups = scale, type = c("g", "o"), 319 | xlab = "Cost", 320 | ylab = "ROC (2008 Hold-Out Data)", 321 | auto.key = list(columns = 2), 322 | scales = list(x = list(log = 2)))) 323 | 324 | plot(nnetRoc, type = "s", col = rgb(.2, .2, .2, .2), legacy.axes = TRUE) 325 | plot(fdaRoc, type = "s", add = TRUE, col = rgb(.2, .2, .2, .2), legacy.axes = TRUE) 326 | plot(svmPRoc, type = "s", add = TRUE, legacy.axes = TRUE) 327 | 328 | ################################################################################ 329 | ### Section 13.5 K-Nearest Neighbors 330 | 331 | 332 | set.seed(476) 333 | knnFit <- train(x = training[,reducedSet], 334 | y = training$Class, 335 | method = "knn", 336 | metric = "ROC", 337 | preProc = c("center", "scale"), 338 | tuneGrid = data.frame(k = 
c(4*(0:5)+1,20*(1:5)+1,50*(2:9)+1)), 339 | trControl = ctrl) 340 | knnFit 341 | 342 | knnFit$pred <- merge(knnFit$pred, knnFit$bestTune) 343 | knnCM <- confusionMatrix(knnFit, norm = "none") 344 | knnCM 345 | knnRoc <- roc(response = knnFit$pred$obs, 346 | predictor = knnFit$pred$successful, 347 | levels = rev(levels(knnFit$pred$obs))) 348 | 349 | update(plot(knnFit, ylab = "ROC (2008 Hold-Out Data)")) 350 | 351 | plot(fdaRoc, type = "s", col = rgb(.2, .2, .2, .2), legacy.axes = TRUE) 352 | plot(nnetRoc, type = "s", add = TRUE, col = rgb(.2, .2, .2, .2), legacy.axes = TRUE) 353 | plot(svmPRoc, type = "s", add = TRUE, col = rgb(.2, .2, .2, .2), legacy.axes = TRUE) 354 | plot(knnRoc, type = "s", add = TRUE, legacy.axes = TRUE) 355 | 356 | ################################################################################ 357 | ### Section 13.6 Naive Bayes 358 | 359 | ## Create factor versions of some of the predictors so that they are treated 360 | ## as categories and not dummy variables 361 | 362 | factors <- c("SponsorCode", "ContractValueBand", "Month", "Weekday") 363 | nbPredictors <- factorPredictors[factorPredictors %in% reducedSet] 364 | nbPredictors <- c(nbPredictors, factors) 365 | nbPredictors <- nbPredictors[nbPredictors != "SponsorUnk"] 366 | 367 | nbTraining <- training[, c("Class", nbPredictors)] 368 | nbTesting <- testing[, c("Class", nbPredictors)] 369 | 370 | for(i in nbPredictors) 371 | { 372 | if(length(unique(training[,i])) <= 15) 373 | { 374 | nbTraining[, i] <- factor(nbTraining[,i], levels = paste(sort(unique(training[,i])))) 375 | nbTesting[, i] <- factor(nbTesting[,i], levels = paste(sort(unique(training[,i])))) 376 | } 377 | } 378 | 379 | set.seed(476) 380 | nBayesFit <- train(x = nbTraining[,nbPredictors], 381 | y = nbTraining$Class, 382 | method = "nb", 383 | metric = "ROC", 384 | tuneGrid = data.frame(usekernel = c(TRUE, FALSE), fL = 2), 385 | trControl = ctrl) 386 | nBayesFit 387 | 388 | nBayesFit$pred <- merge(nBayesFit$pred, nBayesFit$bestTune) 389 | nBayesCM <- confusionMatrix(nBayesFit, norm = "none") 390 | nBayesCM 391 | nBayesRoc <- roc(response = nBayesFit$pred$obs, 392 | predictor = nBayesFit$pred$successful, 393 | levels = rev(levels(nBayesFit$pred$obs))) 394 | nBayesRoc 395 | 396 | 397 | sessionInfo() 398 | 399 | q("no") 400 | -------------------------------------------------------------------------------- /inst/chapters/17_Job_Scheduling.R: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | ### R code from Applied Predictive Modeling (2013) by Kuhn and Johnson. 3 | ### Copyright 2013 Kuhn and Johnson 4 | ### Web Page: http://www.appliedpredictivemodeling.com 5 | ### Contact: Max Kuhn (mxkuhn@gmail.com) 6 | ### 7 | ### Chapter 17: Case Study: Job Scheduling 8 | ### 9 | ### Required packages: AppliedPredictiveModeling, C50, caret, doMC (optional), 10 | ### earth, Hmisc, ipred, tabplot, kernlab, lattice, MASS, 11 | ### mda, nnet, pls, randomForest, rpart, sparseLDA, 12 | ### 13 | ### Data used: The HPC job scheduling data in the AppliedPredictiveModeling 14 | ### package. 15 | ### 16 | ### Notes: 17 | ### 1) This code is provided without warranty. 18 | ### 19 | ### 2) This code should help the user reproduce the results in the 20 | ### text. There will be differences between this code and what is is 21 | ### the computing section. For example, the computing sections show 22 | ### how the source functions work (e.g. 
randomForest() or plsr()), 23 | ### which were not directly used when creating the book. Also, there may be 24 | ### syntax differences that occur over time as packages evolve. These files 25 | ### will reflect those changes. 26 | ### 27 | ### 3) In some cases, the calculations in the book were run in 28 | ### parallel. The sub-processes may reset the random number seed. 29 | ### Your results may slightly vary. 30 | ### 31 | ################################################################################ 32 | 33 | library(AppliedPredictiveModeling) 34 | data(schedulingData) 35 | 36 | ### Make a vector of predictor names 37 | predictors <- names(schedulingData)[!(names(schedulingData) %in% c("Class"))] 38 | 39 | ### A few summaries and plots of the data 40 | library(Hmisc) 41 | describe(schedulingData) 42 | 43 | library(tabplot) 44 | tableplot(schedulingData[, c( "Class", predictors)]) 45 | 46 | mosaicplot(table(schedulingData$Protocol, 47 | schedulingData$Class), 48 | main = "") 49 | 50 | library(lattice) 51 | xyplot(Compounds ~ InputFields|Protocol, 52 | data = schedulingData, 53 | scales = list(x = list(log = 10), y = list(log = 10)), 54 | groups = Class, 55 | xlab = "Input Fields", 56 | auto.key = list(columns = 4), 57 | aspect = 1, 58 | as.table = TRUE) 59 | 60 | 61 | ################################################################################ 62 | ### Section 17.1 Data Splitting and Model Strategy 63 | 64 | ## Split the data 65 | 66 | library(caret) 67 | set.seed(1104) 68 | inTrain <- createDataPartition(schedulingData$Class, p = .8, list = FALSE) 69 | 70 | ### There are a lot of zeros and the distribution is skewed. We add 71 | ### one so that we can log transform the data 72 | schedulingData$NumPending <- schedulingData$NumPending + 1 73 | 74 | trainData <- schedulingData[ inTrain,] 75 | testData <- schedulingData[-inTrain,] 76 | 77 | ### Create a main effects only model formula to use 78 | ### repeatedly. Another formula with nonlinear effects is created 79 | ### below. 80 | modForm <- as.formula(Class ~ Protocol + log10(Compounds) + 81 | log10(InputFields)+ log10(Iterations) + 82 | log10(NumPending) + Hour + Day) 83 | 84 | ### Create an expanded set of predictors with interactions. 85 | 86 | modForm2 <- as.formula(Class ~ (Protocol + log10(Compounds) + 87 | log10(InputFields)+ log10(Iterations) + 88 | log10(NumPending) + Hour + Day)^2) 89 | 90 | 91 | ### Some of these terms will not be estimable. For example, if there 92 | ### are no data points were a particular protocol was run on a 93 | ### particular day, the full interaction cannot be computed. 
We use 94 | ### model.matrix() to create the whole set of predictor columns, then 95 | ### remove those that are zero variance 96 | 97 | expandedTrain <- model.matrix(modForm2, data = trainData) 98 | expandedTest <- model.matrix(modForm2, data = testData) 99 | expandedTrain <- as.data.frame(expandedTrain) 100 | expandedTest <- as.data.frame(expandedTest) 101 | 102 | ### Some models have issues when there is a zero variance predictor 103 | ### within the data of a particular class, so we used caret's 104 | ### checkConditionalX() function to find the offending columns and 105 | ### remove them 106 | 107 | zv <- checkConditionalX(expandedTrain, trainData$Class) 108 | 109 | ### Keep the expanded set to use for models where we must manually add 110 | ### more complex terms (such as logistic regression) 111 | 112 | expandedTrain <- expandedTrain[,-zv] 113 | expandedTest <- expandedTest[, -zv] 114 | 115 | ### Create the cost matrix 116 | costMatrix <- ifelse(diag(4) == 1, 0, 1) 117 | costMatrix[4, 1] <- 10 118 | costMatrix[3, 1] <- 5 119 | costMatrix[4, 2] <- 5 120 | costMatrix[3, 2] <- 5 121 | rownames(costMatrix) <- colnames(costMatrix) <- levels(trainData$Class) 122 | 123 | ### Create a cost function 124 | cost <- function(pred, obs) 125 | { 126 | isNA <- is.na(pred) 127 | if(!all(isNA)) 128 | { 129 | pred <- pred[!isNA] 130 | obs <- obs[!isNA] 131 | 132 | cost <- ifelse(pred == obs, 0, 1) 133 | if(any(pred == "VF" & obs == "L")) cost[pred == "VF" & obs == "L"] <- 10 134 | if(any(pred == "F" & obs == "L")) cost[pred == "F" & obs == "L"] <- 5 135 | if(any(pred == "F" & obs == "M")) cost[pred == "F" & obs == "M"] <- 5 136 | if(any(pred == "VF" & obs == "M")) cost[pred == "VF" & obs == "M"] <- 5 137 | out <- mean(cost) 138 | } else out <- NA 139 | out 140 | } 141 | 142 | ### Make a summary function that can be used with caret's train() function 143 | costSummary <- function (data, lev = NULL, model = NULL) 144 | { 145 | if (is.character(data$obs)) data$obs <- factor(data$obs, levels = lev) 146 | c(postResample(data[, "pred"], data[, "obs"]), 147 | Cost = cost(data[, "pred"], data[, "obs"])) 148 | } 149 | 150 | ### Create a control object for the models 151 | ctrl <- trainControl(method = "repeatedcv", 152 | repeats = 5, 153 | summaryFunction = costSummary) 154 | 155 | ### Optional: parallel processing can be used via the 'do' packages, 156 | ### such as doMC, doMPI etc. We used doMC (not on Windows) to speed 157 | ### up the computations. 158 | 159 | ### WARNING: Be aware of how much memory is needed to parallel 160 | ### process. It can very quickly overwhelm the available hardware. The 161 | ### estimate of the median memory usage (VSIZE = total memory size) 162 | ### was 3300-4100M per core although some calculations require as 163 | ### much as 3400M without parallel processing.
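### A quick sanity check of the cost() function defined above, on a small set
### of hypothetical predictions (illustrative only, not real model output).
### Exact matches cost 0, most errors cost 1, and the optimistic errors singled
### out above (e.g. calling a moderate or long job fast) are penalized by 5:

obsIllus  <- factor(c("VF", "F", "M", "L"), levels = c("VF", "F", "M", "L"))
predIllus <- factor(c("VF", "F", "F", "F"), levels = c("VF", "F", "M", "L"))
cost(predIllus, obsIllus)  ## (0 + 0 + 5 + 5) / 4 = 2.5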
164 | 165 | library(doMC) 166 | registerDoMC(14) 167 | 168 | ### Fit the CART model with and without costs 169 | 170 | set.seed(857) 171 | rpFit <- train(x = trainData[, predictors], 172 | y = trainData$Class, 173 | method = "rpart", 174 | metric = "Cost", 175 | maximize = FALSE, 176 | tuneLength = 20, 177 | trControl = ctrl) 178 | rpFit 179 | 180 | set.seed(857) 181 | rpFitCost <- train(x = trainData[, predictors], 182 | y = trainData$Class, 183 | method = "rpart", 184 | metric = "Cost", 185 | maximize = FALSE, 186 | tuneLength = 20, 187 | parms =list(loss = costMatrix), 188 | trControl = ctrl) 189 | rpFitCost 190 | 191 | set.seed(857) 192 | ldaFit <- train(x = expandedTrain, 193 | y = trainData$Class, 194 | method = "lda", 195 | metric = "Cost", 196 | maximize = FALSE, 197 | trControl = ctrl) 198 | ldaFit 199 | 200 | sldaGrid <- expand.grid(NumVars = seq(2, 112, by = 5), 201 | lambda = c(0, 0.01, .1, 1, 10)) 202 | set.seed(857) 203 | sldaFit <- train(x = expandedTrain, 204 | y = trainData$Class, 205 | method = "sparseLDA", 206 | tuneGrid = sldaGrid, 207 | preProc = c("center", "scale"), 208 | metric = "Cost", 209 | maximize = FALSE, 210 | trControl = ctrl) 211 | sldaFit 212 | 213 | set.seed(857) 214 | nnetGrid <- expand.grid(decay = c(0, 0.001, 0.01, .1, .5), 215 | size = (1:10)*2 - 1) 216 | nnetFit <- train(modForm, 217 | data = trainData, 218 | method = "nnet", 219 | metric = "Cost", 220 | maximize = FALSE, 221 | tuneGrid = nnetGrid, 222 | trace = FALSE, 223 | MaxNWts = 2000, 224 | maxit = 1000, 225 | preProc = c("center", "scale"), 226 | trControl = ctrl) 227 | nnetFit 228 | 229 | set.seed(857) 230 | plsFit <- train(x = expandedTrain, 231 | y = trainData$Class, 232 | method = "pls", 233 | metric = "Cost", 234 | maximize = FALSE, 235 | tuneLength = 100, 236 | preProc = c("center", "scale"), 237 | trControl = ctrl) 238 | plsFit 239 | 240 | set.seed(857) 241 | fdaFit <- train(modForm, data = trainData, 242 | method = "fda", 243 | metric = "Cost", 244 | maximize = FALSE, 245 | tuneLength = 25, 246 | trControl = ctrl) 247 | fdaFit 248 | 249 | set.seed(857) 250 | rfFit <- train(x = trainData[, predictors], 251 | y = trainData$Class, 252 | method = "rf", 253 | metric = "Cost", 254 | maximize = FALSE, 255 | tuneLength = 10, 256 | ntree = 2000, 257 | importance = TRUE, 258 | trControl = ctrl) 259 | rfFit 260 | 261 | set.seed(857) 262 | rfFitCost <- train(x = trainData[, predictors], 263 | y = trainData$Class, 264 | method = "rf", 265 | metric = "Cost", 266 | maximize = FALSE, 267 | tuneLength = 10, 268 | ntree = 2000, 269 | classwt = c(VF = 1, F = 1, M = 5, L = 10), 270 | importance = TRUE, 271 | trControl = ctrl) 272 | rfFitCost 273 | 274 | c5Grid <- expand.grid(trials = c(1, (1:10)*10), 275 | model = "tree", 276 | winnow = c(TRUE, FALSE)) 277 | set.seed(857) 278 | c50Fit <- train(x = trainData[, predictors], 279 | y = trainData$Class, 280 | method = "C5.0", 281 | metric = "Cost", 282 | maximize = FALSE, 283 | tuneGrid = c5Grid, 284 | trControl = ctrl) 285 | c50Fit 286 | 287 | set.seed(857) 288 | c50Cost <- train(x = trainData[, predictors], 289 | y = trainData$Class, 290 | method = "C5.0", 291 | metric = "Cost", 292 | maximize = FALSE, 293 | costs = costMatrix, 294 | tuneGrid = c5Grid, 295 | trControl = ctrl) 296 | c50Cost 297 | 298 | set.seed(857) 299 | bagFit <- train(x = trainData[, predictors], 300 | y = trainData$Class, 301 | method = "treebag", 302 | metric = "Cost", 303 | maximize = FALSE, 304 | nbagg = 50, 305 | trControl = ctrl) 306 | bagFit 307 | 308 | ### Use the caret bag() 
function to bag the cost-sensitive CART model 309 | rpCost <- function(x, y) 310 | { 311 | costMatrix <- ifelse(diag(4) == 1, 0, 1) 312 | costMatrix[4, 1] <- 10 313 | costMatrix[3, 1] <- 5 314 | costMatrix[4, 2] <- 5 315 | costMatrix[3, 2] <- 5 316 | library(rpart) 317 | tmp <- x 318 | tmp$y <- y 319 | rpart(y~., data = tmp, control = rpart.control(cp = 0), 320 | parms =list(loss = costMatrix)) 321 | } 322 | rpPredict <- function(object, x) predict(object, x) 323 | 324 | rpAgg <- function (x, type = "class") 325 | { 326 | pooled <- x[[1]] * NA 327 | n <- nrow(pooled) 328 | classes <- colnames(pooled) 329 | for (i in 1:ncol(pooled)) 330 | { 331 | tmp <- lapply(x, function(y, col) y[, col], col = i) 332 | tmp <- do.call("rbind", tmp) 333 | pooled[, i] <- apply(tmp, 2, median) 334 | } 335 | pooled <- apply(pooled, 1, function(x) x/sum(x)) 336 | if (n != nrow(pooled)) pooled <- t(pooled) 337 | out <- factor(classes[apply(pooled, 1, which.max)], levels = classes) 338 | out 339 | } 340 | 341 | 342 | set.seed(857) 343 | rpCostBag <- train(trainData[, predictors], 344 | trainData$Class, 345 | "bag", 346 | B = 50, 347 | bagControl = bagControl(fit = rpCost, 348 | predict = rpPredict, 349 | aggregate = rpAgg, 350 | downSample = FALSE, 351 | allowParallel = FALSE), 352 | trControl = ctrl) 353 | rpCostBag 354 | 355 | set.seed(857) 356 | svmRFit <- train(modForm , 357 | data = trainData, 358 | method = "svmRadial", 359 | metric = "Cost", 360 | maximize = FALSE, 361 | preProc = c("center", "scale"), 362 | tuneLength = 15, 363 | trControl = ctrl) 364 | svmRFit 365 | 366 | set.seed(857) 367 | svmRFitCost <- train(modForm, data = trainData, 368 | method = "svmRadial", 369 | metric = "Cost", 370 | maximize = FALSE, 371 | preProc = c("center", "scale"), 372 | class.weights = c(VF = 1, F = 1, M = 5, L = 10), 373 | tuneLength = 15, 374 | trControl = ctrl) 375 | svmRFitCost 376 | 377 | modelList <- list(C5.0 = c50Fit, 378 | "C5.0 (Costs)" = c50Cost, 379 | CART =rpFit, 380 | "CART (Costs)" = rpFitCost, 381 | "Bagging (Costs)" = rpCostBag, 382 | FDA = fdaFit, 383 | SVM = svmRFit, 384 | "SVM (Weights)" = svmRFitCost, 385 | PLS = plsFit, 386 | "Random Forests" = rfFit, 387 | LDA = ldaFit, 388 | "LDA (Sparse)" = sldaFit, 389 | "Neural Networks" = nnetFit, 390 | Bagging = bagFit) 391 | 392 | 393 | ################################################################################ 394 | ### Section 17.2 Results 395 | 396 | rs <- resamples(modelList) 397 | summary(rs) 398 | 399 | confusionMatrix(rpFitCost, "none") 400 | confusionMatrix(rfFit, "none") 401 | 402 | plot(bwplot(rs, metric = "Cost")) 403 | 404 | rfPred <- predict(rfFit, testData) 405 | rpPred <- predict(rpFitCost, testData) 406 | 407 | confusionMatrix(rfPred, testData$Class) 408 | confusionMatrix(rpPred, testData$Class) 409 | 410 | 411 | ################################################################################ 412 | ### Session Information 413 | 414 | sessionInfo() 415 | 416 | q("no") 417 | -------------------------------------------------------------------------------- /inst/chapters/18_Importance.R: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | ### R code from Applied Predictive Modeling (2013) by Kuhn and Johnson. 
3 | ### Copyright 2013 Kuhn and Johnson 4 | ### Web Page: http://www.appliedpredictivemodeling.com 5 | ### Contact: Max Kuhn (mxkuhn@gmail.com) 6 | ### 7 | ### Chapter 18: Measuring Predictor Importance 8 | ### 9 | ### Required packages: AppliedPredictiveModeling, caret, CORElearn, corrplot, 10 | ### pROC, minerva 11 | ### 12 | ### 13 | ### Data used: The solubility data from the AppliedPredictiveModeling 14 | ### package, the segmentation data in the caret package and the 15 | ### grant data (created using "CreateGrantData.R" in the same 16 | ### directory as this file). 17 | ### 18 | ### Notes: 19 | ### 1) This code is provided without warranty. 20 | ### 21 | ### 2) This code should help the user reproduce the results in the 22 | ### text. There will be differences between this code and what is is 23 | ### the computing section. For example, the computing sections show 24 | ### how the source functions work (e.g. randomForest() or plsr()), 25 | ### which were not directly used when creating the book. Also, there may be 26 | ### syntax differences that occur over time as packages evolve. These files 27 | ### will reflect those changes. 28 | ### 29 | ### 3) In some cases, the calculations in the book were run in 30 | ### parallel. The sub-processes may reset the random number seed. 31 | ### Your results may slightly vary. 32 | ### 33 | ################################################################################ 34 | 35 | 36 | 37 | ################################################################################ 38 | ### Section 18.1 Numeric Outcomes 39 | 40 | ## Load the solubility data 41 | 42 | library(AppliedPredictiveModeling) 43 | data(solubility) 44 | 45 | trainData <- solTrainXtrans 46 | trainData$y <- solTrainY 47 | 48 | 49 | ## keep the continuous predictors and append the outcome to the data frame 50 | SolContPred <- solTrainXtrans[, !grepl("FP", names(solTrainXtrans))] 51 | numSolPred <- ncol(SolContPred) 52 | SolContPred$Sol <- solTrainY 53 | 54 | ## Get the LOESS smoother and the summary measure 55 | library(caret) 56 | smoother <- filterVarImp(x = SolContPred[, -ncol(SolContPred)], 57 | y = solTrainY, 58 | nonpara = TRUE) 59 | smoother$Predictor <- rownames(smoother) 60 | names(smoother)[1] <- "Smoother" 61 | 62 | ## Calculate the correlation matrices and keep the columns with the correlations 63 | ## between the predictors and the outcome 64 | 65 | correlations <- cor(SolContPred)[-(numSolPred+1),(numSolPred+1)] 66 | rankCorrelations <- cor(SolContPred, method = "spearman")[-(numSolPred+1),(numSolPred+1)] 67 | corrs <- data.frame(Predictor = names(SolContPred)[1:numSolPred], 68 | Correlation = correlations, 69 | RankCorrelation = rankCorrelations) 70 | 71 | ## The maximal information coefficient (MIC) values can be obtained from the 72 | ### minerva package: 73 | 74 | library(minerva) 75 | MIC <- mine(x = SolContPred[, 1:numSolPred], y = solTrainY)$MIC 76 | MIC <- data.frame(Predictor = rownames(MIC), 77 | MIC = MIC[,1]) 78 | 79 | 80 | ## The Relief values for regression can be computed using the CORElearn 81 | ## package: 82 | 83 | library(CORElearn) 84 | ReliefF <- attrEval(Sol ~ ., data = SolContPred, 85 | estimator = "RReliefFequalK") 86 | ReliefF <- data.frame(Predictor = names(ReliefF), 87 | Relief = ReliefF) 88 | 89 | ## Combine them all together for a plot 90 | contDescrScores <- merge(smoother, corrs) 91 | contDescrScores <- merge(contDescrScores, MIC) 92 | contDescrScores <- merge(contDescrScores, ReliefF) 93 | 94 | rownames(contDescrScores) <- 
contDescrScores$Predictor 95 | 96 | contDescrScores 97 | 98 | contDescrSplomData <- contDescrScores 99 | contDescrSplomData$Correlation <- abs(contDescrSplomData$Correlation) 100 | contDescrSplomData$RankCorrelation <- abs(contDescrSplomData$RankCorrelation) 101 | contDescrSplomData$Group <- "Other" 102 | contDescrSplomData$Group[grepl("Surface", contDescrSplomData$Predictor)] <- "SA" 103 | 104 | featurePlot(solTrainXtrans[, c("NumCarbon", "SurfaceArea2")], 105 | solTrainY, 106 | between = list(x = 1), 107 | type = c("g", "p", "smooth"), 108 | df = 3, 109 | aspect = 1, 110 | labels = c("", "Solubility")) 111 | 112 | 113 | splom(~contDescrSplomData[,c(3, 4, 2, 5)], 114 | groups = contDescrSplomData$Group, 115 | varnames = c("Correlation", "Rank\nCorrelation", "LOESS", "MIC")) 116 | 117 | 118 | ## Now look at the categorical (i.e. binary) predictors 119 | SolCatPred <- solTrainXtrans[, grepl("FP", names(solTrainXtrans))] 120 | SolCatPred$Sol <- solTrainY 121 | numSolCatPred <- ncol(SolCatPred) - 1 122 | 123 | tests <- apply(SolCatPred[, 1:numSolCatPred], 2, 124 | function(x, y) 125 | { 126 | tStats <- t.test(y ~ x)[c("statistic", "p.value", "estimate")] 127 | unlist(tStats) 128 | }, 129 | y = solTrainY) 130 | ## The results are a matrix with predictors in columns. We reverse this 131 | tests <- as.data.frame(t(tests)) 132 | names(tests) <- c("t.Statistic", "t.test_p.value", "mean0", "mean1") 133 | tests$difference <- tests$mean1 - tests$mean0 134 | tests 135 | 136 | ## Create a volcano plot 137 | 138 | xyplot(-log10(t.test_p.value) ~ difference, 139 | data = tests, 140 | xlab = "Mean With Structure - Mean Without Structure", 141 | ylab = "-log(p-Value)", 142 | type = "p") 143 | 144 | ################################################################################ 145 | ### Section 18.2 Categorical Outcomes 146 | 147 | ## Load the segmentation data 148 | 149 | data(segmentationData) 150 | segTrain <- subset(segmentationData, Case == "Train") 151 | segTrain$Case <- segTrain$Cell <- NULL 152 | 153 | segTest <- subset(segmentationData, Case != "Train") 154 | segTest$Case <- segTest$Cell <- NULL 155 | 156 | ## Compute the areas under the ROC curve 157 | aucVals <- filterVarImp(x = segTrain[, -1], y = segTrain$Class) 158 | aucVals$Predictor <- rownames(aucVals) 159 | 160 | ## Cacluate the t-tests as before but with x and y switched 161 | segTests <- apply(segTrain[, -1], 2, 162 | function(x, y) 163 | { 164 | tStats <- t.test(x ~ y)[c("statistic", "p.value", "estimate")] 165 | unlist(tStats) 166 | }, 167 | y = segTrain$Class) 168 | segTests <- as.data.frame(t(segTests)) 169 | names(segTests) <- c("t.Statistic", "t.test_p.value", "mean0", "mean1") 170 | segTests$Predictor <- rownames(segTests) 171 | 172 | ## Fit a random forest model and get the importance scores 173 | library(randomForest) 174 | set.seed(791) 175 | rfImp <- randomForest(Class ~ ., data = segTrain, 176 | ntree = 2000, 177 | importance = TRUE) 178 | rfValues <- data.frame(RF = importance(rfImp)[, "MeanDecreaseGini"], 179 | Predictor = rownames(importance(rfImp))) 180 | 181 | ## Now compute the Relief scores 182 | set.seed(791) 183 | 184 | ReliefValues <- attrEval(Class ~ ., data = segTrain, 185 | estimator="ReliefFequalK", ReliefIterations = 50) 186 | ReliefValues <- data.frame(Relief = ReliefValues, 187 | Predictor = names(ReliefValues)) 188 | 189 | ## and the MIC statistics 190 | set.seed(791) 191 | segMIC <- mine(x = segTrain[, -1], 192 | ## Pass the outcome as 0/1 193 | y = ifelse(segTrain$Class == "PS", 1, 0))$MIC 194 | segMIC 
<- data.frame(Predictor = rownames(segMIC), 195 | MIC = segMIC[,1]) 196 | 197 | 198 | rankings <- merge(segMIC, ReliefValues) 199 | rankings <- merge(rankings, rfValues) 200 | rankings <- merge(rankings, segTests) 201 | rankings <- merge(rankings, aucVals) 202 | rankings 203 | 204 | rankings$channel <- "Channel 1" 205 | rankings$channel[grepl("Ch2$", rankings$Predictor)] <- "Channel 2" 206 | rankings$channel[grepl("Ch3$", rankings$Predictor)] <- "Channel 3" 207 | rankings$channel[grepl("Ch4$", rankings$Predictor)] <- "Channel 4" 208 | rankings$t.Statistic <- abs(rankings$t.Statistic) 209 | 210 | splom(~rankings[, c("PS", "t.Statistic", "RF", "Relief", "MIC")], 211 | groups = rankings$channel, 212 | varnames = c("ROC\nAUC", "Abs\nt-Stat", "Random\nForest", "Relief", "MIC"), 213 | auto.key = list(columns = 2)) 214 | 215 | 216 | ## Load the grant data. A script to create and save these data is contained 217 | ## in the same directory as this file. 218 | 219 | load("grantData.RData") 220 | 221 | dataSubset <- training[pre2008, c("Sponsor62B", "ContractValueBandUnk", "RFCD240302")] 222 | 223 | ## This is a simple function to compute several statistics for binary predictors 224 | tableCalcs <- function(x, y) 225 | { 226 | tab <- table(x, y) 227 | fet <- fisher.test(tab) 228 | out <- c(OR = fet$estimate, 229 | P = fet$p.value, 230 | Gain = attrEval(y ~ x, estimator = "GainRatio")) 231 | } 232 | 233 | ## lapply() is used to execute the function on each column 234 | tableResults <- lapply(dataSubset, tableCalcs, y = training[pre2008, "Class"]) 235 | 236 | ## The results come back as a list of vectors, and "rbind" is used to join 237 | ## them together as rows of a table 238 | tableResults <- do.call("rbind", tableResults) 239 | tableResults 240 | 241 | ## The permuted Relief scores can be computed using a function from the 242 | ## AppliedPredictiveModeling package. 243 | 244 | permuted <- permuteRelief(x = training[pre2008, c("Sponsor62B", "Day", "NumCI")], 245 | y = training[pre2008, "Class"], 246 | nperm = 500, 247 | ### the remaining options are passed to attrEval() 248 | estimator = "ReliefFequalK", 249 | ReliefIterations = 50) 250 | 251 | ## The original Relief scores: 252 | permuted$observed 253 | 254 | ## The number of standard deviations away from the permuted mean: 255 | permuted$standardized 256 | 257 | ## The distributions of the scores if there were no relationship between the 258 | ## predictors and outcomes 259 | 260 | histogram(~value|Predictor, 261 | data = permuted$permutations, 262 | xlim = extendrange(permuted$permutations$value), 263 | xlab = "Relief Score") 264 | 265 | 266 | ################################################################################ 267 | ### Session Information 268 | 269 | sessionInfo() 270 | 271 | q("no") 272 | -------------------------------------------------------------------------------- /man/AlzheimerDisease.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{AlzheimerDisease} 5 | \alias{AlzheimerDisease} 6 | \alias{diagnosis} 7 | \alias{predictors} 8 | \title{Alzheimer's Disease CSF Data} 9 | \source{ 10 | Craig-Schapiro, R., Kuhn, M., Xiong, C., Pickering, E. H., Liu, J., 11 | Misko, T. P., Perrin, R. J., et al. (2011). Multiplexed Immunoassay Panel 12 | Identifies Novel CSF Biomarkers for Alzheimer's Disease Diagnosis and 13 | Prognosis. PLoS ONE, 6(4), e18850.
14 | } 15 | \value{ 16 | \item{diagnosis}{labels for the patients, either "Impaired" or 17 | "Control". } \item{predictors}{predictors for demographic data (e.g. age, 18 | gender), genotype and assay results.} 19 | } 20 | \description{ 21 | Washington University conducted a clinical study to determine if biological 22 | measurements made from cerebrospinal fluid (CSF) can be used to diagnose or 23 | predict Alzheimer's disease (Craig-Schapiro et al. 2011). These data are a 24 | modified version of the values used for the publication. 25 | } 26 | \details{ 27 | The R factor vector \code{diagnosis} contains the outcome data for 333 of 28 | the subjects. The demographic and laboratory results are collected in the 29 | data frame \code{predictors}. 30 | 31 | One important indicator of Alzheimer's disease is the genetic background of 32 | a subject. In particular, the versions of the Apolipoprotein E gene 33 | inherited from one's parents have an association with the disease. There are 34 | three variants of the gene: E2, E3 and E4. Since a child inherits a version 35 | of the gene from each parent, there are six possible combinations (e.g. 36 | E2/E2, E2/E3, and so on). This information is contained in the predictor column 37 | named \code{Genotype}. 38 | } 39 | \examples{ 40 | 41 | data(AlzheimerDisease) 42 | 43 | } 44 | \keyword{datasets} 45 | -------------------------------------------------------------------------------- /man/ChemicalManufacturingProcess.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{ChemicalManufacturingProcess} 5 | \alias{ChemicalManufacturingProcess} 6 | \title{Chemical Manufacturing Process Data} 7 | \value{ 8 | \code{ChemicalManufacturingProcess}: a data frame with columns for 9 | the outcome (\code{Yield}) and the predictors (\code{BiologicalMaterial01} 10 | through \code{BiologicalMaterial12} and \code{ManufacturingProcess01} through 11 | \code{ManufacturingProcess45}). 12 | } 13 | \description{ 14 | This data set contains information about a chemical manufacturing process, 15 | in which the goal is to understand the relationship between the process and 16 | the resulting final product yield. Raw material in this process is put 17 | through a sequence of 27 steps to generate the final pharmaceutical product. 18 | The starting material is generated from a biological unit and has a range of 19 | quality and characteristics. The objective in this project was to develop a 20 | model to predict percent yield of the manufacturing process. The data set 21 | consisted of 177 samples of biological material for which 57 characteristics 22 | were measured. Of the 57 characteristics, there were 12 measurements of the 23 | biological starting material, and 45 measurements of the manufacturing 24 | process. The process variables included measurements such as temperature, 25 | drying time, washing time, and concentrations of by-products at various 26 | steps. Some of the process measurements can be controlled, while others are 27 | observed. Predictors are continuous, count, or categorical; some are 28 | correlated, and some contain missing values. Samples are not independent 29 | because sets of samples come from the same batch of biological starting 30 | material.
31 | } 32 | \examples{ 33 | 34 | data(ChemicalManufacturingProcess) 35 | 36 | } 37 | \keyword{datasets} 38 | -------------------------------------------------------------------------------- /man/FuelEconomy.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{FuelEconomy} 5 | \alias{FuelEconomy} 6 | \alias{cars2010} 7 | \alias{cars2011} 8 | \alias{cars2012} 9 | \title{Fuel Economy Data} 10 | \value{ 11 | \item{cars2010}{data on cars from model year 2010. } 12 | \item{cars2011}{cars introduced in 2011 that were not in the model year 2010 13 | data.} \item{cars2012}{cars introduced in 2012 that were not in the model 14 | year 2010 or 2011 data. } 15 | } 16 | \description{ 17 | The \url{http://fueleconomy.gov} website, run by the U.S. Department of 18 | Energy's Office of Energy Efficiency and Renewable Energy and the U.S. 19 | Environmental Protection Agency, lists different estimates of fuel economy 20 | for passenger cars and trucks. For each vehicle, various characteristics are 21 | recorded such as the engine displacement or number of cylinders. Along with 22 | these values, laboratory measurements are made for the city and highway 23 | miles per gallon (MPG) of the car. 24 | } 25 | \details{ 26 | Predictors extracted from the website include: \code{EngDispl}, 27 | \code{NumCyl}, \code{Transmission}, \code{AirAspirationMethod}, 28 | \code{NumGears}, \code{TransLockup}, \code{TransCreeperGear}, 29 | \code{DriveDesc}, \code{IntakeValvePerCyl}, \code{ExhaustValvesPerCyl}, 30 | \code{CarlineClassDesc}, \code{VarValveTiming} and \code{VarValveLift}. The 31 | outcome used in the book is in column \code{FE} and is the unadjusted 32 | highway data. 33 | } 34 | \examples{ 35 | 36 | data(FuelEconomy) 37 | 38 | library(lattice) 39 | 40 | ### Plot shown in the text: 41 | 42 | cars2010 <- cars2010[order(cars2010$EngDispl),] 43 | cars2011 <- cars2011[order(cars2011$EngDispl),] 44 | 45 | cars2010a <- cars2010 46 | cars2010a$Year <- "2010 Model Year" 47 | cars2011a <- cars2011 48 | cars2011a$Year <- "2011 Model Year" 49 | 50 | plotData <- rbind(cars2010a, cars2011a) 51 | 52 | plotTheme <- bookTheme(FALSE) 53 | plotTheme$plot.symbol$col <- rgb(.2, .2, .2, .5) 54 | plotTheme$plot.symbol$cex <- 0.7 55 | trellis.par.set(plotTheme) 56 | 57 | xyplot(FE ~ EngDispl|Year, plotData, 58 | xlab = "Engine Displacement", 59 | ylab = "Fuel Efficiency (MPG)", 60 | between = list(x = 1.2)) 61 | 62 | 63 | } 64 | \keyword{datasets} 65 | -------------------------------------------------------------------------------- /man/Hepatic.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{hepatic} 5 | \alias{hepatic} 6 | \alias{bio} 7 | \alias{chem} 8 | \alias{injury} 9 | \title{Hepatic Injury Data} 10 | \value{ 11 | \item{bio}{Biological screen results. } \item{chem}{Chemical 12 | fingerprints for sub-structures.} \item{injury}{A factor vector of 13 | outcomes.} 14 | } 15 | \description{ 16 | This data set was used to develop a model for predicting compounds' 17 | probability of causing hepatic injury (i.e. liver damage). This data set 18 | consisted of 281 unique compounds; 376 predictors were measured or computed 19 | for each. The response was categorical (either "None", "Mild" or "Severe" 20 | ), and was highly unbalanced.
21 | } 22 | \details{ 23 | This kind of response often occurs in pharmaceutical data because companies 24 | steer away from creating molecules that have undesirable characteristics. 25 | Therefore, well-behaved molecules often greatly outnumber undesirable 26 | molecules. The predictors consisted of measurements from 184 biological 27 | screens and 192 chemical feature predictors. The biological predictors 28 | represent activity for each screen and take values between 0 and 10 with a 29 | mode of 4. The chemical feature predictors represent counts of important 30 | sub-structures as well as measures of physical properties that are thought 31 | to be associated with hepatic injury. 32 | } 33 | \examples{ 34 | 35 | data(hepatic) 36 | 37 | } 38 | \keyword{datasets} 39 | -------------------------------------------------------------------------------- /man/abalone.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{abalone} 5 | \alias{abalone} 6 | \title{Abalone Data} 7 | \value{ 8 | \item{abalone}{a data frame with 4177 rows and 9 columns} 9 | } 10 | \description{ 11 | The Abalone data consist of data from 4177 abalones. The data consist of 12 | measurements of the type (male, female and infant), the longest shell 13 | measurement, the diameter, height and several weights (whole, shucked, 14 | viscera and shell). The outcome is the number of rings. The age of the 15 | abalone is the number of rings plus 1.5. 16 | } 17 | \details{ 18 | The data are taken from the UCI database 19 | (\url{http://archive.ics.uci.edu/ml/datasets/Abalone}). 20 | } 21 | \examples{ 22 | 23 | data(abalone) 24 | 25 | } 26 | \keyword{datasets} 27 | -------------------------------------------------------------------------------- /man/apm-internal.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \name{apm-internal} 4 | \alias{apm-internal} 5 | \alias{lowerp} 6 | \alias{upperp} 7 | \title{Internal Functions} 8 | \arguments{ 9 | \item{\dots}{optional arguments to pass to internal functions} 10 | } 11 | \description{ 12 | Internal functions 13 | } 14 | \author{ 15 | Max Kuhn 16 | } 17 | \keyword{internal} 18 | -------------------------------------------------------------------------------- /man/bookTheme.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/bookTheme.R 3 | \name{bookTheme} 4 | \alias{bookTheme} 5 | \alias{transparentTheme} 6 | \title{Lattice Themes} 7 | \usage{ 8 | bookTheme(set = TRUE) 9 | 10 | transparentTheme(set = TRUE, pchSize = 1, trans = 0.2) 11 | } 12 | \arguments{ 13 | \item{set}{a logical: should these settings be applied to the current 14 | device?} 15 | 16 | \item{pchSize}{the size of the plot symbols} 17 | 18 | \item{trans}{the amount of transparency (via the alpha channel). Note that 19 | transparency is not supported by all graphics devices.} 20 | } 21 | \value{ 22 | Each function returns a list of theme parameters. See Sarkar (2008) 23 | or \code{\link[lattice]{trellis.par.get}} for specific details. 24 | } 25 | \description{ 26 | Two \pkg{lattice} themes used throughout the book. 
27 | } 28 | \details{ 29 | When using these functions to save a plot, make sure to invoke them after 30 | the device has been opened (e.g. after calls such as \code{pdf()}). 31 | } 32 | \examples{ 33 | 34 | library(lattice) 35 | 36 | example <- quadBoundaryFunc(100) 37 | 38 | bookTheme(set = TRUE) 39 | xyplot(X2 ~ X1, data = example, groups = class, auto.key = TRUE) 40 | 41 | transparentTheme(set = TRUE, trans = .6) 42 | xyplot(X2 ~ X1, data = example, groups = class, auto.key = TRUE) 43 | 44 | } 45 | \references{ 46 | Some of the colors are based on values from ColorBrewer 47 | \url{http://www.colorbrewer.org}. 48 | 49 | Sarkar, D. (2008). Lattice: Multivariate Data Visualization with R. UseR! 50 | (1st ed. p. 286). Springer. 51 | } 52 | \author{ 53 | Max Kuhn 54 | } 55 | \keyword{hplot} 56 | -------------------------------------------------------------------------------- /man/concrete.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{concrete} 5 | \alias{concrete} 6 | \alias{mixtures} 7 | \title{Compressive Strength of Concrete from Yeh (1998)} 8 | \source{ 9 | Yeh, I. C. (1998). Modeling of strength of high-performance concrete 10 | using artificial neural networks. \emph{Cement and Concrete Research}, 11 | 28(12), 1797-1808. Elsevier. 12 | } 13 | \value{ 14 | \item{concrete}{data frame with predictor columns 15 | \code{Cement}, \code{BlastFurnaceSlag}, \code{FlyAsh}, \code{Water}, 16 | \code{Superplasticizer}, \code{CoarseAggregate}, \code{FineAggregate} and 17 | \code{Age} with response column \code{CompressiveStrength}. These are the 18 | amounts.} \item{mixtures}{The same data where all the ingredients have been 19 | converted to proportions of the total amounts.} 20 | } 21 | \description{ 22 | Yeh (1998) describes a collection of data sets from different sources that 23 | can be used for modeling the compressive strength of concrete formulations 24 | as a function of their ingredients and age. 25 | } 26 | \details{ 27 | The data are from Yeh (1998) and taken from the UCI ML website 28 | \url{http://archive.ics.uci.edu/ml/datasets/Concrete+Compressive+Strength}. 29 | 30 | There are 1030 data points on the UCI website; the paper states that 31 | approximately 1,000 samples were made, but only 727 were analyzed in the 32 | source material. It is unclear which samples were excluded. 33 | } 34 | \examples{ 35 | 36 | if (require("caret")) { 37 | data(concrete) 38 | 39 | library(caret) 40 | 41 | ### Split used in the book: 42 | set.seed(975) 43 | inTrain <- createDataPartition(mixtures$CompressiveStrength, p = 3/4)[[1]] 44 | training <- mixtures[ inTrain,] 45 | testing <- mixtures[-inTrain,] 46 | } 47 | 48 | 49 | } 50 | \keyword{datasets} 51 | -------------------------------------------------------------------------------- /man/getPackages.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/getPackages.R 3 | \name{getPackages} 4 | \alias{getPackages} 5 | \title{Install Packages for Each Chapter} 6 | \usage{ 7 | getPackages(chapter, ...) 8 | } 9 | \arguments{ 10 | \item{chapter}{an integer vector (or character versions of the integer) for 11 | the chapter number.
See Details below.} 12 | 13 | \item{...}{options to pass to \code{\link[utils]{install.packages}}} 14 | } 15 | \description{ 16 | This function installs the R packages needed to reproduce the analyses in the 17 | specified chapter(s) of the book. 18 | } 19 | \details{ 20 | The chapter number(s) are matched to the packages used in those chapters, which are then installed via \code{\link[utils]{install.packages}} along with their dependencies. 21 | } 22 | \examples{ 23 | 24 | \dontrun{ 25 | getPackages(2) 26 | getPackages(2:3) 27 | getPackages("4") 28 | } 29 | 30 | } 31 | \author{ 32 | Max Kuhn 33 | } 34 | \keyword{utilities} 35 | -------------------------------------------------------------------------------- /man/logisticCreditPredictions.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{logisticCreditPredictions} 5 | \alias{logisticCreditPredictions} 6 | \title{Logistic Regression Predictions for the Credit Data} 7 | \value{ 8 | A data frame with columns \item{Bad}{The predicted class probability 9 | for bad credit. } \item{Good}{The predicted class probability for good 10 | credit.} \item{pred}{The predicted class. } \item{obs}{The observed class } 11 | } 12 | \description{ 13 | Class probability predictions, predicted classes and observed classes from a logistic regression model fit to the credit data used in the text. 14 | } 15 | \examples{ 16 | 17 | ## show code to make the predictions 18 | 19 | } 20 | \keyword{datasets} 21 | -------------------------------------------------------------------------------- /man/permeability.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{permeability} 5 | \alias{permeability} 6 | \alias{fingerprints} 7 | \title{Permeability Data} 8 | \source{ 9 | Kansy, M., Senner, F., and Gubernator, K. (1998). Physicochemical 10 | High Throughput Screening: Parallel Artificial Membrane Permeation Assay in 11 | the Description of Passive Absorption Processes. J. Med. Chem, 41(7), 12 | 1007-1010. 13 | } 14 | \value{ 15 | \item{permeability}{permeability values for each compound. } 16 | \item{fingerprints}{a matrix of binary fingerprint indicator variables.} 17 | } 18 | \description{ 19 | This pharmaceutical data set was used to develop a model for predicting 20 | compounds' permeability. In short, permeability is the measure of a 21 | molecule's ability to cross a membrane. The body, for example, has notable 22 | membranes between the body and brain, known as the blood-brain barrier, and 23 | between the gut and body in the intestines. These membranes help the body 24 | guard critical regions from receiving undesirable or detrimental substances. 25 | For an orally taken drug to be effective in the brain, it first must pass 26 | through the intestinal wall and then must pass through the blood-brain 27 | barrier in order to be present for the desired neurological target. 28 | Therefore, a compound's ability to permeate relevant biological membranes is 29 | critically important to understand early in the drug discovery process. 30 | Compounds that appear to be effective for a particular disease in research 31 | screening experiments but appear to be poorly permeable may need to be 32 | altered in order to improve permeability, and thus the compound's ability to 33 | reach the desired target. Identifying permeability problems can help guide 34 | chemists towards better molecules.
35 | } 36 | \details{ 37 | Permeability assays such as PAMPA and Caco-2 have been developed to help 38 | measure compounds' permeability (Kansy et al, 1998). These screens are 39 | effective at quantifying a compound's permeability, but the assay is 40 | expensive and labor intensive. Given a sufficient number of compounds that have 41 | been screened, we could develop a predictive model for permeability in an 42 | attempt to reduce the need for the assay. In this project there 43 | were 165 unique compounds; 1107 molecular fingerprints were determined for 44 | each. A molecular fingerprint is a binary sequence of numbers that 45 | represents the presence or absence of a specific molecular sub-structure. 46 | The response is highly skewed, the predictors are sparse (15.5 percent are 47 | present), and many predictors are strongly associated. 48 | } 49 | \examples{ 50 | 51 | data(permeability) 52 | 53 | hist(permeability) 54 | 55 | summary(apply(fingerprints, 2, mean)) 56 | 57 | } 58 | \keyword{datasets} 59 | -------------------------------------------------------------------------------- /man/permuteRelief.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/permuteRelief.R 3 | \name{permuteRelief} 4 | \alias{permuteRelief} 5 | \title{Permutation Statistics for the Relief Algorithm} 6 | \usage{ 7 | permuteRelief(x, y, nperm = 100, ...) 8 | } 9 | \arguments{ 10 | \item{x}{a data frame of predictor data} 11 | 12 | \item{y}{a vector of outcomes} 13 | 14 | \item{nperm}{the number of random permutations of the data} 15 | 16 | \item{\dots}{options to pass to \code{\link[CORElearn]{attrEval}}, such as 17 | the exact Relief algorithm to use} 18 | } 19 | \value{ 20 | a list with elements \item{standardized }{a vector of standardized 21 | predictor scores} \item{permutations }{the values of the permuted scores, 22 | for plotting to assess the permutation distribution} \item{observed}{the 23 | observed scores} \item{options}{a list of options passed using \ldots{}} 24 | } 25 | \description{ 26 | This function uses a permutation approach to determining the relative 27 | magnitude of Relief scores (Kira and Rendell, 1992; Kononenko, 1994). 28 | } 29 | \details{ 30 | The scores for each predictor are computed using the original data and after 31 | outcome data are randomly scrambled (\code{nperm} times). The mean and 32 | standard deviation of the permuted values are determined, and a standardized 33 | version of the observed scores is computed by subtracting the permuted 34 | means from the original values, then dividing each by the corresponding 35 | standard deviation. 36 | } 37 | \examples{ 38 | 39 | set.seed(874) 40 | reliefEx3 <- easyBoundaryFunc(500) 41 | reliefEx3$X1 <- scale(reliefEx3$X1) 42 | reliefEx3$X2 <- scale(reliefEx3$X2) 43 | reliefEx3$prob <- NULL 44 | 45 | standardized <- permuteRelief(reliefEx3[, 1:2], reliefEx3$class, 46 | ## For efficiency, a small number of 47 | ## permutations are used here. 48 | nperm = 50, 49 | estimator = "ReliefFequalK", 50 | ReliefIterations = 50) 51 | 52 | 53 | } 54 | \references{ 55 | Kira, K., & Rendell, L. (1992). The feature selection problem: 56 | Traditional methods and a new algorithm. \emph{Proceedings of the Eleventh 57 | International Conference on Machine Learning}, 129-129. 58 | 59 | Kononenko, I. (1994). Estimating attributes: analysis and extensions of 60 | RELIEF. \emph{Machine Learning: ECML-94}, 171-182.
61 | } 62 | \seealso{ 63 | \code{\link[CORElearn]{attrEval}} 64 | } 65 | \author{ 66 | Max Kuhn 67 | } 68 | \keyword{htest} 69 | -------------------------------------------------------------------------------- /man/quadBoundaryFunc.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/quadBoundaryFunc.R 3 | \name{quadBoundaryFunc} 4 | \alias{quadBoundaryFunc} 5 | \alias{easyBoundaryFunc} 6 | \title{Functions for Simulating Data} 7 | \usage{ 8 | quadBoundaryFunc(n) 9 | 10 | easyBoundaryFunc(n, intercept = 0, interaction = 2) 11 | } 12 | \arguments{ 13 | \item{n}{the sample size} 14 | 15 | \item{intercept}{the coefficient for the logistic regression intercept term} 16 | 17 | \item{interaction}{the coefficient for the logistic regression interaction 18 | term} 19 | } 20 | \value{ 21 | Both functions return data frames with columns \item{X1}{numeric 22 | predictor value} \item{X2}{numeric predictor value} \item{prob }{numeric 23 | value reflecting the true probability of the first class} \item{class }{a 24 | factor variable with levels 'Class1' and 'Class2'} 25 | } 26 | \description{ 27 | These functions simulate data that are used in the text. 28 | } 29 | \details{ 30 | The \code{quadBoundaryFunc} function creates a class boundary that is a 31 | function of both predictors. The probability values are based on a logistic 32 | regression model with model equation: \eqn{-1-2X_1 -0.2X_1^2 + 33 | 2X_2^2}{-1-2*X1 -0.2*X1^2 + 2*X2^2}. The predictors here are multivariate 34 | normal with mean (1, 0) and a moderate degree of positive correlation. 35 | 36 | Similarly, the \code{easyBoundaryFunc} uses a logistic regression model with 37 | model equation: \eqn{intercept -4X_1 + 4X_2 + interaction \times X_1 \times 38 | X_2}{intercept -4*X1 + 4*X2 + interaction*X1*X2}. The predictors here are 39 | multivariate normal with mean (1, 0) and a strong positive correlation. 40 | } 41 | \examples{ 42 | 43 | ## in Chapter 11, 'Measuring Performance in Classification Model' 44 | set.seed(975) 45 | training <- quadBoundaryFunc(500) 46 | testing <- quadBoundaryFunc(1000) 47 | 48 | 49 | ## in Chapter 20, 'Factors That Can Affect Model Performance' 50 | set.seed(615) 51 | dat <- easyBoundaryFunc(200, interaction = 3, intercept = 3) 52 | dat$X1 <- scale(dat$X1) 53 | dat$X2 <- scale(dat$X2) 54 | dat$Data <- "Original" 55 | dat$prob <- NULL 56 | 57 | ## in Chapter X, 'An Introduction to Feature Selection' 58 | 59 | set.seed(874) 60 | reliefEx3 <- easyBoundaryFunc(500) 61 | reliefEx3$X1 <- scale(reliefEx3$X1) 62 | reliefEx3$X2 <- scale(reliefEx3$X2) 63 | reliefEx3$prob <- NULL 64 | 65 | 66 | } 67 | \author{ 68 | Max Kuhn 69 | } 70 | \keyword{utilities} 71 | -------------------------------------------------------------------------------- /man/schedulingData.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{schedulingData} 5 | \alias{schedulingData} 6 | \title{HPC Job Scheduling Data} 7 | \value{ 8 | \item{schedulingData}{a data frame with 4331 rows and 8 columns} 9 | } 10 | \description{ 11 | These data consist of information on 4331 jobs in a high performance 12 | computing environment. Seven attributes were recorded for each job along 13 | with a discrete class describing the execution time. 
14 | } 15 | \details{ 16 | The predictors are: \code{Protocol} (the type of computation), 17 | \code{Compounds} (the number of data points for each job), 18 | \code{InputFields} (the number of characteristics being estimated), 19 | \code{Iterations} (maximum number of iterations for the computations), 20 | \code{NumPending} (the number of other jobs pending at the time of launch), 21 | \code{Hour} (decimal hour of day for launch time) and \code{Day} (of launch 22 | time). 23 | 24 | The classes are: \code{VF} (very fast), \code{F} (fast), \code{M} (moderate) 25 | and \code{L} (long). 26 | } 27 | \examples{ 28 | 29 | data(schedulingData) 30 | 31 | library(caret) 32 | 33 | set.seed(1104) 34 | inTrain <- createDataPartition(schedulingData$Class, p = .8, list = FALSE) 35 | 36 | schedulingData$NumPending <- schedulingData$NumPending + 1 37 | 38 | trainData <- schedulingData[ inTrain,] 39 | testData <- schedulingData[-inTrain,] 40 | 41 | modForm <- as.formula(Class ~ Protocol + log10(Compounds) + 42 | log10(InputFields) + log10(Iterations) + 43 | log10(NumPending) + Hour + Day) 44 | 45 | 46 | 47 | } 48 | \keyword{datasets} 49 | -------------------------------------------------------------------------------- /man/scriptLocation.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/scriptLocation.R 3 | \name{scriptLocation} 4 | \alias{scriptLocation} 5 | \title{Find Chapter Script Files} 6 | \usage{ 7 | scriptLocation() 8 | } 9 | \description{ 10 | This function identifies the physical location on the user's computer where 11 | the chapter R scripts are located. 12 | } 13 | \examples{ 14 | 15 | scriptLocation() 16 | 17 | } 18 | \author{ 19 | Max Kuhn 20 | } 21 | \keyword{utilities} 22 | -------------------------------------------------------------------------------- /man/segmentationOriginal.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{segmentationOriginal} 5 | \alias{segmentationOriginal} 6 | \title{Cell Body Segmentation} 7 | \source{ 8 | Hill, LaPan, Li and Haney (2007). Impact of image segmentation on 9 | high-content screening data quality for SK-BR-3 cells, \emph{BMC 10 | Bioinformatics}, Vol. 8, pg. 340, 11 | \url{http://www.biomedcentral.com/1471-2105/8/340}. 12 | } 13 | \value{ 14 | \item{segmentationOriginal}{data frame of cells} 15 | } 16 | \description{ 17 | Hill, LaPan, Li and Haney (2007) develop models to predict which cells in a 18 | high content screen were well segmented. The data consists of 119 imaging 19 | measurements on 2019 cells. The original analysis used 1009 for training and 1010 20 | as a test set (see the column called \code{Case}). 21 | } 22 | \details{ 23 | The outcome class is contained in a factor variable called \code{Class} with 24 | levels "PS" for poorly segmented and "WS" for well segmented. 25 | 26 | A pre-processed version of these data can be found in the \pkg{caret} 27 | package.
28 | } 29 | \keyword{datasets} 30 | -------------------------------------------------------------------------------- /man/solubility.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{solubility} 5 | \alias{solubility} 6 | \alias{trainX} 7 | \alias{solTestXtrans} 8 | \alias{solTrainY} 9 | \alias{solTestX} 10 | \alias{solTrainX} 11 | \alias{solTrainXtrans} 12 | \alias{solTestY} 13 | \title{Solubility Data} 14 | \source{ 15 | Tetko, I., Tanchuk, V., Kasheva, T., and Villa, A. (2001). 16 | Estimation of aqueous solubility of chemical compounds using E-state 17 | indices. \emph{Journal of Chemical Information and Computer Sciences}, 18 | 41(6), 1488-1493. 19 | 20 | Huuskonen, J. (2000). Estimation of aqueous solubility for a diverse set of 21 | organic compounds based on molecular topology. \emph{Journal of Chemical 22 | Information and Computer Sciences}, 40(3), 773-777. 23 | } 24 | \value{ 25 | \item{solTrainX}{training set predictors in their natural units. } 26 | \item{solTrainXtrans}{training set predictors after transformations for 27 | skewness and centering/scaling.} \item{solTrainY}{a vector of log10 28 | solubility values for the training set. } \item{solTestX}{test set 29 | predictors in their natural units. } \item{solTestXtrans}{test set 30 | predictors after the same transformations used on the training set are 31 | applied.} \item{solTestY}{a vector of log10 solubility values for the 32 | test set. } 33 | } 34 | \description{ 35 | Tetko et al. (2001) and Huuskonen (2000) investigated a set of compounds 36 | with corresponding experimental solubility values using complex sets of 37 | descriptors. They used linear regression and neural network models to 38 | estimate the relationship between chemical structure and solubility. For our 39 | analyses, we will use 1267 compounds and a set of more understandable 40 | descriptors that fall into one of three groups: 208 binary "fingerprints" 41 | that indicate the presence or absence of a particular chemical 42 | sub-structure, 16 count descriptors (such as the number of bonds or the 43 | number of Bromine atoms) and 4 continuous descriptors (such as molecular 44 | weight or surface area). 45 | } 46 | \examples{ 47 | 48 | data(solubility) 49 | 50 | library(caret) 51 | 52 | ### Cross-validation splits used in the book: 53 | set.seed(100) 54 | indx <- createFolds(solTrainY, returnTrain = TRUE) 55 | 56 | ### To re-create the transformed version of the data: 57 | \dontrun{ 58 | ## Find the predictors that are not fingerprints 59 | contVars <- names(solTrainX)[!grepl("FP", names(solTrainX))] 60 | ## Some have zero values, so we need to add one to them so that 61 | ## we can use the Box-Cox transformation. Alternatively, we could 62 | ## use the Yeo-Johnson transformation without altering the data. 63 | contPredTrain <- solTrainX[,contVars] + 1 64 | contPredTest <- solTestX[,contVars] + 1 65 | 66 | pp <- preProcess(contPredTrain, method = "BoxCox") 67 | contPredTrain <- predict(pp, contPredTrain) 68 | contPredTest <- predict(pp, contPredTest) 69 | 70 | ## Reassemble the fingerprint data with the transformed values.
71 | trainXtrans <- cbind(solTrainX[,grep("FP", names(solTrainX))], contPredTrain) 72 | testXtrans <- cbind( solTestX[,grep("FP", names(solTestX))], contPredTest) 73 | 74 | all.equal(trainXtrans, solTrainXtrans) 75 | all.equal(testXtrans, solTestXtrans) 76 | } 77 | 78 | 79 | } 80 | \keyword{datasets} 81 | -------------------------------------------------------------------------------- /man/twoClassData.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{twoClassData} 5 | \alias{twoClassData} 6 | \alias{classes} 7 | \title{Two Class Example Data} 8 | \value{ 9 | \item{predictors}{data frame of two predictors} \item{classes}{a 10 | factor vector of class labels} 11 | } 12 | \description{ 13 | These data contain two predictors measured for 208 samples. Of these, 111 14 | samples are labeled as \code{Class1} and the remaining 97 are \code{Class2}. 15 | } 16 | \examples{ 17 | 18 | data(twoClassData) 19 | 20 | library(lattice) 21 | xyplot(PredictorB ~ PredictorA, 22 | data = predictors, 23 | groups = classes, 24 | auto.key = TRUE) 25 | 26 | 27 | } 28 | \keyword{datasets} 29 | --------------------------------------------------------------------------------