├── README.md ├── data ├── abalone.RData ├── hepatic.RData ├── concrete.RData ├── solubility.RData ├── FuelEconomy.RData ├── permeability.RData ├── twoClassData.RData ├── schedulingData.RData ├── AlzheimerDisease.RData ├── segmentationOriginal.RData ├── logisticCreditPredictions.RData ├── ChemicalManufacturingProcess.RData └── datalist ├── R ├── scriptLocation.R ├── quadBoundaryFunc.R ├── easyBoundaryFunc.R ├── permuteRelief.R ├── panels.R ├── getPackages.R ├── transparentTheme.R └── bookTheme.R ├── NAMESPACE ├── man ├── internal.Rd ├── scriptLocation.Rd ├── logisticCreditPredictions.Rd ├── twoClassData.Rd ├── abalone.Rd ├── getPackages.Rd ├── AppliedPredictiveModeling-package.Rd ├── segmentationOrignal.Rd ├── Hepatic.Rd ├── bookTheme.Rd ├── jobScheduling.Rd ├── concrete.Rd ├── AlzheimerDisease.Rd ├── ChemicalManufacturingProcess.Rd ├── FuelEconomy.Rd ├── quadBoundaryFunc.Rd ├── permuteRelief.Rd ├── permeability.Rd └── solubility.Rd ├── DESCRIPTION ├── inst ├── NEWS.Rd └── chapters │ ├── 02_A_Short_Tour.R │ ├── 07_Non-Linear_Reg.R │ ├── 06_Linear_Regression.R │ ├── 02_A_Short_Tour.Rout │ ├── 11_Class_Performance.R │ ├── 04_Over_Fitting.R │ ├── 03_Data_Pre_Processing.R │ ├── 08_Regression_Trees.R │ ├── 18_Importance.R │ ├── 11_Class_Performance.Rout │ ├── 10_Case_Study_Concrete.R │ ├── 03_Data_Pre_Processing.Rout │ ├── 17_Job_Scheduling.R │ ├── 13_Non-Linear_Class.R │ ├── 12_Discriminant_Analysis.R │ └── 19_Feature_Select.R └── MD5 /README.md: -------------------------------------------------------------------------------- 1 | # AppliedPredictiveModeling 2 | Data and code from Applied Predictive Modeling (2013) 3 | -------------------------------------------------------------------------------- /data/abalone.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cran/AppliedPredictiveModeling/HEAD/data/abalone.RData -------------------------------------------------------------------------------- 
/data/hepatic.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cran/AppliedPredictiveModeling/HEAD/data/hepatic.RData -------------------------------------------------------------------------------- /data/concrete.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cran/AppliedPredictiveModeling/HEAD/data/concrete.RData -------------------------------------------------------------------------------- /data/solubility.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cran/AppliedPredictiveModeling/HEAD/data/solubility.RData -------------------------------------------------------------------------------- /data/FuelEconomy.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cran/AppliedPredictiveModeling/HEAD/data/FuelEconomy.RData -------------------------------------------------------------------------------- /data/permeability.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cran/AppliedPredictiveModeling/HEAD/data/permeability.RData -------------------------------------------------------------------------------- /data/twoClassData.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cran/AppliedPredictiveModeling/HEAD/data/twoClassData.RData -------------------------------------------------------------------------------- /data/schedulingData.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cran/AppliedPredictiveModeling/HEAD/data/schedulingData.RData -------------------------------------------------------------------------------- /data/AlzheimerDisease.RData: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/cran/AppliedPredictiveModeling/HEAD/data/AlzheimerDisease.RData -------------------------------------------------------------------------------- /R/scriptLocation.R: -------------------------------------------------------------------------------- 1 | scriptLocation <- 2 | function() 3 | system.file("chapters", package = "AppliedPredictiveModeling") 4 | -------------------------------------------------------------------------------- /data/segmentationOriginal.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cran/AppliedPredictiveModeling/HEAD/data/segmentationOriginal.RData -------------------------------------------------------------------------------- /data/logisticCreditPredictions.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cran/AppliedPredictiveModeling/HEAD/data/logisticCreditPredictions.RData -------------------------------------------------------------------------------- /data/ChemicalManufacturingProcess.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cran/AppliedPredictiveModeling/HEAD/data/ChemicalManufacturingProcess.RData -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | exportPattern("^[[:alpha:]]+") 2 | 3 | import(CORElearn, MASS, plyr, reshape2) 4 | importFrom("grDevices", "rgb") 5 | importFrom("stats", "runif", "sd") 6 | importFrom("utils", "install.packages") 7 | importFrom("lattice", "trellis.par.set", "trellis.par.get", "panel.xyplot") 8 | importFrom("ellipse", "ellipse") 9 | -------------------------------------------------------------------------------- /man/internal.Rd: 
-------------------------------------------------------------------------------- 1 | \name{caret-internal} 2 | \title{Internal Functions} 3 | \alias{lowerp} 4 | \alias{upperp} 5 | 6 | \description{Internal functions} 7 | \usage{ 8 | lowerp(...) 9 | upperp(...) 10 | } 11 | \arguments{ 12 | \item{\dots}{ 13 | optional arguments to pass to internal functions 14 | } 15 | } 16 | 17 | \author{Max Kuhn} 18 | 19 | 20 | \keyword{internal} 21 | -------------------------------------------------------------------------------- /man/scriptLocation.Rd: -------------------------------------------------------------------------------- 1 | \name{scriptLocation} 2 | \alias{scriptLocation} 3 | \title{ 4 | Find Chapter Script Files 5 | } 6 | \description{ 7 | This function identifies the physical location on the user's computer where the chapter R scripts are located. 8 | } 9 | \usage{ 10 | scriptLocation() 11 | } 12 | 13 | \author{ 14 | Max Kuhn 15 | } 16 | 17 | \examples{ 18 | scriptLocation() 19 | } 20 | 21 | \keyword{utilities} 22 | -------------------------------------------------------------------------------- /data/datalist: -------------------------------------------------------------------------------- 1 | AlzheimerDisease: diagnosis predictors 2 | ChemicalManufacturingProcess 3 | FuelEconomy: cars2010 cars2011 cars2012 4 | abalone 5 | concrete: concrete mixtures 6 | hepatic: bio chem injury 7 | logisticCreditPredictions 8 | permeability: fingerprints permeability 9 | schedulingData 10 | segmentationOriginal 11 | solubility: solTestX solTestXtrans solTestY solTrainX solTrainXtrans solTrainY 12 | twoClassData: classes predictors 13 | -------------------------------------------------------------------------------- /R/quadBoundaryFunc.R: -------------------------------------------------------------------------------- 1 | quadBoundaryFunc <- function(n) { 2 | sigma <- matrix(c(1, .7, .7, 2), 2, 2) 3 | 4 | tmpData <- data.frame(mvrnorm(n = n, c(1, 0), sigma)) 5 | xSeq <- seq(-4, 4, 
length = 40) 6 | plotGrid <- expand.grid(x = xSeq, y = xSeq) 7 | zFoo <- function(x, y) 8 | - 1 - 2 * x - 0 * y - .2 * x ^ 2 + 2 * y ^ 2 9 | z2p <- function(x) 10 | 1 / (1 + exp(-x)) 11 | 12 | tmpData$prob <- z2p(zFoo(tmpData$X1, tmpData$X2)) 13 | tmpData$class <- 14 | factor(ifelse(runif(length(tmpData$prob)) <= tmpData$prob, "Class1", "Class2")) 15 | tmpData 16 | } 17 | -------------------------------------------------------------------------------- /man/logisticCreditPredictions.Rd: -------------------------------------------------------------------------------- 1 | \name{logisticCreditPredictions} 2 | \docType{data} 3 | \alias{logisticCreditPredictions} 4 | \title{Logistic Regression Predictions for the Credit Data} 5 | \description{ 6 | add some notes 7 | } 8 | 9 | \usage{data(solubility)} 10 | 11 | \value{ 12 | A data frame with columns 13 | \item{Bad}{The predicted class probability for bad credit. } 14 | \item{Good}{The predicted class probability for good credit.} 15 | \item{pred}{The predicted class. 
} 16 | \item{obs}{The observed class } 17 | } 18 | 19 | 20 | 21 | \examples{ 22 | ## show code to make the predictions 23 | } 24 | 25 | \keyword{datasets} 26 | 27 | -------------------------------------------------------------------------------- /R/easyBoundaryFunc.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | easyBoundaryFunc <- function(n, 4 | intercept = 0, 5 | interaction = 2) { 6 | sigma <- matrix(c(2, 1.3, 1.3, 2), 2, 2) 7 | 8 | tmpData <- data.frame(mvrnorm(n = n, c(0, 0), sigma)) 9 | xSeq <- seq(-4, 4, length = 40) 10 | plotGrid <- expand.grid(x = xSeq, y = xSeq) 11 | zFoo <- function(x, y) 12 | intercept - 4 * x + 4 * y + interaction * x * y 13 | z2p <- function(x) 14 | 1 / (1 + exp(-x)) 15 | 16 | tmpData$prob <- z2p(zFoo(tmpData$X1, tmpData$X2)) 17 | tmpData$class <- 18 | factor(ifelse(runif(length(tmpData$prob)) <= tmpData$prob, "Class1", "Class2")) 19 | tmpData 20 | } 21 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: AppliedPredictiveModeling 2 | Type: Package 3 | Title: Functions and Data Sets for 'Applied Predictive Modeling' 4 | Version: 1.1-7 5 | Date: 2018-05-22 6 | Author: Max Kuhn, Kjell Johnson 7 | Maintainer: Max Kuhn 8 | Description: A few functions and several data set for the Springer book 'Applied Predictive Modeling'. 
9 | URL: http://appliedpredictivemodeling.com/ 10 | Depends: R (>= 2.10) 11 | Imports: CORElearn, MASS, plyr, reshape2, lattice, ellipse 12 | Suggests: caret (>= 6.0-22) 13 | License: GPL-2 14 | RoxygenNote: 6.0.1 15 | NeedsCompilation: no 16 | Packaged: 2018-05-22 19:03:00 UTC; max 17 | Repository: CRAN 18 | Date/Publication: 2018-05-22 19:14:21 UTC 19 | -------------------------------------------------------------------------------- /man/twoClassData.Rd: -------------------------------------------------------------------------------- 1 | \name{twoClassData} 2 | \docType{data} 3 | \alias{twoClassData} 4 | \alias{classes} 5 | \title{Two Class Example Data} 6 | \description{ 7 | These data contain two predictors measured for 208 samples. Of these, 111 samples are 8 | labeled as \code{Class1} and the remaining 97 are \code{Class2}. 9 | } 10 | 11 | \usage{data(twoClassData)} 12 | \value{ 13 | \item{predictors}{data frame of two predictors} 14 | \item{classes}{a factor vector of class labeled} 15 | } 16 | 17 | \examples{ 18 | data(twoClassData) 19 | 20 | library(lattice) 21 | xyplot(PredictorB ~ PredictorA, 22 | data = predictors, 23 | groups = classes, 24 | auto.key = TRUE) 25 | 26 | } 27 | 28 | \keyword{datasets} 29 | 30 | -------------------------------------------------------------------------------- /R/permuteRelief.R: -------------------------------------------------------------------------------- 1 | permuteRelief <- 2 | function(x, y, nperm = 100, ...) { 3 | dat <- x 4 | dat$y <- y 5 | 6 | obs <- attrEval(y ~ ., data = dat, ...) 7 | permuted <- matrix(NA, ncol = length(obs), nrow = nperm) 8 | colnames(permuted) <- names(obs) 9 | for (i in 1:nperm) { 10 | dat$y <- sample(y) 11 | permuted[i,] <- attrEval(y ~ ., data = dat, ...) 
12 | } 13 | means <- colMeans(permuted) 14 | sds <- apply(permuted, 2, sd) 15 | permuted <- melt(permuted) 16 | names(permuted)[2] <- "Predictor" 17 | permuted$X1 <- NULL 18 | list( 19 | standardized = (obs - means) / sds, 20 | permutations = permuted, 21 | observed = obs, 22 | options = list(...) 23 | ) 24 | } 25 | -------------------------------------------------------------------------------- /man/abalone.Rd: -------------------------------------------------------------------------------- 1 | \name{abalone} 2 | \docType{data} 3 | \alias{abalone} 4 | \title{Abalone Data} 5 | \description{ 6 | The Abalone data consist of data from 4177 abalones. The data consist of measurements of the type (male, female and infant), the longest shell measurement, the diameter, height and several weights (whole, shucked, viscera and shell). The outcome is the number of rings. The age of the abalone is the number of rings plus 1.5. 7 | 8 | The data are taken from the UCI database (\url{http://archive.ics.uci.edu/ml/datasets/Abalone}). 9 | } 10 | 11 | \usage{data(abalone)} 12 | 13 | \value{ 14 | \item{abalone}{a data frame with 4177 rows and 9 columns} 15 | } 16 | 17 | \examples{ 18 | data(abalone) 19 | } 20 | 21 | \keyword{datasets} 22 | 23 | -------------------------------------------------------------------------------- /man/getPackages.Rd: -------------------------------------------------------------------------------- 1 | \name{getPackages} 2 | \alias{getPackages} 3 | \title{ 4 | Install Packages for Each Chapter 5 | } 6 | \description{ 7 | This function identifies the physical location on the user's computer where the chapter R scripts are located. 8 | } 9 | \usage{ 10 | getPackages(chapter, ...) 11 | } 12 | 13 | \arguments{ 14 | \item{chapter}{an integer vector (or character versions of the integer) for the chapter number. See Details below:} 15 | \item{\ldots}{options to pass to \code{\link[utils]{install.packages}}} 16 | } 17 | 18 | \details{ 19 | Chapter names and packages. 
about dependencies. 20 | } 21 | 22 | \author{ 23 | Max Kuhn 24 | } 25 | 26 | \examples{ 27 | \dontrun{ 28 | getPackages(2) 29 | getPackages(2:3) 30 | getPackages("4") 31 | } 32 | } 33 | 34 | \keyword{utilities} 35 | -------------------------------------------------------------------------------- /man/AppliedPredictiveModeling-package.Rd: -------------------------------------------------------------------------------- 1 | \name{AppliedPredictiveModeling-package} 2 | \alias{AppliedPredictiveModeling-package} 3 | \alias{AppliedPredictiveModeling} 4 | \docType{package} 5 | \title{ 6 | Data, Functions and Scripts for 'scriptLocation' 7 | } 8 | \description{ 9 | This package can be used to reproduce the analyses in the text. Scripts for each chapter are located in the "chapters" directory. Use \code{scriptLocation()} to find their exact location. 10 | } 11 | \details{ 12 | \tabular{ll}{ 13 | Package: \tab AppliedPredictiveModeling\cr 14 | Type: \tab Package\cr 15 | Version: \tab 1.1-1\cr 16 | Date: \tab 2013-05-29\cr 17 | License: \tab GPL\cr 18 | } 19 | } 20 | \author{ 21 | Max Kuhn 22 | 23 | Maintainer: Max Kuhn 24 | } 25 | \references{ 26 | Kuhn M and Johnson K (2013) Applied Predictive Modeling, Springer, NY 27 | } 28 | \keyword{ package } 29 | -------------------------------------------------------------------------------- /man/segmentationOrignal.Rd: -------------------------------------------------------------------------------- 1 | \name{segmentationOriginal} 2 | \docType{data} 3 | \alias{segmentationOriginal} 4 | 5 | \title{Cell Body Segmentation} 6 | \description{ 7 | Hill, LaPan, Li and Haney (2007) develop models to predict which cells in a high content screen were well segmented. 8 | The data consists of 119 imaging measurements on 2019. The original analysis used 1009 for training and 1010 as a test set (see the column called \code{Case}). 
9 | 10 | The outcome class is contained in a factor variable called \code{Class} with levels "PS" for poorly segmented and "WS" for well segmented. 11 | 12 | A pre-processed version of these data can be found in the \pkg{caret} package. 13 | } 14 | 15 | \usage{data(segmentationOriginal)} 16 | \value{ 17 | \item{segmentationOriginal}{data frame of cells} 18 | } 19 | 20 | \source{Hill, LaPan, Li and Haney (2007). Impact of image segmentation on high-content screening data quality for SK-BR-3 cells, 21 | \emph{BMC Bioinformatics}, Vol. 8, pg. 340, \url{http://www.biomedcentral.com/1471-2105/8/340}. 22 | } 23 | 24 | \keyword{datasets} 25 | 26 | -------------------------------------------------------------------------------- /R/panels.R: -------------------------------------------------------------------------------- 1 | upperp <- function(...) { 2 | args <- list(...) 3 | circ1 <- ellipse(diag(rep(1, 2)), t = .1) 4 | panel.xyplot( 5 | circ1[, 1], 6 | circ1[, 2], 7 | type = "l", 8 | lty = trellis.par.get("reference.line")$lty, 9 | col = trellis.par.get("reference.line")$col, 10 | lwd = trellis.par.get("reference.line")$lwd 11 | ) 12 | circ2 <- ellipse(diag(rep(1, 2)), t = .2) 13 | panel.xyplot( 14 | circ2[, 1], 15 | circ2[, 2], 16 | type = "l", 17 | lty = trellis.par.get("reference.line")$lty, 18 | col = trellis.par.get("reference.line")$col, 19 | lwd = trellis.par.get("reference.line")$lwd 20 | ) 21 | circ3 <- ellipse(diag(rep(1, 2)), t = .3) 22 | panel.xyplot( 23 | circ3[, 1], 24 | circ3[, 2], 25 | type = "l", 26 | lty = trellis.par.get("reference.line")$lty, 27 | col = trellis.par.get("reference.line")$col, 28 | lwd = trellis.par.get("reference.line")$lwd 29 | ) 30 | panel.xyplot(args$x, 31 | args$y, 32 | groups = args$groups, 33 | subscripts = args$subscripts) 34 | } 35 | 36 | lowerp <- function(...) 
{ 37 | 38 | } 39 | -------------------------------------------------------------------------------- /man/Hepatic.Rd: -------------------------------------------------------------------------------- 1 | \name{hepatic} 2 | \docType{data} 3 | \alias{bio} 4 | \alias{chem} 5 | \alias{injury} 6 | \title{Hepatic Injury Data} 7 | \description{ 8 | This data set was used to develop a model for predicting compounds' probability of causing hepatic injury (i.e. liver damage). This data set consisted of 281 unique compounds; 376 predictors were measured or computed for each. The response was categorical (either "None", "Mild" or "Severe" ),and was highly unbalanced. 9 | 10 | This kind of response often occurs in pharmaceutical data because companies steer away from creating molecules that have undesirable characteristics. Therefore, well-behaved molecules often greatly outnumber undesirable molecules. The predictors consisted of measurements from 184 biological screens and 192 chemical feature predictors. The biological predictors represent activity for each screen and take values between 0 and 10 with a mode of 4. The chemical feature predictors represent counts of important sub-structures as well as measures of physical properties that are thought to be associated with hepatic injury. 11 | 12 | } 13 | 14 | \usage{data(hepatic)} 15 | 16 | \value{ 17 | \item{bio}{Biological screen results. } 18 | \item{chem}{Chemical fingerprints for sub-structures.} 19 | \item{injury}{A factor vector of outcomes.} 20 | } 21 | 22 | \examples{ 23 | data(hepatic) 24 | } 25 | 26 | \keyword{datasets} 27 | 28 | -------------------------------------------------------------------------------- /man/bookTheme.Rd: -------------------------------------------------------------------------------- 1 | \name{bookTheme} 2 | \alias{bookTheme} 3 | \alias{transparentTheme} 4 | \title{ 5 | Lattice Themes 6 | } 7 | \description{ 8 | Two \pkg{lattice} themes used throughout the book. 
9 | } 10 | \usage{ 11 | bookTheme(set = TRUE) 12 | 13 | transparentTheme(set = TRUE, pchSize = 1, trans = 0.2) 14 | } 15 | \arguments{ 16 | \item{set}{ 17 | a logical: should these settings be applied to the current device? 18 | } 19 | \item{pchSize}{ 20 | the size of the plot symbols 21 | } 22 | \item{trans}{ 23 | the amount of transparency (via the alpha channel). Note that transparency is not supported by all graphics devices. 24 | } 25 | } 26 | \details{ 27 | When using these functions to save a plot, make sure to invoke them after the device has been opened (e.g. after calls such as \code{pdf()}. 28 | 29 | } 30 | \value{ 31 | Each function returns a list of theme parameters. See Sarkar (2008) or \code{\link[lattice]{trellis.par.get}} for specific details. 32 | } 33 | \references{ 34 | Some of the colors are based on values from ColorBrewer \url{http://www.colorbrewer.org}. 35 | 36 | Sarkar, D. (2008). Lattice: Multivariate Data Visualization with R. UseR! (1st ed. p. 286). Springer. 37 | } 38 | \author{ 39 | Max Kuhn 40 | } 41 | 42 | \examples{ 43 | library(lattice) 44 | 45 | example <- quadBoundaryFunc(100) 46 | 47 | bookTheme(set = TRUE) 48 | xyplot(X2 ~ X1, data = example, groups = class, auto.key = TRUE) 49 | 50 | transparentTheme(set = TRUE, trans = .6) 51 | xyplot(X2 ~ X1, data = example, groups = class, auto.key = TRUE) 52 | } 53 | 54 | \keyword{hplot} 55 | 56 | -------------------------------------------------------------------------------- /man/jobScheduling.Rd: -------------------------------------------------------------------------------- 1 | \name{schedulingData} 2 | \docType{data} 3 | \alias{schedulingData} 4 | \title{HPC Job Scheduling Data} 5 | \description{ 6 | These data consist of information on 4331 jobs in a high performance computing environment. Seven attributes were recorded for each job along with a discrete class describing the execution time. 
7 | 8 | The predictors are: \code{Protocol} (the type of computation), \code{Compounds} (the number of data points for each jobs), \code{InputFields} (the number of characteristic being estimated), \code{Iterations} (maximum number of iterations for the computations), \code{NumPending} (the number of other jobs pending at the time of launch), \code{Hour} (decimal hour of day for launch time) and \code{Day} (of launch time). 9 | 10 | The classes are: \code{VF} (very fast), \code{F} (fast), \code{M} (moderate) and \code{L} (long). 11 | } 12 | 13 | \usage{data(schedulingData)} 14 | 15 | \value{ 16 | \item{schedulingData}{a data frame with 4331 rows and 8 columns} 17 | } 18 | 19 | \examples{ 20 | data(schedulingData) 21 | 22 | library(caret) 23 | 24 | set.seed(1104) 25 | inTrain <- createDataPartition(schedulingData$Class, p = .8, list = FALSE) 26 | 27 | schedulingData$NumPending <- schedulingData$NumPending + 1 28 | 29 | trainData <- schedulingData[ inTrain,] 30 | testData <- schedulingData[-inTrain,] 31 | 32 | modForm <- as.formula(Class ~ Protocol + log10(Compounds) + 33 | log10(InputFields)+ log10(Iterations) + 34 | log10(NumPending) + Hour + Day) 35 | 36 | 37 | } 38 | 39 | \keyword{datasets} 40 | 41 | -------------------------------------------------------------------------------- /man/concrete.Rd: -------------------------------------------------------------------------------- 1 | \name{concrete} 2 | \docType{data} 3 | \alias{concrete} 4 | \alias{mixtures} 5 | \title{Compressive Strength of Concrete from Yeh (1998)} 6 | \description{ 7 | Yeh (1998) describes a collection of data sets from different sources that can be used for modeling the compressive strength of concrete formulations as a functions of their ingredients and age. 
8 | } 9 | 10 | \usage{data(concrete)} 11 | 12 | \value{ 13 | \item{concrete}{data frame of data with predictor columns \code{Cement}, \code{BlastFurnaceSlag}, \code{FlyAsh}, \code{Water}, \code{Superplasticizer}, \code{CoarseAggregate}, \code{FineAggregate} and \code{Age} with response column \code{CompressiveStrength}. These are the amounts.} 14 | \item{mixtures}{The same data where all the ingredients have been converted to proportions of the total amounts.} 15 | } 16 | 17 | \details{ 18 | The data are from Yeh (1998) and taken from the UCI ML website \url{http://archive.ics.uci.edu/ml/datasets/Concrete+Compressive+Strength}. 19 | 20 | There are 1030 data points from the UCI website, but the paper states that approximately 1,000 samples were made, but only 727 were analyzed in the source material. It is unclear which samples were excluded. 21 | } 22 | 23 | \source{Yeh, I. C. (1998). Modeling of strength of high-performance concrete using artificial neural networks. \emph{Cement and Concrete Research}, 28(12), 1797-1808. Elsevier. 24 | } 25 | 26 | \examples{ 27 | data(concrete) 28 | 29 | library(caret) 30 | 31 | ### Split used in the book: 32 | set.seed(975) 33 | inTrain <- createDataPartition(mixtures$CompressiveStrength, p = 3/4)[[1]] 34 | training <- mixtures[ inTrain,] 35 | testing <- mixtures[-inTrain,] 36 | 37 | } 38 | 39 | \keyword{datasets} 40 | 41 | -------------------------------------------------------------------------------- /man/AlzheimerDisease.Rd: -------------------------------------------------------------------------------- 1 | \name{AlzheimerDisease} 2 | \docType{data} 3 | \alias{diagnosis} 4 | \alias{predictors} 5 | \title{Alzheimer's Disease CSF Data} 6 | \description{ 7 | Washington University conducted a clinical study to determine if biological measurements made from cerebrospinal fluid (CSF) can be used to diagnose or predict Alzheimer's disease (Craig-Schapiro et al. 2011). 
These data are a modified version of the values used for the publication. 8 | 9 | The R factor vector \code{diagnosis} contains the outcome data for 333 of the subjects. The demographic and laboratory results are collected in the data frame \code{predictors}. 10 | 11 | One important indicator of Alzheimer's disease is the genetic background of a subject. In particular, what versions of the Apolipoprotein E gene inherited from one's parents has an association with the disease. There are three variants of the gene: E2, E3 and E4. Since a child inherits a version of the gene from each parent, there are six possible combinations (e.g. E2/E2, E2/E3, and so on). This data is contained in the predictor column named \code{Genotype}. 12 | 13 | } 14 | 15 | \usage{data(AlzheimerDisease)} 16 | 17 | \value{ 18 | \item{diagnosis}{labels for the patients, either "Impaired" or "Control". } 19 | \item{predictors}{predictors for demographic data (eg. age, gender), genotype and assay results.} 20 | } 21 | 22 | 23 | \source{Craig-Schapiro, R., Kuhn, M., Xiong, C., Pickering, E. H., Liu, J., Misko, T. P., Perrin, R. J., et al. (2011). Multiplexed Immunoassay Panel Identifies Novel CSF Biomarkers for Alzheimer's Disease Diagnosis and Prognosis. PLoS ONE, 6(4), e18850. 24 | 25 | } 26 | 27 | \examples{ 28 | data(AlzheimerDisease) 29 | } 30 | 31 | \keyword{datasets} 32 | 33 | -------------------------------------------------------------------------------- /man/ChemicalManufacturingProcess.Rd: -------------------------------------------------------------------------------- 1 | \name{ChemicalManufacturingProcess} 2 | \docType{data} 3 | \alias{ChemicalManufacturingProcess} 4 | 5 | \title{Chemical Manufacturing Process Data} 6 | 7 | \description{ 8 | This data set contains information about a chemical manufacturing 9 | process, in which the goal is to understand the relationship between 10 | the process and the resulting final product yield. 
Raw material in 11 | this process is put through a sequence of 27 steps to generate the 12 | final pharmaceutical product. The starting material is generated from 13 | a biological unit and has a range of quality and characteristics. The 14 | objective in this project was to develop a model to predict percent 15 | yield of the manufacturing process. The data set consisted of 177 16 | samples of biological material for which 57 characteristics were 17 | measured. Of the 57 characteristics, there were 12 measurements of 18 | the biological starting material, and 45 measurements of the 19 | manufacturing process. The process variables included measurements 20 | such as temperature, drying time, washing time, and concentrations of 21 | by--products at various steps. Some of the process measurements can 22 | be controlled, while others are observed. Predictors are continuous, 23 | count, categorical; some are correlated, and some contain missing 24 | values. Samples are not independent because sets of samples come from 25 | the same batch of biological starting material. 26 | } 27 | 28 | \usage{data(ChemicalManufacturingProcess)} 29 | \value{ 30 | \code{ChemicalManufacturingProcess}: a data frame with columns for the outcome (\code{Yield}) and the predictors (\code{BiologicalMaterial01} though \code{BiologicalMaterial12} and \code{ManufacturingProcess01} though \code{ManufacturingProcess45} 31 | } 32 | 33 | \examples{ 34 | data(ChemicalManufacturingProcess) 35 | } 36 | 37 | \keyword{datasets} 38 | 39 | -------------------------------------------------------------------------------- /man/FuelEconomy.Rd: -------------------------------------------------------------------------------- 1 | \name{FuelEconomy} 2 | \docType{data} 3 | \alias{cars2010} 4 | \alias{cars2011} 5 | \alias{cars2012} 6 | \title{Fuel Economy Data} 7 | \description{ 8 | The \url{http://fueleconomy.gov} website, run by the U.S. 
Department of Energy's Office of Energy Efficiency and Renewable Energy and the U.S. Environmental Protection Agency, lists different estimates of fuel economy for passenger cars and trucks. For each vehicle, various characteristics are recorded such as the engine displacement or number of cylinders. Along with these values, laboratory measurements are made for the city and highway miles per gallon (MPG) of the car. 9 | 10 | Predictors extracted from the website include: \code{EngDispl}, \code{NumCyl}, \code{Transmission}, \code{AirAspirationMethod}, \code{NumGears}, \code{TransLockup}, \code{TransCreeperGear}, \code{DriveDesc}, \code{IntakeValvePerCyl}, \code{ExhaustValvesPerCyl}, \code{CarlineClassDesc}, \code{VarValveTiming} and \code{VarValveLift}. The outcome used in the book is in column \code{FE} and is the unadjusted highway data. 11 | 12 | } 13 | 14 | \usage{data(FuelEconomy)} 15 | 16 | \value{ 17 | \item{cars2010}{data in cars from model year 2010. } 18 | \item{cars2011}{cars introduced in 2011 that were not in the model year 2010 data.} 19 | \item{cars2012}{cars introduced in 2012 that were not in the model year 2010 or 2011 data } 20 | } 21 | 22 | \examples{ 23 | data(FuelEconomy) 24 | 25 | library(lattice) 26 | 27 | ### Plot shown in the text: 28 | 29 | cars2010 <- cars2010[order(cars2010$EngDispl),] 30 | cars2011 <- cars2011[order(cars2011$EngDispl),] 31 | 32 | cars2010a <- cars2010 33 | cars2010a$Year <- "2010 Model Year" 34 | cars2011a <- cars2011 35 | cars2011a$Year <- "2011 Model Year" 36 | 37 | plotData <- rbind(cars2010a, cars2011a) 38 | 39 | plotTheme <- bookTheme(FALSE) 40 | plotTheme$plot.symbol$col <- rgb(.2, .2, .2, .5) 41 | plotTheme$plot.symbol$cex <- 0.7 42 | trellis.par.set(plotTheme) 43 | 44 | xyplot(FE ~ EngDispl|Year, plotData, 45 | xlab = "Engine Displacement", 46 | ylab = "Fuel Efficiency (MPG)", 47 | between = list(x = 1.2)) 48 | 49 | } 50 | 51 | \keyword{datasets} 52 | 53 | 
-------------------------------------------------------------------------------- /R/getPackages.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | getPackages <- function(chapter, ...) { 4 | if(is.numeric(chapter)) 5 | chapter <- paste(chapter) 6 | pkg <- list() 7 | pkg[["2"]] <- c("earth", "caret", "lattice") 8 | pkg[["3"]] <- c("e1071", "caret", "corrplot") 9 | pkg[["4"]] <- c("kernlab", "caret") 10 | pkg[["6"]] <- c("lattice", "corrplot", "pls", "elasticnet") 11 | pkg[["7"]] <- c("caret", "earth", "kernlab","lattice", "nnet") 12 | pkg[["8"]] <- c("caret", "Cubist", "gbm", "lattice", "party", "partykit", 13 | "randomForest", "rpart", "RWeka") 14 | pkg[["10"]] <- c("caret", "Cubist", "earth", "elasticnet", "gbm", "ipred", 15 | "lattice", "nnet", "party","pls", "randomForest", "rpart", 16 | "RWeka") 17 | pkg[["11"]] <- c("caret", "MASS", "randomForest", "pROC", "klaR") 18 | pkg[["12"]] <- c("caret", "glmnet", "lattice", 19 | "MASS", "pamr", "pls", "pROC", "sparseLDA") 20 | pkg[["13"]] <- c("caret", "kernlab", "klaR", "lattice", "latticeExtra", 21 | "MASS", "mda", "nnet", "pROC") 22 | pkg[["14"]] <- c("C50", "caret", "gbm", "lattice", "partykit", "pROC", 23 | "randomForest", "reshape2", 24 | "rpart", "RWeka") 25 | pkg[["16"]] <- c("caret", "C50", "earth", "DMwR", "DWD", "kernlab", "mda", 26 | "pROC", "randomForest", "rpart") 27 | pkg[["17"]] <- c("C50", "caret", "earth", "Hmisc", "ipred", "tabplot", 28 | "kernlab", "lattice", "MASS", "mda", "nnet", "pls", 29 | "randomForest", "rpart", "sparseLDA") 30 | pkg[["18"]] <- c("caret", "CORElearn", "corrplot", "pROC", "minerva") 31 | pkg[["19"]] <- c("caret", "MASS", "corrplot", "RColorBrewer", "randomForest", 32 | "kernlab", "klaR") 33 | plist <- 34 | paste(paste("'", names(pkg), "'", sep = ""), collapse = ", ") 35 | if (!any(chapter %in% names(pkg))) 36 | stop(paste("'chapter' must be: ", 37 | paste(plist, collapse = ", "))) 38 | 39 | pkg <- unlist(pkg[chapter]) 40 | pkg <- 
pkg[!is.na(pkg)] 41 | pkg <- pkg[pkg != ""] 42 | pkg <- pkg[order(tolower(pkg))] 43 | 44 | install.packages(pkg, ...) 45 | } 46 | -------------------------------------------------------------------------------- /man/quadBoundaryFunc.Rd: -------------------------------------------------------------------------------- 1 | \name{quadBoundaryFunc} 2 | \alias{quadBoundaryFunc} 3 | \alias{easyBoundaryFunc} 4 | 5 | \title{ 6 | Functions for Simulating Data 7 | } 8 | \description{ 9 | These functions simulate data that are used in the text. 10 | } 11 | \usage{ 12 | quadBoundaryFunc(n) 13 | 14 | easyBoundaryFunc(n, intercept = 0, interaction = 2) 15 | } 16 | \arguments{ 17 | \item{n}{the sample size} 18 | \item{intercept}{the coefficient for the logistic regression intercept term} 19 | \item{interaction}{the coefficient for the logistic regression interaction term} 20 | 21 | } 22 | \details{ 23 | The \code{quadBoundaryFunc} function creates a class boundary that is a function of both predictors. The probability values are based on a logistic regression model with model equation: \eqn{-1-2X_1 -0.2X_1^2 + 2X_2^2}{-1-2*X1 -0.2*X1^2 + 2*X2^2}. The predictors here are multivariate normal with mean (1, 0) and a moderate degree of positive correlation. 24 | 25 | Similarly, the \code{easyBoundaryFunc} uses a logistic regression model with model equation: \eqn{intercept -4X_1 + 4X_2 + interaction \times X_1 \times X_2}{intercept -4*X1 + 4*X2 + interaction*X1*X2}. The predictors here are multivariate normal with mean (1, 0) and a strong positive correlation.
26 | } 27 | \value{ 28 | Both functions return data frames with columns 29 | \item{X1}{numeric predictor value} 30 | \item{X2}{numeric predictor value} 31 | \item{prob }{numeric value reflecting the true probability of the first class} 32 | \item{class }{a factor variable with levels 'Class1' and 'Class2'} 33 | } 34 | 35 | \author{ 36 | Max Kuhn 37 | } 38 | 39 | \examples{ 40 | ## in Chapter 11, 'Measuring Performance in Classification Model' 41 | set.seed(975) 42 | training <- quadBoundaryFunc(500) 43 | testing <- quadBoundaryFunc(1000) 44 | 45 | 46 | ## in Chapter 20, 'Factors That Can Affect Model Performance' 47 | set.seed(615) 48 | dat <- easyBoundaryFunc(200, interaction = 3, intercept = 3) 49 | dat$X1 <- scale(dat$X1) 50 | dat$X2 <- scale(dat$X2) 51 | dat$Data <- "Original" 52 | dat$prob <- NULL 53 | 54 | ## in Chapter X, 'An Introduction to Feature Selection' 55 | 56 | set.seed(874) 57 | reliefEx3 <- easyBoundaryFunc(500) 58 | reliefEx3$X1 <- scale(reliefEx3$X1) 59 | reliefEx3$X2 <- scale(reliefEx3$X2) 60 | reliefEx3$prob <- NULL 61 | 62 | } 63 | 64 | \keyword{utilities} 65 | 66 | -------------------------------------------------------------------------------- /man/permuteRelief.Rd: -------------------------------------------------------------------------------- 1 | \name{permuteRelief} 2 | \alias{permuteRelief} 3 | \title{ 4 | Permutation Statistics for the Relief Algorithm 5 | } 6 | \description{ 7 | This function uses a permutation approach to determining the relative magnitude of Relief scores (Kira and Rendell, 1992 and Kononenko, 1994). 8 | } 9 | \usage{ 10 | permuteRelief(x, y, nperm = 100, ...) 
11 | } 12 | \arguments{ 13 | \item{x}{ 14 | a data frame of predictor data 15 | } 16 | \item{y}{ 17 | a vector of outcomes 18 | } 19 | \item{nperm}{ 20 | the number of random permutations of the data 21 | } 22 | \item{\dots}{ 23 | options to pass to \code{\link[CORElearn]{attrEval}}, such as the exact Relief algorithm, to use 24 | } 25 | } 26 | \details{ 27 | The scores for each predictor are computed using the original data and after outcome data are randomly scrambled (\code{nprem} times). The mean and standard deviation of the permuted values are determined and a standardized version of the observed scores are determined by subtracting the permuted means from the original values, then dividing each by the corresponding standard deviation. 28 | } 29 | \value{ 30 | a list with elements 31 | \item{standardized }{a vector of standardized predictor scores} 32 | \item{permutations }{the values of the permuted scores, for plotting to assess the permutation distribution} 33 | \item{observed}{the observed scores} 34 | \item{options}{a list of options passed using \ldots} 35 | } 36 | \references{ 37 | Kira, K., & Rendell, L. (1992). The feature selection problem: Traditional methods and a new algorithm. \emph{Proceedings of the Eleventh International Conference on Machine Learning}, 129-129. 38 | 39 | Kononenko, I. (1994). Estimating attributes: analysis and extensions of RELIEF. Machine Learning: ECML-94, 171-182. 40 | } 41 | \author{ 42 | Max Kuhn 43 | } 44 | 45 | 46 | \seealso{ 47 | \code{\link[CORElearn]{attrEval}} 48 | } 49 | \examples{ 50 | set.seed(874) 51 | reliefEx3 <- easyBoundaryFunc(500) 52 | reliefEx3$X1 <- scale(reliefEx3$X1) 53 | reliefEx3$X2 <- scale(reliefEx3$X2) 54 | reliefEx3$prob <- NULL 55 | 56 | standardized <- permuteRelief(reliefEx3[, 1:2], reliefEx3$class, 57 | ## For efficiency, a small number of 58 | ## permutations are used here. 
59 | nperm = 50, 60 | estimator="ReliefFequalK", 61 | ReliefIterations= 50) 62 | 63 | } 64 | 65 | \keyword{htest} 66 | 67 | -------------------------------------------------------------------------------- /inst/NEWS.Rd: -------------------------------------------------------------------------------- 1 | \name{NEWS} 2 | \title{News for Package \pkg{AppliedPredictiveModeling}} 3 | \newcommand{\cpkg}{\href{https://cran.r-project.org/package=#1}{\pkg{#1}}} 4 | 5 | 6 | \section{Changes in version 1.1-6}{ 7 | 8 | \itemize{ 9 | \item The file \code{CreateGrantData.R} was updated to include code to create the objects \code{factorPredictors} and \code{factorForm}. 10 | 11 | } 12 | } 13 | 14 | \section{Changes in version 1.1-5}{ 15 | 16 | The package dependencies were updated. Some were moved to 'Imports'. 17 | 18 | The chapter scripts were re-run with the latest versions of the \cpkg{AppliedPredictiveModeling} and \cpkg{caret} packages. For \cpkg{caret}, the names of the tuning parameter columns were changed to remove the dot. These were made to be consistent with the newer version of \cpkg{caret} and are not required. For example, \code{.sigma} was changed to \code{sigma} and so on. 19 | 20 | Additional changes are: 21 | 22 | \itemize{ 23 | \item \code{03_Data_Pre_Processing.R} was changed to include code from the Computing section on creating dummy variables. 24 | 25 | \item \code{04_Over_Fitting.R} was modified. Changes were made to: 26 | \itemize{ 27 | \item avoid a warning message when the SVM grid was created 28 | \item \code{classProbs = TRUE} was added to the control function for the object \code{svmFit} 29 | \item some notes were made in the potential differences in SVM results between versions of \cpkg{caret} 30 | } 31 | \item \code{06_Linear_Regression.R} was updated to use the newer "ridge" model in \cpkg{caret}.
32 | 33 | \item In \code{07_Non-Linear_Reg.R}, some notes were made in the potential differences in SVM results between versions of \cpkg{caret} 34 | 35 | \item In \code{16_Class_Imbalance.R}, verboseness (verbosity?) was turned off to make the results cleaner. Also, the code for \code{svmWtFit} was using the wrong tuning grid (\code{svmGrid2} instead of \code{svmGrid1}). 36 | 37 | \item \code{19_Feature_Select.R} was changed so that the resampling values were only saved for the final model. 38 | 39 | } 40 | } 41 | 42 | \section{Changes in version 1.1-4}{ 43 | \itemize{ 44 | \item The data set \code{ChemicalManufacturingProcess} did not contain 45 | the rows with missing data. They were added back in. 46 | 47 | \item Small changes to conform to R CMD check. 48 | }} 49 | 50 | \section{Changes in version 1.1-2}{ 51 | \itemize{ 52 | \item Code to create the \code{carsSubset} object in Seciton 3.8 was added 53 | to 03_Data_Pre_Processing.R 54 | }} 55 | 56 | \section{Changes in version 1.1-1}{ 57 | \itemize{ 58 | \item Initial Version 59 | }} 60 | 61 | -------------------------------------------------------------------------------- /man/permeability.Rd: -------------------------------------------------------------------------------- 1 | \name{permeability} 2 | \docType{data} 3 | \alias{permeability} 4 | \alias{fingerprints} 5 | \title{Permeability Data} 6 | \description{ 7 | This pharmaceutical data set was used to develop a model for predicting compounds' permeability. In short, permeability is the measure of a molecule's ability to cross a membrane. The body, for example, has notable membranes between the body and brain, known as the blood-brain barrier, and between the gut and body in the intestines. These membranes help the body guard critical regions from receiving undesirable or detrimental substances. 
For an orally taken drug to be effective in the brain, it first must pass through the intestinal wall and then must pass through the blood-brain barrier in order to be present for the desired neurological target. Therefore, a compound's ability to permeate relevant biological membranes is critically important to understand early in the drug discovery process. Compounds that appear to be effective for a particular disease in research screening experiments, but appear to be poorly permeable may need to be altered in order improve permeability, and thus the compound's ability to reach the desired target. Identifying permeability problems can help guide chemists towards better molecules. 8 | 9 | Permeability assays such as PAMPA and Caco-2 have been developed to help measure compounds' permeability (Kansy et al, 1998). These screens are effective at quantifying a compound's permeability, but the assay is expensive labor intensive. Given a sufficient number of compounds that have been screened, we could develop a predictive model for permeability in an attempt to potentially reduce the need for the assay. In this project there were 165 unique compounds; 1107 molecular fingerprints were determined for each. A molecular fingerprint is a binary sequence of numbers that represents the presence or absence of a specific molecular sub-structure. The response is highly skewed, the predictors are sparse (15.5 percent are present), and many predictors are strongly associated. 10 | } 11 | 12 | \usage{data(permeability)} 13 | 14 | \value{ 15 | \item{permeability}{permeability values for each compound. } 16 | \item{fingerprints}{a matrix of binary fingerprint indicator variables.} 17 | } 18 | 19 | 20 | \source{Kansy, M., Senner, F., and Gubernator, K. (1998). Physicochemical High Throughput Screening: Parallel Artificial Membrane Permeation Assay in the Description of Passive Absorption Processes. J. Med. Chem, 41(7), 1007-1010. 
21 | } 22 | 23 | \examples{ 24 | data(permeability) 25 | 26 | hist(permeability) 27 | 28 | summary(apply(fingerprints, 2, mean)) 29 | } 30 | 31 | \keyword{datasets} 32 | 33 | -------------------------------------------------------------------------------- /R/transparentTheme.R: -------------------------------------------------------------------------------- 1 | transparentTheme <- 2 | function(set = TRUE, pchSize = 1, trans = .2) { 3 | theme <- list( 4 | plot.polygon = list(alpha = 1, col = "aliceblue", border = "black", lty = 1, lwd = 1), 5 | background = list(col = "transparent"), 6 | bar.fill = list(col = "#cce6ff"), 7 | box.rectangle = list(col = "black"), 8 | box.umbrella = list(col = "black"), 9 | dot.line = list(col = "#e8e8e8"), 10 | dot.symbol = list(col = "black"), 11 | plot.line = list(col = "black"), 12 | plot.symbol = list(col = "black"), 13 | regions = list(col = 14 | c("#FEF8FA", "#FDF6F9", "#FBF5F9", "#FAF3F8", 15 | "#F8F2F7", "#F7F0F7", "#F5EEF6", "#F4EDF5", 16 | "#F2EBF5", "#F1EAF4", "#EFE8F3", "#EDE7F2", 17 | "#ECE5F1", "#EAE4F1", "#E8E2F0", "#E6E1EF", 18 | "#E4DFEE", "#E2DEED", "#E0DCEC", "#DEDAEB", 19 | "#DCD9EA", "#D9D7E9", "#D7D6E8", "#D4D4E7", 20 | "#D1D2E6", "#CED1E5", "#CCCFE4", "#C8CEE3", 21 | "#C5CCE2", "#C2CAE1", "#BFC9E0", "#BBC7DF", 22 | "#B8C5DF", "#B4C4DE", "#B1C2DD", "#ADC0DC", 23 | "#A9BFDB", "#A6BDDA", "#A2BBD9", "#9EB9D9", 24 | "#9BB8D8", "#97B6D7", "#93B4D6", "#8FB2D5", 25 | "#8BB0D4", "#87AFD3", "#83ADD2", "#7FABD1", 26 | "#7AA9D0", "#76A7CF", "#71A5CE", "#6CA3CC", 27 | "#68A1CB", "#63A0CA", "#5D9EC9", "#589CC8", 28 | "#539AC6", "#4E98C5", "#4996C4", "#4493C3", 29 | "#3F91C1", "#3A8FC0", "#358DBF", "#308BBE", 30 | "#2C89BD", "#2887BC", "#2385BB", "#1F83BA", 31 | "#1C80B9", "#187EB7", "#157CB6", "#127AB5", 32 | "#0F78B3", "#0D76B2", "#0A73B0", "#0971AE", 33 | "#076FAC", "#066DAA", "#056AA7", "#0568A5") 34 | ), 35 | strip.shingle = list(col = c( 36 | "#ff7f00", "#00ff00", "#00ffff", 37 | "#ff00ff", "#ff0000", "#ffff00", 
"#0080ff")), 38 | strip.background = list(col = c( 39 | "#ffe5cc", "#ccffcc", "#ccffff", 40 | "#ffccff", "#ffcccc", "#ffffcc", "#cce6ff")), 41 | reference.line = list(col = "#e8e8e8"), 42 | superpose.line = list( 43 | col = c( 44 | rgb(1, 0, 0, trans), rgb(0, 0, 1, trans), 45 | rgb(0.3984375, 0.7578125, 0.6445312, max(.6, trans)), 46 | rgb(0, 0, 0, trans)), 47 | lty = rep(1:2, 6)), 48 | superpose.symbol = list( 49 | pch = c(16, 15, 17, 18, 16), 50 | cex = rep(pchSize, 5), 51 | col = c( 52 | rgb(1, 0, 0, trans), rgb(0, 0, 1, trans), 53 | rgb(0.3984375, 0.7578125, 0.6445312, max(.6, trans)), 54 | rgb(0, 0, 0, trans))) 55 | ) 56 | 57 | if(set) trellis.par.set(theme, warn = FALSE) 58 | invisible(theme) 59 | } 60 | -------------------------------------------------------------------------------- /man/solubility.Rd: -------------------------------------------------------------------------------- 1 | \name{solubility} 2 | \docType{data} 3 | \alias{trainX} 4 | \alias{solTestXtrans} 5 | \alias{solTrainY} 6 | \alias{solTestX} 7 | \alias{solTrainX} 8 | \alias{solTrainXtrans} 9 | \alias{solTestY} 10 | \title{Solubility Data} 11 | \description{ 12 | Tetko et al. (2001) and Huuskonen (2000) investigated a set of compounds with corresponding experimental solubility values using complex sets of descriptors. They used linear regression and neural network models to estimate the relationship between chemical structure and solubility. For our analyses, we will use 1267 compounds and a set of more understandable descriptors that fall into one of three groups: 208 binary "fingerprints" that indicate the presence or absence of a particular chemical sub-structure, 16 count descriptors (such as the number of bonds or the number of Bromine atoms) and 4 continuous descriptors (such as molecular weight or surface area). 13 | } 14 | 15 | \usage{data(solubility)} 16 | 17 | \value{ 18 | \item{solTrainX}{training set predictors in their natural units. 
} 19 | \item{solTrainXtrans}{training set predictors after transformations for skewness and centering/scaling.} 20 | \item{solTrainY}{a vector of log10 solubility values for the training set. } 21 | \item{solTestX}{test set predictors in their natural units. } 22 | \item{solTestXtrans}{test set predictors after the same transformations used on the training set are applied.} 23 | \item{solTestY}{a vector of log10 solubility values for the test set. } 24 | } 25 | 26 | 27 | \source{Tetko, I., Tanchuk, V., Kasheva, T., and Villa, A. (2001). Estimation of aqueous solubility of chemical compounds using E-state indices. \emph{Journal of Chemical Information and Computer Sciences}, 41(6), 1488-1493. 28 | 29 | Huuskonen, J. (2000). Estimation of aqueous solubility for a diverse set of organic compounds based on molecular topology. \emph{Journal of Chemical Information and Computer Sciences}, 40(3), 773-777. 30 | } 31 | 32 | \examples{ 33 | data(solubility) 34 | 35 | library(caret) 36 | 37 | ### Cross-validation splits used in the book: 38 | set.seed(100) 39 | indx <- createFolds(solTrainY, returnTrain = TRUE) 40 | 41 | ### To re-create the transformed version of the data: 42 | \dontrun{ 43 | ## Find the predictors that are not fingerprints 44 | contVars <- names(solTrainX)[!grepl("FP", names(solTrainX))] 45 | ## Some have zero values, so we need to add one to them so that 46 | ## we can use the Box-Cox transformation. Alternatively, we could 47 | ## use the Yeo-Johnson transformation without altering the data. 48 | contPredTrain <- solTrainX[,contVars] + 1 49 | contPredTest <- solTestX[,contVars] + 1 50 | 51 | pp <- preProcess(contPredTrain, method = "BoxCox") 52 | contPredTrain <- predict(pp, contPredTrain) 53 | contPredTest <- predict(pp, contPredTest) 54 | 55 | ## Reassemble the fingerprint data with the transformed values.
56 | trainXtrans <- cbind(solTrainX[,grep("FP", names(solTrainX))], contPredTrain) 57 | testXtrans <- cbind( solTestX[,grep("FP", names(solTestX))], contPredTest) 58 | 59 | all.equal(trainXtrans, solTrainXtrans) 60 | all.equal(testXtrans, solTestXtrans) 61 | } 62 | 63 | } 64 | 65 | \keyword{datasets} 66 | 67 | -------------------------------------------------------------------------------- /R/bookTheme.R: -------------------------------------------------------------------------------- 1 | bookTheme <- function(set = TRUE){ 2 | theme <- list( 3 | plot.polygon = list(alpha = 1, col = "aliceblue", border = "black", lty = 1, lwd = 1), 4 | background = list(col = "transparent"), 5 | bar.fill = list(col = "#cce6ff"), 6 | box.rectangle = list(col = "black"), 7 | box.umbrella = list(col = "black"), 8 | dot.line = list(col = "#e8e8e8"), 9 | dot.symbol = list(col = "black"), 10 | plot.line = list(col = "black", lwd = 1, lty = 1), 11 | plot.symbol = list(col = "black", pch = 16), 12 | regions = list(col = 13 | c("#FEF8FA", "#FDF6F9", "#FBF5F9", "#FAF3F8", 14 | "#F8F2F7", "#F7F0F7", "#F5EEF6", "#F4EDF5", 15 | "#F2EBF5", "#F1EAF4", "#EFE8F3", "#EDE7F2", 16 | "#ECE5F1", "#EAE4F1", "#E8E2F0", "#E6E1EF", 17 | "#E4DFEE", "#E2DEED", "#E0DCEC", "#DEDAEB", 18 | "#DCD9EA", "#D9D7E9", "#D7D6E8", "#D4D4E7", 19 | "#D1D2E6", "#CED1E5", "#CCCFE4", "#C8CEE3", 20 | "#C5CCE2", "#C2CAE1", "#BFC9E0", "#BBC7DF", 21 | "#B8C5DF", "#B4C4DE", "#B1C2DD", "#ADC0DC", 22 | "#A9BFDB", "#A6BDDA", "#A2BBD9", "#9EB9D9", 23 | "#9BB8D8", "#97B6D7", "#93B4D6", "#8FB2D5", 24 | "#8BB0D4", "#87AFD3", "#83ADD2", "#7FABD1", 25 | "#7AA9D0", "#76A7CF", "#71A5CE", "#6CA3CC", 26 | "#68A1CB", "#63A0CA", "#5D9EC9", "#589CC8", 27 | "#539AC6", "#4E98C5", "#4996C4", "#4493C3", 28 | "#3F91C1", "#3A8FC0", "#358DBF", "#308BBE", 29 | "#2C89BD", "#2887BC", "#2385BB", "#1F83BA", 30 | "#1C80B9", "#187EB7", "#157CB6", "#127AB5", 31 | "#0F78B3", "#0D76B2", "#0A73B0", "#0971AE", 32 | "#076FAC", "#066DAA", "#056AA7", "#0568A5") 33 | ), 
34 | strip.shingle = list(col = c( 35 | "#ff7f00", "#00ff00", "#00ffff", 36 | "#ff00ff", "#ff0000", "#ffff00", "#0080ff") 37 | ), 38 | strip.background = list(col = c( 39 | "#ffe5cc", "#ccffcc", "#ccffff", 40 | "#ffccff", "#ffcccc", "#ffffcc", "#cce6ff") 41 | ), 42 | reference.line = list(col = "#e8e8e8"), 43 | superpose.line = list( 44 | col = c( 45 | "#053061", "#B2182B", "#F46D43", "#5E4FA2", "#66C2A5", "black", 46 | "#053061", "#B2182B", "#F46D43", "#5E4FA2", "#66C2A5", "black", 47 | "#053061", "#B2182B", "#F46D43", "#5E4FA2", "#66C2A5", "black", 48 | "#053061", "#B2182B", "#F46D43", "#5E4FA2", "#66C2A5", "black", 49 | "#053061", "#B2182B", "#F46D43", "#5E4FA2", "#66C2A5", "black", 50 | "#053061", "#B2182B", "#F46D43", "#5E4FA2", "#66C2A5", "black" 51 | ), 52 | lty = rep(1:6, each = 6)), 53 | superpose.symbol = list( 54 | pch = c( 55 | 1, 4, 6, 0, 5, 17, 56 | 4, 6, 0, 5, 17, 1, 57 | 6, 0, 5, 17, 1, 4, 58 | 0, 5, 17, 1, 4, 6, 59 | 5, 17, 1, 4, 6, 0 , 60 | 17, 1, 4, 6, 0, 5), 61 | cex = rep(0.7, 6 * 6), 62 | col = c( 63 | "#053061", "#B2182B", "#F46D43", "#5E4FA2", "#66C2A5", "black", 64 | "#053061", "#B2182B", "#F46D43", "#5E4FA2", "#66C2A5", "black", 65 | "#053061", "#B2182B", "#F46D43", "#5E4FA2", "#66C2A5", "black", 66 | "#053061", "#B2182B", "#F46D43", "#5E4FA2", "#66C2A5", "black", 67 | "#053061", "#B2182B", "#F46D43", "#5E4FA2", "#66C2A5", "black", 68 | "#053061", "#B2182B", "#F46D43", "#5E4FA2", "#66C2A5", "black" 69 | ) 70 | ) 71 | ) 72 | 73 | if(set) trellis.par.set(theme) 74 | invisible(theme) 75 | } 76 | -------------------------------------------------------------------------------- /inst/chapters/02_A_Short_Tour.R: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | ### R code from Applied Predictive Modeling (2013) by Kuhn and Johnson. 
3 | ### Copyright 2013 Kuhn and Johnson 4 | ### Web Page: http://www.appliedpredictivemodeling.com 5 | ### Contact: Max Kuhn (mxkuhn@gmail.com) 6 | ### 7 | ### Chapter 2: A Short Tour of the Predictive Modeling Process 8 | ### 9 | ### Required packages: AppliedPredictiveModeling, earth, caret, lattice 10 | ### 11 | ### Data used: The FuelEconomy data in the AppliedPredictiveModeling package 12 | ### 13 | ### Notes: 14 | ### 1) This code is provided without warranty. 15 | ### 16 | ### 2) This code should help the user reproduce the results in the 17 | ### text. There will be differences between this code and what is is 18 | ### the computing section. For example, the computing sections show 19 | ### how the source functions work (e.g. randomForest() or plsr()), 20 | ### which were not directly used when creating the book. Also, there may be 21 | ### syntax differences that occur over time as packages evolve. These files 22 | ### will reflect those changes. 23 | ### 24 | ### 3) In some cases, the calculations in the book were run in 25 | ### parallel. The sub-processes may reset the random number seed. 26 | ### Your results may slightly vary. 
27 | ### 28 | ################################################################################ 29 | 30 | ################################################################################ 31 | ### Section 2.1 Case Study: Predicting Fuel Economy 32 | 33 | library(AppliedPredictiveModeling) 34 | data(FuelEconomy) 35 | 36 | ## Format data for plotting against engine displacement 37 | 38 | ## Sort by engine displacement 39 | cars2010 <- cars2010[order(cars2010$EngDispl),] 40 | cars2011 <- cars2011[order(cars2011$EngDispl),] 41 | 42 | ## Combine data into one data frame 43 | cars2010a <- cars2010 44 | cars2010a$Year <- "2010 Model Year" 45 | cars2011a <- cars2011 46 | cars2011a$Year <- "2011 Model Year" 47 | 48 | plotData <- rbind(cars2010a, cars2011a) 49 | 50 | library(lattice) 51 | xyplot(FE ~ EngDispl|Year, plotData, 52 | xlab = "Engine Displacement", 53 | ylab = "Fuel Efficiency (MPG)", 54 | between = list(x = 1.2)) 55 | 56 | ## Fit a single linear model and conduct 10-fold CV to estimate the error 57 | library(caret) 58 | set.seed(1) 59 | lm1Fit <- train(FE ~ EngDispl, 60 | data = cars2010, 61 | method = "lm", 62 | trControl = trainControl(method= "cv")) 63 | lm1Fit 64 | 65 | 66 | ## Fit a quadratic model too 67 | 68 | ## Create squared terms 69 | cars2010$ED2 <- cars2010$EngDispl^2 70 | cars2011$ED2 <- cars2011$EngDispl^2 71 | 72 | set.seed(1) 73 | lm2Fit <- train(FE ~ EngDispl + ED2, 74 | data = cars2010, 75 | method = "lm", 76 | trControl = trainControl(method= "cv")) 77 | lm2Fit 78 | 79 | ## Finally a MARS model (via the earth package) 80 | 81 | library(earth) 82 | set.seed(1) 83 | marsFit <- train(FE ~ EngDispl, 84 | data = cars2010, 85 | method = "earth", 86 | tuneLength = 15, 87 | trControl = trainControl(method= "cv")) 88 | marsFit 89 | 90 | plot(marsFit) 91 | 92 | ## Predict the test set data 93 | cars2011$lm1 <- predict(lm1Fit, cars2011) 94 | cars2011$lm2 <- predict(lm2Fit, cars2011) 95 | cars2011$mars <- predict(marsFit, cars2011) 96 | 97 | ## Get test set 
performance values via caret's postResample function 98 | 99 | postResample(pred = cars2011$lm1, obs = cars2011$FE) 100 | postResample(pred = cars2011$lm2, obs = cars2011$FE) 101 | postResample(pred = cars2011$mars, obs = cars2011$FE) 102 | 103 | ################################################################################ 104 | ### Session Information 105 | 106 | sessionInfo() 107 | 108 | q("no") 109 | 110 | 111 | -------------------------------------------------------------------------------- /MD5: -------------------------------------------------------------------------------- 1 | d9a47751fc337024c1a1c41e1da27221 *DESCRIPTION 2 | f78a4ad513d2c6c72c50fb7c8c3c4fa6 *NAMESPACE 3 | 4ba4117b0411d3dffc0016338e086487 *R/bookTheme.R 4 | ba7591c480e18250a36afca890626919 *R/easyBoundaryFunc.R 5 | 4bcab4abad6714d7dded5c1bc3924c0c *R/getPackages.R 6 | 4eeffcbe6c341af031b5ca224ddc01e3 *R/panels.R 7 | 2cd8668d7140eff6b9bbb10e281ff643 *R/permuteRelief.R 8 | f8804618082623af1ab53f6eda5d6c6e *R/quadBoundaryFunc.R 9 | ea3ad783d73b8ecd91c3df840f944bfe *R/scriptLocation.R 10 | b7ba633adffdafee73c75abba35be85c *R/transparentTheme.R 11 | cb9165768774234eeda3f6e724fa4bae *README.md 12 | 8a34126ad3a2f9d077653b26d950dddb *data/AlzheimerDisease.RData 13 | fe3de40e923db3e0133b269f1610afa2 *data/ChemicalManufacturingProcess.RData 14 | ef3addd28ad9449688f0c33ba9bfc2d0 *data/FuelEconomy.RData 15 | 833d3d4a90e6afe16ec007d5fc628cd2 *data/abalone.RData 16 | 8fe13332a2419a2c253fb51c396f6000 *data/concrete.RData 17 | 8fc335fdac839cc09b91ff3b5595f00b *data/datalist 18 | 812edff8771d7faaada62fdb2c662e40 *data/hepatic.RData 19 | 7d5b8064233260c344793aaecc045136 *data/logisticCreditPredictions.RData 20 | ce7019b604378875d2a55e32dde9c05a *data/permeability.RData 21 | e1590269851cf810fdffa832b6cf6d65 *data/schedulingData.RData 22 | 669172e9b524f9194a23fbc84a2816f8 *data/segmentationOriginal.RData 23 | 06780bd86a4db76cb2a8eb12ef107df7 *data/solubility.RData 24 | 5e5422a8c05125f3ab1822f6c525296a 
*data/twoClassData.RData 25 | 7973bb1a98c7ee6c2e08604fbd923d59 *inst/NEWS.Rd 26 | 55afb317aa767a6e82c6c52ee985563f *inst/chapters/02_A_Short_Tour.R 27 | 5a487c219abde639b85d7275c6a4bf31 *inst/chapters/02_A_Short_Tour.Rout 28 | ec4768cf8bf24124e998a1ce680dceb6 *inst/chapters/03_Data_Pre_Processing.R 29 | 7676eb3d1f98e148e846a4dc2a1a2966 *inst/chapters/03_Data_Pre_Processing.Rout 30 | ebd06017c5910d16195af333a30aa3c1 *inst/chapters/04_Over_Fitting.R 31 | 1f7b3293506bf590766d34324a3364a2 *inst/chapters/04_Over_Fitting.Rout 32 | 161fc8e0549cc8cfbe631c867a87b352 *inst/chapters/06_Linear_Regression.R 33 | 421935c8783eae8dd82a43f7d7e48c91 *inst/chapters/06_Linear_Regression.Rout 34 | 35b94de4347d8077de03148192f55bc5 *inst/chapters/07_Non-Linear_Reg.R 35 | 30704984b3e08972410cb3d950df44ae *inst/chapters/07_Non-Linear_Reg.Rout 36 | 2f4a5558ef2f471fff8a9cd0aa6aebbb *inst/chapters/08_Regression_Trees.R 37 | 9a3ae5858f7dc208d50eceb7d4e7e982 *inst/chapters/08_Regression_Trees.Rout 38 | e1784aacc84cac62daa177d25c2d392c *inst/chapters/10_Case_Study_Concrete.R 39 | ea537049d1f8c99b19b9b17725fdf2fc *inst/chapters/10_Case_Study_Concrete.Rout 40 | c3693cc6dc941a60844d864cf264db28 *inst/chapters/11_Class_Performance.R 41 | 8a4f757b7c72ab2081f7446685a8b529 *inst/chapters/11_Class_Performance.Rout 42 | ea46874fb704eacf0251f85a14840e1e *inst/chapters/12_Discriminant_Analysis.R 43 | e929e79c48473605d1d2a86353e39520 *inst/chapters/12_Discriminant_Analysis.Rout 44 | 31024776ecc67808d7cb140b89971636 *inst/chapters/13_Non-Linear_Class.R 45 | 1e884bec1b83a834adc6e571058b778f *inst/chapters/13_Non-Linear_Class.Rout 46 | e9e1cedc8663992b155812a15abab389 *inst/chapters/14_Class_Trees.R 47 | 367863eba7ddbd2e7fc740b8a3ef41ea *inst/chapters/14_Class_Trees.Rout 48 | 24635d76b796d99d385ea39963c05709 *inst/chapters/16_Class_Imbalance.R 49 | 5748314017abcfde3e772bb0bffaa540 *inst/chapters/16_Class_Imbalance.Rout 50 | eb873435aec408cbb728e95120df75c4 *inst/chapters/17_Job_Scheduling.R 51 | 
53b627e8a907d84ac8657eee31adad9d *inst/chapters/17_Job_Scheduling.Rout 52 | cdd39d98758aa17566201c45150265b8 *inst/chapters/18_Importance.R 53 | 4288e31b3484d3719f98cf377e756e7f *inst/chapters/18_Importance.Rout 54 | 001bc824c0505d4c462039b112364d9d *inst/chapters/19_Feature_Select.R 55 | 487d84200b36ed46159a608ec81fedc0 *inst/chapters/19_Feature_Select.Rout 56 | 3afe6f6859238c711c0ce0ba33678051 *inst/chapters/CreateGrantData.R 57 | 8a85f9749667d240d73b57de77df5b31 *inst/chapters/CreateGrantData.Rout 58 | 6a51123bb7533bc6ac7cc60e20c30f7c *man/AlzheimerDisease.Rd 59 | 79b66304686ea5f41624e941a839f783 *man/AppliedPredictiveModeling-package.Rd 60 | b5c2029d7b9d21d128b3084b108404a8 *man/ChemicalManufacturingProcess.Rd 61 | b8fb23f2d87770651df5c0b9ab178180 *man/FuelEconomy.Rd 62 | a114aed8c4e19f6e471f76aa10607efc *man/Hepatic.Rd 63 | bb766d31a2c9a73fb64a83ad8edcbf9d *man/abalone.Rd 64 | 7b4f4f04359281d886b4dd90765c2a29 *man/bookTheme.Rd 65 | 1a3a9b303f7a599f89cd2292a359511e *man/concrete.Rd 66 | 20acadadd6df9fce573aea2b7ee52020 *man/getPackages.Rd 67 | 8a02ecb81e0750c23ee2711b492dde91 *man/internal.Rd 68 | f3f357c33a2b2433fee6c8cecd4876fe *man/jobScheduling.Rd 69 | e422ed025d73fac0cd25de1a2146af1a *man/logisticCreditPredictions.Rd 70 | 1b347ea3e594dc8f4b0081d90b5764a1 *man/permeability.Rd 71 | 8559d6e7451f0060acb9b80827bb5fc3 *man/permuteRelief.Rd 72 | a0b9d85cec1c624144825536cc0b4993 *man/quadBoundaryFunc.Rd 73 | 94865b7fd486f04a94e7dae86599f242 *man/scriptLocation.Rd 74 | d242e9c533e5abb92513999c16dd91d1 *man/segmentationOrignal.Rd 75 | 1e481abc63c674153b4b7c700c7d830f *man/solubility.Rd 76 | bc21567d7b20d731be212decec057ab5 *man/twoClassData.Rd 77 | -------------------------------------------------------------------------------- /inst/chapters/07_Non-Linear_Reg.R: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | ### R code from Applied Predictive 
Modeling (2013) by Kuhn and Johnson. 3 | ### Copyright 2013 Kuhn and Johnson 4 | ### Web Page: http://www.appliedpredictivemodeling.com 5 | ### Contact: Max Kuhn (mxkuhn@gmail.com) 6 | ### 7 | ### Chapter 7: Non-Linear Regression Models 8 | ### 9 | ### Required packages: AppliedPredictiveModeling, caret, doMC (optional), earth, 10 | ### kernlab, lattice, nnet 11 | ### 12 | ### Data used: The solubility from the AppliedPredictiveModeling package 13 | ### 14 | ### Notes: 15 | ### 1) This code is provided without warranty. 16 | ### 17 | ### 2) This code should help the user reproduce the results in the 18 | ### text. There will be differences between this code and what is is 19 | ### the computing section. For example, the computing sections show 20 | ### how the source functions work (e.g. randomForest() or plsr()), 21 | ### which were not directly used when creating the book. Also, there may be 22 | ### syntax differences that occur over time as packages evolve. These files 23 | ### will reflect those changes. 24 | ### 25 | ### 3) In some cases, the calculations in the book were run in 26 | ### parallel. The sub-processes may reset the random number seed. 27 | ### Your results may slightly vary. 28 | ### 29 | ################################################################################ 30 | 31 | ################################################################################ 32 | ### Load the data 33 | 34 | library(AppliedPredictiveModeling) 35 | data(solubility) 36 | 37 | ### Create a control funciton that will be used across models. We 38 | ### create the fold assignments explictily instead of relying on the 39 | ### random number seed being set to identical values. 
40 | 41 | library(caret) 42 | set.seed(100) 43 | indx <- createFolds(solTrainY, returnTrain = TRUE) 44 | ctrl <- trainControl(method = "cv", index = indx) 45 | 46 | ################################################################################ 47 | ### Section 7.1 Neural Networks 48 | 49 | ### Optional: parallel processing can be used via the 'do' packages, 50 | ### such as doMC, doMPI etc. We used doMC (not on Windows) to speed 51 | ### up the computations. 52 | 53 | ### WARNING: Be aware of how much memory is needed to parallel 54 | ### process. It can very quickly overwhelm the availible hardware. We 55 | ### estimate the memory usuage (VSIZE = total memory size) to be 56 | ### 2677M/core. 57 | 58 | library(doMC) 59 | registerDoMC(10) 60 | 61 | 62 | library(caret) 63 | 64 | nnetGrid <- expand.grid(decay = c(0, 0.01, .1), 65 | size = c(1, 3, 5, 7, 9, 11, 13), 66 | bag = FALSE) 67 | 68 | set.seed(100) 69 | nnetTune <- train(x = solTrainXtrans, y = solTrainY, 70 | method = "avNNet", 71 | tuneGrid = nnetGrid, 72 | trControl = ctrl, 73 | preProc = c("center", "scale"), 74 | linout = TRUE, 75 | trace = FALSE, 76 | MaxNWts = 13 * (ncol(solTrainXtrans) + 1) + 13 + 1, 77 | maxit = 1000, 78 | allowParallel = FALSE) 79 | nnetTune 80 | 81 | plot(nnetTune) 82 | 83 | testResults <- data.frame(obs = solTestY, 84 | NNet = predict(nnetTune, solTestXtrans)) 85 | 86 | ################################################################################ 87 | ### Section 7.2 Multivariate Adaptive Regression Splines 88 | 89 | set.seed(100) 90 | marsTune <- train(x = solTrainXtrans, y = solTrainY, 91 | method = "earth", 92 | tuneGrid = expand.grid(degree = 1, nprune = 2:38), 93 | trControl = ctrl) 94 | marsTune 95 | 96 | plot(marsTune) 97 | 98 | testResults$MARS <- predict(marsTune, solTestXtrans) 99 | 100 | marsImp <- varImp(marsTune, scale = FALSE) 101 | plot(marsImp, top = 25) 102 | 103 | ################################################################################ 104 | ### 
Section 7.3 Support Vector Machines 105 | 106 | ## In a recent update to caret, the method to estimate the 107 | ## sigma parameter was slightly changed. These results will 108 | ## slightly differ from the text for that reason. 109 | 110 | set.seed(100) 111 | svmRTune <- train(x = solTrainXtrans, y = solTrainY, 112 | method = "svmRadial", 113 | preProc = c("center", "scale"), 114 | tuneLength = 14, 115 | trControl = ctrl) 116 | svmRTune 117 | plot(svmRTune, scales = list(x = list(log = 2))) 118 | 119 | svmGrid <- expand.grid(degree = 1:2, 120 | scale = c(0.01, 0.005, 0.001), 121 | C = 2^(-2:5)) 122 | set.seed(100) 123 | svmPTune <- train(x = solTrainXtrans, y = solTrainY, 124 | method = "svmPoly", 125 | preProc = c("center", "scale"), 126 | tuneGrid = svmGrid, 127 | trControl = ctrl) 128 | 129 | svmPTune 130 | plot(svmPTune, 131 | scales = list(x = list(log = 2), 132 | between = list(x = .5, y = 1))) 133 | 134 | testResults$SVMr <- predict(svmRTune, solTestXtrans) 135 | testResults$SVMp <- predict(svmPTune, solTestXtrans) 136 | 137 | ################################################################################ 138 | ### Section 7.4 K-Nearest Neighbors 139 | 140 | ### First we remove near-zero variance predictors 141 | knnDescr <- solTrainXtrans[, -nearZeroVar(solTrainXtrans)] 142 | 143 | set.seed(100) 144 | knnTune <- train(x = knnDescr, y = solTrainY, 145 | method = "knn", 146 | preProc = c("center", "scale"), 147 | tuneGrid = data.frame(k = 1:20), 148 | trControl = ctrl) 149 | 150 | knnTune 151 | 152 | plot(knnTune) 153 | 154 | testResults$Knn <- predict(knnTune, solTestXtrans[, names(knnDescr)]) ## fixed: was predict(svmRTune, ...), which stored SVM predictions in the Knn column 155 | 156 | ################################################################################ 157 | ### Session Information 158 | 159 | sessionInfo() 160 | 161 | q("no") 162 | 163 | 164 | -------------------------------------------------------------------------------- /inst/chapters/06_Linear_Regression.R:
-------------------------------------------------------------------------------- 1 | ################################################################################ 2 | ### R code from Applied Predictive Modeling (2013) by Kuhn and Johnson. 3 | ### Copyright 2013 Kuhn and Johnson 4 | ### Web Page: http://www.appliedpredictivemodeling.com 5 | ### Contact: Max Kuhn (mxkuhn@gmail.com) 6 | ### 7 | ### Chapter 6: Linear Regression and Its Cousins 8 | ### 9 | ### Required packages: AppliedPredictiveModeling, lattice, corrplot, pls, 10 | ### elasticnet, 11 | ### 12 | ### Data used: The solubility data from the AppliedPredictiveModeling package 13 | ### 14 | ### Notes: 15 | ### 1) This code is provided without warranty. 16 | ### 17 | ### 2) This code should help the user reproduce the results in the 18 | ### text. There will be differences between this code and what is in 19 | ### the computing section. For example, the computing sections show 20 | ### how the source functions work (e.g. randomForest() or plsr()), 21 | ### which were not directly used when creating the book. Also, there may be 22 | ### syntax differences that occur over time as packages evolve. These files 23 | ### will reflect those changes. 24 | ### 25 | ### 3) In some cases, the calculations in the book were run in 26 | ### parallel. The sub-processes may reset the random number seed. 27 | ### Your results may slightly vary.
28 | ### 29 | ################################################################################ 30 | 31 | ################################################################################ 32 | ### Section 6.1 Case Study: Quantitative Structure- Activity 33 | ### Relationship Modeling 34 | 35 | library(AppliedPredictiveModeling) 36 | data(solubility) 37 | 38 | library(lattice) 39 | 40 | ### Some initial plots of the data 41 | 42 | xyplot(solTrainY ~ solTrainX$MolWeight, type = c("p", "g"), 43 | ylab = "Solubility (log)", 44 | main = "(a)", 45 | xlab = "Molecular Weight") 46 | xyplot(solTrainY ~ solTrainX$NumRotBonds, type = c("p", "g"), 47 | ylab = "Solubility (log)", 48 | xlab = "Number of Rotatable Bonds") 49 | bwplot(solTrainY ~ ifelse(solTrainX[,100] == 1, 50 | "structure present", 51 | "structure absent"), 52 | ylab = "Solubility (log)", 53 | main = "(b)", 54 | horizontal = FALSE) 55 | 56 | ### Find the columns that are not fingerprints (i.e. the continuous 57 | ### predictors). grep will return a list of integers corresponding to 58 | ### column names that contain the pattern "FP". 59 | 60 | notFingerprints <- grep("FP", names(solTrainXtrans)) 61 | 62 | library(caret) 63 | featurePlot(solTrainXtrans[, -notFingerprints], 64 | solTrainY, 65 | between = list(x = 1, y = 1), 66 | type = c("g", "p", "smooth"), 67 | labels = rep("", 2)) 68 | 69 | library(corrplot) 70 | 71 | ### We used the full namespace to call this function because the pls 72 | ### package (also used in this chapter) has a function with the same 73 | ### name. 74 | 75 | corrplot::corrplot(cor(solTrainXtrans[, -notFingerprints]), 76 | order = "hclust", 77 | tl.cex = .8) 78 | 79 | ################################################################################ 80 | ### Section 6.2 Linear Regression 81 | 82 | ### Create a control function that will be used across models. 
We 83 | ### create the fold assignments explicitly instead of relying on the 84 | ### random number seed being set to identical values. 85 | 86 | set.seed(100) 87 | indx <- createFolds(solTrainY, returnTrain = TRUE) 88 | ctrl <- trainControl(method = "cv", index = indx) 89 | 90 | ### Linear regression model with all of the predictors. This will 91 | ### produce some warnings that a 'rank-deficient fit may be 92 | ### misleading'. This is related to the predictors being so highly 93 | ### correlated that some of the math has broken down. 94 | 95 | set.seed(100) 96 | lmTune0 <- train(x = solTrainXtrans, y = solTrainY, 97 | method = "lm", 98 | trControl = ctrl) 99 | 100 | lmTune0 101 | 102 | ### And another using a set of predictors reduced by unsupervised 103 | ### filtering. We apply a filter to reduce extreme between-predictor 104 | ### correlations. Note the lack of warnings. 105 | 106 | tooHigh <- findCorrelation(cor(solTrainXtrans), .9) 107 | trainXfiltered <- solTrainXtrans[, -tooHigh] 108 | testXfiltered <- solTestXtrans[, -tooHigh] 109 | 110 | set.seed(100) 111 | lmTune <- train(x = trainXfiltered, y = solTrainY, 112 | method = "lm", 113 | trControl = ctrl) 114 | 115 | lmTune 116 | 117 | ### Save the test set results in a data frame 118 | testResults <- data.frame(obs = solTestY, 119 | Linear_Regression = predict(lmTune, testXfiltered)) 120 | 121 | 122 | ################################################################################ 123 | ### Section 6.3 Partial Least Squares 124 | 125 | ## Run PLS and PCR on solubility data and compare results 126 | set.seed(100) 127 | plsTune <- train(x = solTrainXtrans, y = solTrainY, 128 | method = "pls", 129 | tuneGrid = expand.grid(ncomp = 1:20), 130 | trControl = ctrl) 131 | plsTune 132 | 133 | testResults$PLS <- predict(plsTune, solTestXtrans) 134 | 135 | set.seed(100) 136 | pcrTune <- train(x = solTrainXtrans, y = solTrainY, 137 | method = "pcr", 138 | tuneGrid = expand.grid(ncomp = 1:35), 139 | trControl = ctrl) 
140 | pcrTune 141 | 142 | plsResamples <- plsTune$results 143 | plsResamples$Model <- "PLS" 144 | pcrResamples <- pcrTune$results 145 | pcrResamples$Model <- "PCR" 146 | plsPlotData <- rbind(plsResamples, pcrResamples) 147 | 148 | xyplot(RMSE ~ ncomp, 149 | data = plsPlotData, 150 | #aspect = 1, 151 | xlab = "# Components", 152 | ylab = "RMSE (Cross-Validation)", 153 | auto.key = list(columns = 2), 154 | groups = Model, 155 | type = c("o", "g")) 156 | 157 | plsImp <- varImp(plsTune, scale = FALSE) 158 | plot(plsImp, top = 25, scales = list(y = list(cex = .95))) 159 | 160 | ################################################################################ 161 | ### Section 6.4 Penalized Models 162 | 163 | ## The text used the elasticnet to obtain a ridge regression model. 164 | ## There is now a simple ridge regression method. 165 | 166 | ridgeGrid <- expand.grid(lambda = seq(0, .1, length = 15)) 167 | 168 | set.seed(100) 169 | ridgeTune <- train(x = solTrainXtrans, y = solTrainY, 170 | method = "ridge", 171 | tuneGrid = ridgeGrid, 172 | trControl = ctrl, 173 | preProc = c("center", "scale")) 174 | ridgeTune 175 | 176 | print(update(plot(ridgeTune), xlab = "Penalty")) 177 | 178 | 179 | enetGrid <- expand.grid(lambda = c(0, 0.01, .1), 180 | fraction = seq(.05, 1, length = 20)) 181 | set.seed(100) 182 | enetTune <- train(x = solTrainXtrans, y = solTrainY, 183 | method = "enet", 184 | tuneGrid = enetGrid, 185 | trControl = ctrl, 186 | preProc = c("center", "scale")) 187 | enetTune 188 | 189 | plot(enetTune) 190 | 191 | testResults$Enet <- predict(enetTune, solTestXtrans) 192 | 193 | ################################################################################ 194 | ### Session Information 195 | 196 | sessionInfo() 197 | 198 | q("no") 199 | 200 | 201 | 202 | -------------------------------------------------------------------------------- /inst/chapters/02_A_Short_Tour.Rout: -------------------------------------------------------------------------------- 1 | 2 | R 
version 3.0.1 (2013-05-16) -- "Good Sport" 3 | Copyright (C) 2013 The R Foundation for Statistical Computing 4 | Platform: x86_64-apple-darwin10.8.0 (64-bit) 5 | 6 | R is free software and comes with ABSOLUTELY NO WARRANTY. 7 | You are welcome to redistribute it under certain conditions. 8 | Type 'license()' or 'licence()' for distribution details. 9 | 10 | Natural language support but running in an English locale 11 | 12 | R is a collaborative project with many contributors. 13 | Type 'contributors()' for more information and 14 | 'citation()' on how to cite R or R packages in publications. 15 | 16 | Type 'demo()' for some demos, 'help()' for on-line help, or 17 | 'help.start()' for an HTML browser interface to help. 18 | Type 'q()' to quit R. 19 | 20 | > ################################################################################ 21 | > ### R code from Applied Predictive Modeling (2013) by Kuhn and Johnson. 22 | > ### Copyright 2013 Kuhn and Johnson 23 | > ### Web Page: http://www.appliedpredictivemodeling.com 24 | > ### Contact: Max Kuhn (mxkuhn@gmail.com) 25 | > ### 26 | > ### Chapter 2: A Short Tour of the Predictive Modeling Process 27 | > ### 28 | > ### Required packages: AppliedPredictiveModeling, earth, caret, lattice 29 | > ### 30 | > ### Data used: The FuelEconomy data in the AppliedPredictiveModeling package 31 | > ### 32 | > ### Notes: 33 | > ### 1) This code is provided without warranty. 34 | > ### 35 | > ### 2) This code should help the user reproduce the results in the 36 | > ### text. There will be differences between this code and what is is 37 | > ### the computing section. For example, the computing sections show 38 | > ### how the source functions work (e.g. randomForest() or plsr()), 39 | > ### which were not directly used when creating the book. Also, there may be 40 | > ### syntax differences that occur over time as packages evolve. These files 41 | > ### will reflect those changes. 
42 | > ### 43 | > ### 3) In some cases, the calculations in the book were run in 44 | > ### parallel. The sub-processes may reset the random number seed. 45 | > ### Your results may slightly vary. 46 | > ### 47 | > ################################################################################ 48 | > 49 | > ################################################################################ 50 | > ### Section 2.1 Case Study: Predicting Fuel Economy 51 | > 52 | > library(AppliedPredictiveModeling) 53 | > data(FuelEconomy) 54 | > 55 | > ## Format data for plotting against engine displacement 56 | > 57 | > ## Sort by engine displacement 58 | > cars2010 <- cars2010[order(cars2010$EngDispl),] 59 | > cars2011 <- cars2011[order(cars2011$EngDispl),] 60 | > 61 | > ## Combine data into one data frame 62 | > cars2010a <- cars2010 63 | > cars2010a$Year <- "2010 Model Year" 64 | > cars2011a <- cars2011 65 | > cars2011a$Year <- "2011 Model Year" 66 | > 67 | > plotData <- rbind(cars2010a, cars2011a) 68 | > 69 | > library(lattice) 70 | > xyplot(FE ~ EngDispl|Year, plotData, 71 | + xlab = "Engine Displacement", 72 | + ylab = "Fuel Efficiency (MPG)", 73 | + between = list(x = 1.2)) 74 | > 75 | > ## Fit a single linear model and conduct 10-fold CV to estimate the error 76 | > library(caret) 77 | Loading required package: ggplot2 78 | > set.seed(1) 79 | > lm1Fit <- train(FE ~ EngDispl, 80 | + data = cars2010, 81 | + method = "lm", 82 | + trControl = trainControl(method= "cv")) 83 | > lm1Fit 84 | Linear Regression 85 | 86 | 1107 samples 87 | 13 predictors 88 | 89 | No pre-processing 90 | Resampling: Cross-Validated (10 fold) 91 | 92 | Summary of sample sizes: 997, 996, 995, 996, 997, 996, ... 
93 | 94 | Resampling results 95 | 96 | RMSE Rsquared RMSE SD Rsquared SD 97 | 4.6 0.628 0.493 0.0442 98 | 99 | 100 | > 101 | > 102 | > ## Fit a quadratic model too 103 | > 104 | > ## Create squared terms 105 | > cars2010$ED2 <- cars2010$EngDispl^2 106 | > cars2011$ED2 <- cars2011$EngDispl^2 107 | > 108 | > set.seed(1) 109 | > lm2Fit <- train(FE ~ EngDispl + ED2, 110 | + data = cars2010, 111 | + method = "lm", 112 | + trControl = trainControl(method= "cv")) 113 | > lm2Fit 114 | Linear Regression 115 | 116 | 1107 samples 117 | 14 predictors 118 | 119 | No pre-processing 120 | Resampling: Cross-Validated (10 fold) 121 | 122 | Summary of sample sizes: 997, 996, 995, 996, 997, 996, ... 123 | 124 | Resampling results 125 | 126 | RMSE Rsquared RMSE SD Rsquared SD 127 | 4.23 0.684 0.419 0.0421 128 | 129 | 130 | > 131 | > ## Finally a MARS model (via the earth package) 132 | > 133 | > library(earth) 134 | Loading required package: plotmo 135 | Loading required package: plotrix 136 | > set.seed(1) 137 | > marsFit <- train(FE ~ EngDispl, 138 | + data = cars2010, 139 | + method = "earth", 140 | + tuneLength = 15, 141 | + trControl = trainControl(method= "cv")) 142 | > marsFit 143 | Multivariate Adaptive Regression Spline 144 | 145 | 1107 samples 146 | 14 predictors 147 | 148 | No pre-processing 149 | Resampling: Cross-Validated (10 fold) 150 | 151 | Summary of sample sizes: 997, 996, 995, 996, 997, 996, ... 152 | 153 | Resampling results across tuning parameters: 154 | 155 | nprune RMSE Rsquared RMSE SD Rsquared SD 156 | 2 4.3 0.673 0.441 0.0429 157 | 3 4.26 0.68 0.44 0.0395 158 | 4 4.23 0.685 0.449 0.0428 159 | 5 4.25 0.682 0.489 0.0432 160 | 161 | Tuning parameter 'degree' was held constant at a value of 1 162 | RMSE was used to select the optimal model using the smallest value. 163 | The final values used for the model were nprune = 4 and degree = 1. 
164 | > 165 | > plot(marsFit) 166 | > 167 | > ## Predict the test set data 168 | > cars2011$lm1 <- predict(lm1Fit, cars2011) 169 | > cars2011$lm2 <- predict(lm2Fit, cars2011) 170 | > cars2011$mars <- predict(marsFit, cars2011) 171 | > 172 | > ## Get test set performance values via caret's postResample function 173 | > 174 | > postResample(pred = cars2011$lm1, obs = cars2011$FE) 175 | RMSE Rsquared 176 | 5.1625309 0.7018642 177 | > postResample(pred = cars2011$lm2, obs = cars2011$FE) 178 | RMSE Rsquared 179 | 4.7162853 0.7486074 180 | > postResample(pred = cars2011$mars, obs = cars2011$FE) 181 | RMSE Rsquared 182 | 4.6855501 0.7499953 183 | > 184 | > ################################################################################ 185 | > ### Session Information 186 | > 187 | > sessionInfo() 188 | R version 3.0.1 (2013-05-16) 189 | Platform: x86_64-apple-darwin10.8.0 (64-bit) 190 | 191 | locale: 192 | [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8 193 | 194 | attached base packages: 195 | [1] stats graphics grDevices utils datasets methods base 196 | 197 | other attached packages: 198 | [1] earth_3.2-6 plotrix_3.4-7 199 | [3] plotmo_1.3-2 caret_6.0-22 200 | [5] ggplot2_0.9.3.1 lattice_0.20-15 201 | [7] AppliedPredictiveModeling_1.1-5 202 | 203 | loaded via a namespace (and not attached): 204 | [1] car_2.0-17 codetools_0.2-8 colorspace_1.2-2 compiler_3.0.1 205 | [5] CORElearn_0.9.41 dichromat_2.0-0 digest_0.6.3 foreach_1.4.0 206 | [9] grid_3.0.1 gtable_0.1.2 iterators_1.0.6 labeling_0.1 207 | [13] MASS_7.3-26 munsell_0.4 plyr_1.8 proto_0.3-10 208 | [17] RColorBrewer_1.0-5 reshape2_1.2.2 scales_0.2.3 stringr_0.6.2 209 | [21] tools_3.0.1 210 | > 211 | > q("no") 212 | > proc.time() 213 | user system elapsed 214 | 4.971 0.114 5.292 215 | -------------------------------------------------------------------------------- /inst/chapters/11_Class_Performance.R: -------------------------------------------------------------------------------- 1 | 
################################################################################ 2 | ### R code from Applied Predictive Modeling (2013) by Kuhn and Johnson. 3 | ### Copyright 2013 Kuhn and Johnson 4 | ### Web Page: http://www.appliedpredictivemodeling.com 5 | ### Contact: Max Kuhn (mxkuhn@gmail.com) 6 | ### 7 | ### Chapter 11: Measuring Performance in Classification Models 8 | ### 9 | ### Required packages: AppliedPredictiveModeling, caret, MASS, randomForest, 10 | ### pROC, klaR 11 | ### 12 | ### Data used: Simulated two-class data (via quadBoundaryFunc) and the German 13 | ### credit data from the caret package 14 | ### Notes: 15 | ### 1) This code is provided without warranty. 16 | ### 17 | ### 2) This code should help the user reproduce the results in the 18 | ### text. There will be differences between this code and what is in 19 | ### the computing section. For example, the computing sections show 20 | ### how the source functions work (e.g. randomForest() or plsr()), 21 | ### which were not directly used when creating the book. Also, there may be 22 | ### syntax differences that occur over time as packages evolve. These files 23 | ### will reflect those changes. 24 | ### 25 | ### 3) In some cases, the calculations in the book were run in 26 | ### parallel. The sub-processes may reset the random number seed. 27 | ### Your results may slightly vary.
28 | ### 29 | ################################################################################ 30 | 31 | ################################################################################ 32 | ### Section 11.1 Class Predictions 33 | 34 | library(AppliedPredictiveModeling) 35 | 36 | ### Simulate some two class data with two predictors 37 | set.seed(975) 38 | training <- quadBoundaryFunc(500) 39 | testing <- quadBoundaryFunc(1000) 40 | testing$class2 <- ifelse(testing$class == "Class1", 1, 0) 41 | testing$ID <- 1:nrow(testing) 42 | 43 | ### Fit models 44 | library(MASS) 45 | qdaFit <- qda(class ~ X1 + X2, data = training) 46 | library(randomForest) 47 | rfFit <- randomForest(class ~ X1 + X2, data = training, ntree = 2000) 48 | 49 | ### Predict the test set 50 | testing$qda <- predict(qdaFit, testing)$posterior[,1] 51 | testing$rf <- predict(rfFit, testing, type = "prob")[,1] 52 | 53 | 54 | ### Generate the calibration analysis 55 | library(caret) 56 | calData1 <- calibration(class ~ qda + rf, data = testing, cuts = 10) 57 | 58 | ### Plot the curve 59 | xyplot(calData1, auto.key = list(columns = 2)) 60 | 61 | ### To calibrate the data, treat the probabilities as inputs into the 62 | ### model 63 | 64 | trainProbs <- training 65 | trainProbs$qda <- predict(qdaFit)$posterior[,1] 66 | 67 | ### These models take the probabilities as inputs and, based on the 68 | ### true class, re-calibrate them. 69 | library(klaR) 70 | nbCal <- NaiveBayes(class ~ qda, data = trainProbs, usekernel = TRUE) 71 | 72 | ### We use relevel() here because glm() models the probability of the 73 | ### second factor level. 
74 | lrCal <- glm(relevel(class, "Class2") ~ qda, data = trainProbs, family = binomial) 75 | 76 | ### Now re-predict the test set using the modified class probability 77 | ### estimates 78 | testing$qda2 <- predict(nbCal, testing[, "qda", drop = FALSE])$posterior[,1] 79 | testing$qda3 <- predict(lrCal, testing[, "qda", drop = FALSE], type = "response") 80 | 81 | 82 | ### Manipulate the data a bit for pretty plotting 83 | simulatedProbs <- testing[, c("class", "rf", "qda3")] 84 | names(simulatedProbs) <- c("TrueClass", "RandomForestProb", "QDACalibrated") 85 | simulatedProbs$RandomForestClass <- predict(rfFit, testing) 86 | 87 | calData2 <- calibration(class ~ qda + qda2 + qda3, data = testing) 88 | calData2$data$calibModelVar <- as.character(calData2$data$calibModelVar) 89 | calData2$data$calibModelVar <- ifelse(calData2$data$calibModelVar == "qda", 90 | "QDA", 91 | calData2$data$calibModelVar) 92 | calData2$data$calibModelVar <- ifelse(calData2$data$calibModelVar == "qda2", 93 | "Bayesian Calibration", 94 | calData2$data$calibModelVar) 95 | 96 | calData2$data$calibModelVar <- ifelse(calData2$data$calibModelVar == "qda3", 97 | "Sigmoidal Calibration", 98 | calData2$data$calibModelVar) 99 | 100 | calData2$data$calibModelVar <- factor(calData2$data$calibModelVar, 101 | levels = c("QDA", 102 | "Bayesian Calibration", 103 | "Sigmoidal Calibration")) 104 | 105 | xyplot(calData2, auto.key = list(columns = 1)) 106 | 107 | ### Recreate the model used in the over-fitting chapter 108 | 109 | library(caret) 110 | data(GermanCredit) 111 | 112 | ## First, remove near-zero variance predictors then get rid of a few predictors 113 | ## that duplicate values. For example, there are two possible values for the 114 | ## housing variable: "Rent", "Own" and "ForFree". So that we don't have linear 115 | ## dependencies, we get rid of one of the levels (e.g. 
"ForFree") 116 | 117 | GermanCredit <- GermanCredit[, -nearZeroVar(GermanCredit)] 118 | GermanCredit$CheckingAccountStatus.lt.0 <- NULL 119 | GermanCredit$SavingsAccountBonds.lt.100 <- NULL 120 | GermanCredit$EmploymentDuration.lt.1 <- NULL 121 | GermanCredit$EmploymentDuration.Unemployed <- NULL 122 | GermanCredit$Personal.Male.Married.Widowed <- NULL 123 | GermanCredit$Property.Unknown <- NULL 124 | GermanCredit$Housing.ForFree <- NULL 125 | 126 | ## Split the data into training (80%) and test sets (20%) 127 | set.seed(100) 128 | inTrain <- createDataPartition(GermanCredit$Class, p = .8)[[1]] 129 | GermanCreditTrain <- GermanCredit[ inTrain, ] 130 | GermanCreditTest <- GermanCredit[-inTrain, ] 131 | 132 | set.seed(1056) 133 | logisticReg <- train(Class ~ ., 134 | data = GermanCreditTrain, 135 | method = "glm", 136 | trControl = trainControl(method = "repeatedcv", 137 | repeats = 5)) 138 | logisticReg 139 | 140 | ### Predict the test set 141 | creditResults <- data.frame(obs = GermanCreditTest$Class) 142 | creditResults$prob <- predict(logisticReg, GermanCreditTest, type = "prob")[, "Bad"] 143 | creditResults$pred <- predict(logisticReg, GermanCreditTest) 144 | creditResults$Label <- ifelse(creditResults$obs == "Bad", 145 | "True Outcome: Bad Credit", 146 | "True Outcome: Good Credit") 147 | 148 | ### Plot the probability of bad credit 149 | histogram(~prob|Label, 150 | data = creditResults, 151 | layout = c(2, 1), 152 | nint = 20, 153 | xlab = "Probability of Bad Credit", 154 | type = "count") 155 | 156 | ### Calculate and plot the calibration curve 157 | creditCalib <- calibration(obs ~ prob, data = creditResults) 158 | xyplot(creditCalib) 159 | 160 | ### Create the confusion matrix from the test set. 
161 | confusionMatrix(data = creditResults$pred, 162 | reference = creditResults$obs) 163 | 164 | ### ROC curves: 165 | 166 | ### Like glm(), roc() treats the last level of the factor as the event 167 | ### of interest so we use relevel() to change the observed class data 168 | 169 | library(pROC) 170 | creditROC <- roc(relevel(creditResults$obs, "Good"), creditResults$prob) 171 | 172 | coords(creditROC, "all")[,1:3] 173 | 174 | auc(creditROC) 175 | ci.auc(creditROC) 176 | 177 | ### Note the x-axis is reversed 178 | plot(creditROC) 179 | 180 | ### Old-school: 181 | plot(creditROC, legacy.axes = TRUE) 182 | 183 | ### Lift charts 184 | 185 | creditLift <- lift(obs ~ prob, data = creditResults) 186 | xyplot(creditLift) 187 | 188 | 189 | ################################################################################ 190 | ### Session Information 191 | 192 | sessionInfo() 193 | 194 | q("no") 195 | -------------------------------------------------------------------------------- /inst/chapters/04_Over_Fitting.R: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | ### R code from Applied Predictive Modeling (2013) by Kuhn and Johnson. 3 | ### Copyright 2013 Kuhn and Johnson 4 | ### Web Page: http://www.appliedpredictivemodeling.com 5 | ### Contact: Max Kuhn (mxkuhn@gmail.com) 6 | ### 7 | ### Chapter 4: Over-Fitting and Model Tuning 8 | ### 9 | ### Required packages: caret, doMC (optional), kernlab 10 | ### 11 | ### Data used: 12 | ### 13 | ### Notes: 14 | ### 1) This code is provided without warranty. 15 | ### 16 | ### 2) This code should help the user reproduce the results in the 17 | ### text. There will be differences between this code and what is is 18 | ### the computing section. For example, the computing sections show 19 | ### how the source functions work (e.g. randomForest() or plsr()), 20 | ### which were not directly used when creating the book. 
Also, there may be 21 | ### syntax differences that occur over time as packages evolve. These files 22 | ### will reflect those changes. 23 | ### 24 | ### 3) In some cases, the calculations in the book were run in 25 | ### parallel. The sub-processes may reset the random number seed. 26 | ### Your results may slightly vary. 27 | ### 28 | ################################################################################ 29 | 30 | ################################################################################ 31 | ### Section 4.6 Choosing Final Tuning Parameters 32 | 33 | library(caret) 34 | data(GermanCredit) 35 | 36 | ## First, remove near-zero variance predictors then get rid of a few predictors 37 | ## that duplicate values. For example, there are two possible values for the 38 | ## housing variable: "Rent", "Own" and "ForFree". So that we don't have linear 39 | ## dependencies, we get rid of one of the levels (e.g. "ForFree") 40 | 41 | GermanCredit <- GermanCredit[, -nearZeroVar(GermanCredit)] 42 | GermanCredit$CheckingAccountStatus.lt.0 <- NULL 43 | GermanCredit$SavingsAccountBonds.lt.100 <- NULL 44 | GermanCredit$EmploymentDuration.lt.1 <- NULL 45 | GermanCredit$EmploymentDuration.Unemployed <- NULL 46 | GermanCredit$Personal.Male.Married.Widowed <- NULL 47 | GermanCredit$Property.Unknown <- NULL 48 | GermanCredit$Housing.ForFree <- NULL 49 | 50 | ## Split the data into training (80%) and test sets (20%) 51 | set.seed(100) 52 | inTrain <- createDataPartition(GermanCredit$Class, p = .8)[[1]] 53 | GermanCreditTrain <- GermanCredit[ inTrain, ] 54 | GermanCreditTest <- GermanCredit[-inTrain, ] 55 | 56 | ## The model fitting code shown in the computing section is fairly 57 | ## simplistic. For the text we estimate the tuning parameter grid 58 | ## up-front and pass it in explicitly. 
This generally is not needed, 59 | ## but was used here so that we could trim the cost values to a 60 | ## presentable range and to re-use later with different resampling 61 | ## methods. 62 | 63 | library(kernlab) 64 | set.seed(231) 65 | sigDist <- sigest(Class ~ ., data = GermanCreditTrain, frac = 1) 66 | svmTuneGrid <- data.frame(sigma = as.vector(sigDist)[1], C = 2^(-2:7)) 67 | 68 | ### Optional: parallel processing can be used via the 'do' packages, 69 | ### such as doMC, doMPI etc. We used doMC (not on Windows) to speed 70 | ### up the computations. 71 | 72 | ### WARNING: Be aware of how much memory is needed to parallel 73 | ### process. It can very quickly overwhelm the available hardware. We 74 | ### estimate the memory usage (VSIZE = total memory size) to be 75 | ### 2566M/core. 76 | 77 | library(doMC) 78 | registerDoMC(4) 79 | 80 | set.seed(1056) 81 | svmFit <- train(Class ~ ., 82 | data = GermanCreditTrain, 83 | method = "svmRadial", 84 | preProc = c("center", "scale"), 85 | tuneGrid = svmTuneGrid, 86 | trControl = trainControl(method = "repeatedcv", 87 | repeats = 5, 88 | classProbs = TRUE)) 89 | ## classProbs = TRUE was added since the text was written 90 | 91 | ## Print the results 92 | svmFit 93 | 94 | ## A line plot of the average performance. The 'scales' argument is actually an 95 | ## argument to xyplot that converts the x-axis to log-2 units. 96 | 97 | plot(svmFit, scales = list(x = list(log = 2))) 98 | 99 | ## Test set predictions 100 | 101 | predictedClasses <- predict(svmFit, GermanCreditTest) 102 | str(predictedClasses) 103 | 104 | ## Use the "type" option to get class probabilities 105 | 106 | predictedProbs <- predict(svmFit, newdata = GermanCreditTest, type = "prob") 107 | head(predictedProbs) 108 | 109 | 110 | ## Fit the same model using different resampling methods. The main syntax change 111 | ## is the control object. 
112 | 113 | set.seed(1056) 114 | svmFit10CV <- train(Class ~ ., 115 | data = GermanCreditTrain, 116 | method = "svmRadial", 117 | preProc = c("center", "scale"), 118 | tuneGrid = svmTuneGrid, 119 | trControl = trainControl(method = "cv", number = 10)) 120 | svmFit10CV 121 | 122 | set.seed(1056) 123 | svmFitLOO <- train(Class ~ ., 124 | data = GermanCreditTrain, 125 | method = "svmRadial", 126 | preProc = c("center", "scale"), 127 | tuneGrid = svmTuneGrid, 128 | trControl = trainControl(method = "LOOCV")) 129 | svmFitLOO 130 | 131 | set.seed(1056) 132 | svmFitLGO <- train(Class ~ ., 133 | data = GermanCreditTrain, 134 | method = "svmRadial", 135 | preProc = c("center", "scale"), 136 | tuneGrid = svmTuneGrid, 137 | trControl = trainControl(method = "LGOCV", 138 | number = 50, 139 | p = .8)) 140 | svmFitLGO 141 | 142 | set.seed(1056) 143 | svmFitBoot <- train(Class ~ ., 144 | data = GermanCreditTrain, 145 | method = "svmRadial", 146 | preProc = c("center", "scale"), 147 | tuneGrid = svmTuneGrid, 148 | trControl = trainControl(method = "boot", number = 50)) 149 | svmFitBoot 150 | 151 | set.seed(1056) 152 | svmFitBoot632 <- train(Class ~ ., 153 | data = GermanCreditTrain, 154 | method = "svmRadial", 155 | preProc = c("center", "scale"), 156 | tuneGrid = svmTuneGrid, 157 | trControl = trainControl(method = "boot632", 158 | number = 50)) 159 | svmFitBoot632 160 | 161 | ################################################################################ 162 | ### Section 4.8 Choosing Between Models 163 | 164 | set.seed(1056) 165 | glmProfile <- train(Class ~ ., 166 | data = GermanCreditTrain, 167 | method = "glm", 168 | trControl = trainControl(method = "repeatedcv", 169 | repeats = 5)) 170 | glmProfile 171 | 172 | resamp <- resamples(list(SVM = svmFit, Logistic = glmProfile)) 173 | summary(resamp) 174 | 175 | ## These results are slightly different from those shown in the text. 
176 | ## There are some differences in the train() function since the 177 | ## original results were produced. This is due to a difference in 178 | ## predictions from the ksvm() function when class probs are requested 179 | ## and when they are not. See, for example, 180 | ## https://stat.ethz.ch/pipermail/r-help/2013-November/363188.html 181 | 182 | modelDifferences <- diff(resamp) 183 | summary(modelDifferences) 184 | 185 | ## The actual paired t-test: 186 | modelDifferences$statistics$Accuracy 187 | 188 | ################################################################################ 189 | ### Session Information 190 | 191 | sessionInfo() 192 | 193 | q("no") 194 | 195 | 196 | 197 | -------------------------------------------------------------------------------- /inst/chapters/03_Data_Pre_Processing.R: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | ### R code from Applied Predictive Modeling (2013) by Kuhn and Johnson. 3 | ### Copyright 2013 Kuhn and Johnson 4 | ### Web Page: http://www.appliedpredictivemodeling.com 5 | ### Contact: Max Kuhn (mxkuhn@gmail.com) 6 | ### 7 | ### Chapter 3: Data Pre-Processing 8 | ### 9 | ### Required packages: AppliedPredictiveModeling, e1071, caret, corrplot 10 | ### 11 | ### Data used: The (unprocessed) cell segmentation data from the 12 | ### AppliedPredictiveModeling package. 13 | ### 14 | ### Notes: 15 | ### 1) This code is provided without warranty. 16 | ### 17 | ### 2) This code should help the user reproduce the results in the 18 | ### text. There will be differences between this code and what is is 19 | ### the computing section. For example, the computing sections show 20 | ### how the source functions work (e.g. randomForest() or plsr()), 21 | ### which were not directly used when creating the book. Also, there may be 22 | ### syntax differences that occur over time as packages evolve. 
###    These files will reflect those changes.
###
### 3) In some cases, the calculations in the book were run in
###    parallel. The sub-processes may reset the random number seed.
###    Your results may slightly vary.
###
################################################################################

################################################################################
### Section 3.1 Case Study: Cell Segmentation in High-Content Screening

library(AppliedPredictiveModeling)
data(segmentationOriginal)

## Retain the original training set
segTrain <- subset(segmentationOriginal, Case == "Train")

## Remove the first three columns (identifier columns)
segTrainX <- segTrain[, -(1:3)]
segTrainClass <- segTrain$Class

################################################################################
### Section 3.2 Data Transformations for Individual Predictors

## The column VarIntenCh3 measures the standard deviation of the intensity
## of the pixels in the actin filaments. A large max/min ratio suggests
## right skew:
max(segTrainX$VarIntenCh3) / min(segTrainX$VarIntenCh3)

library(e1071)
skewness(segTrainX$VarIntenCh3)

library(caret)

## Use caret's preProcess function to estimate Box-Cox transformations
## for skewness, then apply them to the training predictors.
segPP <- preProcess(segTrainX, method = "BoxCox")
segTrainTrans <- predict(segPP, segTrainX)

## Results for a single predictor
segPP$bc$VarIntenCh3

histogram(~segTrainX$VarIntenCh3,
          xlab = "Natural Units",
          type = "count")

histogram(~log(segTrainX$VarIntenCh3),
          xlab = "Log Units",
          ylab = " ",
          type = "count")

segPP$bc$PerimCh1

histogram(~segTrainX$PerimCh1,
          xlab = "Natural Units",
          type = "count")

histogram(~segTrainTrans$PerimCh1,
          xlab = "Transformed Data",
          ylab = " ",
          type = "count")

################################################################################
### Section 3.3 Data Transformations for Multiple Predictors

## R's prcomp is used to conduct PCA on two predictors
pr <- prcomp(~ AvgIntenCh1 + EntropyIntenCh1,
             data = segTrainTrans,
             scale. = TRUE)

transparentTheme(pchSize = .7, trans = .3)

xyplot(AvgIntenCh1 ~ EntropyIntenCh1,
       data = segTrainTrans,
       groups = segTrain$Class,
       xlab = "Channel 1 Fiber Width",
       ylab = "Intensity Entropy Channel 1",
       auto.key = list(columns = 2),
       type = c("p", "g"),
       main = "Original Data",
       aspect = 1)

xyplot(PC2 ~ PC1,
       data = as.data.frame(pr$x),
       groups = segTrain$Class,
       xlab = "Principal Component #1",
       ylab = "Principal Component #2",
       main = "Transformed",
       xlim = extendrange(pr$x),
       ylim = extendrange(pr$x),
       type = c("p", "g"),
       aspect = 1)

## Apply PCA to the entire set of predictors.

## There are a few predictors with only a single value, so we remove these
## first (since PCA uses variances, which would be zero for them).
isZV <- apply(segTrainX, 2, function(x) length(unique(x)) == 1)
segTrainX <- segTrainX[, !isZV]

segPP <- preProcess(segTrainX, c("BoxCox", "center", "scale"))
segTrainTrans <- predict(segPP, segTrainX)

segPCA <- prcomp(segTrainTrans, center = TRUE, scale. = TRUE)

## Plot a scatterplot matrix of the first three components
transparentTheme(pchSize = .8, trans = .3)

panelRange <- extendrange(segPCA$x[, 1:3])
splom(as.data.frame(segPCA$x[, 1:3]),
      groups = segTrainClass,
      type = c("p", "g"),
      as.table = TRUE,
      auto.key = list(columns = 2),
      prepanel.limits = function(x) panelRange)

## Format the rotation (loading) values for plotting
segRot <- as.data.frame(segPCA$rotation[, 1:3])

## Derive the imaging channel from the predictor-name suffix (Ch1-Ch4)
vars <- rownames(segPCA$rotation)
channel <- rep(NA, length(vars))
for (i in 1:4)
  channel[grepl(paste0("Ch", i, "$"), vars)] <- paste("Channel", i)

segRot$Channel <- channel
segRot <- segRot[complete.cases(segRot), ]
segRot$Channel <- factor(as.character(segRot$Channel))

## Plot a scatterplot matrix of the first three rotation variables

transparentTheme(pchSize = .8, trans = .7)
panelRange <- extendrange(segRot[, 1:3])
library(ellipse)

## Upper panel: draw three reference circles (radii .1, .2, .3) under the
## points so the magnitude of each loading pair is easy to judge.
upperp <- function(...)
{
  args <- list(...)
  refLine <- trellis.par.get("reference.line")
  for (radius in c(.1, .2, .3))
  {
    circ <- ellipse(diag(rep(1, 2)), t = radius)
    panel.xyplot(circ[, 1], circ[, 2],
                 type = "l",
                 lty = refLine$lty,
                 col = refLine$col,
                 lwd = refLine$lwd)
  }
  panel.xyplot(args$x, args$y, groups = args$groups, subscripts = args$subscripts)
}
splom(~segRot[, 1:3],
      groups = segRot$Channel,
      lower.panel = function(...){}, upper.panel = upperp,
      prepanel.limits = function(x) panelRange,
      auto.key = list(columns = 2))

################################################################################
### Section 3.5 Removing Variables

## To filter on correlations, we first get the correlation matrix for the
## predictor set
segCorr <- cor(segTrainTrans)

library(corrplot)
corrplot(segCorr, order = "hclust", tl.cex = .35)

## caret's findCorrelation function is used to identify columns to remove.
highCorr <- findCorrelation(segCorr, .75)

################################################################################
### Section 3.8 Computing (Creating Dummy Variables)

data(cars)

## Columns 14-18 are one-hot body-style indicators; collapse them into a
## single factor column.
type <- c("convertible", "coupe", "hatchback", "sedan", "wagon")
cars$Type <- factor(apply(cars[, 14:18], 1, function(x) type[which(x == 1)]))

## A small random subset for display purposes
carSubset <- cars[sample(1:nrow(cars), 20), c(1, 2, 19)]

head(carSubset)
levels(carSubset$Type)

simpleMod <- dummyVars(~Mileage + Type,
                       data = carSubset,
                       ## Remove the variable name from the
                       ## column name
                       levelsOnly = TRUE)
simpleMod

withInteraction <- dummyVars(~Mileage + Type + Mileage:Type,
                             data = carSubset,
                             levelsOnly = TRUE)
withInteraction
predict(withInteraction, head(carSubset))

################################################################################
### Session Information

sessionInfo()

q("no")

################################################################################
## File: inst/chapters/08_Regression_Trees.R
################################################################################
### R code from Applied Predictive Modeling (2013) by Kuhn and Johnson.
### Copyright 2013 Kuhn and Johnson
### Web Page: http://www.appliedpredictivemodeling.com
### Contact: Max Kuhn (mxkuhn@gmail.com)
###
### Chapter 8: Regression Trees and Rule-Based Models
###
### Required packages: AppliedPredictiveModeling, caret, Cubist, doMC (optional),
###                    gbm, lattice, party, partykit, randomForest, rpart, RWeka
###
### Data used: The solubility data from the AppliedPredictiveModeling package
###
### Notes:
### 1) This code is provided without warranty.
###
### 2) This code should help the user reproduce the results in the
###    text. There will be differences between this code and what is in
###    the computing section. For example, the computing sections show
###    how the source functions work (e.g. randomForest() or plsr()),
###    which were not directly used when creating the book. Also, there may be
###    syntax differences that occur over time as packages evolve. These files
###    will reflect those changes.
###
### 3) In some cases, the calculations in the book were run in
###    parallel. The sub-processes may reset the random number seed.
###    Your results may slightly vary.
###
################################################################################

################################################################################
### Load the data

library(AppliedPredictiveModeling)
data(solubility)

### Create a control function that will be used across models. We
### create the fold assignments explicitly instead of relying on the
### random number seed being set to identical values.

library(caret)
set.seed(100)
indx <- createFolds(solTrainY, returnTrain = TRUE)
ctrl <- trainControl(method = "cv", index = indx)

################################################################################
### Section 8.1 Basic Regression Trees

library(rpart)

### Fit two CART models to show the initial splitting process. rpart
### only uses formulas, so we put the predictors and outcome into
### a common data frame first.
trainData <- solTrainXtrans
trainData$y <- solTrainY

## A single-split "stump" and a depth-2 tree, for illustration
rpStump <- rpart(y ~ ., data = trainData,
                 control = rpart.control(maxdepth = 1))
rpSmall <- rpart(y ~ ., data = trainData,
                 control = rpart.control(maxdepth = 2))

### Tune the CART model over 25 complexity-parameter values
library(caret)

set.seed(100)
cartTune <- train(x = solTrainXtrans, y = solTrainY,
                  method = "rpart",
                  tuneLength = 25,
                  trControl = ctrl)
cartTune
## cartTune$finalModel

### Plot the tuning results (cp on a log10 axis)
plot(cartTune, scales = list(x = list(log = 10)))

### Use the partykit package to make some nice plots. First, convert
### the rpart objects to party objects.

# library(partykit)
#
# cartTree <- as.party(cartTune$finalModel)
# plot(cartTree)

### Get the variable importance. 'competes' is an argument that
### controls whether splits not used in the tree should be included
### in the importance calculations.
cartImp <- varImp(cartTune, scale = FALSE, competes = FALSE)
cartImp

### Save the test set results in a data frame
testResults <- data.frame(obs = solTestY,
                          CART = predict(cartTune, solTestXtrans))

### Tune the conditional inference tree
cGrid <- data.frame(mincriterion = sort(c(.95, seq(.75, .99, length = 2))))

set.seed(100)
ctreeTune <- train(x = solTrainXtrans, y = solTrainY,
                   method = "ctree",
                   tuneGrid = cGrid,
                   trControl = ctrl)
ctreeTune
plot(ctreeTune)

## ctreeTune$finalModel
plot(ctreeTune$finalModel)

testResults$cTree <- predict(ctreeTune, solTestXtrans)

################################################################################
### Section 8.2 Regression Model Trees and 8.3 Rule-Based Models

### Tune the model tree. Using method = "M5" actually tunes over the
### tree- and rule-based versions of the model. M = 10 is also passed
### in to make sure that there are larger terminal nodes for the
### regression models.

set.seed(100)
m5Tune <- train(x = solTrainXtrans, y = solTrainY,
                method = "M5",
                trControl = ctrl,
                control = Weka_control(M = 10))
m5Tune

plot(m5Tune)

## m5Tune$finalModel

## plot(m5Tune$finalModel)

### Show the rule-based model too
ruleFit <- M5Rules(y ~ ., data = trainData, control = Weka_control(M = 10))
ruleFit

################################################################################
### Section 8.4 Bagged Trees

### Optional: parallel processing can be used via the 'do' packages,
### such as doMC, doMPI etc. We used doMC (not on Windows) to speed
### up the computations.

### WARNING: Be aware of how much memory is needed to parallel
### process. It can very quickly overwhelm the available hardware. The
### estimate of the median memory usage (VSIZE = total memory size)
### was 9706M for a core, but could range up to 9706M. This becomes
### severe when parallelizing randomForest() and (especially) calls
### to cforest().

### WARNING 2: The RWeka package does not work well with some forms of
### parallel processing, such as multicore (i.e. doMC).
library(doMC)
registerDoMC(5)

set.seed(100)
treebagTune <- train(x = solTrainXtrans, y = solTrainY,
                     method = "treebag",
                     nbagg = 50,
                     trControl = ctrl)

treebagTune

################################################################################
### Section 8.5 Random Forests

## Candidate values for the number of randomly selected predictors per split
mtryGrid <- data.frame(mtry = floor(seq(10, ncol(solTrainXtrans), length = 10)))

### Tune the model using cross-validation
set.seed(100)
rfTune <- train(x = solTrainXtrans, y = solTrainY,
                method = "rf",
                tuneGrid = mtryGrid,
                ntree = 1000,
                importance = TRUE,
                trControl = ctrl)
rfTune

plot(rfTune)

rfImp <- varImp(rfTune, scale = FALSE)
rfImp

### Tune the model using the OOB estimates
ctrlOOB <- trainControl(method = "oob")
set.seed(100)
rfTuneOOB <- train(x = solTrainXtrans, y = solTrainY,
                   method = "rf",
                   tuneGrid = mtryGrid,
                   ntree = 1000,
                   importance = TRUE,
                   trControl = ctrlOOB)
rfTuneOOB

plot(rfTuneOOB)

### Tune the conditional inference forests
set.seed(100)
condrfTune <- train(x = solTrainXtrans, y = solTrainY,
                    method = "cforest",
                    tuneGrid = mtryGrid,
                    controls = cforest_unbiased(ntree = 1000),
                    trControl = ctrl)
condrfTune

plot(condrfTune)

set.seed(100)
condrfTuneOOB <- train(x = solTrainXtrans, y = solTrainY,
                       method = "cforest",
                       tuneGrid = mtryGrid,
                       controls = cforest_unbiased(ntree = 1000),
                       trControl = trainControl(method = "oob"))
condrfTuneOOB

plot(condrfTuneOOB)

################################################################################
### Section 8.6 Boosting

gbmGrid <- expand.grid(interaction.depth = seq(1, 7, by = 2),
                       n.trees = seq(100, 1000, by = 50),
                       shrinkage = c(0.01, 0.1))
set.seed(100)
gbmTune <- train(x = solTrainXtrans, y = solTrainY,
                 method = "gbm",
                 tuneGrid = gbmGrid,
                 trControl = ctrl,
                 verbose = FALSE)
gbmTune

plot(gbmTune, auto.key = list(columns = 4, lines = TRUE))

gbmImp <- varImp(gbmTune, scale = FALSE)
gbmImp

################################################################################
### Section 8.7 Cubist

cbGrid <- expand.grid(committees = c(1:10, 20, 50, 75, 100),
                      neighbors = c(0, 1, 5, 9))

set.seed(100)
cubistTune <- train(solTrainXtrans, solTrainY,
                    "cubist",
                    tuneGrid = cbGrid,
                    trControl = ctrl)
cubistTune

plot(cubistTune, auto.key = list(columns = 4, lines = TRUE))

cbImp <- varImp(cubistTune, scale = FALSE)
cbImp

################################################################################
### Session Information

sessionInfo()

q("no")

################################################################################
## File: inst/chapters/18_Importance.R
################################################################################
### R code from Applied Predictive Modeling (2013) by Kuhn and Johnson.
### Copyright 2013 Kuhn and Johnson
### Web Page: http://www.appliedpredictivemodeling.com
### Contact: Max Kuhn (mxkuhn@gmail.com)
###
### Chapter 18: Measuring Predictor Importance
###
### Required packages: AppliedPredictiveModeling, caret, CORElearn, corrplot,
###                    pROC, minerva
###
### Data used: The solubility data from the AppliedPredictiveModeling
###            package, the segmentation data in the caret package and the
###            grant data (created using "CreateGrantData.R" in the same
###            directory as this file).
###
### Notes:
### 1) This code is provided without warranty.
###
### 2) This code should help the user reproduce the results in the
###    text. There will be differences between this code and what is in
###    the computing section. For example, the computing sections show
###    how the source functions work (e.g. randomForest() or plsr()),
###    which were not directly used when creating the book. Also, there may be
###    syntax differences that occur over time as packages evolve. These files
###    will reflect those changes.
###
### 3) In some cases, the calculations in the book were run in
###    parallel. The sub-processes may reset the random number seed.
###    Your results may slightly vary.
###
################################################################################

################################################################################
### Section 18.1 Numeric Outcomes

## Load the solubility data
library(AppliedPredictiveModeling)
data(solubility)

trainData <- solTrainXtrans
trainData$y <- solTrainY

## Keep the continuous predictors (those NOT named "FP...") and append the
## outcome to the data frame
SolContPred <- solTrainXtrans[, !grepl("FP", names(solTrainXtrans))]
numSolPred <- ncol(SolContPred)
SolContPred$Sol <- solTrainY

## Get the LOESS smoother and the summary measure
library(caret)
smoother <- filterVarImp(x = SolContPred[, -ncol(SolContPred)],
                         y = solTrainY,
                         nonpara = TRUE)
smoother$Predictor <- rownames(smoother)
names(smoother)[1] <- "Smoother"

## Calculate the correlation matrices and keep the columns with the
## correlations between the predictors and the outcome
correlations <- cor(SolContPred)[-(numSolPred + 1), (numSolPred + 1)]
rankCorrelations <- cor(SolContPred, method = "spearman")[-(numSolPred + 1), (numSolPred + 1)]
corrs <- data.frame(Predictor = names(SolContPred)[1:numSolPred],
                    Correlation = correlations,
                    RankCorrelation = rankCorrelations)

## The maximal information coefficient (MIC) values can be obtained from the
## minerva package:
library(minerva)
MIC <- mine(x = SolContPred[, 1:numSolPred], y = solTrainY)$MIC
MIC <- data.frame(Predictor = rownames(MIC),
                  MIC = MIC[, 1])

## The Relief values for regression can be computed using the CORElearn
## package:
library(CORElearn)
ReliefF <- attrEval(Sol ~ ., data = SolContPred,
                    estimator = "RReliefFequalK")
ReliefF <- data.frame(Predictor = names(ReliefF),
                      Relief = ReliefF)

## Combine them all together for a plot
contDescrScores <- merge(smoother, corrs)
contDescrScores <- merge(contDescrScores, MIC)
contDescrScores <- merge(contDescrScores, ReliefF)

rownames(contDescrScores) <- contDescrScores$Predictor

contDescrScores

## For the scatterplot matrix, use absolute correlations and flag the
## surface-area predictors as their own group.
contDescrSplomData <- contDescrScores
contDescrSplomData$Correlation <- abs(contDescrSplomData$Correlation)
contDescrSplomData$RankCorrelation <- abs(contDescrSplomData$RankCorrelation)
contDescrSplomData$Group <- "Other"
contDescrSplomData$Group[grepl("Surface", contDescrSplomData$Predictor)] <- "SA"

featurePlot(solTrainXtrans[, c("NumCarbon", "SurfaceArea2")],
            solTrainY,
            between = list(x = 1),
            type = c("g", "p", "smooth"),
            df = 3,
            aspect = 1,
            labels = c("", "Solubility"))

splom(~contDescrSplomData[, c(3, 4, 2, 5)],
      groups = contDescrSplomData$Group,
      varnames = c("Correlation", "Rank\nCorrelation", "LOESS", "MIC"))
## Now look at the categorical (i.e. binary) predictors
SolCatPred <- solTrainXtrans[, grepl("FP", names(solTrainXtrans))]
SolCatPred$Sol <- solTrainY
numSolCatPred <- ncol(SolCatPred) - 1

## t-test of the outcome for each binary predictor
tests <- apply(SolCatPred[, 1:numSolCatPred], 2,
               function(x, y)
               {
                 tStats <- t.test(y ~ x)[c("statistic", "p.value", "estimate")]
                 unlist(tStats)
               },
               y = solTrainY)
## The results are a matrix with predictors in columns. We reverse this
tests <- as.data.frame(t(tests))
names(tests) <- c("t.Statistic", "t.test_p.value", "mean0", "mean1")
tests$difference <- tests$mean1 - tests$mean0
tests

## Create a volcano plot
xyplot(-log10(t.test_p.value) ~ difference,
       data = tests,
       xlab = "Mean With Structure - Mean Without Structure",
       ylab = "-log(p-Value)",
       type = "p")

################################################################################
### Section 18.2 Categorical Outcomes

## Load the segmentation data
data(segmentationData)
segTrain <- subset(segmentationData, Case == "Train")
segTrain$Case <- segTrain$Cell <- NULL

segTest <- subset(segmentationData, Case != "Train")
segTest$Case <- segTest$Cell <- NULL

## Compute the areas under the ROC curve
aucVals <- filterVarImp(x = segTrain[, -1], y = segTrain$Class)
aucVals$Predictor <- rownames(aucVals)

## Calculate the t-tests as before but with x and y switched
segTests <- apply(segTrain[, -1], 2,
                  function(x, y)
                  {
                    tStats <- t.test(x ~ y)[c("statistic", "p.value", "estimate")]
                    unlist(tStats)
                  },
                  y = segTrain$Class)
segTests <- as.data.frame(t(segTests))
names(segTests) <- c("t.Statistic", "t.test_p.value", "mean0", "mean1")
segTests$Predictor <- rownames(segTests)

## Fit a random forest model and get the importance scores
library(randomForest)
set.seed(791)
rfImp <- randomForest(Class ~ ., data = segTrain,
                      ntree = 2000,
                      importance = TRUE)
rfValues <- data.frame(RF = importance(rfImp)[, "MeanDecreaseGini"],
                       Predictor = rownames(importance(rfImp)))

## Now compute the Relief scores
set.seed(791)
ReliefValues <- attrEval(Class ~ ., data = segTrain,
                         estimator = "ReliefFequalK", ReliefIterations = 50)
ReliefValues <- data.frame(Relief = ReliefValues,
                           Predictor = names(ReliefValues))

## and the MIC statistics
set.seed(791)
segMIC <- mine(x = segTrain[, -1],
               ## Pass the outcome as 0/1
               y = ifelse(segTrain$Class == "PS", 1, 0))$MIC
segMIC <- data.frame(Predictor = rownames(segMIC),
                     MIC = segMIC[, 1])

## Merge every score into one ranking table, keyed on Predictor
rankings <- merge(segMIC, ReliefValues)
rankings <- merge(rankings, rfValues)
rankings <- merge(rankings, segTests)
rankings <- merge(rankings, aucVals)
rankings

## Label each predictor with its imaging channel (default: Channel 1)
rankings$channel <- "Channel 1"
rankings$channel[grepl("Ch2$", rankings$Predictor)] <- "Channel 2"
rankings$channel[grepl("Ch3$", rankings$Predictor)] <- "Channel 3"
rankings$channel[grepl("Ch4$", rankings$Predictor)] <- "Channel 4"
rankings$t.Statistic <- abs(rankings$t.Statistic)

splom(~rankings[, c("PS", "t.Statistic", "RF", "Relief", "MIC")],
      groups = rankings$channel,
      varnames = c("ROC\nAUC", "Abs\nt-Stat", "Random\nForest", "Relief", "MIC"),
      auto.key = list(columns = 2))

## Load the grant data. A script to create and save these data is contained
## in the same directory as this file.
load("grantData.RData")

dataSubset <- training[pre2008, c("Sponsor62B", "ContractValueBandUnk", "RFCD240302")]

## Compute several association statistics for one binary predictor:
##   - the odds ratio and p-value from Fisher's exact test
##   - the gain ratio from CORElearn::attrEval()
##
## Args:
##   x: a binary predictor (factor or 0/1 vector)
##   y: the class outcome
## Returns: a named numeric vector c(OR, P, Gain).
tableCalcs <- function(x, y)
{
  tab <- table(x, y)
  fet <- fisher.test(tab)
  out <- c(OR = fet$estimate,
           P = fet$p.value,
           Gain = attrEval(y ~ x, estimator = "GainRatio"))
  ## Return the vector explicitly; the original relied on the (invisible)
  ## value of the assignment above being the function's result.
  out
}

## lapply() is used to execute the function on each column
tableResults <- lapply(dataSubset, tableCalcs, y = training[pre2008, "Class"])

## The results come back as a list of vectors, and "rbind" is used to join
## them together as rows of a table
tableResults <- do.call("rbind", tableResults)
tableResults

## The permuted Relief scores can be computed using a function from the
## AppliedPredictiveModeling package.
permuted <- permuteRelief(x = training[pre2008, c("Sponsor62B", "Day", "NumCI")],
                          y = training[pre2008, "Class"],
                          nperm = 500,
                          ### the remaining options are passed to attrEval()
                          estimator = "ReliefFequalK",
                          ReliefIterations = 50)

## The original Relief scores:
permuted$observed

## The number of standard deviations away from the permuted mean:
permuted$standardized

## The distributions of the scores if there were no relationship between the
## predictors and outcomes
histogram(~value|Predictor,
          data = permuted$permutations,
          xlim = extendrange(permuted$permutations$value),
          xlab = "Relief Score")

################################################################################
### Session Information

sessionInfo()

q("no")

################################################################################
## File: inst/chapters/11_Class_Performance.Rout
-------------------------------------------------------------------------------- 1 | 2 | R version 3.0.1 (2013-05-16) -- "Good Sport" 3 | Copyright (C) 2013 The R Foundation for Statistical Computing 4 | Platform: x86_64-apple-darwin10.8.0 (64-bit) 5 | 6 | R is free software and comes with ABSOLUTELY NO WARRANTY. 7 | You are welcome to redistribute it under certain conditions. 8 | Type 'license()' or 'licence()' for distribution details. 9 | 10 | Natural language support but running in an English locale 11 | 12 | R is a collaborative project with many contributors. 13 | Type 'contributors()' for more information and 14 | 'citation()' on how to cite R or R packages in publications. 15 | 16 | Type 'demo()' for some demos, 'help()' for on-line help, or 17 | 'help.start()' for an HTML browser interface to help. 18 | Type 'q()' to quit R. 19 | 20 | > ################################################################################ 21 | > ### R code from Applied Predictive Modeling (2013) by Kuhn and Johnson. 22 | > ### Copyright 2013 Kuhn and Johnson 23 | > ### Web Page: http://www.appliedpredictivemodeling.com 24 | > ### Contact: Max Kuhn (mxkuhn@gmail.com) 25 | > ### 26 | > ### Chapter 11: Measuring Performance in Classification Models 27 | > ### 28 | > ### Required packages: AppliedPredictiveModeling, caret, MASS, randomForest, 29 | > ### pROC, klaR 30 | > ### 31 | > ### Data used: The solubility from the AppliedPredictiveModeling package 32 | > ### 33 | > ### Notes: 34 | > ### 1) This code is provided without warranty. 35 | > ### 36 | > ### 2) This code should help the user reproduce the results in the 37 | > ### text. There will be differences between this code and what is is 38 | > ### the computing section. For example, the computing sections show 39 | > ### how the source functions work (e.g. randomForest() or plsr()), 40 | > ### which were not directly used when creating the book. 
Also, there may be 41 | > ### syntax differences that occur over time as packages evolve. These files 42 | > ### will reflect those changes. 43 | > ### 44 | > ### 3) In some cases, the calculations in the book were run in 45 | > ### parallel. The sub-processes may reset the random number seed. 46 | > ### Your results may slightly vary. 47 | > ### 48 | > ################################################################################ 49 | > 50 | > ################################################################################ 51 | > ### Section 11.1 Class Predictions 52 | > 53 | > library(AppliedPredictiveModeling) 54 | > 55 | > ### Simulate some two class data with two predictors 56 | > set.seed(975) 57 | > training <- quadBoundaryFunc(500) 58 | > testing <- quadBoundaryFunc(1000) 59 | > testing$class2 <- ifelse(testing$class == "Class1", 1, 0) 60 | > testing$ID <- 1:nrow(testing) 61 | > 62 | > ### Fit models 63 | > library(MASS) 64 | > qdaFit <- qda(class ~ X1 + X2, data = training) 65 | > library(randomForest) 66 | randomForest 4.6-7 67 | Type rfNews() to see new features/changes/bug fixes. 68 | > rfFit <- randomForest(class ~ X1 + X2, data = training, ntree = 2000) 69 | > 70 | > ### Predict the test set 71 | > testing$qda <- predict(qdaFit, testing)$posterior[,1] 72 | > testing$rf <- predict(rfFit, testing, type = "prob")[,1] 73 | > 74 | > 75 | > ### Generate the calibration analysis 76 | > library(caret) 77 | Loading required package: lattice 78 | Loading required package: ggplot2 79 | > calData1 <- calibration(class ~ qda + rf, data = testing, cuts = 10) 80 | > 81 | > ### Plot the curve 82 | > xyplot(calData1, auto.key = list(columns = 2)) 83 | > 84 | > ### To calibrate the data, treat the probabilities as inputs into the 85 | > ### model 86 | > 87 | > trainProbs <- training 88 | > trainProbs$qda <- predict(qdaFit)$posterior[,1] 89 | > 90 | > ### These models take the probabilities as inputs and, based on the 91 | > ### true class, re-calibrate them. 
92 | > library(klaR) 93 | > nbCal <- NaiveBayes(class ~ qda, data = trainProbs, usekernel = TRUE) 94 | > 95 | > ### We use relevel() here because glm() models the probability of the 96 | > ### second factor level. 97 | > lrCal <- glm(relevel(class, "Class2") ~ qda, data = trainProbs, family = binomial) 98 | > 99 | > ### Now re-predict the test set using the modified class probability 100 | > ### estimates 101 | > testing$qda2 <- predict(nbCal, testing[, "qda", drop = FALSE])$posterior[,1] 102 | > testing$qda3 <- predict(lrCal, testing[, "qda", drop = FALSE], type = "response") 103 | > 104 | > 105 | > ### Manipulate the data a bit for pretty plotting 106 | > simulatedProbs <- testing[, c("class", "rf", "qda3")] 107 | > names(simulatedProbs) <- c("TrueClass", "RandomForestProb", "QDACalibrated") 108 | > simulatedProbs$RandomForestClass <- predict(rfFit, testing) 109 | > 110 | > calData2 <- calibration(class ~ qda + qda2 + qda3, data = testing) 111 | > calData2$data$calibModelVar <- as.character(calData2$data$calibModelVar) 112 | > calData2$data$calibModelVar <- ifelse(calData2$data$calibModelVar == "qda", 113 | + "QDA", 114 | + calData2$data$calibModelVar) 115 | > calData2$data$calibModelVar <- ifelse(calData2$data$calibModelVar == "qda2", 116 | + "Bayesian Calibration", 117 | + calData2$data$calibModelVar) 118 | > 119 | > calData2$data$calibModelVar <- ifelse(calData2$data$calibModelVar == "qda3", 120 | + "Sigmoidal Calibration", 121 | + calData2$data$calibModelVar) 122 | > 123 | > calData2$data$calibModelVar <- factor(calData2$data$calibModelVar, 124 | + levels = c("QDA", 125 | + "Bayesian Calibration", 126 | + "Sigmoidal Calibration")) 127 | > 128 | > xyplot(calData2, auto.key = list(columns = 1)) 129 | > 130 | > ### Recreate the model used in the over-fitting chapter 131 | > 132 | > library(caret) 133 | > data(GermanCredit) 134 | > 135 | > ## First, remove near-zero variance predictors then get rid of a few predictors 136 | > ## that duplicate values. 
For example, there are two possible values for the 137 | > ## housing variable: "Rent", "Own" and "ForFree". So that we don't have linear 138 | > ## dependencies, we get rid of one of the levels (e.g. "ForFree") 139 | > 140 | > GermanCredit <- GermanCredit[, -nearZeroVar(GermanCredit)] 141 | > GermanCredit$CheckingAccountStatus.lt.0 <- NULL 142 | > GermanCredit$SavingsAccountBonds.lt.100 <- NULL 143 | > GermanCredit$EmploymentDuration.lt.1 <- NULL 144 | > GermanCredit$EmploymentDuration.Unemployed <- NULL 145 | > GermanCredit$Personal.Male.Married.Widowed <- NULL 146 | > GermanCredit$Property.Unknown <- NULL 147 | > GermanCredit$Housing.ForFree <- NULL 148 | > 149 | > ## Split the data into training (80%) and test sets (20%) 150 | > set.seed(100) 151 | > inTrain <- createDataPartition(GermanCredit$Class, p = .8)[[1]] 152 | > GermanCreditTrain <- GermanCredit[ inTrain, ] 153 | > GermanCreditTest <- GermanCredit[-inTrain, ] 154 | > 155 | > set.seed(1056) 156 | > logisticReg <- train(Class ~ ., 157 | + data = GermanCreditTrain, 158 | + method = "glm", 159 | + trControl = trainControl(method = "repeatedcv", 160 | + repeats = 5)) 161 | Loading required package: class 162 | > logisticReg 163 | Generalized Linear Model 164 | 165 | 800 samples 166 | 41 predictors 167 | 2 classes: 'Bad', 'Good' 168 | 169 | No pre-processing 170 | Resampling: Cross-Validated (10 fold, repeated 5 times) 171 | 172 | Summary of sample sizes: 720, 720, 720, 720, 720, 720, ... 
173 | 174 | Resampling results 175 | 176 | Accuracy Kappa Accuracy SD Kappa SD 177 | 0.749 0.365 0.0516 0.122 178 | 179 | 180 | > 181 | > ### Predict the test set 182 | > creditResults <- data.frame(obs = GermanCreditTest$Class) 183 | > creditResults$prob <- predict(logisticReg, GermanCreditTest, type = "prob")[, "Bad"] 184 | > creditResults$pred <- predict(logisticReg, GermanCreditTest) 185 | > creditResults$Label <- ifelse(creditResults$obs == "Bad", 186 | + "True Outcome: Bad Credit", 187 | + "True Outcome: Good Credit") 188 | > 189 | > ### Plot the probability of bad credit 190 | > histogram(~prob|Label, 191 | + data = creditResults, 192 | + layout = c(2, 1), 193 | + nint = 20, 194 | + xlab = "Probability of Bad Credit", 195 | + type = "count") 196 | > 197 | > ### Calculate and plot the calibration curve 198 | > creditCalib <- calibration(obs ~ prob, data = creditResults) 199 | > xyplot(creditCalib) 200 | > 201 | > ### Create the confusion matrix from the test set. 202 | > confusionMatrix(data = creditResults$pred, 203 | + reference = creditResults$obs) 204 | Confusion Matrix and Statistics 205 | 206 | Reference 207 | Prediction Bad Good 208 | Bad 24 10 209 | Good 36 130 210 | 211 | Accuracy : 0.77 212 | 95% CI : (0.7054, 0.8264) 213 | No Information Rate : 0.7 214 | P-Value [Acc > NIR] : 0.0168694 215 | 216 | Kappa : 0.375 217 | Mcnemar's Test P-Value : 0.0002278 218 | 219 | Sensitivity : 0.4000 220 | Specificity : 0.9286 221 | Pos Pred Value : 0.7059 222 | Neg Pred Value : 0.7831 223 | Prevalence : 0.3000 224 | Detection Rate : 0.1200 225 | Detection Prevalence : 0.1700 226 | Balanced Accuracy : 0.6643 227 | 228 | 'Positive' Class : Bad 229 | 230 | > 231 | > ### ROC curves: 232 | > 233 | > ### Like glm(), roc() treats the last level of the factor as the event 234 | > ### of interest so we use relevel() to change the observed class data 235 | > 236 | > library(pROC) 237 | Loading required package: plyr 238 | Type 'citation("pROC")' for a citation. 
239 | 240 | Attaching package: ‘pROC’ 241 | 242 | The following object is masked from ‘package:stats’: 243 | 244 | cov, smooth, var 245 | 246 | > creditROC <- roc(relevel(creditResults$obs, "Good"), creditResults$prob) 247 | > 248 | > coords(creditROC, "all")[,1:3] 249 | all all all 250 | threshold -Inf 0.006199758 0.009708574 251 | specificity 0 0.007142857 0.014285714 252 | sensitivity 1 1.000000000 1.000000000 253 | > 254 | > auc(creditROC) 255 | Area under the curve: 0.775 256 | > ci.auc(creditROC) 257 | 95% CI: 0.7032-0.8468 (DeLong) 258 | > 259 | > ### Note the x-axis is reversed 260 | > plot(creditROC) 261 | 262 | Call: 263 | roc.default(response = relevel(creditResults$obs, "Good"), predictor = creditResults$prob) 264 | 265 | Data: creditResults$prob in 140 controls (relevel(creditResults$obs, "Good") Good) < 60 cases (relevel(creditResults$obs, "Good") Bad). 266 | Area under the curve: 0.775 267 | > 268 | > ### Old-school: 269 | > plot(creditROC, legacy.axes = TRUE) 270 | 271 | Call: 272 | roc.default(response = relevel(creditResults$obs, "Good"), predictor = creditResults$prob) 273 | 274 | Data: creditResults$prob in 140 controls (relevel(creditResults$obs, "Good") Good) < 60 cases (relevel(creditResults$obs, "Good") Bad). 
275 | Area under the curve: 0.775 276 | > 277 | > ### Lift charts 278 | > 279 | > creditLift <- lift(obs ~ prob, data = creditResults) 280 | > xyplot(creditLift) 281 | > 282 | > 283 | > ################################################################################ 284 | > ### Session Information 285 | > 286 | > sessionInfo() 287 | R version 3.0.1 (2013-05-16) 288 | Platform: x86_64-apple-darwin10.8.0 (64-bit) 289 | 290 | locale: 291 | [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8 292 | 293 | attached base packages: 294 | [1] stats graphics grDevices utils datasets methods base 295 | 296 | other attached packages: 297 | [1] pROC_1.5.4 plyr_1.8 298 | [3] e1071_1.6-1 class_7.3-7 299 | [5] klaR_0.6-7 caret_6.0-22 300 | [7] ggplot2_0.9.3.1 lattice_0.20-15 301 | [9] randomForest_4.6-7 MASS_7.3-26 302 | [11] AppliedPredictiveModeling_1.1-5 303 | 304 | loaded via a namespace (and not attached): 305 | [1] car_2.0-16 codetools_0.2-8 colorspace_1.2-1 compiler_3.0.1 306 | [5] CORElearn_0.9.41 dichromat_2.0-0 digest_0.6.3 foreach_1.4.0 307 | [9] grid_3.0.1 gtable_0.1.2 iterators_1.0.6 labeling_0.1 308 | [13] munsell_0.4 proto_0.3-10 RColorBrewer_1.0-5 reshape2_1.2.2 309 | [17] scales_0.2.3 stringr_0.6.2 tools_3.0.1 310 | > 311 | > q("no") 312 | > proc.time() 313 | user system elapsed 314 | 11.120 0.526 11.698 315 | -------------------------------------------------------------------------------- /inst/chapters/10_Case_Study_Concrete.R: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | ### R code from Applied Predictive Modeling (2013) by Kuhn and Johnson. 
### Copyright 2013 Kuhn and Johnson
### Web Page: http://www.appliedpredictivemodeling.com
### Contact: Max Kuhn (mxkuhn@gmail.com)
###
### Chapter 10: Case Study: Compressive Strength of Concrete Mixtures
###
### Required packages: AppliedPredictiveModeling, caret, Cubist, doMC (optional),
###                    earth, elasticnet, gbm, ipred, lattice, nnet, party, pls,
###                    randomForest, rpart, RWeka
###
### Data used: The concrete from the AppliedPredictiveModeling package
###
### Notes:
### 1) This code is provided without warranty.
###
### 2) This code should help the user reproduce the results in the
### text. There will be differences between this code and what is in
### the computing section. For example, the computing sections show
### how the source functions work (e.g. randomForest() or plsr()),
### which were not directly used when creating the book. Also, there may be
### syntax differences that occur over time as packages evolve. These files
### will reflect those changes.
###
### 3) In some cases, the calculations in the book were run in
### parallel. The sub-processes may reset the random number seed.
### Your results may slightly vary.
###
################################################################################

################################################################################
### Load the data and plot the data

library(AppliedPredictiveModeling)
data(concrete)

library(caret)
library(plyr)

## Scatterplots of each predictor (columns 1-8) against the outcome
featurePlot(concrete[, -9], concrete$CompressiveStrength,
            between = list(x = 1, y = 1),
            type = c("g", "p", "smooth"))


################################################################################
### Section 10.1 Model Building Strategy
### There are replicated mixtures, so take the average per mixture

## NOTE(review): `mixtures` is presumably loaded alongside `concrete` by
## data(concrete) above -- confirm against the package's data documentation.
averaged <- ddply(mixtures,
                  .(Cement, BlastFurnaceSlag, FlyAsh, Water,
                    Superplasticizer, CoarseAggregate,
                    FineAggregate, Age),
                  function(x) c(CompressiveStrength =
                                mean(x$CompressiveStrength)))

### Split the data and create a control object for train()

set.seed(975)
inTrain <- createDataPartition(averaged$CompressiveStrength, p = 3/4)[[1]]
training <- averaged[ inTrain,]
testing  <- averaged[-inTrain,]

## 10-fold cross-validation, repeated 5 times, shared by all models below
ctrl <- trainControl(method = "repeatedcv", repeats = 5, number = 10)

### Create a model formula that can be used repeatedly
### (all two-way interactions plus quadratic terms for each predictor)

modForm <- paste("CompressiveStrength ~ (.)^2 + I(Cement^2) + I(BlastFurnaceSlag^2) +",
                 "I(FlyAsh^2) + I(Water^2) + I(Superplasticizer^2) +",
                 "I(CoarseAggregate^2) + I(FineAggregate^2) + I(Age^2)")
modForm <- as.formula(modForm)

### Fit the various models

### Optional: parallel processing can be used via the 'do' packages,
### such as doMC, doMPI etc. We used doMC (not on Windows) to speed
### up the computations.

### WARNING: Be aware of how much memory is needed to parallel
### process. It can very quickly overwhelm the available hardware. The
### estimate of the median memory usage (VSIZE = total memory size)
### was 2800M for a core although the M5 calculations require about
### 3700M without parallel processing.

### WARNING 2: The RWeka package does not work well with some forms of
### parallel processing, such as multicore (i.e. doMC).

library(doMC)
registerDoMC(14)

## The same seed is set before each call so that every model sees
## identical resampling indices and the fits are comparable.
set.seed(669)
lmFit <- train(modForm, data = training,
               method = "lm",
               trControl = ctrl)

set.seed(669)
plsFit <- train(modForm, data = training,
                method = "pls",
                preProc = c("center", "scale"),
                tuneLength = 15,
                trControl = ctrl)

lassoGrid <- expand.grid(lambda = c(0, .001, .01, .1),
                         fraction = seq(0.05, 1, length = 20))
set.seed(669)
lassoFit <- train(modForm, data = training,
                  method = "enet",
                  preProc = c("center", "scale"),
                  tuneGrid = lassoGrid,
                  trControl = ctrl)

## The nonlinear/tree models below use the raw predictors (no expanded
## formula) since they model interactions and curvature themselves.
set.seed(669)
earthFit <- train(CompressiveStrength ~ ., data = training,
                  method = "earth",
                  tuneGrid = expand.grid(degree = 1,
                                         nprune = 2:25),
                  trControl = ctrl)

set.seed(669)
svmRFit <- train(CompressiveStrength ~ ., data = training,
                 method = "svmRadial",
                 tuneLength = 15,
                 preProc = c("center", "scale"),
                 trControl = ctrl)


nnetGrid <- expand.grid(decay = c(0.001, .01, .1),
                        size = seq(1, 27, by = 2),
                        bag = FALSE)
set.seed(669)
nnetFit <- train(CompressiveStrength ~ .,
                 data = training,
                 method = "avNNet",
                 tuneGrid = nnetGrid,
                 preProc = c("center", "scale"),
                 linout = TRUE,
                 trace = FALSE,
                 maxit = 1000,
                 allowParallel = FALSE,
                 trControl = ctrl)

set.seed(669)
rpartFit <- train(CompressiveStrength ~ .,
                  data = training,
                  method = "rpart",
                  tuneLength = 30,
                  trControl = ctrl)

set.seed(669)
treebagFit <- train(CompressiveStrength ~ .,
                    data = training,
                    method = "treebag",
                    trControl = ctrl)

set.seed(669)
ctreeFit <- train(CompressiveStrength ~ .,
                  data = training,
                  method = "ctree",
                  tuneLength = 10,
                  trControl = ctrl)

set.seed(669)
## FIX: randomForest() takes `ntree`, not `ntrees`. The misspelled
## argument was silently absorbed by `...` and ignored, so only the
## default number of trees was grown despite the stated intent of 1000.
rfFit <- train(CompressiveStrength ~ .,
               data = training,
               method = "rf",
               tuneLength = 10,
               ntree = 1000,
               importance = TRUE,
               trControl = ctrl)


## Boosted trees: tune interaction depth, number of trees, and shrinkage
gbmGrid <- expand.grid(interaction.depth = seq(1, 7, by = 2),
                       n.trees = seq(100, 1000, by = 50),
                       shrinkage = c(0.01, 0.1))
set.seed(669)
gbmFit <- train(CompressiveStrength ~ .,
                data = training,
                method = "gbm",
                tuneGrid = gbmGrid,
                verbose = FALSE,
                trControl = ctrl)


## Cubist: tune the number of committees and of nearest neighbors
cbGrid <- expand.grid(committees = c(1, 5, 10, 50, 75, 100),
                      neighbors = c(0, 1, 3, 5, 7, 9))
set.seed(669)
cbFit <- train(CompressiveStrength ~ .,
               data = training,
               method = "cubist",
               tuneGrid = cbGrid,
               trControl = ctrl)

### Turn off the parallel processing to use RWeka.
registerDoSEQ()


set.seed(669)
mtFit <- train(CompressiveStrength ~ .,
               data = training,
               method = "M5",
               trControl = ctrl)

################################################################################
### Section 10.2 Model Performance

### Collect the resampling statistics across all the models

## FIX: the "PLS" list name was previously split across two source lines
## inside the string literal, labeling the PLS model with an embedded
## newline and leading spaces in every summary and plot.
rs <- resamples(list("Linear Reg" = lmFit,
                     "PLS" = plsFit,
                     "Elastic Net" = lassoFit,
                     MARS = earthFit,
                     SVM = svmRFit,
                     "Neural Networks" = nnetFit,
                     CART = rpartFit,
                     "Cond Inf Tree" = ctreeFit,
                     "Bagged Tree" = treebagFit,
                     "Boosted Tree" = gbmFit,
                     "Random Forest" = rfFit,
                     Cubist = cbFit))

#parallelPlot(rs)
#parallelPlot(rs, metric = "Rsquared")

### Get the test set results across several models

nnetPred <- predict(nnetFit, testing)
gbmPred <- predict(gbmFit, testing)
cbPred <- predict(cbFit, testing)

## One row of RMSE/R^2 per model, sorted best (lowest RMSE) first
testResults <- rbind(postResample(nnetPred, testing$CompressiveStrength),
                     postResample(gbmPred, testing$CompressiveStrength),
                     postResample(cbPred, testing$CompressiveStrength))
testResults <- as.data.frame(testResults)
testResults$Model <- c("Neural Networks", "Boosted Tree", "Cubist")
testResults <- testResults[order(testResults$RMSE),]

################################################################################
### Section 10.3 Optimizing Compressive Strength

library(proxy)

### Create a function to maximize compressive strength* while keeping
### the predictor values as mixtures. Water (in x[7]) is used as the
### 'slack variable'.

### * We are actually minimizing the negative compressive strength

## Objective function for optim(): x holds the six non-water mixture
## proportions; Water is derived as the slack so proportions sum to 1.
## Returns the negated model prediction (so minimization maximizes
## strength), or a huge penalty (10^38) for infeasible mixtures.
## NOTE(review): the `limit` argument is never used in the body --
## confirm whether it was meant to cap a predictor or can be dropped.
modelPrediction <- function(x, mod, limit = 2500)
{
  ## Penalize any proportion outside [0, 1]
  if(x[1] < 0 | x[1] > 1) return(10^38)
  if(x[2] < 0 | x[2] > 1) return(10^38)
  if(x[3] < 0 | x[3] > 1) return(10^38)
  if(x[4] < 0 | x[4] > 1) return(10^38)
  if(x[5] < 0 | x[5] > 1) return(10^38)
  if(x[6] < 0 | x[6] > 1) return(10^38)

  ## Water is the slack variable: whatever proportion remains
  x <- c(x, 1 - sum(x))

  ## Require at least 5% water
  if(x[7] < 0.05) return(10^38)

  ## Build a one-row data frame in the column order the model expects
  tmp <- as.data.frame(t(x))
  names(tmp) <- c('Cement','BlastFurnaceSlag','FlyAsh',
                  'Superplasticizer','CoarseAggregate',
                  'FineAggregate', 'Water')
  tmp$Age <- 28
  ## Negate so optim()'s minimization maximizes predicted strength
  -predict(mod, tmp)
}

### Get mixtures at 28 days
subTrain <- subset(training, Age == 28)

### Center and scale the data to use dissimilarity sampling
pp1 <- preProcess(subTrain[, -(8:9)], c("center", "scale"))
scaledTrain <- predict(pp1, subTrain[, 1:7])

### Randomly select a few mixtures as a starting pool

set.seed(91)
startMixture <- sample(1:nrow(subTrain), 1)
starters <- scaledTrain[startMixture, 1:7]
pool <- scaledTrain
## Add 14 maximally dissimilar mixtures to the random starting point
index <- maxDissim(starters, pool, 14)
startPoints <- c(startMixture, index)

## Drop Water (column 4); it is recomputed as the slack variable
starters <- subTrain[startPoints,1:7]
startingValues <- starters[, -4]

### For each starting mixture, optimize the Cubist model using
### a simplex search routine

cbResults <- startingValues
cbResults$Water <- NA
cbResults$Prediction <- NA

for(i in 1:nrow(cbResults))
{
  results <- optim(unlist(cbResults[i,1:6]),
                   modelPrediction,
                   method = "Nelder-Mead",
                   control=list(maxit=5000),
                   mod = cbFit)
  ## Un-negate the objective to recover the predicted strength
  cbResults$Prediction[i] <- -results$value
  cbResults[i,1:6] <- results$par
}
cbResults$Water <- 1 - apply(cbResults[,1:6], 1, sum)
## Keep feasible solutions only, then the top 3 by predicted strength
cbResults <- subset(cbResults, Prediction > 0 & Water > .02)
cbResults <- cbResults[order(-cbResults$Prediction),][1:3,]
cbResults$Model <- "Cubist"

### Do the same for the neural network model

nnetResults <- startingValues
nnetResults$Water <- NA
nnetResults$Prediction <- NA

for(i in 1:nrow(nnetResults))
{
  ## NOTE(review): `[i, 1:6,]` has a stray trailing comma (an empty,
  ## missing `drop` argument) -- harmless here, but should read [i, 1:6]
  results <- optim(unlist(nnetResults[i, 1:6,]),
                   modelPrediction,
                   method = "Nelder-Mead",
                   control=list(maxit=5000),
                   mod = nnetFit)
  nnetResults$Prediction[i] <- -results$value
  nnetResults[i,1:6] <- results$par
}
nnetResults$Water <- 1 - apply(nnetResults[,1:6], 1, sum)
nnetResults <- subset(nnetResults, Prediction > 0 & Water > .02)
nnetResults <- nnetResults[order(-nnetResults$Prediction),][1:3,]
nnetResults$Model <- "NNet"

### Convert the predicted mixtures to PCA space and plot

pp2 <- preProcess(subTrain[, 1:7], "pca")
pca1 <- predict(pp2, subTrain[, 1:7])
pca1$Data <- "Training Set"
pca1$Data[startPoints] <- "Starting Values"
pca3 <- predict(pp2, cbResults[, names(subTrain[, 1:7])])
pca3$Data <- "Cubist"
pca4 <- predict(pp2, nnetResults[, names(subTrain[, 1:7])])
pca4$Data <- "Neural Network"

pcaData <- rbind(pca1, pca3, pca4)
pcaData$Data <- factor(pcaData$Data,
                       levels = c("Training Set","Starting Values",
                                  "Cubist","Neural Network"))

lim <- extendrange(pcaData[, 1:2])

xyplot(PC2 ~ PC1,
       data = pcaData,
       groups = Data,
       auto.key = list(columns = 2),
       xlim = lim,
       ylim = lim,
       type = c("g", "p"))


################################################################################
### Session Information

sessionInfo()

q("no")
--------------------------------------------------------------------------------
/inst/chapters/03_Data_Pre_Processing.Rout:
-------------------------------------------------------------------------------- 1 | 2 | R version 3.0.1 (2013-05-16) -- "Good Sport" 3 | Copyright (C) 2013 The R Foundation for Statistical Computing 4 | Platform: x86_64-apple-darwin10.8.0 (64-bit) 5 | 6 | R is free software and comes with ABSOLUTELY NO WARRANTY. 7 | You are welcome to redistribute it under certain conditions. 8 | Type 'license()' or 'licence()' for distribution details. 9 | 10 | Natural language support but running in an English locale 11 | 12 | R is a collaborative project with many contributors. 13 | Type 'contributors()' for more information and 14 | 'citation()' on how to cite R or R packages in publications. 15 | 16 | Type 'demo()' for some demos, 'help()' for on-line help, or 17 | 'help.start()' for an HTML browser interface to help. 18 | Type 'q()' to quit R. 19 | 20 | > ################################################################################ 21 | > ### R code from Applied Predictive Modeling (2013) by Kuhn and Johnson. 22 | > ### Copyright 2013 Kuhn and Johnson 23 | > ### Web Page: http://www.appliedpredictivemodeling.com 24 | > ### Contact: Max Kuhn (mxkuhn@gmail.com) 25 | > ### 26 | > ### Chapter 3: Data Pre-Processing 27 | > ### 28 | > ### Required packages: AppliedPredictiveModeling, e1071, caret, corrplot 29 | > ### 30 | > ### Data used: The (unprocessed) cell segmentation data from the 31 | > ### AppliedPredictiveModeling package. 32 | > ### 33 | > ### Notes: 34 | > ### 1) This code is provided without warranty. 35 | > ### 36 | > ### 2) This code should help the user reproduce the results in the 37 | > ### text. There will be differences between this code and what is is 38 | > ### the computing section. For example, the computing sections show 39 | > ### how the source functions work (e.g. randomForest() or plsr()), 40 | > ### which were not directly used when creating the book. Also, there may be 41 | > ### syntax differences that occur over time as packages evolve. 
These files 42 | > ### will reflect those changes. 43 | > ### 44 | > ### 3) In some cases, the calculations in the book were run in 45 | > ### parallel. The sub-processes may reset the random number seed. 46 | > ### Your results may slightly vary. 47 | > ### 48 | > ################################################################################ 49 | > 50 | > ################################################################################ 51 | > ### Section 3.1 Case Study: Cell Segmentation in High-Content Screening 52 | > 53 | > library(AppliedPredictiveModeling) 54 | > data(segmentationOriginal) 55 | > 56 | > ## Retain the original training set 57 | > segTrain <- subset(segmentationOriginal, Case == "Train") 58 | > 59 | > ## Remove the first three columns (identifier columns) 60 | > segTrainX <- segTrain[, -(1:3)] 61 | > segTrainClass <- segTrain$Class 62 | > 63 | > ################################################################################ 64 | > ### Section 3.2 Data Transformations for Individual Predictors 65 | > 66 | > ## The column VarIntenCh3 measures the standard deviation of the intensity 67 | > ## of the pixels in the actin filaments 68 | > 69 | > max(segTrainX$VarIntenCh3)/min(segTrainX$VarIntenCh3) 70 | [1] 870.8872 71 | > 72 | > library(e1071) 73 | Loading required package: class 74 | > skewness(segTrainX$VarIntenCh3) 75 | [1] 2.391624 76 | > 77 | > library(caret) 78 | Loading required package: lattice 79 | Loading required package: ggplot2 80 | > 81 | > ## Use caret's preProcess function to transform for skewness 82 | > segPP <- preProcess(segTrainX, method = "BoxCox") 83 | > 84 | > ## Apply the transformations 85 | > segTrainTrans <- predict(segPP, segTrainX) 86 | > 87 | > ## Results for a single predictor 88 | > segPP$bc$VarIntenCh3 89 | Box-Cox Transformation 90 | 91 | 1009 data points used to estimate Lambda 92 | 93 | Input data summary: 94 | Min. 1st Qu. Median Mean 3rd Qu. Max. 
95 | 0.8693 37.0600 68.1300 101.7000 125.0000 757.0000 96 | 97 | Largest/Smallest: 871 98 | Sample Skewness: 2.39 99 | 100 | Estimated Lambda: 0.1 101 | With fudge factor, Lambda = 0 will be used for transformations 102 | 103 | > 104 | > histogram(~segTrainX$VarIntenCh3, 105 | + xlab = "Natural Units", 106 | + type = "count") 107 | > 108 | > histogram(~log(segTrainX$VarIntenCh3), 109 | + xlab = "Log Units", 110 | + ylab = " ", 111 | + type = "count") 112 | > 113 | > segPP$bc$PerimCh1 114 | Box-Cox Transformation 115 | 116 | 1009 data points used to estimate Lambda 117 | 118 | Input data summary: 119 | Min. 1st Qu. Median Mean 3rd Qu. Max. 120 | 47.74 64.37 79.02 91.61 103.20 459.80 121 | 122 | Largest/Smallest: 9.63 123 | Sample Skewness: 2.59 124 | 125 | Estimated Lambda: -1.1 126 | 127 | > 128 | > histogram(~segTrainX$PerimCh1, 129 | + xlab = "Natural Units", 130 | + type = "count") 131 | > 132 | > histogram(~segTrainTrans$PerimCh1, 133 | + xlab = "Transformed Data", 134 | + ylab = " ", 135 | + type = "count") 136 | > 137 | > ################################################################################ 138 | > ### Section 3.3 Data Transformations for Multiple Predictors 139 | > 140 | > ## R's prcomp is used to conduct PCA 141 | > pr <- prcomp(~ AvgIntenCh1 + EntropyIntenCh1, 142 | + data = segTrainTrans, 143 | + scale. 
= TRUE) 144 | > 145 | > transparentTheme(pchSize = .7, trans = .3) 146 | > 147 | > xyplot(AvgIntenCh1 ~ EntropyIntenCh1, 148 | + data = segTrainTrans, 149 | + groups = segTrain$Class, 150 | + xlab = "Channel 1 Fiber Width", 151 | + ylab = "Intensity Entropy Channel 1", 152 | + auto.key = list(columns = 2), 153 | + type = c("p", "g"), 154 | + main = "Original Data", 155 | + aspect = 1) 156 | > 157 | > xyplot(PC2 ~ PC1, 158 | + data = as.data.frame(pr$x), 159 | + groups = segTrain$Class, 160 | + xlab = "Principal Component #1", 161 | + ylab = "Principal Component #2", 162 | + main = "Transformed", 163 | + xlim = extendrange(pr$x), 164 | + ylim = extendrange(pr$x), 165 | + type = c("p", "g"), 166 | + aspect = 1) 167 | > 168 | > 169 | > ## Apply PCA to the entire set of predictors. 170 | > 171 | > ## There are a few predictors with only a single value, so we remove these first 172 | > ## (since PCA uses variances, which would be zero) 173 | > 174 | > isZV <- apply(segTrainX, 2, function(x) length(unique(x)) == 1) 175 | > segTrainX <- segTrainX[, !isZV] 176 | > 177 | > segPP <- preProcess(segTrainX, c("BoxCox", "center", "scale")) 178 | > segTrainTrans <- predict(segPP, segTrainX) 179 | > 180 | > segPCA <- prcomp(segTrainTrans, center = TRUE, scale. 
= TRUE) 181 | > 182 | > ## Plot a scatterplot matrix of the first three components 183 | > transparentTheme(pchSize = .8, trans = .3) 184 | > 185 | > panelRange <- extendrange(segPCA$x[, 1:3]) 186 | > splom(as.data.frame(segPCA$x[, 1:3]), 187 | + groups = segTrainClass, 188 | + type = c("p", "g"), 189 | + as.table = TRUE, 190 | + auto.key = list(columns = 2), 191 | + prepanel.limits = function(x) panelRange) 192 | > 193 | > ## Format the rotation values for plotting 194 | > segRot <- as.data.frame(segPCA$rotation[, 1:3]) 195 | > 196 | > ## Derive the channel variable 197 | > vars <- rownames(segPCA$rotation) 198 | > channel <- rep(NA, length(vars)) 199 | > channel[grepl("Ch1$", vars)] <- "Channel 1" 200 | > channel[grepl("Ch2$", vars)] <- "Channel 2" 201 | > channel[grepl("Ch3$", vars)] <- "Channel 3" 202 | > channel[grepl("Ch4$", vars)] <- "Channel 4" 203 | > 204 | > segRot$Channel <- channel 205 | > segRot <- segRot[complete.cases(segRot),] 206 | > segRot$Channel <- factor(as.character(segRot$Channel)) 207 | > 208 | > ## Plot a scatterplot matrix of the first three rotation variables 209 | > 210 | > transparentTheme(pchSize = .8, trans = .7) 211 | > panelRange <- extendrange(segRot[, 1:3]) 212 | > library(ellipse) 213 | > upperp <- function(...) 214 | + { 215 | + args <- list(...) 
216 | + circ1 <- ellipse(diag(rep(1, 2)), t = .1) 217 | + panel.xyplot(circ1[,1], circ1[,2], 218 | + type = "l", 219 | + lty = trellis.par.get("reference.line")$lty, 220 | + col = trellis.par.get("reference.line")$col, 221 | + lwd = trellis.par.get("reference.line")$lwd) 222 | + circ2 <- ellipse(diag(rep(1, 2)), t = .2) 223 | + panel.xyplot(circ2[,1], circ2[,2], 224 | + type = "l", 225 | + lty = trellis.par.get("reference.line")$lty, 226 | + col = trellis.par.get("reference.line")$col, 227 | + lwd = trellis.par.get("reference.line")$lwd) 228 | + circ3 <- ellipse(diag(rep(1, 2)), t = .3) 229 | + panel.xyplot(circ3[,1], circ3[,2], 230 | + type = "l", 231 | + lty = trellis.par.get("reference.line")$lty, 232 | + col = trellis.par.get("reference.line")$col, 233 | + lwd = trellis.par.get("reference.line")$lwd) 234 | + panel.xyplot(args$x, args$y, groups = args$groups, subscripts = args$subscripts) 235 | + } 236 | > splom(~segRot[, 1:3], 237 | + groups = segRot$Channel, 238 | + lower.panel = function(...){}, upper.panel = upperp, 239 | + prepanel.limits = function(x) panelRange, 240 | + auto.key = list(columns = 2)) 241 | > 242 | > ################################################################################ 243 | > ### Section 3.5 Removing Variables 244 | > 245 | > ## To filter on correlations, we first get the correlation matrix for the 246 | > ## predictor set 247 | > 248 | > segCorr <- cor(segTrainTrans) 249 | > 250 | > library(corrplot) 251 | > corrplot(segCorr, order = "hclust", tl.cex = .35) 252 | > 253 | > ## caret's findCorrelation function is used to identify columns to remove. 
254 | > highCorr <- findCorrelation(segCorr, .75) 255 | > 256 | > ################################################################################ 257 | > ### Section 3.8 Computing (Creating Dummy Variables) 258 | > 259 | > data(cars) 260 | > type <- c("convertible", "coupe", "hatchback", "sedan", "wagon") 261 | > cars$Type <- factor(apply(cars[, 14:18], 1, function(x) type[which(x == 1)])) 262 | > 263 | > carSubset <- cars[sample(1:nrow(cars), 20), c(1, 2, 19)] 264 | > 265 | > head(carSubset) 266 | Price Mileage Type 267 | 415 51154.05 2202 sedan 268 | 503 14116.92 12878 sedan 269 | 484 18620.87 25516 sedan 270 | 642 19423.17 25557 sedan 271 | 337 11391.21 21421 hatchback 272 | 121 20538.09 15066 sedan 273 | > levels(carSubset$Type) 274 | [1] "convertible" "coupe" "hatchback" "sedan" "wagon" 275 | > 276 | > simpleMod <- dummyVars(~Mileage + Type, 277 | + data = carSubset, 278 | + ## Remove the variable name from the 279 | + ## column name 280 | + levelsOnly = TRUE) 281 | > simpleMod 282 | Dummy Variable Object 283 | 284 | Formula: ~Mileage + Type 285 | 2 variables, 1 factors 286 | Factor variable names will be removed 287 | A less than full rank encoding is used 288 | > 289 | > withInteraction <- dummyVars(~Mileage + Type + Mileage:Type, 290 | + data = carSubset, 291 | + levelsOnly = TRUE) 292 | > withInteraction 293 | Dummy Variable Object 294 | 295 | Formula: ~Mileage + Type + Mileage:Type 296 | 2 variables, 1 factors 297 | Factor variable names will be removed 298 | A less than full rank encoding is used 299 | > predict(withInteraction, head(carSubset)) 300 | Mileage convertible coupe hatchback sedan wagon Mileage:convertible 301 | 415 2202 0 0 0 1 0 0 302 | 503 12878 0 0 0 1 0 0 303 | 484 25516 0 0 0 1 0 0 304 | 642 25557 0 0 0 1 0 0 305 | 337 21421 0 0 1 0 0 0 306 | 121 15066 0 0 0 1 0 0 307 | Mileage:coupe Mileage:hatchback Mileage:sedan Mileage:wagon 308 | 415 0 0 2202 0 309 | 503 0 0 12878 0 310 | 484 0 0 25516 0 311 | 642 0 0 25557 0 312 | 337 0 21421 0 0 
313 | 121 0 0 15066 0 314 | > 315 | > 316 | > 317 | > ################################################################################ 318 | > ### Session Information 319 | > 320 | > sessionInfo() 321 | R version 3.0.1 (2013-05-16) 322 | Platform: x86_64-apple-darwin10.8.0 (64-bit) 323 | 324 | locale: 325 | [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8 326 | 327 | attached base packages: 328 | [1] stats graphics grDevices utils datasets methods base 329 | 330 | other attached packages: 331 | [1] corrplot_0.71 ellipse_0.3-8 332 | [3] MASS_7.3-26 caret_6.0-22 333 | [5] ggplot2_0.9.3.1 lattice_0.20-15 334 | [7] e1071_1.6-1 class_7.3-7 335 | [9] AppliedPredictiveModeling_1.1-5 336 | 337 | loaded via a namespace (and not attached): 338 | [1] car_2.0-17 codetools_0.2-8 colorspace_1.2-2 CORElearn_0.9.41 339 | [5] dichromat_2.0-0 digest_0.6.3 foreach_1.4.0 grid_3.0.1 340 | [9] gtable_0.1.2 iterators_1.0.6 labeling_0.1 munsell_0.4 341 | [13] plyr_1.8 proto_0.3-10 RColorBrewer_1.0-5 reshape2_1.2.2 342 | [17] scales_0.2.3 stringr_0.6.2 tools_3.0.1 343 | > 344 | > q("no") 345 | > proc.time() 346 | user system elapsed 347 | 5.791 0.147 6.146 348 | -------------------------------------------------------------------------------- /inst/chapters/17_Job_Scheduling.R: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | ### R code from Applied Predictive Modeling (2013) by Kuhn and Johnson. 
### Copyright 2013 Kuhn and Johnson
### Web Page: http://www.appliedpredictivemodeling.com
### Contact: Max Kuhn (mxkuhn@gmail.com)
###
### Chapter 17: Case Study: Job Scheduling
###
### Required packages: AppliedPredictiveModeling, C50, caret, doMC (optional),
###                    earth, Hmisc, ipred, tabplot, kernlab, lattice, MASS,
###                    mda, nnet, pls, randomForest, rpart, sparseLDA
###
### Data used: The HPC job scheduling data in the AppliedPredictiveModeling
###            package.
###
### Notes:
### 1) This code is provided without warranty.
###
### 2) This code should help the user reproduce the results in the
### text. There will be differences between this code and what is in
### the computing section. For example, the computing sections show
### how the source functions work (e.g. randomForest() or plsr()),
### which were not directly used when creating the book. Also, there may be
### syntax differences that occur over time as packages evolve. These files
### will reflect those changes.
###
### 3) In some cases, the calculations in the book were run in
### parallel. The sub-processes may reset the random number seed.
### Your results may slightly vary.
###
################################################################################

library(AppliedPredictiveModeling)
data(schedulingData)

### Collect the predictor names: every column except the outcome
predictors <- setdiff(names(schedulingData), "Class")

### A few summaries and plots of the data
library(Hmisc)
describe(schedulingData)

library(tabplot)
tableplot(schedulingData[, c("Class", predictors)])

mosaicplot(table(schedulingData$Protocol,
                 schedulingData$Class),
           main = "")

library(lattice)
xyplot(Compounds ~ InputFields | Protocol,
       data = schedulingData,
       scales = list(x = list(log = 10), y = list(log = 10)),
       groups = Class,
       xlab = "Input Fields",
       auto.key = list(columns = 4),
       aspect = 1,
       as.table = TRUE)


################################################################################
### Section 17.1 Data Splitting and Model Strategy

## Stratified 80/20 split on the outcome

library(caret)
set.seed(1104)
inTrain <- createDataPartition(schedulingData$Class, p = .8, list = FALSE)

### There are a lot of zeros and the distribution is skewed. We add
### one so that we can log transform the data
schedulingData$NumPending <- schedulingData$NumPending + 1

trainData <- schedulingData[ inTrain, ]
testData  <- schedulingData[-inTrain, ]

### A main-effects-only model formula that is reused repeatedly below.
### A second formula with nonlinear (interaction) effects is created next.
modForm <- as.formula(Class ~ Protocol + log10(Compounds) +
                        log10(InputFields) + log10(Iterations) +
                        log10(NumPending) + Hour + Day)

### Create an expanded set of predictors with interactions.

### Two-way interaction version of the main-effects formula
modForm2 <- as.formula(Class ~ (Protocol + log10(Compounds) +
                                  log10(InputFields) + log10(Iterations) +
                                  log10(NumPending) + Hour + Day)^2)


### Some of these terms will not be estimable. For example, if there
### are no data points where a particular protocol was run on a
### particular day, the full interaction cannot be computed. We use
### model.matrix() to create the whole set of predictor columns, then
### remove those that are zero variance

expandedTrain <- model.matrix(modForm2, data = trainData)
expandedTest  <- model.matrix(modForm2, data = testData)
expandedTrain <- as.data.frame(expandedTrain)
expandedTest  <- as.data.frame(expandedTest)

### Some models have issues when there is a zero variance predictor
### within the data of a particular class, so we used caret's
### checkConditionalX() function to find the offending columns and
### remove them

zv <- checkConditionalX(expandedTrain, trainData$Class)

### Keep the expanded set to use for models where we must manually add
### more complex terms (such as logistic regression)

expandedTrain <- expandedTrain[, -zv]
expandedTest  <- expandedTest[, -zv]

### Cost matrix used by rpart/C5.0. Off-diagonal errors cost 1 except for
### badly under-predicted long jobs. Rows/columns are labeled with the class
### levels; entry [4, 1] corresponds to the pred "VF"/obs "L" rule in cost()
### below (assumes the package's level order VF, F, M, L -- see levels()).
costMatrix <- ifelse(diag(4) == 1, 0, 1)
costMatrix[4, 1] <- 10
costMatrix[3, 1] <- 5
costMatrix[4, 2] <- 5
costMatrix[3, 2] <- 5
rownames(costMatrix) <- colnames(costMatrix) <- levels(trainData$Class)

### Average misclassification cost of a set of predictions, mirroring the
### penalties in costMatrix. Missing predictions are dropped; returns NA
### when every prediction is missing.
cost <- function(pred, obs)
{
  isNA <- is.na(pred)
  if(!all(isNA))
  {
    pred <- pred[!isNA]
    obs  <- obs[!isNA]

    cost <- ifelse(pred == obs, 0, 1)
    ## Bug fix: the first rule previously tested (pred == "VF" & obs == "L")
    ## but assigned into the transposed mask (pred == "L" & obs == "VF"), so
    ## the 10-point penalty was applied to the wrong cells. The condition and
    ## the assignment index now use the same mask, consistent with costMatrix.
    if(any(pred == "VF" & obs == "L")) cost[pred == "VF" & obs == "L"] <- 10
    if(any(pred == "F"  & obs == "L")) cost[pred == "F"  & obs == "L"] <- 5
    if(any(pred == "F"  & obs == "M")) cost[pred == "F"  & obs == "M"] <- 5
    if(any(pred == "VF" & obs == "M")) cost[pred == "VF" & obs == "M"] <- 5
    out <- mean(cost)
  } else out <- NA
  out
}

### Summary function in the form required by caret's train(): the standard
### accuracy/Kappa statistics plus the average cost defined above
costSummary <- function (data, lev = NULL, model = NULL)
{
  if (is.character(data$obs)) data$obs <- factor(data$obs, levels = lev)
  c(postResample(data[, "pred"], data[, "obs"]),
    Cost = cost(data[, "pred"], data[, "obs"]))
}

### Create a control object for the models: five repeats of 10-fold CV,
### scored with the cost-aware summary function
ctrl <- trainControl(method = "repeatedcv",
                     repeats = 5,
                     summaryFunction = costSummary)

### Optional: parallel processing can be used via the 'do' packages,
### such as doMC, doMPI etc. We used doMC (not on Windows) to speed
### up the computations.

### WARNING: Be aware of how much memory is needed to parallel
### process. It can very quickly overwhelm the available hardware. The
### estimate of the median memory usage (VSIZE = total memory size)
### was 3300-4100M per core although some calculations require as
### much as 3400M without parallel processing.

library(doMC)
registerDoMC(14)

### Fit the CART model with and without costs

set.seed(857)
rpFit <- train(x = trainData[, predictors],
               y = trainData$Class,
               method = "rpart",
               metric = "Cost",
               maximize = FALSE,
               tuneLength = 20,
               trControl = ctrl)
rpFit

set.seed(857)
rpFitCost <- train(x = trainData[, predictors],
                   y = trainData$Class,
                   method = "rpart",
                   metric = "Cost",
                   maximize = FALSE,
                   tuneLength = 20,
                   parms = list(loss = costMatrix),
                   trControl = ctrl)
rpFitCost

### Linear discriminant analysis on the expanded (interaction) predictors
set.seed(857)
ldaFit <- train(x = expandedTrain,
                y = trainData$Class,
                method = "lda",
                metric = "Cost",
                maximize = FALSE,
                trControl = ctrl)
ldaFit

### Sparse LDA over a grid of retained-variable counts and ridge penalties
sldaGrid <- expand.grid(NumVars = seq(2, 112, by = 5),
                        lambda = c(0, 0.01, .1, 1, 10))
set.seed(857)
sldaFit <- train(x = expandedTrain,
                 y = trainData$Class,
                 method = "sparseLDA",
                 tuneGrid = sldaGrid,
                 preProc = c("center", "scale"),
                 metric = "Cost",
                 maximize = FALSE,
                 trControl = ctrl)
sldaFit

### Single-layer neural network tuned over weight decay and hidden-unit count
set.seed(857)
nnetGrid <- expand.grid(decay = c(0, 0.001, 0.01, .1, .5),
                        size = (1:10)*2 - 1)
nnetFit <- train(modForm,
                 data = trainData,
                 method = "nnet",
                 metric = "Cost",
                 maximize = FALSE,
                 tuneGrid = nnetGrid,
                 trace = FALSE,
                 MaxNWts = 2000,
                 maxit = 1000,
                 preProc = c("center", "scale"),
                 trControl = ctrl)
nnetFit

### Partial least squares discriminant analysis on the expanded predictors
set.seed(857)
plsFit <- train(x = expandedTrain,
                y = trainData$Class,
                method = "pls",
                metric = "Cost",
                maximize = FALSE,
                tuneLength = 100,
                preProc = c("center", "scale"),
                trControl = ctrl)
plsFit

### Flexible discriminant analysis (MARS basis)
set.seed(857)
fdaFit <- train(modForm, data = trainData,
                method = "fda",
                metric = "Cost",
                maximize = FALSE,
                tuneLength = 25,
                trControl = ctrl)
fdaFit

### Random forests, with and without class weights
set.seed(857)
rfFit <- train(x = trainData[, predictors],
               y = trainData$Class,
               method = "rf",
               metric = "Cost",
               maximize = FALSE,
               tuneLength = 10,
               ntree = 2000,
               importance = TRUE,
               trControl = ctrl)
rfFit

set.seed(857)
rfFitCost <- train(x = trainData[, predictors],
                   y = trainData$Class,
                   method = "rf",
                   metric = "Cost",
                   maximize = FALSE,
                   tuneLength = 10,
                   ntree = 2000,
                   classwt = c(VF = 1, F = 1, M = 5, L = 10),
                   importance = TRUE,
                   trControl = ctrl)
rfFitCost

### C5.0, with and without the cost matrix
c5Grid <- expand.grid(trials = c(1, (1:10)*10),
                      model = "tree",
                      winnow = c(TRUE, FALSE))
set.seed(857)
c50Fit <- train(x = trainData[, predictors],
                y = trainData$Class,
                method = "C5.0",
                metric = "Cost",
                maximize = FALSE,
                tuneGrid = c5Grid,
                trControl = ctrl)
c50Fit

set.seed(857)
c50Cost <- train(x = trainData[, predictors],
                 y = trainData$Class,
                 method = "C5.0",
                 metric = "Cost",
                 maximize = FALSE,
                 costs = costMatrix,
                 tuneGrid = c5Grid,
                 trControl = ctrl)
c50Cost

### Bagged trees
set.seed(857)
bagFit <- train(x = trainData[, predictors],
                y = trainData$Class,
                method = "treebag",
                metric = "Cost",
                maximize = FALSE,
                nbagg = 50,
                trControl = ctrl)
bagFit

### Use the caret bag() function to bag the cost-sensitive CART model

## Fit function for bag(): an unpruned, cost-sensitive CART model
rpCost <- function(x, y)
{
  costMatrix <- ifelse(diag(4) == 1, 0, 1)
  costMatrix[4, 1] <- 10
  costMatrix[3, 1] <- 5
  costMatrix[4, 2] <- 5
  costMatrix[3, 2] <- 5
  library(rpart)
  tmp <- x
  tmp$y <- y
  rpart(y ~ ., data = tmp,
        control = rpart.control(cp = 0),
        parms = list(loss = costMatrix))
}

## Prediction function for bag(): the per-class probability matrix
rpPredict <- function(object, x) predict(object, x)

## Aggregation function for bag(): take the median class probability across
## the bootstrap models, renormalize each row, then pick the modal class
rpAgg <- function (x, type = "class")
{
  pooled <- x[[1]] * NA
  n <- nrow(pooled)
  classes <- colnames(pooled)
  for (i in seq_len(ncol(pooled)))
  {
    tmp <- lapply(x, function(y, col) y[, col], col = i)
    tmp <- do.call("rbind", tmp)
    pooled[, i] <- apply(tmp, 2, median)
  }
  pooled <- apply(pooled, 1, function(x) x/sum(x))
  if (n != nrow(pooled)) pooled <- t(pooled)
  out <- factor(classes[apply(pooled, 1, which.max)], levels = classes)
  out
}

set.seed(857)
rpCostBag <- train(trainData[, predictors],
                   trainData$Class,
                   "bag",
                   B = 50,
                   bagControl = bagControl(fit = rpCost,
                                           predict = rpPredict,
                                           aggregate = rpAgg,
                                           downSample = FALSE,
                                           allowParallel = FALSE),
                   trControl = ctrl)
rpCostBag

### Radial-basis SVM, with and without class weights
set.seed(857)
svmRFit <- train(modForm,
                 data = trainData,
                 method = "svmRadial",
                 metric = "Cost",
                 maximize = FALSE,
                 preProc = c("center", "scale"),
                 tuneLength = 15,
                 trControl = ctrl)
svmRFit

set.seed(857)
svmRFitCost <- train(modForm, data = trainData,
                     method = "svmRadial",
                     metric = "Cost",
                     maximize = FALSE,
                     preProc = c("center", "scale"),
                     class.weights = c(VF = 1, F = 1, M = 5, L = 10),
                     tuneLength = 15,
                     trControl = ctrl)
svmRFitCost

### Collect all fits for the resampling comparison below
modelList <- list(C5.0 = c50Fit,
                  "C5.0 (Costs)" = c50Cost,
                  CART = rpFit,
                  "CART (Costs)" = rpFitCost,
                  "Bagging (Costs)" = rpCostBag,
                  FDA = fdaFit,
                  SVM = svmRFit,
                  "SVM (Weights)" = svmRFitCost,
                  PLS = plsFit,
                  "Random Forests" = rfFit,
                  LDA = ldaFit,
                  "LDA (Sparse)" = sldaFit,
                  "Neural Networks" = nnetFit,
                  Bagging = bagFit)


################################################################################
### Section 17.2 Results

rs <- resamples(modelList)
summary(rs)

## Resampled confusion matrices (raw counts)
confusionMatrix(rpFitCost, "none")
confusionMatrix(rfFit, "none")

plot(bwplot(rs, metric = "Cost"))

## Test-set performance of the two tree-based approaches
rfPred <- predict(rfFit, testData)
rpPred <- predict(rpFitCost, testData)

confusionMatrix(rfPred, testData$Class)
confusionMatrix(rpPred, testData$Class)


################################################################################
### Session Information

sessionInfo()

q("no")
--------------------------------------------------------------------------------
/inst/chapters/13_Non-Linear_Class.R:
--------------------------------------------------------------------------------
################################################################################
### R code from Applied Predictive Modeling (2013) by Kuhn and Johnson.
### Copyright 2013 Kuhn and Johnson
### Web Page: http://www.appliedpredictivemodeling.com
### Contact: Max Kuhn (mxkuhn@gmail.com)
###
### Chapter 13 Non-Linear Classification Models
###
### Required packages: AppliedPredictiveModeling, caret, doMC (optional)
###                    kernlab, klaR, lattice, latticeExtra, MASS, mda, nnet,
###                    pROC
###
### Data used: The grant application data. See the file 'CreateGrantData.R'
###
### Notes:
### 1) This code is provided without warranty.
###
### 2) This code should help the user reproduce the results in the
### text. There will be differences between this code and what is in
### the computing section. For example, the computing sections show
### how the source functions work (e.g. randomForest() or plsr()),
### which were not directly used when creating the book. Also, there may be
### syntax differences that occur over time as packages evolve. These files
### will reflect those changes.
###
### 3) In some cases, the calculations in the book were run in
### parallel. The sub-processes may reset the random number seed.
### Your results may slightly vary.
###
################################################################################

################################################################################
### Section 13.1 Nonlinear Discriminant Analysis

## Created by 'CreateGrantData.R'; expected to supply the 'training',
## 'testing', 'pre2008' and 'reducedSet' objects used below
load("grantData.RData")

library(caret)

### Optional: parallel processing can be used via the 'do' packages,
### such as doMC, doMPI etc. We used doMC (not on Windows) to speed
### up the computations.

### WARNING: Be aware of how much memory is needed to parallel
### process. It can very quickly overwhelm the available hardware. We
### estimate the memory usage (VSIZE = total memory size) to be
### 2700M/core.

library(doMC)
registerDoMC(12)

## This control object will be used across multiple models so that the
## data splitting (fit on pre-2008, evaluate on the 2008 hold-out) is
## consistent
ctrl <- trainControl(method = "LGOCV",
                     summaryFunction = twoClassSummary,
                     classProbs = TRUE,
                     index = list(TrainSet = pre2008),
                     savePredictions = TRUE)

## Mixture discriminant analysis, tuned over the number of subclasses
set.seed(476)
mdaFit <- train(x = training[, reducedSet],
                y = training$Class,
                method = "mda",
                metric = "ROC",
                tries = 40,
                tuneGrid = expand.grid(subclasses = 1:8),
                trControl = ctrl)
mdaFit

## Drop tuning values whose fits failed, then keep only the hold-out
## predictions that match the selected tuning parameters
mdaFit$results <- mdaFit$results[!is.na(mdaFit$results$ROC), ]
mdaFit$pred <- merge(mdaFit$pred, mdaFit$bestTune)
mdaCM <- confusionMatrix(mdaFit, norm = "none")
mdaCM

## NOTE(review): roc() comes from pROC (listed in the header) but the package
## is never attached in the visible code -- presumably loaded elsewhere;
## confirm before running standalone.
mdaRoc <- roc(response = mdaFit$pred$obs,
              predictor = mdaFit$pred$successful,
              levels = rev(levels(mdaFit$pred$obs)))
mdaRoc

update(plot(mdaFit,
            ylab = "ROC AUC (2008 Hold-Out Data)"))

################################################################################
### Section 13.2 Neural Networks

nnetGrid <- expand.grid(size = 1:10, decay = c(0, .1, 1, 2))
maxSize <- max(nnetGrid$size)

## Four variants are evaluated, crossing the pre-processing choice
## (with/without the spatial sign transform) with single networks versus
## model averaging (avNNet)

set.seed(476)
nnetFit <- train(x = training[, reducedSet],
                 y = training$Class,
                 method = "nnet",
                 metric = "ROC",
                 preProc = c("center", "scale"),
                 tuneGrid = nnetGrid,
                 trace = FALSE,
                 maxit = 2000,
                 MaxNWts = 1*(maxSize * (length(reducedSet) + 1) + maxSize + 1),
                 trControl = ctrl)
nnetFit

set.seed(476)
nnetFit2 <- train(x = training[, reducedSet],
                  y = training$Class,
                  method = "nnet",
                  metric = "ROC",
                  preProc = c("center", "scale", "spatialSign"),
                  tuneGrid = nnetGrid,
                  trace = FALSE,
                  maxit = 2000,
                  MaxNWts = 1*(maxSize * (length(reducedSet) + 1) + maxSize + 1),
                  trControl = ctrl)
nnetFit2

## avNNet also tunes over bagging; hold it fixed at FALSE
nnetGrid$bag <- FALSE

set.seed(476)
nnetFit3 <- train(x = training[, reducedSet],
                  y = training$Class,
                  method = "avNNet",
                  metric = "ROC",
                  preProc = c("center", "scale"),
                  tuneGrid = nnetGrid,
                  repeats = 10,
                  trace = FALSE,
                  maxit = 2000,
                  MaxNWts = 10*(maxSize * (length(reducedSet) + 1) + maxSize + 1),
                  allowParallel = FALSE, ## otherwise too many workers would be launched
                  trControl = ctrl)
nnetFit3

set.seed(476)
nnetFit4 <- train(x = training[, reducedSet],
                  y = training$Class,
                  method = "avNNet",
                  metric = "ROC",
                  preProc = c("center", "scale", "spatialSign"),
                  tuneGrid = nnetGrid,
                  trace = FALSE,
                  maxit = 2000,
                  repeats = 10,
                  MaxNWts = 10*(maxSize * (length(reducedSet) + 1) + maxSize + 1),
                  allowParallel = FALSE,
                  trControl = ctrl)
nnetFit4

nnetFit4$pred <- merge(nnetFit4$pred, nnetFit4$bestTune)
nnetCM <- confusionMatrix(nnetFit4, norm = "none")
nnetCM

nnetRoc <- roc(response = nnetFit4$pred$obs,
               predictor = nnetFit4$pred$successful,
               levels = rev(levels(nnetFit4$pred$obs)))

## Collect the resampling profiles of all four variants for a trellis plot
nnet1 <- nnetFit$results
nnet1$Transform <- "No Transformation"
nnet1$Model <- "Single Model"

nnet2 <- nnetFit2$results
nnet2$Transform <- "Spatial Sign"
nnet2$Model <- "Single Model"

nnet3 <- nnetFit3$results
nnet3$Transform <- "No Transformation"
nnet3$Model <- "Model Averaging"
nnet3$bag <- NULL

nnet4 <- nnetFit4$results
nnet4$Transform <- "Spatial Sign"
nnet4$Model <- "Model Averaging"
nnet4$bag <- NULL

nnetResults <- rbind(nnet1, nnet2, nnet3, nnet4)
nnetResults$Model <- factor(as.character(nnetResults$Model),
                            levels = c("Single Model", "Model Averaging"))
library(latticeExtra)
useOuterStrips(
  xyplot(ROC ~ size|Model*Transform,
         data = nnetResults,
         groups = decay,
         as.table = TRUE,
         type = c("p", "l", "g"),
         lty = 1,
         ylab = "ROC AUC (2008 Hold-Out Data)",
         xlab = "Number of Hidden Units",
         auto.key = list(columns = 4,
                         title = "Weight Decay",
                         cex.title = 1)))

plot(nnetRoc, type = "s", legacy.axes = TRUE)

################################################################################
### Section 13.3 Flexible Discriminant Analysis

## FDA with first-degree MARS basis functions, tuned over the pruning size
set.seed(476)
fdaFit <- train(x = training[, reducedSet],
                y = training$Class,
                method = "fda",
                metric = "ROC",
                tuneGrid = expand.grid(degree = 1, nprune = 2:25),
                trControl = ctrl)
fdaFit

fdaFit$pred <- merge(fdaFit$pred, fdaFit$bestTune)
fdaCM <- confusionMatrix(fdaFit, norm = "none")
fdaCM

fdaRoc <- roc(response = fdaFit$pred$obs,
              predictor = fdaFit$pred$successful,
              levels = rev(levels(fdaFit$pred$obs)))

update(plot(fdaFit), ylab = "ROC AUC (2008 Hold-Out Data)")

## Overlay the FDA ROC curve (black) on the neural network curve (grey)
plot(nnetRoc, type = "s", col = rgb(.2, .2, .2, .2), legacy.axes = TRUE)
plot(fdaRoc, type = "s", add = TRUE, legacy.axes = TRUE)


################################################################################
### Section 13.4 Support Vector Machines

library(kernlab)

## Radial-basis SVM on the full predictor set; sigest() provides a
## data-driven fixed value for the kernel parameter sigma
set.seed(201)
sigmaRangeFull <- sigest(as.matrix(training[, fullSet]))
svmRGridFull <- expand.grid(sigma = as.vector(sigmaRangeFull)[1],
                            C = 2^(-3:4))
set.seed(476)
svmRFitFull <- train(x = training[, fullSet],
                     y = training$Class,
                     method = "svmRadial",
                     metric = "ROC",
                     preProc = c("center", "scale"),
                     tuneGrid = svmRGridFull,
                     trControl = ctrl)
svmRFitFull

## The same model on the reduced predictor set
set.seed(202)
sigmaRangeReduced <- sigest(as.matrix(training[, reducedSet]))
svmRGridReduced <- expand.grid(sigma = sigmaRangeReduced[1],
                               C = 2^(seq(-4, 4)))
set.seed(476)
svmRFitReduced <- train(x = training[, reducedSet],
                        y = training$Class,
                        method = "svmRadial",
                        metric = "ROC",
                        preProc = c("center", "scale"),
                        tuneGrid = svmRGridReduced,
                        trControl = ctrl)
svmRFitReduced

## Polynomial-kernel grid shared by the two fits that follow
svmPGrid <- expand.grid(degree = 1:2,
                        scale = c(0.01, .005),
                        C = 2^(seq(-6, -2, length = 10)))

set.seed(476)
svmPFitFull <- train(x = training[, fullSet],
                     y = training$Class,
                     method = "svmPoly",
                     metric = "ROC",
                     preProc = c("center", "scale"),
                     tuneGrid = svmPGrid,
                     trControl = ctrl)
svmPFitFull

svmPGrid2 <- expand.grid(degree = 1:2,
                         scale = c(0.01, .005),
                         C = 2^(seq(-6, -2, length = 10)))
set.seed(476)
svmPFitReduced <- train(x = training[, reducedSet],
                        y = training$Class,
                        method = "svmPoly",
                        metric = "ROC",
                        preProc = c("center", "scale"),
                        tuneGrid = svmPGrid2,
                        fit = FALSE,
                        trControl = ctrl)
svmPFitReduced

## Hold-out predictions, confusion matrix and ROC for the selected
## polynomial SVM
svmPFitReduced$pred <- merge(svmPFitReduced$pred, svmPFitReduced$bestTune)
svmPCM <- confusionMatrix(svmPFitReduced, norm = "none")
svmPRoc <- roc(response = svmPFitReduced$pred$obs,
               predictor = svmPFitReduced$pred$successful,
               levels = rev(levels(svmPFitReduced$pred$obs)))

## Collect the radial SVM tuning results for plotting.
## Fix: the original accessed '$result', silently relying on R's partial
## matching of list names to reach '$results'; the component name is now
## spelled out.
svmRadialResults <- rbind(svmRFitReduced$results,
                          svmRFitFull$results)
svmRadialResults$Set <- c(rep("Reduced Set", nrow(svmRFitReduced$results)),
                          rep("Full Set", nrow(svmRFitFull$results)))
svmRadialResults$Sigma <- paste("sigma = ",
                                format(svmRadialResults$sigma,
                                       scientific = FALSE, digits = 5))
svmRadialResults <- svmRadialResults[!is.na(svmRadialResults$ROC), ]
xyplot(ROC ~ C|Set, data = svmRadialResults,
       groups = Sigma, type = c("g", "o"),
       xlab = "Cost",
       ylab = "ROC (2008 Hold-Out Data)",
       auto.key = list(columns = 2),
       scales = list(x = list(log = 2)))

## Same collection for the polynomial results (same '$results' fix)
svmPolyResults <- rbind(svmPFitReduced$results,
                        svmPFitFull$results)
svmPolyResults$Set <- c(rep("Reduced Set", nrow(svmPFitReduced$results)),
                        rep("Full Set", nrow(svmPFitFull$results)))
svmPolyResults <- svmPolyResults[!is.na(svmPolyResults$ROC), ]
svmPolyResults$scale <- paste("scale = ",
                              format(svmPolyResults$scale,
                                     scientific = FALSE))
## Label the polynomial degree for the trellis plot
svmPolyResults$Degree <- "Linear"
svmPolyResults$Degree[svmPolyResults$degree == 2] <- "Quadratic"
useOuterStrips(xyplot(ROC ~ C|Degree*Set, data = svmPolyResults,
                      groups = scale, type = c("g", "o"),
                      xlab = "Cost",
                      ylab = "ROC (2008 Hold-Out Data)",
                      auto.key = list(columns = 2),
                      scales = list(x = list(log = 2))))

## ROC comparison: earlier models in grey, polynomial SVM in black
plot(nnetRoc, type = "s", col = rgb(.2, .2, .2, .2), legacy.axes = TRUE)
plot(fdaRoc, type = "s", add = TRUE, col = rgb(.2, .2, .2, .2), legacy.axes = TRUE)
plot(svmPRoc, type = "s", add = TRUE, legacy.axes = TRUE)

################################################################################
### Section 13.5 K-Nearest Neighbors

set.seed(476)
knnFit <- train(x = training[, reducedSet],
                y = training$Class,
                method = "knn",
                metric = "ROC",
                preProc = c("center", "scale"),
                tuneGrid = data.frame(k = c(4*(0:5)+1, 20*(1:5)+1, 50*(2:9)+1)),
                trControl = ctrl)
knnFit

knnFit$pred <- merge(knnFit$pred, knnFit$bestTune)
knnCM <- confusionMatrix(knnFit, norm = "none")
knnCM
knnRoc <- roc(response = knnFit$pred$obs,
              predictor = knnFit$pred$successful,
              levels = rev(levels(knnFit$pred$obs)))

update(plot(knnFit, ylab = "ROC (2008 Hold-Out Data)"))

plot(fdaRoc, type = "s", col = rgb(.2, .2, .2, .2), legacy.axes = TRUE)
plot(nnetRoc, type = "s", add = TRUE, col = rgb(.2, .2, .2, .2), legacy.axes = TRUE)
plot(svmPRoc, type = "s", add = TRUE, col = rgb(.2, .2, .2, .2), legacy.axes = TRUE)
plot(knnRoc, type = "s", add = TRUE, legacy.axes = TRUE)

################################################################################
### Section 13.6 Naive Bayes

## Create factor versions of some of the predictors so that they are treated
## as categories and not dummy variables

factors <- c("SponsorCode", "ContractValueBand", "Month", "Weekday")
nbPredictors <- factorPredictors[factorPredictors %in% reducedSet]
nbPredictors <- c(nbPredictors, factors)
nbPredictors <- nbPredictors[nbPredictors != "SponsorUnk"]

nbTraining <- training[, c("Class", nbPredictors)]
nbTesting <- testing[, c("Class", nbPredictors)]

## Convert low-cardinality predictors (15 or fewer distinct values) to
## factors, taking the levels from the training data so that the training
## and test sets agree
for(i in nbPredictors)
{
  if(length(unique(training[,i])) <= 15)
  {
    nbTraining[, i] <- factor(nbTraining[,i], levels = paste(sort(unique(training[,i]))))
    nbTesting[, i] <- factor(nbTesting[,i], levels = paste(sort(unique(training[,i]))))
  }
}

set.seed(476)
nBayesFit <- train(x = nbTraining[, nbPredictors],
                   y = nbTraining$Class,
                   method = "nb",
                   metric = "ROC",
                   tuneGrid = data.frame(usekernel = c(TRUE, FALSE), fL = 2),
                   trControl = ctrl)
nBayesFit

nBayesFit$pred <- merge(nBayesFit$pred, nBayesFit$bestTune)
nBayesCM <- confusionMatrix(nBayesFit, norm = "none")
nBayesCM
nBayesRoc <- roc(response = nBayesFit$pred$obs,
                 predictor = nBayesFit$pred$successful,
                 levels = rev(levels(nBayesFit$pred$obs)))
nBayesRoc


sessionInfo()

q("no")
--------------------------------------------------------------------------------
/inst/chapters/12_Discriminant_Analysis.R:
--------------------------------------------------------------------------------
################################################################################
### R code from Applied Predictive Modeling (2013) by Kuhn and Johnson.
### Copyright 2013 Kuhn and Johnson
### Web Page: http://www.appliedpredictivemodeling.com
### Contact: Max Kuhn (mxkuhn@gmail.com)
###
### Chapter 12 Discriminant Analysis and Other Linear Classification Models
###
### Required packages: AppliedPredictiveModeling, caret, doMC (optional),
###                    glmnet, lattice, MASS, pamr, pls, pROC, sparseLDA
###
### Data used: The grant application data. See the file 'CreateGrantData.R'
###
### Notes:
### 1) This code is provided without warranty.
###
### 2) This code should help the user reproduce the results in the
### text. There will be differences between this code and what is in
### the computing section. For example, the computing sections show
### how the source functions work (e.g. randomForest() or plsr()),
### which were not directly used when creating the book. Also, there may be
### syntax differences that occur over time as packages evolve. These files
### will reflect those changes.
###
### 3) In some cases, the calculations in the book were run in
### parallel. The sub-processes may reset the random number seed.
### Your results may slightly vary.
###
################################################################################

################################################################################
### Section 12.1 Case Study: Predicting Successful Grant Applications

## Created by 'CreateGrantData.R'; expected to supply 'training', 'testing',
## 'pre2008', 'fullSet' and 'reducedSet' used throughout this script
load("grantData.RData")

library(caret)
library(doMC)
registerDoMC(12)
library(plyr)
library(reshape2)

## Look at two different ways to split and resample the data. A support vector
## machine is used to illustrate the differences. The full set of predictors
## is used.

## Two candidate data pools: pre-2008 grants versus everything from 2008 on
pre2008Data <- training[pre2008,]
year2008Data <- rbind(training[-pre2008,], testing)

set.seed(552)
test2008 <- createDataPartition(year2008Data$Class, p = .25)[[1]]

allData <- rbind(pre2008Data, year2008Data[-test2008,])
holdout2008 <- year2008Data[test2008,]

## Use a common tuning grid for both approaches.
svmrGrid <- expand.grid(sigma = c(.00007, .00009, .0001, .0002),
                        C = 2^(-3:8))

## Approach 1: evaluate the model using overall 10-fold cross-validation
ctrl0 <- trainControl(method = "cv",
                      summaryFunction = twoClassSummary,
                      classProbs = TRUE)
set.seed(477)
svmFit0 <- train(pre2008Data[, fullSet], pre2008Data$Class,
                 method = "svmRadial",
                 tuneGrid = svmrGrid,
                 preProc = c("center", "scale"),
                 metric = "ROC",
                 trControl = ctrl0)
svmFit0

### Approach 2: fit on the pre-2008 rows, evaluate once on the 2008 set
### (the index holds the training rows despite the "TestSet" label)
ctrl00 <- trainControl(method = "LGOCV",
                       summaryFunction = twoClassSummary,
                       classProbs = TRUE,
                       index = list(TestSet = seq_len(nrow(pre2008Data))))

set.seed(476)
svmFit00 <- train(allData[, fullSet], allData$Class,
                  method = "svmRadial",
                  tuneGrid = svmrGrid,
                  preProc = c("center", "scale"),
                  metric = "ROC",
                  trControl = ctrl00)
svmFit00

## Combine the two sets of results and plot the tuning profiles

grid0 <- subset(svmFit0$results, sigma == svmFit0$bestTune$sigma)
grid0$Model <- "10-Fold Cross-Validation"

grid00 <- subset(svmFit00$results, sigma == svmFit00$bestTune$sigma)
grid00$Model <- "Single 2008 Test Set"

plotData <- rbind(grid00, grid0)

plotData <- plotData[!is.na(plotData$ROC), ]
xyplot(ROC ~ C, data = plotData,
       groups = Model,
       type = c("g", "o"),
       scales = list(x = list(log = 2)),
       auto.key = list(columns = 1))

################################################################################
### Section 12.2 Logistic Regression

## Simple logistic regression on the day of the year: linear vs quadratic.
## glm() models the probability of the second factor level, so predictions
## are subtracted from one to get P(successful) -- consistent with the
## "Probability of A Successful Grant" axis label below.
modelFit <- glm(Class ~ Day, data = training[pre2008,], family = binomial)
dataGrid <- data.frame(Day = seq(0, 365, length = 500))
dataGrid$Linear <- 1 - predict(modelFit, dataGrid, type = "response")
linear2008 <- auc(roc(response = training[-pre2008, "Class"],
                      predictor = 1 - predict(modelFit,
                                              training[-pre2008,],
                                              type = "response"),
                      levels = rev(levels(training[-pre2008, "Class"]))))


modelFit2 <- glm(Class ~ Day + I(Day^2),
                 data = training[pre2008,],
                 family = binomial)
dataGrid$Quadratic <- 1 - predict(modelFit2, dataGrid, type = "response")
quad2008 <- auc(roc(response = training[-pre2008, "Class"],
                    predictor = 1 - predict(modelFit2,
                                            training[-pre2008,],
                                            type = "response"),
                    levels = rev(levels(training[-pre2008, "Class"]))))

dataGrid <- melt(dataGrid, id.vars = "Day")

## Observed success rates in 5-day bins, overlaid on the fitted curves
byDay <- training[pre2008, c("Day", "Class")]
byDay$Binned <- cut(byDay$Day, seq(0, 360, by = 5))

observedProps <- ddply(byDay, .(Binned),
                       function(x) c(n = nrow(x), mean = mean(x$Class == "successful")))
observedProps$midpoint <- seq(2.5, 357.5, by = 5)

xyplot(value ~ Day|variable, data = dataGrid,
       ylab = "Probability of A Successful Grant",
       ylim = extendrange(0:1),
       between = list(x = 1),
       panel = function(...)
       {
         panel.xyplot(x = observedProps$midpoint, observedProps$mean,
                      pch = 16, col = rgb(.2, .2, .2, .5))
         panel.xyplot(..., type = "l", col = "black", lwd = 2)
       })

## For the full and reduced predictor sets, fit the logistic regression
## model (with the quadratic day term) and evaluate on the 2008 hold-out set
training$Day2 <- training$Day^2
testing$Day2 <- testing$Day^2
fullSet <- c(fullSet, "Day2")
reducedSet <- c(reducedSet, "Day2")

## This control object will be used across multiple models so that the
## data splitting is consistent
ctrl <- trainControl(method = "LGOCV",
                     summaryFunction = twoClassSummary,
                     classProbs = TRUE,
                     index = list(TrainSet = pre2008),
                     savePredictions = TRUE)

set.seed(476)
lrFit <- train(x = training[, reducedSet],
               y = training$Class,
               method = "glm",
               metric = "ROC",
               trControl = ctrl)
lrFit

set.seed(476)
## Fix: the original used training[, c(fullSet, "Day2")], but "Day2" was
## already appended to fullSet above, so the column was duplicated (creating
## an aliased, rank-deficient predictor). Use fullSet directly.
lrFit2 <- train(x = training[, fullSet],
                y = training$Class,
                method = "glm",
                metric = "ROC",
                trControl = ctrl)
lrFit2

lrFit$pred <- merge(lrFit$pred, lrFit$bestTune)

## Get the confusion matrices for the hold-out set
lrCM <- confusionMatrix(lrFit, norm = "none")
lrCM
lrCM2 <- confusionMatrix(lrFit2, norm = "none")
lrCM2

## Get the area under the ROC curve for the hold-out set
lrRoc <- roc(response = lrFit$pred$obs,
             predictor = lrFit$pred$successful,
             levels = rev(levels(lrFit$pred$obs)))
lrRoc2 <- roc(response = lrFit2$pred$obs,
              predictor = lrFit2$pred$successful,
              levels = rev(levels(lrFit2$pred$obs)))
lrImp <- varImp(lrFit, scale = FALSE)

plot(lrRoc, legacy.axes = TRUE)

################################################################################
### Section 12.3 Linear Discriminant Analysis

## Fit the model to the reduced set
set.seed(476)
ldaFit <- train(x = training[, reducedSet],
                y = training$Class,
                method = "lda",
                preProc = c("center", "scale"),
                metric = "ROC",
                trControl = ctrl)
ldaFit

ldaFit$pred <- merge(ldaFit$pred, ldaFit$bestTune)
ldaCM <- confusionMatrix(ldaFit, norm = "none")
ldaCM
ldaRoc <- roc(response = ldaFit$pred$obs,
              predictor = ldaFit$pred$successful,
              levels = rev(levels(ldaFit$pred$obs)))
plot(lrRoc, type = "s", col = rgb(.2, .2, .2, .2), legacy.axes = TRUE)
plot(ldaRoc, add = TRUE, type = "s", legacy.axes = TRUE)

################################################################################
### Section 12.4 Partial Least Squares Discriminant Analysis

## This model uses all of the predictors
set.seed(476)
plsFit <- train(x = training[, fullSet],
                y = training$Class,
                method = "pls",
                tuneGrid = expand.grid(ncomp = 1:10),
                preProc = c("center", "scale"),
                metric = "ROC",
                probMethod = "Bayes",
                trControl = ctrl)
plsFit

plsImpGrant <- varImp(plsFit, scale = FALSE)

bestPlsNcomp <- plsFit$results[best(plsFit$results, "ROC", maximize = TRUE), "ncomp"]
bestPlsROC <- plsFit$results[best(plsFit$results, "ROC", maximize = TRUE), "ROC"]

## Only keep the final tuning parameter data
plsFit$pred <- merge(plsFit$pred, plsFit$bestTune)

plsRoc <- roc(response = plsFit$pred$obs,
              predictor = plsFit$pred$successful,
              levels = rev(levels(plsFit$pred$obs)))

### PLS confusion matrix information
plsCM <- confusionMatrix(plsFit, norm = "none")
plsCM

## Now fit a model that uses a smaller set of predictors chosen by unsupervised
## filtering.
## Refit PLSDA on the reduced predictor set, then fit the penalized models
## (glmnet, sparse LDA) and nearest shrunken centroids, comparing hold-out
## ROC curves as each model is added.

set.seed(476)
plsFit2 <- train(x = training[,reducedSet],
                 y = training$Class,
                 method = "pls",
                 tuneGrid = expand.grid(ncomp = 1:10),
                 preProc = c("center","scale"),
                 metric = "ROC",
                 probMethod = "Bayes",
                 trControl = ctrl)
plsFit2

bestPlsNcomp2 <- plsFit2$results[best(plsFit2$results, "ROC", maximize = TRUE), "ncomp"]
bestPlsROC2 <- plsFit2$results[best(plsFit2$results, "ROC", maximize = TRUE), "ROC"]

plsFit2$pred <- merge(plsFit2$pred, plsFit2$bestTune)

plsRoc2 <- roc(response = plsFit2$pred$obs,
               predictor = plsFit2$pred$successful,
               levels = rev(levels(plsFit2$pred$obs)))
plsCM2 <- confusionMatrix(plsFit2, norm = "none")
plsCM2

## Compare the full-set and reduced-set PLS tuning profiles in one plot.
pls.ROC <- cbind(plsFit$results, Descriptors = "Full Set")
pls2.ROC <- cbind(plsFit2$results, Descriptors = "Reduced Set")

plsCompareROC <- data.frame(rbind(pls.ROC, pls2.ROC))

xyplot(ROC ~ ncomp,
       data = plsCompareROC,
       xlab = "# Components",
       ylab = "ROC (2008 Hold-Out Data)",
       auto.key = list(columns = 2),
       groups = Descriptors,
       type = c("o", "g"))

## Plot ROC curves and variable importance scores
plot(ldaRoc, type = "s", col = rgb(.2, .2, .2, .2), legacy.axes = TRUE)
plot(lrRoc, type = "s", col = rgb(.2, .2, .2, .2), add = TRUE, legacy.axes = TRUE)
plot(plsRoc2, type = "s", add = TRUE, legacy.axes = TRUE)

plot(plsImpGrant, top = 20, scales = list(y = list(cex = .95)))

################################################################################
### Section 12.5 Penalized Models

## The glmnet model: tune over the mixing proportion (alpha: ridge <-> lasso)
## and the amount of regularization (lambda).
glmnGrid <- expand.grid(alpha = c(0, .1, .2, .4, .6, .8, 1),
                        lambda = seq(.01, .2, length = 40))
set.seed(476)
glmnFit <- train(x = training[,fullSet],
                 y = training$Class,
                 method = "glmnet",
                 tuneGrid = glmnGrid,
                 preProc = c("center", "scale"),
                 metric = "ROC",
                 trControl = ctrl)
glmnFit

glmnet2008 <- merge(glmnFit$pred, glmnFit$bestTune)
glmnetCM <- confusionMatrix(glmnFit, norm = "none")
glmnetCM

glmnetRoc <- roc(response = glmnet2008$obs,
                 predictor = glmnet2008$successful,
                 levels = rev(levels(glmnet2008$obs)))

## Work on a copy with rounded lambda labels so the level-plot axis is legible.
glmnFit0 <- glmnFit
glmnFit0$results$lambda <- format(round(glmnFit0$results$lambda, 3))

glmnPlot <- plot(glmnFit0,
                 plotType = "level",
                 cuts = 15,
                 scales = list(x = list(rot = 90, cex = .65)))

update(glmnPlot,
       ylab = "Mixing Percentage\nRidge <---------> Lasso",
       sub = "",
       main = "Area Under the ROC Curve",
       xlab = "Amount of Regularization")

plot(plsRoc2, type = "s", col = rgb(.2, .2, .2, .2), legacy.axes = TRUE)
plot(ldaRoc, type = "s", add = TRUE, col = rgb(.2, .2, .2, .2), legacy.axes = TRUE)
plot(lrRoc, type = "s", col = rgb(.2, .2, .2, .2), add = TRUE, legacy.axes = TRUE)
plot(glmnetRoc, type = "s", add = TRUE, legacy.axes = TRUE)

## Sparse LDA. (The original comment said "sparse logistic regression", but
## method = "sparseLDA" fits the sparse discriminant model from the sparseLDA
## package.)
set.seed(476)
## FIX: name the method argument instead of passing it positionally.
spLDAFit <- train(x = training[,fullSet],
                  y = training$Class,
                  method = "sparseLDA",
                  tuneGrid = expand.grid(lambda = c(.1),
                                         NumVars = c(1:20, 50, 75, 100, 250, 500, 750, 1000)),
                  preProc = c("center", "scale"),
                  metric = "ROC",
                  trControl = ctrl)
spLDAFit

spLDA2008 <- merge(spLDAFit$pred, spLDAFit$bestTune)
spLDACM <- confusionMatrix(spLDAFit, norm = "none")
spLDACM

spLDARoc <- roc(response = spLDA2008$obs,
                predictor = spLDA2008$successful,
                levels = rev(levels(spLDA2008$obs)))

update(plot(spLDAFit, scales = list(x = list(log = 10))),
       ylab = "ROC AUC (2008 Hold-Out Data)")

plot(plsRoc2, type = "s", col = rgb(.2, .2, .2, .2), legacy.axes = TRUE)
plot(glmnetRoc, type = "s", add = TRUE, col = rgb(.2, .2, .2, .2), legacy.axes = TRUE)
plot(ldaRoc, type = "s", add = TRUE, col = rgb(.2, .2, .2, .2), legacy.axes = TRUE)
plot(lrRoc, type = "s", col = rgb(.2, .2, .2, .2), add = TRUE, legacy.axes = TRUE)
plot(spLDARoc, type = "s", add = TRUE, legacy.axes = TRUE)

################################################################################
### Section 12.6 Nearest Shrunken Centroids

set.seed(476)
nscFit <- train(x = training[,fullSet],
                y = training$Class,
                method = "pam",
                preProc = c("center", "scale"),
                tuneGrid = data.frame(threshold = seq(0, 25, length = 30)),
                metric = "ROC",
                trControl = ctrl)
nscFit

nsc2008 <- merge(nscFit$pred, nscFit$bestTune)
nscCM <- confusionMatrix(nscFit, norm = "none")
nscCM
nscRoc <- roc(response = nsc2008$obs,
              predictor = nsc2008$successful,
              levels = rev(levels(nsc2008$obs)))
update(plot(nscFit), ylab = "ROC AUC (2008 Hold-Out Data)")

## Final overlay of all hold-out ROC curves.
plot(plsRoc2, type = "s", col = rgb(.2, .2, .2, .2), legacy.axes = TRUE)
plot(glmnetRoc, type = "s", add = TRUE, col = rgb(.2, .2, .2, .2), legacy.axes = TRUE)
plot(ldaRoc, type = "s", add = TRUE, col = rgb(.2, .2, .2, .2), legacy.axes = TRUE)
plot(lrRoc, type = "s", col = rgb(.2, .2, .2, .2), add = TRUE, legacy.axes = TRUE)
plot(spLDARoc, type = "s", col = rgb(.2, .2, .2, .2), add = TRUE, legacy.axes = TRUE)
plot(nscRoc, type = "s", add = TRUE, legacy.axes = TRUE)

sessionInfo()

q("no")

--------------------------------------------------------------------------------
/inst/chapters/19_Feature_Select.R:
--------------------------------------------------------------------------------
################################################################################
### R code from Applied Predictive Modeling (2013) by Kuhn and Johnson.
3 | ### Copyright 2013 Kuhn and Johnson 4 | ### Web Page: http://www.appliedpredictivemodeling.com 5 | ### Contact: Max Kuhn (mxkuhn@gmail.com) 6 | ### 7 | ### Chapter 19: An Introduction to Feature Selection 8 | ### 9 | ### Required packages: AppliedPredictiveModeling, caret, MASS, corrplot, 10 | ### RColorBrewer, randomForest, kernlab, klaR, 11 | ### 12 | ### 13 | ### Data used: The Alzheimer disease data from the AppliedPredictiveModeling 14 | ### package 15 | ### 16 | ### Notes: 17 | ### 1) This code is provided without warranty. 18 | ### 19 | ### 2) This code should help the user reproduce the results in the 20 | ### text. There will be differences between this code and what is is 21 | ### the computing section. For example, the computing sections show 22 | ### how the source functions work (e.g. randomForest() or plsr()), 23 | ### which were not directly used when creating the book. Also, there may be 24 | ### syntax differences that occur over time as packages evolve. These files 25 | ### will reflect those changes. 26 | ### 27 | ### 3) In some cases, the calculations in the book were run in 28 | ### parallel. The sub-processes may reset the random number seed. 29 | ### Your results may slightly vary. 
30 | ### 31 | ################################################################################ 32 | 33 | 34 | 35 | ################################################################################ 36 | ### Section 19.6 Case Study: Predicting Cognitive Impairment 37 | 38 | 39 | library(AppliedPredictiveModeling) 40 | data(AlzheimerDisease) 41 | 42 | ## The baseline set of predictors 43 | bl <- c("Genotype", "age", "tau", "p_tau", "Ab_42", "male") 44 | 45 | ## The set of new assays 46 | newAssays <- colnames(predictors) 47 | newAssays <- newAssays[!(newAssays %in% c("Class", bl))] 48 | 49 | ## Decompose the genotype factor into binary dummy variables 50 | 51 | predictors$E2 <- predictors$E3 <- predictors$E4 <- 0 52 | predictors$E2[grepl("2", predictors$Genotype)] <- 1 53 | predictors$E3[grepl("3", predictors$Genotype)] <- 1 54 | predictors$E4[grepl("4", predictors$Genotype)] <- 1 55 | genotype <- predictors$Genotype 56 | 57 | ## Partition the data 58 | library(caret) 59 | set.seed(730) 60 | split <- createDataPartition(diagnosis, p = .8, list = FALSE) 61 | 62 | adData <- predictors 63 | adData$Class <- diagnosis 64 | 65 | training <- adData[ split, ] 66 | testing <- adData[-split, ] 67 | 68 | predVars <- names(adData)[!(names(adData) %in% c("Class", "Genotype"))] 69 | 70 | ## This summary function is used to evaluate the models. 71 | fiveStats <- function(...) c(twoClassSummary(...), defaultSummary(...)) 72 | 73 | ## We create the cross-validation files as a list to use with different 74 | ## functions 75 | 76 | set.seed(104) 77 | index <- createMultiFolds(training$Class, times = 5) 78 | 79 | ## The candidate set of the number of predictors to evaluate 80 | varSeq <- seq(1, length(predVars)-1, by = 2) 81 | 82 | ## We can also use parallel processing to run each resampled RFE 83 | ## iteration (or resampled model with train()) using different 84 | ## workers. 
## Register parallel workers and fit a suite of classification models on the
## full predictor set with a shared repeated-CV control object.

library(doMC)
## NOTE(review): worker count is hard-coded for the authors' machine; adjust
## to the local core count.
registerDoMC(15)


## The rfe() function in the caret package is used for recursive feature
## elimination. We set up control functions for this and train() that use
## the same cross-validation folds. The 'ctrl' object will be modified several
## times as we try different models.

ctrl <- rfeControl(method = "repeatedcv", repeats = 5,
                   saveDetails = TRUE,
                   index = index,
                   returnResamp = "final")

fullCtrl <- trainControl(method = "repeatedcv",
                         repeats = 5,
                         summaryFunction = fiveStats,
                         classProbs = TRUE,
                         index = index)

## The correlation matrix of the new data
predCor <- cor(training[, newAssays])

library(RColorBrewer)
cols <- c(rev(brewer.pal(7, "Blues")),
          brewer.pal(7, "Reds"))
library(corrplot)
corrplot(predCor,
         order = "hclust",
         tl.pos = "n", addgrid.col = rgb(1,1,1,.01),
         col = colorRampPalette(cols)(51))

## Fit a series of models with the full set of predictors. The same seed is
## set before each fit so every model sees identical resampling randomness.
set.seed(721)
rfFull <- train(training[, predVars],
                training$Class,
                method = "rf",
                metric = "ROC",
                tuneGrid = data.frame(mtry = floor(sqrt(length(predVars)))),
                ntree = 1000,
                trControl = fullCtrl)
rfFull

set.seed(721)
ldaFull <- train(training[, predVars],
                 training$Class,
                 method = "lda",
                 metric = "ROC",
                 ## The 'tol' argument helps lda() know when a matrix is
                 ## singular. One of the predictors has values very close to
                 ## zero, so we lower the value below the default of 1.0e-4.
                 tol = 1.0e-12,
                 trControl = fullCtrl)
ldaFull

set.seed(721)
svmFull <- train(training[, predVars],
                 training$Class,
                 method = "svmRadial",
                 metric = "ROC",
                 tuneLength = 12,
                 preProc = c("center", "scale"),
                 trControl = fullCtrl)
svmFull

set.seed(721)
nbFull <- train(training[, predVars],
                training$Class,
                method = "nb",
                metric = "ROC",
                trControl = fullCtrl)
nbFull

## FIX: the original omitted set.seed(721) before this fit, unlike every
## other model above; added for consistent resampling state.
set.seed(721)
lrFull <- train(training[, predVars],
                training$Class,
                method = "glm",
                metric = "ROC",
                trControl = fullCtrl)
lrFull

set.seed(721)
knnFull <- train(training[, predVars],
                 training$Class,
                 method = "knn",
                 metric = "ROC",
                 tuneLength = 20,
                 preProc = c("center", "scale"),
                 trControl = fullCtrl)
knnFull

## Now fit the RFE versions. To do this, the 'functions' argument of the rfe()
## object is modified to the appropriate functions. For model details about
## these functions and their arguments, see
##
## http://caret.r-forge.r-project.org/featureSelection.html
##
## for more information.




## Random forest RFE: swap in the RF helper functions, keeping the shared
## summary statistics. Note that ctrl is mutated in place before each rfe()
## call, so the order of these statements is load-bearing.
ctrl$functions <- rfFuncs
ctrl$functions$summary <- fiveStats
set.seed(721)
rfRFE <- rfe(training[, predVars],
             training$Class,
             sizes = varSeq,
             metric = "ROC",
             ntree = 1000,
             rfeControl = ctrl)
rfRFE

## LDA RFE.
ctrl$functions <- ldaFuncs
ctrl$functions$summary <- fiveStats

set.seed(721)
ldaRFE <- rfe(training[, predVars],
              training$Class,
              sizes = varSeq,
              metric = "ROC",
              tol = 1.0e-12,
              rfeControl = ctrl)
ldaRFE

## Naive Bayes RFE.
ctrl$functions <- nbFuncs
ctrl$functions$summary <- fiveStats
set.seed(721)
nbRFE <- rfe(training[, predVars],
             training$Class,
             sizes = varSeq,
             metric = "ROC",
             rfeControl = ctrl)
nbRFE

## Here, the caretFuncs list allows for a model to be tuned at each iteration
## of feature selection.

ctrl$functions <- caretFuncs
ctrl$functions$summary <- fiveStats

## This option tells train() to run its model tuning
## sequentially. Otherwise, there would be parallel processing at two
## levels, which is possible but requires W^2 workers. On our machine,
## it was more efficient to only run the RFE process in parallel.

cvCtrl <- trainControl(method = "cv",
                       verboseIter = FALSE,
                       classProbs = TRUE,
                       allowParallel = FALSE)

set.seed(721)
svmRFE <- rfe(training[, predVars],
              training$Class,
              sizes = varSeq,
              rfeControl = ctrl,
              metric = "ROC",
              ## Now arguments to train() are used.
              method = "svmRadial",
              tuneLength = 12,
              preProc = c("center", "scale"),
              trControl = cvCtrl)
svmRFE

## Logistic regression RFE.
ctrl$functions <- lrFuncs
ctrl$functions$summary <- fiveStats

set.seed(721)
lrRFE <- rfe(training[, predVars],
             training$Class,
             sizes = varSeq,
             metric = "ROC",
             rfeControl = ctrl)
lrRFE

## K-NN RFE, again tuning the model inside each RFE iteration.
ctrl$functions <- caretFuncs
ctrl$functions$summary <- fiveStats

set.seed(721)
knnRFE <- rfe(training[, predVars],
              training$Class,
              sizes = varSeq,
              metric = "ROC",
              method = "knn",
              tuneLength = 20,
              preProc = c("center", "scale"),
              trControl = cvCtrl,
              rfeControl = ctrl)
knnRFE

## Each of these models can be evaluated using the plot() function to see
## the profile across subset sizes.

## Test set ROC results. For the full models, column 1 of the probability
## predictions is used; for the RFE objects, the $Impaired probability column.
rfROCfull <- roc(testing$Class,
                 predict(rfFull, testing[,predVars], type = "prob")[,1])
rfROCfull
rfROCrfe <- roc(testing$Class,
                predict(rfRFE, testing[,predVars])$Impaired)
rfROCrfe

ldaROCfull <- roc(testing$Class,
                  predict(ldaFull, testing[,predVars], type = "prob")[,1])
ldaROCfull
ldaROCrfe <- roc(testing$Class,
                 predict(ldaRFE, testing[,predVars])$Impaired)
ldaROCrfe

nbROCfull <- roc(testing$Class,
                 predict(nbFull, testing[,predVars], type = "prob")[,1])
nbROCfull
nbROCrfe <- roc(testing$Class,
                predict(nbRFE, testing[,predVars])$Impaired)
nbROCrfe

svmROCfull <- roc(testing$Class,
                  predict(svmFull, testing[,predVars], type = "prob")[,1])
svmROCfull
svmROCrfe <- roc(testing$Class,
                 predict(svmRFE, testing[,predVars])$Impaired)
svmROCrfe

lrROCfull <- roc(testing$Class,
                 predict(lrFull, testing[,predVars], type = "prob")[,1])
lrROCfull
lrROCrfe <- roc(testing$Class,
                predict(lrRFE, testing[,predVars])$Impaired)
lrROCrfe

knnROCfull <- roc(testing$Class,
                  predict(knnFull, testing[,predVars], type = "prob")[,1])
knnROCfull
knnROCrfe <- roc(testing$Class,
                 predict(knnRFE, testing[,predVars])$Impaired)
knnROCrfe


## For filter methods, the sbf() function (named for Selection By Filter) is
## used. It has similar arguments to rfe() to control the model fitting and
## filtering methods.

## P-values are created for filtering.

## A set of four LDA models are fit based on two factors: p-value adjustment
## using a Bonferroni adjustment and whether the predictors should be
## pre-screened for high correlations.

## Mean (or other summary, via 'fun') of the number of variables selected
## per resample of an sbf() result.
sbfResamp <- function(x, fun = mean)
{
  x <- unlist(lapply(x$variables, length))
  fun(x)
}
## Test-set area under the ROC curve for an sbf() model.
sbfROC <- function(mod) auc(roc(testing$Class, predict(mod, testing)$Impaired))

## This function calculates p-values using either a t-test (when the predictor
## has more than two distinct values) or using Fisher's Exact Test otherwise.

pScore <- function(x, y)
{
  numX <- length(unique(x))
  if(numX > 2)
  {
    out <- t.test(x ~ y)$p.value
  } else {
    out <- fisher.test(factor(x), y)$p.value
  }
  out
}
## Start from caret's LDA selection-by-filter helpers, substituting the
## p-value score and the five-statistic summary.
ldaWithPvalues <- ldaSBF
ldaWithPvalues$score <- pScore
ldaWithPvalues$summary <- fiveStats

## Predictors are retained if their p-value is less than the completely
## subjective cut-off of 0.05.
360 | 361 | ldaWithPvalues$filter <- function (score, x, y) 362 | { 363 | keepers <- score <= 0.05 364 | keepers 365 | } 366 | 367 | sbfCtrl <- sbfControl(method = "repeatedcv", 368 | repeats = 5, 369 | verbose = TRUE, 370 | functions = ldaWithPvalues, 371 | index = index) 372 | 373 | rawCorr <- sbf(training[, predVars], 374 | training$Class, 375 | tol = 1.0e-12, 376 | sbfControl = sbfCtrl) 377 | rawCorr 378 | 379 | ldaWithPvalues$filter <- function (score, x, y) 380 | { 381 | score <- p.adjust(score, "bonferroni") 382 | keepers <- score <= 0.05 383 | keepers 384 | } 385 | sbfCtrl <- sbfControl(method = "repeatedcv", 386 | repeats = 5, 387 | verbose = TRUE, 388 | functions = ldaWithPvalues, 389 | index = index) 390 | 391 | adjCorr <- sbf(training[, predVars], 392 | training$Class, 393 | tol = 1.0e-12, 394 | sbfControl = sbfCtrl) 395 | adjCorr 396 | 397 | ldaWithPvalues$filter <- function (score, x, y) 398 | { 399 | keepers <- score <= 0.05 400 | corrMat <- cor(x[,keepers]) 401 | tooHigh <- findCorrelation(corrMat, .75) 402 | if(length(tooHigh) > 0) keepers[tooHigh] <- FALSE 403 | keepers 404 | } 405 | sbfCtrl <- sbfControl(method = "repeatedcv", 406 | repeats = 5, 407 | verbose = TRUE, 408 | functions = ldaWithPvalues, 409 | index = index) 410 | 411 | rawNoCorr <- sbf(training[, predVars], 412 | training$Class, 413 | tol = 1.0e-12, 414 | sbfControl = sbfCtrl) 415 | rawNoCorr 416 | 417 | ldaWithPvalues$filter <- function (score, x, y) 418 | { 419 | score <- p.adjust(score, "bonferroni") 420 | keepers <- score <= 0.05 421 | corrMat <- cor(x[,keepers]) 422 | tooHigh <- findCorrelation(corrMat, .75) 423 | if(length(tooHigh) > 0) keepers[tooHigh] <- FALSE 424 | keepers 425 | } 426 | sbfCtrl <- sbfControl(method = "repeatedcv", 427 | repeats = 5, 428 | verbose = TRUE, 429 | functions = ldaWithPvalues, 430 | index = index) 431 | 432 | adjNoCorr <- sbf(training[, predVars], 433 | training$Class, 434 | tol = 1.0e-12, 435 | sbfControl = sbfCtrl) 436 | adjNoCorr 437 | 438 | 
## Filter methods test set ROC results: 439 | 440 | sbfROC(rawCorr) 441 | sbfROC(rawNoCorr) 442 | sbfROC(adjCorr) 443 | sbfROC(adjNoCorr) 444 | 445 | ## Get the resampling results for all the models 446 | 447 | rfeResamples <- resamples(list(RF = rfRFE, 448 | "Logistic Reg." = lrRFE, 449 | "SVM" = svmRFE, 450 | "$K$--NN" = knnRFE, 451 | "N. Bayes" = nbRFE, 452 | "LDA" = ldaRFE)) 453 | summary(rfeResamples) 454 | 455 | fullResamples <- resamples(list(RF = rfFull, 456 | "Logistic Reg." = lrFull, 457 | "SVM" = svmFull, 458 | "$K$--NN" = knnFull, 459 | "N. Bayes" = nbFull, 460 | "LDA" = ldaFull)) 461 | summary(fullResamples) 462 | 463 | filteredResamples <- resamples(list("No Adjustment, Corr Vars" = rawCorr, 464 | "No Adjustment, No Corr Vars" = rawNoCorr, 465 | "Bonferroni, Corr Vars" = adjCorr, 466 | "Bonferroni, No Corr Vars" = adjNoCorr)) 467 | summary(filteredResamples) 468 | 469 | sessionInfo() 470 | 471 | 472 | --------------------------------------------------------------------------------