├── .gitignore ├── R ├── chap1.R ├── chap10.R ├── chap11.R ├── chap12.R ├── chap13.R ├── chap14.R ├── chap15.R ├── chap16.R ├── chap17.R ├── chap18.R ├── chap19.R ├── chap2.R ├── chap20.R ├── chap21.R ├── chap22.R ├── chap23.R ├── chap3.R ├── chap4.R ├── chap5.R ├── chap6.R ├── chap7.R ├── chap8.R └── chap9.R ├── README.md ├── Rmd ├── chap1.Rmd ├── chap10.Rmd ├── chap11.Rmd ├── chap12.Rmd ├── chap13.Rmd ├── chap14.Rmd ├── chap15.Rmd ├── chap16.Rmd ├── chap17.Rmd ├── chap18.Rmd ├── chap19.Rmd ├── chap2.Rmd ├── chap20.Rmd ├── chap21.Rmd ├── chap22.Rmd ├── chap23.Rmd ├── chap3.Rmd ├── chap4.Rmd ├── chap5.Rmd ├── chap6.Rmd ├── chap7.Rmd ├── chap8.Rmd └── chap9.Rmd ├── _config.yml ├── _layouts └── default.html ├── assets └── css │ └── style.scss ├── images ├── anaconda-create-environment.png ├── anaconda-environment.png ├── anaconda-mlba-r-environment.png ├── anaconda-open-terminal.png ├── anaconda-packagelist.png ├── anaconda-update.png ├── jupyter-executing-python.png ├── jupyter-new-notebook.png ├── jupyter-notebook-filemanager.png └── logo.png ├── img └── mlba-bookcover.png ├── installPython.md ├── mlba-R.zip └── mlba-Rmd.zip /.gitignore: -------------------------------------------------------------------------------- 1 | # History files 2 | .Rhistory 3 | .Rapp.history 4 | 5 | # Session Data files 6 | .RData 7 | 8 | # User-specific files 9 | .Ruserdata 10 | 11 | # Example code in package build process 12 | *-Ex.R 13 | 14 | # Output files from R CMD build 15 | /*.tar.gz 16 | 17 | # Output files from R CMD check 18 | /*.Rcheck/ 19 | 20 | # RStudio files 21 | .Rproj.user/ 22 | 23 | # produced vignettes 24 | vignettes/*.html 25 | vignettes/*.pdf 26 | 27 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 28 | .httr-oauth 29 | 30 | # knitr and R markdown default cache directories 31 | *_cache/ 32 | /cache/ 33 | 34 | # Temporary files created by R markdown 35 | *.utf8.md 36 | *.knit.md 37 | 38 | # R Environment Variables 39 | .Renviron 40 | -------------------------------------------------------------------------------- /R/chap1.R: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gedeck/mlba-R-code/c6909650d140f4f0e4da6987f1f97daf5786ea20/R/chap1.R -------------------------------------------------------------------------------- /R/chap11.R: -------------------------------------------------------------------------------- 1 | 2 | if (!require(mlba)) { 3 | library(devtools) 4 | install_github("gedeck/mlba/mlba", force=TRUE) 5 | } 6 | 7 | # Neural Nets 8 | ## Fitting a Network to Data 9 | ### Training the Model 10 | #### Back Propagation of Error 11 | 12 | library(neuralnet) 13 | df <- mlba::TinyData 14 | df$Like <- df$Acceptance=="like" 15 | df$Dislike <- df$Acceptance=="dislike" 16 | set.seed(1) 17 | 18 | 19 | nn <- neuralnet(Like + Dislike ~ Salt + Fat, data = df, linear.output = F, hidden = 3) 20 | 21 | # display weights 22 | nn$weights 23 | 24 | # display predictions 25 | prediction(nn) 26 | 27 | # plot network 28 | plot(nn, rep="best") 29 | 30 | 31 | 32 | 33 | library(caret) 34 | predict <- compute(nn, data.frame(df$Salt, df$Fat)) 35 | predicted.class=apply(predict$net.result,1,which.max)-1 36 | confusionMatrix(factor(ifelse(predicted.class=="1", "dislike", "like")), 37 | factor(df$Acceptance)) 38 | 39 | 40 | 41 | ### Example 2: Classifying Accident Severity 42 | 43 | library(tidyverse) 44 | library(fastDummies) 45 | 46 | # convert SUR_COND and ALCHL_I to dummy variables (remove firest dummy) 47 | # convert 
outcome MAX_SEV_IR to dummy variables keeping all 48 | accidents.df <- mlba::AccidentsNN %>% 49 | dummy_cols(select_columns=c("ALCHL_I", "SUR_COND"), 50 | remove_selected_columns=TRUE, remove_first_dummy=TRUE) %>% 51 | dummy_cols(select_columns=c("MAX_SEV_IR"), 52 | remove_selected_columns=TRUE) 53 | 54 | # partition the data 55 | set.seed(1) 56 | idx <- createDataPartition(mlba::AccidentsNN$MAX_SEV_IR, p=0.6, list=FALSE) 57 | train.df <- accidents.df[idx, ] 58 | holdout.df <- accidents.df[-idx, ] 59 | train.actual <- mlba::AccidentsNN[idx, ]$MAX_SEV_IR 60 | holdout.actual <- mlba::AccidentsNN[-idx, ]$MAX_SEV_IR 61 | 62 | nn <- neuralnet(MAX_SEV_IR_0 + MAX_SEV_IR_1 + MAX_SEV_IR_2 ~ ., 63 | data=train.df, hidden=2) 64 | 65 | # predict the three outcome variables and assign class using maximum score 66 | pred.train <- predict(nn, train.df) 67 | class.train <- apply(pred.train, 1, which.max)-1 68 | confusionMatrix(factor(class.train), factor(train.actual)) 69 | 70 | pred.holdout <- predict(nn, holdout.df) 71 | class.holdout <- apply(pred.holdout, 1, which.max)-1 72 | confusionMatrix(factor(class.holdout), factor(holdout.actual)) 73 | 74 | 75 | # 1 not able to predict 2 0.8496 76 | # 2, 3 0.8697 77 | # 4, 5 0.8647 78 | 79 | ## Deep Learning 80 | ### Example: Classification of Fashion Images 81 | 82 | # code to prepare a similar picture using R 83 | library(reticulate) 84 | use_condaenv('mlba-r') 85 | library(keras) 86 | 87 | fashion_mnist <- keras::dataset_fashion_mnist() 88 | clothes.labels <- c('Top', 'Trouser', 'Pullover', 'Dress', 'Coat', 89 | 'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Boot') 90 | x_train <- fashion_mnist$train$x 91 | y_train <- fashion_mnist$train$y 92 | 93 | rotate <- function(x) t(apply(x, 2, rev)) 94 | plot_image <- function(x, title = "", title.color = "black") { 95 | x <- rotate(x) 96 | image(x, axes=FALSE, col=grey(seq(0, 1, length = 256)), 97 | main=list(title, col=title.color)) 98 | } 99 | plot_sample <- function() { 100 | par(mfrow=c(4, 5), mar=c(0, 0.2, 1, 0.2)) 101 | for (offset in c(0, 5)) { 102 | range <- (1+offset):(5+offset) 103 | for (i in range) { 104 | examples <- which(y_train %in% (i-1)) 105 | example <- examples[1] 106 | plot_image(x_train[example, , ], clothes.labels[y_train[example] + 1]) 107 | } 108 | for (i in range) { 109 | examples <- which(y_train %in% (i-1)) 110 | example <- examples[2] 111 | plot_image(x_train[example, , ]) 112 | } 113 | } 114 | } 115 | plot_sample() 116 | 117 | pdf(file=file.path("..", "figures", "chapter_11", "fashion-mnist-sample.pdf"), width=5, height=4) 118 | plot_sample() 119 | dev.off() 120 | 121 | #### Data Preprocessing 122 | 123 | # load required packages 124 | # Keras and TensorFlow require a Python conda environment with these packages installed 125 | library(reticulate) 126 | use_condaenv('mlba-r') 127 | library(keras) 128 | library(tensorflow) 129 | 130 | # load the data and split into training and validation sets 131 | fashion_mnist <- keras::dataset_fashion_mnist() 132 | x_train <- fashion_mnist$train$x 133 | y_train <- fashion_mnist$train$y 134 | x_valid <- fashion_mnist$test$x 135 | y_valid <- fashion_mnist$test$y 136 | 137 | # pixel values need to be scaled to range [0, 1] 138 | x_train <- x_train / 255 139 | x_valid <- x_valid / 255 140 | 141 | # input require an additional dimension to describe pixel values 142 | # dimensions are (samples, row, column, pixel) 143 | x_train <- array_reshape(x_train, c(dim(x_train), 1)) 144 | x_valid <- array_reshape(x_valid, c(dim(x_valid), 1)) 145 | 146 | # output values need 
to be converted into a matrix with one-hot-encoding of classes 147 | # dimensions are (samples, classes) 148 | y_train <- to_categorical(y_train, 10) 149 | y_valid <- to_categorical(y_valid, 10) 150 | dim(x_train) 151 | dim(y_train) 152 | 153 | 154 | # Model definition (architecture taken from 155 | # https://keras.rstudio.com/articles/examples/mnist_cnn.html 156 | input_shape = dim(x_train)[2:4] 157 | num_classes <- 10 158 | 159 | model <- keras_model_sequential() 160 | model %>% 161 | layer_conv_2d(filters=32, kernel_size=c(5,5), activation='relu', 162 | input_shape=input_shape) %>% 163 | layer_conv_2d(filters=64, kernel_size=c(3,3), activation='relu') %>% 164 | layer_max_pooling_2d(pool_size=c(2,2)) %>% 165 | layer_dropout(rate=0.25) %>% 166 | layer_flatten() %>% 167 | layer_dense(units = 128, activation = 'relu') %>% 168 | layer_dropout(rate = 0.5) %>% 169 | layer_dense(units = num_classes, activation = 'softmax') 170 | 171 | model 172 | 173 | # compile model 174 | model %>% compile( 175 | loss = loss_categorical_crossentropy, 176 | optimizer = optimizer_adadelta(), 177 | metrics = c('accuracy') 178 | ) 179 | 180 | # train and evaluate 181 | model %>% fit( 182 | x_train, y_train, 183 | batch_size = 128, 184 | epochs = 20, 185 | verbose = 1, 186 | validation_data = list(x_valid, y_valid) 187 | ) 188 | 189 | #### Training a Deep Learning Network 190 | 191 | library(gridExtra) 192 | df <- read.csv("cv-training.csv") 193 | g1 <- ggplot(df, aes(x=epoch)) + 194 | geom_line(aes(y=loss), color="steelblue") + 195 | geom_line(aes(y=val_loss), color="tomato") + 196 | geom_hline(yintercept=min(df$val_loss), color="black", linetype="dotted") + 197 | geom_vline(xintercept=which.min(df$val_loss), color="black", linetype="dotted") + 198 | labs(x="Epoch", y="Loss") + 199 | theme_bw() 200 | g2 <- ggplot(df, aes(x=epoch)) + 201 | geom_line(aes(y=accuracy), color="steelblue") + 202 | geom_line(aes(y=val_accuracy), color="tomato") + 203 | geom_hline(yintercept=max(df$val_accuracy), color="black", linetype="dotted") + 204 | labs(x="Epoch", y="Accuracy") + 205 | theme_bw() 206 | grid.arrange(g1, g2, ncol=2) 207 | 208 | 209 | g <- arrangeGrob(g1, g2, ncol=2) 210 | ggsave(file=file.path("..", "figures", "chapter_11", "fashion-mnist-learning.pdf"), 211 | g, width=5, height=2.5, units="in") 212 | 213 | #### Applying the Predictive Model 214 | 215 | model <- load_model_tf("cnn-model.tf") 216 | 217 | 218 | propensities <- predict(model, x_valid) 219 | propensities[1:5, ] 220 | 221 | # convert to class using winner takes all 222 | predClass <- apply(propensities, 1, which.max) 223 | predClass[1:5] 224 | 225 | # confusion matrix 226 | caret::confusionMatrix(factor(predClass), factor(fashion_mnist$test$y + 1)) 227 | 228 | 229 | -------------------------------------------------------------------------------- /R/chap12.R: -------------------------------------------------------------------------------- 1 | 2 | if (!require(mlba)) { 3 | library(devtools) 4 | install_github("gedeck/mlba/mlba", force=TRUE) 5 | } 6 | 7 | # Discriminant Analysis 8 | ## Introduction 9 | ### Example 1: Riding Mowers 10 | 11 | library(ggplot2) 12 | mowers.df <- mlba::RidingMowers 13 | g <- ggplot(mowers.df, mapping=aes(x=Income, y=Lot_Size, color=Ownership, fill=Ownership)) + 14 | geom_point(size=4) + 15 | geom_abline(intercept=40, slope=-0.34) + 16 | scale_shape_manual(values = c(15, 21)) + 17 | scale_color_manual(values = c('darkorange', 'steelblue')) + 18 | scale_fill_manual(values = c('darkorange', 'lightblue')) 19 | 20 | g 21 | 22 | 
ggsave(file=file.path("..", "figures", "chapter_12", "riding-mower.pdf"), 23 | g + theme_bw(), width=6, height=4, units="in") 24 | 25 | ### Example 2: Personal Loan Acceptance 26 | 27 | library(gridExtra) 28 | makePlot <- function(df, title, alpha) { 29 | no_personal_loan <- subset(df, Personal.Loan == 0) 30 | personal_loan <- subset(df, Personal.Loan == 1) 31 | 32 | g <- ggplot(universal.df, aes(x=Income, y=CCAvg, color=Personal.Loan)) + 33 | geom_point(aes(color="nonacceptor"), data=no_personal_loan, alpha=alpha) + 34 | geom_point(aes(color="acceptor"), data=personal_loan) + 35 | labs(title=title, colour="Personal Loan", x='Annual income ($000s)', 36 | y='Monthly average credit card spending ($000s)') + 37 | scale_color_manual(values=c("lightblue", "steelblue"), 38 | guide=guide_legend(override.aes=list(size=3, alpha=1))) + 39 | scale_x_log10() + 40 | scale_y_log10() + 41 | theme_bw() 42 | 43 | return (g) 44 | } 45 | 46 | set.seed(1) 47 | universal.df <- mlba::UniversalBank 48 | idx <- sample(dim(universal.df)[1], 200) 49 | g1 <- makePlot(universal.df[idx, ], 'Sample of 200 customers', 1.0) + 50 | theme(legend.position = c(0.2, 0.85)) 51 | g2 <- makePlot(universal.df, 'All 5000 customers', 0.5) + 52 | guides(color="none") 53 | grid.arrange(g1, g2, ncol=2) 54 | 55 | g <- arrangeGrob(g1, g2, ncol=2) 56 | ggsave(file=file.path("..", "figures", "chapter_12", "personalLoan_sampled.pdf"), 57 | g, width=8, height=4, units="in") 58 | 59 | ## Fisher's Linear Classification Functions 60 | 61 | library(caret) 62 | mowers.df <- mlba::RidingMowers 63 | trControl <- caret::trainControl(method='none') 64 | model <- train(Ownership ~ Income + Lot_Size, data=mowers.df, 65 | method='lda', trControl=trControl) 66 | model$finalModel # access the wrapped LDA model 67 | 68 | # DiscriMiner exposes the Fisher's linear classification function 69 | library(DiscriMiner) 70 | mowers.df <- mlba::RidingMowers 71 | da.mower <- linDA(mowers.df[,1:2], mowers.df[,3]) 72 | da.mower$functions 73 | 74 | 75 | da.mower <- linDA(mowers.df[,1:2], mowers.df[,3]) 76 | # compute propensities manually (below); or, use lda() in package MASS or caret with predict() 77 | propensity.owner <- exp(da.mower$scores[,2])/(exp(da.mower$scores[,1])+exp(da.mower$scores[,2])) 78 | data.frame(Actual=mowers.df$Ownership, Predicted=da.mower$classification, 79 | da.mower$scores, propensity.owner=propensity.owner) 80 | 81 | 82 | library(ggplot2) 83 | da.mower <- train(Ownership ~ Income + Lot_Size, data=mowers.df, 84 | method='lda', trControl=trControl) 85 | means <- colSums(da.mower$finalModel$means) / 2 86 | sIncome <- da.mower$finalModel$scaling['Income', 'LD1'] 87 | sLotSize <- da.mower$finalModel$scaling['Lot_Size', 'LD1'] 88 | m <- - sIncome / sLotSize 89 | y0 <- means['Lot_Size'] - m * means['Income'] 90 | 91 | mowers.df <- mlba::RidingMowers 92 | g <- ggplot(mowers.df, mapping=aes(x=Income, y=Lot_Size, color=Ownership, fill=Ownership)) + 93 | geom_point(size=4) + 94 | geom_point(data=data.frame(da.mower$finalModel$means), color='black', fill='black', shape=4, size=3) + 95 | geom_abline(aes(linetype='ad hoc line', intercept=40, slope=-0.34), color='darkgrey') + 96 | geom_abline(aes(linetype='LDA line', intercept=y0, slope=m)) + 97 | scale_shape_manual(values = c(15, 21)) + 98 | scale_color_manual(values = c('darkorange', 'steelblue')) + 99 | scale_fill_manual(values = c('darkorange', 'lightblue')) + 100 | scale_linetype_manual(name='Linetype', values=c(2, 1), labels=c('ad hoc line', 'LDA line')) + 101 | guides(fill = guide_legend(order = 1), 
color = guide_legend(order = 1), 102 | linetype = guide_legend(order = 2)) 103 | g 104 | ggsave(file=file.path("..", "figures", "chapter_12", "LDA-riding-mower.pdf"), 105 | g + theme_bw(), width=6, height=4, units="in") 106 | 107 | ## Prior Probabilities 108 | 109 | trControl <- caret::trainControl(method='none') 110 | model <- train(Ownership ~ Income + Lot_Size, data=mowers.df, 111 | method='lda', trControl=trControl) 112 | model.prior <- train(Ownership ~ Income + Lot_Size, data=mowers.df, 113 | method='lda', prior=c(0.85, 0.15), 114 | trControl=trControl) 115 | 116 | family.13 <- mowers.df[13,] 117 | predict(model, family.13) 118 | predict(model.prior, family.13) 119 | 120 | ## Classifying More Than Two Classes 121 | ### Example 3: Medical Dispatch to Accident Scenes 122 | 123 | library(DiscriMiner) 124 | library(caret) 125 | 126 | accidents.df <- mlba::Accidents 127 | lda.model <- linDA(accidents.df[,1:10], accidents.df[,11]) 128 | lda.model$functions 129 | confusionMatrix(as.factor(lda.model$classification), as.factor(accidents.df$MAX_SEV)) 130 | 131 | 132 | propensity <- exp(lda.model$scores[,1:3])/ 133 | (exp(lda.model$scores[,1])+exp(lda.model$scores[,2])+exp(lda.model$scores[,3])) 134 | 135 | res <- data.frame(Actual = accidents.df$MAX_SEV, 136 | Classification = lda.model$classification, 137 | Score = round(lda.model$scores,2), 138 | Propensity = round(propensity,2)) 139 | head(res) 140 | 141 | 142 | 143 | 144 | library(tidyverse) 145 | accidents.df <- mlba::Accidents %>% 146 | mutate(MAX_SEV = factor(MAX_SEV)) 147 | da.model <- train(MAX_SEV ~ ., data=accidents.df, method='lda') 148 | res <- data.frame(Actual=accidents.df$MAX_SEV, 149 | Classification=predict(da.model), 150 | Propensity=predict(da.model, type='prob') %>% round(2)) 151 | head(res) 152 | 153 | 154 | -------------------------------------------------------------------------------- /R/chap13.R: -------------------------------------------------------------------------------- 1 | 2 | if (!require(mlba)) { 3 | library(devtools) 4 | install_github("gedeck/mlba/mlba", force=TRUE) 5 | } 6 | 7 | # Generating, Comparing, and Combining Multiple Models 8 | ## Ensembles 9 | ### Bagging and Boosting in R 10 | #### Combining Propensities 11 | 12 | library(tidyverse) 13 | library(adabag) 14 | library(rpart) 15 | library(caret) 16 | 17 | set.seed(1) 18 | # load and preprocess the data 19 | bank.df <- mlba::UniversalBank %>% 20 | select(-c(ID, ZIP.Code)) %>% 21 | mutate( 22 | Personal.Loan = factor(Personal.Loan, levels=c(0, 1), labels=c("No", "Yes")) 23 | ) 24 | 25 | # partition the data 26 | train.index <- sample(c(1:dim(bank.df)[1]), dim(bank.df)[1]*0.6) 27 | train.df <- bank.df[train.index, ] 28 | holdout.df <- bank.df[-train.index, ] 29 | 30 | # single tree (rpart) 31 | tr <- rpart(Personal.Loan ~ ., data=train.df) 32 | 33 | # bagging and boosting using adabag 34 | bag <- bagging(Personal.Loan ~ ., data=train.df) 35 | boost <- boosting(Personal.Loan ~ ., data=train.df) 36 | 37 | # bagging and boosting using randomForest and xgboost with parameter tuning 38 | bag.rf <- train(Personal.Loan ~ ., data=train.df, method="rf") 39 | boost.xgb <- train(Personal.Loan ~ ., data=train.df, method="xgbTree", verbosity=0) 40 | 41 | 42 | library(ROCR) 43 | rocCurveData <- function(prob, data) { 44 | predob <- prediction(prob, data$Personal.Loan) 45 | perf <- performance(predob, "tpr", "fpr") 46 | return (data.frame(tpr=perf@x.values[[1]], fpr=perf@y.values[[1]])) 47 | } 48 | performance.df <- rbind( 49 | cbind(rocCurveData(predict(tr, 
holdout.df, type="prob")[,"Yes"], holdout.df), model="Single tree"), 50 | cbind(rocCurveData(predict(bag, holdout.df)$prob[, 2], holdout.df), model="Bagging"), 51 | cbind(rocCurveData(predict(boost, holdout.df)$prob[, 2], holdout.df), model="Boosting") 52 | ) 53 | colors <- c("Single tree"="grey", "Bagging"="blue", "Boosting"="tomato") 54 | g1 <- ggplot(performance.df, aes(x=tpr, y=fpr, color=model)) + 55 | geom_line() + 56 | scale_color_manual(values=colors) + 57 | geom_segment(aes(x=0, y=0, xend=1, yend=1), color="grey", linetype="dashed") + 58 | labs(x="1 - Specificity", y="Sensitivity", color="Model") 59 | g1 60 | 61 | performance.df <- rbind( 62 | cbind(rocCurveData(predict(tr, holdout.df, type="prob")[,"Yes"], holdout.df), 63 | model="Single tree"), 64 | cbind(rocCurveData(predict(bag.rf, holdout.df, type="prob")[,"Yes"], holdout.df), 65 | model="Random forest"), 66 | cbind(rocCurveData(predict(boost.xgb, holdout.df, type="prob")[,"Yes"], holdout.df), 67 | model="xgboost") 68 | ) 69 | colors <- c("Single tree"="grey", "Random forest"="blue", "xgboost"="tomato") 70 | g2 <- ggplot(performance.df, aes(x=tpr, y=fpr, color=model)) + 71 | geom_line() + 72 | scale_color_manual(values=colors) + 73 | geom_segment(aes(x=0, y=0, xend=1, yend=1), color="grey", linetype="dashed") + 74 | labs(x="1 - Specificity", y="Sensitivity", color="Model") 75 | g2 76 | library(gridExtra) 77 | g <- arrangeGrob(g1 + theme_bw(), g2 + theme_bw(), ncol=2, widths=c(0.49, 0.51)) 78 | ggsave(file=file.path("..", "figures", "chapter_13", "bagging-boosting.pdf"), 79 | g, width=8, height=3, units="in") 80 | 81 | ## Automated Machine Learning (AutoML) 82 | ### AutoML: Explore and Clean Data 83 | 84 | library(tidyverse) 85 | 86 | # load and preprocess the data 87 | bank.df <- mlba::UniversalBank %>% 88 | # Drop ID and zip code columns. 
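  # (ID is only a record identifier and ZIP.Code is a high-cardinality label;
  # neither is expected to carry predictive information for the loan outcome)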
89 | select(-c(ID, ZIP.Code)) %>% 90 | # convert Personal.Loan to a factor with labels Yes and No 91 | mutate(Personal.Loan = factor(Personal.Loan, levels=c(0, 1), labels=c("No", "Yes"))) 92 | 93 | # partition the data 94 | set.seed(1) 95 | idx <- caret::createDataPartition(bank.df$Personal.Loan, p=0.6, list=FALSE) 96 | train.df <- bank.df[idx, ] 97 | holdout.df <- bank.df[-idx, ] 98 | 99 | 100 | library(h2o) 101 | 102 | # Start the H2O cluster (locally) 103 | h2o.init() 104 | 105 | train.h2o <- as.h2o(train.df) 106 | holdout.h2o <- as.h2o(holdout.df) 107 | 108 | 109 | 110 | ### AutoML: Choose Features and Machine Learning Methods 111 | 112 | # identify outcome and predictors 113 | y <- "Personal.Loan" 114 | x <- setdiff(names(train.df), y) 115 | 116 | # run AutoML for 20 base models 117 | aml <- h2o.automl(x=x, y=y, training_frame=train.h2o, 118 | max_models=20, exclude_algos=c("DeepLearning"), 119 | seed=1) 120 | aml.balanced <- h2o.automl(x=x, y=y, training_frame=train.h2o, 121 | max_models=20, exclude_algos=c("DeepLearning"), 122 | balance_classes=TRUE, 123 | seed=1) 124 | 125 | aml 126 | 127 | 128 | aml.balanced 129 | 130 | ### AutoML: Evaluate Model Performance 131 | 132 | h2o.confusionMatrix(aml@leader, holdout.h2o) 133 | h2o.confusionMatrix(aml.balanced@leader, holdout.h2o) 134 | 135 | ## Explaining Model Predictions 136 | ### Explaining Model Predictions: LIME 137 | 138 | cases <- c('3055', '3358', # predicted Yes 139 | '2', '1178') # predicted No 140 | explainer <- lime::lime(train.df, aml@leader, bin_continuous=TRUE, quantile_bins=FALSE) 141 | explanations <- lime::explain(holdout.df[cases,], explainer, n_labels=1, n_features=8) 142 | 143 | lime::plot_features(explanations, ncol=2) 144 | 145 | 146 | pdf(file=file.path("..", "figures", "chapter_13", "lime-analysis.pdf"), 147 | width=7, height=6) 148 | lime::plot_features(explanations, ncol=2) 149 | dev.off() 150 | -------------------------------------------------------------------------------- /R/chap14.R: -------------------------------------------------------------------------------- 1 | 2 | if (!require(mlba)) { 3 | library(devtools) 4 | install_github("gedeck/mlba/mlba", force=TRUE) 5 | } 6 | 7 | # Interventions: Experiments, Uplift Models, and Reinforcement Learning 8 | ## A/B Testing 9 | ### The statistical test for comparing two groups (T-test) 10 | 11 | pt(q=2.828, df=1998, lower.tail=FALSE) 12 | pt(q=2.626, df=1998, lower.tail=FALSE) 13 | 14 | ## Uplift (Persuasion) Modeling 15 | ### Computing Uplift with R 16 | 17 | library(tidyverse) 18 | # load and preprocess the data 19 | predictors <- c("AGE", "NH_WHITE", "COMM_PT", "H_F1", "REG_DAYS", 20 | "PR_PELIG", "E_PELIG", "POLITICALC", "MESSAGE_A") 21 | outcome <- "MOVED_AD" 22 | voter.df <- mlba::VoterPersuasion %>% 23 | select(all_of(c(predictors, outcome))) 24 | 25 | 26 | set.seed(1) 27 | nrows <- dim(voter.df)[1] 28 | train.index <- sample(1:nrows, nrows * 0.6) 29 | train.df <- voter.df[train.index, ] 30 | holdout.df <- voter.df[-train.index, ] 31 | 32 | # build a random forest model using caret 33 | train_control <- caret::trainControl(method="none") 34 | model <- caret::train(MOVED_AD ~ ., data=train.df, 35 | trControl=train_control, 36 | method="rf") 37 | 38 | # calculating the uplift 39 | uplift_df <- data.frame(holdout.df) 40 | uplift_df$MESSAGE_A <- 1 41 | predTreatment <- predict(model, newdata=uplift_df, type="prob") 42 | uplift_df$MESSAGE_A <- 0 43 | predControl <- predict(model, newdata=uplift_df, type="prob") 44 | upliftResult <- data.frame( 45 | probMessage = 
predTreatment[, 1], 46 | probNoMessage = predControl[, 1] 47 | ) 48 | upliftResult$uplift <- upliftResult$probMessage - upliftResult$probNoMessage 49 | head(upliftResult) 50 | 51 | 52 | 53 | ## Reinforcement Learning 54 | ### Example of using a Contextual Multi-Arm Bandit for Movie Recommendations 55 | 56 | library(tidyverse) 57 | library(mlba) 58 | library(contextual) 59 | library(data.table) 60 | library(splitstackshape) 61 | 62 | # preprocess movies data to create indicator variables for the different genres 63 | movies_dat <- as.data.table(mlba::MovieLensMovies) 64 | movies_dat <- splitstackshape::cSplit_e(movies_dat, "genres", sep="|", type="character", 65 | fill=0, drop=TRUE) 66 | movies_dat[[3]] <- NULL # deletes the third column 67 | 68 | ratings_dat <- as.data.table(mlba::MovieLensRatings) 69 | all_movies <- ratings_dat[movies_dat, on=c(movieId="movieId")] 70 | all_movies <- na.omit(all_movies, cols=c("movieId", "userId")) 71 | # renumber userId to sequential numbers starting at 1 72 | all_movies[, userId := as.numeric(as.factor(userId))] 73 | 74 | # find the top-50 most frequently rated movies 75 | top_50 <- all_movies %>% 76 | count(movieId) %>% 77 | slice_max(n, n=50) %>% 78 | pull(movieId) 79 | top_50_movies <- all_movies[movieId %in% top_50] 80 | # renumber movieId to sequential numbers starting at 1 81 | top_50_movies[, movieId := as.numeric(as.factor(movieId))] 82 | 83 | # create profile of genres for each movie in the top-50 (arm_features) 84 | arm_features <- top_50_movies %>% 85 | select(-c(userId, rating, timestamp, title)) %>% 86 | # select one row for each movieId 87 | group_by(movieId) %>% slice(1) %>% ungroup() 88 | 89 | # for each user, create their profile of genre preferences based on 90 | # their viewed movies that are not in the top-50 (user_features) 91 | user_features <- all_movies %>% 92 | filter(! movieId %in% top_50) %>% # restrict to movies not in the top-50 93 | select(-c(movieId, rating, timestamp, title)) %>% 94 | # for each user, sum 95 | group_by(userId) %>% 96 | summarise_all(sum) %>% 97 | # normalize user profile 98 | group_by(userId) %>% 99 | mutate( 100 | total = sum(c_across(genres_Action:genres_Western)), 101 | across(genres_Action:genres_Western, ~ ./total) 102 | ) %>% 103 | select(-c(total)) %>% 104 | as.data.table() 105 | 106 | 107 | # add users who only rated top-50 movies 108 | # their genre preference profile is set to 0 for all genres 109 | all_users <- as.data.table(unique(all_movies$userId)) 110 | user_features <- user_features[all_users, on=c(userId="V1")] 111 | user_features[is.na(user_features)] <- 0 112 | setorder(user_features, userId) 113 | 114 | 115 | 116 | 117 | # prepare the data for use with the contextual package 118 | top_50_movies[, t := .I] 119 | top_50_movies[, sim := 1] 120 | top_50_movies[, agent := "Offline"] 121 | top_50_movies[, choice := movieId] 122 | top_50_movies[, reward := ifelse(rating <= 4, 0, 1)] 123 | setorder(top_50_movies,timestamp, title) 124 | 125 | # the bandit samples users with their genre preferences (user_features), 126 | # movie choices (choice), and ratings. 
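# (roughly speaking, this is an offline "replay" evaluation: at each step the
# policy picks one of the 50 movies, and a reward is observed only when the
# sampled user actually rated that movie in the logged data)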
127 | # each movie is characterized by the genre profile (arm_features) 128 | # these data are used to train the agent 129 | environment <- OfflineLookupReplayEvaluatorBandit$new( 130 | top_50_movies, 131 | k = 50, 132 | unique_col = "userId", 133 | unique_lookup = user_features, 134 | shared_lookup = arm_features) 135 | 136 | # define list of strategies to evaluate 137 | agents <-list( 138 | Agent$new(RandomPolicy$new(), environment, "Random"), 139 | Agent$new(LinUCBDisjointOptimizedPolicy$new(2.1), environment, "LinUCB Dis")) 140 | 141 | # setup and run simulation 142 | simulation <- Simulator$new( 143 | agents = agents, 144 | simulations = 20, 145 | horizon = 10000L, 146 | save_interval = 1) 147 | results <- simulation$run() 148 | 149 | plot(results, type="cumulative", regret=FALSE, rate=TRUE, 150 | legend_position="topleft", disp="sd") 151 | 152 | 153 | pdf(file=file.path("..", "figures", "chapter_14", "mab-movielens.pdf"), width=6, height=4) 154 | plot(results, type="cumulative", regret=FALSE, rate=TRUE, 155 | legend_position="topleft", disp="sd") 156 | dev.off() 157 | -------------------------------------------------------------------------------- /R/chap15.R: -------------------------------------------------------------------------------- 1 | 2 | if (!require(mlba)) { 3 | library(devtools) 4 | install_github("gedeck/mlba/mlba", force=TRUE) 5 | } 6 | 7 | # Association Rules and Collaborative Filtering 8 | ## Association Rules 9 | ### The Process of Rule Selection 10 | #### Lift 11 | 12 | library(arules) 13 | fp.df <- mlba::Faceplate 14 | 15 | # remove first column and convert to matrix 16 | fp.mat <- as.matrix(fp.df[, -1]) 17 | 18 | # convert the binary incidence matrix into a transactions database 19 | fp.trans <- as(fp.mat, "transactions") 20 | inspect(fp.trans) 21 | 22 | ## get rules 23 | # when running apriori(), include the minimum support, minimum confidence, and target 24 | # as arguments. 
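# for a rule A => B: support = P(A and B), confidence = P(A and B) / P(A),
# and lift = confidence / P(B); supp=0.2 and conf=0.5 below are the minimum
# thresholds a rule must meet to be reported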
25 | rules <- apriori(fp.trans, parameter = list(supp = 0.2, conf = 0.5, target = "rules")) 26 | 27 | # inspect the first six rules, sorted by their lift 28 | inspect(head(sort(rules, by = "lift"), n = 6)) 29 | 30 | ### Example 2: Rules for Similar Book Purchases 31 | 32 | all.books.df <- mlba::CharlesBookClub 33 | 34 | # create a binary incidence matrix 35 | count.books.df <- all.books.df[, 8:18] 36 | incid.books.mat <- as.matrix(count.books.df > 0) 37 | 38 | # convert the binary incidence matrix into a transactions database 39 | books.trans <- as(incid.books.mat, "transactions") 40 | inspect(books.trans[1:10]) 41 | 42 | # plot data 43 | itemFrequencyPlot(books.trans) 44 | 45 | # run apriori function 46 | rules <- apriori(books.trans, 47 | parameter = list(supp= 200/4000, conf = 0.5, target = "rules")) 48 | 49 | # inspect top-30 rules sorted by lift 50 | inspect(head(sort(rules, by = "lift"), n=30)) 51 | 52 | ## Collaborative Filtering 53 | ### Example 4: Predicting Movie Ratings with MovieLens Data 54 | 55 | library(recommenderlab) 56 | 57 | # download MovieLens data 58 | ratings <- mlba::MovieLensRatings 59 | movies <- mlba::MovieLensMovies 60 | 61 | # convert ratings to rating matrix 62 | idxUserId <- sort(unique(ratings$userId)) 63 | idxMovieId <- sort(unique(ratings$movieId)) 64 | m <- matrix(NA, nrow=length(idxUserId), ncol=length(idxMovieId), 65 | dimnames=list( 66 | user=paste("u", 1:length(idxUserId), sep=''), 67 | item=movies$title[match(idxMovieId, movies$movieId)] 68 | )) 69 | for (i in 1:nrow(ratings)) { 70 | rating <- ratings[i,] 71 | irow <- match(rating$userId, idxUserId) 72 | icol <- match(rating$movieId, idxMovieId) 73 | m[irow, icol] <- rating$rating 74 | } 75 | ratingMatrix <- as(m, "realRatingMatrix") 76 | 77 | 78 | 79 | 80 | # UBCF model and prediction 81 | recommender <- Recommender(ratingMatrix[-1], method="UBCF") 82 | pred <- predict(recommender, ratingMatrix[1]) 83 | as(pred, 'list') 84 | 85 | # IBCF model and prediction 86 | recommender <- Recommender(ratingMatrix[-1], method="IBCF") 87 | pred <- predict(recommender, ratingMatrix[1]) 88 | as(pred, 'list') 89 | 90 | 91 | set.seed(1) 92 | e <- evaluationScheme(ratingMatrix, method="split", train=0.9, given=10) 93 | 94 | r1 <- Recommender(getData(e, "train"), "UBCF") 95 | r2 <- Recommender(getData(e, "train"), "IBCF") 96 | r3 <- Recommender(getData(e, "train"), "RANDOM") 97 | 98 | p1 <- predict(r1, getData(e, "known"), type="ratings") 99 | p2 <- predict(r2, getData(e, "known"), type="ratings") 100 | p3 <- predict(r3, getData(e, "known"), type="ratings") 101 | error <- rbind( 102 | UBCF = calcPredictionAccuracy(p1, getData(e, "unknown")), 103 | IBCF = calcPredictionAccuracy(p2, getData(e, "unknown")), 104 | RANDOM = calcPredictionAccuracy(p3, getData(e, "unknown")) 105 | ) 106 | error 107 | -------------------------------------------------------------------------------- /R/chap16.R: -------------------------------------------------------------------------------- 1 | 2 | if (!require(mlba)) { 3 | library(devtools) 4 | install_github("gedeck/mlba/mlba", force=TRUE) 5 | } 6 | 7 | # Cluster Analysis 8 | ## Measuring Distance Between Two Records 9 | ### Euclidean Distance 10 | 11 | library(tidyverse) 12 | # load data and use Company column as row names 13 | utilities.df <- mlba::Utilities %>% 14 | column_to_rownames("Company") 15 | 16 | # compute Euclidean distance 17 | # (to compute other distance measures, change the value in the method argument) 18 | d <- dist(utilities.df, method = "euclidean") 19 | 20 | ### 
Normalizing Numerical Variables 21 | 22 | # normalize input variables 23 | utilities.df.norm <- scale(utilities.df) 24 | 25 | # compute normalized distance based on Sales and Fuel Cost 26 | d.norm <- dist(utilities.df.norm[,c("Sales","Fuel_Cost")], method="euclidean") 27 | 28 | ## Hierarchical (Agglomerative) Clustering 29 | ### Dendrograms: Displaying Clustering Process and Results 30 | 31 | library(ggplot2) 32 | library(ggdendro) 33 | d.norm <- dist(utilities.df.norm, method="euclidean") 34 | 35 | # in hclust() set argument method \galit{to} 36 | # "ward.D", "single", "complete", "average", "median", or "centroid" 37 | hc1 <- hclust(d.norm, method="single") 38 | plot(hc1, hang=-1, ann=FALSE) # use baseR 39 | ggdendrogram(hc1) # use ggdendro package (shown in figure below) 40 | hc2 <- hclust(d.norm, method="average") 41 | plot(hc2, hang=-1, ann=FALSE) 42 | ggdendrogram(hc2) 43 | 44 | 45 | library(gridExtra) 46 | addCutline <- function(g, hc, ncluster) { 47 | heights <- rev(hc$height) 48 | cut_at <- 0.5 * (heights[ncluster] + heights[ncluster - 1]) 49 | return (g + geom_hline(yintercept=cut_at, color='red', linetype=2)) 50 | } 51 | g1 <- ggdendrogram(hc1) 52 | g2 <- ggdendrogram(hc2) 53 | grid.arrange(addCutline(g1, hc1, 6), addCutline(g2, hc2, 6), nrow=2) 54 | g <- arrangeGrob(addCutline(g1, hc1, 6), addCutline(g2, hc2, 6), nrow=2) 55 | ggsave(file=file.path("..", "figures", "chapter_16", "utilities-dendrograms.pdf"), 56 | g, width=5, height=8, units="in") 57 | 58 | 59 | memb <- cutree(hc1, k = 6) 60 | memb 61 | memb <- cutree(hc2, k = 6) 62 | memb 63 | 64 | ### Validating Clusters 65 | 66 | # set labels as cluster membership and utility name 67 | row.names(utilities.df.norm) <- paste(memb, ": ", row.names(utilities.df), sep = "") 68 | 69 | # plot heatmap 70 | heatmap(utilities.df.norm, Colv=NA, hclustfun=hclust) 71 | 72 | 73 | # grey scale 74 | # rev() reverses the color mapping to large = dark 75 | heatmap(as.matrix(utilities.df.norm), Colv = NA, hclustfun = hclust, 76 | col=rev(paste("gray",1:99,sep=""))) 77 | 78 | pdf(file=file.path("..", "figures", "chapter_16", "utilities-heatmap.pdf"), 79 | width=5, height=5) 80 | heatmap(utilities.df.norm, Colv=NA, hclustfun=hclust) 81 | dev.off() 82 | 83 | ## Non-hierarchical Clustering: The 84 | ### Choosing the Number of Clusters ($k$) 85 | 86 | set.seed(123) # set random seed for reproducability 87 | # load and preprocess data 88 | utilities.df <- mlba::Utilities %>% 89 | column_to_rownames("Company") 90 | 91 | # normalized distance: 92 | utilities.df.norm <- scale(utilities.df) 93 | 94 | # run kmeans algorithm 95 | km <- kmeans(utilities.df.norm, 6) 96 | 97 | # show cluster membership 98 | sort(km$cluster) 99 | 100 | 101 | # centroids 102 | km$centers 103 | # within-cluster sum of squares 104 | km$withinss 105 | # cluster size 106 | km$size 107 | 108 | 109 | library(GGally) 110 | centroids <- data.frame(km$centers) 111 | centroids['Cluster'] = paste('Cluster', seq(1, 6)) 112 | 113 | ggparcoord(centroids, columns=1:8, groupColumn='Cluster', showPoints=TRUE) + 114 | scale_color_viridis_d() + 115 | labs(x='Variable', y='Value') 116 | 117 | 118 | ggsave(file=file.path("..", "figures", "chapter_16", "utilities-clusterProfile.pdf"), 119 | last_plot() + theme_bw(), width=8.5, height=3.2, units="in") 120 | 121 | 122 | result <- tibble() 123 | for (k in 1:6) { 124 | km <- kmeans(utilities.df.norm, k) 125 | result <- bind_rows(result, tibble(k=k, average_withinss=mean(km$withinss))) 126 | } 127 | 128 | ggplot(result, aes(x=k, y=average_withinss)) + 129 
| geom_line() + 130 | geom_point() + 131 | labs(y="Average within-cluster squared distance", 132 | x=expression(paste("Number of clusters ", italic("k")))) + 133 | theme_bw() 134 | ggsave(file=file.path("..", "figures", "chapter_16", "utilities-ellbow.pdf"), 135 | last_plot(), width=4, height=4, units="in") 136 | 137 | 138 | dist(km$centers) 139 | -------------------------------------------------------------------------------- /R/chap17.R: -------------------------------------------------------------------------------- 1 | 2 | if (!require(mlba)) { 3 | library(devtools) 4 | install_github("gedeck/mlba/mlba", force=TRUE) 5 | } 6 | 7 | # Handling Time Series 8 | ## Time Series Components 9 | ### Example: Ridership on Amtrak Trains 10 | 11 | library(forecast) 12 | library(ggplot2) 13 | Amtrak.data <- mlba::Amtrak 14 | 15 | # create time series object using ts() 16 | # ts() takes three arguments: start, end, and freq. 17 | # with monthly data, the frequency of periods per cycle is 12 (per year). 18 | # arguments start and end are (cycle [=year] number, seasonal period [=month] number) pairs. 19 | # here start is Jan 1991: start = c(1991, 1); end is Mar 2004: end = c(2004, 3). 20 | ridership.ts <- ts(Amtrak.data$Ridership, 21 | start = c(1991, 1), end = c(2004, 3), freq = 12) 22 | 23 | # plot the series using the autoplot function to make use of ggplot 24 | autoplot(ridership.ts, xlab="Time", ylab="Ridership (in 000s)") + 25 | scale_y_continuous(limits=c(1300, 2300)) 26 | 27 | 28 | g <- last_plot() + 29 | scale_x_continuous(n.breaks=10) + 30 | theme_bw() 31 | ggsave(file=file.path("..", "figures", "chapter_17", "AmtrakFirstPlot.pdf"), 32 | g, width=6, height=3, units="in") 33 | 34 | 35 | library(gridExtra) 36 | library(lubridate) 37 | library(zoo) 38 | 39 | BareggTunnel <- mlba::BareggTunnel 40 | # convert Day information to a dates object 41 | dates <- as.POSIXct(BareggTunnel$Day, format='%d %b %Y') 42 | tunnel.ts <- ts(BareggTunnel$Number.of.vehicles, 43 | start=c(2003, yday(dates[1])), 44 | frequency=365) 45 | 46 | options(scipen=999) 47 | g1 <- autoplot(tunnel.ts, xlab="Time", ylab="Number of vehicles") + 48 | scale_x_yearmon() + 49 | scale_y_continuous(labels = scales::comma) 50 | g2 <- autoplot(window(tunnel.ts, 51 | start=c(2004, yday(ISOdate(2004, 2, 1))), 52 | end=c(2004, yday(ISOdate(2004, 6, 1)))), 53 | xlab="Time", ylab="Number of vehicles") + 54 | scale_x_yearmon() + 55 | scale_y_continuous(labels = scales::comma) 56 | 57 | grid.arrange(g1, g2, nrow=2) 58 | g <- arrangeGrob(g1 + theme_bw(), g2 + theme_bw()) 59 | ggsave(file=file.path("..", "figures", "chapter_17", "TS-TunnelPlots.pdf"), g, 60 | width=6, height=4) 61 | 62 | 63 | library(gridExtra) 64 | 65 | # to zoom in to a certain period, use window() to create a new, shorter time series 66 | # we create a new, 3-year time series of ridership.ts from Jan 1997 to Dec 1999 67 | ridership.ts.3yrs <- window(ridership.ts, start = c(1997, 1), end = c(1999, 12)) 68 | g1 <- autoplot(ridership.ts.3yrs, xlab="Time", ylab="Ridership (in 000s)") + 69 | scale_y_continuous(limits=c(1300, 2300)) 70 | 71 | # fit a trend line to the time series 72 | g2 <- autoplot(ridership.ts, xlab="Time", ylab="Ridership (in 000s)") + 73 | scale_y_continuous(limits=c(1300, 2300)) + 74 | geom_smooth(method="lm", formula=y~poly(x, 2)) 75 | 76 | grid.arrange(g1, g2, nrow=2) 77 | 78 | 79 | g <- arrangeGrob(g1 + theme_bw(), 80 | g2 + scale_x_continuous(n.breaks=10) + theme_bw()) 81 | ggsave(file=file.path("..", "figures", "chapter_17", "AmtrakZoomPlots.pdf"), g, 82 | 
width=6, height=6) 83 | 84 | # we can also use tslm to create the quadratic fit 85 | ridership.lm <- tslm(ridership.ts ~ trend + I(trend^2)) 86 | autoplot(ridership.ts, xlab="Time", ylab="Ridership (in 000s)") + 87 | scale_y_continuous(limits=c(1300, 2300)) + 88 | autolayer(ridership.lm$fitted.values) 89 | 90 | ## Data Partitioning and Performance Evaluation 91 | ### Benchmark Performance: Naive Forecasts 92 | 93 | nTest <- 36 94 | nTrain <- length(ridership.ts) - nTest 95 | 96 | # partition the data 97 | train.ts <- window(ridership.ts, start = c(1991, 1), end = c(1991, nTrain)) 98 | test.ts <- window(ridership.ts, start = c(1991, nTrain + 1), 99 | end = c(1991, nTrain + nTest)) 100 | 101 | # generate the naive and seasonal naive forecasts 102 | naive.pred <- naive(train.ts, h=nTest) 103 | snaive.pred <- snaive(train.ts, h=nTest) 104 | 105 | # compare the actual values and forecasts for both methods 106 | colData <- "steelblue"; colModel <- "tomato" 107 | autoplot(train.ts, xlab="Time", ylab="Ridership (in 000s$)", color=colData) + 108 | autolayer(test.ts, linetype=2, color=colData) + 109 | autolayer(naive.pred, PI=FALSE, color=colModel, size=0.75) + 110 | autolayer(snaive.pred, PI=FALSE, color=colModel, size=0.75) 111 | 112 | 113 | # for the book visualization add additional annotation 114 | delta <- 1/12 115 | date_t <- time(train.ts)[1] 116 | date_th <- time(test.ts)[1] - delta 117 | date_hf <- tail(time(test.ts), 1) + delta 118 | g <- last_plot() + 119 | geom_vline(xintercept=date_th, color="darkgrey") + geom_vline(xintercept=date_hf, color="darkgrey") + 120 | geom_segment(aes(x=date_t, xend=date_th-delta, y=2300, yend=2300), color="darkgrey") + 121 | geom_segment(aes(x=date_th+delta, xend=date_hf-delta, y=2300, yend=2300), color="darkgrey") + 122 | geom_segment(aes(x=date_hf+delta, xend=date_hf+2, y=2300, yend=2300), color="darkgrey") + 123 | geom_text(aes(x=(date_t+date_th)/2, y=2350, label='Training')) + 124 | geom_text(aes(x=(date_th+date_hf)/2, y=2350, label='Test')) + 125 | geom_text(aes(x=date_hf+1, y=2350, label='Future')) + 126 | scale_x_continuous(n.breaks=10) + 127 | theme_bw() 128 | ggsave(file=file.path("..", "figures", "chapter_17", "AmtrakNaive.pdf"), 129 | g, width=7, height=4.5) 130 | 131 | 132 | accuracy(naive.pred, test.ts) 133 | accuracy(snaive.pred, test.ts) 134 | 135 | ### Generating Future Forecasts 136 | 137 | ts <- ts(mlba::CanadianWorkHours$Hours, 138 | start = c(mlba::CanadianWorkHours$Year[1], 1), freq = 1) 139 | 140 | # plot the series using the autoplot function to make use of ggplot 141 | autoplot(ts, xlab="Year", ylab="Hours per week") 142 | ggsave(file=file.path("..", "figures", "chapter_17", "Exercise-CanadianWorkers.pdf"), 143 | last_plot() + theme_bw(), width=5, height=4, units="in") 144 | -------------------------------------------------------------------------------- /R/chap2.R: -------------------------------------------------------------------------------- 1 | 2 | if (!require(mlba)) { 3 | library(devtools) 4 | install_github("gedeck/mlba/mlba", force=TRUE) 5 | } 6 | options(scipen=999) 7 | 8 | # Overview of the Machine Learning Process 9 | ## Preliminary Steps 10 | ### Loading and Looking at the Data in R 11 | 12 | housing.df <- read.csv('WestRoxbury.csv') # load data from file 13 | housing.df <- mlba::WestRoxbury # load data from mlba package 14 | dim(housing.df) # find the dimension of data frame 15 | head(housing.df) # show the first six rows 16 | View(housing.df) # show all the data in a new tab 17 | 18 | # Practice showing different 
subsets of the data 19 | housing.df[1:10, 1] # show the first 10 rows of the first column only 20 | housing.df[1:10, ] # show the first 10 rows of each of the columns 21 | housing.df[5, 1:10] # show the fifth row of the first 10 columns 22 | housing.df[5, c(1:2, 4, 8:10)] # show the fifth row of some columns 23 | housing.df[, 1] # show the whole first column 24 | housing.df$TOTAL.VALUE # a different way to show the whole first column 25 | housing.df$TOTAL.VALUE[1:10] # show the first 10 rows of the first column 26 | length(housing.df$TOTAL.VALUE) # find the length of the first column 27 | mean(housing.df$TOTAL.VALUE) # find the mean of the first column 28 | summary(housing.df) # find summary statistics for each column 29 | 30 | ### Sampling from a Database 31 | 32 | housing.df <- mlba::WestRoxbury 33 | 34 | # random sample of 5 observations 35 | s <- sample(row.names(housing.df), 5) 36 | housing.df[s,] 37 | 38 | # oversample houses with over 10 rooms 39 | s <- sample(row.names(housing.df), 5, prob=ifelse(housing.df$ROOMS>10, 0.9, 0.01)) 40 | housing.df[s,] 41 | 42 | # rebalance 43 | housing.df$REMODEL <- factor(housing.df$REMODEL) 44 | table(housing.df$REMODEL) 45 | upsampled.df <- caret::upSample(housing.df, housing.df$REMODEL, list=TRUE)$x 46 | table(upsampled.df$REMODEL) 47 | 48 | 49 | 50 | ### Preprocessing and Cleaning the Data 51 | #### Types of Variables 52 | 53 | library(tidyverse) 54 | 55 | # get overview 56 | str(housing.df) 57 | 58 | # make REMODEL a factor variable 59 | housing.df$REMODEL <- factor(housing.df$REMODEL) 60 | str(housing.df$REMODEL) 61 | levels(housing.df$REMODEL) # show factor's categories (levels) 62 | 63 | # use tidyverse to load and preprocess data in one statement 64 | # the %>% operator inserts the result of the expression on the left 65 | # as the first argument into the function on the right 66 | housing.df <- mlba::WestRoxbury %>% 67 | mutate(REMODEL=factor(REMODEL)) 68 | 69 | #### Handling Categorical Variables 70 | 71 | library(fastDummies) 72 | library(tidyverse) 73 | 74 | housing.df <- dummy_cols(mlba::WestRoxbury, 75 | remove_selected_columns=TRUE, # remove the original column 76 | remove_first_dummy=TRUE) # removes the first created dummy variable 77 | housing.df %>% head(2) 78 | 79 | #### Missing Values 80 | 81 | # To illustrate missing data procedures, we first convert a few entries for 82 | # BEDROOMS to NA's. Then we impute these missing values using the median of the 83 | # remaining values. 84 | rows.to.missing <- sample(row.names(housing.df), 10) 85 | housing.df[rows.to.missing,]$BEDROOMS <- NA 86 | summary(housing.df$BEDROOMS) 87 | # Now we have 10 NA's and the median of the remaining values is 3. 88 | 89 | # replace the missing values using the median of the remaining values 90 | # use median() with na.rm=TRUE to ignore missing values when computing the median. 91 | housing.df <- housing.df %>% 92 | replace_na(list(BEDROOMS=median(housing.df$BEDROOMS, na.rm=TRUE))) 93 | 94 | summary(housing.df$BEDROOMS) 95 | 96 | ## Predictive Power and Overfitting 97 | ### Creating and Using Data Partitions 98 | #### Holdout Partition 99 | 100 | housing.df <- mlba::WestRoxbury %>% 101 | mutate(REMODEL=factor(REMODEL)) 102 | 103 | # use set.seed() to get the same partitions when re-running the R code. 
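# (the seed value itself is arbitrary; it only fixes which rows sample() draws below)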
104 | set.seed(1) 105 | 106 | ## partitioning into training (60%) and holdout (40%) 107 | # randomly sample 60% of the row IDs for training; the remaining 40% serve 108 | # as holdout 109 | train.rows <- sample(rownames(housing.df), nrow(housing.df)*0.6) 110 | # collect all the columns with training row ID into training set: 111 | train.df <- housing.df[train.rows, ] 112 | # assign row IDs that are not already in the training set, into holdout 113 | holdout.rows <- setdiff(rownames(housing.df), train.rows) 114 | holdout.df <- housing.df[holdout.rows, ] 115 | 116 | ## partitioning into training (50%), validation (30%), holdout (20%) 117 | # randomly sample 50% of the row IDs for training 118 | train.rows <- sample(rownames(housing.df), nrow(housing.df)*0.5) 119 | 120 | # sample 30% of the row IDs into the validation set, drawing only from records 121 | # not already in the training set 122 | # use setdiff() to find records not already in the training set 123 | valid.rows <- sample(setdiff(rownames(housing.df), train.rows), 124 | nrow(housing.df)*0.3) 125 | 126 | # assign the remaining 20% row IDs serve as holdout 127 | holdout.rows <- setdiff(rownames(housing.df), union(train.rows, valid.rows)) 128 | 129 | # create the 3 data frames by collecting all columns from the appropriate rows 130 | train.df <- housing.df[train.rows, ] 131 | valid.df <- housing.df[valid.rows, ] 132 | holdout.df <- housing.df[holdout.rows, ] 133 | 134 | ## partitioning into training (60%) and holdout (40%) using caret 135 | set.seed(1) 136 | idx <- caret::createDataPartition(housing.df$TOTAL.VALUE, p=0.6, list=FALSE) 137 | train.df <- housing.df[idx, ] 138 | holdout.df <- housing.df[-idx, ] 139 | 140 | 141 | 142 | ## Building a Predictive Model 143 | ### Modeling Process 144 | #### Cross-Validation 145 | 146 | library(tidyverse) 147 | library(mlba) 148 | library(fastDummies) 149 | 150 | housing.df <- mlba::WestRoxbury %>% 151 | # remove rows with missing values 152 | drop_na() %>% 153 | # remove column TAX 154 | select(-TAX) %>% 155 | # make REMODEL a factor and convert to dummy variables 156 | mutate(REMODEL=factor(REMODEL)) %>% 157 | dummy_cols(select_columns=c('REMODEL'), 158 | remove_selected_columns=TRUE, remove_first_dummy=TRUE) 159 | 160 | 161 | set.seed(1) 162 | idx <- caret::createDataPartition(housing.df$TOTAL.VALUE, p=0.6, list=FALSE) 163 | train.df <- housing.df[idx, ] 164 | holdout.df <- housing.df[-idx, ] 165 | 166 | 167 | reg <- lm(TOTAL.VALUE ~ ., data=train.df) 168 | train.res <- data.frame(actual=train.df$TOTAL.VALUE, predicted=reg$fitted.values, 169 | residuals=reg$residuals) 170 | head(train.res) 171 | 172 | 173 | pred <- predict(reg, newdata=holdout.df) 174 | holdout.res <- data.frame(actual=holdout.df$TOTAL.VALUE, predicted=pred, 175 | residuals=holdout.df$TOTAL.VALUE - pred) 176 | head(holdout.res) 177 | 178 | 179 | library(caret) 180 | # compute metrics on training set 181 | data.frame( 182 | ME = round(mean(train.res$residuals), 5), 183 | RMSE = RMSE(pred=train.res$predicted, obs=train.res$actual), 184 | MAE = MAE(pred=train.res$predicted, obs=train.res$actual) 185 | ) 186 | 187 | # compute metrics on holdout set 188 | data.frame( 189 | ME = round(mean(holdout.res$residuals), 5), 190 | RMSE = RMSE(pred=holdout.res$predicted, obs=holdout.res$actual), 191 | MAE = MAE(pred=holdout.res$predicted, obs=holdout.res$actual) 192 | ) 193 | 194 | 195 | # For demonstration purposes, we construct the new.data from the original dataset 196 | housing.df <- mlba::WestRoxbury 197 | new.data <- 
housing.df[100:102, -1] %>% 198 | mutate(REMODEL=factor(REMODEL, levels=c("None", "Old", "Recent"))) %>% 199 | dummy_cols(select_columns=c('REMODEL'), 200 | remove_selected_columns=TRUE, remove_first_dummy=TRUE) 201 | new.data 202 | pred <- predict(reg, newdata = new.data) 203 | pred 204 | -------------------------------------------------------------------------------- /R/chap20.R: -------------------------------------------------------------------------------- 1 | 2 | if (!require(mlba)) { 3 | library(devtools) 4 | install_github("gedeck/mlba/mlba", force=TRUE) 5 | } 6 | set.seed(1) 7 | 8 | # Social Network Analytics 9 | ## Introduction 10 | 11 | library(igraph) 12 | 13 | # define links in data 14 | edges <- rbind( 15 | c("Dave", "Jenny"), c("Peter", "Jenny"), c("John", "Jenny"), 16 | c("Dave", "Peter"), c("Dave", "John"), c("Peter", "Sam"), 17 | c("Sam", "Albert"), c("Peter", "John") 18 | ) 19 | 20 | # generate and plot network 21 | # set argument directed = FALSE in graph.edgelist() to plot an undirected network. 22 | g <- graph.edgelist(edges, directed = FALSE) 23 | plot(g, vertex.size = 5, vertex.label.dist = 2) 24 | 25 | 26 | pdf(file=file.path("..", "figures", "chapter_20", "fig1.pdf"), width=5, height=5) 27 | par(mar=c(0,0,0,1)+.1) 28 | plot(g, vertex.size = 5, vertex.label.dist = 2) 29 | dev.off() 30 | 31 | ## Directed vs. Undirected Networks 32 | 33 | # generate and plot network 34 | # set argument directed = TRUE in graph.edgelist() to plot a directed network. 35 | g <- graph.edgelist(edges, directed = TRUE) 36 | plot(g, vertex.size = 5, vertex.label.dist = 2) 37 | 38 | 39 | pdf(file=file.path("..", "figures", "chapter_20", "fig2.pdf"), width=5, height=5) 40 | par(mar=c(0,0,0,1)+.5) 41 | plot(g, vertex.size = 5, vertex.label.dist=2) 42 | dev.off() 43 | 44 | 45 | edges <- rbind(c("A", "B"), c("B", "C"), c("C", "A")) 46 | g <- graph.edgelist(edges, directed = FALSE) 47 | E(g)$width <- c(20, 5, 5) 48 | plot(g, vertex.size = 5, vertex.label.dist = 2) 49 | 50 | pdf(file=file.path("..", "figures", "chapter_20", "fig3.pdf"), width=2, height=2) 51 | par(mar=c(0,0,0,0)+.2) 52 | plot(g, vertex.size = 5, vertex.label.dist = 2) 53 | dev.off() 54 | # $ 55 | 56 | ## Visualizing and Analyzing Networks 57 | ### Plot Layout 58 | 59 | library(igraph) 60 | drug.df <- mlba::Drug 61 | 62 | # convert edges to edge list matrix 63 | edges <- as.matrix(drug.df[, c(1,2)]) 64 | g <- graph.edgelist(edges,directed=FALSE) 65 | 66 | # plot network 67 | # nodes' size is proportional to their eigenvector centrality 68 | plot(g, vertex.label = NA, vertex.size = eigen_centrality(g)$vector * 20) 69 | 70 | 71 | pdf(file=file.path("..", "figures", "chapter_20", "SNA_Drug_Laundry.pdf"), width=5, height=5) 72 | par(mar=c(0,0,0,0)+.1) 73 | plot(g, vertex.label = NA, vertex.size = eigen_centrality(g)$vector * 20) 74 | dev.off() 75 | 76 | 77 | edges <- rbind( 78 | c("Dave", "Jenny"), c("Peter", "Jenny"), c("John", "Jenny"), 79 | c("Dave", "Peter"), c("Dave", "John"), c("Peter", "Sam"), 80 | c("Sam", "Albert"), c("Peter", "John") 81 | ) 82 | g <- graph.edgelist(edges) 83 | pdf(file=file.path("..", "figures", "chapter_20", "fig5_circle.pdf"), width=3, height=3) 84 | par(mar=c(0,0,0,1)+.7) 85 | plot(g, layout = layout_in_circle, vertex.size = 5, vertex.label.dist = 2) 86 | dev.off() 87 | pdf(file=file.path("..", "figures", "chapter_20", "fig5_grid.pdf"), width=3, height=3) 88 | par(mar=c(0,0,0,1)+.7) 89 | plot(g, layout = layout_on_grid, vertex.size = 5, vertex.label.dist = 2) 90 | dev.off() 91 | 92 | 93 | # Building on the 
code presented in Figure 19.1 94 | plot(g, layout = layout_in_circle, vertex.size = 5, vertex.label.dist = 2) 95 | plot(g, layout = layout_on_grid, vertex.size = 5, vertex.label.dist = 2) 96 | 97 | ## Social Data Metrics and Taxonomy 98 | ### Node-Level Centrality Metrics 99 | 100 | edges <- rbind( 101 | c("Dave", "Jenny"), c("Peter", "Jenny"), c("John", "Jenny"), 102 | c("Dave", "Peter"), c("Dave", "John"), c("Peter", "Sam"), 103 | c("Sam", "Albert"), c("Peter", "John") 104 | ) 105 | g <- graph.edgelist(edges, directed=FALSE) 106 | 107 | degree(g) 108 | betweenness(g) 109 | betweenness(g)/sum(betweenness(g)) 110 | closeness(g) 111 | eigen_centrality(g) 112 | 113 | ### Egocentric Network 114 | 115 | # get Peter's 1-level ego network 116 | # for a 2-level ego network set argument order = 2 in make_ego_graph(). 117 | peter.ego <- make_ego_graph(g, order = 1, nodes = "Peter") 118 | plot(peter.ego[[1]], vertex.size = 1, vertex.label.dist = 0.5) 119 | 120 | 121 | pdf(file=file.path("..", "figures", "chapter_20", "fig6_1.pdf"), width=5, height=5) 122 | par(mar=c(0,0,0,1)+.5) 123 | peter.ego <- make_ego_graph(g, order = 1, nodes = "Peter") 124 | g.ego <- peter.ego[[1]] 125 | V(g.ego)$color <- "orange" 126 | V(g.ego)["Peter"]$color <- "red" 127 | plot(g.ego, vertex.size = 5, vertex.label.dist = 2) 128 | dev.off() 129 | pdf(file=file.path("..", "figures", "chapter_20", "fig6_2.pdf"), width=5, height=5) 130 | par(mar=c(0,0,0,1)+.5) 131 | peter.ego <- make_ego_graph(g, order = 2, nodes = "Peter") 132 | g.ego <- peter.ego[[1]] 133 | V(g.ego)$color <- "orange" 134 | V(g.ego)["Peter"]$color <- "red" 135 | plot(g.ego, vertex.size = 5, vertex.label.dist = 2) 136 | dev.off() 137 | 138 | ### Network Metrics 139 | 140 | degree.distribution(g) # normalized 141 | edge_density(g) 142 | 143 | ## Collecting Social Network Data with R 144 | ### Collaborative Filtering 145 | 146 | library(twitteR) 147 | # replace key and secret number with those you obtained from Twitter 148 | setup_twitter_oauth(consumer_key = "XXX", consumer_secret = "XXX", 149 | access_token = "XXX", access_secret = "XXX") 150 | 151 | # get recent tweets 152 | recent.25.tweets <- searchTwitter("text mining", resultType="recent", n = 25) 153 | 154 | 155 | library(Rfacebook) 156 | # replace the app id and secret number with those you obtained from Facebook 157 | fb_oauth <- fbOAuth(app_id = "XXX", app_secret = "XXX") 158 | fb_oauth_credentials <- fromJSON(names(fb_oauth$credentials)) 159 | 160 | # get recent posts on page "dataminingbook" 161 | fb_page <- getPage(page = "dataminingbook", token = fb_oauth_credentials$access_token) 162 | 163 | # a facebook page contains the following information: 164 | t(t(names(fb_page))) 165 | fb_page[1,] 166 | 167 | # get information about most recent post 168 | post <- getPost(post=fb_page$id[1], n=20, token=fb_oauth_credentials$access_token) 169 | 170 | post$likes 171 | post$comments 172 | 173 | -------------------------------------------------------------------------------- /R/chap21.R: -------------------------------------------------------------------------------- 1 | 2 | if (!require(mlba)) { 3 | library(devtools) 4 | install_github("gedeck/mlba/mlba", force=TRUE) 5 | } 6 | 7 | # Text Mining 8 | ## The Tabular Representation of Text: Term-Document Matrix and ``Bag-of-Words" 9 | 10 | library(tm) 11 | 12 | # define vector of sentences ("docs") 13 | text <- c("this is the first sentence", 14 | "this is a second sentence", 15 | "the third sentence is here") 16 | 17 | # convert sentences into a corpus 18 | corp 
<- Corpus(VectorSource(text)) 19 | 20 | # compute term frequency 21 | tdm <- TermDocumentMatrix(corp) 22 | inspect(tdm) 23 | 24 | ## Preprocessing the Text 25 | 26 | text <- c("this is the first sentence!!", 27 | "this is a second Sentence :)", 28 | "the third sentence, is here", 29 | "forth of all sentences") 30 | corp <- Corpus(VectorSource(text)) 31 | tdm <- TermDocumentMatrix(corp) 32 | inspect(tdm) 33 | 34 | ### Tokenization 35 | 36 | # tokenization 37 | corp <- tm_map(corp, stripWhitespace) 38 | corp <- tm_map(corp, removePunctuation) 39 | tdm <- TermDocumentMatrix(corp) 40 | inspect(tdm) 41 | 42 | ### Text Reduction 43 | 44 | stopwords('english') 45 | 46 | 47 | # stopwords 48 | library(SnowballC) 49 | corp <- tm_map(corp, removeWords, stopwords("english")) 50 | 51 | # stemming 52 | corp <- tm_map(corp, stemDocument) 53 | 54 | tdm <- TermDocumentMatrix(corp) 55 | inspect(tdm) 56 | 57 | ### Term Frequency--Inverse Document Frequency (TF-IDF) 58 | 59 | tfidf <- weightTfIdf(tdm) 60 | inspect(tfidf) 61 | 62 | ## Example: Online Discussions on Autos and Electronics 63 | ### Importing and Labeling the Records 64 | 65 | library(tm) 66 | # step 1: import and label records 67 | # read zip file into a corpus 68 | corp <- Corpus(ZipSource(mlba::AutosElectronics, recursive = T)) 69 | 70 | # create an array of records labels 71 | label <- c(rep(1, 1000), rep(0, 1000)) 72 | 73 | # step 2: text preprocessing 74 | # tokenization 75 | corp <- tm_map(corp, stripWhitespace) 76 | corp <- tm_map(corp, removePunctuation) 77 | corp <- tm_map(corp, removeNumbers) 78 | 79 | # stopwords 80 | corp <- tm_map(corp, removeWords, stopwords("english")) 81 | 82 | # stemming 83 | corp <- tm_map(corp, stemDocument) 84 | 85 | # step 3: TF-IDF and latent semantic analysis 86 | # compute TF-IDF 87 | tdm <- TermDocumentMatrix(corp) 88 | tfidf <- weightTfIdf(tdm) 89 | 90 | # extract (20) concepts 91 | library(lsa) 92 | lsa.tfidf <- lsa(tfidf, dim = 20) 93 | 94 | # convert to data frame 95 | words.df <- as.data.frame(as.matrix(lsa.tfidf$dk)) 96 | 97 | ### Fitting a Predictive Model 98 | 99 | library(caret) 100 | 101 | # prepare training and holdout sets 102 | set.seed(1) 103 | df <- cbind(label=factor(label), words.df) 104 | idx <- caret::createDataPartition(df$label, p=0.6, list=FALSE) 105 | train.df <- df[idx, ] 106 | holdout.df <- df[-idx, ] 107 | 108 | # fit logistic regression 109 | logit.reg <- train(label ~ ., data=train.df, 110 | trControl=trainControl(method="none"), 111 | method="glm", family="binomial") 112 | 113 | # compute accuracy on holdout set 114 | pred <- predict(logit.reg, newdata=holdout.df) 115 | confusionMatrix(pred, holdout.df$label) 116 | 117 | 118 | library(gains) 119 | 120 | prob <- predict(logit.reg, newdata=holdout.df, type="prob")[,2] 121 | actual <- ifelse(holdout.df$label == 1, 1, 0) 122 | gain <- gains(actual, prob) 123 | barplot(gain$mean.resp/mean(actual), names.arg=seq(10, 100, by=10), 124 | xlab="Percentile", ylab="Decile mean / global mean") 125 | 126 | 127 | pdf(file=file.path("..", "figures", "chapter_21", "decileLiftClassification.pdf"), 128 | width=6, height=4) 129 | barplot(gain$mean.resp/mean(actual), names.arg=seq(10, 100, by=10), 130 | xlab="Percentile", ylab="Decile mean / global mean") 131 | dev.off() 132 | 133 | 134 | ## Example: Sentiment Analysis of Movie Reviews 135 | ### Data Loading, Preparation, and Partitioning 136 | 137 | library(tidyverse) 138 | library(text2vec) 139 | 140 | # load and split data into training and holdout set 141 | data <- mlba::IMDBdataset10K %>% 142 
| mutate(
143 | id = row_number(),
144 | sentiment = as.factor(sentiment)
145 | )
146 |
147 | set.seed(1)
148 | trainIndex <- createDataPartition(data$sentiment, p=0.8, list=FALSE)
149 | data_train <- data[trainIndex, ]
150 | data_holdout <- data[-trainIndex, ]
151 |
152 |
153 | prep_fun <- tolower
154 | tok_fun <- word_tokenizer
155 |
156 | it_train <- itoken(data_train$review, ids=data_train$id,
157 | preprocessor=prep_fun, tokenizer=tok_fun)
158 | it_holdout <- itoken(data_holdout$review, ids=data_holdout$id,
159 | preprocessor=prep_fun, tokenizer=tok_fun)
160 |
161 | vocab <- create_vocabulary(it_train)
162 | vocab <- prune_vocabulary(vocab, term_count_min = 5L)
163 | vectorizer <- vocab_vectorizer(vocab)
164 | tcm_train <- create_tcm(it_train, vectorizer, skip_grams_window = 5L)
165 |
166 |
167 |
168 | ### Generating and Applying Word Embeddings
169 |
170 | # determine word vectors
171 | glove <- GlobalVectors$new(rank=100, x_max=10)
172 | wv_main <- glove$fit_transform(tcm_train, n_iter=10, convergence_tol=0.01, n_threads=8)
173 | wv_context <- glove$components
174 | word_vectors <- wv_main + t(wv_context)
175 |
176 |
177 |
178 |
179 | dtm_train <- create_dtm(it_train, vectorizer)
180 | common_terms <- intersect(colnames(dtm_train), rownames(word_vectors) )
181 | dtm_averaged <- normalize(dtm_train[, common_terms], "l1")
182 | sentence_vectors_train <- dtm_averaged %*% word_vectors[common_terms, ]
183 |
184 | dtm_holdout <- create_dtm(it_holdout, vectorizer)
185 | common_terms <- intersect(colnames(dtm_holdout), rownames(word_vectors) )
186 | dtm_averaged <- normalize(dtm_holdout[, common_terms], "l1")
187 | sentence_vectors_holdout <- dtm_averaged %*% word_vectors[common_terms, ]
188 |
189 | ### Fitting a Predictive Model
190 |
191 | train.df <- as.data.frame(as.matrix(sentence_vectors_train))
192 | train.df$sentiment <- data_train$sentiment
193 |
194 | trControl <- caret::trainControl(method="cv", number=5, allowParallel=TRUE)
195 | logit.reg <- caret::train(sentiment ~ ., data=train.df, trControl=trControl,
196 | # fit logistic regression with a generalized linear model
197 | method="glm", family="binomial")
198 |
199 | holdout.df <- as.data.frame(as.matrix(sentence_vectors_holdout))
200 | holdout.df$sentiment <- data_holdout$sentiment
201 |
202 | caret::confusionMatrix(predict(logit.reg, holdout.df), holdout.df$sentiment)
203 |
204 |
205 |
206 |
207 | library(ROCR)
208 | prob <- predict(logit.reg, newdata=holdout.df, type="prob")$positive
209 |
210 | predob <- prediction(prob, holdout.df$sentiment)
211 | perf <- performance(predob, "tpr", "fpr")
212 | perf.df <- data.frame(
213 | tpr=perf@x.values[[1]],
214 | fpr=perf@y.values[[1]]
215 | )
216 | ggplot(perf.df, aes(x=tpr, y=fpr)) +
217 | geom_line() +
218 | geom_segment(aes(x=0, y=0, xend=1, yend=1), color="grey", linetype="dashed") +
219 | labs(x="1 - Specificity", y="Sensitivity")
220 |
221 |
222 | ggsave(file=file.path("..", "figures", "chapter_21", "glove-ROC.pdf"),
223 | last_plot() + theme_bw())
224 |
-------------------------------------------------------------------------------- /R/chap22.R: --------------------------------------------------------------------------------
1 |
2 | if (!require(mlba)) {
3 | library(devtools)
4 | install_github("gedeck/mlba/mlba", force=TRUE)
5 | }
6 |
7 | # Responsible Data Science
8 | ## Example: Applying the RDS Framework to the COMPAS Example
9 | ### Data Issues
10 |
11 | library(caret)
12 | library(tidyverse)
13 | # load COMPAS data
14 | compas.df <- mlba::COMPAS_clean %>%
15 | select(-id) %>%
16 |
mutate( 17 | age_cat = factor(age_cat, levels=c("Less than 25", "25 - 45", "Greater than 45")), 18 | c_charge_degree = factor(c_charge_degree, levels=c("F", "M")), 19 | race = factor(race, levels=c("African-American", "Asian", "Caucasian", "Hispanic", 20 | "Native American", "Other")), 21 | sex = factor(sex, levels=c("Female", "Male")), 22 | two_year_recid = factor(two_year_recid, levels=c(0, 1), labels=c("No", "Yes")) 23 | ) 24 | 25 | # split dataset and train models 26 | set.seed(1) 27 | idx <- createDataPartition(compas.df$two_year_recid, p=0.7, list=FALSE) 28 | train.df <- compas.df[idx, ] 29 | valid.df <- compas.df[-idx, ] 30 | 31 | 32 | trControl <- trainControl(method="cv", number=5, allowParallel=TRUE) 33 | # logistic regression model 34 | logreg.model <- train(two_year_recid ~ . - race, data=train.df, 35 | method="glm", family="binomial", trControl=trControl) 36 | # random forest model 37 | rf.model <- train(two_year_recid ~ . - race, data=train.df, 38 | method="rf", trControl=trControl) 39 | 40 | # extract coefficients and calculate odds shown in table 41 | logreg.coef <- coef(logreg.model$finalModel) 42 | data.frame( 43 | coefficient=logreg.coef, 44 | odds=c(NA, exp(logreg.coef)[-c(1)]) 45 | ) %>% round(3) 46 | 47 | 48 | # calculation of model accuracy based on cross-validation results 49 | caret::confusionMatrix(logreg.model) 50 | caret::confusionMatrix(rf.model) 51 | 52 | ### Auditing the Model 53 | 54 | library(ROCR) 55 | holdoutMetrics <- function(df, model) { 56 | result <- data.frame(obs = df$two_year_recid, pred = predict(model, newdata=df), 57 | prob = predict(model, newdata=df, type="prob")$Yes) 58 | pred <- prediction(result$prob, result$obs) 59 | # compute overall performance 60 | perf_AUC <- performance(pred, "auc") 61 | AUC <- perf_AUC@y.values[[1]] 62 | cm <- confusionMatrix(result$pred, result$obs, positive="Yes") 63 | return (tibble(AUC=AUC, Accuracy = cm$overall["Accuracy"], 64 | FPR = 100*(1-cm$byClass["Specificity"]), 65 | FNR = 100*(1-cm$byClass["Sensitivity"]))) 66 | } 67 | # compute performance by race 68 | metricsByRace <- function(model) { 69 | metrics <- tibble() 70 | for (raceValue in levels(compas.df$race)) { 71 | df <- compas.df %>% filter(race==raceValue) 72 | metrics <- bind_rows(metrics, tibble(race=raceValue, holdoutMetrics(df, model))) 73 | } 74 | return (metrics) 75 | } 76 | # combine metrics for logistic and random forest 77 | metrics <- bind_rows( 78 | tibble(Model="Random forest", metricsByRace(rf.model)), 79 | tibble(Model="Logistic regression", metricsByRace(logreg.model)) 80 | ) %>% filter(! 
race %in% c("Asian", "Native American")) 81 | 82 | 83 | library(gridExtra) 84 | makeBarchart <- function(metrics, aesthetics) { 85 | g <- ggplot(metrics, aesthetics) + 86 | geom_bar(position="dodge", stat="identity") + 87 | geom_text(hjust=1.5, position=position_dodge(.9)) + 88 | coord_flip() + 89 | scale_x_discrete(limits=rev) + 90 | labs(x="Race") + 91 | theme_bw() 92 | return (g) 93 | } 94 | g1 <- makeBarchart(metrics, aes(x=race, y=Accuracy, fill=Model, label=round(Accuracy, 3))) + 95 | theme(legend.position="none") 96 | g2 <- makeBarchart(metrics, aes(x=race, y=AUC, fill=Model, label=round(AUC, 3))) + 97 | theme(legend.position="bottom") 98 | grid.arrange(g1, g2, nrow=2, heights=c(6.25, 7)) 99 | 100 | g <- arrangeGrob(g1, g2, heights=c(6.25, 7)) 101 | ggsave(file=file.path("..", "figures", "chapter_22", "c22_acc_auc.pdf"), 102 | g, width=5, height=5, units="in") 103 | 104 | g1 <- makeBarchart(metrics, aes(x=race, y=FPR, fill=Model, label=round(FPR, 3))) + 105 | theme(legend.position="none") 106 | g2 <- makeBarchart(metrics, aes(x=race, y=FNR, fill=Model, label=round(FNR, 3))) + 107 | theme(legend.position="bottom") 108 | grid.arrange(g1, g2, heights=c(6.25, 7)) 109 | 110 | g <- arrangeGrob(g1, g2, heights=c(6.25, 7)) 111 | ggsave(file=file.path("..", "figures", "chapter_22", "c22_fpr_fnr.pdf"), 112 | g, width=5, height=5, units="in") 113 | 114 | #### Interpretability Methods 115 | 116 | library(iml) 117 | predictor.rf = Predictor$new(rf.model, data=valid.df, y=valid.df$two_year_recid) 118 | predictor.lm = Predictor$new(logreg.model, data=valid.df, y=valid.df$two_year_recid) 119 | 120 | 121 | featureEffect.lm = FeatureEffect$new(predictor.lm, feature='priors_count', method='pdp') 122 | featureEffect.rf = FeatureEffect$new(predictor.rf, feature='priors_count', method='pdp') 123 | combined <- bind_rows( 124 | tibble(Method="Logistic regression", featureEffect.lm$results %>% filter(.class=="Yes")), 125 | tibble(Method="Random forest", featureEffect.rf$results %>% filter(.class=="Yes")) 126 | ) 127 | ggplot(combined, aes(x=priors_count, y=.value, color=Method)) + 128 | geom_line() + 129 | labs(x="Feature value", y="Probability of recidivism") 130 | 131 | 132 | ggsave(file=file.path("..", "figures", "chapter_22", "c22f005.pdf"), 133 | last_plot() + theme_bw(), width=5, height=3, units="in") 134 | 135 | 136 | library(iml) 137 | predictor.rf = Predictor$new(rf.model, data=valid.df, y=valid.df$two_year_recid) 138 | predictor.lm = Predictor$new(logreg.model, data=valid.df, y=valid.df$two_year_recid) 139 | 140 | # permutation feature importance 141 | FeatureImp$new(predictor.lm, "ce", compare="ratio", n.repetitions=5) 142 | FeatureImp$new(predictor.rf, "ce", compare="ratio", n.repetitions=5) 143 | -------------------------------------------------------------------------------- /R/chap23.R: -------------------------------------------------------------------------------- 1 | 2 | if (!require(mlba)) { 3 | library(devtools) 4 | install_github("gedeck/mlba/mlba", force=TRUE) 5 | } 6 | -------------------------------------------------------------------------------- /R/chap4.R: -------------------------------------------------------------------------------- 1 | 2 | if (!require(mlba)) { 3 | library(devtools) 4 | install_github("gedeck/mlba/mlba", force=TRUE) 5 | } 6 | 7 | options(dplyr.summarise.inform = FALSE) 8 | library(tidyverse) 9 | 10 | # Dimension Reduction 11 | ## Data Summaries 12 | ### Example 1: House Prices in Boston 13 | 14 | boston.housing.df <- mlba::BostonHousing 15 | 
head(boston.housing.df, 9) 16 | summary(boston.housing.df) 17 | 18 | # compute mean, standard dev., min, max, median, length, and missing values of CRIM 19 | mean(boston.housing.df$CRIM) 20 | sd(boston.housing.df$CRIM) 21 | min(boston.housing.df$CRIM) 22 | max(boston.housing.df$CRIM) 23 | median(boston.housing.df$CRIM) 24 | length(boston.housing.df$CRIM) 25 | 26 | # find the number of missing values of variable CRIM 27 | sum(is.na(boston.housing.df$CRIM)) 28 | 29 | # compute mean, standard dev., min, max, median, length, and missing values for all 30 | # variables 31 | data.frame(mean=sapply(boston.housing.df, mean), 32 | sd=sapply(boston.housing.df, sd), 33 | min=sapply(boston.housing.df, min), 34 | max=sapply(boston.housing.df, max), 35 | median=sapply(boston.housing.df, median), 36 | length=sapply(boston.housing.df, length), 37 | miss.val=sapply(boston.housing.df, 38 | function(x) sum(length(which(is.na(x)))))) 39 | 40 | 41 | 42 | ### Summary Statistics 43 | 44 | round(cor(boston.housing.df),2) 45 | 46 | ### Aggregation and Pivot Tables 47 | 48 | boston.housing.df <- mlba::BostonHousing 49 | table(boston.housing.df$CHAS) 50 | 51 | # tidyverse version 52 | boston.housing.df %>% count(CHAS) 53 | 54 | 55 | # create bins of size 1 56 | boston.housing.df <- boston.housing.df %>% 57 | mutate(RM.bin = cut(RM, c(1:9), labels=FALSE)) 58 | 59 | # compute the average of MEDV by (binned) RM and CHAS 60 | # in aggregate() use the argument by= to define the list of aggregating variables, 61 | # and FUN= as an aggregating function. 62 | aggregate(boston.housing.df$MEDV, by=list(RM=boston.housing.df$RM.bin, 63 | CHAS=boston.housing.df$CHAS), FUN=mean) 64 | 65 | # tidyverse version 66 | boston.housing.df %>% 67 | group_by(RM.bin, CHAS) %>% 68 | summarise(mean(MEDV)) 69 | 70 | 71 | # use install.packages("reshape") the first time the package is used 72 | library(reshape) 73 | boston.housing.df <- mlba::BostonHousing 74 | # create bins of size 1 75 | boston.housing.df <- boston.housing.df %>% 76 | mutate(RM.bin = cut(RM, c(1:9), labels=FALSE)) 77 | 78 | # use melt() to stack a set of columns into a single column of data. 
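# cast() further below then spreads the stacked values back into a wide
# pivot table, here the mean of MEDV for each combination of RM.bin and CHAS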
79 | # stack MEDV values for each combination of (binned) RM and CHAS 80 | mlt <- melt(boston.housing.df, id=c("RM.bin", "CHAS"), measure=c("MEDV")) 81 | head(mlt, 5) 82 | 83 | # use cast() to reshape data and generate pivot table 84 | cast(mlt, RM.bin ~ CHAS, subset=variable=="MEDV", 85 | margins=c("grand_row", "grand_col"), mean) 86 | 87 | # tidyverse version 88 | boston.housing.df %>% 89 | group_by(RM.bin, CHAS) %>% 90 | summarize(mean=mean(MEDV)) %>% 91 | spread(CHAS, mean) 92 | 93 | ## Reducing the Number of Categories in Categorical Variables 94 | 95 | boston.housing.df <- mlba::BostonHousing 96 | 97 | tbl <- table(boston.housing.df$CAT.MEDV, boston.housing.df$ZN) 98 | prop.tbl <- prop.table(tbl, margin=2) 99 | barplot(prop.tbl, xlab="ZN", ylab="", yaxt="n",main="Distribution of CAT.MEDV by ZN") 100 | axis(2, at=(seq(0,1, 0.2)), paste(seq(0,100,20), "%")) 101 | 102 | library(tidyverse) 103 | df <- data.frame(prop.tbl) 104 | ggplot(df, aes(x=Var2, y=Freq, group=Var1, fill=Var1)) + 105 | geom_bar(stat="identity", color="grey", width=1) + 106 | scale_y_continuous(labels = scales::percent, expand=expansion()) + 107 | scale_fill_manual("CAT.MEDV", values=c("0"="#eeeeee", "1"="darkgrey")) + 108 | labs(x="ZN", y="", title="Distribution of CAT.MEDV by ZN") 109 | 110 | 111 | g <- last_plot() + theme_bw() 112 | ggsave(file=file.path("..", "figures", "chapter_04", "reduction-pivot-bar.pdf"), 113 | g, width=9, height=4, units="in") 114 | 115 | 116 | library(forecast) 117 | tru.data <- mlba::ToysRUsRevenues 118 | tru.ts <- ts(tru.data[, 3], start = c(1992, 1), end = c(1995, 4), freq = 4) 119 | autoplot(tru.ts) + 120 | geom_point(size=0.5) + 121 | labs(x="Time", y="Revenue ($ millions)") + 122 | theme_bw() 123 | ggsave(file=file.path("..", "figures", "chapter_04", "ToysRUs.pdf"), 124 | last_plot()) 125 | 126 | ## Principal Components Analysis 127 | ### Example 2: Breakfast Cereals 128 | 129 | library(tidyverse) 130 | cereals.df <- mlba::Cereals %>% select(calories, rating) 131 | # compute PCs on two dimensions 132 | pcs <- prcomp(cereals.df %>% select(calories, rating)) 133 | summary(pcs) 134 | pcs$rot 135 | scores <- pcs$x 136 | head(scores, 5) 137 | 138 | 139 | getPCaxis <- function(f, pcs, pcLabel) { 140 | return (data.frame( 141 | rbind(pcs$center + f * pcs$rotation[, pcLabel], 142 | pcs$center - f * pcs$rotation[, pcLabel])) 143 | ) 144 | } 145 | PC1 <- getPCaxis(90, pcs, "PC1") 146 | PC2 <- getPCaxis(50, pcs, "PC2") 147 | ggplot(cereals.df, aes(x=calories, y=rating)) + 148 | geom_point() + 149 | geom_line(data=PC1) + 150 | geom_line(data=PC2) + 151 | coord_cartesian(xlim=c(0, 200), ylim=c(0, 110)) + 152 | labs(x="Calories", y="Rating") + 153 | annotate(geom="text", x=30, y=80, label="z[1]",parse=TRUE) + 154 | annotate(geom="text", x=120, y=80, label="z[2]",parse=TRUE) + 155 | theme_bw() 156 | 157 | ggsave(file=file.path("..", "figures", "chapter_04", "pca_subset.pdf"), 158 | width=5, height=3, last_plot()) 159 | 160 | ### Principal Components 161 | 162 | # load and preprocess the data 163 | cereals.df <- mlba::Cereals %>% 164 | column_to_rownames("name") %>% 165 | select(-c(mfr, type)) %>% 166 | drop_na() 167 | 168 | pcs <- prcomp(cereals.df) 169 | summary(pcs) 170 | pcs$rotation[,1:5] 171 | 172 | ### Normalizing the Data 173 | 174 | # Use function prcomp() with scale. = T to run PCA on normalized data 175 | pcs.cor <- prcomp(cereals.df, scale. 
= T) 176 | 177 | summary(pcs.cor) 178 | pcs.cor$rotation[,1:5] 179 | 180 | 181 | library(ggrepel) 182 | ggplot(data.frame(pcs.cor$x), aes(x=PC1, y=PC2, label=rownames(pcs.cor$x))) + 183 | geom_point(shape=21) + 184 | geom_text_repel(size=2, max.overlaps=7) + 185 | theme_bw() 186 | 187 | f <- 1.3 188 | ggsave(file=file.path("..", "figures", "chapter_04", "pca_full.pdf"), 189 | width=f * 4, height=f * 5, last_plot()) 190 | 191 | ## Dimension Reduction Using Classification and Regression Trees 192 | ### Using Principal Components for Classification and Prediction 193 | 194 | wine.df <- mlba::Wine %>% select(-Type) 195 | pcs.cor <- prcomp(wine.df) 196 | summary(pcs.cor) 197 | pcs.cor$rotation[,1:4] 198 | -------------------------------------------------------------------------------- /R/chap6.R: -------------------------------------------------------------------------------- 1 | 2 | if (!require(mlba)) { 3 | library(devtools) 4 | install_github("gedeck/mlba/mlba", force=TRUE) 5 | } 6 | options(scipen=999, digits = 3) 7 | 8 | # Multiple Linear Regression 9 | ## Estimating the Regression Equation and Prediction 10 | ### Example: Predicting the Price of Used Toyota Corolla Cars 11 | 12 | library(caret) 13 | car.df <- mlba::ToyotaCorolla 14 | # select variables for regression 15 | outcome <- "Price" 16 | predictors <- c("Age_08_04", "KM", "Fuel_Type", "HP", "Met_Color", "Automatic", 17 | "CC", "Doors", "Quarterly_Tax", "Weight") 18 | # reduce data set to first 1000 rows and selected variables 19 | car.df <- car.df[1:1000, c(outcome, predictors)] 20 | 21 | # partition data 22 | set.seed(1) # set seed for reproducing the partition 23 | idx <- createDataPartition(car.df$Price, p=0.6, list=FALSE) 24 | train.df <- car.df[idx, ] 25 | holdout.df <- car.df[-idx, ] 26 | 27 | # use lm() to run a linear regression of Price on all 11 predictors in the 28 | # training set. 29 | # use . after ~ to include all the remaining columns in train.df as predictors. 30 | car.lm <- lm(Price ~ ., data = train.df) 31 | # use options() to ensure numbers are not displayed in scientific notation. 32 | options(scipen = 999) 33 | summary(car.lm) 34 | 35 | 36 | 37 | 38 | # use predict() to make predictions on a new set. 
39 | pred <- predict(car.lm, holdout.df) 40 | 41 | options(scipen=999, digits=0) 42 | data.frame( 43 | 'Predicted' = pred[1:20], 44 | 'Actual' = holdout.df$Price[1:20], 45 | 'Residual' = holdout.df$Price[1:20] - pred[1:20] 46 | ) 47 | options(scipen=999, digits = 3) 48 | 49 | # calculate performance metrics 50 | rbind( 51 | Training=mlba::regressionSummary(predict(car.lm, train.df), train.df$Price), 52 | Holdout=mlba::regressionSummary(pred, holdout.df$Price) 53 | ) 54 | 55 | 56 | library(ggplot2) 57 | pred <- predict(car.lm, holdout.df) 58 | all.residuals <- holdout.df$Price - pred 59 | 60 | ggplot() + 61 | geom_histogram(aes(x=all.residuals), fill="lightgray", color="grey") + 62 | labs(x="Residuals", y="Frequency") 63 | 64 | 65 | g <- ggplot() + 66 | geom_histogram(aes(x=all.residuals), fill="lightgray", color="grey") + 67 | labs(x="Residuals", y="Frequency") + 68 | theme_bw() 69 | ggsave(file=file.path("..", "figures", "chapter_06", "residuals-histogram.pdf"), 70 | g, width=5, height=3, units="in") 71 | 72 | ### Cross-validation and caret 73 | 74 | set.seed(1) 75 | library(caret) 76 | # define 5-fold 77 | trControl <- caret::trainControl(method="cv", number=5, allowParallel=TRUE) 78 | model <- caret::train(Price ~ ., data=car.df, 79 | method="lm", # specify the model 80 | trControl=trControl) 81 | model 82 | coef(model$finalModel) 83 | 84 | 85 | library(tidyverse) 86 | collectMetrics <- function(model, train.df, holdout.df, nPredictors) { 87 | if (missing(nPredictors)) { 88 | coefs = coef(model$finalModel) 89 | nPredictors = length(coefs) - 1 90 | } 91 | return (cbind( 92 | CV=model$results %>% slice_min(RMSE) %>% dplyr::select(c(RMSE, MAE)), 93 | Training=mlba::regressionSummary(predict(model, train.df), train.df$Price), 94 | Holdout=mlba::regressionSummary(predict(model, holdout.df), holdout.df$Price), 95 | nPredictors=nPredictors 96 | )) 97 | } 98 | 99 | metric.full <- collectMetrics(model, train.df, holdout.df) 100 | 101 | 102 | predict(model, car.df[1:3,]) 103 | 104 | 105 | 106 | ## Variable Selection in Linear Regression 107 | ### How to Reduce the Number of Predictors 108 | #### Exhaustive Search 109 | 110 | # use regsubsets() in package leaps to run an exhaustive search. 111 | # unlike with lm, categorical predictors must be turned into dummies manually. 
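# for illustration: dummy_cols() replaces each factor with 0/1 indicator columns;
# assuming Fuel_Type has the levels CNG, Diesel, and Petrol, it is replaced by
# Fuel_Type_Diesel and Fuel_Type_Petrol (remove_first_dummy=TRUE drops the first
# level, which then serves as the reference category)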
112 | library(leaps) 113 | library(fastDummies) 114 | 115 | # create dummies for fuel type 116 | leaps.train.df <- dummy_cols(train.df, remove_first_dummy=TRUE, 117 | remove_selected_columns=TRUE) 118 | search <- regsubsets(Price ~ ., data=leaps.train.df, nbest=1, 119 | nvmax=ncol(leaps.train.df), method="exhaustive") 120 | sum <- summary(search) 121 | 122 | # show models 123 | sum$which 124 | 125 | # show metrics 126 | sum$rsq 127 | sum$adjr2 128 | sum$cp 129 | 130 | 131 | optimal <- which.min(sum$cp) 132 | 133 | # determine the variable names for the optimal model 134 | X <- summary(search)$which[, -1] # information about included predictors 135 | xvars <- dimnames(X)[[2]] ## column names (all covariates except intercept) 136 | xvars <- xvars[X[optimal,]] 137 | 138 | # the optimal model contains all dummy variables of Fuel_Type 139 | xvars <- c("Age_08_04", "KM", "HP", "Quarterly_Tax", "Weight", "Fuel_Type") 140 | 141 | # rebuild model for best predictor set 142 | set.seed(1) 143 | trControl <- caret::trainControl(method="cv", number=5, allowParallel=TRUE) 144 | model <- caret::train(Price ~ ., data=car.df[, c("Price", xvars)], 145 | method="lm", # specify the model 146 | trControl=trControl) 147 | model 148 | coef(model$finalModel) 149 | 150 | metric.exhaustive <- collectMetrics(model, train.df, holdout.df) 151 | 152 | #### Popular Subset Selection Algorithms 153 | 154 | # as model performance is estimated using AIC, we don't need to use cross-validation 155 | trControl <- caret::trainControl(method="none") 156 | model <- caret::train(Price ~ ., data=train.df, trControl=trControl, 157 | # select backward elmination 158 | method="glmStepAIC", direction='backward') 159 | 160 | coef(model$finalModel) 161 | 162 | 163 | model <- caret::train(Price ~ ., data=train.df, trControl=trControl, 164 | method="glmStepAIC", direction='forward') 165 | 166 | coef(model$finalModel) 167 | 168 | 169 | model <- caret::train(Price ~ ., data=train.df, trControl=trControl, 170 | method="glmStepAIC", direction='both') 171 | 172 | coef(model$finalModel) 173 | 174 | 175 | rbind(Training=mlba::regressionSummary(predict(model, train.df), train.df$Price), 176 | Holdout=mlba::regressionSummary(predict(model, holdout.df), holdout.df$Price)) 177 | 178 | 179 | 180 | # The models are identical to the best model obtained from the exhaustive search. 181 | # We therefore duplicate the metrics. 
182 | metric.stepwise <- metric.exhaustive 183 | 184 | ## Regularization (Shrinkage Models) 185 | 186 | set.seed(1) 187 | library(caret) 188 | trControl <- caret::trainControl(method='cv', number=5, allowParallel=TRUE) 189 | tuneGrid <- expand.grid(lambda=10^seq(5, 2, by=-0.1), alpha=0) 190 | model <- caret::train(Price ~ ., data=train.df, 191 | method='glmnet', 192 | family='gaussian', # set the family for linear regression 193 | trControl=trControl, 194 | tuneGrid=tuneGrid) 195 | model$bestTune 196 | coef(model$finalModel, s=model$bestTune$lambda) 197 | 198 | 199 | metric.ridge <- collectMetrics(model, train.df, holdout.df, 200 | length(coef(model$finalModel, s=model$bestTune$lambda)) - 1) 201 | ridge.model <- model 202 | 203 | 204 | rbind( 205 | Training=mlba::regressionSummary(predict(model, train.df), train.df$Price), 206 | Holdout=mlba::regressionSummary(predict(model, holdout.df), holdout.df$Price) 207 | ) 208 | 209 | 210 | set.seed(1) 211 | tuneGrid <- expand.grid(lambda=10^seq(4, 0, by=-0.1), alpha=1) 212 | model <- caret::train(Price ~ ., data=train.df, 213 | method='glmnet', 214 | family='gaussian', # set the family for linear regression 215 | trControl=trControl, 216 | tuneGrid=tuneGrid) 217 | model$bestTune 218 | coef(model$finalModel, s=model$bestTune$lambda) 219 | 220 | 221 | lasso.model <- model 222 | metric.lasso <- collectMetrics(lasso.model, train.df, holdout.df, 223 | sum(coef(lasso.model$finalModel, s=lasso.model$bestTune$lambda) != 0) - 1) 224 | 225 | 226 | rbind( 227 | Training=mlba::regressionSummary(predict(model, train.df), train.df$Price), 228 | Holdout=mlba::regressionSummary(predict(model, holdout.df), holdout.df$Price) 229 | ) 230 | 231 | 232 | library(tidyverse) 233 | library(gridExtra) 234 | g1 <- ggplot(ridge.model$results, aes(x=lambda, y=RMSE)) + 235 | geom_pointrange(aes(ymin=RMSE-RMSESD, ymax=RMSE+RMSESD), color='grey') + 236 | geom_line() + 237 | geom_point(data=ridge.model$results %>% subset(RMSE == min(RMSE)), color='red') + 238 | labs(x=expression(paste('Ridge parameter ', lambda)), 239 | y='RMSE (cross-validation)') + 240 | scale_x_log10() 241 | g2 <- ggplot(lasso.model$results, aes(x=lambda, y=RMSE)) + 242 | geom_pointrange(aes(ymin=RMSE-RMSESD, ymax=RMSE+RMSESD), color='grey') + 243 | geom_line() + 244 | geom_point(data=lasso.model$results %>% subset(RMSE == min(RMSE)), color='red') + 245 | labs(x=expression(paste('Lasso parameter ', lambda)), 246 | y='RMSE (cross-validation)') + 247 | scale_x_log10() 248 | grid.arrange(g1, g2, ncol=2) 249 | 250 | g <- arrangeGrob(g1 + theme_bw(), g2 + theme_bw(), ncol=2) 251 | ggsave(file=file.path('..', 'figures', 'chapter_06', 'shrinkage-parameter-tuning.pdf'), 252 | g, width=6, height=2.5, units='in') 253 | 254 | 255 | data.frame(rbind( 256 | 'full'= metric.full, 257 | 'exhaustive' = metric.exhaustive, 258 | 'stepwise' = metric.stepwise, 259 | 'ridge' = metric.ridge, 260 | 'lasso' = metric.lasso 261 | )) 262 | -------------------------------------------------------------------------------- /R/chap7.R: -------------------------------------------------------------------------------- 1 | 2 | if (!require(mlba)) { 3 | library(devtools) 4 | install_github("gedeck/mlba/mlba", force=TRUE) 5 | } 6 | 7 | # $k$-Nearest Neighbors ($k$ 8 | ## The $k$-NN Classifier (Categorical Outcome) 9 | ### Example: Riding Mowers 10 | 11 | library(ggrepel) 12 | mowers.df <- mlba::RidingMowers 13 | set.seed(35) 14 | 15 | idx <- sample(nrow(mowers.df), 0.6*nrow(mowers.df)) 16 | train.df <- mowers.df[idx, ] 17 | holdout.df <- 
mowers.df[-idx, ] 18 | ## new household 19 | new.df <- data.frame(Income = 60, Lot_Size = 20) 20 | 21 | ggplot(mapping=aes(x=Income, y=Lot_Size, shape=Ownership, color=Ownership)) + 22 | geom_point(data=train.df) + 23 | geom_text_repel(aes(label=rownames(train.df)), data=train.df, show.legend = FALSE) + 24 | geom_point(data=cbind(new.df, Ownership='New')) 25 | 26 | 27 | g <- ggplot(mapping=aes(x=Income, y=Lot_Size, shape=Ownership, color=Ownership, fill=Ownership)) + 28 | geom_point(data=train.df, size=4) + 29 | geom_text_repel(aes(label=rownames(train.df)), data=train.df, show.legend = FALSE) + 30 | geom_point(data=cbind(new.df, Ownership='New'), size=5) + 31 | scale_shape_manual(values = c(18, 15, 21)) + 32 | scale_color_manual(values = c('black', 'darkorange', 'steelblue')) + 33 | scale_fill_manual(values = c('black', 'darkorange', 'lightblue')) 34 | 35 | g 36 | 37 | ggsave(file=file.path("..", "figures", "chapter_07", "knn-riding-mower.pdf"), 38 | g + theme_bw(), width=6, height=4, units="in") 39 | 40 | 41 | library(caret) 42 | # train k-NN model with k=3 43 | model <- train(Ownership ~ ., data=train.df, 44 | method="knn", # specify the model 45 | preProcess=c("center", "scale"), # normalize data 46 | tuneGrid=expand.grid(k=3), 47 | trControl=trainControl(method="none")) 48 | model 49 | 50 | # predict new data point 51 | predict(model, new.df) 52 | 53 | # determine nearest neighbors to new data point 54 | train.norm.df <- predict(model$preProcess, train.df) 55 | new.norm.df <- predict(model$preProcess, new.df) 56 | distances <- apply(train.norm.df[, 1:2], 1, 57 | function(d){ sqrt(sum((d - new.norm.df)^2)) }) 58 | rownames(train.df)[order(distances)][1:3] 59 | 60 | ### Choosing $k$ 61 | 62 | # use leave-one-out cross-validation for small dataset 63 | trControl <- trainControl(method="loocv", number=5, allowParallel=TRUE) 64 | model <- train(Ownership ~ ., data=train.df, 65 | method="knn", 66 | preProcess=c("center", "scale"), 67 | tuneGrid=expand.grid(k=seq(1, 13, 2)), 68 | trControl=trControl) 69 | model 70 | 71 | 72 | model <- train(Ownership ~ ., data=mowers.df, 73 | method="knn", 74 | preProcess=c("center", "scale"), 75 | tuneGrid=expand.grid(k=7), 76 | trControl=trainControl(method="none")) 77 | predict(model, new.df) 78 | 79 | ### Setting the Cutoff Value 80 | 81 | train.norm.df <- predict(model$preProcess, train.df) 82 | new.norm.df <- predict(model$preProcess, new.df) 83 | distances <- apply(train.norm.df[, 1:2], 1, 84 | function(d){ sqrt(sum((d - new.norm.df)^2)) }) 85 | train.df[order(distances)[1:8],] 86 | -------------------------------------------------------------------------------- /R/chap8.R: -------------------------------------------------------------------------------- 1 | 2 | if (!require(mlba)) { 3 | library(devtools) 4 | install_github("gedeck/mlba/mlba", force=TRUE) 5 | } 6 | options(scipen=999, digits = 3) 7 | 8 | # The Naive Bayes Classifier 9 | ## Solution: Naive Bayes 10 | ### Example 3: Predicting Delayed Flights 11 | 12 | library(tidyverse) 13 | library(caret) 14 | library(e1071) 15 | # load and preprocess dataset 16 | delays.df <- mlba::FlightDelays %>% 17 | mutate( 18 | # change numerical variables to categorical 19 | DAY_WEEK = factor(DAY_WEEK), 20 | ORIGIN = factor(ORIGIN), 21 | DEST = factor(DEST), 22 | CARRIER = factor(CARRIER), 23 | Flight.Status = factor(Flight.Status), 24 | # create hourly bins for departure time 25 | CRS_DEP_TIME = factor(round(CRS_DEP_TIME / 100)) 26 | ) %>% 27 | select(DAY_WEEK, CRS_DEP_TIME, ORIGIN, DEST, CARRIER, 
Flight.Status) 28 | 29 | # create training and holdout sets 30 | set.seed(1) 31 | idx <- createDataPartition(delays.df$Flight.Status, p=0.6, list=FALSE) 32 | train.df <- delays.df[idx, ] 33 | holdout.df <- delays.df[-idx, ] 34 | 35 | # run naive bayes 36 | delays.nb <- naiveBayes(Flight.Status ~ ., data = train.df) 37 | delays.nb 38 | 39 | 40 | 41 | 42 | # use prop.table() with margin = 1 to convert a count table to a proportions table, 43 | # where each row sums up to 1 (use margin = 2 for column sums) 44 | prop.table(table(train.df$Flight.Status, train.df$DEST), margin = 1) 45 | 46 | 47 | ## predict probabilities 48 | pred.prob <- predict(delays.nb, newdata=holdout.df, type="raw") 49 | ## predict class membership 50 | pred.class <- predict(delays.nb, newdata=holdout.df) 51 | 52 | df <- data.frame(actual=holdout.df$Flight.Status, predicted=pred.class, pred.prob) 53 | 54 | df[holdout.df$CARRIER == "DL" & holdout.df$DAY_WEEK == 7 & holdout.df$CRS_DEP_TIME == 10 & 55 | holdout.df$DEST == "LGA" & holdout.df$ORIGIN == "DCA",] 56 | 57 | 58 | # training 59 | confusionMatrix(predict(delays.nb, newdata=train.df), train.df$Flight.Status) 60 | 61 | # holdout 62 | confusionMatrix(predict(delays.nb, newdata=holdout.df), holdout.df$Flight.Status) 63 | 64 | 65 | library(gains) 66 | actual <- ifelse(holdout.df$Flight.Status == "delayed", 1, 0) 67 | gain <- gains(actual, pred.prob[,"delayed"], groups=length(actual) - 2) 68 | 69 | nactual <-sum(actual) 70 | ggplot() + 71 | geom_line(aes(x=gain$cume.obs, y=gain$cume.pct.of.total * nactual)) + 72 | geom_line(aes(x=c(0, max(gain$cume.obs)), y=c(0, nactual)), color="darkgrey") + 73 | labs(x="# Cases", y="Cumulative") 74 | 75 | 76 | ggsave(file=file.path("..", "figures", "chapter_08", "Flights-NB-gain.pdf"), 77 | width=3, height=3, 78 | last_plot() + theme_bw()) 79 | 80 | ### Working with Continuous Predictors 81 | 82 | (p_delayed = dnorm(213, mean=211.36215, sd=15.31)) 83 | (p_ontime = dnorm(213, mean=211.99436, sd=12.79)) 84 | 85 | p_ontime * 0.805 86 | p_delayed * 0.195 87 | -------------------------------------------------------------------------------- /R/chap9.R: -------------------------------------------------------------------------------- 1 | 2 | if (!require(mlba)) { 3 | library(devtools) 4 | install_github("gedeck/mlba/mlba", force=TRUE) 5 | } 6 | options(scipen=999) 7 | 8 | # Classification and Regression Trees 9 | ## Classification Trees 10 | ### Example 1: Riding Mowers 11 | 12 | library(rpart) 13 | library(rpart.plot) 14 | mowers.df <- mlba::RidingMowers 15 | class.tree <- rpart(Ownership ~ ., data = mowers.df, 16 | control = rpart.control(minsplit = 0), 17 | method = "class") 18 | rpart.rules(class.tree) 19 | 20 | plot_common_styling <- function(g, filename) { 21 | g <- g + 22 | geom_point(size=2) + 23 | scale_color_manual(values=c("darkorange", "steelblue")) + 24 | scale_fill_manual(values=c("darkorange", "lightblue")) + 25 | labs(x="Income ($000s)", y="Lot size (000s sqft)") + 26 | theme_bw() + 27 | theme(legend.position=c(0.89, 0.91), 28 | legend.title=element_blank(), 29 | legend.key=element_blank(), 30 | legend.background=element_blank()) 31 | ggsave(file=file.path("..", "figures", "chapter_09", filename), 32 | g, width=5, height=3, units="in") 33 | return(g) 34 | } 35 | g <- ggplot(mowers.df, mapping=aes(x=Income, y=Lot_Size, color=Ownership, fill=Ownership)) 36 | plot_common_styling(g, "mowers_tree_0.pdf") 37 | g <- g + geom_vline(xintercept=59.7) 38 | plot_common_styling(g, "mowers_tree_1.pdf") 39 | g <- g + geom_segment(x=59.9, y=21, 
xend=25, yend=21, color="black")
40 | plot_common_styling(g, "mowers_tree_2.pdf")
41 | g <- g + geom_segment(x=59.9, y=19.8, xend=120, yend=19.8, color="black")
42 | plot_common_styling(g, "mowers_tree_3.pdf")
43 | g <- g + geom_segment(x=84.75, y=19.8, xend=84.75, yend=10, color="black")
44 | plot_common_styling(g, "mowers_tree_4.pdf")
45 | g <- g + geom_segment(x=61.5, y=19.8, xend=61.5, yend=10, color="black")
46 | plot_common_styling(g, "mowers_tree_5.pdf")
47 |
48 | ### Measures of Impurity
49 | #### Normalization
50 |
51 | ggplot() +
52 | scale_x_continuous(limits=c(0,1)) +
53 | geom_hline(yintercept=0.5, linetype=2, color="grey") +
54 | geom_hline(yintercept=1, linetype=2, color="grey") +
55 | geom_function(aes(color="Entropy measure"),
56 | fun = function(x) {- x*log2(x) - (1-x)*log2(1-x)},
57 | xlim=c(0.0001, 0.9999), n=100) +
58 | geom_function(aes(color="Gini index"),
59 | fun = function(x) {1 - x^2 - (1-x)^2}) +
60 | labs(y="Impurity measure", x=expression(~italic(p)[1]), color="Impurity measure") +
61 | scale_color_manual(values=c("Entropy measure"="darkorange", "Gini index"="steelblue"))
62 |
63 | ggsave(file=file.path("..", "figures", "chapter_09", "gini_entropy.pdf"),
64 | last_plot() + theme_bw(), width=5, height=2.5, units="in")
65 |
66 |
67 |
68 | library(rpart)
69 | library(rpart.plot)
70 | mowers.df <- mlba::RidingMowers
71 |
72 | # use rpart() to run a classification tree.
73 | # define rpart.control() in rpart() to determine the depth of the tree.
74 | class.tree <- rpart(Ownership ~ ., data = mowers.df,
75 | control=rpart.control(maxdepth=2), method="class")
76 | ## plot tree
77 | # use rpart.plot() to plot the tree. You can control plotting parameters such
78 | # as color, shape, and information displayed (which and where).
79 | rpart.plot(class.tree, extra=1, fallen.leaves=FALSE)
80 |
81 |
82 | pdf(file.path("..", "figures", "chapter_09", "CT-mowerTree1.pdf"), width=3, height=3)
83 | rpart.plot(class.tree, extra=1, fallen.leaves=FALSE)
84 | dev.off()
85 |
86 | class.tree <- rpart(Ownership ~ ., data = mowers.df,
87 | control=rpart.control(minsplit=1), method="class")
88 | rpart.plot(class.tree, extra=1, fallen.leaves=FALSE)
89 | pdf(file.path("..", "figures", "chapter_09", "CT-mowerTree3.pdf"), width=5, height=5)
90 | rpart.plot(class.tree, extra=1, fallen.leaves=FALSE)
91 | dev.off()
92 |
93 |
94 | class.tree
95 |
96 | ## Evaluating the Performance of a Classification Tree
97 | ### Example 2: Acceptance of Personal Loan
98 |
99 | library(tidyverse)
100 | library(caret)
101 |
102 | # Load and preprocess data
103 | bank.df <- mlba::UniversalBank %>%
104 | # Drop ID and zip code columns.
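# (ID is only a record identifier and ZIP.Code has too many distinct values to be
# useful as a categorical predictor, so keeping them would only add noise)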
105 | select(-c(ID, ZIP.Code)) %>%
106 | # convert Personal.Loan to a factor with labels Yes and No
107 | mutate(Personal.Loan = factor(Personal.Loan, levels=c(0, 1), labels=c("No", "Yes")),
108 | Education = factor(Education, levels=c(1, 2, 3), labels=c("UG", "Grad", "Prof")))
109 |
110 | # partition
111 | set.seed(1)
112 | idx <- createDataPartition(bank.df$Personal.Loan, p=0.6, list=FALSE)
113 | train.df <- bank.df[idx, ]
114 | holdout.df <- bank.df[-idx, ]
115 |
116 | # classification tree
117 | default.ct <- rpart(Personal.Loan ~ ., data=train.df, method="class")
118 | # plot tree
119 | rpart.plot(default.ct, extra=1, fallen.leaves=FALSE)
120 |
121 |
122 | pdf(file.path("..", "figures", "chapter_09", "CT-universalTree1.pdf"), width=5, height=5)
123 | rpart.plot(default.ct, extra=1, fallen.leaves=FALSE)
124 | dev.off()
125 |
126 |
127 | deeper.ct <- rpart(Personal.Loan ~ ., data=train.df, method="class", cp=0, minsplit=1)
128 | # count number of leaves
129 | sum(deeper.ct$frame$var == "<leaf>")
130 | # plot tree
131 | rpart.plot(deeper.ct, extra=1, fallen.leaves=FALSE)
132 |
133 |
134 | pdf(file.path("..", "figures", "chapter_09", "CT-universalTree2.pdf"), width=5, height=2.5)
135 | rpart.plot(deeper.ct, extra=1, fallen.leaves=FALSE)
136 | dev.off()
137 |
138 |
139 | # classify records in the holdout data.
140 | # set argument type = "class" in predict() to generate predicted class membership.
141 | default.ct.point.pred.train <- predict(default.ct,train.df,type = "class")
142 | # generate confusion matrix for training data
143 | confusionMatrix(default.ct.point.pred.train, train.df$Personal.Loan)
144 | ### repeat the code for the holdout set, and the deeper tree
145 |
146 |
147 | default.ct.point.pred.holdout <- predict(default.ct,holdout.df,type = "class")
148 | confusionMatrix(default.ct.point.pred.holdout, holdout.df$Personal.Loan)
149 |
150 | deeper.ct.point.pred.train <- predict(deeper.ct,train.df,type = "class")
151 | confusionMatrix(deeper.ct.point.pred.train, train.df$Personal.Loan)
152 |
153 | deeper.ct.point.pred.holdout <- predict(deeper.ct,holdout.df,type = "class")
154 | confusionMatrix(deeper.ct.point.pred.holdout, holdout.df$Personal.Loan)
155 |
156 | ## Avoiding Overfitting
157 | ### Stopping Tree Growth
158 | #### Stopping Tree Growth: Grid Search for Parameter Tuning
159 |
160 | set.seed(1)
161 | trControl <- trainControl(method="cv", number=5, allowParallel=TRUE)
162 | model1 <- train(Personal.Loan ~ ., data=train.df,
163 | method="rpart", trControl=trControl,
164 | tuneGrid=data.frame(cp=c(1, 0.1, 0.01, 0.001, 0.0001)))
165 | model1$results
166 | # focus grid search around cp=0.001
167 | model2 <- train(Personal.Loan ~ ., data=train.df,
168 | method="rpart", trControl=trControl,
169 | tuneGrid=data.frame(cp=c(0.005, 0.002, 0.001, 0.0005, 0.0002)))
170 | model2$results
171 |
172 | ### Pruning the Tree
173 | #### Stopping Tree Growth: Conditional Inference Trees
174 |
175 | # argument xval refers to the number of folds to use in rpart's built-in
176 | # cross-validation procedure
177 | # argument cp sets the smallest value for the complexity parameter.
178 | cv.ct <- rpart(Personal.Loan ~ ., data=train.df, method="class",
179 | cp=0.00001, minsplit=5, xval=5)
180 | # use printcp() to print the table.
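# printcp() reports, for each candidate subtree, the complexity parameter (CP),
# the number of splits (nsplit), the resubstitution error (rel error), and the
# cross-validated error (xerror) with its standard error (xstd); errors are scaled
# relative to the root-node error. The pruning steps below pick a CP value based
# on xerror and xstd.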
181 | printcp(cv.ct)
182 |
183 |
184 | # prune by lower cp
185 | pruned.ct <- prune(cv.ct,
186 | cp=cv.ct$cptable[which.min(cv.ct$cptable[,"xerror"]),"CP"])
187 | sum(pruned.ct$frame$var == "<leaf>")
188 | rpart.plot(pruned.ct, extra=1, fallen.leaves=FALSE)
189 |
190 |
191 | pdf(file.path("..", "figures", "chapter_09", "CT-universalTree-pruned.pdf"), width=5, height=2.5)
192 | rpart.plot(pruned.ct, extra=1, fallen.leaves=FALSE)
193 | dev.off()
194 |
195 | ### Best-Pruned Tree
196 |
197 | # prune by lower cp
198 | minErrorRow <- cv.ct$cptable[which.min(cv.ct$cptable[,"xerror"]), ]
199 | cutoff <- minErrorRow["xerror"] + minErrorRow["xstd"]
200 | best.cp <- cv.ct$cptable[cv.ct$cptable[,"xerror"] < cutoff,][1, "CP"]
201 |
202 | best.ct <- prune(cv.ct, cp=best.cp)
203 | sum(best.ct$frame$var == "<leaf>")
204 | rpart.plot(best.ct, extra=1, fallen.leaves=FALSE)
205 | pdf(file.path("..", "figures", "chapter_09", "CT-universalTree-best.pdf"), width=4, height=2.75)
206 | rpart.plot(best.ct, extra=1, fallen.leaves=FALSE)
207 | dev.off()
208 |
209 | ## Classification Rules from Trees
210 |
211 | rpart.rules(best.ct)
212 |
213 | ## Regression Trees
214 |
215 | # select variables for regression
216 | outcome <- "Price"
217 | predictors <- c("Age_08_04", "KM", "Fuel_Type", "HP", "Met_Color", "Automatic",
218 | "CC", "Doors", "Quarterly_Tax", "Weight")
219 | # reduce data set to first 1000 rows and selected variables
220 | car.df <- mlba::ToyotaCorolla[1:1000, c(outcome, predictors)]
221 |
222 | # partition data
223 | set.seed(1) # set seed for reproducing the partition
224 | idx <- createDataPartition(car.df$Price, p=0.6, list=FALSE)
225 | car.train.df <- car.df[idx, ]
226 | car.holdout.df <- car.df[-idx, ]
227 |
228 | # use method "anova" for a regression model
229 | cv.rt <- rpart(Price ~ ., data=car.train.df, method="anova",
230 | cp=0.00001, minsplit=5, xval=5)
231 |
232 | # prune by lower cp
233 | minErrorRow <- cv.rt$cptable[which.min(cv.rt$cptable[,"xerror"]), ]
234 | cutoff <- minErrorRow["xerror"] + minErrorRow["xstd"]
235 | best.cp <- cv.rt$cptable[cv.rt$cptable[,"xerror"] < cutoff,][1, "CP"]
236 |
237 | best.rt <- prune(cv.rt, cp=best.cp)
238 |
239 | # set digits to a negative number to avoid scientific notation
240 | rpart.plot(best.rt, extra=1, fallen.leaves=FALSE, digits=-4)
241 |
242 | pdf(file.path("..", "figures", "chapter_09", "RT-ToyotaTree.pdf"), width=7, height=4)
243 | rpart.plot(best.rt, extra=1, fallen.leaves=FALSE, digits=-4)
244 | dev.off()
245 |
246 | ## Improving Prediction: Random Forests and Boosted Trees
247 | ### Random Forests
248 |
249 | library(randomForest)
250 | ## random forest
251 | rf <- randomForest(Personal.Loan ~ ., data=train.df, ntree=500,
252 | mtry=4, nodesize=5, importance=TRUE)
253 |
254 | ## variable importance plot
255 | varImpPlot(rf, type=1)
256 |
257 | ## confusion matrix
258 | rf.pred <- predict(rf, holdout.df)
259 | confusionMatrix(rf.pred, holdout.df$Personal.Loan)
260 |
261 |
262 | pdf(file.path("..", "figures", "chapter_09", "VarImp.pdf"), width=7, height=4)
263 | varImpPlot(rf, type=1, main="")
264 | dev.off()
265 |
266 | ### Boosted Trees
267 |
268 | library(caret)
269 | library(xgboost)
270 |
271 | xgb <- train(Personal.Loan ~ ., data=train.df, method="xgbTree", verbosity=0)
272 |
273 | # compare ROC curves for classification tree, random forest, and boosted tree models
274 | library(ROCR)
275 | rocCurveData <- function(model, data) {
276 | prob <- predict(model, data, type="prob")[, "Yes"]
277 | predob <- prediction(prob, data$Personal.Loan)
278 | perf
<- performance(predob, "tpr", "fpr") 279 | return (data.frame(tpr=perf@x.values[[1]], fpr=perf@y.values[[1]])) 280 | } 281 | 282 | performance.df <- rbind( 283 | cbind(rocCurveData(best.ct, holdout.df), model="Best-pruned tree"), 284 | cbind(rocCurveData(rf, holdout.df), model="Random forest"), 285 | cbind(rocCurveData(xgb, holdout.df), model="xgboost") 286 | ) 287 | colors <- c("Best-pruned tree"="grey", "Random forest"="blue", "xgboost"="tomato") 288 | ggplot(performance.df, aes(x=tpr, y=fpr, color=model)) + 289 | geom_line() + 290 | scale_color_manual(values=colors) + 291 | geom_segment(aes(x=0, y=0, xend=1, yend=1), color="grey", linetype="dashed") + 292 | labs(x="1 - Specificity", y="Sensitivity", color="Model") 293 | 294 | 295 | library(gridExtra) 296 | g <- last_plot() + theme_bw() 297 | g1 <- g + guides(color="none") 298 | g2 <- g + scale_x_continuous(limits=c(0, 0.2)) + scale_y_continuous(limits=c(0.8, 1.0)) 299 | 300 | g <- arrangeGrob(g1, g2, widths=c(3, 4.5), ncol=2) 301 | ggsave(file=file.path("..", "figures", "chapter_09", "xgboost-ROC-1.pdf"), 302 | g, width=8, height=3, units="in") 303 | 304 | 305 | xgb.focused <- train(Personal.Loan ~ ., data=train.df, 306 | method="xgbTree", verbosity=0, 307 | scale_pos_weight=10) 308 | 309 | performance.df <- rbind( 310 | cbind(rocCurveData(xgb, holdout.df), model="xgboost"), 311 | cbind(rocCurveData(xgb.focused, holdout.df), model="xgboost (focused)") 312 | ) 313 | 314 | colors <- c("xgboost"="tomato", "xgboost (focused)"="darkgreen") 315 | ggplot(performance.df, aes(x=tpr, y=fpr, color=model)) + 316 | geom_line() + 317 | scale_color_manual(values=colors) + 318 | geom_segment(aes(x=0, y=0, xend=1, yend=1), color="grey", linetype="dashed") + 319 | labs(x="1 - Specificity", y="Sensitivity", color="Model") 320 | 321 | 322 | library(gridExtra) 323 | g <- last_plot() + theme_bw() 324 | g1 <- g + guides(color="none") 325 | g2 <- g + scale_x_continuous(limits=c(0, 0.2)) + scale_y_continuous(limits=c(0.8, 1.0)) 326 | 327 | g <- arrangeGrob(g1, g2, widths=c(3, 4.5), ncol=2) 328 | ggsave(file=file.path("..", "figures", "chapter_09", "xgboost-ROC-2.pdf"), 329 | g, width=8, height=3, units="in") 330 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 19 | 20 |
5 | **Machine Learning for Business Analytics:**
6 | **Concepts, Techniques, and Applications in R**
7 |
8 | by Galit Shmueli, Peter C. Bruce, Peter Gedeck, Inbal Yahav, Nitin R. Patel
9 |
10 | Publisher: Wiley; 2nd edition (February, 2023)
11 | ISBN: 978-1-118-83517-2
12 | Buy at Amazon or Wiley
13 |
14 |
15 |
16 |
17 |
18 |
21 |
22 | # Description
23 | Machine learning, also known as data mining or data analytics, is a fundamental part of data science. It is used by organizations in a wide variety of arenas to turn raw data into actionable information.
24 |
25 | *Machine Learning for Business Analytics: Concepts, Techniques, and Applications in R* provides a comprehensive introduction and an overview of this methodology. This best-selling textbook covers both statistical and machine learning algorithms for prediction, classification, visualization, dimension reduction, rule mining, recommendations, clustering, text mining, experimentation, and network analytics. Along with hands-on exercises and real-life case studies, it also discusses managerial and ethical issues for responsible use of machine learning techniques.
26 |
27 | # Source code, datasets, and instructors' material
28 | This repository contains:
29 |
30 | - `Rmd`: R code of individual chapters as
31 | [R markdown files](https://github.com/gedeck/mlba-R-code/tree/main/Rmd) -
32 | download all as [mlba-Rmd.zip](mlba-Rmd.zip)
33 | - `R`: R code of individual chapters as plain
34 | [R files](https://github.com/gedeck/mlba-R-code/tree/main/R) -
35 | download all as [mlba-R.zip](mlba-R.zip)
36 |
37 | The datasets are distributed using the [mlba](https://github.com/gedeck/mlba) package; see below for installation instructions.
38 | To find instructors' material, go to [www.dataminingbook.com](https://www.dataminingbook.com/book/r-2nd-edition-2022).
39 |
40 |
41 | # Installation of R packages used in the book
42 | R and most packages can be installed directly from [CRAN](https://cran.r-project.org/). Go there for instructions on how to install R and individual packages.
43 | The [RStudio IDE](https://posit.co/downloads/) is a popular development environment for working with R.
44 |
45 | ## MLBA
46 | The `mlba` package is available from [GitHub](https://github.com/gedeck/mlba). You can install this package using the following commands:
47 | ```
48 | if (!require(mlba)) {
49 | library(devtools)
50 | install_github("gedeck/mlba/mlba", force=TRUE)
51 | }
52 | ```
53 | Note that this requires the installation of the `devtools` package.
54 |
55 | ## DiscriMiner
56 | The `DiscriMiner` package is currently not available from CRAN. You can install it directly from GitHub as described in https://github.com/gastonstat/DiscriMiner:
57 | ```
58 | if (!require(DiscriMiner)) {
59 | library(devtools)
60 | install_github('DiscriMiner', username='gastonstat')
61 | }
62 | ```
63 |
64 | # Setting up an environment for deep learning applications
65 | In order to run the code for the deep learning applications, you will need to create a Python environment with the required packages. A convenient way to do this is to use [Anaconda](https://www.anaconda.com/products/individual). See [installPython.md](installPython.md) for instructions on how to install Anaconda and create the required environment.
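Once the environment is created, you can check from R that it is picked up correctly. A minimal check along these lines should work, assuming the conda environment is named `mlba-r` as in the chapter code:
```
library(reticulate)
use_condaenv("mlba-r")            # conda environment created for the book
library(keras)
library(tensorflow)
tf$constant("Hello, TensorFlow")  # prints a tensor if the setup is working
```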
-------------------------------------------------------------------------------- /Rmd/chap1.Rmd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gedeck/mlba-R-code/c6909650d140f4f0e4da6987f1f97daf5786ea20/Rmd/chap1.Rmd -------------------------------------------------------------------------------- /Rmd/chap11.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Machine Learning for Business Analytics" 3 | author: "Chapter 11: Neural Nets" 4 | output: 5 | pdf_document: 6 | toc: no 7 | highlight: tango 8 | # html_document: 9 | # toc: yes 10 | # toc_depth: 4 11 | # toc_float: yes 12 | --- 13 | 20 | ```{r global_options, include=FALSE} 21 | knitr::opts_chunk$set(error=TRUE, # Keep compiling upon error 22 | collapse=FALSE, # collapse by default 23 | echo=TRUE, # echo code by default 24 | comment = "#>", # change comment character 25 | fig.width = 5.5, # set figure width 26 | fig.align = "center",# set figure position 27 | out.width = "49%", # set width of displayed images 28 | warning=FALSE, # do not show R warnings 29 | message=FALSE, # do not show R messages 30 | row.print=25) 31 | ``` 32 | 33 | 34 | ```{r} 35 | if (!require(mlba)) { 36 | library(devtools) 37 | install_github("gedeck/mlba/mlba", force=TRUE) 38 | } 39 | ``` 40 | 41 | # Fitting a Network to Data 42 | ## Training the Model 43 | ### Back Propagation of Error 44 | ```{r} 45 | library(neuralnet) 46 | df <- mlba::TinyData 47 | df$Like <- df$Acceptance=="like" 48 | df$Dislike <- df$Acceptance=="dislike" 49 | set.seed(1) 50 | ``` 51 | 52 | ```{r} 53 | nn <- neuralnet(Like + Dislike ~ Salt + Fat, data = df, linear.output = F, hidden = 3) 54 | 55 | # display weights 56 | nn$weights 57 | 58 | # display predictions 59 | prediction(nn) 60 | 61 | # plot network 62 | plot(nn, rep="best") 63 | ``` 64 | 65 | ```{r} 66 | 67 | ``` 68 | 69 | ```{r} 70 | library(caret) 71 | predict <- compute(nn, data.frame(df$Salt, df$Fat)) 72 | predicted.class=apply(predict$net.result,1,which.max)-1 73 | confusionMatrix(factor(ifelse(predicted.class=="1", "dislike", "like")), 74 | factor(df$Acceptance)) 75 | ``` 76 | 77 | ```{r} 78 | 79 | ``` 80 | 81 | ## Example 2: Classifying Accident Severity 82 | ```{r} 83 | library(tidyverse) 84 | library(fastDummies) 85 | 86 | # convert SUR_COND and ALCHL_I to dummy variables (remove firest dummy) 87 | # convert outcome MAX_SEV_IR to dummy variables keeping all 88 | accidents.df <- mlba::AccidentsNN %>% 89 | dummy_cols(select_columns=c("ALCHL_I", "SUR_COND"), 90 | remove_selected_columns=TRUE, remove_first_dummy=TRUE) %>% 91 | dummy_cols(select_columns=c("MAX_SEV_IR"), 92 | remove_selected_columns=TRUE) 93 | 94 | # partition the data 95 | set.seed(1) 96 | idx <- createDataPartition(mlba::AccidentsNN$MAX_SEV_IR, p=0.6, list=FALSE) 97 | train.df <- accidents.df[idx, ] 98 | holdout.df <- accidents.df[-idx, ] 99 | train.actual <- mlba::AccidentsNN[idx, ]$MAX_SEV_IR 100 | holdout.actual <- mlba::AccidentsNN[-idx, ]$MAX_SEV_IR 101 | 102 | nn <- neuralnet(MAX_SEV_IR_0 + MAX_SEV_IR_1 + MAX_SEV_IR_2 ~ ., 103 | data=train.df, hidden=2) 104 | 105 | # predict the three outcome variables and assign class using maximum score 106 | pred.train <- predict(nn, train.df) 107 | class.train <- apply(pred.train, 1, which.max)-1 108 | confusionMatrix(factor(class.train), factor(train.actual)) 109 | 110 | pred.holdout <- predict(nn, holdout.df) 111 | class.holdout <- apply(pred.holdout, 1, which.max)-1 112 | 
confusionMatrix(factor(class.holdout), factor(holdout.actual)) 113 | ``` 114 | 115 | ```{r} 116 | # 1 not able to predict 2 0.8496 117 | # 2, 3 0.8697 118 | # 4, 5 0.8647 119 | ``` 120 | 121 | # Deep Learning 122 | ## Example: Classification of Fashion Images 123 | ```{r} 124 | # code to prepare a similar picture using R 125 | library(reticulate) 126 | use_condaenv('mlba-r') 127 | library(keras) 128 | 129 | fashion_mnist <- keras::dataset_fashion_mnist() 130 | clothes.labels <- c('Top', 'Trouser', 'Pullover', 'Dress', 'Coat', 131 | 'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Boot') 132 | x_train <- fashion_mnist$train$x 133 | y_train <- fashion_mnist$train$y 134 | 135 | rotate <- function(x) t(apply(x, 2, rev)) 136 | plot_image <- function(x, title = "", title.color = "black") { 137 | x <- rotate(x) 138 | image(x, axes=FALSE, col=grey(seq(0, 1, length = 256)), 139 | main=list(title, col=title.color)) 140 | } 141 | plot_sample <- function() { 142 | par(mfrow=c(4, 5), mar=c(0, 0.2, 1, 0.2)) 143 | for (offset in c(0, 5)) { 144 | range <- (1+offset):(5+offset) 145 | for (i in range) { 146 | examples <- which(y_train %in% (i-1)) 147 | example <- examples[1] 148 | plot_image(x_train[example, , ], clothes.labels[y_train[example] + 1]) 149 | } 150 | for (i in range) { 151 | examples <- which(y_train %in% (i-1)) 152 | example <- examples[2] 153 | plot_image(x_train[example, , ]) 154 | } 155 | } 156 | } 157 | plot_sample() 158 | 159 | pdf(file=file.path("..", "figures", "chapter_11", "fashion-mnist-sample.pdf"), width=5, height=4) 160 | plot_sample() 161 | dev.off() 162 | ``` 163 | 164 | ### Data Preprocessing 165 | ```{r} 166 | # load required packages 167 | # Keras and TensorFlow require a Python conda environment with these packages installed 168 | library(reticulate) 169 | use_condaenv('mlba-r') 170 | library(keras) 171 | library(tensorflow) 172 | 173 | # load the data and split into training and validation sets 174 | fashion_mnist <- keras::dataset_fashion_mnist() 175 | x_train <- fashion_mnist$train$x 176 | y_train <- fashion_mnist$train$y 177 | x_valid <- fashion_mnist$test$x 178 | y_valid <- fashion_mnist$test$y 179 | 180 | # pixel values need to be scaled to range [0, 1] 181 | x_train <- x_train / 255 182 | x_valid <- x_valid / 255 183 | 184 | # input require an additional dimension to describe pixel values 185 | # dimensions are (samples, row, column, pixel) 186 | x_train <- array_reshape(x_train, c(dim(x_train), 1)) 187 | x_valid <- array_reshape(x_valid, c(dim(x_valid), 1)) 188 | 189 | # output values need to be converted into a matrix with one-hot-encoding of classes 190 | # dimensions are (samples, classes) 191 | y_train <- to_categorical(y_train, 10) 192 | y_valid <- to_categorical(y_valid, 10) 193 | dim(x_train) 194 | dim(y_train) 195 | ``` 196 | 197 | ```{r, eval=FALSE} 198 | # Model definition (architecture taken from 199 | # https://keras.rstudio.com/articles/examples/mnist_cnn.html 200 | input_shape = dim(x_train)[2:4] 201 | num_classes <- 10 202 | 203 | model <- keras_model_sequential() 204 | model %>% 205 | layer_conv_2d(filters=32, kernel_size=c(5,5), activation='relu', 206 | input_shape=input_shape) %>% 207 | layer_conv_2d(filters=64, kernel_size=c(3,3), activation='relu') %>% 208 | layer_max_pooling_2d(pool_size=c(2,2)) %>% 209 | layer_dropout(rate=0.25) %>% 210 | layer_flatten() %>% 211 | layer_dense(units = 128, activation = 'relu') %>% 212 | layer_dropout(rate = 0.5) %>% 213 | layer_dense(units = num_classes, activation = 'softmax') 214 | 215 | model 216 | 217 | # compile model 
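# categorical crossentropy matches the one-hot-encoded labels created above;
# adadelta is the optimizer used in the Keras MNIST example this architecture
# is based on; accuracy is tracked on the validation data during training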
218 | model %>% compile( 219 | loss = loss_categorical_crossentropy, 220 | optimizer = optimizer_adadelta(), 221 | metrics = c('accuracy') 222 | ) 223 | 224 | # train and evaluate 225 | model %>% fit( 226 | x_train, y_train, 227 | batch_size = 128, 228 | epochs = 20, 229 | verbose = 1, 230 | validation_data = list(x_valid, y_valid) 231 | ) 232 | ``` 233 | 234 | ### Training a Deep Learning Network 235 | ```{r} 236 | library(gridExtra) 237 | df <- read.csv("cv-training.csv") 238 | g1 <- ggplot(df, aes(x=epoch)) + 239 | geom_line(aes(y=loss), color="steelblue") + 240 | geom_line(aes(y=val_loss), color="tomato") + 241 | geom_hline(yintercept=min(df$val_loss), color="black", linetype="dotted") + 242 | geom_vline(xintercept=which.min(df$val_loss), color="black", linetype="dotted") + 243 | labs(x="Epoch", y="Loss") + 244 | theme_bw() 245 | g2 <- ggplot(df, aes(x=epoch)) + 246 | geom_line(aes(y=accuracy), color="steelblue") + 247 | geom_line(aes(y=val_accuracy), color="tomato") + 248 | geom_hline(yintercept=max(df$val_accuracy), color="black", linetype="dotted") + 249 | labs(x="Epoch", y="Accuracy") + 250 | theme_bw() 251 | grid.arrange(g1, g2, ncol=2) 252 | 253 | 254 | g <- arrangeGrob(g1, g2, ncol=2) 255 | ggsave(file=file.path("..", "figures", "chapter_11", "fashion-mnist-learning.pdf"), 256 | g, width=5, height=2.5, units="in") 257 | ``` 258 | 259 | ### Applying the Predictive Model 260 | ```{r} 261 | model <- load_model_tf("cnn-model.tf") 262 | ``` 263 | 264 | ```{r} 265 | propensities <- predict(model, x_valid) 266 | propensities[1:5, ] 267 | 268 | # convert to class using winner takes all 269 | predClass <- apply(propensities, 1, which.max) 270 | predClass[1:5] 271 | 272 | # confusion matrix 273 | caret::confusionMatrix(factor(predClass), factor(fashion_mnist$test$y + 1)) 274 | ``` 275 | 276 | ```{r} 277 | 278 | ``` 279 | 280 | -------------------------------------------------------------------------------- /Rmd/chap12.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Machine Learning for Business Analytics" 3 | author: "Chapter 12: Discriminant Analysis" 4 | output: 5 | pdf_document: 6 | toc: no 7 | highlight: tango 8 | # html_document: 9 | # toc: yes 10 | # toc_depth: 4 11 | # toc_float: yes 12 | --- 13 | 20 | ```{r global_options, include=FALSE} 21 | knitr::opts_chunk$set(error=TRUE, # Keep compiling upon error 22 | collapse=FALSE, # collapse by default 23 | echo=TRUE, # echo code by default 24 | comment = "#>", # change comment character 25 | fig.width = 5.5, # set figure width 26 | fig.align = "center",# set figure position 27 | out.width = "49%", # set width of displayed images 28 | warning=FALSE, # do not show R warnings 29 | message=FALSE, # do not show R messages 30 | row.print=25) 31 | ``` 32 | 33 | 34 | ```{r} 35 | if (!require(mlba)) { 36 | library(devtools) 37 | install_github("gedeck/mlba/mlba", force=TRUE) 38 | } 39 | ``` 40 | 41 | # Introduction 42 | ## Example 1: Riding Mowers 43 | ```{r} 44 | library(ggplot2) 45 | mowers.df <- mlba::RidingMowers 46 | g <- ggplot(mowers.df, mapping=aes(x=Income, y=Lot_Size, color=Ownership, fill=Ownership)) + 47 | geom_point(size=4) + 48 | geom_abline(intercept=40, slope=-0.34) + 49 | scale_shape_manual(values = c(15, 21)) + 50 | scale_color_manual(values = c('darkorange', 'steelblue')) + 51 | scale_fill_manual(values = c('darkorange', 'lightblue')) 52 | 53 | g 54 | 55 | ggsave(file=file.path("..", "figures", "chapter_12", "riding-mower.pdf"), 56 | g + theme_bw(), width=6, height=4, 
units="in") 57 | ``` 58 | 59 | ## Example 2: Personal Loan Acceptance 60 | ```{r} 61 | library(gridExtra) 62 | makePlot <- function(df, title, alpha) { 63 | no_personal_loan <- subset(df, Personal.Loan == 0) 64 | personal_loan <- subset(df, Personal.Loan == 1) 65 | 66 | g <- ggplot(universal.df, aes(x=Income, y=CCAvg, color=Personal.Loan)) + 67 | geom_point(aes(color="nonacceptor"), data=no_personal_loan, alpha=alpha) + 68 | geom_point(aes(color="acceptor"), data=personal_loan) + 69 | labs(title=title, colour="Personal Loan", x='Annual income ($000s)', 70 | y='Monthly average credit card spending ($000s)') + 71 | scale_color_manual(values=c("lightblue", "steelblue"), 72 | guide=guide_legend(override.aes=list(size=3, alpha=1))) + 73 | scale_x_log10() + 74 | scale_y_log10() + 75 | theme_bw() 76 | 77 | return (g) 78 | } 79 | 80 | set.seed(1) 81 | universal.df <- mlba::UniversalBank 82 | idx <- sample(dim(universal.df)[1], 200) 83 | g1 <- makePlot(universal.df[idx, ], 'Sample of 200 customers', 1.0) + 84 | theme(legend.position = c(0.2, 0.85)) 85 | g2 <- makePlot(universal.df, 'All 5000 customers', 0.5) + 86 | guides(color="none") 87 | grid.arrange(g1, g2, ncol=2) 88 | 89 | g <- arrangeGrob(g1, g2, ncol=2) 90 | ggsave(file=file.path("..", "figures", "chapter_12", "personalLoan_sampled.pdf"), 91 | g, width=8, height=4, units="in") 92 | ``` 93 | 94 | # Fisher's Linear Classification Functions 95 | ```{r} 96 | library(caret) 97 | mowers.df <- mlba::RidingMowers 98 | trControl <- caret::trainControl(method='none') 99 | model <- train(Ownership ~ Income + Lot_Size, data=mowers.df, 100 | method='lda', trControl=trControl) 101 | model$finalModel # access the wrapped LDA model 102 | 103 | # DiscriMiner exposes the Fisher's linear classification function 104 | library(DiscriMiner) 105 | mowers.df <- mlba::RidingMowers 106 | da.mower <- linDA(mowers.df[,1:2], mowers.df[,3]) 107 | da.mower$functions 108 | ``` 109 | 110 | ```{r} 111 | da.mower <- linDA(mowers.df[,1:2], mowers.df[,3]) 112 | # compute propensities manually (below); or, use lda() in package MASS or caret with predict() 113 | propensity.owner <- exp(da.mower$scores[,2])/(exp(da.mower$scores[,1])+exp(da.mower$scores[,2])) 114 | data.frame(Actual=mowers.df$Ownership, Predicted=da.mower$classification, 115 | da.mower$scores, propensity.owner=propensity.owner) 116 | ``` 117 | 118 | ```{r} 119 | library(ggplot2) 120 | da.mower <- train(Ownership ~ Income + Lot_Size, data=mowers.df, 121 | method='lda', trControl=trControl) 122 | means <- colSums(da.mower$finalModel$means) / 2 123 | sIncome <- da.mower$finalModel$scaling['Income', 'LD1'] 124 | sLotSize <- da.mower$finalModel$scaling['Lot_Size', 'LD1'] 125 | m <- - sIncome / sLotSize 126 | y0 <- means['Lot_Size'] - m * means['Income'] 127 | 128 | mowers.df <- mlba::RidingMowers 129 | g <- ggplot(mowers.df, mapping=aes(x=Income, y=Lot_Size, color=Ownership, fill=Ownership)) + 130 | geom_point(size=4) + 131 | geom_point(data=data.frame(da.mower$finalModel$means), color='black', fill='black', shape=4, size=3) + 132 | geom_abline(aes(linetype='ad hoc line', intercept=40, slope=-0.34), color='darkgrey') + 133 | geom_abline(aes(linetype='LDA line', intercept=y0, slope=m)) + 134 | scale_shape_manual(values = c(15, 21)) + 135 | scale_color_manual(values = c('darkorange', 'steelblue')) + 136 | scale_fill_manual(values = c('darkorange', 'lightblue')) + 137 | scale_linetype_manual(name='Linetype', values=c(2, 1), labels=c('ad hoc line', 'LDA line')) + 138 | guides(fill = guide_legend(order = 1), color = 
guide_legend(order = 1), 139 | linetype = guide_legend(order = 2)) 140 | g 141 | ggsave(file=file.path("..", "figures", "chapter_12", "LDA-riding-mower.pdf"), 142 | g + theme_bw(), width=6, height=4, units="in") 143 | ``` 144 | 145 | # Prior Probabilities 146 | ```{r} 147 | trControl <- caret::trainControl(method='none') 148 | model <- train(Ownership ~ Income + Lot_Size, data=mowers.df, 149 | method='lda', trControl=trControl) 150 | model.prior <- train(Ownership ~ Income + Lot_Size, data=mowers.df, 151 | method='lda', prior=c(0.85, 0.15), 152 | trControl=trControl) 153 | 154 | family.13 <- mowers.df[13,] 155 | predict(model, family.13) 156 | predict(model.prior, family.13) 157 | ``` 158 | 159 | # Classifying More Than Two Classes 160 | ## Example 3: Medical Dispatch to Accident Scenes 161 | ```{r} 162 | library(DiscriMiner) 163 | library(caret) 164 | 165 | accidents.df <- mlba::Accidents 166 | lda.model <- linDA(accidents.df[,1:10], accidents.df[,11]) 167 | lda.model$functions 168 | confusionMatrix(as.factor(lda.model$classification), as.factor(accidents.df$MAX_SEV)) 169 | ``` 170 | 171 | ```{r} 172 | propensity <- exp(lda.model$scores[,1:3])/ 173 | (exp(lda.model$scores[,1])+exp(lda.model$scores[,2])+exp(lda.model$scores[,3])) 174 | 175 | res <- data.frame(Actual = accidents.df$MAX_SEV, 176 | Classification = lda.model$classification, 177 | Score = round(lda.model$scores,2), 178 | Propensity = round(propensity,2)) 179 | head(res) 180 | ``` 181 | 182 | ```{r} 183 | 184 | ``` 185 | 186 | ```{r} 187 | library(tidyverse) 188 | accidents.df <- mlba::Accidents %>% 189 | mutate(MAX_SEV = factor(MAX_SEV)) 190 | da.model <- train(MAX_SEV ~ ., data=accidents.df, method='lda') 191 | res <- data.frame(Actual=accidents.df$MAX_SEV, 192 | Classification=predict(da.model), 193 | Propensity=predict(da.model, type='prob') %>% round(2)) 194 | head(res) 195 | ``` 196 | 197 | ```{r} 198 | 199 | ``` 200 | 201 | -------------------------------------------------------------------------------- /Rmd/chap13.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Machine Learning for Business Analytics" 3 | author: "Chapter 13: Generating, Comparing, and Combining Multiple Models" 4 | output: 5 | pdf_document: 6 | toc: no 7 | highlight: tango 8 | # html_document: 9 | # toc: yes 10 | # toc_depth: 4 11 | # toc_float: yes 12 | --- 13 | 20 | ```{r global_options, include=FALSE} 21 | knitr::opts_chunk$set(error=TRUE, # Keep compiling upon error 22 | collapse=FALSE, # collapse by default 23 | echo=TRUE, # echo code by default 24 | comment = "#>", # change comment character 25 | fig.width = 5.5, # set figure width 26 | fig.align = "center",# set figure position 27 | out.width = "49%", # set width of displayed images 28 | warning=FALSE, # do not show R warnings 29 | message=FALSE, # do not show R messages 30 | row.print=25) 31 | ``` 32 | 33 | 34 | ```{r} 35 | if (!require(mlba)) { 36 | library(devtools) 37 | install_github("gedeck/mlba/mlba", force=TRUE) 38 | } 39 | ``` 40 | 41 | # Ensembles 42 | ## Bagging and Boosting in R 43 | ### Combining Propensities 44 | ```{r} 45 | library(tidyverse) 46 | library(adabag) 47 | library(rpart) 48 | library(caret) 49 | 50 | set.seed(1) 51 | # load and preprocess the data 52 | bank.df <- mlba::UniversalBank %>% 53 | select(-c(ID, ZIP.Code)) %>% 54 | mutate( 55 | Personal.Loan = factor(Personal.Loan, levels=c(0, 1), labels=c("No", "Yes")) 56 | ) 57 | 58 | # partition the data 59 | train.index <- sample(c(1:dim(bank.df)[1]), 
dim(bank.df)[1]*0.6) 60 | train.df <- bank.df[train.index, ] 61 | holdout.df <- bank.df[-train.index, ] 62 | 63 | # single tree (rpart) 64 | tr <- rpart(Personal.Loan ~ ., data=train.df) 65 | 66 | # bagging and boosting using adabag 67 | bag <- bagging(Personal.Loan ~ ., data=train.df) 68 | boost <- boosting(Personal.Loan ~ ., data=train.df) 69 | 70 | # bagging and boosting using randomForest and xgboost with parameter tuning 71 | bag.rf <- train(Personal.Loan ~ ., data=train.df, method="rf") 72 | boost.xgb <- train(Personal.Loan ~ ., data=train.df, method="xgbTree", verbosity=0) 73 | ``` 74 | 75 | ```{r} 76 | library(ROCR) 77 | rocCurveData <- function(prob, data) { 78 | predob <- prediction(prob, data$Personal.Loan) 79 | perf <- performance(predob, "tpr", "fpr") 80 | return (data.frame(tpr=perf@x.values[[1]], fpr=perf@y.values[[1]])) 81 | } 82 | performance.df <- rbind( 83 | cbind(rocCurveData(predict(tr, holdout.df, type="prob")[,"Yes"], holdout.df), model="Single tree"), 84 | cbind(rocCurveData(predict(bag, holdout.df)$prob[, 2], holdout.df), model="Bagging"), 85 | cbind(rocCurveData(predict(boost, holdout.df)$prob[, 2], holdout.df), model="Boosting") 86 | ) 87 | colors <- c("Single tree"="grey", "Bagging"="blue", "Boosting"="tomato") 88 | g1 <- ggplot(performance.df, aes(x=tpr, y=fpr, color=model)) + 89 | geom_line() + 90 | scale_color_manual(values=colors) + 91 | geom_segment(aes(x=0, y=0, xend=1, yend=1), color="grey", linetype="dashed") + 92 | labs(x="1 - Specificity", y="Sensitivity", color="Model") 93 | g1 94 | 95 | performance.df <- rbind( 96 | cbind(rocCurveData(predict(tr, holdout.df, type="prob")[,"Yes"], holdout.df), 97 | model="Single tree"), 98 | cbind(rocCurveData(predict(bag.rf, holdout.df, type="prob")[,"Yes"], holdout.df), 99 | model="Random forest"), 100 | cbind(rocCurveData(predict(boost.xgb, holdout.df, type="prob")[,"Yes"], holdout.df), 101 | model="xgboost") 102 | ) 103 | colors <- c("Single tree"="grey", "Random forest"="blue", "xgboost"="tomato") 104 | g2 <- ggplot(performance.df, aes(x=tpr, y=fpr, color=model)) + 105 | geom_line() + 106 | scale_color_manual(values=colors) + 107 | geom_segment(aes(x=0, y=0, xend=1, yend=1), color="grey", linetype="dashed") + 108 | labs(x="1 - Specificity", y="Sensitivity", color="Model") 109 | g2 110 | library(gridExtra) 111 | g <- arrangeGrob(g1 + theme_bw(), g2 + theme_bw(), ncol=2, widths=c(0.49, 0.51)) 112 | ggsave(file=file.path("..", "figures", "chapter_13", "bagging-boosting.pdf"), 113 | g, width=8, height=3, units="in") 114 | ``` 115 | 116 | # Automated Machine Learning (AutoML) 117 | ## AutoML: Explore and Clean Data 118 | ```{r} 119 | library(tidyverse) 120 | 121 | # load and preprocess the data 122 | bank.df <- mlba::UniversalBank %>% 123 | # Drop ID and zip code columns. 
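  # (ID is a record identifier and ZIP.Code a high-cardinality label;
  #  neither is used as a predictor here, so both are dropped before modeling)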
124 | select(-c(ID, ZIP.Code)) %>% 125 | # convert Personal.Loan to a factor with labels Yes and No 126 | mutate(Personal.Loan = factor(Personal.Loan, levels=c(0, 1), labels=c("No", "Yes"))) 127 | 128 | # partition the data 129 | set.seed(1) 130 | idx <- caret::createDataPartition(bank.df$Personal.Loan, p=0.6, list=FALSE) 131 | train.df <- bank.df[idx, ] 132 | holdout.df <- bank.df[-idx, ] 133 | ``` 134 | 135 | ```{r} 136 | library(h2o) 137 | 138 | # Start the H2O cluster (locally) 139 | h2o.init() 140 | 141 | train.h2o <- as.h2o(train.df) 142 | holdout.h2o <- as.h2o(holdout.df) 143 | ``` 144 | 145 | ```{r} 146 | 147 | ``` 148 | 149 | ## AutoML: Choose Features and Machine Learning Methods 150 | ```{r} 151 | # identify outcome and predictors 152 | y <- "Personal.Loan" 153 | x <- setdiff(names(train.df), y) 154 | 155 | # run AutoML for 20 base models 156 | aml <- h2o.automl(x=x, y=y, training_frame=train.h2o, 157 | max_models=20, exclude_algos=c("DeepLearning"), 158 | seed=1) 159 | aml.balanced <- h2o.automl(x=x, y=y, training_frame=train.h2o, 160 | max_models=20, exclude_algos=c("DeepLearning"), 161 | balance_classes=TRUE, 162 | seed=1) 163 | 164 | aml 165 | ``` 166 | 167 | ```{r} 168 | aml.balanced 169 | ``` 170 | 171 | ## AutoML: Evaluate Model Performance 172 | ```{r} 173 | h2o.confusionMatrix(aml@leader, holdout.h2o) 174 | h2o.confusionMatrix(aml.balanced@leader, holdout.h2o) 175 | ``` 176 | 177 | # Explaining Model Predictions 178 | ## Explaining Model Predictions: LIME 179 | ```{r} 180 | cases <- c('3055', '3358', # predicted Yes 181 | '2', '1178') # predicted No 182 | explainer <- lime::lime(train.df, aml@leader, bin_continuous=TRUE, quantile_bins=FALSE) 183 | explanations <- lime::explain(holdout.df[cases,], explainer, n_labels=1, n_features=8) 184 | 185 | lime::plot_features(explanations, ncol=2) 186 | ``` 187 | 188 | ```{r} 189 | pdf(file=file.path("..", "figures", "chapter_13", "lime-analysis.pdf"), 190 | width=7, height=6) 191 | lime::plot_features(explanations, ncol=2) 192 | dev.off() 193 | ``` 194 | 195 | -------------------------------------------------------------------------------- /Rmd/chap14.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Machine Learning for Business Analytics" 3 | author: "Chapter 14: Interventions: Experiments, Uplift Models, and Reinforcement Learning" 4 | output: 5 | pdf_document: 6 | toc: no 7 | highlight: tango 8 | # html_document: 9 | # toc: yes 10 | # toc_depth: 4 11 | # toc_float: yes 12 | --- 13 | 20 | ```{r global_options, include=FALSE} 21 | knitr::opts_chunk$set(error=TRUE, # Keep compiling upon error 22 | collapse=FALSE, # collapse by default 23 | echo=TRUE, # echo code by default 24 | comment = "#>", # change comment character 25 | fig.width = 5.5, # set figure width 26 | fig.align = "center",# set figure position 27 | out.width = "49%", # set width of displayed images 28 | warning=FALSE, # do not show R warnings 29 | message=FALSE, # do not show R messages 30 | row.print=25) 31 | ``` 32 | 33 | 34 | ```{r} 35 | if (!require(mlba)) { 36 | library(devtools) 37 | install_github("gedeck/mlba/mlba", force=TRUE) 38 | } 39 | ``` 40 | 41 | # A/B Testing 42 | ## The statistical test for comparing two groups (T-test) 43 | ```{r} 44 | pt(q=2.828, df=1998, lower.tail=FALSE) 45 | pt(q=2.626, df=1998, lower.tail=FALSE) 46 | ``` 47 | 48 | # Uplift (Persuasion) Modeling 49 | ## Computing Uplift with R 50 | ```{r} 51 | library(tidyverse) 52 | # load and preprocess the data 53 | predictors <- 
c("AGE", "NH_WHITE", "COMM_PT", "H_F1", "REG_DAYS", 54 | "PR_PELIG", "E_PELIG", "POLITICALC", "MESSAGE_A") 55 | outcome <- "MOVED_AD" 56 | voter.df <- mlba::VoterPersuasion %>% 57 | select(all_of(c(predictors, outcome))) 58 | 59 | 60 | set.seed(1) 61 | nrows <- dim(voter.df)[1] 62 | train.index <- sample(1:nrows, nrows * 0.6) 63 | train.df <- voter.df[train.index, ] 64 | holdout.df <- voter.df[-train.index, ] 65 | 66 | # build a random forest model using caret 67 | train_control <- caret::trainControl(method="none") 68 | model <- caret::train(MOVED_AD ~ ., data=train.df, 69 | trControl=train_control, 70 | method="rf") 71 | 72 | # calculating the uplift 73 | uplift_df <- data.frame(holdout.df) 74 | uplift_df$MESSAGE_A <- 1 75 | predTreatment <- predict(model, newdata=uplift_df, type="prob") 76 | uplift_df$MESSAGE_A <- 0 77 | predControl <- predict(model, newdata=uplift_df, type="prob") 78 | upliftResult <- data.frame( 79 | probMessage = predTreatment[, 1], 80 | probNoMessage = predControl[, 1] 81 | ) 82 | upliftResult$uplift <- upliftResult$probMessage - upliftResult$probNoMessage 83 | head(upliftResult) 84 | ``` 85 | 86 | ```{r} 87 | 88 | ``` 89 | 90 | # Reinforcement Learning 91 | ## Example of using a Contextual Multi-Arm Bandit for Movie Recommendations 92 | ```{r} 93 | library(tidyverse) 94 | library(mlba) 95 | library(contextual) 96 | library(data.table) 97 | library(splitstackshape) 98 | 99 | # preprocess movies data to create indicator variables for the different genres 100 | movies_dat <- as.data.table(mlba::MovieLensMovies) 101 | movies_dat <- splitstackshape::cSplit_e(movies_dat, "genres", sep="|", type="character", 102 | fill=0, drop=TRUE) 103 | movies_dat[[3]] <- NULL # deletes the third column 104 | 105 | ratings_dat <- as.data.table(mlba::MovieLensRatings) 106 | all_movies <- ratings_dat[movies_dat, on=c(movieId="movieId")] 107 | all_movies <- na.omit(all_movies, cols=c("movieId", "userId")) 108 | # renumber userId to sequential numbers starting at 1 109 | all_movies[, userId := as.numeric(as.factor(userId))] 110 | 111 | # find the top-50 most frequently rated movies 112 | top_50 <- all_movies %>% 113 | count(movieId) %>% 114 | slice_max(n, n=50) %>% 115 | pull(movieId) 116 | top_50_movies <- all_movies[movieId %in% top_50] 117 | # renumber movieId to sequential numbers starting at 1 118 | top_50_movies[, movieId := as.numeric(as.factor(movieId))] 119 | 120 | # create profile of genres for each movie in the top-50 (arm_features) 121 | arm_features <- top_50_movies %>% 122 | select(-c(userId, rating, timestamp, title)) %>% 123 | # select one row for each movieId 124 | group_by(movieId) %>% slice(1) %>% ungroup() 125 | 126 | # for each user, create their profile of genre preferences based on 127 | # their viewed movies that are not in the top-50 (user_features) 128 | user_features <- all_movies %>% 129 | filter(! 
movieId %in% top_50) %>% # restrict to movies not in the top-50 130 | select(-c(movieId, rating, timestamp, title)) %>% 131 | # for each user, sum 132 | group_by(userId) %>% 133 | summarise_all(sum) %>% 134 | # normalize user profile 135 | group_by(userId) %>% 136 | mutate( 137 | total = sum(c_across(genres_Action:genres_Western)), 138 | across(genres_Action:genres_Western, ~ ./total) 139 | ) %>% 140 | select(-c(total)) %>% 141 | as.data.table() 142 | 143 | 144 | # add users who only rated top-50 movies 145 | # their genre preference profile is set to 0 for all genres 146 | all_users <- as.data.table(unique(all_movies$userId)) 147 | user_features <- user_features[all_users, on=c(userId="V1")] 148 | user_features[is.na(user_features)] <- 0 149 | setorder(user_features, userId) 150 | ``` 151 | 152 | ```{r} 153 | 154 | ``` 155 | 156 | ```{r} 157 | # prepare the data for use with the contextual package 158 | top_50_movies[, t := .I] 159 | top_50_movies[, sim := 1] 160 | top_50_movies[, agent := "Offline"] 161 | top_50_movies[, choice := movieId] 162 | top_50_movies[, reward := ifelse(rating <= 4, 0, 1)] 163 | setorder(top_50_movies,timestamp, title) 164 | 165 | # the bandit samples users with their genre preferences (user_features), 166 | # movie choices (choice), and ratings. 167 | # each movie is characterized by the genre profile (arm_features) 168 | # these data are used to train the agent 169 | environment <- OfflineLookupReplayEvaluatorBandit$new( 170 | top_50_movies, 171 | k = 50, 172 | unique_col = "userId", 173 | unique_lookup = user_features, 174 | shared_lookup = arm_features) 175 | 176 | # define list of strategies to evaluate 177 | agents <-list( 178 | Agent$new(RandomPolicy$new(), environment, "Random"), 179 | Agent$new(LinUCBDisjointOptimizedPolicy$new(2.1), environment, "LinUCB Dis")) 180 | 181 | # setup and run simulation 182 | simulation <- Simulator$new( 183 | agents = agents, 184 | simulations = 20, 185 | horizon = 10000L, 186 | save_interval = 1) 187 | results <- simulation$run() 188 | 189 | plot(results, type="cumulative", regret=FALSE, rate=TRUE, 190 | legend_position="topleft", disp="sd") 191 | ``` 192 | 193 | ```{r} 194 | pdf(file=file.path("..", "figures", "chapter_14", "mab-movielens.pdf"), width=6, height=4) 195 | plot(results, type="cumulative", regret=FALSE, rate=TRUE, 196 | legend_position="topleft", disp="sd") 197 | dev.off() 198 | ``` 199 | 200 | -------------------------------------------------------------------------------- /Rmd/chap15.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Machine Learning for Business Analytics" 3 | author: "Chapter 15: Association Rules and Collaborative Filtering" 4 | output: 5 | pdf_document: 6 | toc: no 7 | highlight: tango 8 | # html_document: 9 | # toc: yes 10 | # toc_depth: 4 11 | # toc_float: yes 12 | --- 13 | 20 | ```{r global_options, include=FALSE} 21 | knitr::opts_chunk$set(error=TRUE, # Keep compiling upon error 22 | collapse=FALSE, # collapse by default 23 | echo=TRUE, # echo code by default 24 | comment = "#>", # change comment character 25 | fig.width = 5.5, # set figure width 26 | fig.align = "center",# set figure position 27 | out.width = "49%", # set width of displayed images 28 | warning=FALSE, # do not show R warnings 29 | message=FALSE, # do not show R messages 30 | row.print=25) 31 | ``` 32 | 33 | 34 | ```{r} 35 | if (!require(mlba)) { 36 | library(devtools) 37 | install_github("gedeck/mlba/mlba", force=TRUE) 38 | } 39 | ``` 40 | 41 | # Association 
Rules 42 | ## The Process of Rule Selection 43 | ### Lift 44 | ```{r} 45 | library(arules) 46 | fp.df <- mlba::Faceplate 47 | 48 | # remove first column and convert to matrix 49 | fp.mat <- as.matrix(fp.df[, -1]) 50 | 51 | # convert the binary incidence matrix into a transactions database 52 | fp.trans <- as(fp.mat, "transactions") 53 | inspect(fp.trans) 54 | 55 | ## get rules 56 | # when running apriori(), include the minimum support, minimum confidence, and target 57 | # as arguments. 58 | rules <- apriori(fp.trans, parameter = list(supp = 0.2, conf = 0.5, target = "rules")) 59 | 60 | # inspect the first six rules, sorted by their lift 61 | inspect(head(sort(rules, by = "lift"), n = 6)) 62 | ``` 63 | 64 | ## Example 2: Rules for Similar Book Purchases 65 | ```{r} 66 | all.books.df <- mlba::CharlesBookClub 67 | 68 | # create a binary incidence matrix 69 | count.books.df <- all.books.df[, 8:18] 70 | incid.books.mat <- as.matrix(count.books.df > 0) 71 | 72 | # convert the binary incidence matrix into a transactions database 73 | books.trans <- as(incid.books.mat, "transactions") 74 | inspect(books.trans[1:10]) 75 | 76 | # plot data 77 | itemFrequencyPlot(books.trans) 78 | 79 | # run apriori function 80 | rules <- apriori(books.trans, 81 | parameter = list(supp= 200/4000, conf = 0.5, target = "rules")) 82 | 83 | # inspect top-30 rules sorted by lift 84 | inspect(head(sort(rules, by = "lift"), n=30)) 85 | ``` 86 | 87 | # Collaborative Filtering 88 | ## Example 4: Predicting Movie Ratings with MovieLens Data 89 | ```{r} 90 | library(recommenderlab) 91 | 92 | # download MovieLens data 93 | ratings <- mlba::MovieLensRatings 94 | movies <- mlba::MovieLensMovies 95 | 96 | # convert ratings to rating matrix 97 | idxUserId <- sort(unique(ratings$userId)) 98 | idxMovieId <- sort(unique(ratings$movieId)) 99 | m <- matrix(NA, nrow=length(idxUserId), ncol=length(idxMovieId), 100 | dimnames=list( 101 | user=paste("u", 1:length(idxUserId), sep=''), 102 | item=movies$title[match(idxMovieId, movies$movieId)] 103 | )) 104 | for (i in 1:nrow(ratings)) { 105 | rating <- ratings[i,] 106 | irow <- match(rating$userId, idxUserId) 107 | icol <- match(rating$movieId, idxMovieId) 108 | m[irow, icol] <- rating$rating 109 | } 110 | ratingMatrix <- as(m, "realRatingMatrix") 111 | ``` 112 | 113 | ```{r} 114 | 115 | ``` 116 | 117 | ```{r} 118 | # UBCF model and prediction 119 | recommender <- Recommender(ratingMatrix[-1], method="UBCF") 120 | pred <- predict(recommender, ratingMatrix[1]) 121 | as(pred, 'list') 122 | 123 | # IBCF model and prediction 124 | recommender <- Recommender(ratingMatrix[-1], method="IBCF") 125 | pred <- predict(recommender, ratingMatrix[1]) 126 | as(pred, 'list') 127 | ``` 128 | 129 | ```{r} 130 | set.seed(1) 131 | e <- evaluationScheme(ratingMatrix, method="split", train=0.9, given=10) 132 | 133 | r1 <- Recommender(getData(e, "train"), "UBCF") 134 | r2 <- Recommender(getData(e, "train"), "IBCF") 135 | r3 <- Recommender(getData(e, "train"), "RANDOM") 136 | 137 | p1 <- predict(r1, getData(e, "known"), type="ratings") 138 | p2 <- predict(r2, getData(e, "known"), type="ratings") 139 | p3 <- predict(r3, getData(e, "known"), type="ratings") 140 | error <- rbind( 141 | UBCF = calcPredictionAccuracy(p1, getData(e, "unknown")), 142 | IBCF = calcPredictionAccuracy(p2, getData(e, "unknown")), 143 | RANDOM = calcPredictionAccuracy(p3, getData(e, "unknown")) 144 | ) 145 | error 146 | ``` 147 | 148 | -------------------------------------------------------------------------------- /Rmd/chap16.Rmd: 
-------------------------------------------------------------------------------- 1 | --- 2 | title: "Machine Learning for Business Analytics" 3 | author: "Chapter 16: Cluster Analysis" 4 | output: 5 | pdf_document: 6 | toc: no 7 | highlight: tango 8 | # html_document: 9 | # toc: yes 10 | # toc_depth: 4 11 | # toc_float: yes 12 | --- 13 | 20 | ```{r global_options, include=FALSE} 21 | knitr::opts_chunk$set(error=TRUE, # Keep compiling upon error 22 | collapse=FALSE, # collapse by default 23 | echo=TRUE, # echo code by default 24 | comment = "#>", # change comment character 25 | fig.width = 5.5, # set figure width 26 | fig.align = "center",# set figure position 27 | out.width = "49%", # set width of displayed images 28 | warning=FALSE, # do not show R warnings 29 | message=FALSE, # do not show R messages 30 | row.print=25) 31 | ``` 32 | 33 | 34 | ```{r} 35 | if (!require(mlba)) { 36 | library(devtools) 37 | install_github("gedeck/mlba/mlba", force=TRUE) 38 | } 39 | ``` 40 | 41 | # Measuring Distance Between Two Records 42 | ## Euclidean Distance 43 | ```{r} 44 | library(tidyverse) 45 | # load data and use Company column as row names 46 | utilities.df <- mlba::Utilities %>% 47 | column_to_rownames("Company") 48 | 49 | # compute Euclidean distance 50 | # (to compute other distance measures, change the value in the method argument) 51 | d <- dist(utilities.df, method = "euclidean") 52 | ``` 53 | 54 | ## Normalizing Numerical Variables 55 | ```{r} 56 | # normalize input variables 57 | utilities.df.norm <- scale(utilities.df) 58 | 59 | # compute normalized distance based on Sales and Fuel Cost 60 | d.norm <- dist(utilities.df.norm[,c("Sales","Fuel_Cost")], method="euclidean") 61 | ``` 62 | 63 | # Hierarchical (Agglomerative) Clustering 64 | ## Dendrograms: Displaying Clustering Process and Results 65 | ```{r} 66 | library(ggplot2) 67 | library(ggdendro) 68 | d.norm <- dist(utilities.df.norm, method="euclidean") 69 | 70 | # in hclust() set argument method \galit{to} 71 | # "ward.D", "single", "complete", "average", "median", or "centroid" 72 | hc1 <- hclust(d.norm, method="single") 73 | plot(hc1, hang=-1, ann=FALSE) # use baseR 74 | ggdendrogram(hc1) # use ggdendro package (shown in figure below) 75 | hc2 <- hclust(d.norm, method="average") 76 | plot(hc2, hang=-1, ann=FALSE) 77 | ggdendrogram(hc2) 78 | ``` 79 | 80 | ```{r} 81 | library(gridExtra) 82 | addCutline <- function(g, hc, ncluster) { 83 | heights <- rev(hc$height) 84 | cut_at <- 0.5 * (heights[ncluster] + heights[ncluster - 1]) 85 | return (g + geom_hline(yintercept=cut_at, color='red', linetype=2)) 86 | } 87 | g1 <- ggdendrogram(hc1) 88 | g2 <- ggdendrogram(hc2) 89 | grid.arrange(addCutline(g1, hc1, 6), addCutline(g2, hc2, 6), nrow=2) 90 | g <- arrangeGrob(addCutline(g1, hc1, 6), addCutline(g2, hc2, 6), nrow=2) 91 | ggsave(file=file.path("..", "figures", "chapter_16", "utilities-dendrograms.pdf"), 92 | g, width=5, height=8, units="in") 93 | ``` 94 | 95 | ```{r} 96 | memb <- cutree(hc1, k = 6) 97 | memb 98 | memb <- cutree(hc2, k = 6) 99 | memb 100 | ``` 101 | 102 | ## Validating Clusters 103 | ```{r} 104 | # set labels as cluster membership and utility name 105 | row.names(utilities.df.norm) <- paste(memb, ": ", row.names(utilities.df), sep = "") 106 | 107 | # plot heatmap 108 | heatmap(utilities.df.norm, Colv=NA, hclustfun=hclust) 109 | ``` 110 | 111 | ```{r} 112 | # grey scale 113 | # rev() reverses the color mapping to large = dark 114 | heatmap(as.matrix(utilities.df.norm), Colv = NA, hclustfun = hclust, 115 | 
col=rev(paste("gray",1:99,sep=""))) 116 | 117 | pdf(file=file.path("..", "figures", "chapter_16", "utilities-heatmap.pdf"), 118 | width=5, height=5) 119 | heatmap(utilities.df.norm, Colv=NA, hclustfun=hclust) 120 | dev.off() 121 | ``` 122 | 123 | # Non-hierarchical Clustering: The 124 | ## Choosing the Number of Clusters ($k$) 125 | ```{r} 126 | set.seed(123) # set random seed for reproducability 127 | # load and preprocess data 128 | utilities.df <- mlba::Utilities %>% 129 | column_to_rownames("Company") 130 | 131 | # normalized distance: 132 | utilities.df.norm <- scale(utilities.df) 133 | 134 | # run kmeans algorithm 135 | km <- kmeans(utilities.df.norm, 6) 136 | 137 | # show cluster membership 138 | sort(km$cluster) 139 | ``` 140 | 141 | ```{r} 142 | # centroids 143 | km$centers 144 | # within-cluster sum of squares 145 | km$withinss 146 | # cluster size 147 | km$size 148 | ``` 149 | 150 | ```{r} 151 | library(GGally) 152 | centroids <- data.frame(km$centers) 153 | centroids['Cluster'] = paste('Cluster', seq(1, 6)) 154 | 155 | ggparcoord(centroids, columns=1:8, groupColumn='Cluster', showPoints=TRUE) + 156 | scale_color_viridis_d() + 157 | labs(x='Variable', y='Value') 158 | ``` 159 | 160 | ```{r} 161 | ggsave(file=file.path("..", "figures", "chapter_16", "utilities-clusterProfile.pdf"), 162 | last_plot() + theme_bw(), width=8.5, height=3.2, units="in") 163 | ``` 164 | 165 | ```{r} 166 | result <- tibble() 167 | for (k in 1:6) { 168 | km <- kmeans(utilities.df.norm, k) 169 | result <- bind_rows(result, tibble(k=k, average_withinss=mean(km$withinss))) 170 | } 171 | 172 | ggplot(result, aes(x=k, y=average_withinss)) + 173 | geom_line() + 174 | geom_point() + 175 | labs(y="Average within-cluster squared distance", 176 | x=expression(paste("Number of clusters ", italic("k")))) + 177 | theme_bw() 178 | ggsave(file=file.path("..", "figures", "chapter_16", "utilities-ellbow.pdf"), 179 | last_plot(), width=4, height=4, units="in") 180 | ``` 181 | 182 | ```{r} 183 | dist(km$centers) 184 | ``` 185 | 186 | -------------------------------------------------------------------------------- /Rmd/chap17.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Machine Learning for Business Analytics" 3 | author: "Chapter 17: Handling Time Series" 4 | output: 5 | pdf_document: 6 | toc: no 7 | highlight: tango 8 | # html_document: 9 | # toc: yes 10 | # toc_depth: 4 11 | # toc_float: yes 12 | --- 13 | 20 | ```{r global_options, include=FALSE} 21 | knitr::opts_chunk$set(error=TRUE, # Keep compiling upon error 22 | collapse=FALSE, # collapse by default 23 | echo=TRUE, # echo code by default 24 | comment = "#>", # change comment character 25 | fig.width = 5.5, # set figure width 26 | fig.align = "center",# set figure position 27 | out.width = "49%", # set width of displayed images 28 | warning=FALSE, # do not show R warnings 29 | message=FALSE, # do not show R messages 30 | row.print=25) 31 | ``` 32 | 33 | 34 | ```{r} 35 | if (!require(mlba)) { 36 | library(devtools) 37 | install_github("gedeck/mlba/mlba", force=TRUE) 38 | } 39 | ``` 40 | 41 | # Time Series Components 42 | ## Example: Ridership on Amtrak Trains 43 | ```{r} 44 | library(forecast) 45 | library(ggplot2) 46 | Amtrak.data <- mlba::Amtrak 47 | 48 | # create time series object using ts() 49 | # ts() takes three arguments: start, end, and freq. 50 | # with monthly data, the frequency of periods per cycle is 12 (per year). 
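# the same logic applies to other granularities, e.g., freq = 4 for quarterly
# data or freq = 52 for weekly data.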
51 | # arguments start and end are (cycle [=year] number, seasonal period [=month] number) pairs. 52 | # here start is Jan 1991: start = c(1991, 1); end is Mar 2004: end = c(2004, 3). 53 | ridership.ts <- ts(Amtrak.data$Ridership, 54 | start = c(1991, 1), end = c(2004, 3), freq = 12) 55 | 56 | # plot the series using the autoplot function to make use of ggplot 57 | autoplot(ridership.ts, xlab="Time", ylab="Ridership (in 000s)") + 58 | scale_y_continuous(limits=c(1300, 2300)) 59 | ``` 60 | 61 | ```{r} 62 | g <- last_plot() + 63 | scale_x_continuous(n.breaks=10) + 64 | theme_bw() 65 | ggsave(file=file.path("..", "figures", "chapter_17", "AmtrakFirstPlot.pdf"), 66 | g, width=6, height=3, units="in") 67 | ``` 68 | 69 | ```{r} 70 | library(gridExtra) 71 | library(lubridate) 72 | library(zoo) 73 | 74 | BareggTunnel <- mlba::BareggTunnel 75 | # convert Day information to a dates object 76 | dates <- as.POSIXct(BareggTunnel$Day, format='%d %b %Y') 77 | tunnel.ts <- ts(BareggTunnel$Number.of.vehicles, 78 | start=c(2003, yday(dates[1])), 79 | frequency=365) 80 | 81 | options(scipen=999) 82 | g1 <- autoplot(tunnel.ts, xlab="Time", ylab="Number of vehicles") + 83 | scale_x_yearmon() + 84 | scale_y_continuous(labels = scales::comma) 85 | g2 <- autoplot(window(tunnel.ts, 86 | start=c(2004, yday(ISOdate(2004, 2, 1))), 87 | end=c(2004, yday(ISOdate(2004, 6, 1)))), 88 | xlab="Time", ylab="Number of vehicles") + 89 | scale_x_yearmon() + 90 | scale_y_continuous(labels = scales::comma) 91 | 92 | grid.arrange(g1, g2, nrow=2) 93 | g <- arrangeGrob(g1 + theme_bw(), g2 + theme_bw()) 94 | ggsave(file=file.path("..", "figures", "chapter_17", "TS-TunnelPlots.pdf"), g, 95 | width=6, height=4) 96 | ``` 97 | 98 | ```{r} 99 | library(gridExtra) 100 | 101 | # to zoom in to a certain period, use window() to create a new, shorter time series 102 | # we create a new, 3-year time series of ridership.ts from Jan 1997 to Dec 1999 103 | ridership.ts.3yrs <- window(ridership.ts, start = c(1997, 1), end = c(1999, 12)) 104 | g1 <- autoplot(ridership.ts.3yrs, xlab="Time", ylab="Ridership (in 000s)") + 105 | scale_y_continuous(limits=c(1300, 2300)) 106 | 107 | # fit a trend line to the time series 108 | g2 <- autoplot(ridership.ts, xlab="Time", ylab="Ridership (in 000s)") + 109 | scale_y_continuous(limits=c(1300, 2300)) + 110 | geom_smooth(method="lm", formula=y~poly(x, 2)) 111 | 112 | grid.arrange(g1, g2, nrow=2) 113 | ``` 114 | 115 | ```{r} 116 | g <- arrangeGrob(g1 + theme_bw(), 117 | g2 + scale_x_continuous(n.breaks=10) + theme_bw()) 118 | ggsave(file=file.path("..", "figures", "chapter_17", "AmtrakZoomPlots.pdf"), g, 119 | width=6, height=6) 120 | 121 | # we can also use tslm to create the quadratic fit 122 | ridership.lm <- tslm(ridership.ts ~ trend + I(trend^2)) 123 | autoplot(ridership.ts, xlab="Time", ylab="Ridership (in 000s)") + 124 | scale_y_continuous(limits=c(1300, 2300)) + 125 | autolayer(ridership.lm$fitted.values) 126 | ``` 127 | 128 | # Data Partitioning and Performance Evaluation 129 | ## Benchmark Performance: Naive Forecasts 130 | ```{r} 131 | nTest <- 36 132 | nTrain <- length(ridership.ts) - nTest 133 | 134 | # partition the data 135 | train.ts <- window(ridership.ts, start = c(1991, 1), end = c(1991, nTrain)) 136 | test.ts <- window(ridership.ts, start = c(1991, nTrain + 1), 137 | end = c(1991, nTrain + nTest)) 138 | 139 | # generate the naive and seasonal naive forecasts 140 | naive.pred <- naive(train.ts, h=nTest) 141 | snaive.pred <- snaive(train.ts, h=nTest) 142 | 143 | # compare the actual values and 
forecasts for both methods 144 | colData <- "steelblue"; colModel <- "tomato" 145 | autoplot(train.ts, xlab="Time", ylab="Ridership (in 000s$)", color=colData) + 146 | autolayer(test.ts, linetype=2, color=colData) + 147 | autolayer(naive.pred, PI=FALSE, color=colModel, size=0.75) + 148 | autolayer(snaive.pred, PI=FALSE, color=colModel, size=0.75) 149 | ``` 150 | 151 | ```{r} 152 | # for the book visualization add additional annotation 153 | delta <- 1/12 154 | date_t <- time(train.ts)[1] 155 | date_th <- time(test.ts)[1] - delta 156 | date_hf <- tail(time(test.ts), 1) + delta 157 | g <- last_plot() + 158 | geom_vline(xintercept=date_th, color="darkgrey") + geom_vline(xintercept=date_hf, color="darkgrey") + 159 | geom_segment(aes(x=date_t, xend=date_th-delta, y=2300, yend=2300), color="darkgrey") + 160 | geom_segment(aes(x=date_th+delta, xend=date_hf-delta, y=2300, yend=2300), color="darkgrey") + 161 | geom_segment(aes(x=date_hf+delta, xend=date_hf+2, y=2300, yend=2300), color="darkgrey") + 162 | geom_text(aes(x=(date_t+date_th)/2, y=2350, label='Training')) + 163 | geom_text(aes(x=(date_th+date_hf)/2, y=2350, label='Test')) + 164 | geom_text(aes(x=date_hf+1, y=2350, label='Future')) + 165 | scale_x_continuous(n.breaks=10) + 166 | theme_bw() 167 | ggsave(file=file.path("..", "figures", "chapter_17", "AmtrakNaive.pdf"), 168 | g, width=7, height=4.5) 169 | ``` 170 | 171 | ```{r} 172 | accuracy(naive.pred, test.ts) 173 | accuracy(snaive.pred, test.ts) 174 | ``` 175 | 176 | ## Generating Future Forecasts 177 | ```{r} 178 | ts <- ts(mlba::CanadianWorkHours$Hours, 179 | start = c(mlba::CanadianWorkHours$Year[1], 1), freq = 1) 180 | 181 | # plot the series using the autoplot function to make use of ggplot 182 | autoplot(ts, xlab="Year", ylab="Hours per week") 183 | ggsave(file=file.path("..", "figures", "chapter_17", "Exercise-CanadianWorkers.pdf"), 184 | last_plot() + theme_bw(), width=5, height=4, units="in") 185 | ``` 186 | 187 | -------------------------------------------------------------------------------- /Rmd/chap2.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Machine Learning for Business Analytics" 3 | author: "Chapter 2: Overview of the Machine Learning Process" 4 | output: 5 | pdf_document: 6 | toc: no 7 | highlight: tango 8 | # html_document: 9 | # toc: yes 10 | # toc_depth: 4 11 | # toc_float: yes 12 | --- 13 | 20 | ```{r global_options, include=FALSE} 21 | knitr::opts_chunk$set(error=TRUE, # Keep compiling upon error 22 | collapse=FALSE, # collapse by default 23 | echo=TRUE, # echo code by default 24 | comment = "#>", # change comment character 25 | fig.width = 5.5, # set figure width 26 | fig.align = "center",# set figure position 27 | out.width = "49%", # set width of displayed images 28 | warning=FALSE, # do not show R warnings 29 | message=FALSE, # do not show R messages 30 | row.print=25) 31 | ``` 32 | 33 | 34 | ```{r} 35 | if (!require(mlba)) { 36 | library(devtools) 37 | install_github("gedeck/mlba/mlba", force=TRUE) 38 | } 39 | options(scipen=999) 40 | ``` 41 | 42 | # Preliminary Steps 43 | ## Loading and Looking at the Data in R 44 | ```{r, eval=FALSE} 45 | housing.df <- read.csv('WestRoxbury.csv') # load data from file 46 | housing.df <- mlba::WestRoxbury # load data from mlba package 47 | dim(housing.df) # find the dimension of data frame 48 | head(housing.df) # show the first six rows 49 | View(housing.df) # show all the data in a new tab 50 | 51 | # Practice showing different subsets of the data 52 | 
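# base-R indexing follows the pattern data.frame[rows, columns];
# leaving either index empty keeps all rows or all columns, respectively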
housing.df[1:10, 1] # show the first 10 rows of the first column only 53 | housing.df[1:10, ] # show the first 10 rows of each of the columns 54 | housing.df[5, 1:10] # show the fifth row of the first 10 columns 55 | housing.df[5, c(1:2, 4, 8:10)] # show the fifth row of some columns 56 | housing.df[, 1] # show the whole first column 57 | housing.df$TOTAL.VALUE # a different way to show the whole first column 58 | housing.df$TOTAL.VALUE[1:10] # show the first 10 rows of the first column 59 | length(housing.df$TOTAL.VALUE) # find the length of the first column 60 | mean(housing.df$TOTAL.VALUE) # find the mean of the first column 61 | summary(housing.df) # find summary statistics for each column 62 | ``` 63 | 64 | ## Sampling from a Database 65 | ```{r} 66 | housing.df <- mlba::WestRoxbury 67 | 68 | # random sample of 5 observations 69 | s <- sample(row.names(housing.df), 5) 70 | housing.df[s,] 71 | 72 | # oversample houses with over 10 rooms 73 | s <- sample(row.names(housing.df), 5, prob=ifelse(housing.df$ROOMS>10, 0.9, 0.01)) 74 | housing.df[s,] 75 | 76 | # rebalance 77 | housing.df$REMODEL <- factor(housing.df$REMODEL) 78 | table(housing.df$REMODEL) 79 | upsampled.df <- caret::upSample(housing.df, housing.df$REMODEL, list=TRUE)$x 80 | table(upsampled.df$REMODEL) 81 | ``` 82 | 83 | ```{r} 84 | 85 | ``` 86 | 87 | ## Preprocessing and Cleaning the Data 88 | ### Types of Variables 89 | ```{r} 90 | library(tidyverse) 91 | 92 | # get overview 93 | str(housing.df) 94 | 95 | # make REMODEL a factor variable 96 | housing.df$REMODEL <- factor(housing.df$REMODEL) 97 | str(housing.df$REMODEL) 98 | levels(housing.df$REMODEL) # show factor's categories (levels) 99 | 100 | # use tidyverse to load and preprocess data in one statement 101 | # the %>% operator inserts the result of the expression on the left 102 | # as the first argument into the function on the right 103 | housing.df <- mlba::WestRoxbury %>% 104 | mutate(REMODEL=factor(REMODEL)) 105 | ``` 106 | 107 | ### Handling Categorical Variables 108 | ```{r} 109 | library(fastDummies) 110 | library(tidyverse) 111 | 112 | housing.df <- dummy_cols(mlba::WestRoxbury, 113 | remove_selected_columns=TRUE, # remove the original column 114 | remove_first_dummy=TRUE) # removes the first created dummy variable 115 | housing.df %>% head(2) 116 | ``` 117 | 118 | ### Missing Values 119 | ```{r} 120 | # To illustrate missing data procedures, we first convert a few entries for 121 | # BEDROOMS to NA's. Then we impute these missing values using the median of the 122 | # remaining values. 123 | rows.to.missing <- sample(row.names(housing.df), 10) 124 | housing.df[rows.to.missing,]$BEDROOMS <- NA 125 | summary(housing.df$BEDROOMS) 126 | # Now we have 10 NA's and the median of the remaining values is 3. 127 | 128 | # replace the missing values using the median of the remaining values 129 | # use median() with na.rm=TRUE to ignore missing values when computing the median. 130 | housing.df <- housing.df %>% 131 | replace_na(list(BEDROOMS=median(housing.df$BEDROOMS, na.rm=TRUE))) 132 | 133 | summary(housing.df$BEDROOMS) 134 | ``` 135 | 136 | # Predictive Power and Overfitting 137 | ## Creating and Using Data Partitions 138 | ### Holdout Partition 139 | ```{r} 140 | housing.df <- mlba::WestRoxbury %>% 141 | mutate(REMODEL=factor(REMODEL)) 142 | 143 | # use set.seed() to get the same partitions when re-running the R code. 
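# (the seed value itself is arbitrary; any fixed integer makes the random
# sampling below reproducible)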
144 | set.seed(1) 145 | 146 | ## partitioning into training (60%) and holdout (40%) 147 | # randomly sample 60% of the row IDs for training; the remaining 40% serve 148 | # as holdout 149 | train.rows <- sample(rownames(housing.df), nrow(housing.df)*0.6) 150 | # collect all the columns with training row ID into training set: 151 | train.df <- housing.df[train.rows, ] 152 | # assign row IDs that are not already in the training set, into holdout 153 | holdout.rows <- setdiff(rownames(housing.df), train.rows) 154 | holdout.df <- housing.df[holdout.rows, ] 155 | 156 | ## partitioning into training (50%), validation (30%), holdout (20%) 157 | # randomly sample 50% of the row IDs for training 158 | train.rows <- sample(rownames(housing.df), nrow(housing.df)*0.5) 159 | 160 | # sample 30% of the row IDs into the validation set, drawing only from records 161 | # not already in the training set 162 | # use setdiff() to find records not already in the training set 163 | valid.rows <- sample(setdiff(rownames(housing.df), train.rows), 164 | nrow(housing.df)*0.3) 165 | 166 | # assign the remaining 20% row IDs serve as holdout 167 | holdout.rows <- setdiff(rownames(housing.df), union(train.rows, valid.rows)) 168 | 169 | # create the 3 data frames by collecting all columns from the appropriate rows 170 | train.df <- housing.df[train.rows, ] 171 | valid.df <- housing.df[valid.rows, ] 172 | holdout.df <- housing.df[holdout.rows, ] 173 | 174 | ## partitioning into training (60%) and holdout (40%) using caret 175 | set.seed(1) 176 | idx <- caret::createDataPartition(housing.df$TOTAL.VALUE, p=0.6, list=FALSE) 177 | train.df <- housing.df[idx, ] 178 | holdout.df <- housing.df[-idx, ] 179 | ``` 180 | 181 | ```{r} 182 | 183 | ``` 184 | 185 | # Building a Predictive Model 186 | ## Modeling Process 187 | ### Cross-Validation 188 | ```{r} 189 | library(tidyverse) 190 | library(mlba) 191 | library(fastDummies) 192 | 193 | housing.df <- mlba::WestRoxbury %>% 194 | # remove rows with missing values 195 | drop_na() %>% 196 | # remove column TAX 197 | select(-TAX) %>% 198 | # make REMODEL a factor and convert to dummy variables 199 | mutate(REMODEL=factor(REMODEL)) %>% 200 | dummy_cols(select_columns=c('REMODEL'), 201 | remove_selected_columns=TRUE, remove_first_dummy=TRUE) 202 | ``` 203 | 204 | ```{r} 205 | set.seed(1) 206 | idx <- caret::createDataPartition(housing.df$TOTAL.VALUE, p=0.6, list=FALSE) 207 | train.df <- housing.df[idx, ] 208 | holdout.df <- housing.df[-idx, ] 209 | ``` 210 | 211 | ```{r} 212 | reg <- lm(TOTAL.VALUE ~ ., data=train.df) 213 | train.res <- data.frame(actual=train.df$TOTAL.VALUE, predicted=reg$fitted.values, 214 | residuals=reg$residuals) 215 | head(train.res) 216 | ``` 217 | 218 | ```{r} 219 | pred <- predict(reg, newdata=holdout.df) 220 | holdout.res <- data.frame(actual=holdout.df$TOTAL.VALUE, predicted=pred, 221 | residuals=holdout.df$TOTAL.VALUE - pred) 222 | head(holdout.res) 223 | ``` 224 | 225 | ```{r} 226 | library(caret) 227 | # compute metrics on training set 228 | data.frame( 229 | ME = round(mean(train.res$residuals), 5), 230 | RMSE = RMSE(pred=train.res$predicted, obs=train.res$actual), 231 | MAE = MAE(pred=train.res$predicted, obs=train.res$actual) 232 | ) 233 | 234 | # compute metrics on holdout set 235 | data.frame( 236 | ME = round(mean(holdout.res$residuals), 5), 237 | RMSE = RMSE(pred=holdout.res$predicted, obs=holdout.res$actual), 238 | MAE = MAE(pred=holdout.res$predicted, obs=holdout.res$actual) 239 | ) 240 | ``` 241 | 242 | ```{r} 243 | # For demonstration purposes, we 
construct the new.data from the original dataset 244 | housing.df <- mlba::WestRoxbury 245 | new.data <- housing.df[100:102, -1] %>% 246 | mutate(REMODEL=factor(REMODEL, levels=c("None", "Old", "Recent"))) %>% 247 | dummy_cols(select_columns=c('REMODEL'), 248 | remove_selected_columns=TRUE, remove_first_dummy=TRUE) 249 | new.data 250 | pred <- predict(reg, newdata = new.data) 251 | pred 252 | ``` 253 | 254 | -------------------------------------------------------------------------------- /Rmd/chap20.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Machine Learning for Business Analytics" 3 | author: "Chapter 20: Social Network Analytics" 4 | output: 5 | pdf_document: 6 | toc: no 7 | highlight: tango 8 | # html_document: 9 | # toc: yes 10 | # toc_depth: 4 11 | # toc_float: yes 12 | --- 13 | 20 | ```{r global_options, include=FALSE} 21 | knitr::opts_chunk$set(error=TRUE, # Keep compiling upon error 22 | collapse=FALSE, # collapse by default 23 | echo=TRUE, # echo code by default 24 | comment = "#>", # change comment character 25 | fig.width = 5.5, # set figure width 26 | fig.align = "center",# set figure position 27 | out.width = "49%", # set width of displayed images 28 | warning=FALSE, # do not show R warnings 29 | message=FALSE, # do not show R messages 30 | row.print=25) 31 | ``` 32 | 33 | 34 | ```{r} 35 | if (!require(mlba)) { 36 | library(devtools) 37 | install_github("gedeck/mlba/mlba", force=TRUE) 38 | } 39 | set.seed(1) 40 | ``` 41 | 42 | # Introduction 43 | ```{r} 44 | library(igraph) 45 | 46 | # define links in data 47 | edges <- rbind( 48 | c("Dave", "Jenny"), c("Peter", "Jenny"), c("John", "Jenny"), 49 | c("Dave", "Peter"), c("Dave", "John"), c("Peter", "Sam"), 50 | c("Sam", "Albert"), c("Peter", "John") 51 | ) 52 | 53 | # generate and plot network 54 | # set argument directed = FALSE in graph.edgelist() to plot an undirected network. 55 | g <- graph.edgelist(edges, directed = FALSE) 56 | plot(g, vertex.size = 5, vertex.label.dist = 2) 57 | ``` 58 | 59 | ```{r} 60 | pdf(file=file.path("..", "figures", "chapter_20", "fig1.pdf"), width=5, height=5) 61 | par(mar=c(0,0,0,1)+.1) 62 | plot(g, vertex.size = 5, vertex.label.dist = 2) 63 | dev.off() 64 | ``` 65 | 66 | # Directed vs. Undirected Networks 67 | ```{r} 68 | # generate and plot network 69 | # set argument directed = TRUE in graph.edgelist() to plot a directed network. 
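# each edge then points from the first to the second name in a row of the edge
# list, e.g., c("Dave", "Jenny") becomes the arrow Dave -> Jenny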
70 | g <- graph.edgelist(edges, directed = TRUE) 71 | plot(g, vertex.size = 5, vertex.label.dist = 2) 72 | ``` 73 | 74 | ```{r} 75 | pdf(file=file.path("..", "figures", "chapter_20", "fig2.pdf"), width=5, height=5) 76 | par(mar=c(0,0,0,1)+.5) 77 | plot(g, vertex.size = 5, vertex.label.dist=2) 78 | dev.off() 79 | ``` 80 | 81 | ```{r} 82 | edges <- rbind(c("A", "B"), c("B", "C"), c("C", "A")) 83 | g <- graph.edgelist(edges, directed = FALSE) 84 | E(g)$width <- c(20, 5, 5) 85 | plot(g, vertex.size = 5, vertex.label.dist = 2) 86 | 87 | pdf(file=file.path("..", "figures", "chapter_20", "fig3.pdf"), width=2, height=2) 88 | par(mar=c(0,0,0,0)+.2) 89 | plot(g, vertex.size = 5, vertex.label.dist = 2) 90 | dev.off() 91 | # $ 92 | ``` 93 | 94 | # Visualizing and Analyzing Networks 95 | ## Plot Layout 96 | ```{r} 97 | library(igraph) 98 | drug.df <- mlba::Drug 99 | 100 | # convert edges to edge list matrix 101 | edges <- as.matrix(drug.df[, c(1,2)]) 102 | g <- graph.edgelist(edges,directed=FALSE) 103 | 104 | # plot network 105 | # nodes' size is proportional to their eigenvector centrality 106 | plot(g, vertex.label = NA, vertex.size = eigen_centrality(g)$vector * 20) 107 | ``` 108 | 109 | ```{r} 110 | pdf(file=file.path("..", "figures", "chapter_20", "SNA_Drug_Laundry.pdf"), width=5, height=5) 111 | par(mar=c(0,0,0,0)+.1) 112 | plot(g, vertex.label = NA, vertex.size = eigen_centrality(g)$vector * 20) 113 | dev.off() 114 | ``` 115 | 116 | ```{r} 117 | edges <- rbind( 118 | c("Dave", "Jenny"), c("Peter", "Jenny"), c("John", "Jenny"), 119 | c("Dave", "Peter"), c("Dave", "John"), c("Peter", "Sam"), 120 | c("Sam", "Albert"), c("Peter", "John") 121 | ) 122 | g <- graph.edgelist(edges) 123 | pdf(file=file.path("..", "figures", "chapter_20", "fig5_circle.pdf"), width=3, height=3) 124 | par(mar=c(0,0,0,1)+.7) 125 | plot(g, layout = layout_in_circle, vertex.size = 5, vertex.label.dist = 2) 126 | dev.off() 127 | pdf(file=file.path("..", "figures", "chapter_20", "fig5_grid.pdf"), width=3, height=3) 128 | par(mar=c(0,0,0,1)+.7) 129 | plot(g, layout = layout_on_grid, vertex.size = 5, vertex.label.dist = 2) 130 | dev.off() 131 | ``` 132 | 133 | ```{r} 134 | # Building on the code presented in Figure 19.1 135 | plot(g, layout = layout_in_circle, vertex.size = 5, vertex.label.dist = 2) 136 | plot(g, layout = layout_on_grid, vertex.size = 5, vertex.label.dist = 2) 137 | ``` 138 | 139 | # Social Data Metrics and Taxonomy 140 | ## Node-Level Centrality Metrics 141 | ```{r} 142 | edges <- rbind( 143 | c("Dave", "Jenny"), c("Peter", "Jenny"), c("John", "Jenny"), 144 | c("Dave", "Peter"), c("Dave", "John"), c("Peter", "Sam"), 145 | c("Sam", "Albert"), c("Peter", "John") 146 | ) 147 | g <- graph.edgelist(edges, directed=FALSE) 148 | 149 | degree(g) 150 | betweenness(g) 151 | betweenness(g)/sum(betweenness(g)) 152 | closeness(g) 153 | eigen_centrality(g) 154 | ``` 155 | 156 | ## Egocentric Network 157 | ```{r} 158 | # get Peter's 1-level ego network 159 | # for a 2-level ego network set argument order = 2 in make_ego_graph(). 
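# make_ego_graph() returns a list with one subgraph per node listed in nodes,
# so Peter's ego network is extracted with [[1]] before plotting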
160 | peter.ego <- make_ego_graph(g, order = 1, nodes = "Peter") 161 | plot(peter.ego[[1]], vertex.size = 1, vertex.label.dist = 0.5) 162 | ``` 163 | 164 | ```{r} 165 | pdf(file=file.path("..", "figures", "chapter_20", "fig6_1.pdf"), width=5, height=5) 166 | par(mar=c(0,0,0,1)+.5) 167 | peter.ego <- make_ego_graph(g, order = 1, nodes = "Peter") 168 | g.ego <- peter.ego[[1]] 169 | V(g.ego)$color <- "orange" 170 | V(g.ego)["Peter"]$color <- "red" 171 | plot(g.ego, vertex.size = 5, vertex.label.dist = 2) 172 | dev.off() 173 | pdf(file=file.path("..", "figures", "chapter_20", "fig6_2.pdf"), width=5, height=5) 174 | par(mar=c(0,0,0,1)+.5) 175 | peter.ego <- make_ego_graph(g, order = 2, nodes = "Peter") 176 | g.ego <- peter.ego[[1]] 177 | V(g.ego)$color <- "orange" 178 | V(g.ego)["Peter"]$color <- "red" 179 | plot(g.ego, vertex.size = 5, vertex.label.dist = 2) 180 | dev.off() 181 | ``` 182 | 183 | ## Network Metrics 184 | ```{r} 185 | degree.distribution(g) # normalized 186 | edge_density(g) 187 | ``` 188 | 189 | # Collecting Social Network Data with R 190 | ## Collaborative Filtering 191 | ```{r, eval=FALSE} 192 | library(twitteR) 193 | # replace key and secret number with those you obtained from Twitter 194 | setup_twitter_oauth(consumer_key = "XXX", consumer_secret = "XXX", 195 | access_token = "XXX", access_secret = "XXX") 196 | 197 | # get recent tweets 198 | recent.25.tweets <- searchTwitter("text mining", resultType="recent", n = 25) 199 | ``` 200 | 201 | ```{r, eval=FALSE} 202 | library(Rfacebook) 203 | # replace the app id and secret number with those you obtained from Facebook 204 | fb_oauth <- fbOAuth(app_id = "XXX", app_secret = "XXX") 205 | fb_oauth_credentials <- fromJSON(names(fb_oauth$credentials)) 206 | 207 | # get recent posts on page "dataminingbook" 208 | fb_page <- getPage(page = "dataminingbook", token = fb_oauth_credentials$access_token) 209 | 210 | # a facebook page contains the following information: 211 | t(t(names(fb_page))) 212 | fb_page[1,] 213 | 214 | # get information about most recent post 215 | post <- getPost(post=fb_page$id[1], n=20, token=fb_oauth_credentials$access_token) 216 | 217 | post$likes 218 | post$comments 219 | ``` 220 | 221 | -------------------------------------------------------------------------------- /Rmd/chap21.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Machine Learning for Business Analytics" 3 | author: "Chapter 21: Text Mining" 4 | output: 5 | pdf_document: 6 | toc: no 7 | highlight: tango 8 | # html_document: 9 | # toc: yes 10 | # toc_depth: 4 11 | # toc_float: yes 12 | --- 13 | 20 | ```{r global_options, include=FALSE} 21 | knitr::opts_chunk$set(error=TRUE, # Keep compiling upon error 22 | collapse=FALSE, # collapse by default 23 | echo=TRUE, # echo code by default 24 | comment = "#>", # change comment character 25 | fig.width = 5.5, # set figure width 26 | fig.align = "center",# set figure position 27 | out.width = "49%", # set width of displayed images 28 | warning=FALSE, # do not show R warnings 29 | message=FALSE, # do not show R messages 30 | row.print=25) 31 | ``` 32 | 33 | 34 | ```{r} 35 | if (!require(mlba)) { 36 | library(devtools) 37 | install_github("gedeck/mlba/mlba", force=TRUE) 38 | } 39 | ``` 40 | 41 | # The Tabular Representation of Text: Term-Document Matrix and ``Bag-of-Words" 42 | ```{r} 43 | library(tm) 44 | 45 | # define vector of sentences ("docs") 46 | text <- c("this is the first sentence", 47 | "this is a second sentence", 48 | "the third sentence is 
here") 49 | 50 | # convert sentences into a corpus 51 | corp <- Corpus(VectorSource(text)) 52 | 53 | # compute term frequency 54 | tdm <- TermDocumentMatrix(corp) 55 | inspect(tdm) 56 | ``` 57 | 58 | # Preprocessing the Text 59 | ```{r} 60 | text <- c("this is the first sentence!!", 61 | "this is a second Sentence :)", 62 | "the third sentence, is here", 63 | "forth of all sentences") 64 | corp <- Corpus(VectorSource(text)) 65 | tdm <- TermDocumentMatrix(corp) 66 | inspect(tdm) 67 | ``` 68 | 69 | ## Tokenization 70 | ```{r} 71 | # tokenization 72 | corp <- tm_map(corp, stripWhitespace) 73 | corp <- tm_map(corp, removePunctuation) 74 | tdm <- TermDocumentMatrix(corp) 75 | inspect(tdm) 76 | ``` 77 | 78 | ## Text Reduction 79 | ```{r} 80 | stopwords('english') 81 | ``` 82 | 83 | ```{r} 84 | # stopwords 85 | library(SnowballC) 86 | corp <- tm_map(corp, removeWords, stopwords("english")) 87 | 88 | # stemming 89 | corp <- tm_map(corp, stemDocument) 90 | 91 | tdm <- TermDocumentMatrix(corp) 92 | inspect(tdm) 93 | ``` 94 | 95 | ## Term Frequency--Inverse Document Frequency (TF-IDF) 96 | ```{r} 97 | tfidf <- weightTfIdf(tdm) 98 | inspect(tfidf) 99 | ``` 100 | 101 | # Example: Online Discussions on Autos and Electronics 102 | ## Importing and Labeling the Records 103 | ```{r} 104 | library(tm) 105 | # step 1: import and label records 106 | # read zip file into a corpus 107 | corp <- Corpus(ZipSource(mlba::AutosElectronics, recursive = T)) 108 | 109 | # create an array of records labels 110 | label <- c(rep(1, 1000), rep(0, 1000)) 111 | 112 | # step 2: text preprocessing 113 | # tokenization 114 | corp <- tm_map(corp, stripWhitespace) 115 | corp <- tm_map(corp, removePunctuation) 116 | corp <- tm_map(corp, removeNumbers) 117 | 118 | # stopwords 119 | corp <- tm_map(corp, removeWords, stopwords("english")) 120 | 121 | # stemming 122 | corp <- tm_map(corp, stemDocument) 123 | 124 | # step 3: TF-IDF and latent semantic analysis 125 | # compute TF-IDF 126 | tdm <- TermDocumentMatrix(corp) 127 | tfidf <- weightTfIdf(tdm) 128 | 129 | # extract (20) concepts 130 | library(lsa) 131 | lsa.tfidf <- lsa(tfidf, dim = 20) 132 | 133 | # convert to data frame 134 | words.df <- as.data.frame(as.matrix(lsa.tfidf$dk)) 135 | ``` 136 | 137 | ## Fitting a Predictive Model 138 | ```{r} 139 | library(caret) 140 | 141 | # prepare training and holdout sets 142 | set.seed(1) 143 | df <- cbind(label=factor(label), words.df) 144 | idx <- caret::createDataPartition(df$label, p=0.6, list=FALSE) 145 | train.df <- df[idx, ] 146 | holdout.df <- df[-idx, ] 147 | 148 | # fit logistic regression 149 | logit.reg <- train(label ~ ., data=train.df, 150 | trControl=trainControl(method="none"), 151 | method="glm", family="binomial") 152 | 153 | # compute accuracy on holdout set 154 | pred <- predict(logit.reg, newdata=holdout.df) 155 | confusionMatrix(pred, holdout.df$label) 156 | ``` 157 | 158 | ```{r} 159 | library(gains) 160 | 161 | prob <- predict(logit.reg, newdata=holdout.df, type="prob")[,2] 162 | actual <- ifelse(holdout.df$label == 1, 1, 0) 163 | gain <- gains(actual, prob) 164 | barplot(gain$mean.resp/mean(actual), names.arg=seq(10, 100, by=10), 165 | xlab="Percentile", ylab="Decile mean / global mean") 166 | ``` 167 | 168 | ```{r} 169 | pdf(file=file.path("..", "figures", "chapter_21", "decileLiftClassification.pdf"), 170 | width=6, height=4) 171 | barplot(gain$mean.resp/mean(actual), names.arg=seq(10, 100, by=10), 172 | xlab="Percentile", ylab="Decile mean / global mean") 173 | dev.off() 174 | ``` 175 | 176 | # Example: 
Sentiment Analysis of Movie Reviews 177 | ## Data Loading, Preparation, and Partitioning 178 | ```{r} 179 | library(tidyverse) 180 | library(text2vec) 181 | 182 | # load and split data into training and holdout set 183 | data <- mlba::IMDBdataset10K %>% 184 | mutate( 185 | id = row_number(), 186 | sentiment = as.factor(sentiment) 187 | ) 188 | 189 | set.seed(1) 190 | trainIndex <- createDataPartition(data$sentiment, p=0.8, list=FALSE) 191 | data_train <- data[trainIndex, ] 192 | data_holdout <- data[-trainIndex, ] 193 | ``` 194 | 195 | ```{r} 196 | prep_fun <- tolower 197 | tok_fun <- word_tokenizer 198 | 199 | it_train <- itoken(data_train$review, ids=data_train$id, 200 | preprocessor=prep_fun, tokenizer=tok_fun) 201 | it_holdout <- itoken(data_holdout$review, ids=data_holdout$id, 202 | preprocessor=prep_fun, tokenizer=tok_fun) 203 | 204 | vocab <- create_vocabulary(it_train) 205 | vocab <- prune_vocabulary(vocab, term_count_min = 5L) 206 | vectorizer <- vocab_vectorizer(vocab) 207 | tcm_train <- create_tcm(it_train, vectorizer, skip_grams_window = 5L) 208 | ``` 209 | 210 | ```{r} 211 | 212 | ``` 213 | 214 | ## Generating and Applying pb 215 | ```{r} 216 | # determine word vectors 217 | glove <- GlobalVectors$new(rank=100, x_max=10) 218 | wv_main <- glove$fit_transform(tcm_train, n_iter=10, convergence_tol=0.01, n_threads=8) 219 | wv_context <- glove$components 220 | word_vectors <- wv_main + t(wv_context) 221 | ``` 222 | 223 | ```{r} 224 | 225 | ``` 226 | 227 | ```{r} 228 | dtm_train <- create_dtm(it_train, vectorizer) 229 | common_terms <- intersect(colnames(dtm_train), rownames(word_vectors) ) 230 | dtm_averaged <- normalize(dtm_train[, common_terms], "l1") 231 | sentence_vectors_train <- dtm_averaged %*% word_vectors[common_terms, ] 232 | 233 | dtm_holdout <- create_dtm(it_holdout, vectorizer) 234 | common_terms <- intersect(colnames(dtm_holdout), rownames(word_vectors) ) 235 | dtm_averaged <- normalize(dtm_holdout[, common_terms], "l1") 236 | sentence_vectors_holdout <- dtm_averaged %*% word_vectors[common_terms, ] 237 | ``` 238 | 239 | ## Fitting a Predictive Model 240 | ```{r} 241 | train.df <- as.data.frame(as.matrix(sentence_vectors_train)) 242 | train.df$sentiment <- data_train$sentiment 243 | 244 | trControl <- caret::trainControl(method="cv", number=5, allowParallel=TRUE) 245 | logit.reg <- caret::train(sentiment ~ ., data=train.df, trControl=trControl, 246 | # fit logistic regression with a generalized linear model 247 | method="glm", family="binomial") 248 | 249 | holdout.df <- as.data.frame(as.matrix(sentence_vectors_holdout)) 250 | holdout.df$sentiment <- data_holdout$sentiment 251 | 252 | caret::confusionMatrix(predict(logit.reg, holdout.df), holdout.df$sentiment) 253 | ``` 254 | 255 | ```{r} 256 | 257 | ``` 258 | 259 | ```{r} 260 | library(ROCR) 261 | prob <- predict(logit.reg, newdata=holdout.df, type="prob")$positive 262 | 263 | predob <- prediction(prob, holdout.df$sentiment) 264 | perf <- performance(predob, "tpr", "fpr") 265 | perf.df <- data.frame( 266 | tpr=perf@x.values[[1]], 267 | fpr=perf@y.values[[1]] 268 | ) 269 | ggplot(perf.df, aes(x=tpr, y=fpr)) + 270 | geom_line() + 271 | geom_segment(aes(x=0, y=0, xend=1, yend=1), color="grey", linetype="dashed") + 272 | labs(x="1 - Specificity", y="Sensitivity") 273 | ``` 274 | 275 | ```{r} 276 | ggsave(file=file.path("..", "figures", "chapter_21", "glove-ROC.pdf"), 277 | last_plot() + theme_bw()) 278 | ``` 279 | 280 | -------------------------------------------------------------------------------- /Rmd/chap22.Rmd: 
-------------------------------------------------------------------------------- 1 | --- 2 | title: "Machine Learning for Business Analytics" 3 | author: "Chapter 22: Responsible Data Science" 4 | output: 5 | pdf_document: 6 | toc: no 7 | highlight: tango 8 | # html_document: 9 | # toc: yes 10 | # toc_depth: 4 11 | # toc_float: yes 12 | --- 13 | 20 | ```{r global_options, include=FALSE} 21 | knitr::opts_chunk$set(error=TRUE, # Keep compiling upon error 22 | collapse=FALSE, # collapse by default 23 | echo=TRUE, # echo code by default 24 | comment = "#>", # change comment character 25 | fig.width = 5.5, # set figure width 26 | fig.align = "center",# set figure position 27 | out.width = "49%", # set width of displayed images 28 | warning=FALSE, # do not show R warnings 29 | message=FALSE, # do not show R messages 30 | row.print=25) 31 | ``` 32 | 33 | 34 | ```{r} 35 | if (!require(mlba)) { 36 | library(devtools) 37 | install_github("gedeck/mlba/mlba", force=TRUE) 38 | } 39 | ``` 40 | 41 | # Example: Applying the RDS Framework to the COMPAS Example 42 | ## Data Issues 43 | ```{r} 44 | library(caret) 45 | library(tidyverse) 46 | # load COMPAS data 47 | compas.df <- mlba::COMPAS_clean %>% 48 | select(-id) %>% 49 | mutate( 50 | age_cat = factor(age_cat, levels=c("Less than 25", "25 - 45", "Greater than 45")), 51 | c_charge_degree = factor(c_charge_degree, levels=c("F", "M")), 52 | race = factor(race, levels=c("African-American", "Asian", "Caucasian", "Hispanic", 53 | "Native American", "Other")), 54 | sex = factor(sex, levels=c("Female", "Male")), 55 | two_year_recid = factor(two_year_recid, levels=c(0, 1), labels=c("No", "Yes")) 56 | ) 57 | 58 | # split dataset and train models 59 | set.seed(1) 60 | idx <- createDataPartition(compas.df$two_year_recid, p=0.7, list=FALSE) 61 | train.df <- compas.df[idx, ] 62 | valid.df <- compas.df[-idx, ] 63 | ``` 64 | 65 | ```{r} 66 | trControl <- trainControl(method="cv", number=5, allowParallel=TRUE) 67 | # logistic regression model 68 | logreg.model <- train(two_year_recid ~ . - race, data=train.df, 69 | method="glm", family="binomial", trControl=trControl) 70 | # random forest model 71 | rf.model <- train(two_year_recid ~ . 
- race, data=train.df, 72 | method="rf", trControl=trControl) 73 | 74 | # extract coefficients and calculate odds shown in table 75 | logreg.coef <- coef(logreg.model$finalModel) 76 | data.frame( 77 | coefficient=logreg.coef, 78 | odds=c(NA, exp(logreg.coef)[-c(1)]) 79 | ) %>% round(3) 80 | ``` 81 | 82 | ```{r} 83 | # calculation of model accuracy based on cross-validation results 84 | caret::confusionMatrix(logreg.model) 85 | caret::confusionMatrix(rf.model) 86 | ``` 87 | 88 | ## Auditing the Model 89 | ```{r} 90 | library(ROCR) 91 | holdoutMetrics <- function(df, model) { 92 | result <- data.frame(obs = df$two_year_recid, pred = predict(model, newdata=df), 93 | prob = predict(model, newdata=df, type="prob")$Yes) 94 | pred <- prediction(result$prob, result$obs) 95 | # compute overall performance 96 | perf_AUC <- performance(pred, "auc") 97 | AUC <- perf_AUC@y.values[[1]] 98 | cm <- confusionMatrix(result$pred, result$obs, positive="Yes") 99 | return (tibble(AUC=AUC, Accuracy = cm$overall["Accuracy"], 100 | FPR = 100*(1-cm$byClass["Specificity"]), 101 | FNR = 100*(1-cm$byClass["Sensitivity"]))) 102 | } 103 | # compute performance by race 104 | metricsByRace <- function(model) { 105 | metrics <- tibble() 106 | for (raceValue in levels(compas.df$race)) { 107 | df <- compas.df %>% filter(race==raceValue) 108 | metrics <- bind_rows(metrics, tibble(race=raceValue, holdoutMetrics(df, model))) 109 | } 110 | return (metrics) 111 | } 112 | # combine metrics for logistic and random forest 113 | metrics <- bind_rows( 114 | tibble(Model="Random forest", metricsByRace(rf.model)), 115 | tibble(Model="Logistic regression", metricsByRace(logreg.model)) 116 | ) %>% filter(! race %in% c("Asian", "Native American")) 117 | ``` 118 | 119 | ```{r} 120 | library(gridExtra) 121 | makeBarchart <- function(metrics, aesthetics) { 122 | g <- ggplot(metrics, aesthetics) + 123 | geom_bar(position="dodge", stat="identity") + 124 | geom_text(hjust=1.5, position=position_dodge(.9)) + 125 | coord_flip() + 126 | scale_x_discrete(limits=rev) + 127 | labs(x="Race") + 128 | theme_bw() 129 | return (g) 130 | } 131 | g1 <- makeBarchart(metrics, aes(x=race, y=Accuracy, fill=Model, label=round(Accuracy, 3))) + 132 | theme(legend.position="none") 133 | g2 <- makeBarchart(metrics, aes(x=race, y=AUC, fill=Model, label=round(AUC, 3))) + 134 | theme(legend.position="bottom") 135 | grid.arrange(g1, g2, nrow=2, heights=c(6.25, 7)) 136 | 137 | g <- arrangeGrob(g1, g2, heights=c(6.25, 7)) 138 | ggsave(file=file.path("..", "figures", "chapter_22", "c22_acc_auc.pdf"), 139 | g, width=5, height=5, units="in") 140 | 141 | g1 <- makeBarchart(metrics, aes(x=race, y=FPR, fill=Model, label=round(FPR, 3))) + 142 | theme(legend.position="none") 143 | g2 <- makeBarchart(metrics, aes(x=race, y=FNR, fill=Model, label=round(FNR, 3))) + 144 | theme(legend.position="bottom") 145 | grid.arrange(g1, g2, heights=c(6.25, 7)) 146 | 147 | g <- arrangeGrob(g1, g2, heights=c(6.25, 7)) 148 | ggsave(file=file.path("..", "figures", "chapter_22", "c22_fpr_fnr.pdf"), 149 | g, width=5, height=5, units="in") 150 | ``` 151 | 152 | ### Interpretability Methods 153 | ```{r} 154 | library(iml) 155 | predictor.rf = Predictor$new(rf.model, data=valid.df, y=valid.df$two_year_recid) 156 | predictor.lm = Predictor$new(logreg.model, data=valid.df, y=valid.df$two_year_recid) 157 | ``` 158 | 159 | ```{r} 160 | featureEffect.lm = FeatureEffect$new(predictor.lm, feature='priors_count', method='pdp') 161 | featureEffect.rf = FeatureEffect$new(predictor.rf, feature='priors_count', 
method='pdp') 162 | combined <- bind_rows( 163 | tibble(Method="Logistic regression", featureEffect.lm$results %>% filter(.class=="Yes")), 164 | tibble(Method="Random forest", featureEffect.rf$results %>% filter(.class=="Yes")) 165 | ) 166 | ggplot(combined, aes(x=priors_count, y=.value, color=Method)) + 167 | geom_line() + 168 | labs(x="Feature value", y="Probability of recidivism") 169 | ``` 170 | 171 | ```{r} 172 | ggsave(file=file.path("..", "figures", "chapter_22", "c22f005.pdf"), 173 | last_plot() + theme_bw(), width=5, height=3, units="in") 174 | ``` 175 | 176 | ```{r} 177 | library(iml) 178 | predictor.rf = Predictor$new(rf.model, data=valid.df, y=valid.df$two_year_recid) 179 | predictor.lm = Predictor$new(logreg.model, data=valid.df, y=valid.df$two_year_recid) 180 | 181 | # permutation feature importance 182 | FeatureImp$new(predictor.lm, "ce", compare="ratio", n.repetitions=5) 183 | FeatureImp$new(predictor.rf, "ce", compare="ratio", n.repetitions=5) 184 | ``` 185 | 186 | -------------------------------------------------------------------------------- /Rmd/chap23.Rmd: -------------------------------------------------------------------------------- 1 | ```{r} 2 | if (!require(mlba)) { 3 | library(devtools) 4 | install_github("gedeck/mlba/mlba", force=TRUE) 5 | } 6 | ``` 7 | 8 | -------------------------------------------------------------------------------- /Rmd/chap4.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Machine Learning for Business Analytics" 3 | author: "Chapter 4: Dimension Reduction" 4 | output: 5 | pdf_document: 6 | toc: no 7 | highlight: tango 8 | # html_document: 9 | # toc: yes 10 | # toc_depth: 4 11 | # toc_float: yes 12 | --- 13 | 20 | ```{r global_options, include=FALSE} 21 | knitr::opts_chunk$set(error=TRUE, # Keep compiling upon error 22 | collapse=FALSE, # collapse by default 23 | echo=TRUE, # echo code by default 24 | comment = "#>", # change comment character 25 | fig.width = 5.5, # set figure width 26 | fig.align = "center",# set figure position 27 | out.width = "49%", # set width of displayed images 28 | warning=FALSE, # do not show R warnings 29 | message=FALSE, # do not show R messages 30 | row.print=25) 31 | ``` 32 | 33 | 34 | ```{r} 35 | if (!require(mlba)) { 36 | library(devtools) 37 | install_github("gedeck/mlba/mlba", force=TRUE) 38 | } 39 | 40 | options(dplyr.summarise.inform = FALSE) 41 | library(tidyverse) 42 | ``` 43 | 44 | # Data Summaries 45 | ## Example 1: House Prices in Boston 46 | ```{r} 47 | boston.housing.df <- mlba::BostonHousing 48 | head(boston.housing.df, 9) 49 | summary(boston.housing.df) 50 | 51 | # compute mean, standard dev., min, max, median, length, and missing values of CRIM 52 | mean(boston.housing.df$CRIM) 53 | sd(boston.housing.df$CRIM) 54 | min(boston.housing.df$CRIM) 55 | max(boston.housing.df$CRIM) 56 | median(boston.housing.df$CRIM) 57 | length(boston.housing.df$CRIM) 58 | 59 | # find the number of missing values of variable CRIM 60 | sum(is.na(boston.housing.df$CRIM)) 61 | 62 | # compute mean, standard dev., min, max, median, length, and missing values for all 63 | # variables 64 | data.frame(mean=sapply(boston.housing.df, mean), 65 | sd=sapply(boston.housing.df, sd), 66 | min=sapply(boston.housing.df, min), 67 | max=sapply(boston.housing.df, max), 68 | median=sapply(boston.housing.df, median), 69 | length=sapply(boston.housing.df, length), 70 | miss.val=sapply(boston.housing.df, 71 | function(x) sum(length(which(is.na(x)))))) 72 | ``` 73 | 74 | ```{r} 75 | 
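# (tidyverse alternative, added here; not part of the original book code)
# the same summary table can be built with dplyr/tidyr, assuming
# library(tidyverse) has been loaded above
boston.housing.df %>%
  pivot_longer(everything(), names_to="variable", values_to="value") %>%
  group_by(variable) %>%
  summarise(mean=mean(value), sd=sd(value), min=min(value), max=max(value),
            median=median(value), length=n(), miss.val=sum(is.na(value)))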
76 | ``` 77 | 78 | ## Summary Statistics 79 | ```{r} 80 | round(cor(boston.housing.df),2) 81 | ``` 82 | 83 | ## Aggregation and Pivot Tables 84 | ```{r} 85 | boston.housing.df <- mlba::BostonHousing 86 | table(boston.housing.df$CHAS) 87 | 88 | # tidyverse version 89 | boston.housing.df %>% count(CHAS) 90 | ``` 91 | 92 | ```{r} 93 | # create bins of size 1 94 | boston.housing.df <- boston.housing.df %>% 95 | mutate(RM.bin = cut(RM, c(1:9), labels=FALSE)) 96 | 97 | # compute the average of MEDV by (binned) RM and CHAS 98 | # in aggregate() use the argument by= to define the list of aggregating variables, 99 | # and FUN= as an aggregating function. 100 | aggregate(boston.housing.df$MEDV, by=list(RM=boston.housing.df$RM.bin, 101 | CHAS=boston.housing.df$CHAS), FUN=mean) 102 | 103 | # tidyverse version 104 | boston.housing.df %>% 105 | group_by(RM.bin, CHAS) %>% 106 | summarise(mean(MEDV)) 107 | ``` 108 | 109 | ```{r} 110 | # use install.packages("reshape") the first time the package is used 111 | library(reshape) 112 | boston.housing.df <- mlba::BostonHousing 113 | # create bins of size 1 114 | boston.housing.df <- boston.housing.df %>% 115 | mutate(RM.bin = cut(RM, c(1:9), labels=FALSE)) 116 | 117 | # use melt() to stack a set of columns into a single column of data. 118 | # stack MEDV values for each combination of (binned) RM and CHAS 119 | mlt <- melt(boston.housing.df, id=c("RM.bin", "CHAS"), measure=c("MEDV")) 120 | head(mlt, 5) 121 | 122 | # use cast() to reshape data and generate pivot table 123 | cast(mlt, RM.bin ~ CHAS, subset=variable=="MEDV", 124 | margins=c("grand_row", "grand_col"), mean) 125 | 126 | # tidyverse version 127 | boston.housing.df %>% 128 | group_by(RM.bin, CHAS) %>% 129 | summarize(mean=mean(MEDV)) %>% 130 | spread(CHAS, mean) 131 | ``` 132 | 133 | # Reducing the Number of Categories in Categorical Variables 134 | ```{r} 135 | boston.housing.df <- mlba::BostonHousing 136 | 137 | tbl <- table(boston.housing.df$CAT.MEDV, boston.housing.df$ZN) 138 | prop.tbl <- prop.table(tbl, margin=2) 139 | barplot(prop.tbl, xlab="ZN", ylab="", yaxt="n",main="Distribution of CAT.MEDV by ZN") 140 | axis(2, at=(seq(0,1, 0.2)), paste(seq(0,100,20), "%")) 141 | 142 | library(tidyverse) 143 | df <- data.frame(prop.tbl) 144 | ggplot(df, aes(x=Var2, y=Freq, group=Var1, fill=Var1)) + 145 | geom_bar(stat="identity", color="grey", width=1) + 146 | scale_y_continuous(labels = scales::percent, expand=expansion()) + 147 | scale_fill_manual("CAT.MEDV", values=c("0"="#eeeeee", "1"="darkgrey")) + 148 | labs(x="ZN", y="", title="Distribution of CAT.MEDV by ZN") 149 | ``` 150 | 151 | ```{r} 152 | g <- last_plot() + theme_bw() 153 | ggsave(file=file.path("..", "figures", "chapter_04", "reduction-pivot-bar.pdf"), 154 | g, width=9, height=4, units="in") 155 | ``` 156 | 157 | ```{r} 158 | library(forecast) 159 | tru.data <- mlba::ToysRUsRevenues 160 | tru.ts <- ts(tru.data[, 3], start = c(1992, 1), end = c(1995, 4), freq = 4) 161 | autoplot(tru.ts) + 162 | geom_point(size=0.5) + 163 | labs(x="Time", y="Revenue ($ millions)") + 164 | theme_bw() 165 | ggsave(file=file.path("..", "figures", "chapter_04", "ToysRUs.pdf"), 166 | last_plot()) 167 | ``` 168 | 169 | # Principal Components Analysis 170 | ## Example 2: Breakfast Cereals 171 | ```{r} 172 | library(tidyverse) 173 | cereals.df <- mlba::Cereals %>% select(calories, rating) 174 | # compute PCs on two dimensions 175 | pcs <- prcomp(cereals.df %>% select(calories, rating)) 176 | summary(pcs) 177 | pcs$rot 178 | scores <- pcs$x 179 | head(scores, 5) 180 | 
``` 181 | 182 | ```{r} 183 | getPCaxis <- function(f, pcs, pcLabel) { 184 | return (data.frame( 185 | rbind(pcs$center + f * pcs$rotation[, pcLabel], 186 | pcs$center - f * pcs$rotation[, pcLabel])) 187 | ) 188 | } 189 | PC1 <- getPCaxis(90, pcs, "PC1") 190 | PC2 <- getPCaxis(50, pcs, "PC2") 191 | ggplot(cereals.df, aes(x=calories, y=rating)) + 192 | geom_point() + 193 | geom_line(data=PC1) + 194 | geom_line(data=PC2) + 195 | coord_cartesian(xlim=c(0, 200), ylim=c(0, 110)) + 196 | labs(x="Calories", y="Rating") + 197 | annotate(geom="text", x=30, y=80, label="z[1]",parse=TRUE) + 198 | annotate(geom="text", x=120, y=80, label="z[2]",parse=TRUE) + 199 | theme_bw() 200 | 201 | ggsave(file=file.path("..", "figures", "chapter_04", "pca_subset.pdf"), 202 | width=5, height=3, last_plot()) 203 | ``` 204 | 205 | ## Principal Components 206 | ```{r} 207 | # load and preprocess the data 208 | cereals.df <- mlba::Cereals %>% 209 | column_to_rownames("name") %>% 210 | select(-c(mfr, type)) %>% 211 | drop_na() 212 | 213 | pcs <- prcomp(cereals.df) 214 | summary(pcs) 215 | pcs$rotation[,1:5] 216 | ``` 217 | 218 | ## Normalizing the Data 219 | ```{r} 220 | # Use function prcomp() with scale. = T to run PCA on normalized data 221 | pcs.cor <- prcomp(cereals.df, scale. = T) 222 | 223 | summary(pcs.cor) 224 | pcs.cor$rotation[,1:5] 225 | ``` 226 | 227 | ```{r} 228 | library(ggrepel) 229 | ggplot(data.frame(pcs.cor$x), aes(x=PC1, y=PC2, label=rownames(pcs.cor$x))) + 230 | geom_point(shape=21) + 231 | geom_text_repel(size=2, max.overlaps=7) + 232 | theme_bw() 233 | 234 | f <- 1.3 235 | ggsave(file=file.path("..", "figures", "chapter_04", "pca_full.pdf"), 236 | width=f * 4, height=f * 5, last_plot()) 237 | ``` 238 | 239 | # Dimension Reduction Using Classification and Regression Trees 240 | ## Using Principal Components for Classification and Prediction 241 | ```{r} 242 | wine.df <- mlba::Wine %>% select(-Type) 243 | pcs.cor <- prcomp(wine.df) 244 | summary(pcs.cor) 245 | pcs.cor$rotation[,1:4] 246 | ``` 247 | 248 | -------------------------------------------------------------------------------- /Rmd/chap6.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Machine Learning for Business Analytics" 3 | author: "Chapter 6: Multiple Linear Regression" 4 | output: 5 | pdf_document: 6 | toc: no 7 | highlight: tango 8 | # html_document: 9 | # toc: yes 10 | # toc_depth: 4 11 | # toc_float: yes 12 | --- 13 | 20 | ```{r global_options, include=FALSE} 21 | knitr::opts_chunk$set(error=TRUE, # Keep compiling upon error 22 | collapse=FALSE, # collapse by default 23 | echo=TRUE, # echo code by default 24 | comment = "#>", # change comment character 25 | fig.width = 5.5, # set figure width 26 | fig.align = "center",# set figure position 27 | out.width = "49%", # set width of displayed images 28 | warning=FALSE, # do not show R warnings 29 | message=FALSE, # do not show R messages 30 | row.print=25) 31 | ``` 32 | 33 | 34 | ```{r} 35 | if (!require(mlba)) { 36 | library(devtools) 37 | install_github("gedeck/mlba/mlba", force=TRUE) 38 | } 39 | options(scipen=999, digits = 3) 40 | ``` 41 | 42 | # Estimating the Regression Equation and Prediction 43 | ## Example: Predicting the Price of Used Toyota Corolla Cars 44 | ```{r} 45 | library(caret) 46 | car.df <- mlba::ToyotaCorolla 47 | # select variables for regression 48 | outcome <- "Price" 49 | predictors <- c("Age_08_04", "KM", "Fuel_Type", "HP", "Met_Color", "Automatic", 50 | "CC", "Doors", "Quarterly_Tax", "Weight") 51 | # 
reduce data set to first 1000 rows and selected variables 52 | car.df <- car.df[1:1000, c(outcome, predictors)] 53 | 54 | # partition data 55 | set.seed(1) # set seed for reproducing the partition 56 | idx <- createDataPartition(car.df$Price, p=0.6, list=FALSE) 57 | train.df <- car.df[idx, ] 58 | holdout.df <- car.df[-idx, ] 59 | 60 | # use lm() to run a linear regression of Price on all 11 predictors in the 61 | # training set. 62 | # use . after ~ to include all the remaining columns in train.df as predictors. 63 | car.lm <- lm(Price ~ ., data = train.df) 64 | # use options() to ensure numbers are not displayed in scientific notation. 65 | options(scipen = 999) 66 | summary(car.lm) 67 | ``` 68 | 69 | ```{r} 70 | 71 | ``` 72 | 73 | ```{r} 74 | # use predict() to make predictions on a new set. 75 | pred <- predict(car.lm, holdout.df) 76 | 77 | options(scipen=999, digits=0) 78 | data.frame( 79 | 'Predicted' = pred[1:20], 80 | 'Actual' = holdout.df$Price[1:20], 81 | 'Residual' = holdout.df$Price[1:20] - pred[1:20] 82 | ) 83 | options(scipen=999, digits = 3) 84 | 85 | # calculate performance metrics 86 | rbind( 87 | Training=mlba::regressionSummary(predict(car.lm, train.df), train.df$Price), 88 | Holdout=mlba::regressionSummary(pred, holdout.df$Price) 89 | ) 90 | ``` 91 | 92 | ```{r} 93 | library(ggplot2) 94 | pred <- predict(car.lm, holdout.df) 95 | all.residuals <- holdout.df$Price - pred 96 | 97 | ggplot() + 98 | geom_histogram(aes(x=all.residuals), fill="lightgray", color="grey") + 99 | labs(x="Residuals", y="Frequency") 100 | ``` 101 | 102 | ```{r} 103 | g <- ggplot() + 104 | geom_histogram(aes(x=all.residuals), fill="lightgray", color="grey") + 105 | labs(x="Residuals", y="Frequency") + 106 | theme_bw() 107 | ggsave(file=file.path("..", "figures", "chapter_06", "residuals-histogram.pdf"), 108 | g, width=5, height=3, units="in") 109 | ``` 110 | 111 | ## Cross-validation and caret 112 | ```{r} 113 | set.seed(1) 114 | library(caret) 115 | # define 5-fold 116 | trControl <- caret::trainControl(method="cv", number=5, allowParallel=TRUE) 117 | model <- caret::train(Price ~ ., data=car.df, 118 | method="lm", # specify the model 119 | trControl=trControl) 120 | model 121 | coef(model$finalModel) 122 | ``` 123 | 124 | ```{r} 125 | library(tidyverse) 126 | collectMetrics <- function(model, train.df, holdout.df, nPredictors) { 127 | if (missing(nPredictors)) { 128 | coefs = coef(model$finalModel) 129 | nPredictors = length(coefs) - 1 130 | } 131 | return (cbind( 132 | CV=model$results %>% slice_min(RMSE) %>% dplyr::select(c(RMSE, MAE)), 133 | Training=mlba::regressionSummary(predict(model, train.df), train.df$Price), 134 | Holdout=mlba::regressionSummary(predict(model, holdout.df), holdout.df$Price), 135 | nPredictors=nPredictors 136 | )) 137 | } 138 | 139 | metric.full <- collectMetrics(model, train.df, holdout.df) 140 | ``` 141 | 142 | ```{r} 143 | predict(model, car.df[1:3,]) 144 | ``` 145 | 146 | ```{r} 147 | 148 | ``` 149 | 150 | # Variable Selection in Linear Regression 151 | ## How to Reduce the Number of Predictors 152 | ### Exhaustive Search 153 | ```{r} 154 | # use regsubsets() in package leaps to run an exhaustive search. 155 | # unlike with lm, categorical predictors must be turned into dummies manually. 
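# (illustrative addition, not from the book) base R's model.matrix() shows the
# kind of dummy coding that the fastDummies code below produces, e.g. for the
# Fuel_Type factor in the training data
head(model.matrix(~ Fuel_Type, data=train.df), 3)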
156 | library(leaps) 157 | library(fastDummies) 158 | 159 | # create dummies for fuel type 160 | leaps.train.df <- dummy_cols(train.df, remove_first_dummy=TRUE, 161 | remove_selected_columns=TRUE) 162 | search <- regsubsets(Price ~ ., data=leaps.train.df, nbest=1, 163 | nvmax=ncol(leaps.train.df), method="exhaustive") 164 | sum <- summary(search) 165 | 166 | # show models 167 | sum$which 168 | 169 | # show metrics 170 | sum$rsq 171 | sum$adjr2 172 | sum$cp 173 | ``` 174 | 175 | ```{r} 176 | optimal <- which.min(sum$cp) 177 | 178 | # determine the variable names for the optimal model 179 | X <- summary(search)$which[, -1] # information about included predictors 180 | xvars <- dimnames(X)[[2]] ## column names (all covariates except intercept) 181 | xvars <- xvars[X[optimal,]] 182 | 183 | # the optimal model contains all dummy variables of Fuel_Type 184 | xvars <- c("Age_08_04", "KM", "HP", "Quarterly_Tax", "Weight", "Fuel_Type") 185 | 186 | # rebuild model for best predictor set 187 | set.seed(1) 188 | trControl <- caret::trainControl(method="cv", number=5, allowParallel=TRUE) 189 | model <- caret::train(Price ~ ., data=car.df[, c("Price", xvars)], 190 | method="lm", # specify the model 191 | trControl=trControl) 192 | model 193 | coef(model$finalModel) 194 | 195 | metric.exhaustive <- collectMetrics(model, train.df, holdout.df) 196 | ``` 197 | 198 | ### Popular Subset Selection Algorithms 199 | ```{r} 200 | # as model performance is estimated using AIC, we don't need to use cross-validation 201 | trControl <- caret::trainControl(method="none") 202 | model <- caret::train(Price ~ ., data=train.df, trControl=trControl, 203 | # select backward elmination 204 | method="glmStepAIC", direction='backward') 205 | 206 | coef(model$finalModel) 207 | ``` 208 | 209 | ```{r} 210 | model <- caret::train(Price ~ ., data=train.df, trControl=trControl, 211 | method="glmStepAIC", direction='forward') 212 | 213 | coef(model$finalModel) 214 | ``` 215 | 216 | ```{r} 217 | model <- caret::train(Price ~ ., data=train.df, trControl=trControl, 218 | method="glmStepAIC", direction='both') 219 | 220 | coef(model$finalModel) 221 | ``` 222 | 223 | ```{r} 224 | rbind(Training=mlba::regressionSummary(predict(model, train.df), train.df$Price), 225 | Holdout=mlba::regressionSummary(predict(model, holdout.df), holdout.df$Price)) 226 | ``` 227 | 228 | ```{r} 229 | # The models are identical to the best model obtained from the exhaustive search. 230 | # We therefore duplicate the metrics. 
231 | metric.stepwise <- metric.exhaustive 232 | ``` 233 | 234 | # Regularization (Shrinkage Models) 235 | ```{r} 236 | set.seed(1) 237 | library(caret) 238 | trControl <- caret::trainControl(method='cv', number=5, allowParallel=TRUE) 239 | tuneGrid <- expand.grid(lambda=10^seq(5, 2, by=-0.1), alpha=0) 240 | model <- caret::train(Price ~ ., data=train.df, 241 | method='glmnet', 242 | family='gaussian', # set the family for linear regression 243 | trControl=trControl, 244 | tuneGrid=tuneGrid) 245 | model$bestTune 246 | coef(model$finalModel, s=model$bestTune$lambda) 247 | ``` 248 | 249 | ```{r} 250 | metric.ridge <- collectMetrics(model, train.df, holdout.df, 251 | length(coef(model$finalModel, s=model$bestTune$lambda)) - 1) 252 | ridge.model <- model 253 | ``` 254 | 255 | ```{r} 256 | rbind( 257 | Training=mlba::regressionSummary(predict(model, train.df), train.df$Price), 258 | Holdout=mlba::regressionSummary(predict(model, holdout.df), holdout.df$Price) 259 | ) 260 | ``` 261 | 262 | ```{r} 263 | set.seed(1) 264 | tuneGrid <- expand.grid(lambda=10^seq(4, 0, by=-0.1), alpha=1) 265 | model <- caret::train(Price ~ ., data=train.df, 266 | method='glmnet', 267 | family='gaussian', # set the family for linear regression 268 | trControl=trControl, 269 | tuneGrid=tuneGrid) 270 | model$bestTune 271 | coef(model$finalModel, s=model$bestTune$lambda) 272 | ``` 273 | 274 | ```{r} 275 | lasso.model <- model 276 | metric.lasso <- collectMetrics(lasso.model, train.df, holdout.df, 277 | sum(coef(lasso.model$finalModel, s=lasso.model$bestTune$lambda) != 0) - 1) 278 | ``` 279 | 280 | ```{r} 281 | rbind( 282 | Training=mlba::regressionSummary(predict(model, train.df), train.df$Price), 283 | Holdout=mlba::regressionSummary(predict(model, holdout.df), holdout.df$Price) 284 | ) 285 | ``` 286 | 287 | ```{r} 288 | library(tidyverse) 289 | library(gridExtra) 290 | g1 <- ggplot(ridge.model$results, aes(x=lambda, y=RMSE)) + 291 | geom_pointrange(aes(ymin=RMSE-RMSESD, ymax=RMSE+RMSESD), color='grey') + 292 | geom_line() + 293 | geom_point(data=ridge.model$results %>% subset(RMSE == min(RMSE)), color='red') + 294 | labs(x=expression(paste('Ridge parameter ', lambda)), 295 | y='RMSE (cross-validation)') + 296 | scale_x_log10() 297 | g2 <- ggplot(lasso.model$results, aes(x=lambda, y=RMSE)) + 298 | geom_pointrange(aes(ymin=RMSE-RMSESD, ymax=RMSE+RMSESD), color='grey') + 299 | geom_line() + 300 | geom_point(data=lasso.model$results %>% subset(RMSE == min(RMSE)), color='red') + 301 | labs(x=expression(paste('Lasso parameter ', lambda)), 302 | y='RMSE (cross-validation)') + 303 | scale_x_log10() 304 | grid.arrange(g1, g2, ncol=2) 305 | 306 | g <- arrangeGrob(g1 + theme_bw(), g2 + theme_bw(), ncol=2) 307 | ggsave(file=file.path('..', 'figures', 'chapter_06', 'shrinkage-parameter-tuning.pdf'), 308 | g, width=6, height=2.5, units='in') 309 | ``` 310 | 311 | ```{r} 312 | data.frame(rbind( 313 | 'full'= metric.full, 314 | 'exhaustive' = metric.exhaustive, 315 | 'stepwise' = metric.stepwise, 316 | 'ridge' = metric.ridge, 317 | 'lasso' = metric.lasso 318 | )) 319 | ``` 320 | 321 | -------------------------------------------------------------------------------- /Rmd/chap7.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Machine Learning for Business Analytics" 3 | author: "Chapter 7: $k$-Nearest Neighbors ($k$" 4 | output: 5 | pdf_document: 6 | toc: no 7 | highlight: tango 8 | # html_document: 9 | # toc: yes 10 | # toc_depth: 4 11 | # toc_float: yes 12 | --- 13 | 20 | ```{r 
global_options, include=FALSE} 21 | knitr::opts_chunk$set(error=TRUE, # Keep compiling upon error 22 | collapse=FALSE, # collapse by default 23 | echo=TRUE, # echo code by default 24 | comment = "#>", # change comment character 25 | fig.width = 5.5, # set figure width 26 | fig.align = "center",# set figure position 27 | out.width = "49%", # set width of displayed images 28 | warning=FALSE, # do not show R warnings 29 | message=FALSE, # do not show R messages 30 | row.print=25) 31 | ``` 32 | 33 | 34 | ```{r} 35 | if (!require(mlba)) { 36 | library(devtools) 37 | install_github("gedeck/mlba/mlba", force=TRUE) 38 | } 39 | ``` 40 | 41 | # The $k$-NN Classifier (Categorical Outcome) 42 | ## Example: Riding Mowers 43 | ```{r} 44 | library(ggrepel) 45 | mowers.df <- mlba::RidingMowers 46 | set.seed(35) 47 | 48 | idx <- sample(nrow(mowers.df), 0.6*nrow(mowers.df)) 49 | train.df <- mowers.df[idx, ] 50 | holdout.df <- mowers.df[-idx, ] 51 | ## new household 52 | new.df <- data.frame(Income = 60, Lot_Size = 20) 53 | 54 | ggplot(mapping=aes(x=Income, y=Lot_Size, shape=Ownership, color=Ownership)) + 55 | geom_point(data=train.df) + 56 | geom_text_repel(aes(label=rownames(train.df)), data=train.df, show.legend = FALSE) + 57 | geom_point(data=cbind(new.df, Ownership='New')) 58 | ``` 59 | 60 | ```{r} 61 | g <- ggplot(mapping=aes(x=Income, y=Lot_Size, shape=Ownership, color=Ownership, fill=Ownership)) + 62 | geom_point(data=train.df, size=4) + 63 | geom_text_repel(aes(label=rownames(train.df)), data=train.df, show.legend = FALSE) + 64 | geom_point(data=cbind(new.df, Ownership='New'), size=5) + 65 | scale_shape_manual(values = c(18, 15, 21)) + 66 | scale_color_manual(values = c('black', 'darkorange', 'steelblue')) + 67 | scale_fill_manual(values = c('black', 'darkorange', 'lightblue')) 68 | 69 | g 70 | 71 | ggsave(file=file.path("..", "figures", "chapter_07", "knn-riding-mower.pdf"), 72 | g + theme_bw(), width=6, height=4, units="in") 73 | ``` 74 | 75 | ```{r} 76 | library(caret) 77 | # train k-NN model with k=3 78 | model <- train(Ownership ~ ., data=train.df, 79 | method="knn", # specify the model 80 | preProcess=c("center", "scale"), # normalize data 81 | tuneGrid=expand.grid(k=3), 82 | trControl=trainControl(method="none")) 83 | model 84 | 85 | # predict new data point 86 | predict(model, new.df) 87 | 88 | # determine nearest neighbors to new data point 89 | train.norm.df <- predict(model$preProcess, train.df) 90 | new.norm.df <- predict(model$preProcess, new.df) 91 | distances <- apply(train.norm.df[, 1:2], 1, 92 | function(d){ sqrt(sum((d - new.norm.df)^2)) }) 93 | rownames(train.df)[order(distances)][1:3] 94 | ``` 95 | 96 | ## Choosing $k$ 97 | ```{r} 98 | # use leave-one-out cross-validation for small dataset 99 | trControl <- trainControl(method="loocv", number=5, allowParallel=TRUE) 100 | model <- train(Ownership ~ ., data=train.df, 101 | method="knn", 102 | preProcess=c("center", "scale"), 103 | tuneGrid=expand.grid(k=seq(1, 13, 2)), 104 | trControl=trControl) 105 | model 106 | ``` 107 | 108 | ```{r} 109 | model <- train(Ownership ~ ., data=mowers.df, 110 | method="knn", 111 | preProcess=c("center", "scale"), 112 | tuneGrid=expand.grid(k=7), 113 | trControl=trainControl(method="none")) 114 | predict(model, new.df) 115 | ``` 116 | 117 | ## Setting the Cutoff Value 118 | ```{r} 119 | train.norm.df <- predict(model$preProcess, train.df) 120 | new.norm.df <- predict(model$preProcess, new.df) 121 | distances <- apply(train.norm.df[, 1:2], 1, 122 | function(d){ sqrt(sum((d - new.norm.df)^2)) }) 123 | 
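# (sketch added here, not from the book) the next line lists the 8 nearest
# neighbors of the new household; to classify it with a user-chosen cutoff,
# one could score it by the share of owners among those neighbors
# (assumes the Ownership factor uses the labels "Owner" and "Nonowner")
prop.owner <- mean(train.df$Ownership[order(distances)[1:8]] == "Owner")
ifelse(prop.owner >= 0.3, "Owner", "Nonowner")  # cutoff 0.3 instead of the default 0.5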
train.df[order(distances)[1:8],] 124 | ``` 125 | 126 | -------------------------------------------------------------------------------- /Rmd/chap8.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Machine Learning for Business Analytics" 3 | author: "Chapter 8: The Naive Bayes Classifier" 4 | output: 5 | pdf_document: 6 | toc: no 7 | highlight: tango 8 | # html_document: 9 | # toc: yes 10 | # toc_depth: 4 11 | # toc_float: yes 12 | --- 13 | 20 | ```{r global_options, include=FALSE} 21 | knitr::opts_chunk$set(error=TRUE, # Keep compiling upon error 22 | collapse=FALSE, # collapse by default 23 | echo=TRUE, # echo code by default 24 | comment = "#>", # change comment character 25 | fig.width = 5.5, # set figure width 26 | fig.align = "center",# set figure position 27 | out.width = "49%", # set width of displayed images 28 | warning=FALSE, # do not show R warnings 29 | message=FALSE, # do not show R messages 30 | row.print=25) 31 | ``` 32 | 33 | 34 | ```{r} 35 | if (!require(mlba)) { 36 | library(devtools) 37 | install_github("gedeck/mlba/mlba", force=TRUE) 38 | } 39 | options(scipen=999, digits = 3) 40 | ``` 41 | 42 | # Solution: Naive Bayes 43 | ## Example 3: Predicting Delayed Flights 44 | ```{r} 45 | library(tidyverse) 46 | library(caret) 47 | library(e1071) 48 | # load and preprocess dataset 49 | delays.df <- mlba::FlightDelays %>% 50 | mutate( 51 | # change numerical variables to categorical 52 | DAY_WEEK = factor(DAY_WEEK), 53 | ORIGIN = factor(ORIGIN), 54 | DEST = factor(DEST), 55 | CARRIER = factor(CARRIER), 56 | Flight.Status = factor(Flight.Status), 57 | # create hourly bins for departure time 58 | CRS_DEP_TIME = factor(round(CRS_DEP_TIME / 100)) 59 | ) %>% 60 | select(DAY_WEEK, CRS_DEP_TIME, ORIGIN, DEST, CARRIER, Flight.Status) 61 | 62 | # create training and holdout sets 63 | set.seed(1) 64 | idx <- createDataPartition(delays.df$Flight.Status, p=0.6, list=FALSE) 65 | train.df <- delays.df[idx, ] 66 | holdout.df <- delays.df[-idx, ] 67 | 68 | # run naive bayes 69 | delays.nb <- naiveBayes(Flight.Status ~ ., data = train.df) 70 | delays.nb 71 | ``` 72 | 73 | ```{r} 74 | 75 | ``` 76 | 77 | ```{r} 78 | # use prop.table() with margin = 1 to convert a count table to a proportions table, 79 | # where each row sums up to 1 (use margin = 2 for column sums) 80 | prop.table(table(train.df$Flight.Status, train.df$DEST), margin = 1) 81 | ``` 82 | 83 | ```{r} 84 | ## predict probabilities 85 | pred.prob <- predict(delays.nb, newdata=holdout.df, type="raw") 86 | ## predict class membership 87 | pred.class <- predict(delays.nb, newdata=holdout.df) 88 | 89 | df <- data.frame(actual=holdout.df$Flight.Status, predicted=pred.class, pred.prob) 90 | 91 | df[holdout.df$CARRIER == "DL" & holdout.df$DAY_WEEK == 7 & holdout.df$CRS_DEP_TIME == 10 & 92 | holdout.df$DEST == "LGA" & holdout.df$ORIGIN == "DCA",] 93 | ``` 94 | 95 | ```{r} 96 | # training 97 | confusionMatrix(predict(delays.nb, newdata=train.df), train.df$Flight.Status) 98 | 99 | # holdout 100 | confusionMatrix(predict(delays.nb, newdata=holdout.df), holdout.df$Flight.Status) 101 | ``` 102 | 103 | ```{r} 104 | library(gains) 105 | actual <- ifelse(holdout.df$Flight.Status == "delayed", 1, 0) 106 | gain <- gains(actual, pred.prob[,"delayed"], groups=length(actual) - 2) 107 | 108 | nactual <-sum(actual) 109 | ggplot() + 110 | geom_line(aes(x=gain$cume.obs, y=gain$cume.pct.of.total * nactual)) + 111 | geom_line(aes(x=c(0, max(gain$cume.obs)), y=c(0, nactual)), color="darkgrey") + 112 | 
labs(x="# Cases", y="Cumulative") 113 | ``` 114 | 115 | ```{r} 116 | ggsave(file=file.path("..", "figures", "chapter_08", "Flights-NB-gain.pdf"), 117 | width=3, height=3, 118 | last_plot() + theme_bw()) 119 | ``` 120 | 121 | ## Working with Continuous Predictors 122 | ```{r} 123 | (p_delayed = dnorm(213, mean=211.36215, sd=15.31)) 124 | (p_ontime = dnorm(213, mean=211.99436, sd=12.79)) 125 | 126 | p_ontime * 0.805 127 | p_delayed * 0.195 128 | ``` 129 | 130 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-slate -------------------------------------------------------------------------------- /_layouts/default.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 16 | 18 | {% seo %} 19 | {% include head-custom.html %} 20 | 21 | 22 | 23 | 24 | 25 |
[body markup of _layouts/default.html lost in extraction; the recoverable template pieces are: a page header showing {{ site.title | default: site.github.repository_name }} and {{ site.description | default: site.github.project_tagline }}, a "View on GitHub" link when site.github.is_project_page is set, optional .zip and .tar.gz download links when site.show_downloads is set, and a main section that renders {{ content }}]
49 | 50 | 51 | 59 | 60 | -------------------------------------------------------------------------------- /assets/css/style.scss: -------------------------------------------------------------------------------- 1 | --- 2 | --- 3 | @import "{{ site.theme }}"; 4 | 5 | table { 6 | border: 0px; 7 | } 8 | 9 | td { 10 | border: 0px; 11 | vertical-align: top; 12 | } 13 | 14 | .inner { 15 | max-width: 800px; 16 | } 17 | -------------------------------------------------------------------------------- /images/anaconda-create-environment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gedeck/mlba-R-code/c6909650d140f4f0e4da6987f1f97daf5786ea20/images/anaconda-create-environment.png -------------------------------------------------------------------------------- /images/anaconda-environment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gedeck/mlba-R-code/c6909650d140f4f0e4da6987f1f97daf5786ea20/images/anaconda-environment.png -------------------------------------------------------------------------------- /images/anaconda-mlba-r-environment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gedeck/mlba-R-code/c6909650d140f4f0e4da6987f1f97daf5786ea20/images/anaconda-mlba-r-environment.png -------------------------------------------------------------------------------- /images/anaconda-open-terminal.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gedeck/mlba-R-code/c6909650d140f4f0e4da6987f1f97daf5786ea20/images/anaconda-open-terminal.png -------------------------------------------------------------------------------- /images/anaconda-packagelist.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gedeck/mlba-R-code/c6909650d140f4f0e4da6987f1f97daf5786ea20/images/anaconda-packagelist.png -------------------------------------------------------------------------------- /images/anaconda-update.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gedeck/mlba-R-code/c6909650d140f4f0e4da6987f1f97daf5786ea20/images/anaconda-update.png -------------------------------------------------------------------------------- /images/jupyter-executing-python.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gedeck/mlba-R-code/c6909650d140f4f0e4da6987f1f97daf5786ea20/images/jupyter-executing-python.png -------------------------------------------------------------------------------- /images/jupyter-new-notebook.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gedeck/mlba-R-code/c6909650d140f4f0e4da6987f1f97daf5786ea20/images/jupyter-new-notebook.png -------------------------------------------------------------------------------- /images/jupyter-notebook-filemanager.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gedeck/mlba-R-code/c6909650d140f4f0e4da6987f1f97daf5786ea20/images/jupyter-notebook-filemanager.png -------------------------------------------------------------------------------- /images/logo.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/gedeck/mlba-R-code/c6909650d140f4f0e4da6987f1f97daf5786ea20/images/logo.png -------------------------------------------------------------------------------- /img/mlba-bookcover.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gedeck/mlba-R-code/c6909650d140f4f0e4da6987f1f97daf5786ea20/img/mlba-bookcover.png -------------------------------------------------------------------------------- /installPython.md: -------------------------------------------------------------------------------- 1 | # Getting Started with Python 2 | Python is a powerful, general purpose programming language that can be used for many applications ranging from short scripts to enterprise applications. There is a large and growing number of free, open-source libraries and tools for scientific computing. For more information about Python and its use visit [python.org.](https://www.python.org/). 3 | 4 | ## Install Python 5 | There are many ways of using and developing with Python. However, a good start will be to use Jupyter notebooks, an interactive, browser-based Python interface available through the [Anaconda Distribution](https://www.anaconda.com/) which is particularly useful for scientific computing. We will be using Python 3.8 or newer. 6 | 7 | Here is what you need to do: 8 | 9 | - Download the Anaconda installer for Python 3.8 or later from https://www.anaconda.com/download/ for your operating system (you will be asked for your email, however this step is optional and you can proceed without providing it) 10 | - Execute the installer 11 | - macOS: double-click on the pkg file and follow the instructions using the default settings 12 | - Windows: run the exe file and follow the instructions using default settings 13 | - During installation of Anaconda, you are asked if you want to install DataSpell. This is optional, but can be a useful tool to get you started using Jupyter Notebooks. 14 | 15 | Once the application is installed, you can execute Anaconda Navigator from the Start Menu (Windows) and the Application folder (macOS). 16 | 17 | If you don’t want to use Anaconda, you will find installation instructions for Windows 10 at the end of this document. 18 | 19 | ## Anaconda Navigator – update and install packages 20 | 21 | 22 | You can use _Anaconda Navigator_ to manage your Python installation and run the Jupyter application. 23 | 24 | Use the _Environments_ tab to add packages to your Python installation. The package list looks like this: 25 | 26 | 27 | 28 | To run the deep-learning applications from the book, we require a custom environment called `mlba-r`. It is in general a good idea to use custom Python environments for different projects to reduce the possibility of Python version conflicts and to have different Python versions on the same machine. 29 | 30 | To create the `mlba-r` environment for the deep-learning applications from the book, click the **[+] Create** link at the bottom of the window. This will open a dialog where you can enter the name of the environment and the Python version. Enter `mlba-r` as the name and select Python 3.8 or newer as the version. Click the **[Create]** button to create the environment. 31 | 32 | 33 | 34 | Click the [Update index…] button to refresh the package list. From time to time, it may ask you to update the Anaconda Navigator application. It’s good practice to update regularly. 
35 | If new versions become available, you will see that the version number changes. The version number of updatable packages are highlighted in blue and with an arrow next to the version number. 36 | This means that you can update the specific package. Change the pull-down menu to [Updatable] and click the green tick mark to select [Mark for update]. Do that for all the packages you want to update, select [Apply] and confirm the update. 37 | 38 | 39 | 40 | Once you initiated the update, use the [Clear] button to remove the marking. Anaconda Navigator otherwise will indicate that it is busy when you want to close the application. 41 | 42 | Updates are done in the background and will take some time and may require confirmation. There is no feedback that an update is finished. You will need to refresh the list using [Update index…] to see the progress. 43 | 44 | ## Anaconda Navigator – install packages 45 | 46 | To install a package, change the pull down to **[Not installed]** and enter e.g. tensorflow in the [Search packages] field. Click on the rectangle to select the package for download and use the [Apply] button to start the installation. 47 | Once the library is installed, it will be listed under the installed packages. 48 | 49 | - tensorflow: Python package for the Tensorflow deep learning library(https://www.tensorflow.org/) 50 | - keras: Python package for the Keras deep learning library (https://keras.io/) 51 | 52 | **Advanced**: You can also install a library from the command line, which may be faster. Use the command: 53 | `conda install packagename` 54 | 55 | 56 | 57 | ## Further information 58 | See https://gedeck.github.io/mistat-code-solutions/doc/installPython for more information about installing Python and the required packages. -------------------------------------------------------------------------------- /mlba-R.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gedeck/mlba-R-code/c6909650d140f4f0e4da6987f1f97daf5786ea20/mlba-R.zip -------------------------------------------------------------------------------- /mlba-Rmd.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gedeck/mlba-R-code/c6909650d140f4f0e4da6987f1f97daf5786ea20/mlba-Rmd.zip --------------------------------------------------------------------------------