├── .gitignore ├── R ├── chap1.R ├── chap10.R ├── chap11.R ├── chap12.R ├── chap13.R ├── chap14.R ├── chap15.R ├── chap16.R ├── chap17.R ├── chap18.R ├── chap19.R ├── chap2.R ├── chap20.R ├── chap21.R ├── chap22.R ├── chap23.R ├── chap3.R ├── chap4.R ├── chap5.R ├── chap6.R ├── chap7.R ├── chap8.R └── chap9.R ├── README.md ├── Rmd ├── chap1.Rmd ├── chap10.Rmd ├── chap11.Rmd ├── chap12.Rmd ├── chap13.Rmd ├── chap14.Rmd ├── chap15.Rmd ├── chap16.Rmd ├── chap17.Rmd ├── chap18.Rmd ├── chap19.Rmd ├── chap2.Rmd ├── chap20.Rmd ├── chap21.Rmd ├── chap22.Rmd ├── chap23.Rmd ├── chap3.Rmd ├── chap4.Rmd ├── chap5.Rmd ├── chap6.Rmd ├── chap7.Rmd ├── chap8.Rmd └── chap9.Rmd ├── _config.yml ├── _layouts └── default.html ├── assets └── css │ └── style.scss ├── images ├── anaconda-create-environment.png ├── anaconda-environment.png ├── anaconda-mlba-r-environment.png ├── anaconda-open-terminal.png ├── anaconda-packagelist.png ├── anaconda-update.png ├── jupyter-executing-python.png ├── jupyter-new-notebook.png ├── jupyter-notebook-filemanager.png └── logo.png ├── img └── mlba-bookcover.png ├── installPython.md ├── mlba-R.zip └── mlba-Rmd.zip /.gitignore: -------------------------------------------------------------------------------- 1 | # History files 2 | .Rhistory 3 | .Rapp.history 4 | 5 | # Session Data files 6 | .RData 7 | 8 | # User-specific files 9 | .Ruserdata 10 | 11 | # Example code in package build process 12 | *-Ex.R 13 | 14 | # Output files from R CMD build 15 | /*.tar.gz 16 | 17 | # Output files from R CMD check 18 | /*.Rcheck/ 19 | 20 | # RStudio files 21 | .Rproj.user/ 22 | 23 | # produced vignettes 24 | vignettes/*.html 25 | vignettes/*.pdf 26 | 27 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 28 | .httr-oauth 29 | 30 | # knitr and R markdown default cache directories 31 | *_cache/ 32 | /cache/ 33 | 34 | # Temporary files created by R markdown 35 | *.utf8.md 36 | *.knit.md 37 | 38 | # R Environment Variables 39 | .Renviron 40 | -------------------------------------------------------------------------------- /R/chap1.R: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gedeck/mlba-R-code/c6909650d140f4f0e4da6987f1f97daf5786ea20/R/chap1.R -------------------------------------------------------------------------------- /R/chap11.R: -------------------------------------------------------------------------------- 1 | 2 | if (!require(mlba)) { 3 | library(devtools) 4 | install_github("gedeck/mlba/mlba", force=TRUE) 5 | } 6 | 7 | # Neural Nets 8 | ## Fitting a Network to Data 9 | ### Training the Model 10 | #### Back Propagation of Error 11 | 12 | library(neuralnet) 13 | df <- mlba::TinyData 14 | df$Like <- df$Acceptance=="like" 15 | df$Dislike <- df$Acceptance=="dislike" 16 | set.seed(1) 17 | 18 | 19 | nn <- neuralnet(Like + Dislike ~ Salt + Fat, data = df, linear.output = F, hidden = 3) 20 | 21 | # display weights 22 | nn$weights 23 | 24 | # display predictions 25 | prediction(nn) 26 | 27 | # plot network 28 | plot(nn, rep="best") 29 | 30 | 31 | 32 | 33 | library(caret) 34 | predict <- compute(nn, data.frame(df$Salt, df$Fat)) 35 | predicted.class=apply(predict$net.result,1,which.max)-1 36 | confusionMatrix(factor(ifelse(predicted.class=="1", "dislike", "like")), 37 | factor(df$Acceptance)) 38 | 39 | 40 | 41 | ### Example 2: Classifying Accident Severity 42 | 43 | library(tidyverse) 44 | library(fastDummies) 45 | 46 | # convert SUR_COND and ALCHL_I to dummy variables (remove firest dummy) 47 | # convert 
outcome MAX_SEV_IR to dummy variables keeping all 48 | accidents.df <- mlba::AccidentsNN %>% 49 | dummy_cols(select_columns=c("ALCHL_I", "SUR_COND"), 50 | remove_selected_columns=TRUE, remove_first_dummy=TRUE) %>% 51 | dummy_cols(select_columns=c("MAX_SEV_IR"), 52 | remove_selected_columns=TRUE) 53 | 54 | # partition the data 55 | set.seed(1) 56 | idx <- createDataPartition(mlba::AccidentsNN$MAX_SEV_IR, p=0.6, list=FALSE) 57 | train.df <- accidents.df[idx, ] 58 | holdout.df <- accidents.df[-idx, ] 59 | train.actual <- mlba::AccidentsNN[idx, ]$MAX_SEV_IR 60 | holdout.actual <- mlba::AccidentsNN[-idx, ]$MAX_SEV_IR 61 | 62 | nn <- neuralnet(MAX_SEV_IR_0 + MAX_SEV_IR_1 + MAX_SEV_IR_2 ~ ., 63 | data=train.df, hidden=2) 64 | 65 | # predict the three outcome variables and assign class using maximum score 66 | pred.train <- predict(nn, train.df) 67 | class.train <- apply(pred.train, 1, which.max)-1 68 | confusionMatrix(factor(class.train), factor(train.actual)) 69 | 70 | pred.holdout <- predict(nn, holdout.df) 71 | class.holdout <- apply(pred.holdout, 1, which.max)-1 72 | confusionMatrix(factor(class.holdout), factor(holdout.actual)) 73 | 74 | 75 | # 1 not able to predict 2 0.8496 76 | # 2, 3 0.8697 77 | # 4, 5 0.8647 78 | 79 | ## Deep Learning 80 | ### Example: Classification of Fashion Images 81 | 82 | # code to prepare a similar picture using R 83 | library(reticulate) 84 | use_condaenv('mlba-r') 85 | library(keras) 86 | 87 | fashion_mnist <- keras::dataset_fashion_mnist() 88 | clothes.labels <- c('Top', 'Trouser', 'Pullover', 'Dress', 'Coat', 89 | 'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Boot') 90 | x_train <- fashion_mnist$train$x 91 | y_train <- fashion_mnist$train$y 92 | 93 | rotate <- function(x) t(apply(x, 2, rev)) 94 | plot_image <- function(x, title = "", title.color = "black") { 95 | x <- rotate(x) 96 | image(x, axes=FALSE, col=grey(seq(0, 1, length = 256)), 97 | main=list(title, col=title.color)) 98 | } 99 | plot_sample <- function() { 100 | par(mfrow=c(4, 5), mar=c(0, 0.2, 1, 0.2)) 101 | for (offset in c(0, 5)) { 102 | range <- (1+offset):(5+offset) 103 | for (i in range) { 104 | examples <- which(y_train %in% (i-1)) 105 | example <- examples[1] 106 | plot_image(x_train[example, , ], clothes.labels[y_train[example] + 1]) 107 | } 108 | for (i in range) { 109 | examples <- which(y_train %in% (i-1)) 110 | example <- examples[2] 111 | plot_image(x_train[example, , ]) 112 | } 113 | } 114 | } 115 | plot_sample() 116 | 117 | pdf(file=file.path("..", "figures", "chapter_11", "fashion-mnist-sample.pdf"), width=5, height=4) 118 | plot_sample() 119 | dev.off() 120 | 121 | #### Data Preprocessing 122 | 123 | # load required packages 124 | # Keras and TensorFlow require a Python conda environment with these packages installed 125 | library(reticulate) 126 | use_condaenv('mlba-r') 127 | library(keras) 128 | library(tensorflow) 129 | 130 | # load the data and split into training and validation sets 131 | fashion_mnist <- keras::dataset_fashion_mnist() 132 | x_train <- fashion_mnist$train$x 133 | y_train <- fashion_mnist$train$y 134 | x_valid <- fashion_mnist$test$x 135 | y_valid <- fashion_mnist$test$y 136 | 137 | # pixel values need to be scaled to range [0, 1] 138 | x_train <- x_train / 255 139 | x_valid <- x_valid / 255 140 | 141 | # input require an additional dimension to describe pixel values 142 | # dimensions are (samples, row, column, pixel) 143 | x_train <- array_reshape(x_train, c(dim(x_train), 1)) 144 | x_valid <- array_reshape(x_valid, c(dim(x_valid), 1)) 145 | 146 | # output values need 
to be converted into a matrix with one-hot-encoding of classes 147 | # dimensions are (samples, classes) 148 | y_train <- to_categorical(y_train, 10) 149 | y_valid <- to_categorical(y_valid, 10) 150 | dim(x_train) 151 | dim(y_train) 152 | 153 | 154 | # Model definition (architecture taken from 155 | # https://keras.rstudio.com/articles/examples/mnist_cnn.html 156 | input_shape = dim(x_train)[2:4] 157 | num_classes <- 10 158 | 159 | model <- keras_model_sequential() 160 | model %>% 161 | layer_conv_2d(filters=32, kernel_size=c(5,5), activation='relu', 162 | input_shape=input_shape) %>% 163 | layer_conv_2d(filters=64, kernel_size=c(3,3), activation='relu') %>% 164 | layer_max_pooling_2d(pool_size=c(2,2)) %>% 165 | layer_dropout(rate=0.25) %>% 166 | layer_flatten() %>% 167 | layer_dense(units = 128, activation = 'relu') %>% 168 | layer_dropout(rate = 0.5) %>% 169 | layer_dense(units = num_classes, activation = 'softmax') 170 | 171 | model 172 | 173 | # compile model 174 | model %>% compile( 175 | loss = loss_categorical_crossentropy, 176 | optimizer = optimizer_adadelta(), 177 | metrics = c('accuracy') 178 | ) 179 | 180 | # train and evaluate 181 | model %>% fit( 182 | x_train, y_train, 183 | batch_size = 128, 184 | epochs = 20, 185 | verbose = 1, 186 | validation_data = list(x_valid, y_valid) 187 | ) 188 | 189 | #### Training a Deep Learning Network 190 | 191 | library(gridExtra) 192 | df <- read.csv("cv-training.csv") 193 | g1 <- ggplot(df, aes(x=epoch)) + 194 | geom_line(aes(y=loss), color="steelblue") + 195 | geom_line(aes(y=val_loss), color="tomato") + 196 | geom_hline(yintercept=min(df$val_loss), color="black", linetype="dotted") + 197 | geom_vline(xintercept=which.min(df$val_loss), color="black", linetype="dotted") + 198 | labs(x="Epoch", y="Loss") + 199 | theme_bw() 200 | g2 <- ggplot(df, aes(x=epoch)) + 201 | geom_line(aes(y=accuracy), color="steelblue") + 202 | geom_line(aes(y=val_accuracy), color="tomato") + 203 | geom_hline(yintercept=max(df$val_accuracy), color="black", linetype="dotted") + 204 | labs(x="Epoch", y="Accuracy") + 205 | theme_bw() 206 | grid.arrange(g1, g2, ncol=2) 207 | 208 | 209 | g <- arrangeGrob(g1, g2, ncol=2) 210 | ggsave(file=file.path("..", "figures", "chapter_11", "fashion-mnist-learning.pdf"), 211 | g, width=5, height=2.5, units="in") 212 | 213 | #### Applying the Predictive Model 214 | 215 | model <- load_model_tf("cnn-model.tf") 216 | 217 | 218 | propensities <- predict(model, x_valid) 219 | propensities[1:5, ] 220 | 221 | # convert to class using winner takes all 222 | predClass <- apply(propensities, 1, which.max) 223 | predClass[1:5] 224 | 225 | # confusion matrix 226 | caret::confusionMatrix(factor(predClass), factor(fashion_mnist$test$y + 1)) 227 | 228 | 229 | -------------------------------------------------------------------------------- /R/chap12.R: -------------------------------------------------------------------------------- 1 | 2 | if (!require(mlba)) { 3 | library(devtools) 4 | install_github("gedeck/mlba/mlba", force=TRUE) 5 | } 6 | 7 | # Discriminant Analysis 8 | ## Introduction 9 | ### Example 1: Riding Mowers 10 | 11 | library(ggplot2) 12 | mowers.df <- mlba::RidingMowers 13 | g <- ggplot(mowers.df, mapping=aes(x=Income, y=Lot_Size, color=Ownership, fill=Ownership)) + 14 | geom_point(size=4) + 15 | geom_abline(intercept=40, slope=-0.34) + 16 | scale_shape_manual(values = c(15, 21)) + 17 | scale_color_manual(values = c('darkorange', 'steelblue')) + 18 | scale_fill_manual(values = c('darkorange', 'lightblue')) 19 | 20 | g 21 | 22 | 
ggsave(file=file.path("..", "figures", "chapter_12", "riding-mower.pdf"), 23 | g + theme_bw(), width=6, height=4, units="in") 24 | 25 | ### Example 2: Personal Loan Acceptance 26 | 27 | library(gridExtra) 28 | makePlot <- function(df, title, alpha) { 29 | no_personal_loan <- subset(df, Personal.Loan == 0) 30 | personal_loan <- subset(df, Personal.Loan == 1) 31 | 32 | g <- ggplot(universal.df, aes(x=Income, y=CCAvg, color=Personal.Loan)) + 33 | geom_point(aes(color="nonacceptor"), data=no_personal_loan, alpha=alpha) + 34 | geom_point(aes(color="acceptor"), data=personal_loan) + 35 | labs(title=title, colour="Personal Loan", x='Annual income ($000s)', 36 | y='Monthly average credit card spending ($000s)') + 37 | scale_color_manual(values=c("lightblue", "steelblue"), 38 | guide=guide_legend(override.aes=list(size=3, alpha=1))) + 39 | scale_x_log10() + 40 | scale_y_log10() + 41 | theme_bw() 42 | 43 | return (g) 44 | } 45 | 46 | set.seed(1) 47 | universal.df <- mlba::UniversalBank 48 | idx <- sample(dim(universal.df)[1], 200) 49 | g1 <- makePlot(universal.df[idx, ], 'Sample of 200 customers', 1.0) + 50 | theme(legend.position = c(0.2, 0.85)) 51 | g2 <- makePlot(universal.df, 'All 5000 customers', 0.5) + 52 | guides(color="none") 53 | grid.arrange(g1, g2, ncol=2) 54 | 55 | g <- arrangeGrob(g1, g2, ncol=2) 56 | ggsave(file=file.path("..", "figures", "chapter_12", "personalLoan_sampled.pdf"), 57 | g, width=8, height=4, units="in") 58 | 59 | ## Fisher's Linear Classification Functions 60 | 61 | library(caret) 62 | mowers.df <- mlba::RidingMowers 63 | trControl <- caret::trainControl(method='none') 64 | model <- train(Ownership ~ Income + Lot_Size, data=mowers.df, 65 | method='lda', trControl=trControl) 66 | model$finalModel # access the wrapped LDA model 67 | 68 | # DiscriMiner exposes the Fisher's linear classification function 69 | library(DiscriMiner) 70 | mowers.df <- mlba::RidingMowers 71 | da.mower <- linDA(mowers.df[,1:2], mowers.df[,3]) 72 | da.mower$functions 73 | 74 | 75 | da.mower <- linDA(mowers.df[,1:2], mowers.df[,3]) 76 | # compute propensities manually (below); or, use lda() in package MASS or caret with predict() 77 | propensity.owner <- exp(da.mower$scores[,2])/(exp(da.mower$scores[,1])+exp(da.mower$scores[,2])) 78 | data.frame(Actual=mowers.df$Ownership, Predicted=da.mower$classification, 79 | da.mower$scores, propensity.owner=propensity.owner) 80 | 81 | 82 | library(ggplot2) 83 | da.mower <- train(Ownership ~ Income + Lot_Size, data=mowers.df, 84 | method='lda', trControl=trControl) 85 | means <- colSums(da.mower$finalModel$means) / 2 86 | sIncome <- da.mower$finalModel$scaling['Income', 'LD1'] 87 | sLotSize <- da.mower$finalModel$scaling['Lot_Size', 'LD1'] 88 | m <- - sIncome / sLotSize 89 | y0 <- means['Lot_Size'] - m * means['Income'] 90 | 91 | mowers.df <- mlba::RidingMowers 92 | g <- ggplot(mowers.df, mapping=aes(x=Income, y=Lot_Size, color=Ownership, fill=Ownership)) + 93 | geom_point(size=4) + 94 | geom_point(data=data.frame(da.mower$finalModel$means), color='black', fill='black', shape=4, size=3) + 95 | geom_abline(aes(linetype='ad hoc line', intercept=40, slope=-0.34), color='darkgrey') + 96 | geom_abline(aes(linetype='LDA line', intercept=y0, slope=m)) + 97 | scale_shape_manual(values = c(15, 21)) + 98 | scale_color_manual(values = c('darkorange', 'steelblue')) + 99 | scale_fill_manual(values = c('darkorange', 'lightblue')) + 100 | scale_linetype_manual(name='Linetype', values=c(2, 1), labels=c('ad hoc line', 'LDA line')) + 101 | guides(fill = guide_legend(order = 1), 
color = guide_legend(order = 1), 102 | linetype = guide_legend(order = 2)) 103 | g 104 | ggsave(file=file.path("..", "figures", "chapter_12", "LDA-riding-mower.pdf"), 105 | g + theme_bw(), width=6, height=4, units="in") 106 | 107 | ## Prior Probabilities 108 | 109 | trControl <- caret::trainControl(method='none') 110 | model <- train(Ownership ~ Income + Lot_Size, data=mowers.df, 111 | method='lda', trControl=trControl) 112 | model.prior <- train(Ownership ~ Income + Lot_Size, data=mowers.df, 113 | method='lda', prior=c(0.85, 0.15), 114 | trControl=trControl) 115 | 116 | family.13 <- mowers.df[13,] 117 | predict(model, family.13) 118 | predict(model.prior, family.13) 119 | 120 | ## Classifying More Than Two Classes 121 | ### Example 3: Medical Dispatch to Accident Scenes 122 | 123 | library(DiscriMiner) 124 | library(caret) 125 | 126 | accidents.df <- mlba::Accidents 127 | lda.model <- linDA(accidents.df[,1:10], accidents.df[,11]) 128 | lda.model$functions 129 | confusionMatrix(as.factor(lda.model$classification), as.factor(accidents.df$MAX_SEV)) 130 | 131 | 132 | propensity <- exp(lda.model$scores[,1:3])/ 133 | (exp(lda.model$scores[,1])+exp(lda.model$scores[,2])+exp(lda.model$scores[,3])) 134 | 135 | res <- data.frame(Actual = accidents.df$MAX_SEV, 136 | Classification = lda.model$classification, 137 | Score = round(lda.model$scores,2), 138 | Propensity = round(propensity,2)) 139 | head(res) 140 | 141 | 142 | 143 | 144 | library(tidyverse) 145 | accidents.df <- mlba::Accidents %>% 146 | mutate(MAX_SEV = factor(MAX_SEV)) 147 | da.model <- train(MAX_SEV ~ ., data=accidents.df, method='lda') 148 | res <- data.frame(Actual=accidents.df$MAX_SEV, 149 | Classification=predict(da.model), 150 | Propensity=predict(da.model, type='prob') %>% round(2)) 151 | head(res) 152 | 153 | 154 | -------------------------------------------------------------------------------- /R/chap13.R: -------------------------------------------------------------------------------- 1 | 2 | if (!require(mlba)) { 3 | library(devtools) 4 | install_github("gedeck/mlba/mlba", force=TRUE) 5 | } 6 | 7 | # Generating, Comparing, and Combining Multiple Models 8 | ## Ensembles 9 | ### Bagging and Boosting in R 10 | #### Combining Propensities 11 | 12 | library(tidyverse) 13 | library(adabag) 14 | library(rpart) 15 | library(caret) 16 | 17 | set.seed(1) 18 | # load and preprocess the data 19 | bank.df <- mlba::UniversalBank %>% 20 | select(-c(ID, ZIP.Code)) %>% 21 | mutate( 22 | Personal.Loan = factor(Personal.Loan, levels=c(0, 1), labels=c("No", "Yes")) 23 | ) 24 | 25 | # partition the data 26 | train.index <- sample(c(1:dim(bank.df)[1]), dim(bank.df)[1]*0.6) 27 | train.df <- bank.df[train.index, ] 28 | holdout.df <- bank.df[-train.index, ] 29 | 30 | # single tree (rpart) 31 | tr <- rpart(Personal.Loan ~ ., data=train.df) 32 | 33 | # bagging and boosting using adabag 34 | bag <- bagging(Personal.Loan ~ ., data=train.df) 35 | boost <- boosting(Personal.Loan ~ ., data=train.df) 36 | 37 | # bagging and boosting using randomForest and xgboost with parameter tuning 38 | bag.rf <- train(Personal.Loan ~ ., data=train.df, method="rf") 39 | boost.xgb <- train(Personal.Loan ~ ., data=train.df, method="xgbTree", verbosity=0) 40 | 41 | 42 | library(ROCR) 43 | rocCurveData <- function(prob, data) { 44 | predob <- prediction(prob, data$Personal.Loan) 45 | perf <- performance(predob, "tpr", "fpr") 46 | return (data.frame(tpr=perf@x.values[[1]], fpr=perf@y.values[[1]])) 47 | } 48 | performance.df <- rbind( 49 | cbind(rocCurveData(predict(tr, 
holdout.df, type="prob")[,"Yes"], holdout.df), model="Single tree"), 50 | cbind(rocCurveData(predict(bag, holdout.df)$prob[, 2], holdout.df), model="Bagging"), 51 | cbind(rocCurveData(predict(boost, holdout.df)$prob[, 2], holdout.df), model="Boosting") 52 | ) 53 | colors <- c("Single tree"="grey", "Bagging"="blue", "Boosting"="tomato") 54 | g1 <- ggplot(performance.df, aes(x=tpr, y=fpr, color=model)) + 55 | geom_line() + 56 | scale_color_manual(values=colors) + 57 | geom_segment(aes(x=0, y=0, xend=1, yend=1), color="grey", linetype="dashed") + 58 | labs(x="1 - Specificity", y="Sensitivity", color="Model") 59 | g1 60 | 61 | performance.df <- rbind( 62 | cbind(rocCurveData(predict(tr, holdout.df, type="prob")[,"Yes"], holdout.df), 63 | model="Single tree"), 64 | cbind(rocCurveData(predict(bag.rf, holdout.df, type="prob")[,"Yes"], holdout.df), 65 | model="Random forest"), 66 | cbind(rocCurveData(predict(boost.xgb, holdout.df, type="prob")[,"Yes"], holdout.df), 67 | model="xgboost") 68 | ) 69 | colors <- c("Single tree"="grey", "Random forest"="blue", "xgboost"="tomato") 70 | g2 <- ggplot(performance.df, aes(x=tpr, y=fpr, color=model)) + 71 | geom_line() + 72 | scale_color_manual(values=colors) + 73 | geom_segment(aes(x=0, y=0, xend=1, yend=1), color="grey", linetype="dashed") + 74 | labs(x="1 - Specificity", y="Sensitivity", color="Model") 75 | g2 76 | library(gridExtra) 77 | g <- arrangeGrob(g1 + theme_bw(), g2 + theme_bw(), ncol=2, widths=c(0.49, 0.51)) 78 | ggsave(file=file.path("..", "figures", "chapter_13", "bagging-boosting.pdf"), 79 | g, width=8, height=3, units="in") 80 | 81 | ## Automated Machine Learning (AutoML) 82 | ### AutoML: Explore and Clean Data 83 | 84 | library(tidyverse) 85 | 86 | # load and preprocess the data 87 | bank.df <- mlba::UniversalBank %>% 88 | # Drop ID and zip code columns. 
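  # (ID is only a record identifier and ZIP.Code is a high-cardinality label;
  # neither is expected to carry predictive information for the loan outcome)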
89 | select(-c(ID, ZIP.Code)) %>% 90 | # convert Personal.Loan to a factor with labels Yes and No 91 | mutate(Personal.Loan = factor(Personal.Loan, levels=c(0, 1), labels=c("No", "Yes"))) 92 | 93 | # partition the data 94 | set.seed(1) 95 | idx <- caret::createDataPartition(bank.df$Personal.Loan, p=0.6, list=FALSE) 96 | train.df <- bank.df[idx, ] 97 | holdout.df <- bank.df[-idx, ] 98 | 99 | 100 | library(h2o) 101 | 102 | # Start the H2O cluster (locally) 103 | h2o.init() 104 | 105 | train.h2o <- as.h2o(train.df) 106 | holdout.h2o <- as.h2o(holdout.df) 107 | 108 | 109 | 110 | ### AutoML: Choose Features and Machine Learning Methods 111 | 112 | # identify outcome and predictors 113 | y <- "Personal.Loan" 114 | x <- setdiff(names(train.df), y) 115 | 116 | # run AutoML for 20 base models 117 | aml <- h2o.automl(x=x, y=y, training_frame=train.h2o, 118 | max_models=20, exclude_algos=c("DeepLearning"), 119 | seed=1) 120 | aml.balanced <- h2o.automl(x=x, y=y, training_frame=train.h2o, 121 | max_models=20, exclude_algos=c("DeepLearning"), 122 | balance_classes=TRUE, 123 | seed=1) 124 | 125 | aml 126 | 127 | 128 | aml.balanced 129 | 130 | ### AutoML: Evaluate Model Performance 131 | 132 | h2o.confusionMatrix(aml@leader, holdout.h2o) 133 | h2o.confusionMatrix(aml.balanced@leader, holdout.h2o) 134 | 135 | ## Explaining Model Predictions 136 | ### Explaining Model Predictions: LIME 137 | 138 | cases <- c('3055', '3358', # predicted Yes 139 | '2', '1178') # predicted No 140 | explainer <- lime::lime(train.df, aml@leader, bin_continuous=TRUE, quantile_bins=FALSE) 141 | explanations <- lime::explain(holdout.df[cases,], explainer, n_labels=1, n_features=8) 142 | 143 | lime::plot_features(explanations, ncol=2) 144 | 145 | 146 | pdf(file=file.path("..", "figures", "chapter_13", "lime-analysis.pdf"), 147 | width=7, height=6) 148 | lime::plot_features(explanations, ncol=2) 149 | dev.off() 150 | -------------------------------------------------------------------------------- /R/chap14.R: -------------------------------------------------------------------------------- 1 | 2 | if (!require(mlba)) { 3 | library(devtools) 4 | install_github("gedeck/mlba/mlba", force=TRUE) 5 | } 6 | 7 | # Interventions: Experiments, Uplift Models, and Reinforcement Learning 8 | ## A/B Testing 9 | ### The statistical test for comparing two groups (T-test) 10 | 11 | pt(q=2.828, df=1998, lower.tail=FALSE) 12 | pt(q=2.626, df=1998, lower.tail=FALSE) 13 | 14 | ## Uplift (Persuasion) Modeling 15 | ### Computing Uplift with R 16 | 17 | library(tidyverse) 18 | # load and preprocess the data 19 | predictors <- c("AGE", "NH_WHITE", "COMM_PT", "H_F1", "REG_DAYS", 20 | "PR_PELIG", "E_PELIG", "POLITICALC", "MESSAGE_A") 21 | outcome <- "MOVED_AD" 22 | voter.df <- mlba::VoterPersuasion %>% 23 | select(all_of(c(predictors, outcome))) 24 | 25 | 26 | set.seed(1) 27 | nrows <- dim(voter.df)[1] 28 | train.index <- sample(1:nrows, nrows * 0.6) 29 | train.df <- voter.df[train.index, ] 30 | holdout.df <- voter.df[-train.index, ] 31 | 32 | # build a random forest model using caret 33 | train_control <- caret::trainControl(method="none") 34 | model <- caret::train(MOVED_AD ~ ., data=train.df, 35 | trControl=train_control, 36 | method="rf") 37 | 38 | # calculating the uplift 39 | uplift_df <- data.frame(holdout.df) 40 | uplift_df$MESSAGE_A <- 1 41 | predTreatment <- predict(model, newdata=uplift_df, type="prob") 42 | uplift_df$MESSAGE_A <- 0 43 | predControl <- predict(model, newdata=uplift_df, type="prob") 44 | upliftResult <- data.frame( 45 | probMessage = 
predTreatment[, 1], 46 | probNoMessage = predControl[, 1] 47 | ) 48 | upliftResult$uplift <- upliftResult$probMessage - upliftResult$probNoMessage 49 | head(upliftResult) 50 | 51 | 52 | 53 | ## Reinforcement Learning 54 | ### Example of using a Contextual Multi-Arm Bandit for Movie Recommendations 55 | 56 | library(tidyverse) 57 | library(mlba) 58 | library(contextual) 59 | library(data.table) 60 | library(splitstackshape) 61 | 62 | # preprocess movies data to create indicator variables for the different genres 63 | movies_dat <- as.data.table(mlba::MovieLensMovies) 64 | movies_dat <- splitstackshape::cSplit_e(movies_dat, "genres", sep="|", type="character", 65 | fill=0, drop=TRUE) 66 | movies_dat[[3]] <- NULL # deletes the third column 67 | 68 | ratings_dat <- as.data.table(mlba::MovieLensRatings) 69 | all_movies <- ratings_dat[movies_dat, on=c(movieId="movieId")] 70 | all_movies <- na.omit(all_movies, cols=c("movieId", "userId")) 71 | # renumber userId to sequential numbers starting at 1 72 | all_movies[, userId := as.numeric(as.factor(userId))] 73 | 74 | # find the top-50 most frequently rated movies 75 | top_50 <- all_movies %>% 76 | count(movieId) %>% 77 | slice_max(n, n=50) %>% 78 | pull(movieId) 79 | top_50_movies <- all_movies[movieId %in% top_50] 80 | # renumber movieId to sequential numbers starting at 1 81 | top_50_movies[, movieId := as.numeric(as.factor(movieId))] 82 | 83 | # create profile of genres for each movie in the top-50 (arm_features) 84 | arm_features <- top_50_movies %>% 85 | select(-c(userId, rating, timestamp, title)) %>% 86 | # select one row for each movieId 87 | group_by(movieId) %>% slice(1) %>% ungroup() 88 | 89 | # for each user, create their profile of genre preferences based on 90 | # their viewed movies that are not in the top-50 (user_features) 91 | user_features <- all_movies %>% 92 | filter(! movieId %in% top_50) %>% # restrict to movies not in the top-50 93 | select(-c(movieId, rating, timestamp, title)) %>% 94 | # for each user, sum 95 | group_by(userId) %>% 96 | summarise_all(sum) %>% 97 | # normalize user profile 98 | group_by(userId) %>% 99 | mutate( 100 | total = sum(c_across(genres_Action:genres_Western)), 101 | across(genres_Action:genres_Western, ~ ./total) 102 | ) %>% 103 | select(-c(total)) %>% 104 | as.data.table() 105 | 106 | 107 | # add users who only rated top-50 movies 108 | # their genre preference profile is set to 0 for all genres 109 | all_users <- as.data.table(unique(all_movies$userId)) 110 | user_features <- user_features[all_users, on=c(userId="V1")] 111 | user_features[is.na(user_features)] <- 0 112 | setorder(user_features, userId) 113 | 114 | 115 | 116 | 117 | # prepare the data for use with the contextual package 118 | top_50_movies[, t := .I] 119 | top_50_movies[, sim := 1] 120 | top_50_movies[, agent := "Offline"] 121 | top_50_movies[, choice := movieId] 122 | top_50_movies[, reward := ifelse(rating <= 4, 0, 1)] 123 | setorder(top_50_movies,timestamp, title) 124 | 125 | # the bandit samples users with their genre preferences (user_features), 126 | # movie choices (choice), and ratings. 
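# (roughly speaking, this is an offline "replay" evaluation: at each step the
# policy picks one of the 50 movies, and a reward is observed only when the
# sampled user actually rated that movie in the logged data)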
127 | # each movie is characterized by the genre profile (arm_features) 128 | # these data are used to train the agent 129 | environment <- OfflineLookupReplayEvaluatorBandit$new( 130 | top_50_movies, 131 | k = 50, 132 | unique_col = "userId", 133 | unique_lookup = user_features, 134 | shared_lookup = arm_features) 135 | 136 | # define list of strategies to evaluate 137 | agents <-list( 138 | Agent$new(RandomPolicy$new(), environment, "Random"), 139 | Agent$new(LinUCBDisjointOptimizedPolicy$new(2.1), environment, "LinUCB Dis")) 140 | 141 | # setup and run simulation 142 | simulation <- Simulator$new( 143 | agents = agents, 144 | simulations = 20, 145 | horizon = 10000L, 146 | save_interval = 1) 147 | results <- simulation$run() 148 | 149 | plot(results, type="cumulative", regret=FALSE, rate=TRUE, 150 | legend_position="topleft", disp="sd") 151 | 152 | 153 | pdf(file=file.path("..", "figures", "chapter_14", "mab-movielens.pdf"), width=6, height=4) 154 | plot(results, type="cumulative", regret=FALSE, rate=TRUE, 155 | legend_position="topleft", disp="sd") 156 | dev.off() 157 | -------------------------------------------------------------------------------- /R/chap15.R: -------------------------------------------------------------------------------- 1 | 2 | if (!require(mlba)) { 3 | library(devtools) 4 | install_github("gedeck/mlba/mlba", force=TRUE) 5 | } 6 | 7 | # Association Rules and Collaborative Filtering 8 | ## Association Rules 9 | ### The Process of Rule Selection 10 | #### Lift 11 | 12 | library(arules) 13 | fp.df <- mlba::Faceplate 14 | 15 | # remove first column and convert to matrix 16 | fp.mat <- as.matrix(fp.df[, -1]) 17 | 18 | # convert the binary incidence matrix into a transactions database 19 | fp.trans <- as(fp.mat, "transactions") 20 | inspect(fp.trans) 21 | 22 | ## get rules 23 | # when running apriori(), include the minimum support, minimum confidence, and target 24 | # as arguments. 
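# for a rule A => B: support = P(A and B), confidence = P(A and B) / P(A),
# and lift = confidence / P(B); supp=0.2 and conf=0.5 below are the minimum
# thresholds a rule must meet to be reported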
25 | rules <- apriori(fp.trans, parameter = list(supp = 0.2, conf = 0.5, target = "rules")) 26 | 27 | # inspect the first six rules, sorted by their lift 28 | inspect(head(sort(rules, by = "lift"), n = 6)) 29 | 30 | ### Example 2: Rules for Similar Book Purchases 31 | 32 | all.books.df <- mlba::CharlesBookClub 33 | 34 | # create a binary incidence matrix 35 | count.books.df <- all.books.df[, 8:18] 36 | incid.books.mat <- as.matrix(count.books.df > 0) 37 | 38 | # convert the binary incidence matrix into a transactions database 39 | books.trans <- as(incid.books.mat, "transactions") 40 | inspect(books.trans[1:10]) 41 | 42 | # plot data 43 | itemFrequencyPlot(books.trans) 44 | 45 | # run apriori function 46 | rules <- apriori(books.trans, 47 | parameter = list(supp= 200/4000, conf = 0.5, target = "rules")) 48 | 49 | # inspect top-30 rules sorted by lift 50 | inspect(head(sort(rules, by = "lift"), n=30)) 51 | 52 | ## Collaborative Filtering 53 | ### Example 4: Predicting Movie Ratings with MovieLens Data 54 | 55 | library(recommenderlab) 56 | 57 | # download MovieLens data 58 | ratings <- mlba::MovieLensRatings 59 | movies <- mlba::MovieLensMovies 60 | 61 | # convert ratings to rating matrix 62 | idxUserId <- sort(unique(ratings$userId)) 63 | idxMovieId <- sort(unique(ratings$movieId)) 64 | m <- matrix(NA, nrow=length(idxUserId), ncol=length(idxMovieId), 65 | dimnames=list( 66 | user=paste("u", 1:length(idxUserId), sep=''), 67 | item=movies$title[match(idxMovieId, movies$movieId)] 68 | )) 69 | for (i in 1:nrow(ratings)) { 70 | rating <- ratings[i,] 71 | irow <- match(rating$userId, idxUserId) 72 | icol <- match(rating$movieId, idxMovieId) 73 | m[irow, icol] <- rating$rating 74 | } 75 | ratingMatrix <- as(m, "realRatingMatrix") 76 | 77 | 78 | 79 | 80 | # UBCF model and prediction 81 | recommender <- Recommender(ratingMatrix[-1], method="UBCF") 82 | pred <- predict(recommender, ratingMatrix[1]) 83 | as(pred, 'list') 84 | 85 | # IBCF model and prediction 86 | recommender <- Recommender(ratingMatrix[-1], method="IBCF") 87 | pred <- predict(recommender, ratingMatrix[1]) 88 | as(pred, 'list') 89 | 90 | 91 | set.seed(1) 92 | e <- evaluationScheme(ratingMatrix, method="split", train=0.9, given=10) 93 | 94 | r1 <- Recommender(getData(e, "train"), "UBCF") 95 | r2 <- Recommender(getData(e, "train"), "IBCF") 96 | r3 <- Recommender(getData(e, "train"), "RANDOM") 97 | 98 | p1 <- predict(r1, getData(e, "known"), type="ratings") 99 | p2 <- predict(r2, getData(e, "known"), type="ratings") 100 | p3 <- predict(r3, getData(e, "known"), type="ratings") 101 | error <- rbind( 102 | UBCF = calcPredictionAccuracy(p1, getData(e, "unknown")), 103 | IBCF = calcPredictionAccuracy(p2, getData(e, "unknown")), 104 | RANDOM = calcPredictionAccuracy(p3, getData(e, "unknown")) 105 | ) 106 | error 107 | -------------------------------------------------------------------------------- /R/chap16.R: -------------------------------------------------------------------------------- 1 | 2 | if (!require(mlba)) { 3 | library(devtools) 4 | install_github("gedeck/mlba/mlba", force=TRUE) 5 | } 6 | 7 | # Cluster Analysis 8 | ## Measuring Distance Between Two Records 9 | ### Euclidean Distance 10 | 11 | library(tidyverse) 12 | # load data and use Company column as row names 13 | utilities.df <- mlba::Utilities %>% 14 | column_to_rownames("Company") 15 | 16 | # compute Euclidean distance 17 | # (to compute other distance measures, change the value in the method argument) 18 | d <- dist(utilities.df, method = "euclidean") 19 | 20 | ### 
Normalizing Numerical Variables 21 | 22 | # normalize input variables 23 | utilities.df.norm <- scale(utilities.df) 24 | 25 | # compute normalized distance based on Sales and Fuel Cost 26 | d.norm <- dist(utilities.df.norm[,c("Sales","Fuel_Cost")], method="euclidean") 27 | 28 | ## Hierarchical (Agglomerative) Clustering 29 | ### Dendrograms: Displaying Clustering Process and Results 30 | 31 | library(ggplot2) 32 | library(ggdendro) 33 | d.norm <- dist(utilities.df.norm, method="euclidean") 34 | 35 | # in hclust() set argument method \galit{to} 36 | # "ward.D", "single", "complete", "average", "median", or "centroid" 37 | hc1 <- hclust(d.norm, method="single") 38 | plot(hc1, hang=-1, ann=FALSE) # use baseR 39 | ggdendrogram(hc1) # use ggdendro package (shown in figure below) 40 | hc2 <- hclust(d.norm, method="average") 41 | plot(hc2, hang=-1, ann=FALSE) 42 | ggdendrogram(hc2) 43 | 44 | 45 | library(gridExtra) 46 | addCutline <- function(g, hc, ncluster) { 47 | heights <- rev(hc$height) 48 | cut_at <- 0.5 * (heights[ncluster] + heights[ncluster - 1]) 49 | return (g + geom_hline(yintercept=cut_at, color='red', linetype=2)) 50 | } 51 | g1 <- ggdendrogram(hc1) 52 | g2 <- ggdendrogram(hc2) 53 | grid.arrange(addCutline(g1, hc1, 6), addCutline(g2, hc2, 6), nrow=2) 54 | g <- arrangeGrob(addCutline(g1, hc1, 6), addCutline(g2, hc2, 6), nrow=2) 55 | ggsave(file=file.path("..", "figures", "chapter_16", "utilities-dendrograms.pdf"), 56 | g, width=5, height=8, units="in") 57 | 58 | 59 | memb <- cutree(hc1, k = 6) 60 | memb 61 | memb <- cutree(hc2, k = 6) 62 | memb 63 | 64 | ### Validating Clusters 65 | 66 | # set labels as cluster membership and utility name 67 | row.names(utilities.df.norm) <- paste(memb, ": ", row.names(utilities.df), sep = "") 68 | 69 | # plot heatmap 70 | heatmap(utilities.df.norm, Colv=NA, hclustfun=hclust) 71 | 72 | 73 | # grey scale 74 | # rev() reverses the color mapping to large = dark 75 | heatmap(as.matrix(utilities.df.norm), Colv = NA, hclustfun = hclust, 76 | col=rev(paste("gray",1:99,sep=""))) 77 | 78 | pdf(file=file.path("..", "figures", "chapter_16", "utilities-heatmap.pdf"), 79 | width=5, height=5) 80 | heatmap(utilities.df.norm, Colv=NA, hclustfun=hclust) 81 | dev.off() 82 | 83 | ## Non-hierarchical Clustering: The 84 | ### Choosing the Number of Clusters ($k$) 85 | 86 | set.seed(123) # set random seed for reproducability 87 | # load and preprocess data 88 | utilities.df <- mlba::Utilities %>% 89 | column_to_rownames("Company") 90 | 91 | # normalized distance: 92 | utilities.df.norm <- scale(utilities.df) 93 | 94 | # run kmeans algorithm 95 | km <- kmeans(utilities.df.norm, 6) 96 | 97 | # show cluster membership 98 | sort(km$cluster) 99 | 100 | 101 | # centroids 102 | km$centers 103 | # within-cluster sum of squares 104 | km$withinss 105 | # cluster size 106 | km$size 107 | 108 | 109 | library(GGally) 110 | centroids <- data.frame(km$centers) 111 | centroids['Cluster'] = paste('Cluster', seq(1, 6)) 112 | 113 | ggparcoord(centroids, columns=1:8, groupColumn='Cluster', showPoints=TRUE) + 114 | scale_color_viridis_d() + 115 | labs(x='Variable', y='Value') 116 | 117 | 118 | ggsave(file=file.path("..", "figures", "chapter_16", "utilities-clusterProfile.pdf"), 119 | last_plot() + theme_bw(), width=8.5, height=3.2, units="in") 120 | 121 | 122 | result <- tibble() 123 | for (k in 1:6) { 124 | km <- kmeans(utilities.df.norm, k) 125 | result <- bind_rows(result, tibble(k=k, average_withinss=mean(km$withinss))) 126 | } 127 | 128 | ggplot(result, aes(x=k, y=average_withinss)) + 129 
| geom_line() + 130 | geom_point() + 131 | labs(y="Average within-cluster squared distance", 132 | x=expression(paste("Number of clusters ", italic("k")))) + 133 | theme_bw() 134 | ggsave(file=file.path("..", "figures", "chapter_16", "utilities-ellbow.pdf"), 135 | last_plot(), width=4, height=4, units="in") 136 | 137 | 138 | dist(km$centers) 139 | -------------------------------------------------------------------------------- /R/chap17.R: -------------------------------------------------------------------------------- 1 | 2 | if (!require(mlba)) { 3 | library(devtools) 4 | install_github("gedeck/mlba/mlba", force=TRUE) 5 | } 6 | 7 | # Handling Time Series 8 | ## Time Series Components 9 | ### Example: Ridership on Amtrak Trains 10 | 11 | library(forecast) 12 | library(ggplot2) 13 | Amtrak.data <- mlba::Amtrak 14 | 15 | # create time series object using ts() 16 | # ts() takes three arguments: start, end, and freq. 17 | # with monthly data, the frequency of periods per cycle is 12 (per year). 18 | # arguments start and end are (cycle [=year] number, seasonal period [=month] number) pairs. 19 | # here start is Jan 1991: start = c(1991, 1); end is Mar 2004: end = c(2004, 3). 20 | ridership.ts <- ts(Amtrak.data$Ridership, 21 | start = c(1991, 1), end = c(2004, 3), freq = 12) 22 | 23 | # plot the series using the autoplot function to make use of ggplot 24 | autoplot(ridership.ts, xlab="Time", ylab="Ridership (in 000s)") + 25 | scale_y_continuous(limits=c(1300, 2300)) 26 | 27 | 28 | g <- last_plot() + 29 | scale_x_continuous(n.breaks=10) + 30 | theme_bw() 31 | ggsave(file=file.path("..", "figures", "chapter_17", "AmtrakFirstPlot.pdf"), 32 | g, width=6, height=3, units="in") 33 | 34 | 35 | library(gridExtra) 36 | library(lubridate) 37 | library(zoo) 38 | 39 | BareggTunnel <- mlba::BareggTunnel 40 | # convert Day information to a dates object 41 | dates <- as.POSIXct(BareggTunnel$Day, format='%d %b %Y') 42 | tunnel.ts <- ts(BareggTunnel$Number.of.vehicles, 43 | start=c(2003, yday(dates[1])), 44 | frequency=365) 45 | 46 | options(scipen=999) 47 | g1 <- autoplot(tunnel.ts, xlab="Time", ylab="Number of vehicles") + 48 | scale_x_yearmon() + 49 | scale_y_continuous(labels = scales::comma) 50 | g2 <- autoplot(window(tunnel.ts, 51 | start=c(2004, yday(ISOdate(2004, 2, 1))), 52 | end=c(2004, yday(ISOdate(2004, 6, 1)))), 53 | xlab="Time", ylab="Number of vehicles") + 54 | scale_x_yearmon() + 55 | scale_y_continuous(labels = scales::comma) 56 | 57 | grid.arrange(g1, g2, nrow=2) 58 | g <- arrangeGrob(g1 + theme_bw(), g2 + theme_bw()) 59 | ggsave(file=file.path("..", "figures", "chapter_17", "TS-TunnelPlots.pdf"), g, 60 | width=6, height=4) 61 | 62 | 63 | library(gridExtra) 64 | 65 | # to zoom in to a certain period, use window() to create a new, shorter time series 66 | # we create a new, 3-year time series of ridership.ts from Jan 1997 to Dec 1999 67 | ridership.ts.3yrs <- window(ridership.ts, start = c(1997, 1), end = c(1999, 12)) 68 | g1 <- autoplot(ridership.ts.3yrs, xlab="Time", ylab="Ridership (in 000s)") + 69 | scale_y_continuous(limits=c(1300, 2300)) 70 | 71 | # fit a trend line to the time series 72 | g2 <- autoplot(ridership.ts, xlab="Time", ylab="Ridership (in 000s)") + 73 | scale_y_continuous(limits=c(1300, 2300)) + 74 | geom_smooth(method="lm", formula=y~poly(x, 2)) 75 | 76 | grid.arrange(g1, g2, nrow=2) 77 | 78 | 79 | g <- arrangeGrob(g1 + theme_bw(), 80 | g2 + scale_x_continuous(n.breaks=10) + theme_bw()) 81 | ggsave(file=file.path("..", "figures", "chapter_17", "AmtrakZoomPlots.pdf"), g, 82 | 
width=6, height=6) 83 | 84 | # we can also use tslm to create the quadratic fit 85 | ridership.lm <- tslm(ridership.ts ~ trend + I(trend^2)) 86 | autoplot(ridership.ts, xlab="Time", ylab="Ridership (in 000s)") + 87 | scale_y_continuous(limits=c(1300, 2300)) + 88 | autolayer(ridership.lm$fitted.values) 89 | 90 | ## Data Partitioning and Performance Evaluation 91 | ### Benchmark Performance: Naive Forecasts 92 | 93 | nTest <- 36 94 | nTrain <- length(ridership.ts) - nTest 95 | 96 | # partition the data 97 | train.ts <- window(ridership.ts, start = c(1991, 1), end = c(1991, nTrain)) 98 | test.ts <- window(ridership.ts, start = c(1991, nTrain + 1), 99 | end = c(1991, nTrain + nTest)) 100 | 101 | # generate the naive and seasonal naive forecasts 102 | naive.pred <- naive(train.ts, h=nTest) 103 | snaive.pred <- snaive(train.ts, h=nTest) 104 | 105 | # compare the actual values and forecasts for both methods 106 | colData <- "steelblue"; colModel <- "tomato" 107 | autoplot(train.ts, xlab="Time", ylab="Ridership (in 000s$)", color=colData) + 108 | autolayer(test.ts, linetype=2, color=colData) + 109 | autolayer(naive.pred, PI=FALSE, color=colModel, size=0.75) + 110 | autolayer(snaive.pred, PI=FALSE, color=colModel, size=0.75) 111 | 112 | 113 | # for the book visualization add additional annotation 114 | delta <- 1/12 115 | date_t <- time(train.ts)[1] 116 | date_th <- time(test.ts)[1] - delta 117 | date_hf <- tail(time(test.ts), 1) + delta 118 | g <- last_plot() + 119 | geom_vline(xintercept=date_th, color="darkgrey") + geom_vline(xintercept=date_hf, color="darkgrey") + 120 | geom_segment(aes(x=date_t, xend=date_th-delta, y=2300, yend=2300), color="darkgrey") + 121 | geom_segment(aes(x=date_th+delta, xend=date_hf-delta, y=2300, yend=2300), color="darkgrey") + 122 | geom_segment(aes(x=date_hf+delta, xend=date_hf+2, y=2300, yend=2300), color="darkgrey") + 123 | geom_text(aes(x=(date_t+date_th)/2, y=2350, label='Training')) + 124 | geom_text(aes(x=(date_th+date_hf)/2, y=2350, label='Test')) + 125 | geom_text(aes(x=date_hf+1, y=2350, label='Future')) + 126 | scale_x_continuous(n.breaks=10) + 127 | theme_bw() 128 | ggsave(file=file.path("..", "figures", "chapter_17", "AmtrakNaive.pdf"), 129 | g, width=7, height=4.5) 130 | 131 | 132 | accuracy(naive.pred, test.ts) 133 | accuracy(snaive.pred, test.ts) 134 | 135 | ### Generating Future Forecasts 136 | 137 | ts <- ts(mlba::CanadianWorkHours$Hours, 138 | start = c(mlba::CanadianWorkHours$Year[1], 1), freq = 1) 139 | 140 | # plot the series using the autoplot function to make use of ggplot 141 | autoplot(ts, xlab="Year", ylab="Hours per week") 142 | ggsave(file=file.path("..", "figures", "chapter_17", "Exercise-CanadianWorkers.pdf"), 143 | last_plot() + theme_bw(), width=5, height=4, units="in") 144 | -------------------------------------------------------------------------------- /R/chap2.R: -------------------------------------------------------------------------------- 1 | 2 | if (!require(mlba)) { 3 | library(devtools) 4 | install_github("gedeck/mlba/mlba", force=TRUE) 5 | } 6 | options(scipen=999) 7 | 8 | # Overview of the Machine Learning Process 9 | ## Preliminary Steps 10 | ### Loading and Looking at the Data in R 11 | 12 | housing.df <- read.csv('WestRoxbury.csv') # load data from file 13 | housing.df <- mlba::WestRoxbury # load data from mlba package 14 | dim(housing.df) # find the dimension of data frame 15 | head(housing.df) # show the first six rows 16 | View(housing.df) # show all the data in a new tab 17 | 18 | # Practice showing different 
subsets of the data 19 | housing.df[1:10, 1] # show the first 10 rows of the first column only 20 | housing.df[1:10, ] # show the first 10 rows of each of the columns 21 | housing.df[5, 1:10] # show the fifth row of the first 10 columns 22 | housing.df[5, c(1:2, 4, 8:10)] # show the fifth row of some columns 23 | housing.df[, 1] # show the whole first column 24 | housing.df$TOTAL.VALUE # a different way to show the whole first column 25 | housing.df$TOTAL.VALUE[1:10] # show the first 10 rows of the first column 26 | length(housing.df$TOTAL.VALUE) # find the length of the first column 27 | mean(housing.df$TOTAL.VALUE) # find the mean of the first column 28 | summary(housing.df) # find summary statistics for each column 29 | 30 | ### Sampling from a Database 31 | 32 | housing.df <- mlba::WestRoxbury 33 | 34 | # random sample of 5 observations 35 | s <- sample(row.names(housing.df), 5) 36 | housing.df[s,] 37 | 38 | # oversample houses with over 10 rooms 39 | s <- sample(row.names(housing.df), 5, prob=ifelse(housing.df$ROOMS>10, 0.9, 0.01)) 40 | housing.df[s,] 41 | 42 | # rebalance 43 | housing.df$REMODEL <- factor(housing.df$REMODEL) 44 | table(housing.df$REMODEL) 45 | upsampled.df <- caret::upSample(housing.df, housing.df$REMODEL, list=TRUE)$x 46 | table(upsampled.df$REMODEL) 47 | 48 | 49 | 50 | ### Preprocessing and Cleaning the Data 51 | #### Types of Variables 52 | 53 | library(tidyverse) 54 | 55 | # get overview 56 | str(housing.df) 57 | 58 | # make REMODEL a factor variable 59 | housing.df$REMODEL <- factor(housing.df$REMODEL) 60 | str(housing.df$REMODEL) 61 | levels(housing.df$REMODEL) # show factor's categories (levels) 62 | 63 | # use tidyverse to load and preprocess data in one statement 64 | # the %>% operator inserts the result of the expression on the left 65 | # as the first argument into the function on the right 66 | housing.df <- mlba::WestRoxbury %>% 67 | mutate(REMODEL=factor(REMODEL)) 68 | 69 | #### Handling Categorical Variables 70 | 71 | library(fastDummies) 72 | library(tidyverse) 73 | 74 | housing.df <- dummy_cols(mlba::WestRoxbury, 75 | remove_selected_columns=TRUE, # remove the original column 76 | remove_first_dummy=TRUE) # removes the first created dummy variable 77 | housing.df %>% head(2) 78 | 79 | #### Missing Values 80 | 81 | # To illustrate missing data procedures, we first convert a few entries for 82 | # BEDROOMS to NA's. Then we impute these missing values using the median of the 83 | # remaining values. 84 | rows.to.missing <- sample(row.names(housing.df), 10) 85 | housing.df[rows.to.missing,]$BEDROOMS <- NA 86 | summary(housing.df$BEDROOMS) 87 | # Now we have 10 NA's and the median of the remaining values is 3. 88 | 89 | # replace the missing values using the median of the remaining values 90 | # use median() with na.rm=TRUE to ignore missing values when computing the median. 91 | housing.df <- housing.df %>% 92 | replace_na(list(BEDROOMS=median(housing.df$BEDROOMS, na.rm=TRUE))) 93 | 94 | summary(housing.df$BEDROOMS) 95 | 96 | ## Predictive Power and Overfitting 97 | ### Creating and Using Data Partitions 98 | #### Holdout Partition 99 | 100 | housing.df <- mlba::WestRoxbury %>% 101 | mutate(REMODEL=factor(REMODEL)) 102 | 103 | # use set.seed() to get the same partitions when re-running the R code. 
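# (the seed value itself is arbitrary; it only fixes which rows sample() draws below)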
104 | set.seed(1) 105 | 106 | ## partitioning into training (60%) and holdout (40%) 107 | # randomly sample 60% of the row IDs for training; the remaining 40% serve 108 | # as holdout 109 | train.rows <- sample(rownames(housing.df), nrow(housing.df)*0.6) 110 | # collect all the columns with training row ID into training set: 111 | train.df <- housing.df[train.rows, ] 112 | # assign row IDs that are not already in the training set, into holdout 113 | holdout.rows <- setdiff(rownames(housing.df), train.rows) 114 | holdout.df <- housing.df[holdout.rows, ] 115 | 116 | ## partitioning into training (50%), validation (30%), holdout (20%) 117 | # randomly sample 50% of the row IDs for training 118 | train.rows <- sample(rownames(housing.df), nrow(housing.df)*0.5) 119 | 120 | # sample 30% of the row IDs into the validation set, drawing only from records 121 | # not already in the training set 122 | # use setdiff() to find records not already in the training set 123 | valid.rows <- sample(setdiff(rownames(housing.df), train.rows), 124 | nrow(housing.df)*0.3) 125 | 126 | # assign the remaining 20% row IDs serve as holdout 127 | holdout.rows <- setdiff(rownames(housing.df), union(train.rows, valid.rows)) 128 | 129 | # create the 3 data frames by collecting all columns from the appropriate rows 130 | train.df <- housing.df[train.rows, ] 131 | valid.df <- housing.df[valid.rows, ] 132 | holdout.df <- housing.df[holdout.rows, ] 133 | 134 | ## partitioning into training (60%) and holdout (40%) using caret 135 | set.seed(1) 136 | idx <- caret::createDataPartition(housing.df$TOTAL.VALUE, p=0.6, list=FALSE) 137 | train.df <- housing.df[idx, ] 138 | holdout.df <- housing.df[-idx, ] 139 | 140 | 141 | 142 | ## Building a Predictive Model 143 | ### Modeling Process 144 | #### Cross-Validation 145 | 146 | library(tidyverse) 147 | library(mlba) 148 | library(fastDummies) 149 | 150 | housing.df <- mlba::WestRoxbury %>% 151 | # remove rows with missing values 152 | drop_na() %>% 153 | # remove column TAX 154 | select(-TAX) %>% 155 | # make REMODEL a factor and convert to dummy variables 156 | mutate(REMODEL=factor(REMODEL)) %>% 157 | dummy_cols(select_columns=c('REMODEL'), 158 | remove_selected_columns=TRUE, remove_first_dummy=TRUE) 159 | 160 | 161 | set.seed(1) 162 | idx <- caret::createDataPartition(housing.df$TOTAL.VALUE, p=0.6, list=FALSE) 163 | train.df <- housing.df[idx, ] 164 | holdout.df <- housing.df[-idx, ] 165 | 166 | 167 | reg <- lm(TOTAL.VALUE ~ ., data=train.df) 168 | train.res <- data.frame(actual=train.df$TOTAL.VALUE, predicted=reg$fitted.values, 169 | residuals=reg$residuals) 170 | head(train.res) 171 | 172 | 173 | pred <- predict(reg, newdata=holdout.df) 174 | holdout.res <- data.frame(actual=holdout.df$TOTAL.VALUE, predicted=pred, 175 | residuals=holdout.df$TOTAL.VALUE - pred) 176 | head(holdout.res) 177 | 178 | 179 | library(caret) 180 | # compute metrics on training set 181 | data.frame( 182 | ME = round(mean(train.res$residuals), 5), 183 | RMSE = RMSE(pred=train.res$predicted, obs=train.res$actual), 184 | MAE = MAE(pred=train.res$predicted, obs=train.res$actual) 185 | ) 186 | 187 | # compute metrics on holdout set 188 | data.frame( 189 | ME = round(mean(holdout.res$residuals), 5), 190 | RMSE = RMSE(pred=holdout.res$predicted, obs=holdout.res$actual), 191 | MAE = MAE(pred=holdout.res$predicted, obs=holdout.res$actual) 192 | ) 193 | 194 | 195 | # For demonstration purposes, we construct the new.data from the original dataset 196 | housing.df <- mlba::WestRoxbury 197 | new.data <- 
housing.df[100:102, -1] %>% 198 | mutate(REMODEL=factor(REMODEL, levels=c("None", "Old", "Recent"))) %>% 199 | dummy_cols(select_columns=c('REMODEL'), 200 | remove_selected_columns=TRUE, remove_first_dummy=TRUE) 201 | new.data 202 | pred <- predict(reg, newdata = new.data) 203 | pred 204 | -------------------------------------------------------------------------------- /R/chap20.R: -------------------------------------------------------------------------------- 1 | 2 | if (!require(mlba)) { 3 | library(devtools) 4 | install_github("gedeck/mlba/mlba", force=TRUE) 5 | } 6 | set.seed(1) 7 | 8 | # Social Network Analytics 9 | ## Introduction 10 | 11 | library(igraph) 12 | 13 | # define links in data 14 | edges <- rbind( 15 | c("Dave", "Jenny"), c("Peter", "Jenny"), c("John", "Jenny"), 16 | c("Dave", "Peter"), c("Dave", "John"), c("Peter", "Sam"), 17 | c("Sam", "Albert"), c("Peter", "John") 18 | ) 19 | 20 | # generate and plot network 21 | # set argument directed = FALSE in graph.edgelist() to plot an undirected network. 22 | g <- graph.edgelist(edges, directed = FALSE) 23 | plot(g, vertex.size = 5, vertex.label.dist = 2) 24 | 25 | 26 | pdf(file=file.path("..", "figures", "chapter_20", "fig1.pdf"), width=5, height=5) 27 | par(mar=c(0,0,0,1)+.1) 28 | plot(g, vertex.size = 5, vertex.label.dist = 2) 29 | dev.off() 30 | 31 | ## Directed vs. Undirected Networks 32 | 33 | # generate and plot network 34 | # set argument directed = TRUE in graph.edgelist() to plot a directed network. 35 | g <- graph.edgelist(edges, directed = TRUE) 36 | plot(g, vertex.size = 5, vertex.label.dist = 2) 37 | 38 | 39 | pdf(file=file.path("..", "figures", "chapter_20", "fig2.pdf"), width=5, height=5) 40 | par(mar=c(0,0,0,1)+.5) 41 | plot(g, vertex.size = 5, vertex.label.dist=2) 42 | dev.off() 43 | 44 | 45 | edges <- rbind(c("A", "B"), c("B", "C"), c("C", "A")) 46 | g <- graph.edgelist(edges, directed = FALSE) 47 | E(g)$width <- c(20, 5, 5) 48 | plot(g, vertex.size = 5, vertex.label.dist = 2) 49 | 50 | pdf(file=file.path("..", "figures", "chapter_20", "fig3.pdf"), width=2, height=2) 51 | par(mar=c(0,0,0,0)+.2) 52 | plot(g, vertex.size = 5, vertex.label.dist = 2) 53 | dev.off() 54 | # $ 55 | 56 | ## Visualizing and Analyzing Networks 57 | ### Plot Layout 58 | 59 | library(igraph) 60 | drug.df <- mlba::Drug 61 | 62 | # convert edges to edge list matrix 63 | edges <- as.matrix(drug.df[, c(1,2)]) 64 | g <- graph.edgelist(edges,directed=FALSE) 65 | 66 | # plot network 67 | # nodes' size is proportional to their eigenvector centrality 68 | plot(g, vertex.label = NA, vertex.size = eigen_centrality(g)$vector * 20) 69 | 70 | 71 | pdf(file=file.path("..", "figures", "chapter_20", "SNA_Drug_Laundry.pdf"), width=5, height=5) 72 | par(mar=c(0,0,0,0)+.1) 73 | plot(g, vertex.label = NA, vertex.size = eigen_centrality(g)$vector * 20) 74 | dev.off() 75 | 76 | 77 | edges <- rbind( 78 | c("Dave", "Jenny"), c("Peter", "Jenny"), c("John", "Jenny"), 79 | c("Dave", "Peter"), c("Dave", "John"), c("Peter", "Sam"), 80 | c("Sam", "Albert"), c("Peter", "John") 81 | ) 82 | g <- graph.edgelist(edges) 83 | pdf(file=file.path("..", "figures", "chapter_20", "fig5_circle.pdf"), width=3, height=3) 84 | par(mar=c(0,0,0,1)+.7) 85 | plot(g, layout = layout_in_circle, vertex.size = 5, vertex.label.dist = 2) 86 | dev.off() 87 | pdf(file=file.path("..", "figures", "chapter_20", "fig5_grid.pdf"), width=3, height=3) 88 | par(mar=c(0,0,0,1)+.7) 89 | plot(g, layout = layout_on_grid, vertex.size = 5, vertex.label.dist = 2) 90 | dev.off() 91 | 92 | 93 | # Building on the 
code presented in Figure 19.1 94 | plot(g, layout = layout_in_circle, vertex.size = 5, vertex.label.dist = 2) 95 | plot(g, layout = layout_on_grid, vertex.size = 5, vertex.label.dist = 2) 96 | 97 | ## Social Data Metrics and Taxonomy 98 | ### Node-Level Centrality Metrics 99 | 100 | edges <- rbind( 101 | c("Dave", "Jenny"), c("Peter", "Jenny"), c("John", "Jenny"), 102 | c("Dave", "Peter"), c("Dave", "John"), c("Peter", "Sam"), 103 | c("Sam", "Albert"), c("Peter", "John") 104 | ) 105 | g <- graph.edgelist(edges, directed=FALSE) 106 | 107 | degree(g) 108 | betweenness(g) 109 | betweenness(g)/sum(betweenness(g)) 110 | closeness(g) 111 | eigen_centrality(g) 112 | 113 | ### Egocentric Network 114 | 115 | # get Peter's 1-level ego network 116 | # for a 2-level ego network set argument order = 2 in make_ego_graph(). 117 | peter.ego <- make_ego_graph(g, order = 1, nodes = "Peter") 118 | plot(peter.ego[[1]], vertex.size = 1, vertex.label.dist = 0.5) 119 | 120 | 121 | pdf(file=file.path("..", "figures", "chapter_20", "fig6_1.pdf"), width=5, height=5) 122 | par(mar=c(0,0,0,1)+.5) 123 | peter.ego <- make_ego_graph(g, order = 1, nodes = "Peter") 124 | g.ego <- peter.ego[[1]] 125 | V(g.ego)$color <- "orange" 126 | V(g.ego)["Peter"]$color <- "red" 127 | plot(g.ego, vertex.size = 5, vertex.label.dist = 2) 128 | dev.off() 129 | pdf(file=file.path("..", "figures", "chapter_20", "fig6_2.pdf"), width=5, height=5) 130 | par(mar=c(0,0,0,1)+.5) 131 | peter.ego <- make_ego_graph(g, order = 2, nodes = "Peter") 132 | g.ego <- peter.ego[[1]] 133 | V(g.ego)$color <- "orange" 134 | V(g.ego)["Peter"]$color <- "red" 135 | plot(g.ego, vertex.size = 5, vertex.label.dist = 2) 136 | dev.off() 137 | 138 | ### Network Metrics 139 | 140 | degree.distribution(g) # normalized 141 | edge_density(g) 142 | 143 | ## Collecting Social Network Data with R 144 | ### Collaborative Filtering 145 | 146 | library(twitteR) 147 | # replace key and secret number with those you obtained from Twitter 148 | setup_twitter_oauth(consumer_key = "XXX", consumer_secret = "XXX", 149 | access_token = "XXX", access_secret = "XXX") 150 | 151 | # get recent tweets 152 | recent.25.tweets <- searchTwitter("text mining", resultType="recent", n = 25) 153 | 154 | 155 | library(Rfacebook) 156 | # replace the app id and secret number with those you obtained from Facebook 157 | fb_oauth <- fbOAuth(app_id = "XXX", app_secret = "XXX") 158 | fb_oauth_credentials <- fromJSON(names(fb_oauth$credentials)) 159 | 160 | # get recent posts on page "dataminingbook" 161 | fb_page <- getPage(page = "dataminingbook", token = fb_oauth_credentials$access_token) 162 | 163 | # a facebook page contains the following information: 164 | t(t(names(fb_page))) 165 | fb_page[1,] 166 | 167 | # get information about most recent post 168 | post <- getPost(post=fb_page$id[1], n=20, token=fb_oauth_credentials$access_token) 169 | 170 | post$likes 171 | post$comments 172 | 173 | -------------------------------------------------------------------------------- /R/chap21.R: -------------------------------------------------------------------------------- 1 | 2 | if (!require(mlba)) { 3 | library(devtools) 4 | install_github("gedeck/mlba/mlba", force=TRUE) 5 | } 6 | 7 | # Text Mining 8 | ## The Tabular Representation of Text: Term-Document Matrix and ``Bag-of-Words" 9 | 10 | library(tm) 11 | 12 | # define vector of sentences ("docs") 13 | text <- c("this is the first sentence", 14 | "this is a second sentence", 15 | "the third sentence is here") 16 | 17 | # convert sentences into a corpus 18 | corp 
<- Corpus(VectorSource(text)) 19 | 20 | # compute term frequency 21 | tdm <- TermDocumentMatrix(corp) 22 | inspect(tdm) 23 | 24 | ## Preprocessing the Text 25 | 26 | text <- c("this is the first sentence!!", 27 | "this is a second Sentence :)", 28 | "the third sentence, is here", 29 | "forth of all sentences") 30 | corp <- Corpus(VectorSource(text)) 31 | tdm <- TermDocumentMatrix(corp) 32 | inspect(tdm) 33 | 34 | ### Tokenization 35 | 36 | # tokenization 37 | corp <- tm_map(corp, stripWhitespace) 38 | corp <- tm_map(corp, removePunctuation) 39 | tdm <- TermDocumentMatrix(corp) 40 | inspect(tdm) 41 | 42 | ### Text Reduction 43 | 44 | stopwords('english') 45 | 46 | 47 | # stopwords 48 | library(SnowballC) 49 | corp <- tm_map(corp, removeWords, stopwords("english")) 50 | 51 | # stemming 52 | corp <- tm_map(corp, stemDocument) 53 | 54 | tdm <- TermDocumentMatrix(corp) 55 | inspect(tdm) 56 | 57 | ### Term Frequency--Inverse Document Frequency (TF-IDF) 58 | 59 | tfidf <- weightTfIdf(tdm) 60 | inspect(tfidf) 61 | 62 | ## Example: Online Discussions on Autos and Electronics 63 | ### Importing and Labeling the Records 64 | 65 | library(tm) 66 | # step 1: import and label records 67 | # read zip file into a corpus 68 | corp <- Corpus(ZipSource(mlba::AutosElectronics, recursive = T)) 69 | 70 | # create an array of records labels 71 | label <- c(rep(1, 1000), rep(0, 1000)) 72 | 73 | # step 2: text preprocessing 74 | # tokenization 75 | corp <- tm_map(corp, stripWhitespace) 76 | corp <- tm_map(corp, removePunctuation) 77 | corp <- tm_map(corp, removeNumbers) 78 | 79 | # stopwords 80 | corp <- tm_map(corp, removeWords, stopwords("english")) 81 | 82 | # stemming 83 | corp <- tm_map(corp, stemDocument) 84 | 85 | # step 3: TF-IDF and latent semantic analysis 86 | # compute TF-IDF 87 | tdm <- TermDocumentMatrix(corp) 88 | tfidf <- weightTfIdf(tdm) 89 | 90 | # extract (20) concepts 91 | library(lsa) 92 | lsa.tfidf <- lsa(tfidf, dim = 20) 93 | 94 | # convert to data frame 95 | words.df <- as.data.frame(as.matrix(lsa.tfidf$dk)) 96 | 97 | ### Fitting a Predictive Model 98 | 99 | library(caret) 100 | 101 | # prepare training and holdout sets 102 | set.seed(1) 103 | df <- cbind(label=factor(label), words.df) 104 | idx <- caret::createDataPartition(df$label, p=0.6, list=FALSE) 105 | train.df <- df[idx, ] 106 | holdout.df <- df[-idx, ] 107 | 108 | # fit logistic regression 109 | logit.reg <- train(label ~ ., data=train.df, 110 | trControl=trainControl(method="none"), 111 | method="glm", family="binomial") 112 | 113 | # compute accuracy on holdout set 114 | pred <- predict(logit.reg, newdata=holdout.df) 115 | confusionMatrix(pred, holdout.df$label) 116 | 117 | 118 | library(gains) 119 | 120 | prob <- predict(logit.reg, newdata=holdout.df, type="prob")[,2] 121 | actual <- ifelse(holdout.df$label == 1, 1, 0) 122 | gain <- gains(actual, prob) 123 | barplot(gain$mean.resp/mean(actual), names.arg=seq(10, 100, by=10), 124 | xlab="Percentile", ylab="Decile mean / global mean") 125 | 126 | 127 | pdf(file=file.path("..", "figures", "chapter_21", "decileLiftClassification.pdf"), 128 | width=6, height=4) 129 | barplot(gain$mean.resp/mean(actual), names.arg=seq(10, 100, by=10), 130 | xlab="Percentile", ylab="Decile mean / global mean") 131 | dev.off() 132 | 133 | 134 | ## Example: Sentiment Analysis of Movie Reviews 135 | ### Data Loading, Preparation, and Partitioning 136 | 137 | library(tidyverse) 138 | library(text2vec) 139 | 140 | # load and split data into training and holdout set 141 | data <- mlba::IMDBdataset10K %>% 142 
| mutate(
143 | id = row_number(),
144 | sentiment = as.factor(sentiment)
145 | )
146 |
147 | set.seed(1)
148 | trainIndex <- createDataPartition(data$sentiment, p=0.8, list=FALSE)
149 | data_train <- data[trainIndex, ]
150 | data_holdout <- data[-trainIndex, ]
151 |
152 |
153 | prep_fun <- tolower
154 | tok_fun <- word_tokenizer
155 |
156 | it_train <- itoken(data_train$review, ids=data_train$id,
157 | preprocessor=prep_fun, tokenizer=tok_fun)
158 | it_holdout <- itoken(data_holdout$review, ids=data_holdout$id,
159 | preprocessor=prep_fun, tokenizer=tok_fun)
160 |
161 | vocab <- create_vocabulary(it_train)
162 | vocab <- prune_vocabulary(vocab, term_count_min = 5L)
163 | vectorizer <- vocab_vectorizer(vocab)
164 | tcm_train <- create_tcm(it_train, vectorizer, skip_grams_window = 5L)
165 |
166 |
167 |
168 | ### Generating and Applying Word Embeddings
169 |
170 | # determine word vectors
171 | glove <- GlobalVectors$new(rank=100, x_max=10)
172 | wv_main <- glove$fit_transform(tcm_train, n_iter=10, convergence_tol=0.01, n_threads=8)
173 | wv_context <- glove$components
174 | word_vectors <- wv_main + t(wv_context)
175 |
176 |
177 |
178 |
179 | dtm_train <- create_dtm(it_train, vectorizer)
180 | common_terms <- intersect(colnames(dtm_train), rownames(word_vectors) )
181 | dtm_averaged <- normalize(dtm_train[, common_terms], "l1")
182 | sentence_vectors_train <- dtm_averaged %*% word_vectors[common_terms, ]
183 |
184 | dtm_holdout <- create_dtm(it_holdout, vectorizer)
185 | common_terms <- intersect(colnames(dtm_holdout), rownames(word_vectors) )
186 | dtm_averaged <- normalize(dtm_holdout[, common_terms], "l1")
187 | sentence_vectors_holdout <- dtm_averaged %*% word_vectors[common_terms, ]
188 |
189 | ### Fitting a Predictive Model
190 |
191 | train.df <- as.data.frame(as.matrix(sentence_vectors_train))
192 | train.df$sentiment <- data_train$sentiment
193 |
194 | trControl <- caret::trainControl(method="cv", number=5, allowParallel=TRUE)
195 | logit.reg <- caret::train(sentiment ~ ., data=train.df, trControl=trControl,
196 | # fit logistic regression with a generalized linear model
197 | method="glm", family="binomial")
198 |
199 | holdout.df <- as.data.frame(as.matrix(sentence_vectors_holdout))
200 | holdout.df$sentiment <- data_holdout$sentiment
201 |
202 | caret::confusionMatrix(predict(logit.reg, holdout.df), holdout.df$sentiment)
203 |
204 |
205 |
206 |
207 | library(ROCR)
208 | prob <- predict(logit.reg, newdata=holdout.df, type="prob")$positive
209 |
210 | predob <- prediction(prob, holdout.df$sentiment)
211 | perf <- performance(predob, "tpr", "fpr")
212 | perf.df <- data.frame(
213 | tpr=perf@x.values[[1]],
214 | fpr=perf@y.values[[1]]
215 | )
216 | ggplot(perf.df, aes(x=tpr, y=fpr)) +
217 | geom_line() +
218 | geom_segment(aes(x=0, y=0, xend=1, yend=1), color="grey", linetype="dashed") +
219 | labs(x="1 - Specificity", y="Sensitivity")
220 |
221 |
222 | ggsave(file=file.path("..", "figures", "chapter_21", "glove-ROC.pdf"),
223 | last_plot() + theme_bw())
224 |
-------------------------------------------------------------------------------- /R/chap22.R: --------------------------------------------------------------------------------
1 |
2 | if (!require(mlba)) {
3 | library(devtools)
4 | install_github("gedeck/mlba/mlba", force=TRUE)
5 | }
6 |
7 | # Responsible Data Science
8 | ## Example: Applying the RDS Framework to the COMPAS Example
9 | ### Data Issues
10 |
11 | library(caret)
12 | library(tidyverse)
13 | # load COMPAS data
14 | compas.df <- mlba::COMPAS_clean %>%
15 | select(-id) %>%
16 |
mutate( 17 | age_cat = factor(age_cat, levels=c("Less than 25", "25 - 45", "Greater than 45")), 18 | c_charge_degree = factor(c_charge_degree, levels=c("F", "M")), 19 | race = factor(race, levels=c("African-American", "Asian", "Caucasian", "Hispanic", 20 | "Native American", "Other")), 21 | sex = factor(sex, levels=c("Female", "Male")), 22 | two_year_recid = factor(two_year_recid, levels=c(0, 1), labels=c("No", "Yes")) 23 | ) 24 | 25 | # split dataset and train models 26 | set.seed(1) 27 | idx <- createDataPartition(compas.df$two_year_recid, p=0.7, list=FALSE) 28 | train.df <- compas.df[idx, ] 29 | valid.df <- compas.df[-idx, ] 30 | 31 | 32 | trControl <- trainControl(method="cv", number=5, allowParallel=TRUE) 33 | # logistic regression model 34 | logreg.model <- train(two_year_recid ~ . - race, data=train.df, 35 | method="glm", family="binomial", trControl=trControl) 36 | # random forest model 37 | rf.model <- train(two_year_recid ~ . - race, data=train.df, 38 | method="rf", trControl=trControl) 39 | 40 | # extract coefficients and calculate odds shown in table 41 | logreg.coef <- coef(logreg.model$finalModel) 42 | data.frame( 43 | coefficient=logreg.coef, 44 | odds=c(NA, exp(logreg.coef)[-c(1)]) 45 | ) %>% round(3) 46 | 47 | 48 | # calculation of model accuracy based on cross-validation results 49 | caret::confusionMatrix(logreg.model) 50 | caret::confusionMatrix(rf.model) 51 | 52 | ### Auditing the Model 53 | 54 | library(ROCR) 55 | holdoutMetrics <- function(df, model) { 56 | result <- data.frame(obs = df$two_year_recid, pred = predict(model, newdata=df), 57 | prob = predict(model, newdata=df, type="prob")$Yes) 58 | pred <- prediction(result$prob, result$obs) 59 | # compute overall performance 60 | perf_AUC <- performance(pred, "auc") 61 | AUC <- perf_AUC@y.values[[1]] 62 | cm <- confusionMatrix(result$pred, result$obs, positive="Yes") 63 | return (tibble(AUC=AUC, Accuracy = cm$overall["Accuracy"], 64 | FPR = 100*(1-cm$byClass["Specificity"]), 65 | FNR = 100*(1-cm$byClass["Sensitivity"]))) 66 | } 67 | # compute performance by race 68 | metricsByRace <- function(model) { 69 | metrics <- tibble() 70 | for (raceValue in levels(compas.df$race)) { 71 | df <- compas.df %>% filter(race==raceValue) 72 | metrics <- bind_rows(metrics, tibble(race=raceValue, holdoutMetrics(df, model))) 73 | } 74 | return (metrics) 75 | } 76 | # combine metrics for logistic and random forest 77 | metrics <- bind_rows( 78 | tibble(Model="Random forest", metricsByRace(rf.model)), 79 | tibble(Model="Logistic regression", metricsByRace(logreg.model)) 80 | ) %>% filter(! 
race %in% c("Asian", "Native American")) 81 | 82 | 83 | library(gridExtra) 84 | makeBarchart <- function(metrics, aesthetics) { 85 | g <- ggplot(metrics, aesthetics) + 86 | geom_bar(position="dodge", stat="identity") + 87 | geom_text(hjust=1.5, position=position_dodge(.9)) + 88 | coord_flip() + 89 | scale_x_discrete(limits=rev) + 90 | labs(x="Race") + 91 | theme_bw() 92 | return (g) 93 | } 94 | g1 <- makeBarchart(metrics, aes(x=race, y=Accuracy, fill=Model, label=round(Accuracy, 3))) + 95 | theme(legend.position="none") 96 | g2 <- makeBarchart(metrics, aes(x=race, y=AUC, fill=Model, label=round(AUC, 3))) + 97 | theme(legend.position="bottom") 98 | grid.arrange(g1, g2, nrow=2, heights=c(6.25, 7)) 99 | 100 | g <- arrangeGrob(g1, g2, heights=c(6.25, 7)) 101 | ggsave(file=file.path("..", "figures", "chapter_22", "c22_acc_auc.pdf"), 102 | g, width=5, height=5, units="in") 103 | 104 | g1 <- makeBarchart(metrics, aes(x=race, y=FPR, fill=Model, label=round(FPR, 3))) + 105 | theme(legend.position="none") 106 | g2 <- makeBarchart(metrics, aes(x=race, y=FNR, fill=Model, label=round(FNR, 3))) + 107 | theme(legend.position="bottom") 108 | grid.arrange(g1, g2, heights=c(6.25, 7)) 109 | 110 | g <- arrangeGrob(g1, g2, heights=c(6.25, 7)) 111 | ggsave(file=file.path("..", "figures", "chapter_22", "c22_fpr_fnr.pdf"), 112 | g, width=5, height=5, units="in") 113 | 114 | #### Interpretability Methods 115 | 116 | library(iml) 117 | predictor.rf = Predictor$new(rf.model, data=valid.df, y=valid.df$two_year_recid) 118 | predictor.lm = Predictor$new(logreg.model, data=valid.df, y=valid.df$two_year_recid) 119 | 120 | 121 | featureEffect.lm = FeatureEffect$new(predictor.lm, feature='priors_count', method='pdp') 122 | featureEffect.rf = FeatureEffect$new(predictor.rf, feature='priors_count', method='pdp') 123 | combined <- bind_rows( 124 | tibble(Method="Logistic regression", featureEffect.lm$results %>% filter(.class=="Yes")), 125 | tibble(Method="Random forest", featureEffect.rf$results %>% filter(.class=="Yes")) 126 | ) 127 | ggplot(combined, aes(x=priors_count, y=.value, color=Method)) + 128 | geom_line() + 129 | labs(x="Feature value", y="Probability of recidivism") 130 | 131 | 132 | ggsave(file=file.path("..", "figures", "chapter_22", "c22f005.pdf"), 133 | last_plot() + theme_bw(), width=5, height=3, units="in") 134 | 135 | 136 | library(iml) 137 | predictor.rf = Predictor$new(rf.model, data=valid.df, y=valid.df$two_year_recid) 138 | predictor.lm = Predictor$new(logreg.model, data=valid.df, y=valid.df$two_year_recid) 139 | 140 | # permutation feature importance 141 | FeatureImp$new(predictor.lm, "ce", compare="ratio", n.repetitions=5) 142 | FeatureImp$new(predictor.rf, "ce", compare="ratio", n.repetitions=5) 143 | -------------------------------------------------------------------------------- /R/chap23.R: -------------------------------------------------------------------------------- 1 | 2 | if (!require(mlba)) { 3 | library(devtools) 4 | install_github("gedeck/mlba/mlba", force=TRUE) 5 | } 6 | -------------------------------------------------------------------------------- /R/chap4.R: -------------------------------------------------------------------------------- 1 | 2 | if (!require(mlba)) { 3 | library(devtools) 4 | install_github("gedeck/mlba/mlba", force=TRUE) 5 | } 6 | 7 | options(dplyr.summarise.inform = FALSE) 8 | library(tidyverse) 9 | 10 | # Dimension Reduction 11 | ## Data Summaries 12 | ### Example 1: House Prices in Boston 13 | 14 | boston.housing.df <- mlba::BostonHousing 15 | 
head(boston.housing.df, 9) 16 | summary(boston.housing.df) 17 | 18 | # compute mean, standard dev., min, max, median, length, and missing values of CRIM 19 | mean(boston.housing.df$CRIM) 20 | sd(boston.housing.df$CRIM) 21 | min(boston.housing.df$CRIM) 22 | max(boston.housing.df$CRIM) 23 | median(boston.housing.df$CRIM) 24 | length(boston.housing.df$CRIM) 25 | 26 | # find the number of missing values of variable CRIM 27 | sum(is.na(boston.housing.df$CRIM)) 28 | 29 | # compute mean, standard dev., min, max, median, length, and missing values for all 30 | # variables 31 | data.frame(mean=sapply(boston.housing.df, mean), 32 | sd=sapply(boston.housing.df, sd), 33 | min=sapply(boston.housing.df, min), 34 | max=sapply(boston.housing.df, max), 35 | median=sapply(boston.housing.df, median), 36 | length=sapply(boston.housing.df, length), 37 | miss.val=sapply(boston.housing.df, 38 | function(x) sum(length(which(is.na(x)))))) 39 | 40 | 41 | 42 | ### Summary Statistics 43 | 44 | round(cor(boston.housing.df),2) 45 | 46 | ### Aggregation and Pivot Tables 47 | 48 | boston.housing.df <- mlba::BostonHousing 49 | table(boston.housing.df$CHAS) 50 | 51 | # tidyverse version 52 | boston.housing.df %>% count(CHAS) 53 | 54 | 55 | # create bins of size 1 56 | boston.housing.df <- boston.housing.df %>% 57 | mutate(RM.bin = cut(RM, c(1:9), labels=FALSE)) 58 | 59 | # compute the average of MEDV by (binned) RM and CHAS 60 | # in aggregate() use the argument by= to define the list of aggregating variables, 61 | # and FUN= as an aggregating function. 62 | aggregate(boston.housing.df$MEDV, by=list(RM=boston.housing.df$RM.bin, 63 | CHAS=boston.housing.df$CHAS), FUN=mean) 64 | 65 | # tidyverse version 66 | boston.housing.df %>% 67 | group_by(RM.bin, CHAS) %>% 68 | summarise(mean(MEDV)) 69 | 70 | 71 | # use install.packages("reshape") the first time the package is used 72 | library(reshape) 73 | boston.housing.df <- mlba::BostonHousing 74 | # create bins of size 1 75 | boston.housing.df <- boston.housing.df %>% 76 | mutate(RM.bin = cut(RM, c(1:9), labels=FALSE)) 77 | 78 | # use melt() to stack a set of columns into a single column of data. 
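# cast() further below then spreads the stacked values back into a wide
# pivot table, here the mean of MEDV for each combination of RM.bin and CHAS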
79 | # stack MEDV values for each combination of (binned) RM and CHAS 80 | mlt <- melt(boston.housing.df, id=c("RM.bin", "CHAS"), measure=c("MEDV")) 81 | head(mlt, 5) 82 | 83 | # use cast() to reshape data and generate pivot table 84 | cast(mlt, RM.bin ~ CHAS, subset=variable=="MEDV", 85 | margins=c("grand_row", "grand_col"), mean) 86 | 87 | # tidyverse version 88 | boston.housing.df %>% 89 | group_by(RM.bin, CHAS) %>% 90 | summarize(mean=mean(MEDV)) %>% 91 | spread(CHAS, mean) 92 | 93 | ## Reducing the Number of Categories in Categorical Variables 94 | 95 | boston.housing.df <- mlba::BostonHousing 96 | 97 | tbl <- table(boston.housing.df$CAT.MEDV, boston.housing.df$ZN) 98 | prop.tbl <- prop.table(tbl, margin=2) 99 | barplot(prop.tbl, xlab="ZN", ylab="", yaxt="n",main="Distribution of CAT.MEDV by ZN") 100 | axis(2, at=(seq(0,1, 0.2)), paste(seq(0,100,20), "%")) 101 | 102 | library(tidyverse) 103 | df <- data.frame(prop.tbl) 104 | ggplot(df, aes(x=Var2, y=Freq, group=Var1, fill=Var1)) + 105 | geom_bar(stat="identity", color="grey", width=1) + 106 | scale_y_continuous(labels = scales::percent, expand=expansion()) + 107 | scale_fill_manual("CAT.MEDV", values=c("0"="#eeeeee", "1"="darkgrey")) + 108 | labs(x="ZN", y="", title="Distribution of CAT.MEDV by ZN") 109 | 110 | 111 | g <- last_plot() + theme_bw() 112 | ggsave(file=file.path("..", "figures", "chapter_04", "reduction-pivot-bar.pdf"), 113 | g, width=9, height=4, units="in") 114 | 115 | 116 | library(forecast) 117 | tru.data <- mlba::ToysRUsRevenues 118 | tru.ts <- ts(tru.data[, 3], start = c(1992, 1), end = c(1995, 4), freq = 4) 119 | autoplot(tru.ts) + 120 | geom_point(size=0.5) + 121 | labs(x="Time", y="Revenue ($ millions)") + 122 | theme_bw() 123 | ggsave(file=file.path("..", "figures", "chapter_04", "ToysRUs.pdf"), 124 | last_plot()) 125 | 126 | ## Principal Components Analysis 127 | ### Example 2: Breakfast Cereals 128 | 129 | library(tidyverse) 130 | cereals.df <- mlba::Cereals %>% select(calories, rating) 131 | # compute PCs on two dimensions 132 | pcs <- prcomp(cereals.df %>% select(calories, rating)) 133 | summary(pcs) 134 | pcs$rot 135 | scores <- pcs$x 136 | head(scores, 5) 137 | 138 | 139 | getPCaxis <- function(f, pcs, pcLabel) { 140 | return (data.frame( 141 | rbind(pcs$center + f * pcs$rotation[, pcLabel], 142 | pcs$center - f * pcs$rotation[, pcLabel])) 143 | ) 144 | } 145 | PC1 <- getPCaxis(90, pcs, "PC1") 146 | PC2 <- getPCaxis(50, pcs, "PC2") 147 | ggplot(cereals.df, aes(x=calories, y=rating)) + 148 | geom_point() + 149 | geom_line(data=PC1) + 150 | geom_line(data=PC2) + 151 | coord_cartesian(xlim=c(0, 200), ylim=c(0, 110)) + 152 | labs(x="Calories", y="Rating") + 153 | annotate(geom="text", x=30, y=80, label="z[1]",parse=TRUE) + 154 | annotate(geom="text", x=120, y=80, label="z[2]",parse=TRUE) + 155 | theme_bw() 156 | 157 | ggsave(file=file.path("..", "figures", "chapter_04", "pca_subset.pdf"), 158 | width=5, height=3, last_plot()) 159 | 160 | ### Principal Components 161 | 162 | # load and preprocess the data 163 | cereals.df <- mlba::Cereals %>% 164 | column_to_rownames("name") %>% 165 | select(-c(mfr, type)) %>% 166 | drop_na() 167 | 168 | pcs <- prcomp(cereals.df) 169 | summary(pcs) 170 | pcs$rotation[,1:5] 171 | 172 | ### Normalizing the Data 173 | 174 | # Use function prcomp() with scale. = T to run PCA on normalized data 175 | pcs.cor <- prcomp(cereals.df, scale. 
= T) 176 | 177 | summary(pcs.cor) 178 | pcs.cor$rotation[,1:5] 179 | 180 | 181 | library(ggrepel) 182 | ggplot(data.frame(pcs.cor$x), aes(x=PC1, y=PC2, label=rownames(pcs.cor$x))) + 183 | geom_point(shape=21) + 184 | geom_text_repel(size=2, max.overlaps=7) + 185 | theme_bw() 186 | 187 | f <- 1.3 188 | ggsave(file=file.path("..", "figures", "chapter_04", "pca_full.pdf"), 189 | width=f * 4, height=f * 5, last_plot()) 190 | 191 | ## Dimension Reduction Using Classification and Regression Trees 192 | ### Using Principal Components for Classification and Prediction 193 | 194 | wine.df <- mlba::Wine %>% select(-Type) 195 | pcs.cor <- prcomp(wine.df) 196 | summary(pcs.cor) 197 | pcs.cor$rotation[,1:4] 198 | -------------------------------------------------------------------------------- /R/chap6.R: -------------------------------------------------------------------------------- 1 | 2 | if (!require(mlba)) { 3 | library(devtools) 4 | install_github("gedeck/mlba/mlba", force=TRUE) 5 | } 6 | options(scipen=999, digits = 3) 7 | 8 | # Multiple Linear Regression 9 | ## Estimating the Regression Equation and Prediction 10 | ### Example: Predicting the Price of Used Toyota Corolla Cars 11 | 12 | library(caret) 13 | car.df <- mlba::ToyotaCorolla 14 | # select variables for regression 15 | outcome <- "Price" 16 | predictors <- c("Age_08_04", "KM", "Fuel_Type", "HP", "Met_Color", "Automatic", 17 | "CC", "Doors", "Quarterly_Tax", "Weight") 18 | # reduce data set to first 1000 rows and selected variables 19 | car.df <- car.df[1:1000, c(outcome, predictors)] 20 | 21 | # partition data 22 | set.seed(1) # set seed for reproducing the partition 23 | idx <- createDataPartition(car.df$Price, p=0.6, list=FALSE) 24 | train.df <- car.df[idx, ] 25 | holdout.df <- car.df[-idx, ] 26 | 27 | # use lm() to run a linear regression of Price on all 11 predictors in the 28 | # training set. 29 | # use . after ~ to include all the remaining columns in train.df as predictors. 30 | car.lm <- lm(Price ~ ., data = train.df) 31 | # use options() to ensure numbers are not displayed in scientific notation. 32 | options(scipen = 999) 33 | summary(car.lm) 34 | 35 | 36 | 37 | 38 | # use predict() to make predictions on a new set. 
39 | pred <- predict(car.lm, holdout.df) 40 | 41 | options(scipen=999, digits=0) 42 | data.frame( 43 | 'Predicted' = pred[1:20], 44 | 'Actual' = holdout.df$Price[1:20], 45 | 'Residual' = holdout.df$Price[1:20] - pred[1:20] 46 | ) 47 | options(scipen=999, digits = 3) 48 | 49 | # calculate performance metrics 50 | rbind( 51 | Training=mlba::regressionSummary(predict(car.lm, train.df), train.df$Price), 52 | Holdout=mlba::regressionSummary(pred, holdout.df$Price) 53 | ) 54 | 55 | 56 | library(ggplot2) 57 | pred <- predict(car.lm, holdout.df) 58 | all.residuals <- holdout.df$Price - pred 59 | 60 | ggplot() + 61 | geom_histogram(aes(x=all.residuals), fill="lightgray", color="grey") + 62 | labs(x="Residuals", y="Frequency") 63 | 64 | 65 | g <- ggplot() + 66 | geom_histogram(aes(x=all.residuals), fill="lightgray", color="grey") + 67 | labs(x="Residuals", y="Frequency") + 68 | theme_bw() 69 | ggsave(file=file.path("..", "figures", "chapter_06", "residuals-histogram.pdf"), 70 | g, width=5, height=3, units="in") 71 | 72 | ### Cross-validation and caret 73 | 74 | set.seed(1) 75 | library(caret) 76 | # define 5-fold 77 | trControl <- caret::trainControl(method="cv", number=5, allowParallel=TRUE) 78 | model <- caret::train(Price ~ ., data=car.df, 79 | method="lm", # specify the model 80 | trControl=trControl) 81 | model 82 | coef(model$finalModel) 83 | 84 | 85 | library(tidyverse) 86 | collectMetrics <- function(model, train.df, holdout.df, nPredictors) { 87 | if (missing(nPredictors)) { 88 | coefs = coef(model$finalModel) 89 | nPredictors = length(coefs) - 1 90 | } 91 | return (cbind( 92 | CV=model$results %>% slice_min(RMSE) %>% dplyr::select(c(RMSE, MAE)), 93 | Training=mlba::regressionSummary(predict(model, train.df), train.df$Price), 94 | Holdout=mlba::regressionSummary(predict(model, holdout.df), holdout.df$Price), 95 | nPredictors=nPredictors 96 | )) 97 | } 98 | 99 | metric.full <- collectMetrics(model, train.df, holdout.df) 100 | 101 | 102 | predict(model, car.df[1:3,]) 103 | 104 | 105 | 106 | ## Variable Selection in Linear Regression 107 | ### How to Reduce the Number of Predictors 108 | #### Exhaustive Search 109 | 110 | # use regsubsets() in package leaps to run an exhaustive search. 111 | # unlike with lm, categorical predictors must be turned into dummies manually. 
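# for illustration: dummy_cols() replaces each factor with 0/1 indicator columns;
# assuming Fuel_Type has the levels CNG, Diesel, and Petrol, it is replaced by
# Fuel_Type_Diesel and Fuel_Type_Petrol (remove_first_dummy=TRUE drops the first
# level, which then serves as the reference category)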
112 | library(leaps) 113 | library(fastDummies) 114 | 115 | # create dummies for fuel type 116 | leaps.train.df <- dummy_cols(train.df, remove_first_dummy=TRUE, 117 | remove_selected_columns=TRUE) 118 | search <- regsubsets(Price ~ ., data=leaps.train.df, nbest=1, 119 | nvmax=ncol(leaps.train.df), method="exhaustive") 120 | sum <- summary(search) 121 | 122 | # show models 123 | sum$which 124 | 125 | # show metrics 126 | sum$rsq 127 | sum$adjr2 128 | sum$cp 129 | 130 | 131 | optimal <- which.min(sum$cp) 132 | 133 | # determine the variable names for the optimal model 134 | X <- summary(search)$which[, -1] # information about included predictors 135 | xvars <- dimnames(X)[[2]] ## column names (all covariates except intercept) 136 | xvars <- xvars[X[optimal,]] 137 | 138 | # the optimal model contains all dummy variables of Fuel_Type 139 | xvars <- c("Age_08_04", "KM", "HP", "Quarterly_Tax", "Weight", "Fuel_Type") 140 | 141 | # rebuild model for best predictor set 142 | set.seed(1) 143 | trControl <- caret::trainControl(method="cv", number=5, allowParallel=TRUE) 144 | model <- caret::train(Price ~ ., data=car.df[, c("Price", xvars)], 145 | method="lm", # specify the model 146 | trControl=trControl) 147 | model 148 | coef(model$finalModel) 149 | 150 | metric.exhaustive <- collectMetrics(model, train.df, holdout.df) 151 | 152 | #### Popular Subset Selection Algorithms 153 | 154 | # as model performance is estimated using AIC, we don't need to use cross-validation 155 | trControl <- caret::trainControl(method="none") 156 | model <- caret::train(Price ~ ., data=train.df, trControl=trControl, 157 | # select backward elmination 158 | method="glmStepAIC", direction='backward') 159 | 160 | coef(model$finalModel) 161 | 162 | 163 | model <- caret::train(Price ~ ., data=train.df, trControl=trControl, 164 | method="glmStepAIC", direction='forward') 165 | 166 | coef(model$finalModel) 167 | 168 | 169 | model <- caret::train(Price ~ ., data=train.df, trControl=trControl, 170 | method="glmStepAIC", direction='both') 171 | 172 | coef(model$finalModel) 173 | 174 | 175 | rbind(Training=mlba::regressionSummary(predict(model, train.df), train.df$Price), 176 | Holdout=mlba::regressionSummary(predict(model, holdout.df), holdout.df$Price)) 177 | 178 | 179 | 180 | # The models are identical to the best model obtained from the exhaustive search. 181 | # We therefore duplicate the metrics. 
182 | metric.stepwise <- metric.exhaustive 183 | 184 | ## Regularization (Shrinkage Models) 185 | 186 | set.seed(1) 187 | library(caret) 188 | trControl <- caret::trainControl(method='cv', number=5, allowParallel=TRUE) 189 | tuneGrid <- expand.grid(lambda=10^seq(5, 2, by=-0.1), alpha=0) 190 | model <- caret::train(Price ~ ., data=train.df, 191 | method='glmnet', 192 | family='gaussian', # set the family for linear regression 193 | trControl=trControl, 194 | tuneGrid=tuneGrid) 195 | model$bestTune 196 | coef(model$finalModel, s=model$bestTune$lambda) 197 | 198 | 199 | metric.ridge <- collectMetrics(model, train.df, holdout.df, 200 | length(coef(model$finalModel, s=model$bestTune$lambda)) - 1) 201 | ridge.model <- model 202 | 203 | 204 | rbind( 205 | Training=mlba::regressionSummary(predict(model, train.df), train.df$Price), 206 | Holdout=mlba::regressionSummary(predict(model, holdout.df), holdout.df$Price) 207 | ) 208 | 209 | 210 | set.seed(1) 211 | tuneGrid <- expand.grid(lambda=10^seq(4, 0, by=-0.1), alpha=1) 212 | model <- caret::train(Price ~ ., data=train.df, 213 | method='glmnet', 214 | family='gaussian', # set the family for linear regression 215 | trControl=trControl, 216 | tuneGrid=tuneGrid) 217 | model$bestTune 218 | coef(model$finalModel, s=model$bestTune$lambda) 219 | 220 | 221 | lasso.model <- model 222 | metric.lasso <- collectMetrics(lasso.model, train.df, holdout.df, 223 | sum(coef(lasso.model$finalModel, s=lasso.model$bestTune$lambda) != 0) - 1) 224 | 225 | 226 | rbind( 227 | Training=mlba::regressionSummary(predict(model, train.df), train.df$Price), 228 | Holdout=mlba::regressionSummary(predict(model, holdout.df), holdout.df$Price) 229 | ) 230 | 231 | 232 | library(tidyverse) 233 | library(gridExtra) 234 | g1 <- ggplot(ridge.model$results, aes(x=lambda, y=RMSE)) + 235 | geom_pointrange(aes(ymin=RMSE-RMSESD, ymax=RMSE+RMSESD), color='grey') + 236 | geom_line() + 237 | geom_point(data=ridge.model$results %>% subset(RMSE == min(RMSE)), color='red') + 238 | labs(x=expression(paste('Ridge parameter ', lambda)), 239 | y='RMSE (cross-validation)') + 240 | scale_x_log10() 241 | g2 <- ggplot(lasso.model$results, aes(x=lambda, y=RMSE)) + 242 | geom_pointrange(aes(ymin=RMSE-RMSESD, ymax=RMSE+RMSESD), color='grey') + 243 | geom_line() + 244 | geom_point(data=lasso.model$results %>% subset(RMSE == min(RMSE)), color='red') + 245 | labs(x=expression(paste('Lasso parameter ', lambda)), 246 | y='RMSE (cross-validation)') + 247 | scale_x_log10() 248 | grid.arrange(g1, g2, ncol=2) 249 | 250 | g <- arrangeGrob(g1 + theme_bw(), g2 + theme_bw(), ncol=2) 251 | ggsave(file=file.path('..', 'figures', 'chapter_06', 'shrinkage-parameter-tuning.pdf'), 252 | g, width=6, height=2.5, units='in') 253 | 254 | 255 | data.frame(rbind( 256 | 'full'= metric.full, 257 | 'exhaustive' = metric.exhaustive, 258 | 'stepwise' = metric.stepwise, 259 | 'ridge' = metric.ridge, 260 | 'lasso' = metric.lasso 261 | )) 262 | -------------------------------------------------------------------------------- /R/chap7.R: -------------------------------------------------------------------------------- 1 | 2 | if (!require(mlba)) { 3 | library(devtools) 4 | install_github("gedeck/mlba/mlba", force=TRUE) 5 | } 6 | 7 | # $k$-Nearest Neighbors ($k$ 8 | ## The $k$-NN Classifier (Categorical Outcome) 9 | ### Example: Riding Mowers 10 | 11 | library(ggrepel) 12 | mowers.df <- mlba::RidingMowers 13 | set.seed(35) 14 | 15 | idx <- sample(nrow(mowers.df), 0.6*nrow(mowers.df)) 16 | train.df <- mowers.df[idx, ] 17 | holdout.df <- 
mowers.df[-idx, ] 18 | ## new household 19 | new.df <- data.frame(Income = 60, Lot_Size = 20) 20 | 21 | ggplot(mapping=aes(x=Income, y=Lot_Size, shape=Ownership, color=Ownership)) + 22 | geom_point(data=train.df) + 23 | geom_text_repel(aes(label=rownames(train.df)), data=train.df, show.legend = FALSE) + 24 | geom_point(data=cbind(new.df, Ownership='New')) 25 | 26 | 27 | g <- ggplot(mapping=aes(x=Income, y=Lot_Size, shape=Ownership, color=Ownership, fill=Ownership)) + 28 | geom_point(data=train.df, size=4) + 29 | geom_text_repel(aes(label=rownames(train.df)), data=train.df, show.legend = FALSE) + 30 | geom_point(data=cbind(new.df, Ownership='New'), size=5) + 31 | scale_shape_manual(values = c(18, 15, 21)) + 32 | scale_color_manual(values = c('black', 'darkorange', 'steelblue')) + 33 | scale_fill_manual(values = c('black', 'darkorange', 'lightblue')) 34 | 35 | g 36 | 37 | ggsave(file=file.path("..", "figures", "chapter_07", "knn-riding-mower.pdf"), 38 | g + theme_bw(), width=6, height=4, units="in") 39 | 40 | 41 | library(caret) 42 | # train k-NN model with k=3 43 | model <- train(Ownership ~ ., data=train.df, 44 | method="knn", # specify the model 45 | preProcess=c("center", "scale"), # normalize data 46 | tuneGrid=expand.grid(k=3), 47 | trControl=trainControl(method="none")) 48 | model 49 | 50 | # predict new data point 51 | predict(model, new.df) 52 | 53 | # determine nearest neighbors to new data point 54 | train.norm.df <- predict(model$preProcess, train.df) 55 | new.norm.df <- predict(model$preProcess, new.df) 56 | distances <- apply(train.norm.df[, 1:2], 1, 57 | function(d){ sqrt(sum((d - new.norm.df)^2)) }) 58 | rownames(train.df)[order(distances)][1:3] 59 | 60 | ### Choosing $k$ 61 | 62 | # use leave-one-out cross-validation for small dataset 63 | trControl <- trainControl(method="loocv", number=5, allowParallel=TRUE) 64 | model <- train(Ownership ~ ., data=train.df, 65 | method="knn", 66 | preProcess=c("center", "scale"), 67 | tuneGrid=expand.grid(k=seq(1, 13, 2)), 68 | trControl=trControl) 69 | model 70 | 71 | 72 | model <- train(Ownership ~ ., data=mowers.df, 73 | method="knn", 74 | preProcess=c("center", "scale"), 75 | tuneGrid=expand.grid(k=7), 76 | trControl=trainControl(method="none")) 77 | predict(model, new.df) 78 | 79 | ### Setting the Cutoff Value 80 | 81 | train.norm.df <- predict(model$preProcess, train.df) 82 | new.norm.df <- predict(model$preProcess, new.df) 83 | distances <- apply(train.norm.df[, 1:2], 1, 84 | function(d){ sqrt(sum((d - new.norm.df)^2)) }) 85 | train.df[order(distances)[1:8],] 86 | -------------------------------------------------------------------------------- /R/chap8.R: -------------------------------------------------------------------------------- 1 | 2 | if (!require(mlba)) { 3 | library(devtools) 4 | install_github("gedeck/mlba/mlba", force=TRUE) 5 | } 6 | options(scipen=999, digits = 3) 7 | 8 | # The Naive Bayes Classifier 9 | ## Solution: Naive Bayes 10 | ### Example 3: Predicting Delayed Flights 11 | 12 | library(tidyverse) 13 | library(caret) 14 | library(e1071) 15 | # load and preprocess dataset 16 | delays.df <- mlba::FlightDelays %>% 17 | mutate( 18 | # change numerical variables to categorical 19 | DAY_WEEK = factor(DAY_WEEK), 20 | ORIGIN = factor(ORIGIN), 21 | DEST = factor(DEST), 22 | CARRIER = factor(CARRIER), 23 | Flight.Status = factor(Flight.Status), 24 | # create hourly bins for departure time 25 | CRS_DEP_TIME = factor(round(CRS_DEP_TIME / 100)) 26 | ) %>% 27 | select(DAY_WEEK, CRS_DEP_TIME, ORIGIN, DEST, CARRIER, 
Flight.Status) 28 | 29 | # create training and holdout sets 30 | set.seed(1) 31 | idx <- createDataPartition(delays.df$Flight.Status, p=0.6, list=FALSE) 32 | train.df <- delays.df[idx, ] 33 | holdout.df <- delays.df[-idx, ] 34 | 35 | # run naive bayes 36 | delays.nb <- naiveBayes(Flight.Status ~ ., data = train.df) 37 | delays.nb 38 | 39 | 40 | 41 | 42 | # use prop.table() with margin = 1 to convert a count table to a proportions table, 43 | # where each row sums up to 1 (use margin = 2 for column sums) 44 | prop.table(table(train.df$Flight.Status, train.df$DEST), margin = 1) 45 | 46 | 47 | ## predict probabilities 48 | pred.prob <- predict(delays.nb, newdata=holdout.df, type="raw") 49 | ## predict class membership 50 | pred.class <- predict(delays.nb, newdata=holdout.df) 51 | 52 | df <- data.frame(actual=holdout.df$Flight.Status, predicted=pred.class, pred.prob) 53 | 54 | df[holdout.df$CARRIER == "DL" & holdout.df$DAY_WEEK == 7 & holdout.df$CRS_DEP_TIME == 10 & 55 | holdout.df$DEST == "LGA" & holdout.df$ORIGIN == "DCA",] 56 | 57 | 58 | # training 59 | confusionMatrix(predict(delays.nb, newdata=train.df), train.df$Flight.Status) 60 | 61 | # holdout 62 | confusionMatrix(predict(delays.nb, newdata=holdout.df), holdout.df$Flight.Status) 63 | 64 | 65 | library(gains) 66 | actual <- ifelse(holdout.df$Flight.Status == "delayed", 1, 0) 67 | gain <- gains(actual, pred.prob[,"delayed"], groups=length(actual) - 2) 68 | 69 | nactual <-sum(actual) 70 | ggplot() + 71 | geom_line(aes(x=gain$cume.obs, y=gain$cume.pct.of.total * nactual)) + 72 | geom_line(aes(x=c(0, max(gain$cume.obs)), y=c(0, nactual)), color="darkgrey") + 73 | labs(x="# Cases", y="Cumulative") 74 | 75 | 76 | ggsave(file=file.path("..", "figures", "chapter_08", "Flights-NB-gain.pdf"), 77 | width=3, height=3, 78 | last_plot() + theme_bw()) 79 | 80 | ### Working with Continuous Predictors 81 | 82 | (p_delayed = dnorm(213, mean=211.36215, sd=15.31)) 83 | (p_ontime = dnorm(213, mean=211.99436, sd=12.79)) 84 | 85 | p_ontime * 0.805 86 | p_delayed * 0.195 87 | -------------------------------------------------------------------------------- /R/chap9.R: -------------------------------------------------------------------------------- 1 | 2 | if (!require(mlba)) { 3 | library(devtools) 4 | install_github("gedeck/mlba/mlba", force=TRUE) 5 | } 6 | options(scipen=999) 7 | 8 | # Classification and Regression Trees 9 | ## Classification Trees 10 | ### Example 1: Riding Mowers 11 | 12 | library(rpart) 13 | library(rpart.plot) 14 | mowers.df <- mlba::RidingMowers 15 | class.tree <- rpart(Ownership ~ ., data = mowers.df, 16 | control = rpart.control(minsplit = 0), 17 | method = "class") 18 | rpart.rules(class.tree) 19 | 20 | plot_common_styling <- function(g, filename) { 21 | g <- g + 22 | geom_point(size=2) + 23 | scale_color_manual(values=c("darkorange", "steelblue")) + 24 | scale_fill_manual(values=c("darkorange", "lightblue")) + 25 | labs(x="Income ($000s)", y="Lot size (000s sqft)") + 26 | theme_bw() + 27 | theme(legend.position=c(0.89, 0.91), 28 | legend.title=element_blank(), 29 | legend.key=element_blank(), 30 | legend.background=element_blank()) 31 | ggsave(file=file.path("..", "figures", "chapter_09", filename), 32 | g, width=5, height=3, units="in") 33 | return(g) 34 | } 35 | g <- ggplot(mowers.df, mapping=aes(x=Income, y=Lot_Size, color=Ownership, fill=Ownership)) 36 | plot_common_styling(g, "mowers_tree_0.pdf") 37 | g <- g + geom_vline(xintercept=59.7) 38 | plot_common_styling(g, "mowers_tree_1.pdf") 39 | g <- g + geom_segment(x=59.9, y=21, 
xend=25, yend=21, color="black")
40 | plot_common_styling(g, "mowers_tree_2.pdf")
41 | g <- g + geom_segment(x=59.9, y=19.8, xend=120, yend=19.8, color="black")
42 | plot_common_styling(g, "mowers_tree_3.pdf")
43 | g <- g + geom_segment(x=84.75, y=19.8, xend=84.75, yend=10, color="black")
44 | plot_common_styling(g, "mowers_tree_4.pdf")
45 | g <- g + geom_segment(x=61.5, y=19.8, xend=61.5, yend=10, color="black")
46 | plot_common_styling(g, "mowers_tree_5.pdf")
47 |
48 | ### Measures of Impurity
49 | #### Normalization
50 |
51 | ggplot() +
52 | scale_x_continuous(limits=c(0,1)) +
53 | geom_hline(yintercept=0.5, linetype=2, color="grey") +
54 | geom_hline(yintercept=1, linetype=2, color="grey") +
55 | geom_function(aes(color="Entropy measure"),
56 | fun = function(x) {- x*log2(x) - (1-x)*log2(1-x)},
57 | xlim=c(0.0001, 0.9999), n=100) +
58 | geom_function(aes(color="Gini index"),
59 | fun = function(x) {1 - x^2 - (1-x)^2}) +
60 | labs(y="Impurity measure", x=expression(~italic(p)[1]), color="Impurity measure") +
61 | scale_color_manual(values=c("Entropy measure"="darkorange", "Gini index"="steelblue"))
62 |
63 | ggsave(file=file.path("..", "figures", "chapter_09", "gini_entropy.pdf"),
64 | last_plot() + theme_bw(), width=5, height=2.5, units="in")
65 |
66 |
67 |
68 | library(rpart)
69 | library(rpart.plot)
70 | mowers.df <- mlba::RidingMowers
71 |
72 | # use rpart() to run a classification tree.
73 | # define rpart.control() in rpart() to determine the depth of the tree.
74 | class.tree <- rpart(Ownership ~ ., data = mowers.df,
75 | control=rpart.control(maxdepth=2), method="class")
76 | ## plot tree
77 | # use rpart.plot() to plot the tree. You can control plotting parameters such
78 | # as color, shape, and information displayed (which and where).
79 | rpart.plot(class.tree, extra=1, fallen.leaves=FALSE)
80 |
81 |
82 | pdf(file.path("..", "figures", "chapter_09", "CT-mowerTree1.pdf"), width=3, height=3)
83 | rpart.plot(class.tree, extra=1, fallen.leaves=FALSE)
84 | dev.off()
85 |
86 | class.tree <- rpart(Ownership ~ ., data = mowers.df,
87 | control=rpart.control(minsplit=1), method="class")
88 | rpart.plot(class.tree, extra=1, fallen.leaves=FALSE)
89 | pdf(file.path("..", "figures", "chapter_09", "CT-mowerTree3.pdf"), width=5, height=5)
90 | rpart.plot(class.tree, extra=1, fallen.leaves=FALSE)
91 | dev.off()
92 |
93 |
94 | class.tree
95 |
96 | ## Evaluating the Performance of a Classification Tree
97 | ### Example 2: Acceptance of Personal Loan
98 |
99 | library(tidyverse)
100 | library(caret)
101 |
102 | # Load and preprocess data
103 | bank.df <- mlba::UniversalBank %>%
104 | # Drop ID and zip code columns.
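# (ID is only a record identifier and ZIP.Code has too many distinct values to be
# useful as a categorical predictor, so keeping them would only add noise)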
105 | select(-c(ID, ZIP.Code)) %>%
106 | # convert Personal.Loan to a factor with labels Yes and No
107 | mutate(Personal.Loan = factor(Personal.Loan, levels=c(0, 1), labels=c("No", "Yes")),
108 | Education = factor(Education, levels=c(1, 2, 3), labels=c("UG", "Grad", "Prof")))
109 |
110 | # partition
111 | set.seed(1)
112 | idx <- createDataPartition(bank.df$Personal.Loan, p=0.6, list=FALSE)
113 | train.df <- bank.df[idx, ]
114 | holdout.df <- bank.df[-idx, ]
115 |
116 | # classification tree
117 | default.ct <- rpart(Personal.Loan ~ ., data=train.df, method="class")
118 | # plot tree
119 | rpart.plot(default.ct, extra=1, fallen.leaves=FALSE)
120 |
121 |
122 | pdf(file.path("..", "figures", "chapter_09", "CT-universalTree1.pdf"), width=5, height=5)
123 | rpart.plot(default.ct, extra=1, fallen.leaves=FALSE)
124 | dev.off()
125 |
126 |
127 | deeper.ct <- rpart(Personal.Loan ~ ., data=train.df, method="class", cp=0, minsplit=1)
128 | # count number of leaves
129 | sum(deeper.ct$frame$var == "<leaf>")
130 | # plot tree
131 | rpart.plot(deeper.ct, extra=1, fallen.leaves=FALSE)
132 |
133 |
134 | pdf(file.path("..", "figures", "chapter_09", "CT-universalTree2.pdf"), width=5, height=2.5)
135 | rpart.plot(deeper.ct, extra=1, fallen.leaves=FALSE)
136 | dev.off()
137 |
138 |
139 | # classify records in the holdout data.
140 | # set argument type = "class" in predict() to generate predicted class membership.
141 | default.ct.point.pred.train <- predict(default.ct,train.df,type = "class")
142 | # generate confusion matrix for training data
143 | confusionMatrix(default.ct.point.pred.train, train.df$Personal.Loan)
144 | ### repeat the code for the holdout set, and the deeper tree
145 |
146 |
147 | default.ct.point.pred.holdout <- predict(default.ct,holdout.df,type = "class")
148 | confusionMatrix(default.ct.point.pred.holdout, holdout.df$Personal.Loan)
149 |
150 | deeper.ct.point.pred.train <- predict(deeper.ct,train.df,type = "class")
151 | confusionMatrix(deeper.ct.point.pred.train, train.df$Personal.Loan)
152 |
153 | deeper.ct.point.pred.holdout <- predict(deeper.ct,holdout.df,type = "class")
154 | confusionMatrix(deeper.ct.point.pred.holdout, holdout.df$Personal.Loan)
155 |
156 | ## Avoiding Overfitting
157 | ### Stopping Tree Growth
158 | #### Stopping Tree Growth: Grid Search for Parameter Tuning
159 |
160 | set.seed(1)
161 | trControl <- trainControl(method="cv", number=5, allowParallel=TRUE)
162 | model1 <- train(Personal.Loan ~ ., data=train.df,
163 | method="rpart", trControl=trControl,
164 | tuneGrid=data.frame(cp=c(1, 0.1, 0.01, 0.001, 0.0001)))
165 | model1$results
166 | # focus grid search around cp=0.001
167 | model2 <- train(Personal.Loan ~ ., data=train.df,
168 | method="rpart", trControl=trControl,
169 | tuneGrid=data.frame(cp=c(0.005, 0.002, 0.001, 0.0005, 0.0002)))
170 | model2$results
171 |
172 | ### Pruning the Tree
173 | #### Stopping Tree Growth: Conditional Inference Trees
174 |
175 | # argument xval refers to the number of folds to use in rpart's built-in
176 | # cross-validation procedure
177 | # argument cp sets the smallest value for the complexity parameter.
178 | cv.ct <- rpart(Personal.Loan ~ ., data=train.df, method="class",
179 | cp=0.00001, minsplit=5, xval=5)
180 | # use printcp() to print the table.
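# printcp() reports, for each candidate subtree, the complexity parameter (CP),
# the number of splits (nsplit), the resubstitution error (rel error), and the
# cross-validated error (xerror) with its standard error (xstd); errors are scaled
# relative to the root-node error. The pruning steps below pick a CP value based
# on xerror and xstd.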
181 | printcp(cv.ct)
182 |
183 |
184 | # prune by lower cp
185 | pruned.ct <- prune(cv.ct,
186 | cp=cv.ct$cptable[which.min(cv.ct$cptable[,"xerror"]),"CP"])
187 | sum(pruned.ct$frame$var == "<leaf>")
188 | rpart.plot(pruned.ct, extra=1, fallen.leaves=FALSE)
189 |
190 |
191 | pdf(file.path("..", "figures", "chapter_09", "CT-universalTree-pruned.pdf"), width=5, height=2.5)
192 | rpart.plot(pruned.ct, extra=1, fallen.leaves=FALSE)
193 | dev.off()
194 |
195 | ### Best-Pruned Tree
196 |
197 | # prune by lower cp
198 | minErrorRow <- cv.ct$cptable[which.min(cv.ct$cptable[,"xerror"]), ]
199 | cutoff <- minErrorRow["xerror"] + minErrorRow["xstd"]
200 | best.cp <- cv.ct$cptable[cv.ct$cptable[,"xerror"] < cutoff,][1, "CP"]
201 |
202 | best.ct <- prune(cv.ct, cp=best.cp)
203 | sum(best.ct$frame$var == "<leaf>")
204 | rpart.plot(best.ct, extra=1, fallen.leaves=FALSE)
205 | pdf(file.path("..", "figures", "chapter_09", "CT-universalTree-best.pdf"), width=4, height=2.75)
206 | rpart.plot(best.ct, extra=1, fallen.leaves=FALSE)
207 | dev.off()
208 |
209 | ## Classification Rules from Trees
210 |
211 | rpart.rules(best.ct)
212 |
213 | ## Regression Trees
214 |
215 | # select variables for regression
216 | outcome <- "Price"
217 | predictors <- c("Age_08_04", "KM", "Fuel_Type", "HP", "Met_Color", "Automatic",
218 | "CC", "Doors", "Quarterly_Tax", "Weight")
219 | # reduce data set to first 1000 rows and selected variables
220 | car.df <- mlba::ToyotaCorolla[1:1000, c(outcome, predictors)]
221 |
222 | # partition data
223 | set.seed(1) # set seed for reproducing the partition
224 | idx <- createDataPartition(car.df$Price, p=0.6, list=FALSE)
225 | car.train.df <- car.df[idx, ]
226 | car.holdout.df <- car.df[-idx, ]
227 |
228 | # use method "anova" for a regression model
229 | cv.rt <- rpart(Price ~ ., data=car.train.df, method="anova",
230 | cp=0.00001, minsplit=5, xval=5)
231 |
232 | # prune by lower cp
233 | minErrorRow <- cv.rt$cptable[which.min(cv.rt$cptable[,"xerror"]), ]
234 | cutoff <- minErrorRow["xerror"] + minErrorRow["xstd"]
235 | best.cp <- cv.rt$cptable[cv.rt$cptable[,"xerror"] < cutoff,][1, "CP"]
236 |
237 | best.rt <- prune(cv.rt, cp=best.cp)
238 |
239 | # set digits to a negative number to avoid scientific notation
240 | rpart.plot(best.rt, extra=1, fallen.leaves=FALSE, digits=-4)
241 |
242 | pdf(file.path("..", "figures", "chapter_09", "RT-ToyotaTree.pdf"), width=7, height=4)
243 | rpart.plot(best.rt, extra=1, fallen.leaves=FALSE, digits=-4)
244 | dev.off()
245 |
246 | ## Improving Prediction: Random Forests and Boosted Trees
247 | ### Random Forests
248 |
249 | library(randomForest)
250 | ## random forest
251 | rf <- randomForest(Personal.Loan ~ ., data=train.df, ntree=500,
252 | mtry=4, nodesize=5, importance=TRUE)
253 |
254 | ## variable importance plot
255 | varImpPlot(rf, type=1)
256 |
257 | ## confusion matrix
258 | rf.pred <- predict(rf, holdout.df)
259 | confusionMatrix(rf.pred, holdout.df$Personal.Loan)
260 |
261 |
262 | pdf(file.path("..", "figures", "chapter_09", "VarImp.pdf"), width=7, height=4)
263 | varImpPlot(rf, type=1, main="")
264 | dev.off()
265 |
266 | ### Boosted Trees
267 |
268 | library(caret)
269 | library(xgboost)
270 |
271 | xgb <- train(Personal.Loan ~ ., data=train.df, method="xgbTree", verbosity=0)
272 |
273 | # compare ROC curves for classification tree, random forest, and boosted tree models
274 | library(ROCR)
275 | rocCurveData <- function(model, data) {
276 | prob <- predict(model, data, type="prob")[, "Yes"]
277 | predob <- prediction(prob, data$Personal.Loan)
278 | perf
<- performance(predob, "tpr", "fpr") 279 | return (data.frame(tpr=perf@x.values[[1]], fpr=perf@y.values[[1]])) 280 | } 281 | 282 | performance.df <- rbind( 283 | cbind(rocCurveData(best.ct, holdout.df), model="Best-pruned tree"), 284 | cbind(rocCurveData(rf, holdout.df), model="Random forest"), 285 | cbind(rocCurveData(xgb, holdout.df), model="xgboost") 286 | ) 287 | colors <- c("Best-pruned tree"="grey", "Random forest"="blue", "xgboost"="tomato") 288 | ggplot(performance.df, aes(x=tpr, y=fpr, color=model)) + 289 | geom_line() + 290 | scale_color_manual(values=colors) + 291 | geom_segment(aes(x=0, y=0, xend=1, yend=1), color="grey", linetype="dashed") + 292 | labs(x="1 - Specificity", y="Sensitivity", color="Model") 293 | 294 | 295 | library(gridExtra) 296 | g <- last_plot() + theme_bw() 297 | g1 <- g + guides(color="none") 298 | g2 <- g + scale_x_continuous(limits=c(0, 0.2)) + scale_y_continuous(limits=c(0.8, 1.0)) 299 | 300 | g <- arrangeGrob(g1, g2, widths=c(3, 4.5), ncol=2) 301 | ggsave(file=file.path("..", "figures", "chapter_09", "xgboost-ROC-1.pdf"), 302 | g, width=8, height=3, units="in") 303 | 304 | 305 | xgb.focused <- train(Personal.Loan ~ ., data=train.df, 306 | method="xgbTree", verbosity=0, 307 | scale_pos_weight=10) 308 | 309 | performance.df <- rbind( 310 | cbind(rocCurveData(xgb, holdout.df), model="xgboost"), 311 | cbind(rocCurveData(xgb.focused, holdout.df), model="xgboost (focused)") 312 | ) 313 | 314 | colors <- c("xgboost"="tomato", "xgboost (focused)"="darkgreen") 315 | ggplot(performance.df, aes(x=tpr, y=fpr, color=model)) + 316 | geom_line() + 317 | scale_color_manual(values=colors) + 318 | geom_segment(aes(x=0, y=0, xend=1, yend=1), color="grey", linetype="dashed") + 319 | labs(x="1 - Specificity", y="Sensitivity", color="Model") 320 | 321 | 322 | library(gridExtra) 323 | g <- last_plot() + theme_bw() 324 | g1 <- g + guides(color="none") 325 | g2 <- g + scale_x_continuous(limits=c(0, 0.2)) + scale_y_continuous(limits=c(0.8, 1.0)) 326 | 327 | g <- arrangeGrob(g1, g2, widths=c(3, 4.5), ncol=2) 328 | ggsave(file=file.path("..", "figures", "chapter_09", "xgboost-ROC-2.pdf"), 329 | g, width=8, height=3, units="in") 330 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 19 | 20 |
5 | **Machine Learning for Business Analytics:**
6 | **Concepts, Techniques, and Applications in R**
7 |
8 | by Galit Shmueli, Peter C. Bruce, Peter Gedeck, Inbal Yahav, Nitin R. Patel
9 |
10 | Publisher: Wiley; 2nd edition (February, 2023)
11 | ISBN: 978-1-118-83517-2
12 | Buy at Amazon or Wiley
13 |
14 |
15 |
16 |
17 |
18 |
21 |
22 | # Description
23 | Machine learning, also known as data mining or data analytics, is a fundamental part of data science. It is used by organizations in a wide variety of arenas to turn raw data into actionable information.
24 |
25 | *Machine Learning for Business Analytics: Concepts, Techniques, and Applications in R* provides a comprehensive introduction and an overview of this methodology. This best-selling textbook covers both statistical and machine learning algorithms for prediction, classification, visualization, dimension reduction, rule mining, recommendations, clustering, text mining, experimentation, and network analytics. Along with hands-on exercises and real-life case studies, it also discusses managerial and ethical issues for responsible use of machine learning techniques.
26 |
27 | # Source code, datasets, and instructors' material
28 | This repository contains:
29 |
30 | - `Rmd`: R code of individual chapters as
31 | [R markdown files](https://github.com/gedeck/mlba-R-code/tree/main/Rmd) -
32 | download all as [mlba-Rmd.zip](mlba-Rmd.zip)
33 | - `R`: R code of individual chapters as plain
34 | [R files](https://github.com/gedeck/mlba-R-code/tree/main/R) -
35 | download all as [mlba-R.zip](mlba-R.zip)
36 |
37 | The datasets are distributed using the [mlba](https://github.com/gedeck/mlba) package; see below for installation instructions.
38 | To find instructors' material, go to [www.dataminingbook.com](https://www.dataminingbook.com/book/r-2nd-edition-2022).
39 |
40 |
41 | # Installation of R packages used in the book
42 | R and most packages can be installed directly from [CRAN](https://cran.r-project.org/). Go there for instructions on how to install R and individual packages.
43 | The [RStudio IDE](https://posit.co/downloads/) is a popular development environment for working with R.
44 |
45 | ## MLBA
46 | The `mlba` package is available from [GitHub](https://github.com/gedeck/mlba). You can install this package using the following commands:
47 | ```
48 | if (!require(mlba)) {
49 | library(devtools)
50 | install_github("gedeck/mlba/mlba", force=TRUE)
51 | }
52 | ```
53 | Note that this requires the installation of the `devtools` package.
54 |
55 | ## DiscriMiner
56 | The `DiscriMiner` package is currently not available from CRAN. You can install it directly from GitHub as described in https://github.com/gastonstat/DiscriMiner:
57 | ```
58 | if (!require(DiscriMiner)) {
59 | library(devtools)
60 | install_github('DiscriMiner', username='gastonstat')
61 | }
62 | ```
63 |
64 | # Setting up an environment for deep learning applications
65 | In order to run the code for the deep learning applications, you will need to create a Python environment with the required packages. A convenient way to do this is to use [Anaconda](https://www.anaconda.com/products/individual). See [installPython.md](installPython.md) for instructions on how to install Anaconda and create the required environment.
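Once the environment is created, you can check from R that it is picked up correctly. A minimal check along these lines should work, assuming the conda environment is named `mlba-r` as in the chapter code:
```
library(reticulate)
use_condaenv("mlba-r")            # conda environment created for the book
library(keras)
library(tensorflow)
tf$constant("Hello, TensorFlow")  # prints a tensor if the setup is working
```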
-------------------------------------------------------------------------------- /Rmd/chap1.Rmd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gedeck/mlba-R-code/c6909650d140f4f0e4da6987f1f97daf5786ea20/Rmd/chap1.Rmd -------------------------------------------------------------------------------- /Rmd/chap11.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Machine Learning for Business Analytics" 3 | author: "Chapter 11: Neural Nets" 4 | output: 5 | pdf_document: 6 | toc: no 7 | highlight: tango 8 | # html_document: 9 | # toc: yes 10 | # toc_depth: 4 11 | # toc_float: yes 12 | --- 13 | 20 | ```{r global_options, include=FALSE} 21 | knitr::opts_chunk$set(error=TRUE, # Keep compiling upon error 22 | collapse=FALSE, # collapse by default 23 | echo=TRUE, # echo code by default 24 | comment = "#>", # change comment character 25 | fig.width = 5.5, # set figure width 26 | fig.align = "center",# set figure position 27 | out.width = "49%", # set width of displayed images 28 | warning=FALSE, # do not show R warnings 29 | message=FALSE, # do not show R messages 30 | row.print=25) 31 | ``` 32 | 33 | 34 | ```{r} 35 | if (!require(mlba)) { 36 | library(devtools) 37 | install_github("gedeck/mlba/mlba", force=TRUE) 38 | } 39 | ``` 40 | 41 | # Fitting a Network to Data 42 | ## Training the Model 43 | ### Back Propagation of Error 44 | ```{r} 45 | library(neuralnet) 46 | df <- mlba::TinyData 47 | df$Like <- df$Acceptance=="like" 48 | df$Dislike <- df$Acceptance=="dislike" 49 | set.seed(1) 50 | ``` 51 | 52 | ```{r} 53 | nn <- neuralnet(Like + Dislike ~ Salt + Fat, data = df, linear.output = F, hidden = 3) 54 | 55 | # display weights 56 | nn$weights 57 | 58 | # display predictions 59 | prediction(nn) 60 | 61 | # plot network 62 | plot(nn, rep="best") 63 | ``` 64 | 65 | ```{r} 66 | 67 | ``` 68 | 69 | ```{r} 70 | library(caret) 71 | predict <- compute(nn, data.frame(df$Salt, df$Fat)) 72 | predicted.class=apply(predict$net.result,1,which.max)-1 73 | confusionMatrix(factor(ifelse(predicted.class=="1", "dislike", "like")), 74 | factor(df$Acceptance)) 75 | ``` 76 | 77 | ```{r} 78 | 79 | ``` 80 | 81 | ## Example 2: Classifying Accident Severity 82 | ```{r} 83 | library(tidyverse) 84 | library(fastDummies) 85 | 86 | # convert SUR_COND and ALCHL_I to dummy variables (remove firest dummy) 87 | # convert outcome MAX_SEV_IR to dummy variables keeping all 88 | accidents.df <- mlba::AccidentsNN %>% 89 | dummy_cols(select_columns=c("ALCHL_I", "SUR_COND"), 90 | remove_selected_columns=TRUE, remove_first_dummy=TRUE) %>% 91 | dummy_cols(select_columns=c("MAX_SEV_IR"), 92 | remove_selected_columns=TRUE) 93 | 94 | # partition the data 95 | set.seed(1) 96 | idx <- createDataPartition(mlba::AccidentsNN$MAX_SEV_IR, p=0.6, list=FALSE) 97 | train.df <- accidents.df[idx, ] 98 | holdout.df <- accidents.df[-idx, ] 99 | train.actual <- mlba::AccidentsNN[idx, ]$MAX_SEV_IR 100 | holdout.actual <- mlba::AccidentsNN[-idx, ]$MAX_SEV_IR 101 | 102 | nn <- neuralnet(MAX_SEV_IR_0 + MAX_SEV_IR_1 + MAX_SEV_IR_2 ~ ., 103 | data=train.df, hidden=2) 104 | 105 | # predict the three outcome variables and assign class using maximum score 106 | pred.train <- predict(nn, train.df) 107 | class.train <- apply(pred.train, 1, which.max)-1 108 | confusionMatrix(factor(class.train), factor(train.actual)) 109 | 110 | pred.holdout <- predict(nn, holdout.df) 111 | class.holdout <- apply(pred.holdout, 1, which.max)-1 112 | 
confusionMatrix(factor(class.holdout), factor(holdout.actual)) 113 | ``` 114 | 115 | ```{r} 116 | # 1 not able to predict 2 0.8496 117 | # 2, 3 0.8697 118 | # 4, 5 0.8647 119 | ``` 120 | 121 | # Deep Learning 122 | ## Example: Classification of Fashion Images 123 | ```{r} 124 | # code to prepare a similar picture using R 125 | library(reticulate) 126 | use_condaenv('mlba-r') 127 | library(keras) 128 | 129 | fashion_mnist <- keras::dataset_fashion_mnist() 130 | clothes.labels <- c('Top', 'Trouser', 'Pullover', 'Dress', 'Coat', 131 | 'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Boot') 132 | x_train <- fashion_mnist$train$x 133 | y_train <- fashion_mnist$train$y 134 | 135 | rotate <- function(x) t(apply(x, 2, rev)) 136 | plot_image <- function(x, title = "", title.color = "black") { 137 | x <- rotate(x) 138 | image(x, axes=FALSE, col=grey(seq(0, 1, length = 256)), 139 | main=list(title, col=title.color)) 140 | } 141 | plot_sample <- function() { 142 | par(mfrow=c(4, 5), mar=c(0, 0.2, 1, 0.2)) 143 | for (offset in c(0, 5)) { 144 | range <- (1+offset):(5+offset) 145 | for (i in range) { 146 | examples <- which(y_train %in% (i-1)) 147 | example <- examples[1] 148 | plot_image(x_train[example, , ], clothes.labels[y_train[example] + 1]) 149 | } 150 | for (i in range) { 151 | examples <- which(y_train %in% (i-1)) 152 | example <- examples[2] 153 | plot_image(x_train[example, , ]) 154 | } 155 | } 156 | } 157 | plot_sample() 158 | 159 | pdf(file=file.path("..", "figures", "chapter_11", "fashion-mnist-sample.pdf"), width=5, height=4) 160 | plot_sample() 161 | dev.off() 162 | ``` 163 | 164 | ### Data Preprocessing 165 | ```{r} 166 | # load required packages 167 | # Keras and TensorFlow require a Python conda environment with these packages installed 168 | library(reticulate) 169 | use_condaenv('mlba-r') 170 | library(keras) 171 | library(tensorflow) 172 | 173 | # load the data and split into training and validation sets 174 | fashion_mnist <- keras::dataset_fashion_mnist() 175 | x_train <- fashion_mnist$train$x 176 | y_train <- fashion_mnist$train$y 177 | x_valid <- fashion_mnist$test$x 178 | y_valid <- fashion_mnist$test$y 179 | 180 | # pixel values need to be scaled to range [0, 1] 181 | x_train <- x_train / 255 182 | x_valid <- x_valid / 255 183 | 184 | # input require an additional dimension to describe pixel values 185 | # dimensions are (samples, row, column, pixel) 186 | x_train <- array_reshape(x_train, c(dim(x_train), 1)) 187 | x_valid <- array_reshape(x_valid, c(dim(x_valid), 1)) 188 | 189 | # output values need to be converted into a matrix with one-hot-encoding of classes 190 | # dimensions are (samples, classes) 191 | y_train <- to_categorical(y_train, 10) 192 | y_valid <- to_categorical(y_valid, 10) 193 | dim(x_train) 194 | dim(y_train) 195 | ``` 196 | 197 | ```{r, eval=FALSE} 198 | # Model definition (architecture taken from 199 | # https://keras.rstudio.com/articles/examples/mnist_cnn.html 200 | input_shape = dim(x_train)[2:4] 201 | num_classes <- 10 202 | 203 | model <- keras_model_sequential() 204 | model %>% 205 | layer_conv_2d(filters=32, kernel_size=c(5,5), activation='relu', 206 | input_shape=input_shape) %>% 207 | layer_conv_2d(filters=64, kernel_size=c(3,3), activation='relu') %>% 208 | layer_max_pooling_2d(pool_size=c(2,2)) %>% 209 | layer_dropout(rate=0.25) %>% 210 | layer_flatten() %>% 211 | layer_dense(units = 128, activation = 'relu') %>% 212 | layer_dropout(rate = 0.5) %>% 213 | layer_dense(units = num_classes, activation = 'softmax') 214 | 215 | model 216 | 217 | # compile model 
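# categorical crossentropy matches the one-hot-encoded labels created above;
# adadelta is the optimizer used in the Keras MNIST example this architecture
# is based on; accuracy is tracked on the validation data during training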
218 | model %>% compile( 219 | loss = loss_categorical_crossentropy, 220 | optimizer = optimizer_adadelta(), 221 | metrics = c('accuracy') 222 | ) 223 | 224 | # train and evaluate 225 | model %>% fit( 226 | x_train, y_train, 227 | batch_size = 128, 228 | epochs = 20, 229 | verbose = 1, 230 | validation_data = list(x_valid, y_valid) 231 | ) 232 | ``` 233 | 234 | ### Training a Deep Learning Network 235 | ```{r} 236 | library(gridExtra) 237 | df <- read.csv("cv-training.csv") 238 | g1 <- ggplot(df, aes(x=epoch)) + 239 | geom_line(aes(y=loss), color="steelblue") + 240 | geom_line(aes(y=val_loss), color="tomato") + 241 | geom_hline(yintercept=min(df$val_loss), color="black", linetype="dotted") + 242 | geom_vline(xintercept=which.min(df$val_loss), color="black", linetype="dotted") + 243 | labs(x="Epoch", y="Loss") + 244 | theme_bw() 245 | g2 <- ggplot(df, aes(x=epoch)) + 246 | geom_line(aes(y=accuracy), color="steelblue") + 247 | geom_line(aes(y=val_accuracy), color="tomato") + 248 | geom_hline(yintercept=max(df$val_accuracy), color="black", linetype="dotted") + 249 | labs(x="Epoch", y="Accuracy") + 250 | theme_bw() 251 | grid.arrange(g1, g2, ncol=2) 252 | 253 | 254 | g <- arrangeGrob(g1, g2, ncol=2) 255 | ggsave(file=file.path("..", "figures", "chapter_11", "fashion-mnist-learning.pdf"), 256 | g, width=5, height=2.5, units="in") 257 | ``` 258 | 259 | ### Applying the Predictive Model 260 | ```{r} 261 | model <- load_model_tf("cnn-model.tf") 262 | ``` 263 | 264 | ```{r} 265 | propensities <- predict(model, x_valid) 266 | propensities[1:5, ] 267 | 268 | # convert to class using winner takes all 269 | predClass <- apply(propensities, 1, which.max) 270 | predClass[1:5] 271 | 272 | # confusion matrix 273 | caret::confusionMatrix(factor(predClass), factor(fashion_mnist$test$y + 1)) 274 | ``` 275 | 276 | ```{r} 277 | 278 | ``` 279 | 280 | -------------------------------------------------------------------------------- /Rmd/chap12.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Machine Learning for Business Analytics" 3 | author: "Chapter 12: Discriminant Analysis" 4 | output: 5 | pdf_document: 6 | toc: no 7 | highlight: tango 8 | # html_document: 9 | # toc: yes 10 | # toc_depth: 4 11 | # toc_float: yes 12 | --- 13 | 20 | ```{r global_options, include=FALSE} 21 | knitr::opts_chunk$set(error=TRUE, # Keep compiling upon error 22 | collapse=FALSE, # collapse by default 23 | echo=TRUE, # echo code by default 24 | comment = "#>", # change comment character 25 | fig.width = 5.5, # set figure width 26 | fig.align = "center",# set figure position 27 | out.width = "49%", # set width of displayed images 28 | warning=FALSE, # do not show R warnings 29 | message=FALSE, # do not show R messages 30 | row.print=25) 31 | ``` 32 | 33 | 34 | ```{r} 35 | if (!require(mlba)) { 36 | library(devtools) 37 | install_github("gedeck/mlba/mlba", force=TRUE) 38 | } 39 | ``` 40 | 41 | # Introduction 42 | ## Example 1: Riding Mowers 43 | ```{r} 44 | library(ggplot2) 45 | mowers.df <- mlba::RidingMowers 46 | g <- ggplot(mowers.df, mapping=aes(x=Income, y=Lot_Size, color=Ownership, fill=Ownership)) + 47 | geom_point(size=4) + 48 | geom_abline(intercept=40, slope=-0.34) + 49 | scale_shape_manual(values = c(15, 21)) + 50 | scale_color_manual(values = c('darkorange', 'steelblue')) + 51 | scale_fill_manual(values = c('darkorange', 'lightblue')) 52 | 53 | g 54 | 55 | ggsave(file=file.path("..", "figures", "chapter_12", "riding-mower.pdf"), 56 | g + theme_bw(), width=6, height=4, 
units="in") 57 | ``` 58 | 59 | ## Example 2: Personal Loan Acceptance 60 | ```{r} 61 | library(gridExtra) 62 | makePlot <- function(df, title, alpha) { 63 | no_personal_loan <- subset(df, Personal.Loan == 0) 64 | personal_loan <- subset(df, Personal.Loan == 1) 65 | 66 | g <- ggplot(universal.df, aes(x=Income, y=CCAvg, color=Personal.Loan)) + 67 | geom_point(aes(color="nonacceptor"), data=no_personal_loan, alpha=alpha) + 68 | geom_point(aes(color="acceptor"), data=personal_loan) + 69 | labs(title=title, colour="Personal Loan", x='Annual income ($000s)', 70 | y='Monthly average credit card spending ($000s)') + 71 | scale_color_manual(values=c("lightblue", "steelblue"), 72 | guide=guide_legend(override.aes=list(size=3, alpha=1))) + 73 | scale_x_log10() + 74 | scale_y_log10() + 75 | theme_bw() 76 | 77 | return (g) 78 | } 79 | 80 | set.seed(1) 81 | universal.df <- mlba::UniversalBank 82 | idx <- sample(dim(universal.df)[1], 200) 83 | g1 <- makePlot(universal.df[idx, ], 'Sample of 200 customers', 1.0) + 84 | theme(legend.position = c(0.2, 0.85)) 85 | g2 <- makePlot(universal.df, 'All 5000 customers', 0.5) + 86 | guides(color="none") 87 | grid.arrange(g1, g2, ncol=2) 88 | 89 | g <- arrangeGrob(g1, g2, ncol=2) 90 | ggsave(file=file.path("..", "figures", "chapter_12", "personalLoan_sampled.pdf"), 91 | g, width=8, height=4, units="in") 92 | ``` 93 | 94 | # Fisher's Linear Classification Functions 95 | ```{r} 96 | library(caret) 97 | mowers.df <- mlba::RidingMowers 98 | trControl <- caret::trainControl(method='none') 99 | model <- train(Ownership ~ Income + Lot_Size, data=mowers.df, 100 | method='lda', trControl=trControl) 101 | model$finalModel # access the wrapped LDA model 102 | 103 | # DiscriMiner exposes the Fisher's linear classification function 104 | library(DiscriMiner) 105 | mowers.df <- mlba::RidingMowers 106 | da.mower <- linDA(mowers.df[,1:2], mowers.df[,3]) 107 | da.mower$functions 108 | ``` 109 | 110 | ```{r} 111 | da.mower <- linDA(mowers.df[,1:2], mowers.df[,3]) 112 | # compute propensities manually (below); or, use lda() in package MASS or caret with predict() 113 | propensity.owner <- exp(da.mower$scores[,2])/(exp(da.mower$scores[,1])+exp(da.mower$scores[,2])) 114 | data.frame(Actual=mowers.df$Ownership, Predicted=da.mower$classification, 115 | da.mower$scores, propensity.owner=propensity.owner) 116 | ``` 117 | 118 | ```{r} 119 | library(ggplot2) 120 | da.mower <- train(Ownership ~ Income + Lot_Size, data=mowers.df, 121 | method='lda', trControl=trControl) 122 | means <- colSums(da.mower$finalModel$means) / 2 123 | sIncome <- da.mower$finalModel$scaling['Income', 'LD1'] 124 | sLotSize <- da.mower$finalModel$scaling['Lot_Size', 'LD1'] 125 | m <- - sIncome / sLotSize 126 | y0 <- means['Lot_Size'] - m * means['Income'] 127 | 128 | mowers.df <- mlba::RidingMowers 129 | g <- ggplot(mowers.df, mapping=aes(x=Income, y=Lot_Size, color=Ownership, fill=Ownership)) + 130 | geom_point(size=4) + 131 | geom_point(data=data.frame(da.mower$finalModel$means), color='black', fill='black', shape=4, size=3) + 132 | geom_abline(aes(linetype='ad hoc line', intercept=40, slope=-0.34), color='darkgrey') + 133 | geom_abline(aes(linetype='LDA line', intercept=y0, slope=m)) + 134 | scale_shape_manual(values = c(15, 21)) + 135 | scale_color_manual(values = c('darkorange', 'steelblue')) + 136 | scale_fill_manual(values = c('darkorange', 'lightblue')) + 137 | scale_linetype_manual(name='Linetype', values=c(2, 1), labels=c('ad hoc line', 'LDA line')) + 138 | guides(fill = guide_legend(order = 1), color = 
guide_legend(order = 1), 139 | linetype = guide_legend(order = 2)) 140 | g 141 | ggsave(file=file.path("..", "figures", "chapter_12", "LDA-riding-mower.pdf"), 142 | g + theme_bw(), width=6, height=4, units="in") 143 | ``` 144 | 145 | # Prior Probabilities 146 | ```{r} 147 | trControl <- caret::trainControl(method='none') 148 | model <- train(Ownership ~ Income + Lot_Size, data=mowers.df, 149 | method='lda', trControl=trControl) 150 | model.prior <- train(Ownership ~ Income + Lot_Size, data=mowers.df, 151 | method='lda', prior=c(0.85, 0.15), 152 | trControl=trControl) 153 | 154 | family.13 <- mowers.df[13,] 155 | predict(model, family.13) 156 | predict(model.prior, family.13) 157 | ``` 158 | 159 | # Classifying More Than Two Classes 160 | ## Example 3: Medical Dispatch to Accident Scenes 161 | ```{r} 162 | library(DiscriMiner) 163 | library(caret) 164 | 165 | accidents.df <- mlba::Accidents 166 | lda.model <- linDA(accidents.df[,1:10], accidents.df[,11]) 167 | lda.model$functions 168 | confusionMatrix(as.factor(lda.model$classification), as.factor(accidents.df$MAX_SEV)) 169 | ``` 170 | 171 | ```{r} 172 | propensity <- exp(lda.model$scores[,1:3])/ 173 | (exp(lda.model$scores[,1])+exp(lda.model$scores[,2])+exp(lda.model$scores[,3])) 174 | 175 | res <- data.frame(Actual = accidents.df$MAX_SEV, 176 | Classification = lda.model$classification, 177 | Score = round(lda.model$scores,2), 178 | Propensity = round(propensity,2)) 179 | head(res) 180 | ``` 181 | 182 | ```{r} 183 | 184 | ``` 185 | 186 | ```{r} 187 | library(tidyverse) 188 | accidents.df <- mlba::Accidents %>% 189 | mutate(MAX_SEV = factor(MAX_SEV)) 190 | da.model <- train(MAX_SEV ~ ., data=accidents.df, method='lda') 191 | res <- data.frame(Actual=accidents.df$MAX_SEV, 192 | Classification=predict(da.model), 193 | Propensity=predict(da.model, type='prob') %>% round(2)) 194 | head(res) 195 | ``` 196 | 197 | ```{r} 198 | 199 | ``` 200 | 201 | -------------------------------------------------------------------------------- /Rmd/chap13.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Machine Learning for Business Analytics" 3 | author: "Chapter 13: Generating, Comparing, and Combining Multiple Models" 4 | output: 5 | pdf_document: 6 | toc: no 7 | highlight: tango 8 | # html_document: 9 | # toc: yes 10 | # toc_depth: 4 11 | # toc_float: yes 12 | --- 13 | 20 | ```{r global_options, include=FALSE} 21 | knitr::opts_chunk$set(error=TRUE, # Keep compiling upon error 22 | collapse=FALSE, # collapse by default 23 | echo=TRUE, # echo code by default 24 | comment = "#>", # change comment character 25 | fig.width = 5.5, # set figure width 26 | fig.align = "center",# set figure position 27 | out.width = "49%", # set width of displayed images 28 | warning=FALSE, # do not show R warnings 29 | message=FALSE, # do not show R messages 30 | row.print=25) 31 | ``` 32 | 33 | 34 | ```{r} 35 | if (!require(mlba)) { 36 | library(devtools) 37 | install_github("gedeck/mlba/mlba", force=TRUE) 38 | } 39 | ``` 40 | 41 | # Ensembles 42 | ## Bagging and Boosting in R 43 | ### Combining Propensities 44 | ```{r} 45 | library(tidyverse) 46 | library(adabag) 47 | library(rpart) 48 | library(caret) 49 | 50 | set.seed(1) 51 | # load and preprocess the data 52 | bank.df <- mlba::UniversalBank %>% 53 | select(-c(ID, ZIP.Code)) %>% 54 | mutate( 55 | Personal.Loan = factor(Personal.Loan, levels=c(0, 1), labels=c("No", "Yes")) 56 | ) 57 | 58 | # partition the data 59 | train.index <- sample(c(1:dim(bank.df)[1]), 
dim(bank.df)[1]*0.6) 60 | train.df <- bank.df[train.index, ] 61 | holdout.df <- bank.df[-train.index, ] 62 | 63 | # single tree (rpart) 64 | tr <- rpart(Personal.Loan ~ ., data=train.df) 65 | 66 | # bagging and boosting using adabag 67 | bag <- bagging(Personal.Loan ~ ., data=train.df) 68 | boost <- boosting(Personal.Loan ~ ., data=train.df) 69 | 70 | # bagging and boosting using randomForest and xgboost with parameter tuning 71 | bag.rf <- train(Personal.Loan ~ ., data=train.df, method="rf") 72 | boost.xgb <- train(Personal.Loan ~ ., data=train.df, method="xgbTree", verbosity=0) 73 | ``` 74 | 75 | ```{r} 76 | library(ROCR) 77 | rocCurveData <- function(prob, data) { 78 | predob <- prediction(prob, data$Personal.Loan) 79 | perf <- performance(predob, "tpr", "fpr") 80 | return (data.frame(tpr=perf@x.values[[1]], fpr=perf@y.values[[1]])) 81 | } 82 | performance.df <- rbind( 83 | cbind(rocCurveData(predict(tr, holdout.df, type="prob")[,"Yes"], holdout.df), model="Single tree"), 84 | cbind(rocCurveData(predict(bag, holdout.df)$prob[, 2], holdout.df), model="Bagging"), 85 | cbind(rocCurveData(predict(boost, holdout.df)$prob[, 2], holdout.df), model="Boosting") 86 | ) 87 | colors <- c("Single tree"="grey", "Bagging"="blue", "Boosting"="tomato") 88 | g1 <- ggplot(performance.df, aes(x=tpr, y=fpr, color=model)) + 89 | geom_line() + 90 | scale_color_manual(values=colors) + 91 | geom_segment(aes(x=0, y=0, xend=1, yend=1), color="grey", linetype="dashed") + 92 | labs(x="1 - Specificity", y="Sensitivity", color="Model") 93 | g1 94 | 95 | performance.df <- rbind( 96 | cbind(rocCurveData(predict(tr, holdout.df, type="prob")[,"Yes"], holdout.df), 97 | model="Single tree"), 98 | cbind(rocCurveData(predict(bag.rf, holdout.df, type="prob")[,"Yes"], holdout.df), 99 | model="Random forest"), 100 | cbind(rocCurveData(predict(boost.xgb, holdout.df, type="prob")[,"Yes"], holdout.df), 101 | model="xgboost") 102 | ) 103 | colors <- c("Single tree"="grey", "Random forest"="blue", "xgboost"="tomato") 104 | g2 <- ggplot(performance.df, aes(x=tpr, y=fpr, color=model)) + 105 | geom_line() + 106 | scale_color_manual(values=colors) + 107 | geom_segment(aes(x=0, y=0, xend=1, yend=1), color="grey", linetype="dashed") + 108 | labs(x="1 - Specificity", y="Sensitivity", color="Model") 109 | g2 110 | library(gridExtra) 111 | g <- arrangeGrob(g1 + theme_bw(), g2 + theme_bw(), ncol=2, widths=c(0.49, 0.51)) 112 | ggsave(file=file.path("..", "figures", "chapter_13", "bagging-boosting.pdf"), 113 | g, width=8, height=3, units="in") 114 | ``` 115 | 116 | # Automated Machine Learning (AutoML) 117 | ## AutoML: Explore and Clean Data 118 | ```{r} 119 | library(tidyverse) 120 | 121 | # load and preprocess the data 122 | bank.df <- mlba::UniversalBank %>% 123 | # Drop ID and zip code columns. 
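  # (ID is a record identifier and ZIP.Code a high-cardinality label;
  #  neither is used as a predictor here, so both are dropped before modeling)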
124 | select(-c(ID, ZIP.Code)) %>% 125 | # convert Personal.Loan to a factor with labels Yes and No 126 | mutate(Personal.Loan = factor(Personal.Loan, levels=c(0, 1), labels=c("No", "Yes"))) 127 | 128 | # partition the data 129 | set.seed(1) 130 | idx <- caret::createDataPartition(bank.df$Personal.Loan, p=0.6, list=FALSE) 131 | train.df <- bank.df[idx, ] 132 | holdout.df <- bank.df[-idx, ] 133 | ``` 134 | 135 | ```{r} 136 | library(h2o) 137 | 138 | # Start the H2O cluster (locally) 139 | h2o.init() 140 | 141 | train.h2o <- as.h2o(train.df) 142 | holdout.h2o <- as.h2o(holdout.df) 143 | ``` 144 | 145 | ```{r} 146 | 147 | ``` 148 | 149 | ## AutoML: Choose Features and Machine Learning Methods 150 | ```{r} 151 | # identify outcome and predictors 152 | y <- "Personal.Loan" 153 | x <- setdiff(names(train.df), y) 154 | 155 | # run AutoML for 20 base models 156 | aml <- h2o.automl(x=x, y=y, training_frame=train.h2o, 157 | max_models=20, exclude_algos=c("DeepLearning"), 158 | seed=1) 159 | aml.balanced <- h2o.automl(x=x, y=y, training_frame=train.h2o, 160 | max_models=20, exclude_algos=c("DeepLearning"), 161 | balance_classes=TRUE, 162 | seed=1) 163 | 164 | aml 165 | ``` 166 | 167 | ```{r} 168 | aml.balanced 169 | ``` 170 | 171 | ## AutoML: Evaluate Model Performance 172 | ```{r} 173 | h2o.confusionMatrix(aml@leader, holdout.h2o) 174 | h2o.confusionMatrix(aml.balanced@leader, holdout.h2o) 175 | ``` 176 | 177 | # Explaining Model Predictions 178 | ## Explaining Model Predictions: LIME 179 | ```{r} 180 | cases <- c('3055', '3358', # predicted Yes 181 | '2', '1178') # predicted No 182 | explainer <- lime::lime(train.df, aml@leader, bin_continuous=TRUE, quantile_bins=FALSE) 183 | explanations <- lime::explain(holdout.df[cases,], explainer, n_labels=1, n_features=8) 184 | 185 | lime::plot_features(explanations, ncol=2) 186 | ``` 187 | 188 | ```{r} 189 | pdf(file=file.path("..", "figures", "chapter_13", "lime-analysis.pdf"), 190 | width=7, height=6) 191 | lime::plot_features(explanations, ncol=2) 192 | dev.off() 193 | ``` 194 | 195 | -------------------------------------------------------------------------------- /Rmd/chap14.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Machine Learning for Business Analytics" 3 | author: "Chapter 14: Interventions: Experiments, Uplift Models, and Reinforcement Learning" 4 | output: 5 | pdf_document: 6 | toc: no 7 | highlight: tango 8 | # html_document: 9 | # toc: yes 10 | # toc_depth: 4 11 | # toc_float: yes 12 | --- 13 | 20 | ```{r global_options, include=FALSE} 21 | knitr::opts_chunk$set(error=TRUE, # Keep compiling upon error 22 | collapse=FALSE, # collapse by default 23 | echo=TRUE, # echo code by default 24 | comment = "#>", # change comment character 25 | fig.width = 5.5, # set figure width 26 | fig.align = "center",# set figure position 27 | out.width = "49%", # set width of displayed images 28 | warning=FALSE, # do not show R warnings 29 | message=FALSE, # do not show R messages 30 | row.print=25) 31 | ``` 32 | 33 | 34 | ```{r} 35 | if (!require(mlba)) { 36 | library(devtools) 37 | install_github("gedeck/mlba/mlba", force=TRUE) 38 | } 39 | ``` 40 | 41 | # A/B Testing 42 | ## The statistical test for comparing two groups (T-test) 43 | ```{r} 44 | pt(q=2.828, df=1998, lower.tail=FALSE) 45 | pt(q=2.626, df=1998, lower.tail=FALSE) 46 | ``` 47 | 48 | # Uplift (Persuasion) Modeling 49 | ## Computing Uplift with R 50 | ```{r} 51 | library(tidyverse) 52 | # load and preprocess the data 53 | predictors <- 
c("AGE", "NH_WHITE", "COMM_PT", "H_F1", "REG_DAYS", 54 | "PR_PELIG", "E_PELIG", "POLITICALC", "MESSAGE_A") 55 | outcome <- "MOVED_AD" 56 | voter.df <- mlba::VoterPersuasion %>% 57 | select(all_of(c(predictors, outcome))) 58 | 59 | 60 | set.seed(1) 61 | nrows <- dim(voter.df)[1] 62 | train.index <- sample(1:nrows, nrows * 0.6) 63 | train.df <- voter.df[train.index, ] 64 | holdout.df <- voter.df[-train.index, ] 65 | 66 | # build a random forest model using caret 67 | train_control <- caret::trainControl(method="none") 68 | model <- caret::train(MOVED_AD ~ ., data=train.df, 69 | trControl=train_control, 70 | method="rf") 71 | 72 | # calculating the uplift 73 | uplift_df <- data.frame(holdout.df) 74 | uplift_df$MESSAGE_A <- 1 75 | predTreatment <- predict(model, newdata=uplift_df, type="prob") 76 | uplift_df$MESSAGE_A <- 0 77 | predControl <- predict(model, newdata=uplift_df, type="prob") 78 | upliftResult <- data.frame( 79 | probMessage = predTreatment[, 1], 80 | probNoMessage = predControl[, 1] 81 | ) 82 | upliftResult$uplift <- upliftResult$probMessage - upliftResult$probNoMessage 83 | head(upliftResult) 84 | ``` 85 | 86 | ```{r} 87 | 88 | ``` 89 | 90 | # Reinforcement Learning 91 | ## Example of using a Contextual Multi-Arm Bandit for Movie Recommendations 92 | ```{r} 93 | library(tidyverse) 94 | library(mlba) 95 | library(contextual) 96 | library(data.table) 97 | library(splitstackshape) 98 | 99 | # preprocess movies data to create indicator variables for the different genres 100 | movies_dat <- as.data.table(mlba::MovieLensMovies) 101 | movies_dat <- splitstackshape::cSplit_e(movies_dat, "genres", sep="|", type="character", 102 | fill=0, drop=TRUE) 103 | movies_dat[[3]] <- NULL # deletes the third column 104 | 105 | ratings_dat <- as.data.table(mlba::MovieLensRatings) 106 | all_movies <- ratings_dat[movies_dat, on=c(movieId="movieId")] 107 | all_movies <- na.omit(all_movies, cols=c("movieId", "userId")) 108 | # renumber userId to sequential numbers starting at 1 109 | all_movies[, userId := as.numeric(as.factor(userId))] 110 | 111 | # find the top-50 most frequently rated movies 112 | top_50 <- all_movies %>% 113 | count(movieId) %>% 114 | slice_max(n, n=50) %>% 115 | pull(movieId) 116 | top_50_movies <- all_movies[movieId %in% top_50] 117 | # renumber movieId to sequential numbers starting at 1 118 | top_50_movies[, movieId := as.numeric(as.factor(movieId))] 119 | 120 | # create profile of genres for each movie in the top-50 (arm_features) 121 | arm_features <- top_50_movies %>% 122 | select(-c(userId, rating, timestamp, title)) %>% 123 | # select one row for each movieId 124 | group_by(movieId) %>% slice(1) %>% ungroup() 125 | 126 | # for each user, create their profile of genre preferences based on 127 | # their viewed movies that are not in the top-50 (user_features) 128 | user_features <- all_movies %>% 129 | filter(! 
movieId %in% top_50) %>% # restrict to movies not in the top-50 130 | select(-c(movieId, rating, timestamp, title)) %>% 131 | # for each user, sum 132 | group_by(userId) %>% 133 | summarise_all(sum) %>% 134 | # normalize user profile 135 | group_by(userId) %>% 136 | mutate( 137 | total = sum(c_across(genres_Action:genres_Western)), 138 | across(genres_Action:genres_Western, ~ ./total) 139 | ) %>% 140 | select(-c(total)) %>% 141 | as.data.table() 142 | 143 | 144 | # add users who only rated top-50 movies 145 | # their genre preference profile is set to 0 for all genres 146 | all_users <- as.data.table(unique(all_movies$userId)) 147 | user_features <- user_features[all_users, on=c(userId="V1")] 148 | user_features[is.na(user_features)] <- 0 149 | setorder(user_features, userId) 150 | ``` 151 | 152 | ```{r} 153 | 154 | ``` 155 | 156 | ```{r} 157 | # prepare the data for use with the contextual package 158 | top_50_movies[, t := .I] 159 | top_50_movies[, sim := 1] 160 | top_50_movies[, agent := "Offline"] 161 | top_50_movies[, choice := movieId] 162 | top_50_movies[, reward := ifelse(rating <= 4, 0, 1)] 163 | setorder(top_50_movies,timestamp, title) 164 | 165 | # the bandit samples users with their genre preferences (user_features), 166 | # movie choices (choice), and ratings. 167 | # each movie is characterized by the genre profile (arm_features) 168 | # these data are used to train the agent 169 | environment <- OfflineLookupReplayEvaluatorBandit$new( 170 | top_50_movies, 171 | k = 50, 172 | unique_col = "userId", 173 | unique_lookup = user_features, 174 | shared_lookup = arm_features) 175 | 176 | # define list of strategies to evaluate 177 | agents <-list( 178 | Agent$new(RandomPolicy$new(), environment, "Random"), 179 | Agent$new(LinUCBDisjointOptimizedPolicy$new(2.1), environment, "LinUCB Dis")) 180 | 181 | # setup and run simulation 182 | simulation <- Simulator$new( 183 | agents = agents, 184 | simulations = 20, 185 | horizon = 10000L, 186 | save_interval = 1) 187 | results <- simulation$run() 188 | 189 | plot(results, type="cumulative", regret=FALSE, rate=TRUE, 190 | legend_position="topleft", disp="sd") 191 | ``` 192 | 193 | ```{r} 194 | pdf(file=file.path("..", "figures", "chapter_14", "mab-movielens.pdf"), width=6, height=4) 195 | plot(results, type="cumulative", regret=FALSE, rate=TRUE, 196 | legend_position="topleft", disp="sd") 197 | dev.off() 198 | ``` 199 | 200 | -------------------------------------------------------------------------------- /Rmd/chap15.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Machine Learning for Business Analytics" 3 | author: "Chapter 15: Association Rules and Collaborative Filtering" 4 | output: 5 | pdf_document: 6 | toc: no 7 | highlight: tango 8 | # html_document: 9 | # toc: yes 10 | # toc_depth: 4 11 | # toc_float: yes 12 | --- 13 | 20 | ```{r global_options, include=FALSE} 21 | knitr::opts_chunk$set(error=TRUE, # Keep compiling upon error 22 | collapse=FALSE, # collapse by default 23 | echo=TRUE, # echo code by default 24 | comment = "#>", # change comment character 25 | fig.width = 5.5, # set figure width 26 | fig.align = "center",# set figure position 27 | out.width = "49%", # set width of displayed images 28 | warning=FALSE, # do not show R warnings 29 | message=FALSE, # do not show R messages 30 | row.print=25) 31 | ``` 32 | 33 | 34 | ```{r} 35 | if (!require(mlba)) { 36 | library(devtools) 37 | install_github("gedeck/mlba/mlba", force=TRUE) 38 | } 39 | ``` 40 | 41 | # Association 
Rules 42 | ## The Process of Rule Selection 43 | ### Lift 44 | ```{r} 45 | library(arules) 46 | fp.df <- mlba::Faceplate 47 | 48 | # remove first column and convert to matrix 49 | fp.mat <- as.matrix(fp.df[, -1]) 50 | 51 | # convert the binary incidence matrix into a transactions database 52 | fp.trans <- as(fp.mat, "transactions") 53 | inspect(fp.trans) 54 | 55 | ## get rules 56 | # when running apriori(), include the minimum support, minimum confidence, and target 57 | # as arguments. 58 | rules <- apriori(fp.trans, parameter = list(supp = 0.2, conf = 0.5, target = "rules")) 59 | 60 | # inspect the first six rules, sorted by their lift 61 | inspect(head(sort(rules, by = "lift"), n = 6)) 62 | ``` 63 | 64 | ## Example 2: Rules for Similar Book Purchases 65 | ```{r} 66 | all.books.df <- mlba::CharlesBookClub 67 | 68 | # create a binary incidence matrix 69 | count.books.df <- all.books.df[, 8:18] 70 | incid.books.mat <- as.matrix(count.books.df > 0) 71 | 72 | # convert the binary incidence matrix into a transactions database 73 | books.trans <- as(incid.books.mat, "transactions") 74 | inspect(books.trans[1:10]) 75 | 76 | # plot data 77 | itemFrequencyPlot(books.trans) 78 | 79 | # run apriori function 80 | rules <- apriori(books.trans, 81 | parameter = list(supp= 200/4000, conf = 0.5, target = "rules")) 82 | 83 | # inspect top-30 rules sorted by lift 84 | inspect(head(sort(rules, by = "lift"), n=30)) 85 | ``` 86 | 87 | # Collaborative Filtering 88 | ## Example 4: Predicting Movie Ratings with MovieLens Data 89 | ```{r} 90 | library(recommenderlab) 91 | 92 | # download MovieLens data 93 | ratings <- mlba::MovieLensRatings 94 | movies <- mlba::MovieLensMovies 95 | 96 | # convert ratings to rating matrix 97 | idxUserId <- sort(unique(ratings$userId)) 98 | idxMovieId <- sort(unique(ratings$movieId)) 99 | m <- matrix(NA, nrow=length(idxUserId), ncol=length(idxMovieId), 100 | dimnames=list( 101 | user=paste("u", 1:length(idxUserId), sep=''), 102 | item=movies$title[match(idxMovieId, movies$movieId)] 103 | )) 104 | for (i in 1:nrow(ratings)) { 105 | rating <- ratings[i,] 106 | irow <- match(rating$userId, idxUserId) 107 | icol <- match(rating$movieId, idxMovieId) 108 | m[irow, icol] <- rating$rating 109 | } 110 | ratingMatrix <- as(m, "realRatingMatrix") 111 | ``` 112 | 113 | ```{r} 114 | 115 | ``` 116 | 117 | ```{r} 118 | # UBCF model and prediction 119 | recommender <- Recommender(ratingMatrix[-1], method="UBCF") 120 | pred <- predict(recommender, ratingMatrix[1]) 121 | as(pred, 'list') 122 | 123 | # IBCF model and prediction 124 | recommender <- Recommender(ratingMatrix[-1], method="IBCF") 125 | pred <- predict(recommender, ratingMatrix[1]) 126 | as(pred, 'list') 127 | ``` 128 | 129 | ```{r} 130 | set.seed(1) 131 | e <- evaluationScheme(ratingMatrix, method="split", train=0.9, given=10) 132 | 133 | r1 <- Recommender(getData(e, "train"), "UBCF") 134 | r2 <- Recommender(getData(e, "train"), "IBCF") 135 | r3 <- Recommender(getData(e, "train"), "RANDOM") 136 | 137 | p1 <- predict(r1, getData(e, "known"), type="ratings") 138 | p2 <- predict(r2, getData(e, "known"), type="ratings") 139 | p3 <- predict(r3, getData(e, "known"), type="ratings") 140 | error <- rbind( 141 | UBCF = calcPredictionAccuracy(p1, getData(e, "unknown")), 142 | IBCF = calcPredictionAccuracy(p2, getData(e, "unknown")), 143 | RANDOM = calcPredictionAccuracy(p3, getData(e, "unknown")) 144 | ) 145 | error 146 | ``` 147 | 148 | -------------------------------------------------------------------------------- /Rmd/chap16.Rmd: 
-------------------------------------------------------------------------------- 1 | --- 2 | title: "Machine Learning for Business Analytics" 3 | author: "Chapter 16: Cluster Analysis" 4 | output: 5 | pdf_document: 6 | toc: no 7 | highlight: tango 8 | # html_document: 9 | # toc: yes 10 | # toc_depth: 4 11 | # toc_float: yes 12 | --- 13 | 20 | ```{r global_options, include=FALSE} 21 | knitr::opts_chunk$set(error=TRUE, # Keep compiling upon error 22 | collapse=FALSE, # collapse by default 23 | echo=TRUE, # echo code by default 24 | comment = "#>", # change comment character 25 | fig.width = 5.5, # set figure width 26 | fig.align = "center",# set figure position 27 | out.width = "49%", # set width of displayed images 28 | warning=FALSE, # do not show R warnings 29 | message=FALSE, # do not show R messages 30 | row.print=25) 31 | ``` 32 | 33 | 34 | ```{r} 35 | if (!require(mlba)) { 36 | library(devtools) 37 | install_github("gedeck/mlba/mlba", force=TRUE) 38 | } 39 | ``` 40 | 41 | # Measuring Distance Between Two Records 42 | ## Euclidean Distance 43 | ```{r} 44 | library(tidyverse) 45 | # load data and use Company column as row names 46 | utilities.df <- mlba::Utilities %>% 47 | column_to_rownames("Company") 48 | 49 | # compute Euclidean distance 50 | # (to compute other distance measures, change the value in the method argument) 51 | d <- dist(utilities.df, method = "euclidean") 52 | ``` 53 | 54 | ## Normalizing Numerical Variables 55 | ```{r} 56 | # normalize input variables 57 | utilities.df.norm <- scale(utilities.df) 58 | 59 | # compute normalized distance based on Sales and Fuel Cost 60 | d.norm <- dist(utilities.df.norm[,c("Sales","Fuel_Cost")], method="euclidean") 61 | ``` 62 | 63 | # Hierarchical (Agglomerative) Clustering 64 | ## Dendrograms: Displaying Clustering Process and Results 65 | ```{r} 66 | library(ggplot2) 67 | library(ggdendro) 68 | d.norm <- dist(utilities.df.norm, method="euclidean") 69 | 70 | # in hclust() set argument method \galit{to} 71 | # "ward.D", "single", "complete", "average", "median", or "centroid" 72 | hc1 <- hclust(d.norm, method="single") 73 | plot(hc1, hang=-1, ann=FALSE) # use baseR 74 | ggdendrogram(hc1) # use ggdendro package (shown in figure below) 75 | hc2 <- hclust(d.norm, method="average") 76 | plot(hc2, hang=-1, ann=FALSE) 77 | ggdendrogram(hc2) 78 | ``` 79 | 80 | ```{r} 81 | library(gridExtra) 82 | addCutline <- function(g, hc, ncluster) { 83 | heights <- rev(hc$height) 84 | cut_at <- 0.5 * (heights[ncluster] + heights[ncluster - 1]) 85 | return (g + geom_hline(yintercept=cut_at, color='red', linetype=2)) 86 | } 87 | g1 <- ggdendrogram(hc1) 88 | g2 <- ggdendrogram(hc2) 89 | grid.arrange(addCutline(g1, hc1, 6), addCutline(g2, hc2, 6), nrow=2) 90 | g <- arrangeGrob(addCutline(g1, hc1, 6), addCutline(g2, hc2, 6), nrow=2) 91 | ggsave(file=file.path("..", "figures", "chapter_16", "utilities-dendrograms.pdf"), 92 | g, width=5, height=8, units="in") 93 | ``` 94 | 95 | ```{r} 96 | memb <- cutree(hc1, k = 6) 97 | memb 98 | memb <- cutree(hc2, k = 6) 99 | memb 100 | ``` 101 | 102 | ## Validating Clusters 103 | ```{r} 104 | # set labels as cluster membership and utility name 105 | row.names(utilities.df.norm) <- paste(memb, ": ", row.names(utilities.df), sep = "") 106 | 107 | # plot heatmap 108 | heatmap(utilities.df.norm, Colv=NA, hclustfun=hclust) 109 | ``` 110 | 111 | ```{r} 112 | # grey scale 113 | # rev() reverses the color mapping to large = dark 114 | heatmap(as.matrix(utilities.df.norm), Colv = NA, hclustfun = hclust, 115 | 
col=rev(paste("gray",1:99,sep=""))) 116 | 117 | pdf(file=file.path("..", "figures", "chapter_16", "utilities-heatmap.pdf"), 118 | width=5, height=5) 119 | heatmap(utilities.df.norm, Colv=NA, hclustfun=hclust) 120 | dev.off() 121 | ``` 122 | 123 | # Non-hierarchical Clustering: The 124 | ## Choosing the Number of Clusters ($k$) 125 | ```{r} 126 | set.seed(123) # set random seed for reproducability 127 | # load and preprocess data 128 | utilities.df <- mlba::Utilities %>% 129 | column_to_rownames("Company") 130 | 131 | # normalized distance: 132 | utilities.df.norm <- scale(utilities.df) 133 | 134 | # run kmeans algorithm 135 | km <- kmeans(utilities.df.norm, 6) 136 | 137 | # show cluster membership 138 | sort(km$cluster) 139 | ``` 140 | 141 | ```{r} 142 | # centroids 143 | km$centers 144 | # within-cluster sum of squares 145 | km$withinss 146 | # cluster size 147 | km$size 148 | ``` 149 | 150 | ```{r} 151 | library(GGally) 152 | centroids <- data.frame(km$centers) 153 | centroids['Cluster'] = paste('Cluster', seq(1, 6)) 154 | 155 | ggparcoord(centroids, columns=1:8, groupColumn='Cluster', showPoints=TRUE) + 156 | scale_color_viridis_d() + 157 | labs(x='Variable', y='Value') 158 | ``` 159 | 160 | ```{r} 161 | ggsave(file=file.path("..", "figures", "chapter_16", "utilities-clusterProfile.pdf"), 162 | last_plot() + theme_bw(), width=8.5, height=3.2, units="in") 163 | ``` 164 | 165 | ```{r} 166 | result <- tibble() 167 | for (k in 1:6) { 168 | km <- kmeans(utilities.df.norm, k) 169 | result <- bind_rows(result, tibble(k=k, average_withinss=mean(km$withinss))) 170 | } 171 | 172 | ggplot(result, aes(x=k, y=average_withinss)) + 173 | geom_line() + 174 | geom_point() + 175 | labs(y="Average within-cluster squared distance", 176 | x=expression(paste("Number of clusters ", italic("k")))) + 177 | theme_bw() 178 | ggsave(file=file.path("..", "figures", "chapter_16", "utilities-ellbow.pdf"), 179 | last_plot(), width=4, height=4, units="in") 180 | ``` 181 | 182 | ```{r} 183 | dist(km$centers) 184 | ``` 185 | 186 | -------------------------------------------------------------------------------- /Rmd/chap17.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Machine Learning for Business Analytics" 3 | author: "Chapter 17: Handling Time Series" 4 | output: 5 | pdf_document: 6 | toc: no 7 | highlight: tango 8 | # html_document: 9 | # toc: yes 10 | # toc_depth: 4 11 | # toc_float: yes 12 | --- 13 | 20 | ```{r global_options, include=FALSE} 21 | knitr::opts_chunk$set(error=TRUE, # Keep compiling upon error 22 | collapse=FALSE, # collapse by default 23 | echo=TRUE, # echo code by default 24 | comment = "#>", # change comment character 25 | fig.width = 5.5, # set figure width 26 | fig.align = "center",# set figure position 27 | out.width = "49%", # set width of displayed images 28 | warning=FALSE, # do not show R warnings 29 | message=FALSE, # do not show R messages 30 | row.print=25) 31 | ``` 32 | 33 | 34 | ```{r} 35 | if (!require(mlba)) { 36 | library(devtools) 37 | install_github("gedeck/mlba/mlba", force=TRUE) 38 | } 39 | ``` 40 | 41 | # Time Series Components 42 | ## Example: Ridership on Amtrak Trains 43 | ```{r} 44 | library(forecast) 45 | library(ggplot2) 46 | Amtrak.data <- mlba::Amtrak 47 | 48 | # create time series object using ts() 49 | # ts() takes three arguments: start, end, and freq. 50 | # with monthly data, the frequency of periods per cycle is 12 (per year). 
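# the same logic applies to other granularities, e.g., freq = 4 for quarterly
# data or freq = 52 for weekly data.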
51 | # arguments start and end are (cycle [=year] number, seasonal period [=month] number) pairs. 52 | # here start is Jan 1991: start = c(1991, 1); end is Mar 2004: end = c(2004, 3). 53 | ridership.ts <- ts(Amtrak.data$Ridership, 54 | start = c(1991, 1), end = c(2004, 3), freq = 12) 55 | 56 | # plot the series using the autoplot function to make use of ggplot 57 | autoplot(ridership.ts, xlab="Time", ylab="Ridership (in 000s)") + 58 | scale_y_continuous(limits=c(1300, 2300)) 59 | ``` 60 | 61 | ```{r} 62 | g <- last_plot() + 63 | scale_x_continuous(n.breaks=10) + 64 | theme_bw() 65 | ggsave(file=file.path("..", "figures", "chapter_17", "AmtrakFirstPlot.pdf"), 66 | g, width=6, height=3, units="in") 67 | ``` 68 | 69 | ```{r} 70 | library(gridExtra) 71 | library(lubridate) 72 | library(zoo) 73 | 74 | BareggTunnel <- mlba::BareggTunnel 75 | # convert Day information to a dates object 76 | dates <- as.POSIXct(BareggTunnel$Day, format='%d %b %Y') 77 | tunnel.ts <- ts(BareggTunnel$Number.of.vehicles, 78 | start=c(2003, yday(dates[1])), 79 | frequency=365) 80 | 81 | options(scipen=999) 82 | g1 <- autoplot(tunnel.ts, xlab="Time", ylab="Number of vehicles") + 83 | scale_x_yearmon() + 84 | scale_y_continuous(labels = scales::comma) 85 | g2 <- autoplot(window(tunnel.ts, 86 | start=c(2004, yday(ISOdate(2004, 2, 1))), 87 | end=c(2004, yday(ISOdate(2004, 6, 1)))), 88 | xlab="Time", ylab="Number of vehicles") + 89 | scale_x_yearmon() + 90 | scale_y_continuous(labels = scales::comma) 91 | 92 | grid.arrange(g1, g2, nrow=2) 93 | g <- arrangeGrob(g1 + theme_bw(), g2 + theme_bw()) 94 | ggsave(file=file.path("..", "figures", "chapter_17", "TS-TunnelPlots.pdf"), g, 95 | width=6, height=4) 96 | ``` 97 | 98 | ```{r} 99 | library(gridExtra) 100 | 101 | # to zoom in to a certain period, use window() to create a new, shorter time series 102 | # we create a new, 3-year time series of ridership.ts from Jan 1997 to Dec 1999 103 | ridership.ts.3yrs <- window(ridership.ts, start = c(1997, 1), end = c(1999, 12)) 104 | g1 <- autoplot(ridership.ts.3yrs, xlab="Time", ylab="Ridership (in 000s)") + 105 | scale_y_continuous(limits=c(1300, 2300)) 106 | 107 | # fit a trend line to the time series 108 | g2 <- autoplot(ridership.ts, xlab="Time", ylab="Ridership (in 000s)") + 109 | scale_y_continuous(limits=c(1300, 2300)) + 110 | geom_smooth(method="lm", formula=y~poly(x, 2)) 111 | 112 | grid.arrange(g1, g2, nrow=2) 113 | ``` 114 | 115 | ```{r} 116 | g <- arrangeGrob(g1 + theme_bw(), 117 | g2 + scale_x_continuous(n.breaks=10) + theme_bw()) 118 | ggsave(file=file.path("..", "figures", "chapter_17", "AmtrakZoomPlots.pdf"), g, 119 | width=6, height=6) 120 | 121 | # we can also use tslm to create the quadratic fit 122 | ridership.lm <- tslm(ridership.ts ~ trend + I(trend^2)) 123 | autoplot(ridership.ts, xlab="Time", ylab="Ridership (in 000s)") + 124 | scale_y_continuous(limits=c(1300, 2300)) + 125 | autolayer(ridership.lm$fitted.values) 126 | ``` 127 | 128 | # Data Partitioning and Performance Evaluation 129 | ## Benchmark Performance: Naive Forecasts 130 | ```{r} 131 | nTest <- 36 132 | nTrain <- length(ridership.ts) - nTest 133 | 134 | # partition the data 135 | train.ts <- window(ridership.ts, start = c(1991, 1), end = c(1991, nTrain)) 136 | test.ts <- window(ridership.ts, start = c(1991, nTrain + 1), 137 | end = c(1991, nTrain + nTest)) 138 | 139 | # generate the naive and seasonal naive forecasts 140 | naive.pred <- naive(train.ts, h=nTest) 141 | snaive.pred <- snaive(train.ts, h=nTest) 142 | 143 | # compare the actual values and 
forecasts for both methods 144 | colData <- "steelblue"; colModel <- "tomato" 145 | autoplot(train.ts, xlab="Time", ylab="Ridership (in 000s$)", color=colData) + 146 | autolayer(test.ts, linetype=2, color=colData) + 147 | autolayer(naive.pred, PI=FALSE, color=colModel, size=0.75) + 148 | autolayer(snaive.pred, PI=FALSE, color=colModel, size=0.75) 149 | ``` 150 | 151 | ```{r} 152 | # for the book visualization add additional annotation 153 | delta <- 1/12 154 | date_t <- time(train.ts)[1] 155 | date_th <- time(test.ts)[1] - delta 156 | date_hf <- tail(time(test.ts), 1) + delta 157 | g <- last_plot() + 158 | geom_vline(xintercept=date_th, color="darkgrey") + geom_vline(xintercept=date_hf, color="darkgrey") + 159 | geom_segment(aes(x=date_t, xend=date_th-delta, y=2300, yend=2300), color="darkgrey") + 160 | geom_segment(aes(x=date_th+delta, xend=date_hf-delta, y=2300, yend=2300), color="darkgrey") + 161 | geom_segment(aes(x=date_hf+delta, xend=date_hf+2, y=2300, yend=2300), color="darkgrey") + 162 | geom_text(aes(x=(date_t+date_th)/2, y=2350, label='Training')) + 163 | geom_text(aes(x=(date_th+date_hf)/2, y=2350, label='Test')) + 164 | geom_text(aes(x=date_hf+1, y=2350, label='Future')) + 165 | scale_x_continuous(n.breaks=10) + 166 | theme_bw() 167 | ggsave(file=file.path("..", "figures", "chapter_17", "AmtrakNaive.pdf"), 168 | g, width=7, height=4.5) 169 | ``` 170 | 171 | ```{r} 172 | accuracy(naive.pred, test.ts) 173 | accuracy(snaive.pred, test.ts) 174 | ``` 175 | 176 | ## Generating Future Forecasts 177 | ```{r} 178 | ts <- ts(mlba::CanadianWorkHours$Hours, 179 | start = c(mlba::CanadianWorkHours$Year[1], 1), freq = 1) 180 | 181 | # plot the series using the autoplot function to make use of ggplot 182 | autoplot(ts, xlab="Year", ylab="Hours per week") 183 | ggsave(file=file.path("..", "figures", "chapter_17", "Exercise-CanadianWorkers.pdf"), 184 | last_plot() + theme_bw(), width=5, height=4, units="in") 185 | ``` 186 | 187 | -------------------------------------------------------------------------------- /Rmd/chap2.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Machine Learning for Business Analytics" 3 | author: "Chapter 2: Overview of the Machine Learning Process" 4 | output: 5 | pdf_document: 6 | toc: no 7 | highlight: tango 8 | # html_document: 9 | # toc: yes 10 | # toc_depth: 4 11 | # toc_float: yes 12 | --- 13 | 20 | ```{r global_options, include=FALSE} 21 | knitr::opts_chunk$set(error=TRUE, # Keep compiling upon error 22 | collapse=FALSE, # collapse by default 23 | echo=TRUE, # echo code by default 24 | comment = "#>", # change comment character 25 | fig.width = 5.5, # set figure width 26 | fig.align = "center",# set figure position 27 | out.width = "49%", # set width of displayed images 28 | warning=FALSE, # do not show R warnings 29 | message=FALSE, # do not show R messages 30 | row.print=25) 31 | ``` 32 | 33 | 34 | ```{r} 35 | if (!require(mlba)) { 36 | library(devtools) 37 | install_github("gedeck/mlba/mlba", force=TRUE) 38 | } 39 | options(scipen=999) 40 | ``` 41 | 42 | # Preliminary Steps 43 | ## Loading and Looking at the Data in R 44 | ```{r, eval=FALSE} 45 | housing.df <- read.csv('WestRoxbury.csv') # load data from file 46 | housing.df <- mlba::WestRoxbury # load data from mlba package 47 | dim(housing.df) # find the dimension of data frame 48 | head(housing.df) # show the first six rows 49 | View(housing.df) # show all the data in a new tab 50 | 51 | # Practice showing different subsets of the data 52 | 
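# base-R indexing follows the pattern data.frame[rows, columns];
# leaving either index empty keeps all rows or all columns, respectively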
housing.df[1:10, 1] # show the first 10 rows of the first column only 53 | housing.df[1:10, ] # show the first 10 rows of each of the columns 54 | housing.df[5, 1:10] # show the fifth row of the first 10 columns 55 | housing.df[5, c(1:2, 4, 8:10)] # show the fifth row of some columns 56 | housing.df[, 1] # show the whole first column 57 | housing.df$TOTAL.VALUE # a different way to show the whole first column 58 | housing.df$TOTAL.VALUE[1:10] # show the first 10 rows of the first column 59 | length(housing.df$TOTAL.VALUE) # find the length of the first column 60 | mean(housing.df$TOTAL.VALUE) # find the mean of the first column 61 | summary(housing.df) # find summary statistics for each column 62 | ``` 63 | 64 | ## Sampling from a Database 65 | ```{r} 66 | housing.df <- mlba::WestRoxbury 67 | 68 | # random sample of 5 observations 69 | s <- sample(row.names(housing.df), 5) 70 | housing.df[s,] 71 | 72 | # oversample houses with over 10 rooms 73 | s <- sample(row.names(housing.df), 5, prob=ifelse(housing.df$ROOMS>10, 0.9, 0.01)) 74 | housing.df[s,] 75 | 76 | # rebalance 77 | housing.df$REMODEL <- factor(housing.df$REMODEL) 78 | table(housing.df$REMODEL) 79 | upsampled.df <- caret::upSample(housing.df, housing.df$REMODEL, list=TRUE)$x 80 | table(upsampled.df$REMODEL) 81 | ``` 82 | 83 | ```{r} 84 | 85 | ``` 86 | 87 | ## Preprocessing and Cleaning the Data 88 | ### Types of Variables 89 | ```{r} 90 | library(tidyverse) 91 | 92 | # get overview 93 | str(housing.df) 94 | 95 | # make REMODEL a factor variable 96 | housing.df$REMODEL <- factor(housing.df$REMODEL) 97 | str(housing.df$REMODEL) 98 | levels(housing.df$REMODEL) # show factor's categories (levels) 99 | 100 | # use tidyverse to load and preprocess data in one statement 101 | # the %>% operator inserts the result of the expression on the left 102 | # as the first argument into the function on the right 103 | housing.df <- mlba::WestRoxbury %>% 104 | mutate(REMODEL=factor(REMODEL)) 105 | ``` 106 | 107 | ### Handling Categorical Variables 108 | ```{r} 109 | library(fastDummies) 110 | library(tidyverse) 111 | 112 | housing.df <- dummy_cols(mlba::WestRoxbury, 113 | remove_selected_columns=TRUE, # remove the original column 114 | remove_first_dummy=TRUE) # removes the first created dummy variable 115 | housing.df %>% head(2) 116 | ``` 117 | 118 | ### Missing Values 119 | ```{r} 120 | # To illustrate missing data procedures, we first convert a few entries for 121 | # BEDROOMS to NA's. Then we impute these missing values using the median of the 122 | # remaining values. 123 | rows.to.missing <- sample(row.names(housing.df), 10) 124 | housing.df[rows.to.missing,]$BEDROOMS <- NA 125 | summary(housing.df$BEDROOMS) 126 | # Now we have 10 NA's and the median of the remaining values is 3. 127 | 128 | # replace the missing values using the median of the remaining values 129 | # use median() with na.rm=TRUE to ignore missing values when computing the median. 130 | housing.df <- housing.df %>% 131 | replace_na(list(BEDROOMS=median(housing.df$BEDROOMS, na.rm=TRUE))) 132 | 133 | summary(housing.df$BEDROOMS) 134 | ``` 135 | 136 | # Predictive Power and Overfitting 137 | ## Creating and Using Data Partitions 138 | ### Holdout Partition 139 | ```{r} 140 | housing.df <- mlba::WestRoxbury %>% 141 | mutate(REMODEL=factor(REMODEL)) 142 | 143 | # use set.seed() to get the same partitions when re-running the R code. 
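# (the seed value itself is arbitrary; any fixed integer makes the random
# sampling below reproducible)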
144 | set.seed(1) 145 | 146 | ## partitioning into training (60%) and holdout (40%) 147 | # randomly sample 60% of the row IDs for training; the remaining 40% serve 148 | # as holdout 149 | train.rows <- sample(rownames(housing.df), nrow(housing.df)*0.6) 150 | # collect all the columns with training row ID into training set: 151 | train.df <- housing.df[train.rows, ] 152 | # assign row IDs that are not already in the training set, into holdout 153 | holdout.rows <- setdiff(rownames(housing.df), train.rows) 154 | holdout.df <- housing.df[holdout.rows, ] 155 | 156 | ## partitioning into training (50%), validation (30%), holdout (20%) 157 | # randomly sample 50% of the row IDs for training 158 | train.rows <- sample(rownames(housing.df), nrow(housing.df)*0.5) 159 | 160 | # sample 30% of the row IDs into the validation set, drawing only from records 161 | # not already in the training set 162 | # use setdiff() to find records not already in the training set 163 | valid.rows <- sample(setdiff(rownames(housing.df), train.rows), 164 | nrow(housing.df)*0.3) 165 | 166 | # assign the remaining 20% row IDs serve as holdout 167 | holdout.rows <- setdiff(rownames(housing.df), union(train.rows, valid.rows)) 168 | 169 | # create the 3 data frames by collecting all columns from the appropriate rows 170 | train.df <- housing.df[train.rows, ] 171 | valid.df <- housing.df[valid.rows, ] 172 | holdout.df <- housing.df[holdout.rows, ] 173 | 174 | ## partitioning into training (60%) and holdout (40%) using caret 175 | set.seed(1) 176 | idx <- caret::createDataPartition(housing.df$TOTAL.VALUE, p=0.6, list=FALSE) 177 | train.df <- housing.df[idx, ] 178 | holdout.df <- housing.df[-idx, ] 179 | ``` 180 | 181 | ```{r} 182 | 183 | ``` 184 | 185 | # Building a Predictive Model 186 | ## Modeling Process 187 | ### Cross-Validation 188 | ```{r} 189 | library(tidyverse) 190 | library(mlba) 191 | library(fastDummies) 192 | 193 | housing.df <- mlba::WestRoxbury %>% 194 | # remove rows with missing values 195 | drop_na() %>% 196 | # remove column TAX 197 | select(-TAX) %>% 198 | # make REMODEL a factor and convert to dummy variables 199 | mutate(REMODEL=factor(REMODEL)) %>% 200 | dummy_cols(select_columns=c('REMODEL'), 201 | remove_selected_columns=TRUE, remove_first_dummy=TRUE) 202 | ``` 203 | 204 | ```{r} 205 | set.seed(1) 206 | idx <- caret::createDataPartition(housing.df$TOTAL.VALUE, p=0.6, list=FALSE) 207 | train.df <- housing.df[idx, ] 208 | holdout.df <- housing.df[-idx, ] 209 | ``` 210 | 211 | ```{r} 212 | reg <- lm(TOTAL.VALUE ~ ., data=train.df) 213 | train.res <- data.frame(actual=train.df$TOTAL.VALUE, predicted=reg$fitted.values, 214 | residuals=reg$residuals) 215 | head(train.res) 216 | ``` 217 | 218 | ```{r} 219 | pred <- predict(reg, newdata=holdout.df) 220 | holdout.res <- data.frame(actual=holdout.df$TOTAL.VALUE, predicted=pred, 221 | residuals=holdout.df$TOTAL.VALUE - pred) 222 | head(holdout.res) 223 | ``` 224 | 225 | ```{r} 226 | library(caret) 227 | # compute metrics on training set 228 | data.frame( 229 | ME = round(mean(train.res$residuals), 5), 230 | RMSE = RMSE(pred=train.res$predicted, obs=train.res$actual), 231 | MAE = MAE(pred=train.res$predicted, obs=train.res$actual) 232 | ) 233 | 234 | # compute metrics on holdout set 235 | data.frame( 236 | ME = round(mean(holdout.res$residuals), 5), 237 | RMSE = RMSE(pred=holdout.res$predicted, obs=holdout.res$actual), 238 | MAE = MAE(pred=holdout.res$predicted, obs=holdout.res$actual) 239 | ) 240 | ``` 241 | 242 | ```{r} 243 | # For demonstration purposes, we 
construct the new.data from the original dataset 244 | housing.df <- mlba::WestRoxbury 245 | new.data <- housing.df[100:102, -1] %>% 246 | mutate(REMODEL=factor(REMODEL, levels=c("None", "Old", "Recent"))) %>% 247 | dummy_cols(select_columns=c('REMODEL'), 248 | remove_selected_columns=TRUE, remove_first_dummy=TRUE) 249 | new.data 250 | pred <- predict(reg, newdata = new.data) 251 | pred 252 | ``` 253 | 254 | -------------------------------------------------------------------------------- /Rmd/chap20.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Machine Learning for Business Analytics" 3 | author: "Chapter 20: Social Network Analytics" 4 | output: 5 | pdf_document: 6 | toc: no 7 | highlight: tango 8 | # html_document: 9 | # toc: yes 10 | # toc_depth: 4 11 | # toc_float: yes 12 | --- 13 | 20 | ```{r global_options, include=FALSE} 21 | knitr::opts_chunk$set(error=TRUE, # Keep compiling upon error 22 | collapse=FALSE, # collapse by default 23 | echo=TRUE, # echo code by default 24 | comment = "#>", # change comment character 25 | fig.width = 5.5, # set figure width 26 | fig.align = "center",# set figure position 27 | out.width = "49%", # set width of displayed images 28 | warning=FALSE, # do not show R warnings 29 | message=FALSE, # do not show R messages 30 | row.print=25) 31 | ``` 32 | 33 | 34 | ```{r} 35 | if (!require(mlba)) { 36 | library(devtools) 37 | install_github("gedeck/mlba/mlba", force=TRUE) 38 | } 39 | set.seed(1) 40 | ``` 41 | 42 | # Introduction 43 | ```{r} 44 | library(igraph) 45 | 46 | # define links in data 47 | edges <- rbind( 48 | c("Dave", "Jenny"), c("Peter", "Jenny"), c("John", "Jenny"), 49 | c("Dave", "Peter"), c("Dave", "John"), c("Peter", "Sam"), 50 | c("Sam", "Albert"), c("Peter", "John") 51 | ) 52 | 53 | # generate and plot network 54 | # set argument directed = FALSE in graph.edgelist() to plot an undirected network. 55 | g <- graph.edgelist(edges, directed = FALSE) 56 | plot(g, vertex.size = 5, vertex.label.dist = 2) 57 | ``` 58 | 59 | ```{r} 60 | pdf(file=file.path("..", "figures", "chapter_20", "fig1.pdf"), width=5, height=5) 61 | par(mar=c(0,0,0,1)+.1) 62 | plot(g, vertex.size = 5, vertex.label.dist = 2) 63 | dev.off() 64 | ``` 65 | 66 | # Directed vs. Undirected Networks 67 | ```{r} 68 | # generate and plot network 69 | # set argument directed = TRUE in graph.edgelist() to plot a directed network. 
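# each edge then points from the first to the second name in a row of the edge
# list, e.g., c("Dave", "Jenny") becomes the arrow Dave -> Jenny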
70 | g <- graph.edgelist(edges, directed = TRUE) 71 | plot(g, vertex.size = 5, vertex.label.dist = 2) 72 | ``` 73 | 74 | ```{r} 75 | pdf(file=file.path("..", "figures", "chapter_20", "fig2.pdf"), width=5, height=5) 76 | par(mar=c(0,0,0,1)+.5) 77 | plot(g, vertex.size = 5, vertex.label.dist=2) 78 | dev.off() 79 | ``` 80 | 81 | ```{r} 82 | edges <- rbind(c("A", "B"), c("B", "C"), c("C", "A")) 83 | g <- graph.edgelist(edges, directed = FALSE) 84 | E(g)$width <- c(20, 5, 5) 85 | plot(g, vertex.size = 5, vertex.label.dist = 2) 86 | 87 | pdf(file=file.path("..", "figures", "chapter_20", "fig3.pdf"), width=2, height=2) 88 | par(mar=c(0,0,0,0)+.2) 89 | plot(g, vertex.size = 5, vertex.label.dist = 2) 90 | dev.off() 91 | # $ 92 | ``` 93 | 94 | # Visualizing and Analyzing Networks 95 | ## Plot Layout 96 | ```{r} 97 | library(igraph) 98 | drug.df <- mlba::Drug 99 | 100 | # convert edges to edge list matrix 101 | edges <- as.matrix(drug.df[, c(1,2)]) 102 | g <- graph.edgelist(edges,directed=FALSE) 103 | 104 | # plot network 105 | # nodes' size is proportional to their eigenvector centrality 106 | plot(g, vertex.label = NA, vertex.size = eigen_centrality(g)$vector * 20) 107 | ``` 108 | 109 | ```{r} 110 | pdf(file=file.path("..", "figures", "chapter_20", "SNA_Drug_Laundry.pdf"), width=5, height=5) 111 | par(mar=c(0,0,0,0)+.1) 112 | plot(g, vertex.label = NA, vertex.size = eigen_centrality(g)$vector * 20) 113 | dev.off() 114 | ``` 115 | 116 | ```{r} 117 | edges <- rbind( 118 | c("Dave", "Jenny"), c("Peter", "Jenny"), c("John", "Jenny"), 119 | c("Dave", "Peter"), c("Dave", "John"), c("Peter", "Sam"), 120 | c("Sam", "Albert"), c("Peter", "John") 121 | ) 122 | g <- graph.edgelist(edges) 123 | pdf(file=file.path("..", "figures", "chapter_20", "fig5_circle.pdf"), width=3, height=3) 124 | par(mar=c(0,0,0,1)+.7) 125 | plot(g, layout = layout_in_circle, vertex.size = 5, vertex.label.dist = 2) 126 | dev.off() 127 | pdf(file=file.path("..", "figures", "chapter_20", "fig5_grid.pdf"), width=3, height=3) 128 | par(mar=c(0,0,0,1)+.7) 129 | plot(g, layout = layout_on_grid, vertex.size = 5, vertex.label.dist = 2) 130 | dev.off() 131 | ``` 132 | 133 | ```{r} 134 | # Building on the code presented in Figure 19.1 135 | plot(g, layout = layout_in_circle, vertex.size = 5, vertex.label.dist = 2) 136 | plot(g, layout = layout_on_grid, vertex.size = 5, vertex.label.dist = 2) 137 | ``` 138 | 139 | # Social Data Metrics and Taxonomy 140 | ## Node-Level Centrality Metrics 141 | ```{r} 142 | edges <- rbind( 143 | c("Dave", "Jenny"), c("Peter", "Jenny"), c("John", "Jenny"), 144 | c("Dave", "Peter"), c("Dave", "John"), c("Peter", "Sam"), 145 | c("Sam", "Albert"), c("Peter", "John") 146 | ) 147 | g <- graph.edgelist(edges, directed=FALSE) 148 | 149 | degree(g) 150 | betweenness(g) 151 | betweenness(g)/sum(betweenness(g)) 152 | closeness(g) 153 | eigen_centrality(g) 154 | ``` 155 | 156 | ## Egocentric Network 157 | ```{r} 158 | # get Peter's 1-level ego network 159 | # for a 2-level ego network set argument order = 2 in make_ego_graph(). 
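# make_ego_graph() returns a list with one subgraph per node listed in nodes,
# so Peter's ego network is extracted with [[1]] before plotting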
160 | peter.ego <- make_ego_graph(g, order = 1, nodes = "Peter") 161 | plot(peter.ego[[1]], vertex.size = 1, vertex.label.dist = 0.5) 162 | ``` 163 | 164 | ```{r} 165 | pdf(file=file.path("..", "figures", "chapter_20", "fig6_1.pdf"), width=5, height=5) 166 | par(mar=c(0,0,0,1)+.5) 167 | peter.ego <- make_ego_graph(g, order = 1, nodes = "Peter") 168 | g.ego <- peter.ego[[1]] 169 | V(g.ego)$color <- "orange" 170 | V(g.ego)["Peter"]$color <- "red" 171 | plot(g.ego, vertex.size = 5, vertex.label.dist = 2) 172 | dev.off() 173 | pdf(file=file.path("..", "figures", "chapter_20", "fig6_2.pdf"), width=5, height=5) 174 | par(mar=c(0,0,0,1)+.5) 175 | peter.ego <- make_ego_graph(g, order = 2, nodes = "Peter") 176 | g.ego <- peter.ego[[1]] 177 | V(g.ego)$color <- "orange" 178 | V(g.ego)["Peter"]$color <- "red" 179 | plot(g.ego, vertex.size = 5, vertex.label.dist = 2) 180 | dev.off() 181 | ``` 182 | 183 | ## Network Metrics 184 | ```{r} 185 | degree.distribution(g) # normalized 186 | edge_density(g) 187 | ``` 188 | 189 | # Collecting Social Network Data with R 190 | ## Collaborative Filtering 191 | ```{r, eval=FALSE} 192 | library(twitteR) 193 | # replace key and secret number with those you obtained from Twitter 194 | setup_twitter_oauth(consumer_key = "XXX", consumer_secret = "XXX", 195 | access_token = "XXX", access_secret = "XXX") 196 | 197 | # get recent tweets 198 | recent.25.tweets <- searchTwitter("text mining", resultType="recent", n = 25) 199 | ``` 200 | 201 | ```{r, eval=FALSE} 202 | library(Rfacebook) 203 | # replace the app id and secret number with those you obtained from Facebook 204 | fb_oauth <- fbOAuth(app_id = "XXX", app_secret = "XXX") 205 | fb_oauth_credentials <- fromJSON(names(fb_oauth$credentials)) 206 | 207 | # get recent posts on page "dataminingbook" 208 | fb_page <- getPage(page = "dataminingbook", token = fb_oauth_credentials$access_token) 209 | 210 | # a facebook page contains the following information: 211 | t(t(names(fb_page))) 212 | fb_page[1,] 213 | 214 | # get information about most recent post 215 | post <- getPost(post=fb_page$id[1], n=20, token=fb_oauth_credentials$access_token) 216 | 217 | post$likes 218 | post$comments 219 | ``` 220 | 221 | -------------------------------------------------------------------------------- /Rmd/chap21.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Machine Learning for Business Analytics" 3 | author: "Chapter 21: Text Mining" 4 | output: 5 | pdf_document: 6 | toc: no 7 | highlight: tango 8 | # html_document: 9 | # toc: yes 10 | # toc_depth: 4 11 | # toc_float: yes 12 | --- 13 | 20 | ```{r global_options, include=FALSE} 21 | knitr::opts_chunk$set(error=TRUE, # Keep compiling upon error 22 | collapse=FALSE, # collapse by default 23 | echo=TRUE, # echo code by default 24 | comment = "#>", # change comment character 25 | fig.width = 5.5, # set figure width 26 | fig.align = "center",# set figure position 27 | out.width = "49%", # set width of displayed images 28 | warning=FALSE, # do not show R warnings 29 | message=FALSE, # do not show R messages 30 | row.print=25) 31 | ``` 32 | 33 | 34 | ```{r} 35 | if (!require(mlba)) { 36 | library(devtools) 37 | install_github("gedeck/mlba/mlba", force=TRUE) 38 | } 39 | ``` 40 | 41 | # The Tabular Representation of Text: Term-Document Matrix and ``Bag-of-Words" 42 | ```{r} 43 | library(tm) 44 | 45 | # define vector of sentences ("docs") 46 | text <- c("this is the first sentence", 47 | "this is a second sentence", 48 | "the third sentence is 
here") 49 | 50 | # convert sentences into a corpus 51 | corp <- Corpus(VectorSource(text)) 52 | 53 | # compute term frequency 54 | tdm <- TermDocumentMatrix(corp) 55 | inspect(tdm) 56 | ``` 57 | 58 | # Preprocessing the Text 59 | ```{r} 60 | text <- c("this is the first sentence!!", 61 | "this is a second Sentence :)", 62 | "the third sentence, is here", 63 | "forth of all sentences") 64 | corp <- Corpus(VectorSource(text)) 65 | tdm <- TermDocumentMatrix(corp) 66 | inspect(tdm) 67 | ``` 68 | 69 | ## Tokenization 70 | ```{r} 71 | # tokenization 72 | corp <- tm_map(corp, stripWhitespace) 73 | corp <- tm_map(corp, removePunctuation) 74 | tdm <- TermDocumentMatrix(corp) 75 | inspect(tdm) 76 | ``` 77 | 78 | ## Text Reduction 79 | ```{r} 80 | stopwords('english') 81 | ``` 82 | 83 | ```{r} 84 | # stopwords 85 | library(SnowballC) 86 | corp <- tm_map(corp, removeWords, stopwords("english")) 87 | 88 | # stemming 89 | corp <- tm_map(corp, stemDocument) 90 | 91 | tdm <- TermDocumentMatrix(corp) 92 | inspect(tdm) 93 | ``` 94 | 95 | ## Term Frequency--Inverse Document Frequency (TF-IDF) 96 | ```{r} 97 | tfidf <- weightTfIdf(tdm) 98 | inspect(tfidf) 99 | ``` 100 | 101 | # Example: Online Discussions on Autos and Electronics 102 | ## Importing and Labeling the Records 103 | ```{r} 104 | library(tm) 105 | # step 1: import and label records 106 | # read zip file into a corpus 107 | corp <- Corpus(ZipSource(mlba::AutosElectronics, recursive = T)) 108 | 109 | # create an array of records labels 110 | label <- c(rep(1, 1000), rep(0, 1000)) 111 | 112 | # step 2: text preprocessing 113 | # tokenization 114 | corp <- tm_map(corp, stripWhitespace) 115 | corp <- tm_map(corp, removePunctuation) 116 | corp <- tm_map(corp, removeNumbers) 117 | 118 | # stopwords 119 | corp <- tm_map(corp, removeWords, stopwords("english")) 120 | 121 | # stemming 122 | corp <- tm_map(corp, stemDocument) 123 | 124 | # step 3: TF-IDF and latent semantic analysis 125 | # compute TF-IDF 126 | tdm <- TermDocumentMatrix(corp) 127 | tfidf <- weightTfIdf(tdm) 128 | 129 | # extract (20) concepts 130 | library(lsa) 131 | lsa.tfidf <- lsa(tfidf, dim = 20) 132 | 133 | # convert to data frame 134 | words.df <- as.data.frame(as.matrix(lsa.tfidf$dk)) 135 | ``` 136 | 137 | ## Fitting a Predictive Model 138 | ```{r} 139 | library(caret) 140 | 141 | # prepare training and holdout sets 142 | set.seed(1) 143 | df <- cbind(label=factor(label), words.df) 144 | idx <- caret::createDataPartition(df$label, p=0.6, list=FALSE) 145 | train.df <- df[idx, ] 146 | holdout.df <- df[-idx, ] 147 | 148 | # fit logistic regression 149 | logit.reg <- train(label ~ ., data=train.df, 150 | trControl=trainControl(method="none"), 151 | method="glm", family="binomial") 152 | 153 | # compute accuracy on holdout set 154 | pred <- predict(logit.reg, newdata=holdout.df) 155 | confusionMatrix(pred, holdout.df$label) 156 | ``` 157 | 158 | ```{r} 159 | library(gains) 160 | 161 | prob <- predict(logit.reg, newdata=holdout.df, type="prob")[,2] 162 | actual <- ifelse(holdout.df$label == 1, 1, 0) 163 | gain <- gains(actual, prob) 164 | barplot(gain$mean.resp/mean(actual), names.arg=seq(10, 100, by=10), 165 | xlab="Percentile", ylab="Decile mean / global mean") 166 | ``` 167 | 168 | ```{r} 169 | pdf(file=file.path("..", "figures", "chapter_21", "decileLiftClassification.pdf"), 170 | width=6, height=4) 171 | barplot(gain$mean.resp/mean(actual), names.arg=seq(10, 100, by=10), 172 | xlab="Percentile", ylab="Decile mean / global mean") 173 | dev.off() 174 | ``` 175 | 176 | # Example: 
Sentiment Analysis of Movie Reviews 177 | ## Data Loading, Preparation, and Partitioning 178 | ```{r} 179 | library(tidyverse) 180 | library(text2vec) 181 | 182 | # load and split data into training and holdout set 183 | data <- mlba::IMDBdataset10K %>% 184 | mutate( 185 | id = row_number(), 186 | sentiment = as.factor(sentiment) 187 | ) 188 | 189 | set.seed(1) 190 | trainIndex <- createDataPartition(data$sentiment, p=0.8, list=FALSE) 191 | data_train <- data[trainIndex, ] 192 | data_holdout <- data[-trainIndex, ] 193 | ``` 194 | 195 | ```{r} 196 | prep_fun <- tolower 197 | tok_fun <- word_tokenizer 198 | 199 | it_train <- itoken(data_train$review, ids=data_train$id, 200 | preprocessor=prep_fun, tokenizer=tok_fun) 201 | it_holdout <- itoken(data_holdout$review, ids=data_holdout$id, 202 | preprocessor=prep_fun, tokenizer=tok_fun) 203 | 204 | vocab <- create_vocabulary(it_train) 205 | vocab <- prune_vocabulary(vocab, term_count_min = 5L) 206 | vectorizer <- vocab_vectorizer(vocab) 207 | tcm_train <- create_tcm(it_train, vectorizer, skip_grams_window = 5L) 208 | ``` 209 | 210 | ```{r} 211 | 212 | ``` 213 | 214 | ## Generating and Applying pb 215 | ```{r} 216 | # determine word vectors 217 | glove <- GlobalVectors$new(rank=100, x_max=10) 218 | wv_main <- glove$fit_transform(tcm_train, n_iter=10, convergence_tol=0.01, n_threads=8) 219 | wv_context <- glove$components 220 | word_vectors <- wv_main + t(wv_context) 221 | ``` 222 | 223 | ```{r} 224 | 225 | ``` 226 | 227 | ```{r} 228 | dtm_train <- create_dtm(it_train, vectorizer) 229 | common_terms <- intersect(colnames(dtm_train), rownames(word_vectors) ) 230 | dtm_averaged <- normalize(dtm_train[, common_terms], "l1") 231 | sentence_vectors_train <- dtm_averaged %*% word_vectors[common_terms, ] 232 | 233 | dtm_holdout <- create_dtm(it_holdout, vectorizer) 234 | common_terms <- intersect(colnames(dtm_holdout), rownames(word_vectors) ) 235 | dtm_averaged <- normalize(dtm_holdout[, common_terms], "l1") 236 | sentence_vectors_holdout <- dtm_averaged %*% word_vectors[common_terms, ] 237 | ``` 238 | 239 | ## Fitting a Predictive Model 240 | ```{r} 241 | train.df <- as.data.frame(as.matrix(sentence_vectors_train)) 242 | train.df$sentiment <- data_train$sentiment 243 | 244 | trControl <- caret::trainControl(method="cv", number=5, allowParallel=TRUE) 245 | logit.reg <- caret::train(sentiment ~ ., data=train.df, trControl=trControl, 246 | # fit logistic regression with a generalized linear model 247 | method="glm", family="binomial") 248 | 249 | holdout.df <- as.data.frame(as.matrix(sentence_vectors_holdout)) 250 | holdout.df$sentiment <- data_holdout$sentiment 251 | 252 | caret::confusionMatrix(predict(logit.reg, holdout.df), holdout.df$sentiment) 253 | ``` 254 | 255 | ```{r} 256 | 257 | ``` 258 | 259 | ```{r} 260 | library(ROCR) 261 | prob <- predict(logit.reg, newdata=holdout.df, type="prob")$positive 262 | 263 | predob <- prediction(prob, holdout.df$sentiment) 264 | perf <- performance(predob, "tpr", "fpr") 265 | perf.df <- data.frame( 266 | tpr=perf@x.values[[1]], 267 | fpr=perf@y.values[[1]] 268 | ) 269 | ggplot(perf.df, aes(x=tpr, y=fpr)) + 270 | geom_line() + 271 | geom_segment(aes(x=0, y=0, xend=1, yend=1), color="grey", linetype="dashed") + 272 | labs(x="1 - Specificity", y="Sensitivity") 273 | ``` 274 | 275 | ```{r} 276 | ggsave(file=file.path("..", "figures", "chapter_21", "glove-ROC.pdf"), 277 | last_plot() + theme_bw()) 278 | ``` 279 | 280 | -------------------------------------------------------------------------------- /Rmd/chap22.Rmd: 
-------------------------------------------------------------------------------- 1 | --- 2 | title: "Machine Learning for Business Analytics" 3 | author: "Chapter 22: Responsible Data Science" 4 | output: 5 | pdf_document: 6 | toc: no 7 | highlight: tango 8 | # html_document: 9 | # toc: yes 10 | # toc_depth: 4 11 | # toc_float: yes 12 | --- 13 | 20 | ```{r global_options, include=FALSE} 21 | knitr::opts_chunk$set(error=TRUE, # Keep compiling upon error 22 | collapse=FALSE, # collapse by default 23 | echo=TRUE, # echo code by default 24 | comment = "#>", # change comment character 25 | fig.width = 5.5, # set figure width 26 | fig.align = "center",# set figure position 27 | out.width = "49%", # set width of displayed images 28 | warning=FALSE, # do not show R warnings 29 | message=FALSE, # do not show R messages 30 | row.print=25) 31 | ``` 32 | 33 | 34 | ```{r} 35 | if (!require(mlba)) { 36 | library(devtools) 37 | install_github("gedeck/mlba/mlba", force=TRUE) 38 | } 39 | ``` 40 | 41 | # Example: Applying the RDS Framework to the COMPAS Example 42 | ## Data Issues 43 | ```{r} 44 | library(caret) 45 | library(tidyverse) 46 | # load COMPAS data 47 | compas.df <- mlba::COMPAS_clean %>% 48 | select(-id) %>% 49 | mutate( 50 | age_cat = factor(age_cat, levels=c("Less than 25", "25 - 45", "Greater than 45")), 51 | c_charge_degree = factor(c_charge_degree, levels=c("F", "M")), 52 | race = factor(race, levels=c("African-American", "Asian", "Caucasian", "Hispanic", 53 | "Native American", "Other")), 54 | sex = factor(sex, levels=c("Female", "Male")), 55 | two_year_recid = factor(two_year_recid, levels=c(0, 1), labels=c("No", "Yes")) 56 | ) 57 | 58 | # split dataset and train models 59 | set.seed(1) 60 | idx <- createDataPartition(compas.df$two_year_recid, p=0.7, list=FALSE) 61 | train.df <- compas.df[idx, ] 62 | valid.df <- compas.df[-idx, ] 63 | ``` 64 | 65 | ```{r} 66 | trControl <- trainControl(method="cv", number=5, allowParallel=TRUE) 67 | # logistic regression model 68 | logreg.model <- train(two_year_recid ~ . - race, data=train.df, 69 | method="glm", family="binomial", trControl=trControl) 70 | # random forest model 71 | rf.model <- train(two_year_recid ~ . 
- race, data=train.df, 72 | method="rf", trControl=trControl) 73 | 74 | # extract coefficients and calculate odds shown in table 75 | logreg.coef <- coef(logreg.model$finalModel) 76 | data.frame( 77 | coefficient=logreg.coef, 78 | odds=c(NA, exp(logreg.coef)[-c(1)]) 79 | ) %>% round(3) 80 | ``` 81 | 82 | ```{r} 83 | # calculation of model accuracy based on cross-validation results 84 | caret::confusionMatrix(logreg.model) 85 | caret::confusionMatrix(rf.model) 86 | ``` 87 | 88 | ## Auditing the Model 89 | ```{r} 90 | library(ROCR) 91 | holdoutMetrics <- function(df, model) { 92 | result <- data.frame(obs = df$two_year_recid, pred = predict(model, newdata=df), 93 | prob = predict(model, newdata=df, type="prob")$Yes) 94 | pred <- prediction(result$prob, result$obs) 95 | # compute overall performance 96 | perf_AUC <- performance(pred, "auc") 97 | AUC <- perf_AUC@y.values[[1]] 98 | cm <- confusionMatrix(result$pred, result$obs, positive="Yes") 99 | return (tibble(AUC=AUC, Accuracy = cm$overall["Accuracy"], 100 | FPR = 100*(1-cm$byClass["Specificity"]), 101 | FNR = 100*(1-cm$byClass["Sensitivity"]))) 102 | } 103 | # compute performance by race 104 | metricsByRace <- function(model) { 105 | metrics <- tibble() 106 | for (raceValue in levels(compas.df$race)) { 107 | df <- compas.df %>% filter(race==raceValue) 108 | metrics <- bind_rows(metrics, tibble(race=raceValue, holdoutMetrics(df, model))) 109 | } 110 | return (metrics) 111 | } 112 | # combine metrics for logistic and random forest 113 | metrics <- bind_rows( 114 | tibble(Model="Random forest", metricsByRace(rf.model)), 115 | tibble(Model="Logistic regression", metricsByRace(logreg.model)) 116 | ) %>% filter(! race %in% c("Asian", "Native American")) 117 | ``` 118 | 119 | ```{r} 120 | library(gridExtra) 121 | makeBarchart <- function(metrics, aesthetics) { 122 | g <- ggplot(metrics, aesthetics) + 123 | geom_bar(position="dodge", stat="identity") + 124 | geom_text(hjust=1.5, position=position_dodge(.9)) + 125 | coord_flip() + 126 | scale_x_discrete(limits=rev) + 127 | labs(x="Race") + 128 | theme_bw() 129 | return (g) 130 | } 131 | g1 <- makeBarchart(metrics, aes(x=race, y=Accuracy, fill=Model, label=round(Accuracy, 3))) + 132 | theme(legend.position="none") 133 | g2 <- makeBarchart(metrics, aes(x=race, y=AUC, fill=Model, label=round(AUC, 3))) + 134 | theme(legend.position="bottom") 135 | grid.arrange(g1, g2, nrow=2, heights=c(6.25, 7)) 136 | 137 | g <- arrangeGrob(g1, g2, heights=c(6.25, 7)) 138 | ggsave(file=file.path("..", "figures", "chapter_22", "c22_acc_auc.pdf"), 139 | g, width=5, height=5, units="in") 140 | 141 | g1 <- makeBarchart(metrics, aes(x=race, y=FPR, fill=Model, label=round(FPR, 3))) + 142 | theme(legend.position="none") 143 | g2 <- makeBarchart(metrics, aes(x=race, y=FNR, fill=Model, label=round(FNR, 3))) + 144 | theme(legend.position="bottom") 145 | grid.arrange(g1, g2, heights=c(6.25, 7)) 146 | 147 | g <- arrangeGrob(g1, g2, heights=c(6.25, 7)) 148 | ggsave(file=file.path("..", "figures", "chapter_22", "c22_fpr_fnr.pdf"), 149 | g, width=5, height=5, units="in") 150 | ``` 151 | 152 | ### Interpretability Methods 153 | ```{r} 154 | library(iml) 155 | predictor.rf = Predictor$new(rf.model, data=valid.df, y=valid.df$two_year_recid) 156 | predictor.lm = Predictor$new(logreg.model, data=valid.df, y=valid.df$two_year_recid) 157 | ``` 158 | 159 | ```{r} 160 | featureEffect.lm = FeatureEffect$new(predictor.lm, feature='priors_count', method='pdp') 161 | featureEffect.rf = FeatureEffect$new(predictor.rf, feature='priors_count', 
method='pdp') 162 | combined <- bind_rows( 163 | tibble(Method="Logistic regression", featureEffect.lm$results %>% filter(.class=="Yes")), 164 | tibble(Method="Random forest", featureEffect.rf$results %>% filter(.class=="Yes")) 165 | ) 166 | ggplot(combined, aes(x=priors_count, y=.value, color=Method)) + 167 | geom_line() + 168 | labs(x="Feature value", y="Probability of recidivism") 169 | ``` 170 | 171 | ```{r} 172 | ggsave(file=file.path("..", "figures", "chapter_22", "c22f005.pdf"), 173 | last_plot() + theme_bw(), width=5, height=3, units="in") 174 | ``` 175 | 176 | ```{r} 177 | library(iml) 178 | predictor.rf = Predictor$new(rf.model, data=valid.df, y=valid.df$two_year_recid) 179 | predictor.lm = Predictor$new(logreg.model, data=valid.df, y=valid.df$two_year_recid) 180 | 181 | # permutation feature importance 182 | FeatureImp$new(predictor.lm, "ce", compare="ratio", n.repetitions=5) 183 | FeatureImp$new(predictor.rf, "ce", compare="ratio", n.repetitions=5) 184 | ``` 185 | 186 | -------------------------------------------------------------------------------- /Rmd/chap23.Rmd: -------------------------------------------------------------------------------- 1 | ```{r} 2 | if (!require(mlba)) { 3 | library(devtools) 4 | install_github("gedeck/mlba/mlba", force=TRUE) 5 | } 6 | ``` 7 | 8 | -------------------------------------------------------------------------------- /Rmd/chap4.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Machine Learning for Business Analytics" 3 | author: "Chapter 4: Dimension Reduction" 4 | output: 5 | pdf_document: 6 | toc: no 7 | highlight: tango 8 | # html_document: 9 | # toc: yes 10 | # toc_depth: 4 11 | # toc_float: yes 12 | --- 13 | 20 | ```{r global_options, include=FALSE} 21 | knitr::opts_chunk$set(error=TRUE, # Keep compiling upon error 22 | collapse=FALSE, # collapse by default 23 | echo=TRUE, # echo code by default 24 | comment = "#>", # change comment character 25 | fig.width = 5.5, # set figure width 26 | fig.align = "center",# set figure position 27 | out.width = "49%", # set width of displayed images 28 | warning=FALSE, # do not show R warnings 29 | message=FALSE, # do not show R messages 30 | row.print=25) 31 | ``` 32 | 33 | 34 | ```{r} 35 | if (!require(mlba)) { 36 | library(devtools) 37 | install_github("gedeck/mlba/mlba", force=TRUE) 38 | } 39 | 40 | options(dplyr.summarise.inform = FALSE) 41 | library(tidyverse) 42 | ``` 43 | 44 | # Data Summaries 45 | ## Example 1: House Prices in Boston 46 | ```{r} 47 | boston.housing.df <- mlba::BostonHousing 48 | head(boston.housing.df, 9) 49 | summary(boston.housing.df) 50 | 51 | # compute mean, standard dev., min, max, median, length, and missing values of CRIM 52 | mean(boston.housing.df$CRIM) 53 | sd(boston.housing.df$CRIM) 54 | min(boston.housing.df$CRIM) 55 | max(boston.housing.df$CRIM) 56 | median(boston.housing.df$CRIM) 57 | length(boston.housing.df$CRIM) 58 | 59 | # find the number of missing values of variable CRIM 60 | sum(is.na(boston.housing.df$CRIM)) 61 | 62 | # compute mean, standard dev., min, max, median, length, and missing values for all 63 | # variables 64 | data.frame(mean=sapply(boston.housing.df, mean), 65 | sd=sapply(boston.housing.df, sd), 66 | min=sapply(boston.housing.df, min), 67 | max=sapply(boston.housing.df, max), 68 | median=sapply(boston.housing.df, median), 69 | length=sapply(boston.housing.df, length), 70 | miss.val=sapply(boston.housing.df, 71 | function(x) sum(length(which(is.na(x)))))) 72 | ``` 73 | 74 | ```{r} 75 | 
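# (tidyverse alternative, added here; not part of the original book code)
# the same summary table can be built with dplyr/tidyr, assuming
# library(tidyverse) has been loaded above
boston.housing.df %>%
  pivot_longer(everything(), names_to="variable", values_to="value") %>%
  group_by(variable) %>%
  summarise(mean=mean(value), sd=sd(value), min=min(value), max=max(value),
            median=median(value), length=n(), miss.val=sum(is.na(value)))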
76 | ``` 77 | 78 | ## Summary Statistics 79 | ```{r} 80 | round(cor(boston.housing.df),2) 81 | ``` 82 | 83 | ## Aggregation and Pivot Tables 84 | ```{r} 85 | boston.housing.df <- mlba::BostonHousing 86 | table(boston.housing.df$CHAS) 87 | 88 | # tidyverse version 89 | boston.housing.df %>% count(CHAS) 90 | ``` 91 | 92 | ```{r} 93 | # create bins of size 1 94 | boston.housing.df <- boston.housing.df %>% 95 | mutate(RM.bin = cut(RM, c(1:9), labels=FALSE)) 96 | 97 | # compute the average of MEDV by (binned) RM and CHAS 98 | # in aggregate() use the argument by= to define the list of aggregating variables, 99 | # and FUN= as an aggregating function. 100 | aggregate(boston.housing.df$MEDV, by=list(RM=boston.housing.df$RM.bin, 101 | CHAS=boston.housing.df$CHAS), FUN=mean) 102 | 103 | # tidyverse version 104 | boston.housing.df %>% 105 | group_by(RM.bin, CHAS) %>% 106 | summarise(mean(MEDV)) 107 | ``` 108 | 109 | ```{r} 110 | # use install.packages("reshape") the first time the package is used 111 | library(reshape) 112 | boston.housing.df <- mlba::BostonHousing 113 | # create bins of size 1 114 | boston.housing.df <- boston.housing.df %>% 115 | mutate(RM.bin = cut(RM, c(1:9), labels=FALSE)) 116 | 117 | # use melt() to stack a set of columns into a single column of data. 118 | # stack MEDV values for each combination of (binned) RM and CHAS 119 | mlt <- melt(boston.housing.df, id=c("RM.bin", "CHAS"), measure=c("MEDV")) 120 | head(mlt, 5) 121 | 122 | # use cast() to reshape data and generate pivot table 123 | cast(mlt, RM.bin ~ CHAS, subset=variable=="MEDV", 124 | margins=c("grand_row", "grand_col"), mean) 125 | 126 | # tidyverse version 127 | boston.housing.df %>% 128 | group_by(RM.bin, CHAS) %>% 129 | summarize(mean=mean(MEDV)) %>% 130 | spread(CHAS, mean) 131 | ``` 132 | 133 | # Reducing the Number of Categories in Categorical Variables 134 | ```{r} 135 | boston.housing.df <- mlba::BostonHousing 136 | 137 | tbl <- table(boston.housing.df$CAT.MEDV, boston.housing.df$ZN) 138 | prop.tbl <- prop.table(tbl, margin=2) 139 | barplot(prop.tbl, xlab="ZN", ylab="", yaxt="n",main="Distribution of CAT.MEDV by ZN") 140 | axis(2, at=(seq(0,1, 0.2)), paste(seq(0,100,20), "%")) 141 | 142 | library(tidyverse) 143 | df <- data.frame(prop.tbl) 144 | ggplot(df, aes(x=Var2, y=Freq, group=Var1, fill=Var1)) + 145 | geom_bar(stat="identity", color="grey", width=1) + 146 | scale_y_continuous(labels = scales::percent, expand=expansion()) + 147 | scale_fill_manual("CAT.MEDV", values=c("0"="#eeeeee", "1"="darkgrey")) + 148 | labs(x="ZN", y="", title="Distribution of CAT.MEDV by ZN") 149 | ``` 150 | 151 | ```{r} 152 | g <- last_plot() + theme_bw() 153 | ggsave(file=file.path("..", "figures", "chapter_04", "reduction-pivot-bar.pdf"), 154 | g, width=9, height=4, units="in") 155 | ``` 156 | 157 | ```{r} 158 | library(forecast) 159 | tru.data <- mlba::ToysRUsRevenues 160 | tru.ts <- ts(tru.data[, 3], start = c(1992, 1), end = c(1995, 4), freq = 4) 161 | autoplot(tru.ts) + 162 | geom_point(size=0.5) + 163 | labs(x="Time", y="Revenue ($ millions)") + 164 | theme_bw() 165 | ggsave(file=file.path("..", "figures", "chapter_04", "ToysRUs.pdf"), 166 | last_plot()) 167 | ``` 168 | 169 | # Principal Components Analysis 170 | ## Example 2: Breakfast Cereals 171 | ```{r} 172 | library(tidyverse) 173 | cereals.df <- mlba::Cereals %>% select(calories, rating) 174 | # compute PCs on two dimensions 175 | pcs <- prcomp(cereals.df %>% select(calories, rating)) 176 | summary(pcs) 177 | pcs$rot 178 | scores <- pcs$x 179 | head(scores, 5) 180 | 
``` 181 | 182 | ```{r} 183 | getPCaxis <- function(f, pcs, pcLabel) { 184 | return (data.frame( 185 | rbind(pcs$center + f * pcs$rotation[, pcLabel], 186 | pcs$center - f * pcs$rotation[, pcLabel])) 187 | ) 188 | } 189 | PC1 <- getPCaxis(90, pcs, "PC1") 190 | PC2 <- getPCaxis(50, pcs, "PC2") 191 | ggplot(cereals.df, aes(x=calories, y=rating)) + 192 | geom_point() + 193 | geom_line(data=PC1) + 194 | geom_line(data=PC2) + 195 | coord_cartesian(xlim=c(0, 200), ylim=c(0, 110)) + 196 | labs(x="Calories", y="Rating") + 197 | annotate(geom="text", x=30, y=80, label="z[1]",parse=TRUE) + 198 | annotate(geom="text", x=120, y=80, label="z[2]",parse=TRUE) + 199 | theme_bw() 200 | 201 | ggsave(file=file.path("..", "figures", "chapter_04", "pca_subset.pdf"), 202 | width=5, height=3, last_plot()) 203 | ``` 204 | 205 | ## Principal Components 206 | ```{r} 207 | # load and preprocess the data 208 | cereals.df <- mlba::Cereals %>% 209 | column_to_rownames("name") %>% 210 | select(-c(mfr, type)) %>% 211 | drop_na() 212 | 213 | pcs <- prcomp(cereals.df) 214 | summary(pcs) 215 | pcs$rotation[,1:5] 216 | ``` 217 | 218 | ## Normalizing the Data 219 | ```{r} 220 | # Use function prcomp() with scale. = T to run PCA on normalized data 221 | pcs.cor <- prcomp(cereals.df, scale. = T) 222 | 223 | summary(pcs.cor) 224 | pcs.cor$rotation[,1:5] 225 | ``` 226 | 227 | ```{r} 228 | library(ggrepel) 229 | ggplot(data.frame(pcs.cor$x), aes(x=PC1, y=PC2, label=rownames(pcs.cor$x))) + 230 | geom_point(shape=21) + 231 | geom_text_repel(size=2, max.overlaps=7) + 232 | theme_bw() 233 | 234 | f <- 1.3 235 | ggsave(file=file.path("..", "figures", "chapter_04", "pca_full.pdf"), 236 | width=f * 4, height=f * 5, last_plot()) 237 | ``` 238 | 239 | # Dimension Reduction Using Classification and Regression Trees 240 | ## Using Principal Components for Classification and Prediction 241 | ```{r} 242 | wine.df <- mlba::Wine %>% select(-Type) 243 | pcs.cor <- prcomp(wine.df) 244 | summary(pcs.cor) 245 | pcs.cor$rotation[,1:4] 246 | ``` 247 | 248 | -------------------------------------------------------------------------------- /Rmd/chap6.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Machine Learning for Business Analytics" 3 | author: "Chapter 6: Multiple Linear Regression" 4 | output: 5 | pdf_document: 6 | toc: no 7 | highlight: tango 8 | # html_document: 9 | # toc: yes 10 | # toc_depth: 4 11 | # toc_float: yes 12 | --- 13 | 20 | ```{r global_options, include=FALSE} 21 | knitr::opts_chunk$set(error=TRUE, # Keep compiling upon error 22 | collapse=FALSE, # collapse by default 23 | echo=TRUE, # echo code by default 24 | comment = "#>", # change comment character 25 | fig.width = 5.5, # set figure width 26 | fig.align = "center",# set figure position 27 | out.width = "49%", # set width of displayed images 28 | warning=FALSE, # do not show R warnings 29 | message=FALSE, # do not show R messages 30 | row.print=25) 31 | ``` 32 | 33 | 34 | ```{r} 35 | if (!require(mlba)) { 36 | library(devtools) 37 | install_github("gedeck/mlba/mlba", force=TRUE) 38 | } 39 | options(scipen=999, digits = 3) 40 | ``` 41 | 42 | # Estimating the Regression Equation and Prediction 43 | ## Example: Predicting the Price of Used Toyota Corolla Cars 44 | ```{r} 45 | library(caret) 46 | car.df <- mlba::ToyotaCorolla 47 | # select variables for regression 48 | outcome <- "Price" 49 | predictors <- c("Age_08_04", "KM", "Fuel_Type", "HP", "Met_Color", "Automatic", 50 | "CC", "Doors", "Quarterly_Tax", "Weight") 51 | # 
reduce data set to first 1000 rows and selected variables 52 | car.df <- car.df[1:1000, c(outcome, predictors)] 53 | 54 | # partition data 55 | set.seed(1) # set seed for reproducing the partition 56 | idx <- createDataPartition(car.df$Price, p=0.6, list=FALSE) 57 | train.df <- car.df[idx, ] 58 | holdout.df <- car.df[-idx, ] 59 | 60 | # use lm() to run a linear regression of Price on all 11 predictors in the 61 | # training set. 62 | # use . after ~ to include all the remaining columns in train.df as predictors. 63 | car.lm <- lm(Price ~ ., data = train.df) 64 | # use options() to ensure numbers are not displayed in scientific notation. 65 | options(scipen = 999) 66 | summary(car.lm) 67 | ``` 68 | 69 | ```{r} 70 | 71 | ``` 72 | 73 | ```{r} 74 | # use predict() to make predictions on a new set. 75 | pred <- predict(car.lm, holdout.df) 76 | 77 | options(scipen=999, digits=0) 78 | data.frame( 79 | 'Predicted' = pred[1:20], 80 | 'Actual' = holdout.df$Price[1:20], 81 | 'Residual' = holdout.df$Price[1:20] - pred[1:20] 82 | ) 83 | options(scipen=999, digits = 3) 84 | 85 | # calculate performance metrics 86 | rbind( 87 | Training=mlba::regressionSummary(predict(car.lm, train.df), train.df$Price), 88 | Holdout=mlba::regressionSummary(pred, holdout.df$Price) 89 | ) 90 | ``` 91 | 92 | ```{r} 93 | library(ggplot2) 94 | pred <- predict(car.lm, holdout.df) 95 | all.residuals <- holdout.df$Price - pred 96 | 97 | ggplot() + 98 | geom_histogram(aes(x=all.residuals), fill="lightgray", color="grey") + 99 | labs(x="Residuals", y="Frequency") 100 | ``` 101 | 102 | ```{r} 103 | g <- ggplot() + 104 | geom_histogram(aes(x=all.residuals), fill="lightgray", color="grey") + 105 | labs(x="Residuals", y="Frequency") + 106 | theme_bw() 107 | ggsave(file=file.path("..", "figures", "chapter_06", "residuals-histogram.pdf"), 108 | g, width=5, height=3, units="in") 109 | ``` 110 | 111 | ## Cross-validation and caret 112 | ```{r} 113 | set.seed(1) 114 | library(caret) 115 | # define 5-fold 116 | trControl <- caret::trainControl(method="cv", number=5, allowParallel=TRUE) 117 | model <- caret::train(Price ~ ., data=car.df, 118 | method="lm", # specify the model 119 | trControl=trControl) 120 | model 121 | coef(model$finalModel) 122 | ``` 123 | 124 | ```{r} 125 | library(tidyverse) 126 | collectMetrics <- function(model, train.df, holdout.df, nPredictors) { 127 | if (missing(nPredictors)) { 128 | coefs = coef(model$finalModel) 129 | nPredictors = length(coefs) - 1 130 | } 131 | return (cbind( 132 | CV=model$results %>% slice_min(RMSE) %>% dplyr::select(c(RMSE, MAE)), 133 | Training=mlba::regressionSummary(predict(model, train.df), train.df$Price), 134 | Holdout=mlba::regressionSummary(predict(model, holdout.df), holdout.df$Price), 135 | nPredictors=nPredictors 136 | )) 137 | } 138 | 139 | metric.full <- collectMetrics(model, train.df, holdout.df) 140 | ``` 141 | 142 | ```{r} 143 | predict(model, car.df[1:3,]) 144 | ``` 145 | 146 | ```{r} 147 | 148 | ``` 149 | 150 | # Variable Selection in Linear Regression 151 | ## How to Reduce the Number of Predictors 152 | ### Exhaustive Search 153 | ```{r} 154 | # use regsubsets() in package leaps to run an exhaustive search. 155 | # unlike with lm, categorical predictors must be turned into dummies manually. 
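# (illustrative addition, not from the book) base R's model.matrix() shows the
# kind of dummy coding that the fastDummies code below produces, e.g. for the
# Fuel_Type factor in the training data
head(model.matrix(~ Fuel_Type, data=train.df), 3)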
156 | library(leaps) 157 | library(fastDummies) 158 | 159 | # create dummies for fuel type 160 | leaps.train.df <- dummy_cols(train.df, remove_first_dummy=TRUE, 161 | remove_selected_columns=TRUE) 162 | search <- regsubsets(Price ~ ., data=leaps.train.df, nbest=1, 163 | nvmax=ncol(leaps.train.df), method="exhaustive") 164 | sum <- summary(search) 165 | 166 | # show models 167 | sum$which 168 | 169 | # show metrics 170 | sum$rsq 171 | sum$adjr2 172 | sum$cp 173 | ``` 174 | 175 | ```{r} 176 | optimal <- which.min(sum$cp) 177 | 178 | # determine the variable names for the optimal model 179 | X <- summary(search)$which[, -1] # information about included predictors 180 | xvars <- dimnames(X)[[2]] ## column names (all covariates except intercept) 181 | xvars <- xvars[X[optimal,]] 182 | 183 | # the optimal model contains all dummy variables of Fuel_Type 184 | xvars <- c("Age_08_04", "KM", "HP", "Quarterly_Tax", "Weight", "Fuel_Type") 185 | 186 | # rebuild model for best predictor set 187 | set.seed(1) 188 | trControl <- caret::trainControl(method="cv", number=5, allowParallel=TRUE) 189 | model <- caret::train(Price ~ ., data=car.df[, c("Price", xvars)], 190 | method="lm", # specify the model 191 | trControl=trControl) 192 | model 193 | coef(model$finalModel) 194 | 195 | metric.exhaustive <- collectMetrics(model, train.df, holdout.df) 196 | ``` 197 | 198 | ### Popular Subset Selection Algorithms 199 | ```{r} 200 | # as model performance is estimated using AIC, we don't need to use cross-validation 201 | trControl <- caret::trainControl(method="none") 202 | model <- caret::train(Price ~ ., data=train.df, trControl=trControl, 203 | # select backward elmination 204 | method="glmStepAIC", direction='backward') 205 | 206 | coef(model$finalModel) 207 | ``` 208 | 209 | ```{r} 210 | model <- caret::train(Price ~ ., data=train.df, trControl=trControl, 211 | method="glmStepAIC", direction='forward') 212 | 213 | coef(model$finalModel) 214 | ``` 215 | 216 | ```{r} 217 | model <- caret::train(Price ~ ., data=train.df, trControl=trControl, 218 | method="glmStepAIC", direction='both') 219 | 220 | coef(model$finalModel) 221 | ``` 222 | 223 | ```{r} 224 | rbind(Training=mlba::regressionSummary(predict(model, train.df), train.df$Price), 225 | Holdout=mlba::regressionSummary(predict(model, holdout.df), holdout.df$Price)) 226 | ``` 227 | 228 | ```{r} 229 | # The models are identical to the best model obtained from the exhaustive search. 230 | # We therefore duplicate the metrics. 
231 | metric.stepwise <- metric.exhaustive 232 | ``` 233 | 234 | # Regularization (Shrinkage Models) 235 | ```{r} 236 | set.seed(1) 237 | library(caret) 238 | trControl <- caret::trainControl(method='cv', number=5, allowParallel=TRUE) 239 | tuneGrid <- expand.grid(lambda=10^seq(5, 2, by=-0.1), alpha=0) 240 | model <- caret::train(Price ~ ., data=train.df, 241 | method='glmnet', 242 | family='gaussian', # set the family for linear regression 243 | trControl=trControl, 244 | tuneGrid=tuneGrid) 245 | model$bestTune 246 | coef(model$finalModel, s=model$bestTune$lambda) 247 | ``` 248 | 249 | ```{r} 250 | metric.ridge <- collectMetrics(model, train.df, holdout.df, 251 | length(coef(model$finalModel, s=model$bestTune$lambda)) - 1) 252 | ridge.model <- model 253 | ``` 254 | 255 | ```{r} 256 | rbind( 257 | Training=mlba::regressionSummary(predict(model, train.df), train.df$Price), 258 | Holdout=mlba::regressionSummary(predict(model, holdout.df), holdout.df$Price) 259 | ) 260 | ``` 261 | 262 | ```{r} 263 | set.seed(1) 264 | tuneGrid <- expand.grid(lambda=10^seq(4, 0, by=-0.1), alpha=1) 265 | model <- caret::train(Price ~ ., data=train.df, 266 | method='glmnet', 267 | family='gaussian', # set the family for linear regression 268 | trControl=trControl, 269 | tuneGrid=tuneGrid) 270 | model$bestTune 271 | coef(model$finalModel, s=model$bestTune$lambda) 272 | ``` 273 | 274 | ```{r} 275 | lasso.model <- model 276 | metric.lasso <- collectMetrics(lasso.model, train.df, holdout.df, 277 | sum(coef(lasso.model$finalModel, s=lasso.model$bestTune$lambda) != 0) - 1) 278 | ``` 279 | 280 | ```{r} 281 | rbind( 282 | Training=mlba::regressionSummary(predict(model, train.df), train.df$Price), 283 | Holdout=mlba::regressionSummary(predict(model, holdout.df), holdout.df$Price) 284 | ) 285 | ``` 286 | 287 | ```{r} 288 | library(tidyverse) 289 | library(gridExtra) 290 | g1 <- ggplot(ridge.model$results, aes(x=lambda, y=RMSE)) + 291 | geom_pointrange(aes(ymin=RMSE-RMSESD, ymax=RMSE+RMSESD), color='grey') + 292 | geom_line() + 293 | geom_point(data=ridge.model$results %>% subset(RMSE == min(RMSE)), color='red') + 294 | labs(x=expression(paste('Ridge parameter ', lambda)), 295 | y='RMSE (cross-validation)') + 296 | scale_x_log10() 297 | g2 <- ggplot(lasso.model$results, aes(x=lambda, y=RMSE)) + 298 | geom_pointrange(aes(ymin=RMSE-RMSESD, ymax=RMSE+RMSESD), color='grey') + 299 | geom_line() + 300 | geom_point(data=lasso.model$results %>% subset(RMSE == min(RMSE)), color='red') + 301 | labs(x=expression(paste('Lasso parameter ', lambda)), 302 | y='RMSE (cross-validation)') + 303 | scale_x_log10() 304 | grid.arrange(g1, g2, ncol=2) 305 | 306 | g <- arrangeGrob(g1 + theme_bw(), g2 + theme_bw(), ncol=2) 307 | ggsave(file=file.path('..', 'figures', 'chapter_06', 'shrinkage-parameter-tuning.pdf'), 308 | g, width=6, height=2.5, units='in') 309 | ``` 310 | 311 | ```{r} 312 | data.frame(rbind( 313 | 'full'= metric.full, 314 | 'exhaustive' = metric.exhaustive, 315 | 'stepwise' = metric.stepwise, 316 | 'ridge' = metric.ridge, 317 | 'lasso' = metric.lasso 318 | )) 319 | ``` 320 | 321 | -------------------------------------------------------------------------------- /Rmd/chap7.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Machine Learning for Business Analytics" 3 | author: "Chapter 7: $k$-Nearest Neighbors ($k$" 4 | output: 5 | pdf_document: 6 | toc: no 7 | highlight: tango 8 | # html_document: 9 | # toc: yes 10 | # toc_depth: 4 11 | # toc_float: yes 12 | --- 13 | 20 | ```{r 
global_options, include=FALSE} 21 | knitr::opts_chunk$set(error=TRUE, # Keep compiling upon error 22 | collapse=FALSE, # collapse by default 23 | echo=TRUE, # echo code by default 24 | comment = "#>", # change comment character 25 | fig.width = 5.5, # set figure width 26 | fig.align = "center",# set figure position 27 | out.width = "49%", # set width of displayed images 28 | warning=FALSE, # do not show R warnings 29 | message=FALSE, # do not show R messages 30 | row.print=25) 31 | ``` 32 | 33 | 34 | ```{r} 35 | if (!require(mlba)) { 36 | library(devtools) 37 | install_github("gedeck/mlba/mlba", force=TRUE) 38 | } 39 | ``` 40 | 41 | # The $k$-NN Classifier (Categorical Outcome) 42 | ## Example: Riding Mowers 43 | ```{r} 44 | library(ggrepel) 45 | mowers.df <- mlba::RidingMowers 46 | set.seed(35) 47 | 48 | idx <- sample(nrow(mowers.df), 0.6*nrow(mowers.df)) 49 | train.df <- mowers.df[idx, ] 50 | holdout.df <- mowers.df[-idx, ] 51 | ## new household 52 | new.df <- data.frame(Income = 60, Lot_Size = 20) 53 | 54 | ggplot(mapping=aes(x=Income, y=Lot_Size, shape=Ownership, color=Ownership)) + 55 | geom_point(data=train.df) + 56 | geom_text_repel(aes(label=rownames(train.df)), data=train.df, show.legend = FALSE) + 57 | geom_point(data=cbind(new.df, Ownership='New')) 58 | ``` 59 | 60 | ```{r} 61 | g <- ggplot(mapping=aes(x=Income, y=Lot_Size, shape=Ownership, color=Ownership, fill=Ownership)) + 62 | geom_point(data=train.df, size=4) + 63 | geom_text_repel(aes(label=rownames(train.df)), data=train.df, show.legend = FALSE) + 64 | geom_point(data=cbind(new.df, Ownership='New'), size=5) + 65 | scale_shape_manual(values = c(18, 15, 21)) + 66 | scale_color_manual(values = c('black', 'darkorange', 'steelblue')) + 67 | scale_fill_manual(values = c('black', 'darkorange', 'lightblue')) 68 | 69 | g 70 | 71 | ggsave(file=file.path("..", "figures", "chapter_07", "knn-riding-mower.pdf"), 72 | g + theme_bw(), width=6, height=4, units="in") 73 | ``` 74 | 75 | ```{r} 76 | library(caret) 77 | # train k-NN model with k=3 78 | model <- train(Ownership ~ ., data=train.df, 79 | method="knn", # specify the model 80 | preProcess=c("center", "scale"), # normalize data 81 | tuneGrid=expand.grid(k=3), 82 | trControl=trainControl(method="none")) 83 | model 84 | 85 | # predict new data point 86 | predict(model, new.df) 87 | 88 | # determine nearest neighbors to new data point 89 | train.norm.df <- predict(model$preProcess, train.df) 90 | new.norm.df <- predict(model$preProcess, new.df) 91 | distances <- apply(train.norm.df[, 1:2], 1, 92 | function(d){ sqrt(sum((d - new.norm.df)^2)) }) 93 | rownames(train.df)[order(distances)][1:3] 94 | ``` 95 | 96 | ## Choosing $k$ 97 | ```{r} 98 | # use leave-one-out cross-validation for small dataset 99 | trControl <- trainControl(method="loocv", number=5, allowParallel=TRUE) 100 | model <- train(Ownership ~ ., data=train.df, 101 | method="knn", 102 | preProcess=c("center", "scale"), 103 | tuneGrid=expand.grid(k=seq(1, 13, 2)), 104 | trControl=trControl) 105 | model 106 | ``` 107 | 108 | ```{r} 109 | model <- train(Ownership ~ ., data=mowers.df, 110 | method="knn", 111 | preProcess=c("center", "scale"), 112 | tuneGrid=expand.grid(k=7), 113 | trControl=trainControl(method="none")) 114 | predict(model, new.df) 115 | ``` 116 | 117 | ## Setting the Cutoff Value 118 | ```{r} 119 | train.norm.df <- predict(model$preProcess, train.df) 120 | new.norm.df <- predict(model$preProcess, new.df) 121 | distances <- apply(train.norm.df[, 1:2], 1, 122 | function(d){ sqrt(sum((d - new.norm.df)^2)) }) 123 | 
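# (sketch added here, not from the book) the next line lists the 8 nearest
# neighbors of the new household; to classify it with a user-chosen cutoff,
# one could score it by the share of owners among those neighbors
# (assumes the Ownership factor uses the labels "Owner" and "Nonowner")
prop.owner <- mean(train.df$Ownership[order(distances)[1:8]] == "Owner")
ifelse(prop.owner >= 0.3, "Owner", "Nonowner")  # cutoff 0.3 instead of the default 0.5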
train.df[order(distances)[1:8],] 124 | ``` 125 | 126 | -------------------------------------------------------------------------------- /Rmd/chap8.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Machine Learning for Business Analytics" 3 | author: "Chapter 8: The Naive Bayes Classifier" 4 | output: 5 | pdf_document: 6 | toc: no 7 | highlight: tango 8 | # html_document: 9 | # toc: yes 10 | # toc_depth: 4 11 | # toc_float: yes 12 | --- 13 | 20 | ```{r global_options, include=FALSE} 21 | knitr::opts_chunk$set(error=TRUE, # Keep compiling upon error 22 | collapse=FALSE, # collapse by default 23 | echo=TRUE, # echo code by default 24 | comment = "#>", # change comment character 25 | fig.width = 5.5, # set figure width 26 | fig.align = "center",# set figure position 27 | out.width = "49%", # set width of displayed images 28 | warning=FALSE, # do not show R warnings 29 | message=FALSE, # do not show R messages 30 | row.print=25) 31 | ``` 32 | 33 | 34 | ```{r} 35 | if (!require(mlba)) { 36 | library(devtools) 37 | install_github("gedeck/mlba/mlba", force=TRUE) 38 | } 39 | options(scipen=999, digits = 3) 40 | ``` 41 | 42 | # Solution: Naive Bayes 43 | ## Example 3: Predicting Delayed Flights 44 | ```{r} 45 | library(tidyverse) 46 | library(caret) 47 | library(e1071) 48 | # load and preprocess dataset 49 | delays.df <- mlba::FlightDelays %>% 50 | mutate( 51 | # change numerical variables to categorical 52 | DAY_WEEK = factor(DAY_WEEK), 53 | ORIGIN = factor(ORIGIN), 54 | DEST = factor(DEST), 55 | CARRIER = factor(CARRIER), 56 | Flight.Status = factor(Flight.Status), 57 | # create hourly bins for departure time 58 | CRS_DEP_TIME = factor(round(CRS_DEP_TIME / 100)) 59 | ) %>% 60 | select(DAY_WEEK, CRS_DEP_TIME, ORIGIN, DEST, CARRIER, Flight.Status) 61 | 62 | # create training and holdout sets 63 | set.seed(1) 64 | idx <- createDataPartition(delays.df$Flight.Status, p=0.6, list=FALSE) 65 | train.df <- delays.df[idx, ] 66 | holdout.df <- delays.df[-idx, ] 67 | 68 | # run naive bayes 69 | delays.nb <- naiveBayes(Flight.Status ~ ., data = train.df) 70 | delays.nb 71 | ``` 72 | 73 | ```{r} 74 | 75 | ``` 76 | 77 | ```{r} 78 | # use prop.table() with margin = 1 to convert a count table to a proportions table, 79 | # where each row sums up to 1 (use margin = 2 for column sums) 80 | prop.table(table(train.df$Flight.Status, train.df$DEST), margin = 1) 81 | ``` 82 | 83 | ```{r} 84 | ## predict probabilities 85 | pred.prob <- predict(delays.nb, newdata=holdout.df, type="raw") 86 | ## predict class membership 87 | pred.class <- predict(delays.nb, newdata=holdout.df) 88 | 89 | df <- data.frame(actual=holdout.df$Flight.Status, predicted=pred.class, pred.prob) 90 | 91 | df[holdout.df$CARRIER == "DL" & holdout.df$DAY_WEEK == 7 & holdout.df$CRS_DEP_TIME == 10 & 92 | holdout.df$DEST == "LGA" & holdout.df$ORIGIN == "DCA",] 93 | ``` 94 | 95 | ```{r} 96 | # training 97 | confusionMatrix(predict(delays.nb, newdata=train.df), train.df$Flight.Status) 98 | 99 | # holdout 100 | confusionMatrix(predict(delays.nb, newdata=holdout.df), holdout.df$Flight.Status) 101 | ``` 102 | 103 | ```{r} 104 | library(gains) 105 | actual <- ifelse(holdout.df$Flight.Status == "delayed", 1, 0) 106 | gain <- gains(actual, pred.prob[,"delayed"], groups=length(actual) - 2) 107 | 108 | nactual <-sum(actual) 109 | ggplot() + 110 | geom_line(aes(x=gain$cume.obs, y=gain$cume.pct.of.total * nactual)) + 111 | geom_line(aes(x=c(0, max(gain$cume.obs)), y=c(0, nactual)), color="darkgrey") + 112 | 
labs(x="# Cases", y="Cumulative") 113 | ``` 114 | 115 | ```{r} 116 | ggsave(file=file.path("..", "figures", "chapter_08", "Flights-NB-gain.pdf"), 117 | width=3, height=3, 118 | last_plot() + theme_bw()) 119 | ``` 120 | 121 | ## Working with Continuous Predictors 122 | ```{r} 123 | (p_delayed = dnorm(213, mean=211.36215, sd=15.31)) 124 | (p_ontime = dnorm(213, mean=211.99436, sd=12.79)) 125 | 126 | p_ontime * 0.805 127 | p_delayed * 0.195 128 | ``` 129 | 130 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-slate -------------------------------------------------------------------------------- /_layouts/default.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 16 | 18 | {% seo %} 19 | {% include head-custom.html %} 20 | 21 | 22 | 23 | 24 | 25 |
[body markup of _layouts/default.html lost in extraction; the recoverable template pieces are: a page header showing {{ site.title | default: site.github.repository_name }} and {{ site.description | default: site.github.project_tagline }}, a "View on GitHub" link when site.github.is_project_page is set, optional .zip and .tar.gz download links when site.show_downloads is set, and a main section that renders {{ content }}]
49 | 50 | 51 | 59 | 60 | -------------------------------------------------------------------------------- /assets/css/style.scss: -------------------------------------------------------------------------------- 1 | --- 2 | --- 3 | @import "{{ site.theme }}"; 4 | 5 | table { 6 | border: 0px; 7 | } 8 | 9 | td { 10 | border: 0px; 11 | vertical-align: top; 12 | } 13 | 14 | .inner { 15 | max-width: 800px; 16 | } 17 | -------------------------------------------------------------------------------- /images/anaconda-create-environment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gedeck/mlba-R-code/c6909650d140f4f0e4da6987f1f97daf5786ea20/images/anaconda-create-environment.png -------------------------------------------------------------------------------- /images/anaconda-environment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gedeck/mlba-R-code/c6909650d140f4f0e4da6987f1f97daf5786ea20/images/anaconda-environment.png -------------------------------------------------------------------------------- /images/anaconda-mlba-r-environment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gedeck/mlba-R-code/c6909650d140f4f0e4da6987f1f97daf5786ea20/images/anaconda-mlba-r-environment.png -------------------------------------------------------------------------------- /images/anaconda-open-terminal.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gedeck/mlba-R-code/c6909650d140f4f0e4da6987f1f97daf5786ea20/images/anaconda-open-terminal.png -------------------------------------------------------------------------------- /images/anaconda-packagelist.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gedeck/mlba-R-code/c6909650d140f4f0e4da6987f1f97daf5786ea20/images/anaconda-packagelist.png -------------------------------------------------------------------------------- /images/anaconda-update.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gedeck/mlba-R-code/c6909650d140f4f0e4da6987f1f97daf5786ea20/images/anaconda-update.png -------------------------------------------------------------------------------- /images/jupyter-executing-python.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gedeck/mlba-R-code/c6909650d140f4f0e4da6987f1f97daf5786ea20/images/jupyter-executing-python.png -------------------------------------------------------------------------------- /images/jupyter-new-notebook.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gedeck/mlba-R-code/c6909650d140f4f0e4da6987f1f97daf5786ea20/images/jupyter-new-notebook.png -------------------------------------------------------------------------------- /images/jupyter-notebook-filemanager.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gedeck/mlba-R-code/c6909650d140f4f0e4da6987f1f97daf5786ea20/images/jupyter-notebook-filemanager.png -------------------------------------------------------------------------------- /images/logo.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/gedeck/mlba-R-code/c6909650d140f4f0e4da6987f1f97daf5786ea20/images/logo.png -------------------------------------------------------------------------------- /img/mlba-bookcover.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gedeck/mlba-R-code/c6909650d140f4f0e4da6987f1f97daf5786ea20/img/mlba-bookcover.png -------------------------------------------------------------------------------- /installPython.md: -------------------------------------------------------------------------------- 1 | # Getting Started with Python 2 | Python is a powerful, general purpose programming language that can be used for many applications ranging from short scripts to enterprise applications. There is a large and growing number of free, open-source libraries and tools for scientific computing. For more information about Python and its use visit [python.org.](https://www.python.org/). 3 | 4 | ## Install Python 5 | There are many ways of using and developing with Python. However, a good start will be to use Jupyter notebooks, an interactive, browser-based Python interface available through the [Anaconda Distribution](https://www.anaconda.com/) which is particularly useful for scientific computing. We will be using Python 3.8 or newer. 6 | 7 | Here is what you need to do: 8 | 9 | - Download the Anaconda installer for Python 3.8 or later from https://www.anaconda.com/download/ for your operating system (you will be asked for your email, however this step is optional and you can proceed without providing it) 10 | - Execute the installer 11 | - macOS: double-click on the pkg file and follow the instructions using the default settings 12 | - Windows: run the exe file and follow the instructions using default settings 13 | - During installation of Anaconda, you are asked if you want to install DataSpell. This is optional, but can be a useful tool to get you started using Jupyter Notebooks. 14 | 15 | Once the application is installed, you can execute Anaconda Navigator from the Start Menu (Windows) and the Application folder (macOS). 16 | 17 | If you don’t want to use Anaconda, you will find installation instructions for Windows 10 at the end of this document. 18 | 19 | ## Anaconda Navigator – update and install packages 20 | 21 | 22 | You can use _Anaconda Navigator_ to manage your Python installation and run the Jupyter application. 23 | 24 | Use the _Environments_ tab to add packages to your Python installation. The package list looks like this: 25 | 26 | 27 | 28 | To run the deep-learning applications from the book, we require a custom environment called `mlba-r`. It is in general a good idea to use custom Python environments for different projects to reduce the possibility of Python version conflicts and to have different Python versions on the same machine. 29 | 30 | To create the `mlba-r` environment for the deep-learning applications from the book, click the **[+] Create** link at the bottom of the window. This will open a dialog where you can enter the name of the environment and the Python version. Enter `mlba-r` as the name and select Python 3.8 or newer as the version. Click the **[Create]** button to create the environment. 31 | 32 | 33 | 34 | Click the [Update index…] button to refresh the package list. From time to time, it may ask you to update the Anaconda Navigator application. It’s good practice to update regularly. 
35 | If new versions become available, you will see that the version number changes. The version number of updatable packages are highlighted in blue and with an arrow next to the version number. 36 | This means that you can update the specific package. Change the pull-down menu to [Updatable] and click the green tick mark to select [Mark for update]. Do that for all the packages you want to update, select [Apply] and confirm the update. 37 | 38 | 39 | 40 | Once you initiated the update, use the [Clear] button to remove the marking. Anaconda Navigator otherwise will indicate that it is busy when you want to close the application. 41 | 42 | Updates are done in the background and will take some time and may require confirmation. There is no feedback that an update is finished. You will need to refresh the list using [Update index…] to see the progress. 43 | 44 | ## Anaconda Navigator – install packages 45 | 46 | To install a package, change the pull down to **[Not installed]** and enter e.g. tensorflow in the [Search packages] field. Click on the rectangle to select the package for download and use the [Apply] button to start the installation. 47 | Once the library is installed, it will be listed under the installed packages. 48 | 49 | - tensorflow: Python package for the Tensorflow deep learning library(https://www.tensorflow.org/) 50 | - keras: Python package for the Keras deep learning library (https://keras.io/) 51 | 52 | **Advanced**: You can also install a library from the command line, which may be faster. Use the command: 53 | `conda install packagename` 54 | 55 | 56 | 57 | ## Further information 58 | See https://gedeck.github.io/mistat-code-solutions/doc/installPython for more information about installing Python and the required packages. -------------------------------------------------------------------------------- /mlba-R.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gedeck/mlba-R-code/c6909650d140f4f0e4da6987f1f97daf5786ea20/mlba-R.zip -------------------------------------------------------------------------------- /mlba-Rmd.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gedeck/mlba-R-code/c6909650d140f4f0e4da6987f1f97daf5786ea20/mlba-Rmd.zip --------------------------------------------------------------------------------