├── Chapter01 ├── Chapter 01_Journey from Statistics to Machine Learning.R ├── Chapter 01_Journey from Statistics to Machine Learning.py └── Data.zip ├── Chapter02 ├── Chapter 02_Parallelism of Statistics and Machine Learning.R └── Chapter 02_Parallelism of Statistics and Machine Learning.py ├── Chapter03 ├── Chapter 03_Logistic Regression vs Random Forest.R └── Chapter 03_Logistic Regression vs Random Forest.py ├── Chapter04 ├── Chapter 04_Tree based ML Models.R ├── Chapter 04_Tree based ML Models.py └── WA_Fn-UseC_-HR-Employee-Attrition.csv ├── Chapter05 ├── Chapter 05_KNN n Naive Bayes.R └── Chapter 05_KNN n Naive Bayes.py ├── Chapter06 ├── Chapter 06_SVM_n_NN.R ├── Chapter 06_SVM_n_NN.py ├── digitsdata.csv └── letterdata.csv ├── Chapter07 ├── Chapter 07_Recomm_Engine.R ├── Chapter 07_Recomm_Engine.py ├── movies.csv └── ratings.csv ├── Chapter08 ├── Chapter 08_Kmeans_PCA.R ├── Chapter 08_Kmeans_PCA.py ├── digitsdata.csv └── iris.csv ├── Chapter09 └── Chapter 09_RL.py ├── LICENSE └── README.md /Chapter01/Chapter 01_Journey from Statistics to Machine Learning.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | rm(list = ls()) 5 | 6 | # First change the following directory link to where all the input files do exist 7 | 8 | setwd("D:\\Book writing\\Codes\\Chapter 1") 9 | 10 | 11 | data <- c(4,5,1,2,7,2,6,9,3) 12 | 13 | # Calculate Mean 14 | dt_mean = mean(data) ; print(round(dt_mean,2)) 15 | 16 | # Calculate Median 17 | dt_median = median(data); print(dt_median) 18 | 19 | # Calculate Mode 20 | func_mode <- function(input_dt){ 21 | unq <- unique(input_dt) 22 | unq[which.max(tabulate(match(input_dt,unq)))] 23 | } 24 | 25 | dt_mode = func_mode(data); print(dt_mode) 26 | 27 | 28 | 29 | # Desriptive statistics - dispersion 30 | game_points <- c(35,56,43,59,63,79,35,41,64,43,93,60,77,24,82) 31 | 32 | # Calculation Variance 33 | dt_var = var(game_points); print(round(dt_var,2)) 34 | 35 | # Calculation Standard Deviation 36 | dt_std = sd(game_points); print(round(dt_std,2)) 37 | 38 | # Calculation Range 39 | range_val<-function(x) return(diff(range(x))) 40 | dt_range = range_val(game_points); print(dt_range) 41 | 42 | # Calculation Quantiles 43 | dt_quantile = quantile(game_points,probs = c(0.2,0.8,1.0)); print(dt_quantile) 44 | 45 | # Calculation Inter quartile range 46 | dt_iqr = IQR(game_points); print(dt_iqr) 47 | 48 | 49 | 50 | 51 | # Hypothesis testing 52 | 53 | xbar = 990; mu0 = 1000; s = 12.5 ; n = 30 54 | t_smple = (xbar - mu0)/(s/sqrt(n));print (round(t_smple,2)) 55 | 56 | alpha = 0.05 57 | t_alpha = qt(alpha,df= n-1);print (round(t_alpha,3)) 58 | 59 | p_val = pt(t_smple,df = n-1);print (p_val) 60 | 61 | 62 | 63 | # Normal Distribution 64 | xbar = 67; mu0 = 52; s = 16.3 65 | 66 | # Normal distribution 67 | # P (Z >= (x-mu)/sigma) 68 | # F(x) = P(X <= x) 69 | pr = 1- pnorm(67, mean=52, sd=16.3) 70 | print(paste("Prob. 
to score more than 67 is ",round(pr*100,2),"%")) 71 | 72 | 73 | 74 | # Chi-square independence test 75 | survey = read.csv("survey.csv",header=TRUE) 76 | 77 | tbl = table(survey$Smoke,survey$Exer) 78 | p_val = chisq.test(tbl) 79 | 80 | print(paste("P-value is :",round(p_val$p.value,3))) 81 | 82 | 83 | #ANOVA 84 | fetilizers = read.csv("fetilizers.csv",header=TRUE) 85 | 86 | # Concatenate data rows into single vector 87 | r = c(t(as.matrix(fetilizers))) 88 | f = c("fertilizer1","fertilizer2","fertilizer3") 89 | k = 3; n = 6 90 | 91 | tm = gl(k,1,n*k,factor(f)) 92 | blk = gl(n,k,k*n) 93 | av = aov(r ~ tm + blk) 94 | 95 | smry = summary(av) 96 | print(smry) 97 | 98 | 99 | 100 | 101 | 102 | # Linear Regression vs. Gradient descent 103 | train_data = read.csv("mtcars.csv",header=TRUE) 104 | 105 | attach(train_data) 106 | plot(hp, mpg, col = "blue", pch = 20) 107 | 108 | 109 | # Linear Regression 110 | model <- lm(mpg ~ hp, data = train_data) 111 | coef(model) 112 | 113 | abline(model) 114 | mtext(paste('y =', round(coef(model)[[2]],3), '* x', '+', round( coef(model)[[1]],2))) 115 | 116 | 117 | 118 | rm(list = ls()) 119 | 120 | # Linear Regression 121 | train_data = read.csv("mtcars.csv",header=TRUE) 122 | model <- lm(mpg ~ hp, data = train_data) 123 | print (coef(model)) 124 | 125 | 126 | # Gradient descent 127 | gradDesc <- function(x, y, learn_rate, conv_threshold, batch_size, max_iter) { 128 | m <- runif(1, 0, 1) 129 | c <- runif(1, 0, 1) 130 | ypred <- m * x + c 131 | MSE <- sum((y - ypred) ^ 2) / batch_size 132 | 133 | converged = F 134 | iterations = 0 135 | 136 | while(converged == F) { 137 | m_new <- m - learn_rate * ((1 / batch_size) * (sum((ypred - y) * x))) 138 | c_new <- c - learn_rate * ((1 / batch_size) * (sum(ypred - y))) 139 | 140 | m <- m_new 141 | c <- c_new 142 | ypred <- m * x + c 143 | MSE_new <- sum((y - ypred) ^ 2) / batch_size 144 | 145 | if(MSE - MSE_new <= conv_threshold) { 146 | converged = T 147 | return(paste("Iterations:",iterations,"Optimal intercept:", c, "Optimal slope:", m)) 148 | } 149 | 150 | iterations = iterations + 1 151 | 152 | if(iterations > max_iter) { 153 | converged = T 154 | return(paste("Iterations:",iterations,"Optimal intercept:", c, "Optimal slope:", m)) 155 | } 156 | 157 | MSE = MSE_new 158 | } 159 | } 160 | 161 | gradDesc(x = hp,y = mpg, learn_rate = 0.00003, conv_threshold = 1e-8, batch_size = 32, max_iter = 1500000) 162 | 163 | 164 | 165 | 166 | # Train & Test samples 167 | full_data = read.csv("mtcars.csv",header=TRUE) 168 | 169 | set.seed(123) 170 | numrow = nrow(full_data) 171 | trnind = sample(1:numrow,size = as.integer(0.7*numrow)) 172 | 173 | train_data = full_data[trnind,] 174 | test_data = full_data[-trnind,] 175 | 176 | 177 | # Train Validation & Test samples 178 | trvaltest <- function(dat,prop = c(0.5,0.25,0.25)){ 179 | nrw = nrow(dat) 180 | trnr = as.integer(nrw *prop[1]) 181 | vlnr = as.integer(nrw*prop[2]) 182 | set.seed(123) 183 | trni = sample(1:nrow(dat),trnr) 184 | trndata = dat[trni,] 185 | rmng = dat[-trni,] 186 | vlni = sample(1:nrow(rmng),vlnr) 187 | valdata = rmng[vlni,] 188 | tstdata = rmng[-vlni,] 189 | mylist = list("trn" = trndata,"val"= valdata,"tst" = tstdata) 190 | return(mylist) 191 | } 192 | 193 | outdata = trvaltest(mtcars,prop = c(0.5,0.25,0.25)) 194 | train_data = outdata$trn;valid_data = outdata$val;test_data = outdata$tst 195 | 196 | 197 | 198 | 199 | # Grid Search on Decision Trees 200 | library(rpart) 201 | input_data = read.csv("ad.csv",header=FALSE) 202 | input_data$V1559 = as.factor(input_data$V1559) 
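# --- Added illustrative sketch (not part of the original code). The nested
# --- loops further below search minsplit/minbucket by hand on a 70/30 split.
# --- Assuming the e1071 package is installed, a cross-validated version of the
# --- same search could be sketched with tune.rpart as follows; the 5-fold CV
# --- setting here is an assumption, not something the original code specifies.
library(e1071)
rpart_grid = tune.rpart(V1559 ~ ., data = input_data,
                        minsplit = c(2,3), minbucket = c(1,2,3),
                        tunecontrol = tune.control(cross = 5))
summary(rpart_grid)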
203 | set.seed(123) 204 | numrow = nrow(input_data) 205 | trnind = sample(1:numrow,size = as.integer(0.7*numrow)) 206 | 207 | train_data = input_data[trnind,];test_data = input_data[-trnind,] 208 | minspset = c(2,3);minobset = c(1,2,3) 209 | initacc = 0 210 | 211 | for (minsp in minspset){ 212 | for (minob in minobset){ 213 | tr_fit = rpart(V1559 ~.,data = train_data,method = "class",minsplit = minsp, minbucket = minob) 214 | tr_predt = predict(tr_fit,newdata = train_data,type = "class") 215 | tble = table(tr_predt,train_data$V1559) 216 | acc = (tble[1,1]+tble[2,2])/sum(tble) 217 | acc 218 | if (acc > initacc){ 219 | tr_predtst = predict(tr_fit,newdata = test_data,type = "class") 220 | tblet = table(test_data$V1559,tr_predtst) 221 | acct = (tblet[1,1]+tblet[2,2])/sum(tblet) 222 | acct 223 | print(paste("Best Score")) 224 | print( paste("Train Accuracy ",round(acc,3),"Test Accuracy",round(acct,3))) 225 | print( paste(" Min split ",minsp," Min obs per node ",minob)) 226 | print(paste("Confusion matrix on test data")) 227 | print(tblet) 228 | precsn_0 = (tblet[1,1])/(tblet[1,1]+tblet[2,1]) 229 | precsn_1 = (tblet[2,2])/(tblet[1,2]+tblet[2,2]) 230 | print(paste("Precision_0: ",round(precsn_0,3),"Precision_1: ",round(precsn_1,3))) 231 | rcall_0 = (tblet[1,1])/(tblet[1,1]+tblet[1,2]) 232 | rcall_1 = (tblet[2,2])/(tblet[2,1]+tblet[2,2]) 233 | print(paste("Recall_0: ",round(rcall_0,3),"Recall_1: ",round(rcall_1,3))) 234 | initacc = acc 235 | } 236 | } 237 | } 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | -------------------------------------------------------------------------------- /Chapter01/Chapter 01_Journey from Statistics to Machine Learning.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import os 4 | 5 | 6 | """ First change the following directory link to where all input files do exist """ 7 | 8 | os.chdir("D:\Book writing\Codes\Chapter 1") 9 | 10 | 11 | 12 | import numpy as np 13 | from scipy import stats 14 | 15 | 16 | data = np.array([4,5,1,2,7,2,6,9,3]) 17 | 18 | # Calculate Mean 19 | dt_mean = np.mean(data) ; print ("Mean :",round(dt_mean,2)) 20 | 21 | # Calculate Median 22 | dt_median = np.median(data) ; print ("Median :",dt_median) 23 | 24 | # Calculate Mode 25 | dt_mode = stats.mode(data); print ("Mode :",dt_mode[0][0]) 26 | 27 | 28 | # Deviance calculations 29 | 30 | import numpy as np 31 | from statistics import variance,stdev 32 | 33 | game_points = np.array([35,56,43,59,63,79,35,41,64,43,93,60,77,24,82]) 34 | 35 | # Calculate Variance 36 | dt_var = variance(game_points) ; print ("Sample variance:", round(dt_var,2)) 37 | 38 | # Calculate Standard Deviation 39 | dt_std = stdev(game_points) ; print ("Sample std.dev:",round(dt_std,2)) 40 | 41 | # Calculate Range 42 | dt_rng = np.max(game_points,axis=0) - np.min(game_points,axis=0) ; print ("Range:",dt_rng) 43 | 44 | 45 | #Calculate percentiles 46 | print ("Quantiles:") 47 | for val in [20,80,100]: 48 | dt_qntls = np.percentile(game_points,val) 49 | print (str(val)+"%" ,dt_qntls) 50 | 51 | # Calculate IQR 52 | q75, q25 = np.percentile(game_points, [75 ,25]); print ("Inter quartile range:",q75-q25 ) 53 | 54 | 55 | # Hypothesis testing 56 | #import scipy 57 | 58 | from scipy import stats 59 | 60 | xbar = 990; mu0 = 1000; s = 12.5; n = 30 61 | # Test Statistic 62 | t_smple = (xbar-mu0)/(s/np.sqrt(float(n))); print ("Test Statistic:",round(t_smple,2)) 63 | # Critical value from t-table 64 | alpha = 0.05 65 | t_alpha = 
stats.t.ppf(alpha,n-1); print ("Critical value from t-table:",round(t_alpha,3)) 66 | #Lower tail p-value from t-table 67 | p_val = stats.t.sf(np.abs(t_smple), n-1); print ("Lower tail p-value from t-table", p_val) 68 | 69 | 70 | # Normal Distribution 71 | from scipy import stats 72 | xbar = 67; mu0 = 52; s = 16.3 73 | 74 | # Calculating z-score 75 | z = (67-52)/16.3 76 | 77 | # Calculating probability under the curve 78 | p_val = 1- stats.norm.cdf(z) 79 | print ("Prob. to score more than 67 is ",round(p_val*100,2),"%") 80 | 81 | 82 | 83 | # Chi-square independence test 84 | import pandas as pd 85 | from scipy import stats 86 | 87 | survey = pd.read_csv("survey.csv") 88 | # Tabulating 2 variables with row & column variables respectively 89 | survey_tab = pd.crosstab(survey.Smoke, survey.Exer, margins = True) 90 | # Creating observed table for analysis 91 | observed = survey_tab.ix[0:4,0:3] 92 | 93 | contg = stats.chi2_contingency(observed= observed) 94 | p_value = round(contg[1],3) 95 | print ("P-value is: ",p_value) 96 | 97 | 98 | 99 | #ANOVA 100 | import pandas as pd 101 | from scipy import stats 102 | 103 | fetilizers = pd.read_csv("fetilizers.csv") 104 | 105 | one_way_anova = stats.f_oneway(fetilizers["fertilizer1"], fetilizers["fertilizer2"], fetilizers["fertilizer3"]) 106 | 107 | print ("Statistic :", round(one_way_anova[0],2),", p-value :",round(one_way_anova[1],3)) 108 | 109 | 110 | 111 | 112 | 113 | # Train & Test split 114 | import pandas as pd 115 | from sklearn.model_selection import train_test_split 116 | 117 | original_data = pd.read_csv("mtcars.csv") 118 | 119 | train_data,test_data = train_test_split(original_data,train_size = 0.7,random_state=42) 120 | 121 | 122 | # Linear Regressio vs. Gradient Descent 123 | 124 | import numpy as np 125 | import pandas as pd 126 | 127 | train_data = pd.read_csv("mtcars.csv") 128 | 129 | X = np.array(train_data["hp"]) ; y = np.array(train_data["mpg"]) 130 | X = X.reshape(32,1); y = y.reshape(32,1) 131 | 132 | from sklearn.linear_model import LinearRegression 133 | model = LinearRegression(fit_intercept = True) 134 | 135 | model.fit(X,y) 136 | print ("Linear Regression Results") 137 | print ("Intercept",model.intercept_[0] ,"Coefficient",model.coef_[0]) 138 | 139 | 140 | def gradient_descent(x, y,learn_rate, conv_threshold,batch_size,max_iter): 141 | converged = False 142 | iter = 0 143 | m = batch_size 144 | 145 | t0 = np.random.random(x.shape[1]) 146 | t1 = np.random.random(x.shape[1]) 147 | 148 | MSE = (sum([(t0 + t1*x[i] - y[i])**2 for i in range(m)])/ m) 149 | 150 | while not converged: 151 | grad0 = 1.0/m * sum([(t0 + t1*x[i] - y[i]) for i in range(m)]) 152 | grad1 = 1.0/m * sum([(t0 + t1*x[i] - y[i])*x[i] for i in range(m)]) 153 | 154 | temp0 = t0 - learn_rate * grad0 155 | temp1 = t1 - learn_rate * grad1 156 | 157 | t0 = temp0 158 | t1 = temp1 159 | 160 | MSE_New = (sum( [ (t0 + t1*x[i] - y[i])**2 for i in range(m)] ) / m) 161 | 162 | if abs(MSE - MSE_New ) <= conv_threshold: 163 | print ('Converged, iterations: ', iter) 164 | converged = True 165 | 166 | MSE = MSE_New 167 | iter += 1 168 | 169 | if iter == max_iter: 170 | print ('Max interactions reached') 171 | converged = True 172 | 173 | return t0,t1 174 | 175 | if __name__ == '__main__': 176 | Inter, Coeff = gradient_descent(x = X,y = y,learn_rate=0.00003 ,conv_threshold=1e-8, batch_size=32,max_iter=1500000) 177 | print ("Gradient Descent Results") 178 | print (('Intercept = %s Coefficient = %s') %(Inter, Coeff)) 179 | 180 | 181 | 182 | 183 | # Train Validation Test split 184 | 
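# (Added note, not in the original code.) For the 32-row mtcars data used below,
# the default 50/25/25 proportions give int(32*0.5) = 16 training rows and
# int(32*0.25) = 8 validation rows, leaving the remaining 8 rows for test.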
185 | import pandas as pd 186 | from sklearn.model_selection import train_test_split 187 | 188 | original_data = pd.read_csv("mtcars.csv") 189 | 190 | 191 | def data_split(dat,trf = 0.5,vlf=0.25,tsf = 0.25): 192 | nrows = dat.shape[0] 193 | trnr = int(nrows*trf) 194 | vlnr = int(nrows*vlf) 195 | 196 | tr_data,rmng = train_test_split(dat,train_size = trnr,random_state=42) 197 | vl_data, ts_data = train_test_split(rmng,train_size = vlnr,random_state=45) 198 | 199 | return (tr_data,vl_data,ts_data) 200 | 201 | 202 | train_data, validation_data, test_data = data_split(original_data,trf=0.5,vlf=0.25,tsf=0.25) 203 | 204 | 205 | 206 | 207 | 208 | # Grid search on Decision Trees 209 | import pandas as pd 210 | from sklearn.tree import DecisionTreeClassifier 211 | from sklearn.model_selection import train_test_split,GridSearchCV 212 | from sklearn.metrics import classification_report,confusion_matrix,accuracy_score 213 | from sklearn.pipeline import Pipeline 214 | 215 | 216 | 217 | input_data = pd.read_csv("ad.csv",header=None) 218 | 219 | X_columns = set(input_data.columns.values) 220 | y = input_data[len(input_data.columns.values)-1] 221 | X_columns.remove(len(input_data.columns.values)-1) 222 | X = input_data[list(X_columns)] 223 | 224 | X_train, X_test,y_train,y_test = train_test_split(X,y,train_size = 0.7,random_state=33) 225 | 226 | pipeline = Pipeline([ 227 | ('clf', DecisionTreeClassifier(criterion='entropy')) 228 | ]) 229 | parameters = { 230 | 'clf__max_depth': (50,100,150), 231 | 'clf__min_samples_split': (2, 3), 232 | 'clf__min_samples_leaf': (1, 2, 3) 233 | } 234 | 235 | grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, scoring='accuracy') 236 | grid_search.fit(X_train, y_train) 237 | 238 | y_pred = grid_search.predict(X_test) 239 | 240 | print ('\n Best score: \n', grid_search.best_score_) 241 | print ('\n Best parameters set: \n') 242 | best_parameters = grid_search.best_estimator_.get_params() 243 | for param_name in sorted(parameters.keys()): 244 | print ('\t%s: %r' % (param_name, best_parameters[param_name])) 245 | print ("\n Confusion Matrix on Test data \n",confusion_matrix(y_test,y_pred)) 246 | print ("\n Test Accuracy \n",accuracy_score(y_test,y_pred)) 247 | print ("\nPrecision Recall f1 table \n",classification_report(y_test, y_pred)) 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 | 264 | 265 | 266 | 267 | 268 | 269 | 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | -------------------------------------------------------------------------------- /Chapter01/Data.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Statistics-for-Machine-Learning/41e73f42da97c164859641c2add6e487cbc77402/Chapter01/Data.zip -------------------------------------------------------------------------------- /Chapter02/Chapter 02_Parallelism of Statistics and Machine Learning.R: -------------------------------------------------------------------------------- 1 | 2 | rm(list = ls()) 3 | 4 | # First change the following directory link to where all the input files do exist 5 | setwd("D:\\Book writing\\Codes\\Chapter 2") 6 | 7 | 8 | 9 | # Simple Linear Regression 10 | wine_quality = read.csv("winequality-red.csv",header=TRUE,sep = ";",check.names = FALSE) 11 | names(wine_quality) <- gsub(" ", "_", names(wine_quality)) 12 | 13 | set.seed(123) 14 | numrow = nrow(wine_quality) 15 | trnind = sample(1:numrow,size = as.integer(0.7*numrow)) 16 | train_data = 
wine_quality[trnind,] 17 | test_data = wine_quality[-trnind,] 18 | 19 | x_train = train_data$alcohol;y_train = train_data$quality 20 | x_test = test_data$alcohol; y_test = test_data$quality 21 | 22 | x_mean = mean(x_train); y_mean = mean(y_train) 23 | x_var = sum((x_train - x_mean)**2) ; y_var = sum((y_train-y_mean)**2) 24 | covariance = sum((x_train-x_mean)*(y_train-y_mean)) 25 | 26 | b1 = covariance/x_var 27 | b0 = y_mean - b1*x_mean 28 | 29 | pred_y = b0+b1*x_test 30 | 31 | R2 <- 1 - (sum((y_test-pred_y )^2)/sum((y_test-mean(y_test))^2)) 32 | print(paste("Test Adjusted R-squared :",round(R2,4))) 33 | 34 | 35 | 36 | 37 | library(usdm) 38 | 39 | # Multi linear Regression 40 | wine_quality = read.csv("winequality-red.csv",header=TRUE,sep = ";",check.names = FALSE) 41 | names(wine_quality) <- gsub(" ", "_", names(wine_quality)) 42 | 43 | set.seed(123) 44 | numrow = nrow(wine_quality) 45 | trnind = sample(1:numrow,size = as.integer(0.7*numrow)) 46 | train_data = wine_quality[trnind,] 47 | test_data = wine_quality[-trnind,] 48 | 49 | xvars = c("volatile_acidity","chlorides","free_sulfur_dioxide", 50 | "total_sulfur_dioxide","pH","sulphates","alcohol") 51 | yvar = "quality" 52 | 53 | frmla = paste(yvar,"~",paste(xvars,collapse = "+")) 54 | lr_fit = lm(as.formula(frmla),data = train_data) 55 | print(summary(lr_fit)) 56 | 57 | #VIF calculation 58 | wine_v2 = train_data[,xvars] 59 | print(vif(wine_v2)) 60 | 61 | #Test prediction 62 | pred_y = predict(lr_fit,newdata = test_data) 63 | R2 <- 1 - (sum((test_data[,yvar]-pred_y )^2)/sum((test_data[,yvar]-mean(test_data[,yvar]))^2)) 64 | print(paste("Test Adjusted R-squared :",R2)) 65 | 66 | 67 | 68 | 69 | # xvars = c("fixed_acidity","volatile_acidity","citric_acid","residual_sugar","chlorides","free_sulfur_dioxide", 70 | # "total_sulfur_dioxide","density","pH","sulphates","alcohol") 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | # Ridge regression 79 | library(glmnet) 80 | 81 | wine_quality = read.csv("winequality-red.csv",header=TRUE,sep = ";",check.names = FALSE) 82 | names(wine_quality) <- gsub(" ", "_", names(wine_quality)) 83 | 84 | set.seed(123) 85 | numrow = nrow(wine_quality) 86 | trnind = sample(1:numrow,size = as.integer(0.7*numrow)) 87 | train_data = wine_quality[trnind,]; test_data = wine_quality[-trnind,] 88 | 89 | xvars = c("fixed_acidity","volatile_acidity","citric_acid","residual_sugar","chlorides","free_sulfur_dioxide", 90 | "total_sulfur_dioxide","density","pH","sulphates","alcohol") 91 | yvar = "quality" 92 | 93 | x_train = as.matrix(train_data[,xvars]);y_train = as.double(as.matrix(train_data[,yvar])) 94 | x_test = as.matrix(test_data[,xvars]) 95 | 96 | print(paste("Ridge Regression")) 97 | lambdas = c(1e-4,1e-3,1e-2,0.1,0.5,1.0,5.0,10.0) 98 | initrsq = 0 99 | for (lmbd in lambdas){ 100 | ridge_fit = glmnet(x_train,y_train,alpha = 0,lambda = lmbd) 101 | pred_y = predict(ridge_fit,x_test) 102 | R2 <- 1 - (sum((test_data[,yvar]-pred_y )^2)/sum((test_data[,yvar]-mean(test_data[,yvar]))^2)) 103 | 104 | if (R2 > initrsq){ 105 | print(paste("Lambda:",lmbd,"Test Adjusted R-squared :",round(R2,4))) 106 | initrsq = R2 107 | } 108 | } 109 | 110 | 111 | 112 | # Lasso Regression 113 | print(paste("Lasso Regression")) 114 | lambdas = c(1e-4,1e-3,1e-2,0.1,0.5,1.0,5.0,10.0) 115 | initrsq = 0 116 | for (lmbd in lambdas){ 117 | lasso_fit = glmnet(x_train,y_train,alpha = 1,lambda = lmbd) 118 | pred_y = predict(lasso_fit,x_test) 119 | R2 <- 1 - (sum((test_data[,yvar]-pred_y )^2)/sum((test_data[,yvar]-mean(test_data[,yvar]))^2)) 120 | 121 | if (R2 > initrsq){ 
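# (Added comment, not in the original code.) Only lambda values that improve on
# the best test R-squared seen so far are printed; initrsq tracks the running
# best, so the last line printed corresponds to the best-performing lambda.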
122 | print(paste("Lambda:",lmbd,"Test Adjusted R-squared :",round(R2,4))) 123 | initrsq = R2 124 | } 125 | } 126 | 127 | 128 | -------------------------------------------------------------------------------- /Chapter02/Chapter 02_Parallelism of Statistics and Machine Learning.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | import os 5 | 6 | 7 | """ First change the following directory link to where all input files do exist """ 8 | os.chdir("D:\\Book writing\\Codes\\Chapter 2") 9 | 10 | 11 | import numpy as np 12 | import pandas as pd 13 | import statsmodels.api as sm 14 | import matplotlib.pyplot as plt 15 | import seaborn as sns 16 | #from sklearn.model_selection import train_test_split 17 | #from sklearn.metrics import r2_score 18 | 19 | 20 | wine_quality = pd.read_csv("winequality-red.csv",sep=';') 21 | # Step for converting white space in columns to _ value for better handling 22 | wine_quality.rename(columns=lambda x: x.replace(" ", "_"), inplace=True) 23 | 24 | # Simple Linear Regression - chart 25 | model = sm.OLS(wine_quality['quality'],sm.add_constant(wine_quality['alcohol'])).fit() 26 | 27 | print (model.summary()) 28 | 29 | plt.scatter(wine_quality['alcohol'],wine_quality['quality'],label = 'Actual Data') 30 | plt.plot(wine_quality['alcohol'],model.params[0]+model.params[1]*wine_quality['alcohol'], 31 | c ='r',label="Regression fit") 32 | plt.title('Wine Quality regressed on Alchohol') 33 | plt.xlabel('Alcohol') 34 | plt.ylabel('Quality') 35 | plt.show() 36 | 37 | 38 | # Simple Linear Regression - Model fit 39 | import pandas as pd 40 | from sklearn.model_selection import train_test_split 41 | from sklearn.metrics import r2_score 42 | 43 | 44 | wine_quality = pd.read_csv("winequality-red.csv",sep=';') 45 | wine_quality.rename(columns=lambda x: x.replace(" ", "_"), inplace=True) 46 | 47 | x_train,x_test,y_train,y_test = train_test_split(wine_quality['alcohol'],wine_quality["quality"],train_size = 0.7,random_state=42) 48 | 49 | x_train = pd.DataFrame(x_train);x_test = pd.DataFrame(x_test) 50 | y_train = pd.DataFrame(y_train);y_test = pd.DataFrame(y_test) 51 | 52 | def mean(values): 53 | return round(sum(values)/float(len(values)),2) 54 | 55 | alcohol_mean = mean(x_train['alcohol']) 56 | quality_mean = mean(y_train['quality']) 57 | 58 | alcohol_variance = round(sum((x_train['alcohol'] - alcohol_mean)**2),2) 59 | quality_variance = round(sum((y_train['quality'] - quality_mean)**2),2) 60 | 61 | covariance = round(sum((x_train['alcohol'] - alcohol_mean) * (y_train['quality'] - quality_mean )),2) 62 | b1 = covariance/alcohol_variance 63 | b0 = quality_mean - b1*alcohol_mean 64 | print ("\n\nIntercept (B0):",round(b0,4),"Co-efficient (B1):",round(b1,4)) 65 | y_test["y_pred"] = pd.DataFrame(b0+b1*x_test['alcohol']) 66 | R_sqrd = 1- ( sum((y_test['quality']-y_test['y_pred'])**2) / sum((y_test['quality'] - mean(y_test['quality']))**2 )) 67 | print ("Test R-squared value:",round(R_sqrd,4)) 68 | 69 | 70 | # Plots - pair plots 71 | eda_colnms = [ 'volatile_acidity', 'chlorides', 'sulphates', 'alcohol','quality'] 72 | sns.set(style='whitegrid',context = 'notebook') 73 | sns.pairplot(wine_quality[eda_colnms],size = 2.5,x_vars= eda_colnms,y_vars=eda_colnms) 74 | plt.show() 75 | 76 | 77 | 78 | # Correlation coefficients 79 | corr_mat = np.corrcoef(wine_quality[eda_colnms].values.T) 80 | sns.set(font_scale=1) 81 | full_mat = sns.heatmap(corr_mat, cbar=True, annot=True, square=True, fmt='.2f', 82 | annot_kws={'size': 15}, yticklabels=eda_colnms, 
xticklabels=eda_colnms) 83 | 84 | plt.show() 85 | 86 | 87 | 88 | # Multi linear regression model 89 | colnms = [ 'volatile_acidity', 'chlorides', 'free_sulfur_dioxide', 'total_sulfur_dioxide', 90 | 'pH', 'sulphates', 'alcohol'] 91 | 92 | 93 | pdx = wine_quality[colnms] 94 | pdy = wine_quality["quality"] 95 | 96 | x_train,x_test,y_train,y_test = train_test_split(pdx,pdy,train_size = 0.7,random_state=42) 97 | x_train_new = sm.add_constant(x_train) 98 | x_test_new = sm.add_constant(x_test) 99 | 100 | #random.seed(434) 101 | full_mod = sm.OLS(y_train,x_train_new) 102 | full_res = full_mod.fit() 103 | print ("\n \n",full_res.summary()) 104 | 105 | 106 | print ("\nVariance Inflation Factor") 107 | cnames = x_train.columns 108 | for i in np.arange(0,len(cnames)): 109 | xvars = list(cnames) 110 | yvar = xvars.pop(i) 111 | mod = sm.OLS(x_train[yvar],sm.add_constant(x_train_new[xvars])) 112 | res = mod.fit() 113 | vif = 1/(1-res.rsquared) 114 | print (yvar,round(vif,3)) 115 | 116 | # Predition of data 117 | y_pred = full_res.predict(x_test_new) 118 | y_pred_df = pd.DataFrame(y_pred) 119 | y_pred_df.columns = ['y_pred'] 120 | pred_data = pd.DataFrame(y_pred_df['y_pred']) 121 | y_test_new = pd.DataFrame(y_test) 122 | #y_test_new.reset_index(inplace=True) 123 | 124 | pred_data['y_test'] = pd.DataFrame(y_test_new['quality']) 125 | 126 | # R-square calculation 127 | rsqd = r2_score(y_test_new['quality'].tolist(), y_pred_df['y_pred'].tolist()) 128 | print ("\nTest R-squared value:",round(rsqd,4)) 129 | 130 | 131 | 132 | 133 | 134 | 135 | # Ridge Regression 136 | from sklearn.linear_model import Ridge 137 | 138 | wine_quality = pd.read_csv("winequality-red.csv",sep=';') 139 | wine_quality.rename(columns=lambda x: x.replace(" ", "_"), inplace=True) 140 | 141 | all_colnms = ['fixed_acidity', 'volatile_acidity', 'citric_acid', 'residual_sugar', 142 | 'chlorides', 'free_sulfur_dioxide', 'total_sulfur_dioxide', 'density', 143 | 'pH', 'sulphates', 'alcohol'] 144 | 145 | 146 | pdx = wine_quality[all_colnms] 147 | pdy = wine_quality["quality"] 148 | 149 | x_train,x_test,y_train,y_test = train_test_split(pdx,pdy,train_size = 0.7,random_state=42) 150 | 151 | alphas = [1e-4,1e-3,1e-2,0.1,0.5,1.0,5.0,10.0] 152 | 153 | initrsq = 0 154 | 155 | print ("\nRidge Regression: Best Parameters\n") 156 | for alph in alphas: 157 | ridge_reg = Ridge(alpha=alph) 158 | ridge_reg.fit(x_train,y_train) 159 | tr_rsqrd = ridge_reg.score(x_train,y_train) 160 | ts_rsqrd = ridge_reg.score(x_test,y_test) 161 | 162 | if ts_rsqrd > initrsq: 163 | print ("Lambda: ",alph,"Train R-Squared value:",round(tr_rsqrd,5),"Test R-squared value:",round(ts_rsqrd,5)) 164 | initrsq = ts_rsqrd 165 | 166 | # Coeffients of Ridge regression of best alpha value 167 | ridge_reg = Ridge(alpha=0.001) 168 | ridge_reg.fit(x_train,y_train) 169 | 170 | 171 | print ("\nRidge Regression coefficient values of Alpha = 0.001\n") 172 | for i in range(11): 173 | print (all_colnms[i],": ",ridge_reg.coef_[i]) 174 | 175 | # Lasso Regression 176 | from sklearn.linear_model import Lasso 177 | 178 | alphas = [1e-4,1e-3,1e-2,0.1,0.5,1.0,5.0,10.0] 179 | initrsq = 0 180 | print ("\nLasso Regression: Best Parameters\n") 181 | 182 | for alph in alphas: 183 | lasso_reg = Lasso(alpha=alph) 184 | lasso_reg.fit(x_train,y_train) 185 | tr_rsqrd = lasso_reg.score(x_train,y_train) 186 | ts_rsqrd = lasso_reg.score(x_test,y_test) 187 | 188 | if ts_rsqrd > initrsq: 189 | print ("Lambda: ",alph,"Train R-Squared value:",round(tr_rsqrd,5),"Test R-squared value:",round(ts_rsqrd,5)) 190 | initrsq = 
ts_rsqrd 191 | 192 | # Coeffients of Lasso regression of best alpha value 193 | lasso_reg = Lasso(alpha=0.001) 194 | lasso_reg.fit(x_train,y_train) 195 | 196 | print ("\nLasso Regression coefficient values of Alpha = 0.001\n") 197 | for i in range(11): 198 | print (all_colnms[i],": ",lasso_reg.coef_[i]) 199 | 200 | -------------------------------------------------------------------------------- /Chapter03/Chapter 03_Logistic Regression vs Random Forest.R: -------------------------------------------------------------------------------- 1 | 2 | rm(list = ls()) 3 | 4 | # First change the following directory link to where all the input files do exist 5 | setwd("D:\\Book writing\\Codes\\Chapter 3") 6 | 7 | library(mctest) 8 | library(dummies) 9 | library(Information) 10 | library(pROC) 11 | 12 | credit_data = read.csv("credit_data.csv") 13 | credit_data$class = credit_data$class-1 14 | 15 | # I.V Calculation 16 | IV <- create_infotables(data=credit_data, y="class", parallel=FALSE) 17 | for (i in 1:length(colnames(credit_data))-1){ 18 | seca = IV[[1]][i][1] 19 | sum(seca[[1]][5]) 20 | print(paste(colnames(credit_data)[i],",IV_Value:",round(sum(seca[[1]][5]),4))) 21 | } 22 | 23 | # Dummy variables creation 24 | dummy_stseca =data.frame(dummy(credit_data$Status_of_existing_checking_account)) 25 | dummy_ch = data.frame(dummy(credit_data$Credit_history)) 26 | dummy_purpose = data.frame(dummy(credit_data$Purpose)) 27 | dummy_savacc = data.frame(dummy(credit_data$Savings_Account)) 28 | dummy_presc = data.frame(dummy(credit_data$Present_Employment_since)) 29 | dummy_perssx = data.frame(dummy(credit_data$Personal_status_and_sex)) 30 | dummy_othdts = data.frame(dummy(credit_data$Other_debtors)) 31 | dummy_property = data.frame(dummy(credit_data$Property)) 32 | dummy_othinstpln = data.frame(dummy(credit_data$Other_installment_plans)) 33 | dummy_forgnwrkr = data.frame(dummy(credit_data$Foreign_worker)) 34 | 35 | # Cleaning the variables name from . 
to _ 36 | colClean <- function(x){ colnames(x) <- gsub("\\.", "_", colnames(x)); x } 37 | dummy_stseca = colClean(dummy_stseca) ;dummy_ch = colClean(dummy_ch) 38 | dummy_purpose = colClean(dummy_purpose); dummy_savacc= colClean(dummy_savacc) 39 | dummy_presc= colClean(dummy_presc);dummy_perssx= colClean(dummy_perssx); 40 | dummy_othdts= colClean(dummy_othdts);dummy_property= colClean(dummy_property); 41 | dummy_othinstpln= colClean(dummy_othinstpln);dummy_forgnwrkr= colClean(dummy_forgnwrkr); 42 | 43 | 44 | continuous_columns = c('Duration_in_month', 'Credit_amount','Installment_rate_in_percentage_of_disposable_income', 45 | 'Age_in_years','Number_of_existing_credits_at_this_bank') 46 | 47 | credit_continuous = credit_data[,continuous_columns] 48 | credit_data_new = cbind(dummy_stseca,dummy_ch,dummy_purpose,dummy_savacc,dummy_presc,dummy_perssx, 49 | dummy_othdts,dummy_property,dummy_othinstpln,dummy_forgnwrkr,credit_continuous,credit_data$class) 50 | 51 | colnames(credit_data_new)[51] <- "class" 52 | 53 | # Setting seed for repeatability of results of train & test split 54 | set.seed(123) 55 | numrow = nrow(credit_data_new) 56 | trnind = sample(1:numrow,size = as.integer(0.7*numrow)) 57 | train_data = credit_data_new[trnind,] 58 | test_data = credit_data_new[-trnind,] 59 | 60 | remove_cols_extra_dummy = c("Status_of_existing_checking_account_A11","Credit_history_A30", 61 | "Purpose_A40","Savings_Account_A61","Present_Employment_since_A71","Personal_status_and_sex_A91", 62 | "Other_debtors_A101","Property_A121","Other_installment_plans_A141","Foreign_worker_A201") 63 | 64 | # Removing insignificant variables one by one 65 | remove_cols_insig = c("Purpose_A46","Purpose_A45","Purpose_A44","Savings_Account_A63", "Other_installment_plans_A143", 66 | "Property_A123","Status_of_existing_checking_account_A12", 67 | "Present_Employment_since_A72","Present_Employment_since_A75", 68 | "Present_Employment_since_A73","Credit_history_A32","Credit_history_A33", 69 | "Purpose_A40","Present_Employment_since_A74","Purpose_A49","Purpose_A48", 70 | "Property_A122","Personal_status_and_sex_A92","Foreign_worker_A202", 71 | "Personal_status_and_sex_A94","Purpose_A42","Other_debtors_A102", 72 | "Age_in_years","Savings_Account_A64","Savings_Account_A62", 73 | "Savings_Account_A65", "Other_debtors_A103") 74 | 75 | remove_cols = c(remove_cols_extra_dummy,remove_cols_insig) 76 | 77 | glm_fit = glm(class ~.,family = "binomial",data = train_data[,!(names(train_data) %in% remove_cols)]) 78 | # Significance check - p_value 79 | summary(glm_fit) 80 | 81 | # Multi collinearity check - VIF 82 | remove_cols_vif = c(remove_cols,"class") 83 | vif_table = imcdiag(train_data[,!(names(train_data) %in% remove_cols_vif)],train_data$class,detr=0.001, conf=0.99) 84 | vif_table 85 | 86 | # Predicting probabilities 87 | train_data$glm_probs = predict(glm_fit,newdata = train_data,type = "response") 88 | test_data$glm_probs = predict(glm_fit,newdata = test_data,type = "response") 89 | 90 | # Area under ROC 91 | 92 | ROC1 <- roc(as.factor(train_data$class),train_data$glm_probs) 93 | plot(ROC1, col = "blue") 94 | print(paste("Area under the curve",round(auc(ROC1),4))) 95 | 96 | # Actual prediction based on threshold tuning 97 | threshold_vals = c(0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9) 98 | for (thld in threshold_vals){ 99 | train_data$glm_pred = 0 100 | train_data$glm_pred[train_data$glm_probs>thld]=1 101 | 102 | tble = table(train_data$glm_pred,train_data$class) 103 | acc = (tble[1,1]+tble[2,2])/sum(tble) 104 | 
print(paste("Threshold",thld,"Train accuracy",round(acc,4))) 105 | 106 | } 107 | 108 | # Best threshold from above search is 0.5 with accuracy as 0.7841 109 | best_threshold = 0.5 110 | 111 | # Train confusion matrix & accuracy 112 | train_data$glm_pred = 0 113 | train_data$glm_pred[train_data$glm_probs>best_threshold]=1 114 | tble = table(train_data$glm_pred,train_data$class) 115 | acc = (tble[1,1]+tble[2,2])/sum(tble) 116 | print(paste("Confusion Matrix - Train Data")) 117 | print(tble) 118 | print(paste("Train accuracy",round(acc,4))) 119 | 120 | # Test confusion matrix & accuracy 121 | test_data$glm_pred = 0 122 | test_data$glm_pred[test_data$glm_probs>best_threshold]=1 123 | tble_test = table(test_data$glm_pred,test_data$class) 124 | acc_test = (tble_test[1,1]+tble_test[2,2])/sum(tble_test) 125 | print(paste("Confusion Matrix - Test Data")) 126 | print(tble_test) 127 | print(paste("Test accuracy",round(acc_test,4))) 128 | 129 | 130 | 131 | # Random Forest 132 | library(randomForest) 133 | library(e1071) 134 | 135 | credit_data = read.csv("credit_data.csv") 136 | 137 | credit_data$class = credit_data$class-1 138 | credit_data$class = as.factor(credit_data$class) 139 | 140 | set.seed(123) 141 | numrow = nrow(credit_data) 142 | trnind = sample(1:numrow,size = as.integer(0.7*numrow)) 143 | train_data = credit_data[trnind,] 144 | test_data = credit_data[-trnind,] 145 | 146 | rf_fit = randomForest(class~.,data = train_data,mtry=4,maxnodes= 2000,ntree=1000,nodesize = 2) 147 | rf_pred = predict(rf_fit,data = train_data,type = "response") 148 | rf_predt = predict(rf_fit,newdata = test_data,type = "response") 149 | 150 | tble = table(train_data$class,rf_pred) 151 | tblet = table(test_data$class,rf_predt) 152 | 153 | acc = (tble[1,1]+tble[2,2])/sum(tble) 154 | acct = (tblet[1,1]+tblet[2,2])/sum(tblet) 155 | print(paste("Train acc",round(acc,4),"Test acc",round(acct,4))) 156 | 157 | # Grid Search 158 | rf_grid = tune(randomForest,class~.,data = train_data,ranges = list( 159 | mtry = c(4,5), 160 | maxnodes = c(700,1000), 161 | ntree = c(1000,2000,3000), 162 | nodesize = c(1,2) 163 | ), 164 | tunecontrol = tune.control(cross = 5) 165 | ) 166 | 167 | summary(rf_grid) 168 | 169 | best_model = rf_grid$best.model 170 | summary(best_model) 171 | 172 | y_pred_train = predict(best_model,data = train_data) 173 | train_conf_mat = table(train_data$class,y_pred_train) 174 | 175 | print(paste("Train Confusion Matrix - Grid Search:")) 176 | print(train_conf_mat) 177 | 178 | train_acc = (train_conf_mat[1,1]+train_conf_mat[2,2])/sum(train_conf_mat) 179 | print(paste("Train_accuracy-Grid Search:",round(train_acc,4))) 180 | 181 | y_pred_test = predict(best_model,newdata = test_data) 182 | test_conf_mat = table(test_data$class,y_pred_test) 183 | 184 | print(paste("Test Confusion Matrix - Grid Search:")) 185 | print(test_conf_mat) 186 | 187 | test_acc = (test_conf_mat[1,1]+test_conf_mat[2,2])/sum(test_conf_mat) 188 | print(paste("Test_accuracy-Grid Search:",round(test_acc,4))) 189 | 190 | # Variable Importance 191 | vari = varImpPlot(best_model) 192 | print(paste("Variable Importance - Table")) 193 | print(vari) 194 | 195 | 196 | 197 | -------------------------------------------------------------------------------- /Chapter03/Chapter 03_Logistic Regression vs Random Forest.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | import os 5 | 6 | """ First change the following directory link to where all input files do exist """ 7 | 8 | os.chdir("D:\\Book 
writing\\Codes\\Chapter 3") 9 | 10 | import numpy as np 11 | import pandas as pd 12 | 13 | from sklearn.model_selection import train_test_split 14 | from sklearn.metrics import accuracy_score,classification_report 15 | 16 | credit_data = pd.read_csv("credit_data.csv") 17 | 18 | 19 | print (credit_data.head()) 20 | credit_data['class'] = credit_data['class']-1 21 | 22 | 23 | # Calculation of IV metrics 24 | def IV_calc(data,var): 25 | if data[var].dtypes == "object": 26 | dataf = data.groupby([var])['class'].agg(['count','sum']) 27 | dataf.columns = ["Total","bad"] 28 | dataf["good"] = dataf["Total"] - dataf["bad"] 29 | dataf["bad_per"] = dataf["bad"]/dataf["bad"].sum() 30 | dataf["good_per"] = dataf["good"]/dataf["good"].sum() 31 | dataf["I_V"] = (dataf["good_per"] - dataf["bad_per"]) * np.log(dataf["good_per"]/dataf["bad_per"]) 32 | return dataf 33 | else: 34 | data['bin_var'] = pd.qcut(data[var].rank(method='first'),10) 35 | dataf = data.groupby(['bin_var'])['class'].agg(['count','sum']) 36 | dataf.columns = ["Total","bad"] 37 | dataf["good"] = dataf["Total"] - dataf["bad"] 38 | dataf["bad_per"] = dataf["bad"]/dataf["bad"].sum() 39 | dataf["good_per"] = dataf["good"]/dataf["good"].sum() 40 | dataf["I_V"] = (dataf["good_per"] - dataf["bad_per"]) * np.log(dataf["good_per"]/dataf["bad_per"]) 41 | return dataf 42 | 43 | 44 | IV_calc(credit_data,'Status_of_existing_checking_account') 45 | 46 | print ("\n\nCredit History - Information Value\n") 47 | print (IV_calc(credit_data,'Credit_history')) 48 | 49 | print ("\n\nCredit History - Duration in month\n") 50 | print (IV_calc(credit_data,'Duration_in_month')) 51 | 52 | 53 | print ("\n\nInformation Value by descending order\n") 54 | discrete_columns = ['Status_of_existing_checking_account','Credit_history','Purpose','Savings_Account', 55 | 'Present_Employment_since','Personal_status_and_sex','Other_debtors','Property', 56 | 'Other_installment_plans','Housing','Job','Telephone','Foreign_worker'] 57 | 58 | continuous_columns = ['Duration_in_month', 'Credit_amount','Installment_rate_in_percentage_of_disposable_income', 59 | 'Present_residence_since', 'Age_in_years','Number_of_existing_credits_at_this_bank', 60 | 'Number_of_People_being_liable_to_provide_maintenance_for'] 61 | 62 | total_columns = discrete_columns + continuous_columns 63 | 64 | # List of IV values 65 | Iv_list = [] 66 | for col in total_columns: 67 | assigned_data = IV_calc(data = credit_data,var = col) 68 | iv_val = round(assigned_data["I_V"].sum(),3) 69 | dt_type = credit_data[col].dtypes 70 | Iv_list.append((iv_val,col,dt_type)) 71 | 72 | Iv_list = sorted(Iv_list,reverse = True) 73 | 74 | for i in range(len(Iv_list)): 75 | print (Iv_list[i][0],",",Iv_list[i][1],",type =",Iv_list[i][2]) 76 | 77 | 78 | # Retaining top 15 variables 79 | dummy_stseca = pd.get_dummies(credit_data['Status_of_existing_checking_account'], prefix='status_exs_accnt') 80 | dummy_ch = pd.get_dummies(credit_data['Credit_history'], prefix='cred_hist') 81 | dummy_purpose = pd.get_dummies(credit_data['Purpose'], prefix='purpose') 82 | dummy_savacc = pd.get_dummies(credit_data['Savings_Account'], prefix='sav_acc') 83 | dummy_presc = pd.get_dummies(credit_data['Present_Employment_since'], prefix='pre_emp_snc') 84 | dummy_perssx = pd.get_dummies(credit_data['Personal_status_and_sex'], prefix='per_stat_sx') 85 | dummy_othdts = pd.get_dummies(credit_data['Other_debtors'], prefix='oth_debtors') 86 | 87 | 88 | dummy_property = pd.get_dummies(credit_data['Property'], prefix='property') 89 | dummy_othinstpln = 
pd.get_dummies(credit_data['Other_installment_plans'], prefix='oth_inst_pln') 90 | dummy_forgnwrkr = pd.get_dummies(credit_data['Foreign_worker'], prefix='forgn_wrkr') 91 | 92 | #dummy_housing = pd.get_dummies(credit_data['Housing'], prefix='housing') 93 | #dummy_job = pd.get_dummies(credit_data['Job'], prefix='job') 94 | #dummy_telephn = pd.get_dummies(credit_data['Telephone'], prefix='telephn') 95 | 96 | 97 | continuous_columns = ['Duration_in_month', 'Credit_amount','Installment_rate_in_percentage_of_disposable_income', 98 | 'Age_in_years','Number_of_existing_credits_at_this_bank' ] 99 | 100 | 101 | credit_continuous = credit_data[continuous_columns] 102 | credit_data_new = pd.concat([dummy_stseca,dummy_ch,dummy_purpose,dummy_savacc,dummy_presc,dummy_perssx, 103 | dummy_property,dummy_othinstpln,dummy_othdts, 104 | dummy_forgnwrkr,credit_continuous,credit_data['class']],axis=1) 105 | 106 | x_train,x_test,y_train,y_test = train_test_split(credit_data_new.drop(['class'],axis=1),credit_data_new['class'],train_size = 0.7,random_state=42) 107 | 108 | y_train = pd.DataFrame(y_train) 109 | y_test = pd.DataFrame(y_test) 110 | 111 | 112 | # Logistic Regression 113 | remove_cols_extra_dummy = ['status_exs_accnt_A11','cred_hist_A30','purpose_A40','sav_acc_A61','pre_emp_snc_A71', 114 | 'per_stat_sx_A91','oth_debtors_A101','property_A121','oth_inst_pln_A141','forgn_wrkr_A201'] 115 | 116 | #'housing_A151','job_A171','telephn_A191', 117 | 118 | 119 | 120 | 121 | remove_cols_insig = ['purpose_A46','purpose_A45','purpose_A44','sav_acc_A63','oth_inst_pln_A143', 122 | 'property_A123','status_exs_accnt_A12','pre_emp_snc_A72','pre_emp_snc_A75', 123 | 'pre_emp_snc_A73','cred_hist_A32','cred_hist_A33','purpose_A410','pre_emp_snc_A74', 124 | 'purpose_A49','purpose_A48','property_A122','per_stat_sx_A92','forgn_wrkr_A202', 125 | 'per_stat_sx_A94','purpose_A42','oth_debtors_A102','Age_in_years','sav_acc_A64', 126 | 'sav_acc_A62','sav_acc_A65','oth_debtors_A103'] 127 | 128 | remove_cols = list(set(remove_cols_extra_dummy+remove_cols_insig)) 129 | 130 | 131 | import statsmodels.api as sm 132 | logistic_model = sm.Logit(y_train,sm.add_constant(x_train.drop(remove_cols,axis=1))).fit() 133 | print (logistic_model.summary()) 134 | 135 | 136 | # Calculation of VIF 137 | print ("\nVariance Inflation Factor") 138 | cnames = x_train.drop(remove_cols,axis=1).columns 139 | for i in np.arange(0,len(cnames)): 140 | xvars = list(cnames) 141 | yvar = xvars.pop(i) 142 | mod = sm.OLS(x_train.drop(remove_cols,axis=1)[yvar],sm.add_constant(x_train.drop(remove_cols,axis=1)[xvars])) 143 | res = mod.fit() 144 | vif = 1/(1-res.rsquared) 145 | print (yvar,round(vif,3)) 146 | 147 | 148 | y_pred = pd.DataFrame(logistic_model.predict(sm.add_constant(x_train.drop(remove_cols,axis=1)))) 149 | y_pred.columns = ["probs"] 150 | #both = pd.concat([y_train.reset_index(drop=True),y_pred],axis=1) 151 | 152 | both = pd.concat([y_train,y_pred],axis=1) 153 | 154 | zeros = both[['class','probs']][both['class']==0] 155 | ones = both[['class','probs']][both['class']==1] 156 | 157 | def df_crossjoin(df1, df2, **kwargs): 158 | df1['_tmpkey'] = 1 159 | df2['_tmpkey'] = 1 160 | res = pd.merge(df1, df2, on='_tmpkey', **kwargs).drop('_tmpkey', axis=1) 161 | res.index = pd.MultiIndex.from_product((df1.index, df2.index)) 162 | df1.drop('_tmpkey', axis=1, inplace=True) 163 | df2.drop('_tmpkey', axis=1, inplace=True) 164 | return res 165 | 166 | joined_data = df_crossjoin(ones,zeros) 167 | 168 | joined_data['concordant_pair'] = 0 169 | 
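# (Added explanatory comments, not in the original code.) Each row of
# joined_data pairs one actual event (class 1, probability probs_x) with one
# actual non-event (class 0, probability probs_y). A pair is concordant when
# the event receives the higher predicted probability, discordant when it
# receives the lower one, and tied when the two probabilities are equal; the
# C-statistic computed below is 0.5 + (proportion concordant - proportion discordant)/2.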
joined_data.loc[joined_data['probs_x'] > joined_data['probs_y'],'concordant_pair'] =1 170 | joined_data['discordant_pair'] = 0 171 | joined_data.loc[joined_data['probs_x'] < joined_data['probs_y'],'discordant_pair'] =1 172 | joined_data['tied_pair'] = 0 173 | joined_data.loc[joined_data['probs_x'] == joined_data['probs_y'],'tied_pair'] =1 174 | p_conc = (sum(joined_data['concordant_pair'])*1.0 )/ (joined_data.shape[0]) 175 | p_disc = (sum(joined_data['discordant_pair'])*1.0 )/ (joined_data.shape[0]) 176 | 177 | 178 | c_statistic = 0.5 + (p_conc - p_disc)/2.0 179 | print ("\nC-statistic:",round(c_statistic,4)) 180 | 181 | 182 | 183 | # ROC & AUC 184 | import matplotlib.pyplot as plt 185 | from sklearn import metrics 186 | from sklearn.metrics import auc 187 | fpr, tpr, thresholds = metrics.roc_curve(both['class'],both['probs'], pos_label=1) 188 | 189 | roc_auc = auc(fpr,tpr) 190 | plt.figure() 191 | lw = 2 192 | plt.plot(fpr, tpr, color='darkorange', 193 | lw=lw, label='ROC curve (area = %0.2f)' % roc_auc) 194 | plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--') 195 | plt.xlim([0.0, 1.0]) 196 | plt.ylim([0.0, 1.05]) 197 | plt.xlabel('False Positive Rate (1-Specificity)') 198 | plt.ylabel('True Positive Rate') 199 | plt.title('ROC Curve - German Credit Data') 200 | plt.legend(loc="lower right") 201 | plt.show() 202 | 203 | # Tuning for threshold 204 | for i in list(np.arange(0,1,0.1)): 205 | both["y_pred"] = 0 206 | both.loc[both["probs"] > i, 'y_pred'] = 1 207 | print ("Threshold",i,"Train Accuracy:",round(accuracy_score(both['class'],both['y_pred']),4)) 208 | 209 | # Implement best threshold on train data 210 | both["y_pred"] = 0 211 | both.loc[both["probs"] > 0.5, 'y_pred'] = 1 212 | print ("\nTrain Confusion Matrix\n\n",pd.crosstab(both['class'],both['y_pred'],rownames = ["Actuall"],colnames = ["Predicted"])) 213 | print ("\nTrain Accuracy:",round(accuracy_score(both['class'],both['y_pred']),4)) 214 | 215 | # Predicting test output 216 | y_pred_test = pd.DataFrame(logistic_model.predict(sm.add_constant(x_test.drop(remove_cols,axis=1)))) 217 | y_pred_test.columns = ["probs"] 218 | 219 | #both_test = pd.concat([y_test.reset_index(drop=True),y_pred_test],axis=1) 220 | both_test = pd.concat([y_test,y_pred_test],axis=1) 221 | both_test["y_pred"] = 0 222 | both_test.loc[both_test["probs"] > 0.5, 'y_pred'] = 1 223 | print ("\nTest Confusion Matrix\n\n",pd.crosstab(both_test['class'],both_test['y_pred'],rownames = ["Actuall"],colnames = ["Predicted"])) 224 | print ("\nTest Accuracy:",round(accuracy_score(both_test['class'],both_test['y_pred']),4)) 225 | 226 | 227 | # Random Forest - Scikit Learn 228 | import pandas as pd 229 | from sklearn.ensemble import RandomForestClassifier 230 | 231 | credit_data = pd.read_csv("credit_data.csv") 232 | credit_data['class'] = credit_data['class']-1 233 | 234 | dummy_stseca = pd.get_dummies(credit_data['Status_of_existing_checking_account'], prefix='status_exs_accnt') 235 | dummy_ch = pd.get_dummies(credit_data['Credit_history'], prefix='cred_hist') 236 | dummy_purpose = pd.get_dummies(credit_data['Purpose'], prefix='purpose') 237 | dummy_savacc = pd.get_dummies(credit_data['Savings_Account'], prefix='sav_acc') 238 | dummy_presc = pd.get_dummies(credit_data['Present_Employment_since'], prefix='pre_emp_snc') 239 | dummy_perssx = pd.get_dummies(credit_data['Personal_status_and_sex'], prefix='per_stat_sx') 240 | dummy_othdts = pd.get_dummies(credit_data['Other_debtors'], prefix='oth_debtors') 241 | dummy_property = 
pd.get_dummies(credit_data['Property'], prefix='property') 242 | dummy_othinstpln = pd.get_dummies(credit_data['Other_installment_plans'], prefix='oth_inst_pln') 243 | dummy_housing = pd.get_dummies(credit_data['Housing'], prefix='housing') 244 | dummy_job = pd.get_dummies(credit_data['Job'], prefix='job') 245 | dummy_telephn = pd.get_dummies(credit_data['Telephone'], prefix='telephn') 246 | dummy_forgnwrkr = pd.get_dummies(credit_data['Foreign_worker'], prefix='forgn_wrkr') 247 | 248 | continuous_columns = ['Duration_in_month', 'Credit_amount','Installment_rate_in_percentage_of_disposable_income', 249 | 'Present_residence_since', 'Age_in_years','Number_of_existing_credits_at_this_bank', 250 | 'Number_of_People_being_liable_to_provide_maintenance_for'] 251 | 252 | credit_continuous = credit_data[continuous_columns] 253 | credit_data_new = pd.concat([dummy_stseca,dummy_ch,dummy_purpose,dummy_savacc,dummy_presc,dummy_perssx, 254 | dummy_othdts,dummy_property,dummy_othinstpln,dummy_housing,dummy_job, 255 | dummy_telephn,dummy_forgnwrkr,credit_continuous,credit_data['class']],axis=1) 256 | 257 | x_train,x_test,y_train,y_test = train_test_split(credit_data_new.drop(['class'],axis=1),credit_data_new['class'],train_size = 0.7,random_state=42) 258 | 259 | 260 | rf_fit = RandomForestClassifier(n_estimators=1000,criterion="gini",max_depth=100,min_samples_split=3,min_samples_leaf=2) 261 | rf_fit.fit(x_train,y_train) 262 | 263 | print ("\nRandom Forest - Train Confusion Matrix\n\n",pd.crosstab(y_train,rf_fit.predict(x_train),rownames = ["Actuall"],colnames = ["Predicted"])) 264 | print ("\nRandom Forest - Train accuracy",round(accuracy_score(y_train,rf_fit.predict(x_train)),3)) 265 | 266 | print ("\n\nRandom Forest - Test Confusion Matrix\n\n",pd.crosstab(y_test,rf_fit.predict(x_test),rownames = ["Actuall"],colnames = ["Predicted"])) 267 | print ("\nRandom Forest - Test accuracy",round(accuracy_score(y_test,rf_fit.predict(x_test)),3)) 268 | 269 | 270 | 271 | # Grid Search 272 | from sklearn.pipeline import Pipeline 273 | from sklearn.model_selection import train_test_split,GridSearchCV 274 | 275 | pipeline = Pipeline([ 276 | ('clf',RandomForestClassifier(criterion='gini')) ]) 277 | 278 | parameters = { 279 | 'clf__n_estimators':(1000,2000,3000), 280 | 'clf__max_depth':(100,200,300), 281 | 'clf__min_samples_split':(2,3), 282 | 'clf__min_samples_leaf':(1,2) } 283 | 284 | grid_search = GridSearchCV(pipeline,parameters,n_jobs=-1,cv=5,verbose=1,scoring='accuracy') 285 | grid_search.fit(x_train,y_train) 286 | 287 | 288 | print ('Best Training score: %0.3f' % grid_search.best_score_) 289 | print ('Best parameters set:') 290 | best_parameters = grid_search.best_estimator_.get_params() 291 | for param_name in sorted(parameters.keys()): 292 | print ('\t%s: %r' % (param_name, best_parameters[param_name])) 293 | 294 | predictions = grid_search.predict(x_test) 295 | 296 | print ("Testing accuracy:",round(accuracy_score(y_test, predictions),4)) 297 | print ("\nComplete report of Testing data\n",classification_report(y_test, predictions)) 298 | 299 | print ("\n\nRandom Forest Grid Search- Test Confusion Matrix\n\n",pd.crosstab(y_test, predictions,rownames = ["Actuall"],colnames = ["Predicted"])) 300 | 301 | 302 | # Variable Importance chart 303 | import matplotlib.pyplot as plt 304 | rf_fit = RandomForestClassifier(n_estimators=1000,criterion="gini",max_depth=300,min_samples_split=3,min_samples_leaf=1) 305 | rf_fit.fit(x_train,y_train) 306 | 307 | importances = rf_fit.feature_importances_ 308 | std = 
np.std([tree.feature_importances_ for tree in rf_fit.estimators_], axis=0) 309 | indices = np.argsort(importances)[::-1] 310 | 311 | colnames = list(x_train.columns) 312 | # Print the feature ranking 313 | print("\nFeature ranking:\n") 314 | for f in range(x_train.shape[1]): 315 | print ("Feature",indices[f],",",colnames[indices[f]],round(importances[indices[f]],4)) 316 | 317 | plt.figure() 318 | #plt.title("Variable importance") 319 | plt.bar(range(x_train.shape[1]), importances[indices], 320 | color="r", yerr=std[indices], align="center") 321 | plt.xticks(range(x_train.shape[1]), indices) 322 | plt.xlim([-1, x_train.shape[1]]) 323 | plt.show() 324 | 325 | 326 | indexi = list(indices) 327 | colnms = list(x_train.columns[indices]) 328 | impclnms = list(importances[indices]) 329 | 330 | 331 | print ("\nVariable Importance Values\n") 332 | for i in range(len(importances)): 333 | print ("Variable Index",indexi[i],",",colnms[i],",",round(impclnms[i],4)) 334 | 335 | -------------------------------------------------------------------------------- /Chapter04/Chapter 04_Tree based ML Models.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | rm(list = ls()) 5 | 6 | # First change the following directory link to where all the input files do exist 7 | setwd("D:\\Book writing\\Codes\\Chapter 4") 8 | 9 | 10 | library(randomForest) 11 | library(gbm) 12 | library(xgboost) 13 | library(C50) 14 | library(e1071) 15 | library(caret) 16 | 17 | hrattr_data = read.csv("WA_Fn-UseC_-HR-Employee-Attrition.csv") 18 | 19 | str(hrattr_data) 20 | summary(hrattr_data) 21 | 22 | hrattr_data$Attrition_ind = 0;hrattr_data$Attrition_ind[hrattr_data$Attrition=="Yes"]=1 23 | hrattr_data$Attrition_ind = as.factor(hrattr_data$Attrition_ind) 24 | 25 | remove_cols = c("EmployeeCount","EmployeeNumber","Over18","StandardHours","Attrition") 26 | hrattr_data_new = hrattr_data[,!(names(hrattr_data) %in% remove_cols)] 27 | 28 | set.seed(123) 29 | numrow = nrow(hrattr_data_new) 30 | trnind = sample(1:numrow,size = as.integer(0.7*numrow)) 31 | train_data = hrattr_data_new[trnind,] 32 | test_data = hrattr_data_new[-trnind,] 33 | 34 | frac_trzero = (table(train_data$Attrition_ind)[[1]])/nrow(train_data) 35 | frac_trone = (table(train_data$Attrition_ind)[[2]])/nrow(train_data) 36 | 37 | frac_tszero = (table(test_data$Attrition_ind)[[1]])/nrow(test_data) 38 | frac_tsone = (table(test_data$Attrition_ind)[[2]])/nrow(test_data) 39 | 40 | prec_zero <- function(act,pred){ tble = table(act,pred) 41 | return( round( tble[1,1]/(tble[1,1]+tble[2,1]),4) ) } 42 | 43 | prec_one <- function(act,pred){ tble = table(act,pred) 44 | return( round( tble[2,2]/(tble[2,2]+tble[1,2]),4) ) } 45 | 46 | recl_zero <- function(act,pred){tble = table(act,pred) 47 | return( round( tble[1,1]/(tble[1,1]+tble[1,2]),4) ) } 48 | 49 | recl_one <- function(act,pred){ tble = table(act,pred) 50 | return( round( tble[2,2]/(tble[2,2]+tble[2,1]),4) ) } 51 | 52 | accrcy <- function(act,pred){ tble = table(act,pred) 53 | return( round((tble[1,1]+tble[2,2])/sum(tble),4)) } 54 | 55 | 56 | # Decision Trees using C5.0 package 57 | library(C50) 58 | dtree_fit = C5.0(train_data[-31],train_data$Attrition_ind,costs = NULL, 59 | control = C5.0Control(minCases = 1)) 60 | 61 | summary(dtree_fit) 62 | 63 | tr_y_pred = predict(dtree_fit, train_data,type = "class") 64 | ts_y_pred = predict(dtree_fit,test_data,type = "class") 65 | 66 | tr_y_act = train_data$Attrition_ind;ts_y_act = test_data$Attrition_ind 67 | 68 | tr_tble = 
table(tr_y_act,tr_y_pred) 69 | print(paste("Train Confusion Matrix")) 70 | print(tr_tble) 71 | 72 | tr_acc = accrcy(tr_y_act,tr_y_pred) 73 | trprec_zero = prec_zero(tr_y_act,tr_y_pred); trrecl_zero = recl_zero(tr_y_act,tr_y_pred) 74 | trprec_one = prec_one(tr_y_act,tr_y_pred); trrecl_one = recl_one(tr_y_act,tr_y_pred) 75 | 76 | trprec_ovll = trprec_zero *frac_trzero + trprec_one*frac_trone 77 | trrecl_ovll = trrecl_zero *frac_trzero + trrecl_one*frac_trone 78 | 79 | print(paste("Decision Tree Train accuracy:",tr_acc)) 80 | print(paste("Decision Tree - Train Classification Report")) 81 | print(paste("Zero_Precision",trprec_zero,"Zero_Recall",trrecl_zero)) 82 | print(paste("One_Precision",trprec_one,"One_Recall",trrecl_one)) 83 | print(paste("Overall_Precision",round(trprec_ovll,4),"Overall_Recall",round(trrecl_ovll,4))) 84 | 85 | ts_tble = table(ts_y_act,ts_y_pred) 86 | print(paste("Test Confusion Matrix")) 87 | print(ts_tble) 88 | 89 | ts_acc = accrcy(ts_y_act,ts_y_pred) 90 | tsprec_zero = prec_zero(ts_y_act,ts_y_pred); tsrecl_zero = recl_zero(ts_y_act,ts_y_pred) 91 | tsprec_one = prec_one(ts_y_act,ts_y_pred); tsrecl_one = recl_one(ts_y_act,ts_y_pred) 92 | 93 | tsprec_ovll = tsprec_zero *frac_tszero + tsprec_one*frac_tsone 94 | tsrecl_ovll = tsrecl_zero *frac_tszero + tsrecl_one*frac_tsone 95 | 96 | print(paste("Decision Tree Test accuracy:",ts_acc)) 97 | print(paste("Decision Tree - Test Classification Report")) 98 | print(paste("Zero_Precision",tsprec_zero,"Zero_Recall",tsrecl_zero)) 99 | print(paste("One_Precision",tsprec_one,"One_Recall",tsrecl_one)) 100 | print(paste("Overall_Precision",round(tsprec_ovll,4),"Overall_Recall",round(tsrecl_ovll,4))) 101 | 102 | 103 | #Decision Trees using C5.0 package - Error Costs 104 | library(C50) 105 | 106 | class_zero_wgt = c(0.01,0.1,0.2,0.3,0.4,0.5) 107 | 108 | for (cwt in class_zero_wgt){ 109 | cwtz = cwt 110 | cwto = 1-cwtz 111 | cstvr = cwto/cwtz 112 | 113 | error_cost <- matrix(c(0, 1, cstvr, 0), nrow = 2) 114 | 115 | dtree_fit = C5.0(train_data[-31],train_data$Attrition_ind,costs = error_cost, 116 | control = C5.0Control(minCases = 1)) 117 | 118 | summary(dtree_fit) 119 | 120 | tr_y_pred = predict(dtree_fit, train_data,type = "class") 121 | ts_y_pred = predict(dtree_fit,test_data,type = "class") 122 | 123 | tr_y_act = train_data$Attrition_ind;ts_y_act = test_data$Attrition_ind 124 | tr_acc = accrcy(tr_y_act,tr_y_pred) 125 | ts_acc = accrcy(ts_y_act,ts_y_pred) 126 | 127 | print(paste("Class weights","{0:",cwtz,"1:",cwto,"}", 128 | "Decision Tree Train accuracy:",tr_acc, 129 | "Decision Tree Test accuracy:",ts_acc)) 130 | ts_tble = table(ts_y_act,ts_y_pred) 131 | print(paste("Test Confusion Matrix")) 132 | print(ts_tble) 133 | 134 | } 135 | 136 | 137 | # Bagging Classifier - using Random forest package but all variables selected 138 | library(randomForest) 139 | 140 | set.seed(43) 141 | rf_fit = randomForest(Attrition_ind~.,data = train_data,mtry=30,maxnodes= 64, 142 | classwt = c(0.3,0.7),ntree=5000,nodesize = 1) 143 | 144 | tr_y_pred = predict(rf_fit,data = train_data,type = "response") 145 | ts_y_pred = predict(rf_fit,newdata = test_data,type = "response") 146 | 147 | tr_y_act = train_data$Attrition_ind;ts_y_act = test_data$Attrition_ind 148 | 149 | tr_tble = table(tr_y_act,tr_y_pred) 150 | print(paste("Train Confusion Matrix")) 151 | print(tr_tble) 152 | 153 | tr_acc = accrcy(tr_y_act,tr_y_pred) 154 | trprec_zero = prec_zero(tr_y_act,tr_y_pred); trrecl_zero = recl_zero(tr_y_act,tr_y_pred) 155 | trprec_one = prec_one(tr_y_act,tr_y_pred); 
trrecl_one = recl_one(tr_y_act,tr_y_pred) 156 | 157 | trprec_ovll = trprec_zero *frac_trzero + trprec_one*frac_trone 158 | trrecl_ovll = trrecl_zero *frac_trzero + trrecl_one*frac_trone 159 | 160 | print(paste("Random Forest Train accuracy:",tr_acc)) 161 | print(paste("Random Forest - Train Classification Report")) 162 | print(paste("Zero_Precision",trprec_zero,"Zero_Recall",trrecl_zero)) 163 | print(paste("One_Precision",trprec_one,"One_Recall",trrecl_one)) 164 | print(paste("Overall_Precision",round(trprec_ovll,4),"Overall_Recall",round(trrecl_ovll,4))) 165 | 166 | ts_tble = table(ts_y_act,ts_y_pred) 167 | print(paste("Test Confusion Matrix")) 168 | print(ts_tble) 169 | 170 | ts_acc = accrcy(ts_y_act,ts_y_pred) 171 | tsprec_zero = prec_zero(ts_y_act,ts_y_pred); tsrecl_zero = recl_zero(ts_y_act,ts_y_pred) 172 | tsprec_one = prec_one(ts_y_act,ts_y_pred); tsrecl_one = recl_one(ts_y_act,ts_y_pred) 173 | 174 | tsprec_ovll = tsprec_zero *frac_tszero + tsprec_one*frac_tsone 175 | tsrecl_ovll = tsrecl_zero *frac_tszero + tsrecl_one*frac_tsone 176 | 177 | print(paste("Random Forest Test accuracy:",ts_acc)) 178 | print(paste("Random Forest - Test Classification Report")) 179 | print(paste("Zero_Precision",tsprec_zero,"Zero_Recall",tsrecl_zero)) 180 | print(paste("One_Precision",tsprec_one,"One_Recall",tsrecl_one)) 181 | print(paste("Overall_Precision",round(tsprec_ovll,4),"Overall_Recall",round(tsrecl_ovll,4))) 182 | 183 | 184 | # Random Forest 185 | library(randomForest) 186 | set.seed(43) 187 | rf_fit = randomForest(Attrition_ind~.,data = train_data,mtry=6,maxnodes= 64, 188 | classwt = c(0.3,0.7),ntree=5000,nodesize = 1) 189 | tr_y_pred = predict(rf_fit,data = train_data,type = "response") 190 | ts_y_pred = predict(rf_fit,newdata = test_data,type = "response") 191 | 192 | tr_y_act = train_data$Attrition_ind;ts_y_act = test_data$Attrition_ind 193 | 194 | tr_tble = table(tr_y_act,tr_y_pred) 195 | print(paste("Train Confusion Matrix")) 196 | print(tr_tble) 197 | 198 | tr_acc = accrcy(tr_y_act,tr_y_pred) 199 | trprec_zero = prec_zero(tr_y_act,tr_y_pred); trrecl_zero = recl_zero(tr_y_act,tr_y_pred) 200 | trprec_one = prec_one(tr_y_act,tr_y_pred); trrecl_one = recl_one(tr_y_act,tr_y_pred) 201 | 202 | trprec_ovll = trprec_zero *frac_trzero + trprec_one*frac_trone 203 | trrecl_ovll = trrecl_zero *frac_trzero + trrecl_one*frac_trone 204 | 205 | print(paste("Random Forest Train accuracy:",tr_acc)) 206 | print(paste("Random Forest - Train Classification Report")) 207 | print(paste("Zero_Precision",trprec_zero,"Zero_Recall",trrecl_zero)) 208 | print(paste("One_Precision",trprec_one,"One_Recall",trrecl_one)) 209 | print(paste("Overall_Precision",round(trprec_ovll,4),"Overall_Recall",round(trrecl_ovll,4))) 210 | 211 | ts_tble = table(ts_y_act,ts_y_pred) 212 | print(paste("Test Confusion Matrix")) 213 | print(ts_tble) 214 | 215 | ts_acc = accrcy(ts_y_act,ts_y_pred) 216 | tsprec_zero = prec_zero(ts_y_act,ts_y_pred); tsrecl_zero = recl_zero(ts_y_act,ts_y_pred) 217 | tsprec_one = prec_one(ts_y_act,ts_y_pred); tsrecl_one = recl_one(ts_y_act,ts_y_pred) 218 | 219 | tsprec_ovll = tsprec_zero *frac_tszero + tsprec_one*frac_tsone 220 | tsrecl_ovll = tsrecl_zero *frac_tszero + tsrecl_one*frac_tsone 221 | 222 | print(paste("Random Forest Test accuracy:",ts_acc)) 223 | print(paste("Random Forest - Test Classification Report")) 224 | print(paste("Zero_Precision",tsprec_zero,"Zero_Recall",tsrecl_zero)) 225 | print(paste("One_Precision",tsprec_one,"One_Recall",tsrecl_one)) 226 | 
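# Note: the Overall_* figures printed next are class-frequency-weighted averages of the
# per-class metrics, e.g. overall_precision = frac_tszero*precision_0 + frac_tsone*precision_1
# (the same idea as the "weighted avg" row of scikit-learn's classification_report).
# Illustration with made-up numbers: if precision_0 = 0.90, precision_1 = 0.60 and the test
# split is 84% zeros / 16% ones, the weighted overall precision is 0.84*0.90 + 0.16*0.60 = 0.852.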
print(paste("Overall_Precision",round(tsprec_ovll,4),"Overall_Recall",round(tsrecl_ovll,4))) 227 | 228 | 229 | 230 | # Grid Search - Random Forest 231 | library(e1071) 232 | library(randomForest) 233 | rf_grid = tune(randomForest,Attrition_ind~.,data = train_data,classwt = c(0.3,0.7),ranges = list( 234 | mtry = c(5,6), 235 | maxnodes = c(32,64), 236 | ntree = c(3000,5000), 237 | nodesize = c(1,2) 238 | ), 239 | tunecontrol = tune.control(cross = 5) 240 | ) 241 | 242 | print(paste("Best parameter from Grid Search")) 243 | print(summary(rf_grid)) 244 | 245 | best_model = rf_grid$best.model 246 | 247 | tr_y_pred = predict(best_model,data = train_data,type = "response") 248 | ts_y_pred = predict(best_model,newdata = test_data,type = "response") 249 | 250 | tr_y_act = train_data$Attrition_ind;ts_y_act = test_data$Attrition_ind 251 | 252 | tr_tble = table(tr_y_act,tr_y_pred) 253 | print(paste("Random Forest Grid search Train Confusion Matrix")) 254 | print(tr_tble) 255 | 256 | tr_acc = accrcy(tr_y_act,tr_y_pred) 257 | trprec_zero = prec_zero(tr_y_act,tr_y_pred); trrecl_zero = recl_zero(tr_y_act,tr_y_pred) 258 | trprec_one = prec_one(tr_y_act,tr_y_pred); trrecl_one = recl_one(tr_y_act,tr_y_pred) 259 | 260 | trprec_ovll = trprec_zero *frac_trzero + trprec_one*frac_trone 261 | trrecl_ovll = trrecl_zero *frac_trzero + trrecl_one*frac_trone 262 | 263 | print(paste("Random Forest Grid Search Train accuracy:",tr_acc)) 264 | print(paste("Random Forest Grid Search - Train Classification Report")) 265 | print(paste("Zero_Precision",trprec_zero,"Zero_Recall",trrecl_zero)) 266 | print(paste("One_Precision",trprec_one,"One_Recall",trrecl_one)) 267 | print(paste("Overall_Precision",round(trprec_ovll,4),"Overall_Recall",round(trrecl_ovll,4))) 268 | 269 | ts_tble = table(ts_y_act,ts_y_pred) 270 | print(paste("Random Forest Grid search Test Confusion Matrix")) 271 | print(ts_tble) 272 | 273 | ts_acc = accrcy(ts_y_act,ts_y_pred) 274 | tsprec_zero = prec_zero(ts_y_act,ts_y_pred); tsrecl_zero = recl_zero(ts_y_act,ts_y_pred) 275 | tsprec_one = prec_one(ts_y_act,ts_y_pred); tsrecl_one = recl_one(ts_y_act,ts_y_pred) 276 | 277 | tsprec_ovll = tsprec_zero *frac_tszero + tsprec_one*frac_tsone 278 | tsrecl_ovll = tsrecl_zero *frac_tszero + tsrecl_one*frac_tsone 279 | 280 | print(paste("Random Forest Grid Search Test accuracy:",ts_acc)) 281 | print(paste("Random Forest Grid Search - Test Classification Report")) 282 | print(paste("Zero_Precision",tsprec_zero,"Zero_Recall",tsrecl_zero)) 283 | print(paste("One_Precision",tsprec_one,"One_Recall",tsrecl_one)) 284 | print(paste("Overall_Precision",round(tsprec_ovll,4),"Overall_Recall",round(tsrecl_ovll,4))) 285 | 286 | 287 | # Adaboost classifier using C5.0 with trails included for boosting 288 | library(C50) 289 | 290 | class_zero_wgt = 0.3 291 | class_one_wgt = 1-class_zero_wgt 292 | cstvr = class_one_wgt/class_zero_wgt 293 | 294 | error_cost <- matrix(c(0, 1, cstvr, 0), nrow = 2) 295 | 296 | # Fitting Adaboost model 297 | ada_fit = C5.0(train_data[-31],train_data$Attrition_ind,costs = error_cost, 298 | trails = 5000,control = C5.0Control(minCases = 1)) 299 | 300 | summary(ada_fit) 301 | 302 | tr_y_pred = predict(ada_fit, train_data,type = "class") 303 | ts_y_pred = predict(ada_fit,test_data,type = "class") 304 | 305 | tr_y_act = train_data$Attrition_ind;ts_y_act = test_data$Attrition_ind 306 | 307 | tr_tble = table(tr_y_act,tr_y_pred) 308 | print(paste("AdaBoost - Train Confusion Matrix")) 309 | print(tr_tble) 310 | 311 | tr_acc = accrcy(tr_y_act,tr_y_pred) 312 | trprec_zero 
= prec_zero(tr_y_act,tr_y_pred); trrecl_zero = recl_zero(tr_y_act,tr_y_pred) 313 | trprec_one = prec_one(tr_y_act,tr_y_pred); trrecl_one = recl_one(tr_y_act,tr_y_pred) 314 | 315 | trprec_ovll = trprec_zero *frac_trzero + trprec_one*frac_trone 316 | trrecl_ovll = trrecl_zero *frac_trzero + trrecl_one*frac_trone 317 | 318 | print(paste("AdaBoost Train accuracy:",tr_acc)) 319 | print(paste("AdaBoost - Train Classification Report")) 320 | print(paste("Zero_Precision",trprec_zero,"Zero_Recall",trrecl_zero)) 321 | print(paste("One_Precision",trprec_one,"One_Recall",trrecl_one)) 322 | print(paste("Overall_Precision",round(trprec_ovll,4),"Overall_Recall",round(trrecl_ovll,4))) 323 | 324 | ts_tble = table(ts_y_act,ts_y_pred) 325 | print(paste("AdaBoost - Test Confusion Matrix")) 326 | print(ts_tble) 327 | 328 | ts_acc = accrcy(ts_y_act,ts_y_pred) 329 | tsprec_zero = prec_zero(ts_y_act,ts_y_pred); tsrecl_zero = recl_zero(ts_y_act,ts_y_pred) 330 | tsprec_one = prec_one(ts_y_act,ts_y_pred); tsrecl_one = recl_one(ts_y_act,ts_y_pred) 331 | 332 | tsprec_ovll = tsprec_zero *frac_tszero + tsprec_one*frac_tsone 333 | tsrecl_ovll = tsrecl_zero *frac_tszero + tsrecl_one*frac_tsone 334 | 335 | print(paste("AdaBoost Test accuracy:",ts_acc)) 336 | print(paste("AdaBoost - Test Classification Report")) 337 | print(paste("Zero_Precision",tsprec_zero,"Zero_Recall",tsrecl_zero)) 338 | print(paste("One_Precision",tsprec_one,"One_Recall",tsrecl_one)) 339 | print(paste("Overall_Precision",round(tsprec_ovll,4),"Overall_Recall",round(tsrecl_ovll,4))) 340 | 341 | 342 | # Gradient boosting 343 | library(gbm) 344 | library(caret) 345 | 346 | set.seed(43) 347 | 348 | # Giving weights to all the observations in a way that total weights will be euqal 1 349 | model_weights <- ifelse(train_data$Attrition_ind == "0", 350 | (1/table(train_data$Attrition_ind)[1]) * 0.3, 351 | (1/table(train_data$Attrition_ind)[2]) * 0.7) 352 | 353 | # Setting parameters for GBM 354 | grid <- expand.grid(n.trees = 5000, interaction.depth = 1, shrinkage = .04, n.minobsinnode = 1) 355 | 356 | # Fitting the GBM model 357 | gbm_fit <- train(Attrition_ind ~ ., data = train_data, method = "gbm", weights = model_weights, 358 | tuneGrid=grid,verbose = FALSE) 359 | # To print variable importance plot 360 | summary(gbm_fit) 361 | 362 | tr_y_pred = predict(gbm_fit, train_data,type = "raw") 363 | ts_y_pred = predict(gbm_fit,test_data,type = "raw") 364 | 365 | tr_y_act = train_data$Attrition_ind;ts_y_act = test_data$Attrition_ind 366 | 367 | tr_tble = table(tr_y_act,tr_y_pred) 368 | print(paste("Gradient Boosting - Train Confusion Matrix")) 369 | print(tr_tble) 370 | 371 | tr_acc = accrcy(tr_y_act,tr_y_pred) 372 | trprec_zero = prec_zero(tr_y_act,tr_y_pred); trrecl_zero = recl_zero(tr_y_act,tr_y_pred) 373 | trprec_one = prec_one(tr_y_act,tr_y_pred); trrecl_one = recl_one(tr_y_act,tr_y_pred) 374 | 375 | trprec_ovll = trprec_zero *frac_trzero + trprec_one*frac_trone 376 | trrecl_ovll = trrecl_zero *frac_trzero + trrecl_one*frac_trone 377 | 378 | print(paste("Gradient Boosting Train accuracy:",tr_acc)) 379 | print(paste("Gradient Boosting - Train Classification Report")) 380 | print(paste("Zero_Precision",trprec_zero,"Zero_Recall",trrecl_zero)) 381 | print(paste("One_Precision",trprec_one,"One_Recall",trrecl_one)) 382 | print(paste("Overall_Precision",round(trprec_ovll,4),"Overall_Recall",round(trrecl_ovll,4))) 383 | 384 | ts_tble = table(ts_y_act,ts_y_pred) 385 | print(paste("Gradient Boosting - Test Confusion Matrix")) 386 | print(ts_tble) 387 | 388 | ts_acc = 
accrcy(ts_y_act,ts_y_pred) 389 | tsprec_zero = prec_zero(ts_y_act,ts_y_pred); tsrecl_zero = recl_zero(ts_y_act,ts_y_pred) 390 | tsprec_one = prec_one(ts_y_act,ts_y_pred); tsrecl_one = recl_one(ts_y_act,ts_y_pred) 391 | 392 | tsprec_ovll = tsprec_zero *frac_tszero + tsprec_one*frac_tsone 393 | tsrecl_ovll = tsrecl_zero *frac_tszero + tsrecl_one*frac_tsone 394 | 395 | print(paste("Gradient Boosting Test accuracy:",ts_acc)) 396 | print(paste("Gradient Boosting - Test Classification Report")) 397 | print(paste("Zero_Precision",tsprec_zero,"Zero_Recall",tsrecl_zero)) 398 | print(paste("One_Precision",tsprec_one,"One_Recall",tsrecl_one)) 399 | print(paste("Overall_Precision",round(tsprec_ovll,4),"Overall_Recall",round(tsrecl_ovll,4))) 400 | 401 | 402 | # Use the following code for performing cross validation on data - At the moment commented though 403 | #fitControl <- trainControl(method = "repeatedcv", number = 4, repeats = 4) 404 | 405 | # gbmFit1 <- train(Attrition_ind ~ ., data = train_data, method = "gbm", 406 | # trControl = fitControl,tuneGrid=grid,verbose = FALSE) 407 | 408 | 409 | 410 | # Xgboost Classifier 411 | library(xgboost); library(caret) 412 | 413 | hrattr_data = read.csv("WA_Fn-UseC_-HR-Employee-Attrition.csv") 414 | str(hrattr_data); summary(hrattr_data) 415 | 416 | # Target variable creation 417 | hrattr_data$Attrition_ind = 0; 418 | hrattr_data$Attrition_ind[hrattr_data$Attrition=="Yes"]=1 419 | 420 | # Columns to be removed due to no change in its value across observations 421 | remove_cols = c("EmployeeCount","EmployeeNumber","Over18","StandardHours","Attrition") 422 | hrattr_data_new = hrattr_data[,!(names(hrattr_data) %in% remove_cols)] 423 | 424 | 425 | # List of variables with continuous values 426 | continuous_columns = c('Age','DailyRate','DistanceFromHome','Education','EnvironmentSatisfaction', 427 | 'HourlyRate', 'JobInvolvement', 'JobLevel','JobSatisfaction','MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 428 | 'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction','StockOptionLevel', 'TotalWorkingYears', 429 | 'TrainingTimesLastYear','WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 430 | 'YearsWithCurrManager') 431 | 432 | 433 | # list of categorical variables 434 | ohe_feats = c('BusinessTravel', 'Department', 'EducationField','Gender', 435 | 'JobRole','MaritalStatus','OverTime') 436 | 437 | # one-hot-encoding categorical features 438 | dummies <- dummyVars(~ BusinessTravel+Department+EducationField+Gender+JobRole+MaritalStatus+OverTime, data = hrattr_data_new) 439 | df_all_ohe <- as.data.frame(predict(dummies, newdata = hrattr_data_new)) 440 | 441 | # Cleaning column names & replace . 
with _ 442 | colClean <- function(x){ colnames(x) <- gsub("\\.", "_", colnames(x)); x } 443 | df_all_ohe = colClean(df_all_ohe) 444 | 445 | hrattr_data_new$Attrition_ind = as.integer(hrattr_data_new$Attrition_ind) 446 | 447 | # Combining both continuous & dummy variables from categories 448 | hrattr_data_v3 = cbind(df_all_ohe,hrattr_data_new[,(names(hrattr_data_new) %in% continuous_columns)], 449 | hrattr_data_new$Attrition_ind) 450 | 451 | names(hrattr_data_v3)[52] = "Attrition_ind" 452 | 453 | # Train & Test split based on 70% & 30% 454 | set.seed(123) 455 | numrow = nrow(hrattr_data_v3) 456 | trnind = sample(1:numrow,size = as.integer(0.7*numrow)) 457 | train_data = hrattr_data_v3[trnind,] 458 | test_data = hrattr_data_v3[-trnind,] 459 | 460 | # Custom functions for calculation of Precision & Recall 461 | frac_trzero = (table(train_data$Attrition_ind)[[1]])/nrow(train_data) 462 | frac_trone = (table(train_data$Attrition_ind)[[2]])/nrow(train_data) 463 | 464 | frac_tszero = (table(test_data$Attrition_ind)[[1]])/nrow(test_data) 465 | frac_tsone = (table(test_data$Attrition_ind)[[2]])/nrow(test_data) 466 | 467 | prec_zero <- function(act,pred){ tble = table(act,pred) 468 | return( round( tble[1,1]/(tble[1,1]+tble[2,1]),4) ) } 469 | 470 | prec_one <- function(act,pred){ tble = table(act,pred) 471 | return( round( tble[2,2]/(tble[2,2]+tble[1,2]),4) ) } 472 | 473 | recl_zero <- function(act,pred){tble = table(act,pred) 474 | return( round( tble[1,1]/(tble[1,1]+tble[1,2]),4) ) } 475 | 476 | recl_one <- function(act,pred){ tble = table(act,pred) 477 | return( round( tble[2,2]/(tble[2,2]+tble[2,1]),4) ) } 478 | 479 | accrcy <- function(act,pred){ tble = table(act,pred) 480 | return( round((tble[1,1]+tble[2,2])/sum(tble),4)) } 481 | 482 | y = train_data$Attrition_ind 483 | 484 | # XGBoost Classifier Training 485 | xgb <- xgboost(data = data.matrix(train_data[,-52]),label = y,eta = 0.04, 486 | max_depth = 2, nround=5000, subsample = 0.5, 487 | colsample_bytree = 0.5, seed = 1, eval_metric = "logloss", 488 | objective = "binary:logistic",nthread = 3 489 | ) 490 | 491 | # XGBoost value prediction on train & test data 492 | tr_y_pred_prob <- predict(xgb, data.matrix(train_data[,-52])) 493 | tr_y_pred <- as.numeric(tr_y_pred_prob > 0.5) 494 | 495 | ts_y_pred_prob <- predict(xgb, data.matrix(test_data[,-52])) 496 | ts_y_pred <- as.numeric(ts_y_pred_prob > 0.5) 497 | tr_y_act = train_data$Attrition_ind;ts_y_act = test_data$Attrition_ind 498 | tr_tble = table(tr_y_act,tr_y_pred) 499 | 500 | # XGBoost Metric predictions on Train Data 501 | print(paste("Xgboost - Train Confusion Matrix")) 502 | print(tr_tble) 503 | 504 | tr_acc = accrcy(tr_y_act,tr_y_pred) 505 | trprec_zero = prec_zero(tr_y_act,tr_y_pred); trrecl_zero = recl_zero(tr_y_act,tr_y_pred) 506 | trprec_one = prec_one(tr_y_act,tr_y_pred); trrecl_one = recl_one(tr_y_act,tr_y_pred) 507 | trprec_ovll = trprec_zero *frac_trzero + trprec_one*frac_trone 508 | trrecl_ovll = trrecl_zero *frac_trzero + trrecl_one*frac_trone 509 | 510 | print(paste("Xgboost Train accuracy:",tr_acc)) 511 | print(paste("Xgboost - Train Classification Report")) 512 | print(paste("Zero_Precision",trprec_zero,"Zero_Recall",trrecl_zero)) 513 | print(paste("One_Precision",trprec_one,"One_Recall",trrecl_one)) 514 | print(paste("Overall_Precision",round(trprec_ovll,4),"Overall_Recall",round(trrecl_ovll,4))) 515 | 516 | 517 | # XGBoost Metric predictions on Test Data 518 | ts_tble = table(ts_y_act,ts_y_pred) 519 | print(paste("Xgboost - Test Confusion Matrix")) 520 | print(ts_tble) 
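# Optional aside: a minimal sketch of pulling variable importance from the xgboost model fitted
# above. It assumes the 'xgb' model object and 'train_data' from the preceding code are still in
# the workspace; it only prints and plots, so it does not affect the metrics computed below.
imp_mat <- xgb.importance(feature_names = colnames(train_data[,-52]), model = xgb)
print(head(imp_mat, 10))               # top features ranked by gain
xgb.plot.importance(head(imp_mat, 10)) # quick bar plot of the same ranking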
521 | 522 | ts_acc = accrcy(ts_y_act,ts_y_pred) 523 | tsprec_zero = prec_zero(ts_y_act,ts_y_pred); tsrecl_zero = recl_zero(ts_y_act,ts_y_pred) 524 | tsprec_one = prec_one(ts_y_act,ts_y_pred); tsrecl_one = recl_one(ts_y_act,ts_y_pred) 525 | tsprec_ovll = tsprec_zero *frac_tszero + tsprec_one*frac_tsone 526 | tsrecl_ovll = tsrecl_zero *frac_tszero + tsrecl_one*frac_tsone 527 | 528 | print(paste("Xgboost Test accuracy:",ts_acc)) 529 | print(paste("Xgboost - Test Classification Report")) 530 | print(paste("Zero_Precision",tsprec_zero,"Zero_Recall",tsrecl_zero)) 531 | print(paste("One_Precision",tsprec_one,"One_Recall",tsrecl_one)) 532 | print(paste("Overall_Precision",round(tsprec_ovll,4),"Overall_Recall",round(tsrecl_ovll,4))) 533 | 534 | 535 | 536 | # Ensemble of Ensembles 537 | rm(list = ls()) 538 | 539 | setwd("D:\\Book writing\\Codes\\Chapter 4") 540 | 541 | hrattr_data = read.csv("WA_Fn-UseC_-HR-Employee-Attrition.csv") 542 | str(hrattr_data) 543 | summary(hrattr_data) 544 | 545 | hrattr_data$Attrition_ind = 0;hrattr_data$Attrition_ind[hrattr_data$Attrition=="Yes"]=1 546 | hrattr_data$Attrition_ind = as.factor(hrattr_data$Attrition_ind) 547 | 548 | remove_cols = c("EmployeeCount","EmployeeNumber","Over18","StandardHours","Attrition") 549 | hrattr_data_new = hrattr_data[,!(names(hrattr_data) %in% remove_cols)] 550 | 551 | set.seed(123) 552 | numrow = nrow(hrattr_data_new) 553 | trnind = sample(1:numrow,size = as.integer(0.7*numrow)) 554 | train_data = hrattr_data_new[trnind,] 555 | test_data = hrattr_data_new[-trnind,] 556 | 557 | frac_trzero = (table(train_data$Attrition_ind)[[1]])/nrow(train_data) 558 | frac_trone = (table(train_data$Attrition_ind)[[2]])/nrow(train_data) 559 | 560 | frac_tszero = (table(test_data$Attrition_ind)[[1]])/nrow(test_data) 561 | frac_tsone = (table(test_data$Attrition_ind)[[2]])/nrow(test_data) 562 | 563 | prec_zero <- function(act,pred){ tble = table(act,pred) 564 | return( round( tble[1,1]/(tble[1,1]+tble[2,1]),4) ) } 565 | 566 | prec_one <- function(act,pred){ tble = table(act,pred) 567 | return( round( tble[2,2]/(tble[2,2]+tble[1,2]),4) ) } 568 | 569 | recl_zero <- function(act,pred){tble = table(act,pred) 570 | return( round( tble[1,1]/(tble[1,1]+tble[1,2]),4) ) } 571 | 572 | recl_one <- function(act,pred){ tble = table(act,pred) 573 | return( round( tble[2,2]/(tble[2,2]+tble[2,1]),4) ) } 574 | 575 | accrcy <- function(act,pred){ tble = table(act,pred) 576 | return( round((tble[1,1]+tble[2,2])/sum(tble),4)) } 577 | 578 | 579 | # Ensemble of Ensembles with different type of Classifiers 580 | train_data$Attrition_ind = as.factor(train_data$Attrition_ind) 581 | 582 | # Classifier 1 - Logistic Regression 583 | glm_fit = glm(Attrition_ind ~.,family = "binomial",data = train_data) 584 | glm_probs = predict(glm_fit,newdata = train_data,type = "response") 585 | 586 | # Classifier 2 - Decision Tree classifier 587 | library(C50) 588 | dtree_fit = C5.0(train_data[-31],train_data$Attrition_ind, 589 | control = C5.0Control(minCases = 1)) 590 | dtree_probs = predict(dtree_fit,newdata = train_data,type = "prob")[,2] 591 | 592 | # Classifier 3 - Random Forest 593 | library(randomForest) 594 | rf_fit = randomForest(Attrition_ind~.,data = train_data,mtry=6,maxnodes= 64,ntree=5000,nodesize = 1) 595 | rf_probs = predict(rf_fit,newdata = train_data,type = "prob")[,2] 596 | 597 | 598 | # Classifier 4 - Adaboost 599 | ada_fit = C5.0(train_data[-31],train_data$Attrition_ind,trails = 5000,control = C5.0Control(minCases = 1)) 600 | ada_probs = predict(ada_fit,newdata = 
train_data,type = "prob")[,2] 601 | 602 | # Ensemble of Models 603 | ensemble = data.frame(glm_probs,dtree_probs,rf_probs,ada_probs) 604 | ensemble = cbind(ensemble,train_data$Attrition_ind) 605 | names(ensemble)[5] = "Attrition_ind" 606 | rownames(ensemble) <- 1:nrow(ensemble) 607 | 608 | # Meta-classifier on top of individual classifiers 609 | meta_clf = glm(Attrition_ind~.,data = ensemble,family = "binomial") 610 | meta_probs = predict(meta_clf, ensemble,type = "response") 611 | 612 | ensemble$pred_class = 0 613 | ensemble$pred_class[meta_probs>0.5]=1 614 | 615 | # Train confusion & accuracy metrics 616 | tr_y_pred = ensemble$pred_class 617 | tr_y_act = train_data$Attrition_ind;ts_y_act = test_data$Attrition_ind 618 | tr_tble = table(tr_y_act,tr_y_pred) 619 | print(paste("Ensemble - Train Confusion Matrix")) 620 | print(tr_tble) 621 | 622 | tr_acc = accrcy(tr_y_act,tr_y_pred) 623 | print(paste("Ensemble Train accuracy:",tr_acc)) 624 | 625 | # Now verifing on test data 626 | glm_probs = predict(glm_fit,newdata = test_data,type = "response") 627 | dtree_probs = predict(dtree_fit,newdata = test_data,type = "prob")[,2] 628 | rf_probs = predict(rf_fit,newdata = test_data,type = "prob")[,2] 629 | ada_probs = predict(ada_fit,newdata = test_data,type = "prob")[,2] 630 | 631 | ensemble_test = data.frame(glm_probs,dtree_probs,rf_probs,ada_probs) 632 | ensemble_test = cbind(ensemble_test,test_data$Attrition_ind) 633 | names(ensemble_test)[5] = "Attrition_ind" 634 | 635 | rownames(ensemble_test) <- 1:nrow(ensemble_test) 636 | meta_test_probs = predict(meta_clf,newdata = ensemble_test,type = "response") 637 | ensemble_test$pred_class = 0 638 | ensemble_test$pred_class[meta_test_probs>0.5]=1 639 | 640 | # Test confusion & accuracy metrics 641 | ts_y_pred = ensemble_test$pred_class 642 | ts_tble = table(ts_y_act,ts_y_pred) 643 | print(paste("Ensemble - Test Confusion Matrix")) 644 | print(ts_tble) 645 | 646 | ts_acc = accrcy(ts_y_act,ts_y_pred) 647 | print(paste("Ensemble Test accuracy:",ts_acc)) 648 | 649 | 650 | 651 | 652 | 653 | 654 | 655 | 656 | 657 | 658 | 659 | 660 | 661 | 662 | -------------------------------------------------------------------------------- /Chapter04/Chapter 04_Tree based ML Models.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import os 4 | """ First change the following directory link to where all input files do exist """ 5 | os.chdir("D:\\Book writing\\Codes\\Chapter 4") 6 | 7 | import numpy as np 8 | import pandas as pd 9 | from sklearn.model_selection import train_test_split 10 | from sklearn.metrics import accuracy_score,classification_report 11 | 12 | import matplotlib.pyplot as plt 13 | 14 | 15 | hrattr_data = pd.read_csv("WA_Fn-UseC_-HR-Employee-Attrition.csv") 16 | 17 | print (hrattr_data.head()) 18 | 19 | hrattr_data['Attrition_ind'] = 0 20 | hrattr_data.loc[hrattr_data['Attrition']=='Yes','Attrition_ind'] = 1 21 | 22 | dummy_busnstrvl = pd.get_dummies(hrattr_data['BusinessTravel'], prefix='busns_trvl') 23 | dummy_dept = pd.get_dummies(hrattr_data['Department'], prefix='dept') 24 | dummy_edufield = pd.get_dummies(hrattr_data['EducationField'], prefix='edufield') 25 | dummy_gender = pd.get_dummies(hrattr_data['Gender'], prefix='gend') 26 | dummy_jobrole = pd.get_dummies(hrattr_data['JobRole'], prefix='jobrole') 27 | dummy_maritstat = pd.get_dummies(hrattr_data['MaritalStatus'], prefix='maritalstat') 28 | dummy_overtime = pd.get_dummies(hrattr_data['OverTime'], prefix='overtime') 29 | 30 | continuous_columns = 
['Age','DailyRate','DistanceFromHome','Education','EnvironmentSatisfaction', 31 | 'HourlyRate', 'JobInvolvement', 'JobLevel','JobSatisfaction','MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 32 | 'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction','StockOptionLevel', 'TotalWorkingYears', 33 | 'TrainingTimesLastYear','WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 34 | 'YearsWithCurrManager'] 35 | 36 | hrattr_continuous = hrattr_data[continuous_columns] 37 | 38 | hrattr_continuous['Age'].describe() 39 | hrattr_data['BusinessTravel'].value_counts() 40 | 41 | hrattr_data_new = pd.concat([dummy_busnstrvl,dummy_dept,dummy_edufield,dummy_gender,dummy_jobrole, 42 | dummy_maritstat,dummy_overtime,hrattr_continuous,hrattr_data['Attrition_ind']],axis=1) 43 | 44 | # Train & Test split 45 | x_train,x_test,y_train,y_test = train_test_split(hrattr_data_new.drop(['Attrition_ind'],axis=1), 46 | hrattr_data_new['Attrition_ind'],train_size = 0.7,random_state=42) 47 | 48 | # Decision Tree Classifier 49 | from sklearn.tree import DecisionTreeClassifier 50 | dt_fit = DecisionTreeClassifier(criterion="gini",max_depth=5,min_samples_split=2,min_samples_leaf=1,random_state=42) 51 | dt_fit.fit(x_train,y_train) 52 | 53 | print ("\nDecision Tree - Train Confusion Matrix\n\n",pd.crosstab(y_train,dt_fit.predict(x_train),rownames = ["Actuall"],colnames = ["Predicted"])) 54 | print ("\nDecision Tree - Train accuracy:",round(accuracy_score(y_train,dt_fit.predict(x_train)),3)) 55 | print ("\nDecision Tree - Train Classification Report\n",classification_report(y_train,dt_fit.predict(x_train))) 56 | 57 | print ("\n\nDecision Tree - Test Confusion Matrix\n\n",pd.crosstab(y_test,dt_fit.predict(x_test),rownames = ["Actuall"],colnames = ["Predicted"])) 58 | print ("\nDecision Tree - Test accuracy:",round(accuracy_score(y_test,dt_fit.predict(x_test)),3)) 59 | print ("\nDecision Tree - Test Classification Report\n",classification_report(y_test,dt_fit.predict(x_test))) 60 | 61 | 62 | # Tuning class weights to analyze accuracy, precision & recall 63 | dummyarray = np.empty((6,10)) 64 | dt_wttune = pd.DataFrame(dummyarray) 65 | 66 | dt_wttune.columns = ["zero_wght","one_wght","tr_accuracy","tst_accuracy","prec_zero","prec_one", 67 | "prec_ovll","recl_zero","recl_one","recl_ovll"] 68 | 69 | zero_clwghts = [0.01,0.1,0.2,0.3,0.4,0.5] 70 | 71 | for i in range(len(zero_clwghts)): 72 | clwght = {0:zero_clwghts[i],1:1.0-zero_clwghts[i]} 73 | dt_fit = DecisionTreeClassifier(criterion="gini",max_depth=5,min_samples_split=2, 74 | min_samples_leaf=1,random_state=42,class_weight = clwght) 75 | dt_fit.fit(x_train,y_train) 76 | dt_wttune.loc[i, 'zero_wght'] = clwght[0] 77 | dt_wttune.loc[i, 'one_wght'] = clwght[1] 78 | dt_wttune.loc[i, 'tr_accuracy'] = round(accuracy_score(y_train,dt_fit.predict(x_train)),3) 79 | dt_wttune.loc[i, 'tst_accuracy'] = round(accuracy_score(y_test,dt_fit.predict(x_test)),3) 80 | 81 | clf_sp = classification_report(y_test,dt_fit.predict(x_test)).split() 82 | dt_wttune.loc[i, 'prec_zero'] = float(clf_sp[5]) 83 | dt_wttune.loc[i, 'prec_one'] = float(clf_sp[10]) 84 | dt_wttune.loc[i, 'prec_ovll'] = float(clf_sp[17]) 85 | 86 | dt_wttune.loc[i, 'recl_zero'] = float(clf_sp[6]) 87 | dt_wttune.loc[i, 'recl_one'] = float(clf_sp[11]) 88 | dt_wttune.loc[i, 'recl_ovll'] = float(clf_sp[18]) 89 | print ("\nClass Weights",clwght,"Train accuracy:",round(accuracy_score(y_train,dt_fit.predict(x_train)),3),"Test accuracy:",round(accuracy_score(y_test,dt_fit.predict(x_test)),3)) 
90 | print ("Test Confusion Matrix\n\n",pd.crosstab(y_test,dt_fit.predict(x_test),rownames = ["Actuall"],colnames = ["Predicted"])) 91 | 92 | 93 | # Bagging Classifier 94 | from sklearn.tree import DecisionTreeClassifier 95 | from sklearn.ensemble import BaggingClassifier 96 | 97 | dt_fit = DecisionTreeClassifier(criterion="gini",max_depth=5,min_samples_split=2,min_samples_leaf=1,random_state=42, 98 | class_weight = {0:0.3,1:0.7}) 99 | 100 | bag_fit = BaggingClassifier(base_estimator= dt_fit,n_estimators=5000,max_samples=0.67,max_features=1.0, 101 | bootstrap=True,bootstrap_features=True,n_jobs=-1,random_state=42) 102 | 103 | bag_fit.fit(x_train, y_train) 104 | 105 | print ("\nBagging - Train Confusion Matrix\n\n",pd.crosstab(y_train,bag_fit.predict(x_train),rownames = ["Actuall"],colnames = ["Predicted"])) 106 | print ("\nBagging- Train accuracy",round(accuracy_score(y_train,bag_fit.predict(x_train)),3)) 107 | print ("\nBagging - Train Classification Report\n",classification_report(y_train,bag_fit.predict(x_train))) 108 | 109 | print ("\n\nBagging - Test Confusion Matrix\n\n",pd.crosstab(y_test,bag_fit.predict(x_test),rownames = ["Actuall"],colnames = ["Predicted"])) 110 | print ("\nBagging - Test accuracy",round(accuracy_score(y_test,bag_fit.predict(x_test)),3)) 111 | print ("\nBagging - Test Classification Report\n",classification_report(y_test,bag_fit.predict(x_test))) 112 | 113 | 114 | 115 | # Random Forest Classifier 116 | from sklearn.ensemble import RandomForestClassifier 117 | 118 | rf_fit = RandomForestClassifier(n_estimators=5000,criterion="gini",max_depth=5,min_samples_split=2,bootstrap=True, 119 | max_features='auto',random_state=42,min_samples_leaf=1,class_weight = {0:0.3,1:0.7}) 120 | rf_fit.fit(x_train,y_train) 121 | 122 | print ("\nRandom Forest - Train Confusion Matrix\n\n",pd.crosstab(y_train,rf_fit.predict(x_train),rownames = ["Actuall"],colnames = ["Predicted"])) 123 | print ("\nRandom Forest - Train accuracy",round(accuracy_score(y_train,rf_fit.predict(x_train)),3)) 124 | print ("\nRandom Forest - Train Classification Report\n",classification_report(y_train,rf_fit.predict(x_train))) 125 | 126 | print ("\n\nRandom Forest - Test Confusion Matrix\n\n",pd.crosstab(y_test,rf_fit.predict(x_test),rownames = ["Actuall"],colnames = ["Predicted"])) 127 | print ("\nRandom Forest - Test accuracy",round(accuracy_score(y_test,rf_fit.predict(x_test)),3)) 128 | print ("\nRandom Forest - Test Classification Report\n",classification_report(y_test,rf_fit.predict(x_test))) 129 | 130 | 131 | # Plot of Variable importance by mean decrease in gini 132 | model_ranks = pd.Series(rf_fit.feature_importances_,index=x_train.columns, name='Importance').sort_values(ascending=False, inplace=False) 133 | model_ranks.index.name = 'Variables' 134 | top_features = model_ranks.iloc[:31].sort_values(ascending=True,inplace=False) 135 | plt.figure(figsize=(20,10)) 136 | ax = top_features.plot(kind='barh') 137 | _ = ax.set_title("Variable Importance Plot") 138 | _ = ax.set_xlabel('Mean decrease in Variance') 139 | _ = ax.set_yticklabels(top_features.index, fontsize=13) 140 | 141 | 142 | 143 | 144 | # Random Forest Classifier - Grid Search 145 | from sklearn.pipeline import Pipeline 146 | from sklearn.model_selection import train_test_split,GridSearchCV 147 | 148 | pipeline = Pipeline([ 149 | ('clf',RandomForestClassifier(criterion='gini',class_weight = {0:0.3,1:0.7}))]) 150 | 151 | parameters = { 152 | 'clf__n_estimators':(2000,3000,5000), 153 | 'clf__max_depth':(5,15,30), 154 | 
'clf__min_samples_split':(2,3), 155 | 'clf__min_samples_leaf':(1,2) } 156 | 157 | grid_search = GridSearchCV(pipeline,parameters,n_jobs=-1,cv=5,verbose=1,scoring='accuracy') 158 | grid_search.fit(x_train,y_train) 159 | 160 | print ('Best Training score: %0.3f' % grid_search.best_score_) 161 | print ('Best parameters set:') 162 | best_parameters = grid_search.best_estimator_.get_params() 163 | for param_name in sorted(parameters.keys()): 164 | print ('\t%s: %r' % (param_name, best_parameters[param_name])) 165 | 166 | predictions = grid_search.predict(x_test) 167 | 168 | print ("Testing accuracy:",round(accuracy_score(y_test, predictions),4)) 169 | print ("\nComplete report of Testing data\n",classification_report(y_test, predictions)) 170 | print ("\n\nRandom Forest Grid Search- Test Confusion Matrix\n\n",pd.crosstab(y_test, predictions,rownames = ["Actuall"],colnames = ["Predicted"])) 171 | 172 | 173 | # Adaboost Classifier 174 | from sklearn.tree import DecisionTreeClassifier 175 | from sklearn.ensemble import AdaBoostClassifier 176 | dtree = DecisionTreeClassifier(criterion='gini',max_depth=1) 177 | 178 | adabst_fit = AdaBoostClassifier(base_estimator= dtree, 179 | n_estimators=5000,learning_rate=0.05,random_state=42) 180 | 181 | adabst_fit.fit(x_train, y_train) 182 | 183 | print ("\nAdaBoost - Train Confusion Matrix\n\n",pd.crosstab(y_train,adabst_fit.predict(x_train),rownames = ["Actuall"],colnames = ["Predicted"])) 184 | print ("\nAdaBoost - Train accuracy",round(accuracy_score(y_train,adabst_fit.predict(x_train)),3)) 185 | print ("\nAdaBoost - Train Classification Report\n",classification_report(y_train,adabst_fit.predict(x_train))) 186 | 187 | print ("\n\nAdaBoost - Test Confusion Matrix\n\n",pd.crosstab(y_test,adabst_fit.predict(x_test),rownames = ["Actuall"],colnames = ["Predicted"])) 188 | print ("\nAdaBoost - Test accuracy",round(accuracy_score(y_test,adabst_fit.predict(x_test)),3)) 189 | print ("\nAdaBoost - Test Classification Report\n",classification_report(y_test,adabst_fit.predict(x_test))) 190 | 191 | # Gradientboost Classifier 192 | from sklearn.ensemble import GradientBoostingClassifier 193 | 194 | gbc_fit = GradientBoostingClassifier(loss='deviance',learning_rate=0.05,n_estimators=5000, 195 | min_samples_split=2,min_samples_leaf=1,max_depth=1,random_state=42 ) 196 | gbc_fit.fit(x_train,y_train) 197 | 198 | print ("\nGradient Boost - Train Confusion Matrix\n\n",pd.crosstab(y_train,gbc_fit.predict(x_train),rownames = ["Actuall"],colnames = ["Predicted"])) 199 | print ("\nGradient Boost - Train accuracy",round(accuracy_score(y_train,gbc_fit.predict(x_train)),3)) 200 | print ("\nGradient Boost - Train Classification Report\n",classification_report(y_train,gbc_fit.predict(x_train))) 201 | 202 | print ("\n\nGradient Boost - Test Confusion Matrix\n\n",pd.crosstab(y_test,gbc_fit.predict(x_test),rownames = ["Actuall"],colnames = ["Predicted"])) 203 | print ("\nGradient Boost - Test accuracy",round(accuracy_score(y_test,gbc_fit.predict(x_test)),3)) 204 | print ("\nGradient Boost - Test Classification Report\n",classification_report(y_test,gbc_fit.predict(x_test))) 205 | 206 | 207 | # Xgboost Classifier 208 | import xgboost as xgb 209 | 210 | xgb_fit = xgb.XGBClassifier(max_depth=2, n_estimators=5000, learning_rate=0.05) 211 | xgb_fit.fit(x_train, y_train) 212 | 213 | print ("\nXGBoost - Train Confusion Matrix\n\n",pd.crosstab(y_train,xgb_fit.predict(x_train),rownames = ["Actuall"],colnames = ["Predicted"])) 214 | print ("\nXGBoost - Train 
accuracy",round(accuracy_score(y_train,xgb_fit.predict(x_train)),3)) 215 | print ("\nXGBoost - Train Classification Report\n",classification_report(y_train,xgb_fit.predict(x_train))) 216 | 217 | print ("\n\nXGBoost - Test Confusion Matrix\n\n",pd.crosstab(y_test,xgb_fit.predict(x_test),rownames = ["Actuall"],colnames = ["Predicted"])) 218 | print ("\nXGBoost - Test accuracy",round(accuracy_score(y_test,xgb_fit.predict(x_test)),3)) 219 | print ("\nXGBoost - Test Classification Report\n",classification_report(y_test,xgb_fit.predict(x_test))) 220 | 221 | 222 | #Ensemble of Ensembles - by fitting various classifiers 223 | clwght = {0:0.3,1:0.7} 224 | 225 | # Classifier 1 226 | from sklearn.linear_model import LogisticRegression 227 | clf1_logreg_fit = LogisticRegression(fit_intercept=True,class_weight=clwght) 228 | clf1_logreg_fit.fit(x_train,y_train) 229 | 230 | print ("\nLogistic Regression for Ensemble - Train Confusion Matrix\n\n",pd.crosstab(y_train,clf1_logreg_fit.predict(x_train),rownames = ["Actuall"],colnames = ["Predicted"])) 231 | print ("\nLogistic Regression for Ensemble - Train accuracy",round(accuracy_score(y_train,clf1_logreg_fit.predict(x_train)),3)) 232 | print ("\nLogistic Regression for Ensemble - Train Classification Report\n",classification_report(y_train,clf1_logreg_fit.predict(x_train))) 233 | 234 | print ("\n\nLogistic Regression for Ensemble - Test Confusion Matrix\n\n",pd.crosstab(y_test,clf1_logreg_fit.predict(x_test),rownames = ["Actuall"],colnames = ["Predicted"])) 235 | print ("\nLogistic Regression for Ensemble - Test accuracy",round(accuracy_score(y_test,clf1_logreg_fit.predict(x_test)),3)) 236 | print ("\nLogistic Regression for Ensemble - Test Classification Report\n",classification_report(y_test,clf1_logreg_fit.predict(x_test))) 237 | 238 | 239 | # Classifier 2 240 | from sklearn.tree import DecisionTreeClassifier 241 | clf2_dt_fit = DecisionTreeClassifier(criterion="gini",max_depth=5,min_samples_split=2, 242 | min_samples_leaf=1,random_state=42,class_weight=clwght) 243 | clf2_dt_fit.fit(x_train,y_train) 244 | 245 | print ("\nDecision Tree for Ensemble - Train Confusion Matrix\n\n",pd.crosstab(y_train,clf2_dt_fit.predict(x_train),rownames = ["Actuall"],colnames = ["Predicted"])) 246 | print ("\nDecision Tree for Ensemble - Train accuracy",round(accuracy_score(y_train,clf2_dt_fit.predict(x_train)),3)) 247 | print ("\nDecision Tree for Ensemble - Train Classification Report\n",classification_report(y_train,clf2_dt_fit.predict(x_train))) 248 | 249 | print ("\n\nDecision Tree for Ensemble - Test Confusion Matrix\n\n",pd.crosstab(y_test,clf2_dt_fit.predict(x_test),rownames = ["Actuall"],colnames = ["Predicted"])) 250 | print ("\nDecision Tree for Ensemble - Test accuracy",round(accuracy_score(y_test,clf2_dt_fit.predict(x_test)),3)) 251 | print ("\nDecision Tree for Ensemble - Test Classification Report\n",classification_report(y_test,clf2_dt_fit.predict(x_test))) 252 | 253 | 254 | # Classifier 3 255 | from sklearn.ensemble import RandomForestClassifier 256 | clf3_rf_fit = RandomForestClassifier(n_estimators=10000,criterion="gini",max_depth=6, 257 | min_samples_split=2,min_samples_leaf=1,class_weight = clwght) 258 | clf3_rf_fit.fit(x_train,y_train) 259 | 260 | print ("\nRandom Forest for Ensemble - Train Confusion Matrix\n\n",pd.crosstab(y_train,clf3_rf_fit.predict(x_train),rownames = ["Actuall"],colnames = ["Predicted"])) 261 | print ("\nRandom Forest for Ensemble - Train accuracy",round(accuracy_score(y_train,clf3_rf_fit.predict(x_train)),3)) 262 | print 
("\nRandom Forest for Ensemble - Train Classification Report\n",classification_report(y_train,clf3_rf_fit.predict(x_train))) 263 | 264 | print ("\n\nRandom Forest for Ensemble - Test Confusion Matrix\n\n",pd.crosstab(y_test,clf3_rf_fit.predict(x_test),rownames = ["Actuall"],colnames = ["Predicted"])) 265 | print ("\nRandom Forest for Ensemble - Test accuracy",round(accuracy_score(y_test,clf3_rf_fit.predict(x_test)),3)) 266 | print ("\nRandom Forest for Ensemble - Test Classification Report\n",classification_report(y_test,clf3_rf_fit.predict(x_test))) 267 | 268 | 269 | # Classifier 4 270 | from sklearn.ensemble import AdaBoostClassifier 271 | clf4_dtree = DecisionTreeClassifier(criterion='gini',max_depth=1,class_weight = clwght) 272 | clf4_adabst_fit = AdaBoostClassifier(base_estimator= clf4_dtree, 273 | n_estimators=5000,learning_rate=0.05,random_state=42) 274 | 275 | clf4_adabst_fit.fit(x_train, y_train) 276 | 277 | print ("\nAdaBoost for Ensemble - Train Confusion Matrix\n\n",pd.crosstab(y_train,clf4_adabst_fit.predict(x_train),rownames = ["Actuall"],colnames = ["Predicted"])) 278 | print ("\nAdaBoost for Ensemble - Train accuracy",round(accuracy_score(y_train,clf4_adabst_fit.predict(x_train)),3)) 279 | print ("\nAdaBoost for Ensemble - Train Classification Report\n",classification_report(y_train,clf4_adabst_fit.predict(x_train))) 280 | 281 | print ("\n\nAdaBoost for Ensemble - Test Confusion Matrix\n\n",pd.crosstab(y_test,clf4_adabst_fit.predict(x_test),rownames = ["Actuall"],colnames = ["Predicted"])) 282 | print ("\nAdaBoost for Ensemble - Test accuracy",round(accuracy_score(y_test,clf4_adabst_fit.predict(x_test)),3)) 283 | print ("\nAdaBoost for Ensemble - Test Classification Report\n",classification_report(y_test,clf4_adabst_fit.predict(x_test))) 284 | 285 | 286 | ensemble = pd.DataFrame() 287 | 288 | ensemble["log_output_one"] = pd.DataFrame(clf1_logreg_fit.predict_proba(x_train))[1] 289 | ensemble["dtr_output_one"] = pd.DataFrame(clf2_dt_fit.predict_proba(x_train))[1] 290 | ensemble["rf_output_one"] = pd.DataFrame(clf3_rf_fit.predict_proba(x_train))[1] 291 | ensemble["adb_output_one"] = pd.DataFrame(clf4_adabst_fit.predict_proba(x_train))[1] 292 | 293 | ensemble = pd.concat([ensemble,pd.DataFrame(y_train).reset_index(drop = True )],axis=1) 294 | 295 | # Fitting meta-classifier 296 | meta_logit_fit = LogisticRegression(fit_intercept=False) 297 | meta_logit_fit.fit(ensemble[['log_output_one','dtr_output_one','rf_output_one','adb_output_one']],ensemble['Attrition_ind']) 298 | 299 | coefs = meta_logit_fit.coef_ 300 | print ("Co-efficients for LR, DT, RF & AB are:",coefs) 301 | 302 | ensemble_test = pd.DataFrame() 303 | ensemble_test["log_output_one"] = pd.DataFrame(clf1_logreg_fit.predict_proba(x_test))[1] 304 | ensemble_test["dtr_output_one"] = pd.DataFrame(clf2_dt_fit.predict_proba(x_test))[1] 305 | ensemble_test["rf_output_one"] = pd.DataFrame(clf3_rf_fit.predict_proba(x_test))[1] 306 | ensemble_test["adb_output_one"] = pd.DataFrame(clf4_adabst_fit.predict_proba(x_test))[1] 307 | 308 | ensemble_test["all_one"] = meta_logit_fit.predict(ensemble_test[['log_output_one','dtr_output_one','rf_output_one','adb_output_one']]) 309 | 310 | ensemble_test = pd.concat([ensemble_test,pd.DataFrame(y_test).reset_index(drop = True )],axis=1) 311 | 312 | print ("\n\nEnsemble of Models - Test Confusion Matrix\n\n",pd.crosstab(ensemble_test['Attrition_ind'],ensemble_test['all_one'],rownames = ["Actuall"],colnames = ["Predicted"])) 313 | print ("\nEnsemble of Models - Test 
accuracy",round(accuracy_score(ensemble_test['Attrition_ind'],ensemble_test['all_one']),3)) 314 | print ("\nEnsemble of Models - Test Classification Report\n",classification_report(ensemble_test['Attrition_ind'],ensemble_test['all_one'])) 315 | 316 | 317 | 318 | 319 | # Ensemble of Ensembles - by applying bagging on simple classifier 320 | from sklearn.tree import DecisionTreeClassifier 321 | from sklearn.ensemble import BaggingClassifier 322 | from sklearn.ensemble import AdaBoostClassifier 323 | 324 | clwght = {0:0.3,1:0.7} 325 | 326 | eoe_dtree = DecisionTreeClassifier(criterion='gini',max_depth=1,class_weight = clwght) 327 | eoe_adabst_fit = AdaBoostClassifier(base_estimator= eoe_dtree, 328 | n_estimators=500,learning_rate=0.05,random_state=42) 329 | eoe_adabst_fit.fit(x_train, y_train) 330 | 331 | print ("\nAdaBoost - Train Confusion Matrix\n\n",pd.crosstab(y_train,eoe_adabst_fit.predict(x_train),rownames = ["Actuall"],colnames = ["Predicted"])) 332 | print ("\nAdaBoost - Train accuracy",round(accuracy_score(y_train,eoe_adabst_fit.predict(x_train)),3)) 333 | print ("\nAdaBoost - Train Classification Report\n",classification_report(y_train,eoe_adabst_fit.predict(x_train))) 334 | 335 | print ("\n\nAdaBoost - Test Confusion Matrix\n\n",pd.crosstab(y_test,eoe_adabst_fit.predict(x_test),rownames = ["Actuall"],colnames = ["Predicted"])) 336 | print ("\nAdaBoost - Test accuracy",round(accuracy_score(y_test,eoe_adabst_fit.predict(x_test)),3)) 337 | print ("\nAdaBoost - Test Classification Report\n",classification_report(y_test,eoe_adabst_fit.predict(x_test))) 338 | 339 | 340 | bag_fit = BaggingClassifier(base_estimator= eoe_adabst_fit,n_estimators=50, 341 | max_samples=1.0,max_features=1.0, 342 | bootstrap=True, 343 | bootstrap_features=False, 344 | n_jobs=-1, 345 | random_state=42) 346 | 347 | bag_fit.fit(x_train, y_train) 348 | 349 | print ("\nEnsemble of AdaBoost - Train Confusion Matrix\n\n",pd.crosstab(y_train,bag_fit.predict(x_train),rownames = ["Actuall"],colnames = ["Predicted"])) 350 | print ("\nEnsemble of AdaBoost - Train accuracy",round(accuracy_score(y_train,bag_fit.predict(x_train)),3)) 351 | print ("\nEnsemble of AdaBoost - Train Classification Report\n",classification_report(y_train,bag_fit.predict(x_train))) 352 | 353 | print ("\n\nEnsemble of AdaBoost - Test Confusion Matrix\n\n",pd.crosstab(y_test,bag_fit.predict(x_test),rownames = ["Actuall"],colnames = ["Predicted"])) 354 | print ("\nEnsemble of AdaBoost - Test accuracy",round(accuracy_score(y_test,bag_fit.predict(x_test)),3)) 355 | print ("\nEnsemble of AdaBoost - Test Classification Report\n",classification_report(y_test,bag_fit.predict(x_test))) 356 | 357 | 358 | 359 | 360 | 361 | 362 | 363 | 364 | -------------------------------------------------------------------------------- /Chapter05/Chapter 05_KNN n Naive Bayes.R: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Statistics-for-Machine-Learning/41e73f42da97c164859641c2add6e487cbc77402/Chapter05/Chapter 05_KNN n Naive Bayes.R -------------------------------------------------------------------------------- /Chapter05/Chapter 05_KNN n Naive Bayes.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | import os 5 | """ First change the following directory link to where all input files do exist """ 6 | os.chdir("D:\\Book writing\\Codes\\Chapter 5") 7 | 8 | 9 | 10 | import numpy as np 11 | import pandas as pd 12 | 13 | # KNN Curse of 
Dimensionality 14 | import random,math 15 | 16 | def random_point_gen(dimension): 17 | return [random.random() for _ in range(dimension)] 18 | 19 | def distance(v,w): 20 | vec_sub = [v_i-w_i for v_i,w_i in zip(v,w)] 21 | sum_of_sqrs = sum(v_i*v_i for v_i in vec_sub) 22 | return math.sqrt(sum_of_sqrs) 23 | 24 | def random_distances_comparison(dimension,number_pairs): 25 | return [distance(random_point_gen(dimension),random_point_gen(dimension)) 26 | for _ in range(number_pairs)] 27 | 28 | def mean(x): 29 | return sum(x) / len(x) 30 | 31 | dimensions = range(1, 201, 5) 32 | 33 | avg_distances = [] 34 | min_distances = [] 35 | 36 | 37 | dummyarray = np.empty((20,4)) 38 | dist_vals = pd.DataFrame(dummyarray) 39 | dist_vals.columns = ["Dimension","Min_Distance","Avg_Distance","Min/Avg_Distance"] 40 | 41 | random.seed(34) 42 | i = 0 43 | for dims in dimensions: 44 | distances = random_distances_comparison(dims, 1000) 45 | avg_distances.append(mean(distances)) 46 | min_distances.append(min(distances)) 47 | 48 | dist_vals.loc[i,"Dimension"] = dims 49 | dist_vals.loc[i,"Min_Distance"] = min(distances) 50 | dist_vals.loc[i,"Avg_Distance"] = mean(distances) 51 | dist_vals.loc[i,"Min/Avg_Distance"] = min(distances)/mean(distances) 52 | 53 | print(dims, min(distances), mean(distances), min(distances)*1.0 / mean(distances)) 54 | i = i+1 55 | 56 | # Ploting Average distances for Various Dimensions 57 | import matplotlib.pyplot as plt 58 | plt.figure() 59 | #plt.title('Avg. Distance Change with Number of Dimensions for 1K Obs') 60 | plt.xlabel('Dimensions') 61 | plt.ylabel('Avg. Distance') 62 | plt.plot(dist_vals["Dimension"],dist_vals["Avg_Distance"]) 63 | plt.legend(loc='best') 64 | plt.show() 65 | 66 | 67 | 68 | # 1-Dimension Plot 69 | import numpy as np 70 | import pandas as pd 71 | import matplotlib.pyplot as plt 72 | 73 | one_d_data = np.random.rand(60,1) 74 | one_d_data_df = pd.DataFrame(one_d_data) 75 | one_d_data_df.columns = ["1D_Data"] 76 | one_d_data_df["height"] = 1 77 | 78 | plt.figure() 79 | plt.scatter(one_d_data_df['1D_Data'],one_d_data_df["height"]) 80 | plt.yticks([]) 81 | plt.xlabel("1-D points") 82 | plt.show() 83 | 84 | # 2- Dimensions Plot 85 | two_d_data = np.random.rand(60,2) 86 | two_d_data_df = pd.DataFrame(two_d_data) 87 | two_d_data_df.columns = ["x_axis","y_axis"] 88 | 89 | plt.figure() 90 | plt.scatter(two_d_data_df['x_axis'],two_d_data_df["y_axis"]) 91 | plt.xlabel("x_axis");plt.ylabel("y_axis") 92 | plt.show() 93 | 94 | # 3- Dimensions Plot 95 | three_d_data = np.random.rand(60,3) 96 | three_d_data_df = pd.DataFrame(three_d_data) 97 | three_d_data_df.columns = ["x_axis","y_axis","z_axis"] 98 | 99 | from mpl_toolkits.mplot3d import Axes3D 100 | fig = plt.figure() 101 | ax = fig.add_subplot(111, projection='3d') 102 | ax.scatter(three_d_data_df['x_axis'],three_d_data_df["y_axis"],three_d_data_df["z_axis"]) 103 | ax.set_xlabel('x_axis') 104 | ax.set_ylabel('y_axis') 105 | ax.set_zlabel('z_axis') 106 | plt.show() 107 | 108 | 109 | 110 | 111 | # KNN CLassifier - Breast Cancer 112 | import numpy as np 113 | import pandas as pd 114 | from sklearn.metrics import accuracy_score,classification_report 115 | 116 | breast_cancer = pd.read_csv("Breast_Cancer_Wisconsin.csv") 117 | 118 | print (breast_cancer.head()) 119 | 120 | breast_cancer['Bare_Nuclei'] = breast_cancer['Bare_Nuclei'].replace('?', np.NAN) 121 | breast_cancer['Bare_Nuclei'] = breast_cancer['Bare_Nuclei'].fillna(breast_cancer['Bare_Nuclei'].value_counts().index[0]) 122 | 123 | breast_cancer['Cancer_Ind'] = 0 124 | 
breast_cancer.loc[breast_cancer['Class']==4,'Cancer_Ind'] = 1 125 | 126 | x_vars = breast_cancer.drop(['ID_Number','Class','Cancer_Ind'],axis=1) 127 | y_var = breast_cancer['Cancer_Ind'] 128 | 129 | 130 | from sklearn.preprocessing import StandardScaler 131 | x_vars_stdscle = StandardScaler().fit_transform(x_vars.values) 132 | from sklearn.model_selection import train_test_split 133 | x_vars_stdscle_df = pd.DataFrame(x_vars_stdscle, index=x_vars.index, columns=x_vars.columns) 134 | x_train,x_test,y_train,y_test = train_test_split(x_vars_stdscle_df,y_var,train_size = 0.7,random_state=42) 135 | 136 | from sklearn.neighbors import KNeighborsClassifier 137 | knn_fit = KNeighborsClassifier(n_neighbors=3,p=2,metric='minkowski') 138 | knn_fit.fit(x_train,y_train) 139 | 140 | print ("\nK-Nearest Neighbors - Train Confusion Matrix\n\n",pd.crosstab(y_train,knn_fit.predict(x_train),rownames = ["Actuall"],colnames = ["Predicted"]) ) 141 | print ("\nK-Nearest Neighbors - Train accuracy:",round(accuracy_score(y_train,knn_fit.predict(x_train)),3)) 142 | print ("\nK-Nearest Neighbors - Train Classification Report\n",classification_report(y_train,knn_fit.predict(x_train))) 143 | 144 | print ("\n\nK-Nearest Neighbors - Test Confusion Matrix\n\n",pd.crosstab(y_test,knn_fit.predict(x_test),rownames = ["Actuall"],colnames = ["Predicted"])) 145 | print ("\nK-Nearest Neighbors - Test accuracy:",round(accuracy_score(y_test,knn_fit.predict(x_test)),3)) 146 | print ("\nK-Nearest Neighbors - Test Classification Report\n",classification_report(y_test,knn_fit.predict(x_test))) 147 | 148 | 149 | # Tuning of K- value for Train & Test data 150 | dummyarray = np.empty((5,3)) 151 | k_valchart = pd.DataFrame(dummyarray) 152 | k_valchart.columns = ["K_value","Train_acc","Test_acc"] 153 | 154 | k_vals = [1,2,3,4,5] 155 | for i in range(len(k_vals)): 156 | knn_fit = KNeighborsClassifier(n_neighbors=k_vals[i],p=2,metric='minkowski') 157 | knn_fit.fit(x_train,y_train) 158 | 159 | print ("\nK-value",k_vals[i]) 160 | 161 | tr_accscore = round(accuracy_score(y_train,knn_fit.predict(x_train)),3) 162 | print ("\nK-Nearest Neighbors - Train Confusion Matrix\n\n",pd.crosstab(y_train,knn_fit.predict(x_train),rownames = ["Actuall"],colnames = ["Predicted"]) ) 163 | print ("\nK-Nearest Neighbors - Train accuracy:",tr_accscore) 164 | print ("\nK-Nearest Neighbors - Train Classification Report\n",classification_report(y_train,knn_fit.predict(x_train))) 165 | 166 | ts_accscore = round(accuracy_score(y_test,knn_fit.predict(x_test)),3) 167 | print ("\n\nK-Nearest Neighbors - Test Confusion Matrix\n\n",pd.crosstab(y_test,knn_fit.predict(x_test),rownames = ["Actuall"],colnames = ["Predicted"])) 168 | print ("\nK-Nearest Neighbors - Test accuracy:",ts_accscore) 169 | print ("\nK-Nearest Neighbors - Test Classification Report\n",classification_report(y_test,knn_fit.predict(x_test))) 170 | 171 | k_valchart.loc[i, 'K_value'] = k_vals[i] 172 | k_valchart.loc[i, 'Train_acc'] = tr_accscore 173 | k_valchart.loc[i, 'Test_acc'] = ts_accscore 174 | 175 | 176 | # Ploting accuracies over varied K-values 177 | import matplotlib.pyplot as plt 178 | plt.figure() 179 | #plt.title('KNN Train & Test Accuracy change with K-value') 180 | 181 | plt.xlabel('K-value') 182 | plt.ylabel('Accuracy') 183 | plt.plot(k_valchart["K_value"],k_valchart["Train_acc"]) 184 | plt.plot(k_valchart["K_value"],k_valchart["Test_acc"]) 185 | 186 | plt.axis([0.9,5, 0.92, 1.005]) 187 | plt.xticks([1,2,3,4,5]) 188 | 189 | for a,b in zip(k_valchart["K_value"],k_valchart["Train_acc"]): 190 | 
plt.text(a, b, str(b),fontsize=10) 191 | 192 | for a,b in zip(k_valchart["K_value"],k_valchart["Test_acc"]): 193 | plt.text(a, b, str(b),fontsize=10) 194 | 195 | plt.legend(loc='upper right') 196 | 197 | plt.show() 198 | 199 | 200 | 201 | 202 | 203 | # Naive Bayes using NLP 204 | 205 | # USe following code if it wont work in first place with UTF-8 code error 206 | 207 | # import sys 208 | # reload(sys) 209 | # sys.setdefaultencoding('utf-8') 210 | 211 | import csv 212 | 213 | smsdata = open('SMSSpamCollection.txt','r') 214 | csv_reader = csv.reader(smsdata,delimiter='\t') 215 | 216 | smsdata_data = [] 217 | smsdata_labels = [] 218 | 219 | for line in csv_reader: 220 | smsdata_labels.append(line[0]) 221 | smsdata_data.append(line[1]) 222 | 223 | smsdata.close() 224 | 225 | # Printing top 5 lines 226 | for i in range(5): 227 | print (smsdata_data[i],smsdata_labels[i]) 228 | 229 | # Printing Spam & Ham count 230 | from collections import Counter 231 | c = Counter( smsdata_labels ) 232 | print(c) 233 | 234 | 235 | import nltk 236 | from nltk.corpus import stopwords 237 | from nltk.stem import WordNetLemmatizer 238 | import string 239 | import pandas as pd 240 | from nltk import pos_tag 241 | from nltk.stem import PorterStemmer 242 | 243 | def preprocessing(text): 244 | text2 = " ".join("".join([" " if ch in string.punctuation else ch for ch in text]).split()) 245 | 246 | tokens = [word for sent in nltk.sent_tokenize(text2) for word in 247 | nltk.word_tokenize(sent)] 248 | 249 | tokens = [word.lower() for word in tokens] 250 | 251 | stopwds = stopwords.words('english') 252 | tokens = [token for token in tokens if token not in stopwds] 253 | 254 | tokens = [word for word in tokens if len(word)>=3] 255 | 256 | stemmer = PorterStemmer() 257 | tokens = [stemmer.stem(word) for word in tokens] 258 | 259 | tagged_corpus = pos_tag(tokens) 260 | 261 | Noun_tags = ['NN','NNP','NNPS','NNS'] 262 | Verb_tags = ['VB','VBD','VBG','VBN','VBP','VBZ'] 263 | 264 | lemmatizer = WordNetLemmatizer() 265 | 266 | def prat_lemmatize(token,tag): 267 | if tag in Noun_tags: 268 | return lemmatizer.lemmatize(token,'n') 269 | elif tag in Verb_tags: 270 | return lemmatizer.lemmatize(token,'v') 271 | else: 272 | return lemmatizer.lemmatize(token,'n') 273 | 274 | pre_proc_text = " ".join([prat_lemmatize(token,tag) for token,tag in tagged_corpus]) 275 | 276 | return pre_proc_text 277 | 278 | 279 | smsdata_data_2 = [] 280 | 281 | for i in smsdata_data: 282 | smsdata_data_2.append(preprocessing(i)) 283 | 284 | 285 | import numpy as np 286 | 287 | 288 | trainset_size = int(round(len(smsdata_data_2)*0.70)) 289 | 290 | 291 | print ('The training set size for this classifier is ' + str(trainset_size) + '\n') 292 | 293 | x_train = np.array([''.join(rec) for rec in smsdata_data_2[0:trainset_size]]) 294 | y_train = np.array([rec for rec in smsdata_labels[0:trainset_size]]) 295 | x_test = np.array([''.join(rec) for rec in smsdata_data_2[trainset_size+1:len(smsdata_data_2)]]) 296 | y_test = np.array([rec for rec in smsdata_labels[trainset_size+1:len(smsdata_labels)]]) 297 | 298 | 299 | # building TFIDF vectorizer 300 | from sklearn.feature_extraction.text import TfidfVectorizer 301 | 302 | vectorizer = TfidfVectorizer(min_df=2, ngram_range=(1, 2), stop_words='english', 303 | max_features= 4000,strip_accents='unicode', norm='l2') 304 | 305 | x_train_2 = vectorizer.fit_transform(x_train).todense() 306 | x_test_2 = vectorizer.transform(x_test).todense() 307 | 308 | from sklearn.naive_bayes import MultinomialNB 309 | clf = 
MultinomialNB().fit(x_train_2, y_train) 310 | 311 | ytrain_nb_predicted = clf.predict(x_train_2) 312 | ytest_nb_predicted = clf.predict(x_test_2) 313 | 314 | from sklearn.metrics import classification_report,accuracy_score 315 | 316 | print ("\nNaive Bayes - Train Confusion Matrix\n\n",pd.crosstab(y_train,ytrain_nb_predicted,rownames = ["Actuall"],colnames = ["Predicted"])) 317 | print ("\nNaive Bayes- Train accuracy",round(accuracy_score(y_train,ytrain_nb_predicted),3)) 318 | print ("\nNaive Bayes - Train Classification Report\n",classification_report(y_train,ytrain_nb_predicted)) 319 | 320 | print ("\nNaive Bayes - Test Confusion Matrix\n\n",pd.crosstab(y_test,ytest_nb_predicted,rownames = ["Actuall"],colnames = ["Predicted"])) 321 | print ("\nNaive Bayes- Test accuracy",round(accuracy_score(y_test,ytest_nb_predicted),3)) 322 | print ("\nNaive Bayes - Test Classification Report\n",classification_report(y_test,ytest_nb_predicted)) 323 | 324 | 325 | # printing top features 326 | feature_names = vectorizer.get_feature_names() 327 | coefs = clf.coef_ 328 | intercept = clf.intercept_ 329 | coefs_with_fns = sorted(zip(clf.coef_[0], feature_names)) 330 | 331 | print ("\n\nTop 10 features - both first & last\n") 332 | n=10 333 | top_n_coefs = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1]) 334 | for (coef_1, fn_1), (coef_2, fn_2) in top_n_coefs: 335 | print('\t%.4f\t%-15s\t\t%.4f\t%-15s' % (coef_1, fn_1, coef_2, fn_2)) 336 | 337 | 338 | -------------------------------------------------------------------------------- /Chapter06/Chapter 06_SVM_n_NN.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | rm(list = ls()) 7 | 8 | # First change the following directory link to where all the input files do exist 9 | setwd("D:\\Book writing\\Codes\\Chapter 6") 10 | 11 | letter_data = read.csv("letterdata.csv") 12 | 13 | set.seed(123) 14 | numrow = nrow(letter_data) 15 | trnind = sample(1:numrow,size = as.integer(0.7*numrow)) 16 | train_data = letter_data[trnind,] 17 | test_data = letter_data[-trnind,] 18 | 19 | 20 | library(e1071) 21 | 22 | accrcy <- function(matrx){ 23 | return( sum(diag(matrx)/sum(matrx)))} 24 | 25 | precsn <- function(matrx){ 26 | return(diag(matrx) / rowSums(matrx)) 27 | } 28 | 29 | recll <- function(matrx){ 30 | return(diag(matrx) / colSums(matrx)) 31 | } 32 | 33 | 34 | 35 | 36 | # SVM - Linear Kernel 37 | svm_fit = svm(letter~.,data = train_data,kernel="linear",cost=1.0,scale = TRUE) 38 | 39 | tr_y_pred = predict(svm_fit, train_data) 40 | ts_y_pred = predict(svm_fit,test_data) 41 | 42 | tr_y_act = train_data$letter;ts_y_act = test_data$letter 43 | 44 | tr_tble = table(tr_y_act,tr_y_pred) 45 | print(paste("Train Confusion Matrix")) 46 | print(tr_tble) 47 | tr_acc = accrcy(tr_tble) 48 | print(paste("SVM Linear Kernel Train accuracy:",round(tr_acc,4))) 49 | 50 | tr_prec = precsn(tr_tble) 51 | print(paste("SVM Linear Kernel Train Precision:")) 52 | print(tr_prec) 53 | 54 | tr_rcl = recll(tr_tble) 55 | print(paste("SVM Linear Kernel Train Recall:")) 56 | print(tr_rcl) 57 | 58 | ts_tble = table(ts_y_act,ts_y_pred) 59 | print(paste("Test Confusion Matrix")) 60 | print(ts_tble) 61 | 62 | ts_acc = accrcy(ts_tble) 63 | print(paste("SVM Linear Kernel Test accuracy:",round(ts_acc,4))) 64 | 65 | ts_prec = precsn(ts_tble) 66 | print(paste("SVM Linear Kernel Test Precision:")) 67 | print(ts_prec) 68 | 69 | ts_rcl = recll(ts_tble) 70 | print(paste("SVM Linear Kernel Test Recall:")) 71 | print(ts_rcl) 72 | 73 | 74 | # SVM - Polynomial 
Kernel 75 | svm_poly_fit = svm(letter~.,data = train_data,kernel="poly",cost=1.0,degree = 2 ,scale = TRUE) 76 | 77 | tr_y_pred = predict(svm_poly_fit, train_data) 78 | ts_y_pred = predict(svm_poly_fit,test_data) 79 | 80 | tr_y_act = train_data$letter;ts_y_act = test_data$letter 81 | 82 | 83 | tr_tble = table(tr_y_act,tr_y_pred) 84 | print(paste("Train Confusion Matrix")) 85 | print(tr_tble) 86 | tr_acc = accrcy(tr_tble) 87 | print(paste("SVM Polynomial Kernel Train accuracy:",round(tr_acc,4))) 88 | 89 | tr_prec = precsn(tr_tble) 90 | print(paste("SVM Polynomial Kernel Train Precision:")) 91 | print(tr_prec) 92 | 93 | tr_rcl = recll(tr_tble) 94 | print(paste("SVM Polynomial Kernel Train Recall:")) 95 | print(tr_rcl) 96 | 97 | ts_tble = table(ts_y_act,ts_y_pred) 98 | print(paste("Test Confusion Matrix")) 99 | print(ts_tble) 100 | 101 | ts_acc = accrcy(ts_tble) 102 | print(paste("SVM Polynomial Kernel Test accuracy:",round(ts_acc,4))) 103 | 104 | ts_prec = precsn(ts_tble) 105 | print(paste("SVM Polynomial Kernel Test Precision:")) 106 | print(ts_prec) 107 | 108 | ts_rcl = recll(ts_tble) 109 | print(paste("SVM Polynomial Kernel Test Recall:")) 110 | print(ts_rcl) 111 | 112 | 113 | 114 | # SVM - RBF Kernel 115 | svm_rbf_fit = svm(letter~.,data = train_data,kernel="radial",cost=1.0,gamma = 0.2 ,scale = TRUE) 116 | 117 | tr_y_pred = predict(svm_rbf_fit, train_data) 118 | ts_y_pred = predict(svm_rbf_fit,test_data) 119 | 120 | tr_y_act = train_data$letter;ts_y_act = test_data$letter 121 | 122 | tr_tble = table(tr_y_act,tr_y_pred) 123 | print(paste("Train Confusion Matrix")) 124 | print(tr_tble) 125 | tr_acc = accrcy(tr_tble) 126 | print(paste("SVM RBF Kernel Train accuracy:",round(tr_acc,4))) 127 | 128 | tr_prec = precsn(tr_tble) 129 | print(paste("SVM RBF Kernel Train Precision:")) 130 | print(tr_prec) 131 | 132 | tr_rcl = recll(tr_tble) 133 | print(paste("SVM RBF Kernel Train Recall:")) 134 | print(tr_rcl) 135 | 136 | ts_tble = table(ts_y_act,ts_y_pred) 137 | print(paste("Test Confusion Matrix")) 138 | print(ts_tble) 139 | 140 | ts_acc = accrcy(ts_tble) 141 | print(paste("SVM RBF Kernel Test accuracy:",round(ts_acc,4))) 142 | 143 | ts_prec = precsn(ts_tble) 144 | print(paste("SVM RBF Kernel Test Precision:")) 145 | print(ts_prec) 146 | 147 | ts_rcl = recll(ts_tble) 148 | print(paste("SVM RBF Kernel Test Recall:")) 149 | print(ts_rcl) 150 | 151 | 152 | 153 | # Grid search - RBF Kernel 154 | library(e1071) 155 | svm_rbf_grid = tune(svm,letter~.,data = train_data,kernel="radial",scale=TRUE,ranges = list( 156 | cost = c(0.1,0.3,1,3,10,30), 157 | gamma = c(0.001,0.01,0.1,0.3,1) 158 | 159 | ), 160 | tunecontrol = tune.control(cross = 5) 161 | ) 162 | 163 | print(paste("Best parameter from Grid Search")) 164 | print(summary(svm_rbf_grid)) 165 | 166 | best_model = svm_rbf_grid$best.model 167 | 168 | tr_y_pred = predict(best_model,data = train_data,type = "response") 169 | ts_y_pred = predict(best_model,newdata = test_data,type = "response") 170 | 171 | tr_y_act = train_data$letter;ts_y_act = test_data$letter 172 | 173 | 174 | tr_tble = table(tr_y_act,tr_y_pred) 175 | print(paste("Train Confusion Matrix")) 176 | print(tr_tble) 177 | tr_acc = accrcy(tr_tble) 178 | print(paste("SVM RBF Kernel Train accuracy:",round(tr_acc,4))) 179 | 180 | tr_prec = precsn(tr_tble) 181 | print(paste("SVM RBF Kernel Train Precision:")) 182 | print(tr_prec) 183 | 184 | tr_rcl = recll(tr_tble) 185 | print(paste("SVM RBF Kernel Train Recall:")) 186 | print(tr_rcl) 187 | 188 | ts_tble = table(ts_y_act,ts_y_pred) 189 | 
print(paste("Test Confusion Matrix")) 190 | print(ts_tble) 191 | 192 | ts_acc = accrcy(ts_tble) 193 | print(paste("SVM RBF Kernel Test accuracy:",round(ts_acc,4))) 194 | 195 | ts_prec = precsn(ts_tble) 196 | print(paste("SVM RBF Kernel Test Precision:")) 197 | print(ts_prec) 198 | 199 | ts_rcl = recll(ts_tble) 200 | print(paste("SVM RBF Kernel Test Recall:")) 201 | print(ts_rcl) 202 | 203 | 204 | 205 | 206 | # Artificial Neural Networks 207 | setwd("D:\\Book writing\\Codes\\Chapter 6") 208 | digits_data = read.csv("digitsdata.csv") 209 | 210 | remove_cols = c("target") 211 | x_data = digits_data[,!(names(digits_data) %in% remove_cols)] 212 | y_data = digits_data[,c("target")] 213 | 214 | 215 | normalize <- function(x) {return((x - min(x)) / (max(x) - min(x)))} 216 | 217 | 218 | data_norm <- as.data.frame(lapply(x_data, normalize)) 219 | data_norm <- replace(data_norm, is.na(data_norm), 0.0) 220 | data_norm_v2 = data.frame(as.factor(y_data),data_norm) 221 | names(data_norm_v2)[1] = "target" 222 | 223 | 224 | set.seed(123) 225 | numrow = nrow(data_norm_v2) 226 | trnind = sample(1:numrow,size = as.integer(0.7*numrow)) 227 | train_data = data_norm_v2[trnind,] 228 | test_data = data_norm_v2[-trnind,] 229 | 230 | f <- as.formula(paste("target ~", paste(names(train_data)[!names(train_data) %in% "target"], collapse = " + "))) 231 | 232 | library(nnet) 233 | accuracy <- function(mat){return(sum(diag(mat)) / sum(mat))} 234 | 235 | nnet_fit = nnet(f,train_data,size=c(9),maxit=200) 236 | y_pred = predict(nnet_fit,newdata = test_data,type = "class") 237 | tble = table(test_data$target,y_pred) 238 | print(accuracy(tble)) 239 | 240 | 241 | #Plotting nnet from the github packages 242 | require(RCurl) 243 | root.url<-'https://gist.githubusercontent.com/fawda123' 244 | raw.fun<-paste( 245 | root.url, 246 | '5086859/raw/cc1544804d5027d82b70e74b83b3941cd2184354/nnet_plot_fun.r', 247 | sep='/') 248 | script<-getURL(raw.fun, ssl.verifypeer = FALSE) 249 | eval(parse(text = script)) 250 | rm('script','raw.fun') 251 | 252 | # Ploting the neural net 253 | plot(nnet_fit) 254 | 255 | 256 | # Grid Search - ANN 257 | neurons = c(1,2,3,4,5,6,7,8,9,10,11,12,13) 258 | iters = c(200,300,400,500,600,700,800,900) 259 | 260 | initacc = 0 261 | 262 | for(itr in iters){ 263 | for(nd in neurons){ 264 | nnet_fit = nnet(f,train_data,size=c(nd),maxit=itr,trace=FALSE) 265 | y_pred = predict(nnet_fit,newdata = test_data,type = "class") 266 | tble = table(test_data$target,y_pred) 267 | acc = accuracy(tble) 268 | 269 | if (acc>initacc){ 270 | print(paste("Neurons",nd,"Iterations",itr,"Test accuracy",acc)) 271 | initacc = acc 272 | } 273 | 274 | } 275 | } 276 | 277 | 278 | 279 | -------------------------------------------------------------------------------- /Chapter06/Chapter 06_SVM_n_NN.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import os 4 | """ First change the following directory link to where all input files do exist """ 5 | os.chdir("D:\\Book writing\\Codes\\Chapter 6") 6 | 7 | 8 | 9 | import pandas as pd 10 | letterdata = pd.read_csv("letterdata.csv") 11 | print (letterdata.head()) 12 | 13 | x_vars = letterdata.drop(['letter'],axis=1) 14 | y_var = letterdata["letter"] 15 | 16 | y_var = y_var.replace({'A':1,'B':2,'C':3,'D':4,'E':5,'F':6,'G':7,'H':8,'I':9,'J':10, 17 | 'K':11,'L':12,'M':13,'N':14,'O':15,'P':16,'Q':17,'R':18,'S':19,'T':20, 18 | 'U':21,'V':22,'W':23,'X':24,'Y':25,'Z':26}) 19 | 20 | from sklearn.metrics import accuracy_score,classification_report 21 | from 
sklearn.model_selection import train_test_split 22 | x_train,x_test,y_train,y_test = train_test_split(x_vars,y_var,train_size = 0.7,random_state=42) 23 | 24 | 25 | # Linear Classifier 26 | from sklearn.svm import SVC 27 | svm_fit = SVC(kernel='linear',C=1.0,random_state=43) 28 | svm_fit.fit(x_train,y_train) 29 | 30 | print ("\nSVM Linear Classifier - Train Confusion Matrix\n\n",pd.crosstab(y_train,svm_fit.predict(x_train),rownames = ["Actuall"],colnames = ["Predicted"]) ) 31 | print ("\nSVM Linear Classifier - Train accuracy:",round(accuracy_score(y_train,svm_fit.predict(x_train)),3)) 32 | print ("\nSVM Linear Classifier - Train Classification Report\n",classification_report(y_train,svm_fit.predict(x_train))) 33 | 34 | print ("\n\nSVM Linear Classifier - Test Confusion Matrix\n\n",pd.crosstab(y_test,svm_fit.predict(x_test),rownames = ["Actuall"],colnames = ["Predicted"])) 35 | print ("\nSVM Linear Classifier - Test accuracy:",round(accuracy_score(y_test,svm_fit.predict(x_test)),3)) 36 | print ("\nSVM Linear Classifier - Test Classification Report\n",classification_report(y_test,svm_fit.predict(x_test))) 37 | 38 | 39 | #Polynomial Kernel 40 | svm_poly_fit = SVC(kernel='poly',C=1.0,degree=2) 41 | svm_poly_fit.fit(x_train,y_train) 42 | 43 | print ("\nSVM Polynomial Kernel Classifier - Train Confusion Matrix\n\n",pd.crosstab(y_train,svm_poly_fit.predict(x_train),rownames = ["Actuall"],colnames = ["Predicted"]) ) 44 | print ("\nSVM Polynomial Kernel Classifier - Train accuracy:",round(accuracy_score(y_train,svm_poly_fit.predict(x_train)),3)) 45 | print ("\nSVM Polynomial Kernel Classifier - Train Classification Report\n",classification_report(y_train,svm_poly_fit.predict(x_train))) 46 | 47 | print ("\n\nSVM Polynomial Kernel Classifier - Test Confusion Matrix\n\n",pd.crosstab(y_test,svm_poly_fit.predict(x_test),rownames = ["Actuall"],colnames = ["Predicted"])) 48 | print ("\nSVM Polynomial Kernel Classifier - Test accuracy:",round(accuracy_score(y_test,svm_poly_fit.predict(x_test)),3)) 49 | print ("\nSVM Polynomial Kernel Classifier - Test Classification Report\n",classification_report(y_test,svm_poly_fit.predict(x_test))) 50 | 51 | 52 | #RBF Kernel 53 | svm_rbf_fit = SVC(kernel='rbf',C=1.0, gamma=0.1) 54 | svm_rbf_fit.fit(x_train,y_train) 55 | 56 | print ("\nSVM RBF Kernel Classifier - Train Confusion Matrix\n\n",pd.crosstab(y_train,svm_rbf_fit.predict(x_train),rownames = ["Actuall"],colnames = ["Predicted"]) ) 57 | print ("\nSVM RBF Kernel Classifier - Train accuracy:",round(accuracy_score(y_train,svm_rbf_fit.predict(x_train)),3)) 58 | print ("\nSVM RBF Kernel Classifier - Train Classification Report\n",classification_report(y_train,svm_rbf_fit.predict(x_train))) 59 | 60 | print ("\n\nSVM RBF Kernel Classifier - Test Confusion Matrix\n\n",pd.crosstab(y_test,svm_rbf_fit.predict(x_test),rownames = ["Actuall"],colnames = ["Predicted"])) 61 | print ("\nSVM RBF Kernel Classifier - Test accuracy:",round(accuracy_score(y_test,svm_rbf_fit.predict(x_test)),3)) 62 | print ("\nSVM RBF Kernel Classifier - Test Classification Report\n",classification_report(y_test,svm_rbf_fit.predict(x_test))) 63 | 64 | 65 | 66 | # Grid Search - RBF Kernel 67 | from sklearn.pipeline import Pipeline 68 | from sklearn.model_selection import train_test_split,GridSearchCV 69 | 70 | pipeline = Pipeline([('clf',SVC(kernel='rbf',C=1,gamma=0.1 ))]) 71 | 72 | parameters = {'clf__C':(0.1,0.3,1,3,10,30), 73 | 'clf__gamma':(0.001,0.01,0.1,0.3,1)} 74 | 75 | grid_search_rbf = 
GridSearchCV(pipeline,parameters,n_jobs=-1,cv=5,verbose=1,scoring='accuracy') 76 | grid_search_rbf.fit(x_train,y_train) 77 | 78 | 79 | print ('RBF Kernel Grid Search Best Training score: %0.3f' % grid_search_rbf.best_score_) 80 | print ('RBF Kernel Grid Search Best parameters set:') 81 | best_parameters = grid_search_rbf.best_estimator_.get_params() 82 | for param_name in sorted(parameters.keys()): 83 | print ('\t%s: %r' % (param_name, best_parameters[param_name])) 84 | 85 | predictions = grid_search_rbf.predict(x_test) 86 | 87 | print ("\nRBF Kernel Grid Search - Testing accuracy:",round(accuracy_score(y_test, predictions),4)) 88 | print ("\nRBF Kernel Grid Search - Test Classification Report\n",classification_report(y_test, predictions)) 89 | print ("\n\nRBF Kernel Grid Search- Test Confusion Matrix\n\n",pd.crosstab(y_test, predictions,rownames = ["Actuall"],colnames = ["Predicted"])) 90 | 91 | 92 | 93 | 94 | 95 | 96 | # Neural Networks - Classifying hand-written digits using Scikit Learn 97 | 98 | import pandas as pd 99 | from sklearn.datasets import load_digits 100 | from sklearn.model_selection import train_test_split 101 | from sklearn.pipeline import Pipeline 102 | from sklearn.preprocessing import StandardScaler 103 | 104 | from sklearn.neural_network import MLPClassifier 105 | digits = load_digits() 106 | X = digits.data 107 | y = digits.target 108 | 109 | 110 | # Checking dimensions 111 | print (X.shape) 112 | print (y.shape) 113 | 114 | # Plotting first digit 115 | import matplotlib.pyplot as plt 116 | plt.matshow(digits.images[0]) 117 | plt.show() 118 | 119 | #X_df = pd.DataFrame(X) 120 | #y_df = pd.DataFrame(y) 121 | #y_df.columns = ['target'] 122 | #digitdata = pd.concat([y_df,X_df],axis=1) 123 | #digitdata.to_csv("digitsdata.csv",index= False) 124 | 125 | from sklearn.model_selection import train_test_split 126 | x_vars_stdscle = StandardScaler().fit_transform(X) 127 | x_train,x_test,y_train,y_test = train_test_split(x_vars_stdscle,y,train_size = 0.7,random_state=42) 128 | 129 | 130 | # Grid Search - Neural Network 131 | from sklearn.pipeline import Pipeline 132 | from sklearn.model_selection import train_test_split,GridSearchCV 133 | from sklearn.metrics import accuracy_score,classification_report 134 | 135 | pipeline = Pipeline([('mlp',MLPClassifier(hidden_layer_sizes= (100,50,),activation='relu', 136 | solver='adam',alpha=0.0001,max_iter=300 )) ]) 137 | 138 | parameters = {'mlp__alpha':(0.001,0.01,0.1,0.3,0.5,1.0), 139 | 'mlp__max_iter':(100,200,300)} 140 | 141 | grid_search_nn = GridSearchCV(pipeline,parameters,n_jobs=-1,cv=5,verbose=1,scoring='accuracy') 142 | grid_search_nn.fit(x_train,y_train) 143 | 144 | print ('\n\nNeural Network Best Training score: %0.3f' % grid_search_nn.best_score_) 145 | print ('\nNeural Network Best parameters set:') 146 | best_parameters = grid_search_nn.best_estimator_.get_params() 147 | for param_name in sorted(parameters.keys()): 148 | print ('\t%s: %r' % (param_name, best_parameters[param_name])) 149 | 150 | predictions_train = grid_search_nn.predict(x_train) 151 | predictions_test = grid_search_nn.predict(x_test) 152 | 153 | print ("\nNeural Network Training accuracy:",round(accuracy_score(y_train, predictions_train),4)) 154 | print ("\nNeural Network Complete report of Training data\n",classification_report(y_train, predictions_train)) 155 | print ("\n\nNeural Network Grid Search- Train Confusion Matrix\n\n",pd.crosstab(y_train, predictions_train,rownames = ["Actuall"],colnames = ["Predicted"])) 156 | 157 | print ("\n\nNeural Network 
Testing accuracy:",round(accuracy_score(y_test, predictions_test),4)) 158 | print ("\nNeural Network Complete report of Testing data\n",classification_report(y_test, predictions_test)) 159 | print ("\n\nNeural Network Grid Search- Test Confusion Matrix\n\n",pd.crosstab(y_test, predictions_test,rownames = ["Actuall"],colnames = ["Predicted"])) 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | # Neural Networks - Classifying hand-written digits using Keras 168 | import numpy as np 169 | import pandas as pd 170 | import matplotlib.pyplot as plt 171 | 172 | from sklearn.datasets import load_digits 173 | from sklearn.model_selection import train_test_split 174 | from sklearn.preprocessing import StandardScaler 175 | from sklearn.metrics import accuracy_score,classification_report 176 | 177 | from keras.models import Sequential 178 | from keras.layers.core import Dense, Dropout, Activation 179 | from keras.optimizers import Adadelta,Adam,RMSprop 180 | from keras.utils import np_utils 181 | 182 | 183 | digits = load_digits() 184 | X = digits.data 185 | y = digits.target 186 | 187 | print (X.shape) 188 | print (y.shape) 189 | 190 | print ("\nPrinting first digit") 191 | plt.matshow(digits.images[0]) 192 | plt.show() 193 | 194 | 195 | x_vars_stdscle = StandardScaler().fit_transform(X) 196 | x_train,x_test,y_train,y_test = train_test_split(x_vars_stdscle,y,train_size = 0.7,random_state=42) 197 | 198 | # Definiting hyper parameters 199 | np.random.seed(1337) 200 | nb_classes = 10 201 | batch_size = 128 202 | nb_epochs = 200 203 | 204 | Y_train = np_utils.to_categorical(y_train, nb_classes) 205 | 206 | print (Y_train.shape) 207 | 208 | print (y_train[0]) 209 | print (Y_train[0]) 210 | 211 | 212 | #Deep Layer Model building in Keras 213 | 214 | model = Sequential() 215 | 216 | model.add(Dense(100,input_shape= (64,))) 217 | model.add(Activation('relu')) 218 | model.add(Dropout(0.5)) 219 | 220 | model.add(Dense(50)) 221 | model.add(Activation('relu')) 222 | model.add(Dropout(0.5)) 223 | 224 | model.add(Dense(nb_classes)) 225 | model.add(Activation('softmax')) 226 | 227 | model.compile(loss='categorical_crossentropy', optimizer='adam') 228 | 229 | 230 | # Model Training 231 | model.fit(x_train, Y_train, batch_size=batch_size, nb_epoch=nb_epochs,verbose=1) 232 | 233 | 234 | #Model Prediction 235 | y_train_predclass = model.predict_classes(x_train,batch_size=batch_size) 236 | y_test_predclass = model.predict_classes(x_test,batch_size=batch_size) 237 | 238 | print ("\n\nDeep Neural Network - Train accuracy:"),(round(accuracy_score(y_train,y_train_predclass),3)) 239 | 240 | print ("\nDeep Neural Network - Train Classification Report") 241 | print (classification_report(y_train,y_train_predclass)) 242 | 243 | print ("\nDeep Neural Network - Train Confusion Matrix\n") 244 | print (pd.crosstab(y_train,y_train_predclass,rownames = ["Actuall"],colnames = ["Predicted"]) ) 245 | 246 | 247 | print ("\nDeep Neural Network - Test accuracy:"),(round(accuracy_score(y_test,y_test_predclass),3)) 248 | 249 | print ("\nDeep Neural Network - Test Classification Report") 250 | print (classification_report(y_test,y_test_predclass)) 251 | 252 | print ("\nDeep Neural Network - Test Confusion Matrix\n") 253 | print (pd.crosstab(y_test,y_test_predclass,rownames = ["Actuall"],colnames = ["Predicted"]) ) 254 | 255 | 256 | 257 | 258 | 259 | -------------------------------------------------------------------------------- /Chapter07/Chapter 07_Recomm_Engine.R: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | rm(list = ls()) 4 | 5 | # First change the following directory link to where all the input files do exist 6 | setwd("D:\\Book writing\\Codes\\Chapter 7\\ml-latest-small\\ml-latest-small") 7 | 8 | ratings = read.csv("ratings.csv") 9 | movies = read.csv("movies.csv") 10 | 11 | ratings = ratings[,!names(ratings) %in% c("timestamp")] 12 | 13 | library(reshape2) 14 | 15 | # Creating Pivot table 16 | ratings_mat = acast(ratings,userId~movieId) 17 | ratings_mat[is.na(ratings_mat)] =0 18 | 19 | 20 | # Content based filtering 21 | library(lsa) 22 | 23 | a = c(2, 1, 0, 2, 0, 1, 1, 1) 24 | b = c(2, 1, 1, 1, 1, 0, 1, 1) 25 | 26 | print (paste("Cosine similarity between A and B is", round(cosine(a,b),4))) 27 | 28 | 29 | m = nrow(ratings_mat);n = ncol(ratings_mat) 30 | 31 | 32 | # User similarity matrix 33 | mat_users = matrix(nrow = m, ncol = m) 34 | 35 | for (i in 1:m){ 36 | for (j in 1:m){ 37 | if (i != j){ 38 | mat_users[i,j] = cosine(ratings_mat[i,],ratings_mat[j,]) 39 | } 40 | else { 41 | mat_users[i,j] = 0.0 42 | } 43 | } 44 | } 45 | 46 | 47 | colnames(mat_users) = rownames(ratings_mat); rownames(mat_users) = rownames(ratings_mat) 48 | df_users = as.data.frame(mat_users) 49 | 50 | 51 | 52 | # Finiding similar users 53 | topn_simusers <- function(uid=16,n=5){ 54 | sorted_df = sort(df_users[uid,],decreasing = TRUE)[1:n] 55 | print(paste("Similar users as user:",uid)) 56 | return(sorted_df) 57 | } 58 | 59 | print(topn_simusers(uid = 17,n=10)) 60 | 61 | 62 | # Finiding most rated movies of a user 63 | library(sqldf) 64 | 65 | ratings_withmovie = sqldf(" select a.*,b.title from 66 | ratings as a left join movies as b 67 | on a.movieId = b.movieId") 68 | 69 | 70 | # Finding most rated movies of a user 71 | topn_movieratings <- function(uid=355,n_ratings=10){ 72 | uid_ratings = ratings_withmovie[ratings_withmovie$userId==uid,] 73 | sorted_uidrtng = uid_ratings[order(-uid_ratings$rating),] 74 | return(head(sorted_uidrtng,n_ratings)) 75 | } 76 | 77 | print( topn_movieratings(uid = 596,n=10)) 78 | 79 | 80 | 81 | # Movies similarity matrix 82 | mat_movies = matrix(nrow = n, ncol = n) 83 | 84 | for (i in 1:n){ 85 | for (j in 1:n){ 86 | if (i != j){ 87 | mat_movies[i,j] = cosine(ratings_mat[,i],ratings_mat[,j]) 88 | } 89 | else { 90 | mat_movies[i,j] = 0.0 91 | } 92 | } 93 | } 94 | 95 | colnames(mat_movies) = colnames(ratings_mat); rownames(mat_movies) = colnames(ratings_mat) 96 | df_movies = as.data.frame(mat_movies) 97 | 98 | write.csv(df_movies,"df_movies.csv") 99 | 100 | df_movies = read.csv("df_movies.csv") 101 | rownames(df_movies) = df_movies$X 102 | colnames(df_movies) = c("aaa",df_movies$X) 103 | df_movies = subset(df_movies, select=-c(aaa)) 104 | 105 | 106 | 107 | # Finiding similar movies 108 | topn_simovies <- function(mid=588,n_movies=5){ 109 | sorted_df = sort(df_movies[mid,],decreasing = TRUE)[1:n_movies] 110 | sorted_df_t = as.data.frame(t(sorted_df)) 111 | colnames(sorted_df_t) = c("score") 112 | sorted_df_t$movieId = rownames(sorted_df_t) 113 | 114 | print(paste("Similar",n_movies, "movies as compared to the movie",mid,"are :")) 115 | sorted_df_t_wmovie = sqldf(" select a.*,b.title from sorted_df_t as a left join movies as b 116 | on a.movieId = b.movieId") 117 | return(sorted_df_t_wmovie) 118 | } 119 | 120 | print(topn_simovies(mid = 589,n_movies=15)) 121 | 122 | 123 | 124 | 125 | # Collaborative filtering 126 | ratings = read.csv("ratings.csv") 127 | movies = read.csv("movies.csv") 128 | 129 | 
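# The collaborative-filtering block below re-reads the raw ratings, drops the timestamp
# column, casts the data into a user x movie "realRatingMatrix", and fits two
# recommenderlab models - a user-based CF model (UBCF, cosine similarity on Z-score
# normalised ratings, 5 nearest neighbours) and a popularity baseline - before
# predicting ratings for every user.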
library(sqldf) 130 | library(reshape2) 131 | library(recommenderlab) 132 | 133 | ratings_v2 = ratings[,-c(4)] 134 | 135 | ratings_mat = acast(ratings_v2,userId~movieId) 136 | ratings_mat2 = as(ratings_mat, "realRatingMatrix") 137 | 138 | getRatingMatrix(ratings_mat2) 139 | 140 | #Plotting user-item complete matrix 141 | image(ratings_mat2, main = "Raw Ratings") 142 | 143 | # Fitting ALS method on Data 144 | rec=Recommender(ratings_mat2[1:nrow(ratings_mat2)],method="UBCF", param=list(normalize = "Z-score",method="Cosine",nn=5, minRating=1)) 145 | rec_2=Recommender(ratings_mat2[1:nrow(ratings_mat2)],method="POPULAR") 146 | 147 | print(rec) 148 | print(rec_2) 149 | 150 | names(getModel(rec)) 151 | getModel(rec)$nn 152 | 153 | 154 | # Create predictions for all the users 155 | recom_pred = predict(rec,ratings_mat2[1:nrow(ratings_mat2)],type="ratings") 156 | 157 | # Putting predicitons into list 158 | rec_list<-as(recom_pred,"list") 159 | head(summary(rec_list)) 160 | 161 | print_recommendations <- function(uid=586,top_nmovies=10){ 162 | recoms_list = rec_list[[uid]] 163 | sorted_df = as.data.frame(sort(recoms_list,decreasing = TRUE)[1:top_nmovies]) 164 | colnames(sorted_df) = c("score") 165 | 166 | sorted_df$movieId = rownames(sorted_df) 167 | print(paste("Movies recommended for the user",uid,"are follows:")) 168 | sorted_df_t_wmovie = sqldf(" select a.*,b.title from sorted_df as a left join movies as b 169 | on a.movieId = b.movieId") 170 | 171 | return(sorted_df_t_wmovie) 172 | } 173 | 174 | print(print_recommendations(uid = 580,top_nmovies = 15)) 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | -------------------------------------------------------------------------------- /Chapter07/Chapter 07_Recomm_Engine.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import os 4 | """ First change the following directory link to where all input files do exist """ 5 | os.chdir("D:\\Book writing\\Codes\\Chapter 7\\ml-latest-small\\ml-latest-small") 6 | 7 | 8 | import pandas as pd 9 | import numpy as np 10 | import matplotlib.pyplot as plt 11 | 12 | 13 | ratings = pd.read_csv("ratings.csv") 14 | print (ratings.head()) 15 | 16 | movies = pd.read_csv("movies.csv") 17 | print (movies.head()) 18 | 19 | 20 | #Combining movie ratings & movie names 21 | ratings = pd.merge(ratings[['userId','movieId','rating']],movies[['movieId','title']], 22 | how='left',left_on ='movieId' ,right_on = 'movieId') 23 | 24 | 25 | rp = ratings.pivot_table(columns = ['movieId'],index = ['userId'],values = 'rating') 26 | rp = rp.fillna(0) 27 | 28 | # Converting pandas dataframe to numpy for faster execution in loops etc. 29 | rp_mat = rp.as_matrix() 30 | 31 | 32 | from scipy.spatial.distance import cosine 33 | 34 | 35 | #The cosine of the angle between them is about 0.822. 36 | a= np.asarray( [2, 1, 0, 2, 0, 1, 1, 1]) 37 | b = np.asarray( [2, 1, 1, 1, 1, 0, 1, 1]) 38 | 39 | print("\n\n") 40 | print ("Cosine similarity between A and B is",round(1-cosine(a,b),4)) 41 | 42 | 43 | m, n = rp.shape 44 | 45 | # User similarity matrix 46 | mat_users = np.zeros((m, m)) 47 | for i in range(m): 48 | for j in range(m): 49 | if i != j: 50 | mat_users[i][j] = (1- cosine(rp_mat[i,:], rp_mat[j,:])) 51 | else: 52 | mat_users[i][j] = 0. 
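# Optional vectorised alternative (a sketch only, assuming scikit-learn is installed; it is
# not part of the original flow): the nested loop above issues O(m^2) scipy cosine() calls,
# whereas the same user-user similarity matrix can be built in a single call.
# from sklearn.metrics.pairwise import cosine_similarity
# mat_users = cosine_similarity(rp_mat)      # m x m pairwise cosine similarities
# np.fill_diagonal(mat_users, 0.0)           # zero the diagonal, matching the loop above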
53 | 54 | pd_users = pd.DataFrame(mat_users,index =rp.index ,columns= rp.index ) 55 | 56 | 57 | # Finding similar users 58 | def topn_simusers(uid = 16,n=5): 59 | users = pd_users.loc[uid,:].sort_values(ascending = False) 60 | topn_users = users.iloc[:n,] 61 | topn_users = topn_users.rename('score') 62 | print ("Similar users as user:",uid) 63 | return pd.DataFrame(topn_users) 64 | 65 | print (topn_simusers(uid=17,n=10)) 66 | 67 | 68 | # Finding most rated movies of a user 69 | def topn_movieratings(uid = 355,n_ratings=10): 70 | uid_ratings = ratings.loc[ratings['userId']==uid] 71 | uid_ratings = uid_ratings.sort_values(by='rating',ascending = [False]) 72 | print ("Top",n_ratings ,"movie ratings of user:",uid) 73 | return uid_ratings.iloc[:n_ratings,] 74 | 75 | print (topn_movieratings(uid=596,n_ratings=10)) 76 | 77 | 78 | # Movie similarity matrix 79 | import time 80 | start_time = time.time() 81 | mat_movies = np.zeros((n, n)) 82 | 83 | for i in range(n): 84 | for j in range(n): 85 | if i!=j: 86 | mat_movies[i,j] = (1- cosine(rp_mat[:,i], rp_mat[:,j])) 87 | else: 88 | mat_movies[i,j] = 0. 89 | print("--- %s seconds ---" % (time.time() - start_time)) 90 | 91 | 92 | pd_movies = pd.DataFrame(mat_movies,index =rp.columns ,columns= rp.columns ) 93 | 94 | 95 | #pd_movies.to_csv('pd_movies.csv',sep=',') 96 | pd_movies = pd.read_csv("pd_movies.csv",index_col='movieId') 97 | 98 | 99 | # Finding similar movies 100 | def topn_simovies(mid = 588,n=15): 101 | mid_ratings = pd_movies.loc[mid,:].sort_values(ascending = False) 102 | topn_movies = pd.DataFrame(mid_ratings.iloc[:n,]) 103 | topn_movies['index1'] = topn_movies.index 104 | topn_movies['index1'] = topn_movies['index1'].astype('int64') 105 | topn_movies = pd.merge(topn_movies,movies[['movieId','title']],how = 'left',left_on ='index1' ,right_on = 'movieId') 106 | print ("Movies similar to movie id:",mid,",",movies['title'][movies['movieId']==mid].to_string(index=False),",are") 107 | del topn_movies['index1'] 108 | return topn_movies 109 | 110 | 111 | print (topn_simovies(mid=589,n=15)) 112 | 113 | 114 | 115 | 116 | #Collaborative filtering 117 | 118 | import os 119 | """ First change the following directory link to where all input files do exist """ 120 | os.chdir("D:\\Book writing\\Codes\\Chapter 7\\ml-latest-small\\ml-latest-small") 121 | 122 | 123 | import pandas as pd 124 | import numpy as np 125 | import matplotlib.pyplot as plt 126 | 127 | 128 | ratings = pd.read_csv("ratings.csv") 129 | print (ratings.head()) 130 | 131 | movies = pd.read_csv("movies.csv") 132 | print (movies.head()) 133 | 134 | rp = ratings.pivot_table(columns = ['movieId'],index = ['userId'],values = 'rating') 135 | rp = rp.fillna(0) 136 | 137 | A = rp.values 138 | 139 | print ("\nShape of Original Sparse Matrix",A.shape) 140 | 141 | 142 | W = A>0.5 143 | W[W==True]=1 144 | W[W==False]=0 145 | W = W.astype(np.float64,copy=False) 146 | 147 | 148 | W_pred = A<0.5 149 | W_pred[W_pred==True]=1 150 | W_pred[W_pred==False]=0 151 | W_pred = W_pred.astype(np.float64,copy=False) 152 | np.fill_diagonal(W_pred,val=0) 153 | 154 | 155 | 156 | # Parameters 157 | m,n = A.shape 158 | 159 | n_iterations = 200 160 | n_factors = 100 161 | lmbda = 0.1 162 | 163 | X = 5 * np.random.rand(m,n_factors) 164 | Y = 5* np.random.rand(n_factors,n) 165 | 166 | def get_error(A, X, Y, W): 167 | return np.sqrt(np.sum((W * (A - np.dot(X, Y)))**2)/np.sum(W)) 168 | 169 | errors = [] 170 | for itr in range(n_iterations): 171 | X = np.linalg.solve(np.dot(Y,Y.T)+ lmbda * np.eye(n_factors),np.dot(Y,A.T)).T 
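# Alternating least squares: with the item factors Y held fixed, the update above solves the
# regularised normal equations (Y Y^T + lambda*I) x_u = Y a_u for every user row a_u in one
# np.linalg.solve call; the symmetric update for Y, with X held fixed, follows next.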
172 | Y = np.linalg.solve(np.dot(X.T,X)+ lmbda * np.eye(n_factors),np.dot(X.T,A)) 173 | 174 | if itr%10 == 0: 175 | print(itr," iterations completed","RMSError value is:",get_error(A,X,Y,W)) 176 | 177 | errors.append(get_error(A,X,Y,W)) 178 | 179 | A_hat = np.dot(X,Y) 180 | print ("RMSError of rated movies: ",get_error(A,X,Y,W)) 181 | 182 | 183 | plt.plot(errors); 184 | plt.ylim([0, 3.5]); 185 | plt.xlabel("Number of Iterations");plt.ylabel("RMSE") 186 | #plt.title("No.of Iterations vs. RMSE") 187 | plt.show() 188 | 189 | 190 | 191 | def print_recommovies(uid=315,n_movies=15,pred_mat = A_hat,wpred_mat = W_pred ): 192 | pred_recos = pred_mat*wpred_mat 193 | pd_predrecos = pd.DataFrame(pred_recos,index =rp.index ,columns= rp.columns ) 194 | pred_ratings = pd_predrecos.loc[uid,:].sort_values(ascending = False) 195 | pred_topratings = pred_ratings[:n_movies,] 196 | pred_topratings = pred_topratings.rename('pred_ratings') 197 | pred_topratings = pd.DataFrame(pred_topratings) 198 | pred_topratings['index1'] = pred_topratings.index 199 | pred_topratings['index1'] = pred_topratings['index1'].astype('int64') 200 | pred_topratings = pd.merge(pred_topratings,movies[['movieId','title']],how = 'left',left_on ='index1' ,right_on = 'movieId') 201 | del pred_topratings['index1'] 202 | print ("\nTop",n_movies,"movies predicted for the user:",uid," based on collaborative filtering\n") 203 | return pred_topratings 204 | 205 | 206 | predmtrx = print_recommovies(uid=355,n_movies=10,pred_mat=A_hat,wpred_mat=W_pred) 207 | print (predmtrx) 208 | 209 | 210 | 211 | 212 | # Grid Search on Collaborative Filtering 213 | def get_error(A, X, Y, W): 214 | return np.sqrt(np.sum((W * (A - np.dot(X, Y)))**2)/np.sum(W)) 215 | 216 | niters = [20,50,100,200] 217 | factors = [30,50,70,100] 218 | lambdas = [0.001,0.01,0.05,0.1] 219 | 220 | init_error = float("inf") 221 | 222 | 223 | print("\n\nGrid Search results of ALS Matrix Factorization:\n") 224 | for niter in niters: 225 | for facts in factors: 226 | for lmbd in lambdas: 227 | 228 | X = 5 * np.random.rand(m,facts) 229 | Y = 5* np.random.rand(facts,n) 230 | 231 | for itr in range(niter): 232 | X = np.linalg.solve(np.dot(Y,Y.T)+ lmbd * np.eye(facts),np.dot(Y,A.T)).T 233 | Y = np.linalg.solve(np.dot(X.T,X)+ lmbd * np.eye(facts),np.dot(X.T,A)) 234 | 235 | error = get_error(A,X,Y,W) 236 | 237 | if error best_score: 13 | best, best_score = x, x_score 14 | return best 15 | 16 | def vector_add(a, b): 17 | return tuple(map(operator.add, a, b)) 18 | 19 | 20 | orientations = [(1,0), (0, 1), (-1, 0), (0, -1)] 21 | 22 | def turn_right(orientation): 23 | return orientations[orientations.index(orientation)-1] 24 | 25 | def turn_left(orientation): 26 | return orientations[(orientations.index(orientation)+1) % len(orientations)] 27 | 28 | def isnumber(x): 29 | return hasattr(x, '__int__') 30 | 31 | 32 | 33 | """A Markov Decision Process, defined by an init_pos_posial state, transition model, 34 | and reward function. 
""" 35 | 36 | class MDP: 37 | 38 | def __init__(self, init_pos, actlist, terminals, transitions={}, states=None, gamma=0.99): 39 | if not (0 < gamma <= 1): 40 | raise ValueError("MDP should have 0 < gamma <= 1 values") 41 | 42 | if states: 43 | self.states = states 44 | else: 45 | self.states = set() 46 | self.init_pos = init_pos 47 | self.actlist = actlist 48 | self.terminals = terminals 49 | self.transitions = transitions 50 | self.gamma = gamma 51 | self.reward = {} 52 | 53 | """Returns a numeric reward for the state.""" 54 | def R(self, state): 55 | return self.reward[state] 56 | 57 | """Transition model. From a state and an action, return a list of (probability, result-state) pairs""" 58 | def T(self, state, action): 59 | if(self.transitions == {}): 60 | raise ValueError("Transition model is missing") 61 | else: 62 | return self.transitions[state][action] 63 | 64 | """Set of actions that can be performed for a particular state""" 65 | def actions(self, state): 66 | if state in self.terminals: 67 | return [None] 68 | else: 69 | return self.actlist 70 | 71 | 72 | 73 | """A two-dimensional grid MDP""" 74 | class GridMDP(MDP): 75 | 76 | def __init__(self, grid, terminals, init_pos=(0, 0), gamma=0.99): 77 | 78 | """ because we want row 0 on bottom, not on top """ 79 | grid.reverse() 80 | 81 | MDP.__init__(self, init_pos, actlist=orientations, 82 | terminals=terminals, gamma=gamma) 83 | self.grid = grid 84 | self.rows = len(grid) 85 | self.cols = len(grid[0]) 86 | for x in range(self.cols): 87 | for y in range(self.rows): 88 | self.reward[x, y] = grid[y][x] 89 | if grid[y][x] is not None: 90 | self.states.add((x, y)) 91 | 92 | def T(self, state, action): 93 | if action is None: 94 | return [(0.0, state)] 95 | else: 96 | return [(0.8, self.go(state, action)), 97 | (0.1, self.go(state, turn_right(action))), 98 | (0.1, self.go(state, turn_left(action)))] 99 | 100 | """Return the state that results from going in this direction.""" 101 | def go(self, state, direction): 102 | state1 = vector_add(state, direction) 103 | return state1 if state1 in self.states else state 104 | 105 | """Convert a mapping from (x, y) to v into a [[..., v, ...]] grid.""" 106 | def to_grid(self, mapping): 107 | return list(reversed([[mapping.get((x, y), None) 108 | for x in range(self.cols)] 109 | for y in range(self.rows)])) 110 | 111 | """Convert a mapping from (x, y) to v into a [[..., v, ...]] grid.""" 112 | def to_arrows(self, policy): 113 | chars = { 114 | (1, 0): '>', (0, 1): '^', (-1, 0): '<', (0, -1): 'v', None: '.'} 115 | return self.to_grid({s: chars[a] for (s, a) in policy.items()}) 116 | 117 | """Solving an MDP by value iteration and returns the optimum state values """ 118 | def value_iteration(mdp, epsilon=0.001): 119 | STSN = {s: 0 for s in mdp.states} 120 | R, T, gamma = mdp.R, mdp.T, mdp.gamma 121 | while True: 122 | STS = STSN.copy() 123 | delta = 0 124 | for s in mdp.states: 125 | STSN[s] = R(s) + gamma * max([sum([p * STS[s1] for (p, s1) in T(s, a)]) 126 | for a in mdp.actions(s)]) 127 | delta = max(delta, abs(STSN[s] - STS[s])) 128 | if delta < epsilon * (1 - gamma) / gamma: 129 | return STS 130 | 131 | """Given an MDP and a utility function STS, determine the best policy, 132 | as a mapping from state to action """ 133 | def best_policy(mdp, STS): 134 | pi = {} 135 | for s in mdp.states: 136 | pi[s] = argmax(mdp.actions(s), lambda a: expected_utility(a, s, STS, mdp)) 137 | return pi 138 | 139 | """The expected utility of doing a in state s, according to the MDP and STS.""" 140 | def 
expected_utility(a, s, STS, mdp): 141 | return sum([p * STS[s1] for (p, s1) in mdp.T(s, a)]) 142 | 143 | """Solve an MDP by policy iteration""" 144 | def policy_iteration(mdp): 145 | STS = {s: 0 for s in mdp.states} 146 | pi = {s: random.choice(mdp.actions(s)) for s in mdp.states} 147 | while True: 148 | STS = policy_evaluation(pi, STS, mdp) 149 | unchanged = True 150 | for s in mdp.states: 151 | a = argmax(mdp.actions(s),lambda a: expected_utility(a, s, STS, mdp)) 152 | if a != pi[s]: 153 | pi[s] = a 154 | unchanged = False 155 | if unchanged: 156 | return pi 157 | 158 | """Return an updated utility mapping U from each state in the MDP to its 159 | utility, using an approximation (modified policy iteration)""" 160 | def policy_evaluation(pi, STS, mdp, k=20): 161 | R, T, gamma = mdp.R, mdp.T, mdp.gamma 162 | for i in range(k): 163 | for s in mdp.states: 164 | STS[s] = R(s) + gamma * sum([p * STS[s1] for (p, s1) in T(s, pi[s])]) 165 | return STS 166 | 167 | 168 | def print_table(table, header=None, sep=' ', numfmt='{}'): 169 | justs = ['rjust' if isnumber(x) else 'ljust' for x in table[0]] 170 | if header: 171 | table.insert(0, header) 172 | table = [[numfmt.format(x) if isnumber(x) else x for x in row] 173 | for row in table] 174 | sizes = list( 175 | map(lambda seq: max(map(len, seq)), 176 | list(zip(*[map(str, row) for row in table])))) 177 | for row in table: 178 | print(sep.join(getattr( 179 | str(x), j)(size) for (j, size, x) in zip(justs, sizes, row))) 180 | 181 | 182 | 183 | """ A 4x3 grid environment that presents the agent with a sequential decision problem""" 184 | sequential_decision_environment = GridMDP([[-0.02, -0.02, -0.02, +1], 185 | [-0.02, None, -0.02, -1], 186 | [-0.02, -0.02, -0.02, -0.02]], 187 | terminals=[(3, 2), (3, 1)]) 188 | 189 | # Value Iteration 190 | value_iter = best_policy(sequential_decision_environment, value_iteration(sequential_decision_environment, .01)) 191 | print("\n Optimal Policy based on Value Iteration\n") 192 | print_table(sequential_decision_environment.to_arrows(value_iter)) 193 | 194 | 195 | #Policy Iteration 196 | policy_iter = policy_iteration(sequential_decision_environment) 197 | print("\n Optimal Policy based on Policy Iteration & Evaluation\n") 198 | print_table(sequential_decision_environment.to_arrows(policy_iter)) 199 | 200 | 201 | 202 | 203 | # Monte Carlo Methods 204 | 205 | from __future__ import print_function 206 | import numpy as np 207 | import matplotlib.pyplot as plt 208 | from mpl_toolkits.mplot3d import Axes3D 209 | 210 | 211 | #actions: hit or stand 212 | ACTION_HIT = 0 213 | ACTION_STAND = 1 214 | actions = [ACTION_HIT, ACTION_STAND] 215 | 216 | 217 | 218 | "policy for player" 219 | policyPlayer = np.zeros(22) 220 | 221 | for i in range(12, 20): 222 | policyPlayer[i] = ACTION_HIT 223 | 224 | policyPlayer[20] = ACTION_STAND 225 | policyPlayer[21] = ACTION_STAND 226 | 227 | "function form of target policy of player" 228 | def targetPolicyPlayer(usableAcePlayer, playerSum, dealerCard): 229 | return policyPlayer[playerSum] 230 | 231 | "function form of behavior policy of player" 232 | def behaviorPolicyPlayer(usableAcePlayer, playerSum, dealerCard): 233 | if np.random.binomial(1, 0.5) == 1: 234 | return ACTION_STAND 235 | return ACTION_HIT 236 | 237 | "policy for dealer" 238 | policyDealer = np.zeros(22) 239 | for i in range(12, 17): 240 | policyDealer[i] = ACTION_HIT 241 | for i in range(17, 22): 242 | policyDealer[i] = ACTION_STAND 243 | 244 | "get a new card" 245 | def getCard(): 246 | card = np.random.randint(1, 14) 247 
| card = min(card, 10) 248 | return card 249 | 250 | # play a game 251 | 252 | def play(policyPlayerFn, initialState=None, initialAction=None): 253 | # player status 254 | 255 | # sum of player 256 | playerSum = 0 257 | 258 | # trajectory of player 259 | playerTrajectory = [] 260 | 261 | # whether player uses Ace as 11 262 | usableAcePlayer = False 263 | 264 | # dealer status 265 | dealerCard1 = 0 266 | dealerCard2 = 0 267 | usableAceDealer = False 268 | 269 | if initialState is None: 270 | # generate a random initial state 271 | 272 | numOfAce = 0 273 | 274 | # initialize cards of player 275 | while playerSum < 12: 276 | # if sum of player is less than 12, always hit 277 | card = getCard() 278 | 279 | # if get an Ace, use it as 11 280 | if card == 1: 281 | numOfAce += 1 282 | card = 11 283 | usableAcePlayer = True 284 | playerSum += card 285 | 286 | # if player's sum is larger than 21, he must hold at least one Ace, two Aces are possible 287 | if playerSum > 21: 288 | # use the Ace as 1 rather than 11 289 | playerSum -= 10 290 | 291 | # if the player only has one Ace, then he doesn't have usable Ace any more 292 | if numOfAce == 1: 293 | usableAcePlayer = False 294 | 295 | # initialize cards of dealer, suppose dealer will show the first card he gets 296 | dealerCard1 = getCard() 297 | dealerCard2 = getCard() 298 | 299 | else: 300 | # use specified initial state 301 | usableAcePlayer = initialState[0] 302 | playerSum = initialState[1] 303 | dealerCard1 = initialState[2] 304 | dealerCard2 = getCard() 305 | 306 | # initial state of the game 307 | state = [usableAcePlayer, playerSum, dealerCard1] 308 | 309 | # initialize dealer's sum 310 | dealerSum = 0 311 | if dealerCard1 == 1 and dealerCard2 != 1: 312 | dealerSum += 11 + dealerCard2 313 | usableAceDealer = True 314 | elif dealerCard1 != 1 and dealerCard2 == 1: 315 | dealerSum += dealerCard1 + 11 316 | usableAceDealer = True 317 | elif dealerCard1 == 1 and dealerCard2 == 1: 318 | dealerSum += 1 + 11 319 | usableAceDealer = True 320 | else: 321 | dealerSum += dealerCard1 + dealerCard2 322 | 323 | # game starts! 
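# The episode runs in two phases: the player draws cards according to policyPlayerFn,
# recording (action, state) pairs in playerTrajectory, then the dealer plays the fixed
# dealer policy. play() returns (initial state, reward in {-1, 0, +1}, playerTrajectory).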
324 | 325 | # player's turn 326 | while True: 327 | if initialAction is not None: 328 | action = initialAction 329 | initialAction = None 330 | else: 331 | # get action based on current sum 332 | action = policyPlayerFn(usableAcePlayer, playerSum, dealerCard1) 333 | 334 | # track player's trajectory for importance sampling 335 | playerTrajectory.append([action, (usableAcePlayer, playerSum, dealerCard1)]) 336 | 337 | if action == ACTION_STAND: 338 | break 339 | # if hit, get new card 340 | playerSum += getCard() 341 | 342 | # player busts 343 | if playerSum > 21: 344 | # if player has a usable Ace, use it as 1 to avoid busting and continue 345 | if usableAcePlayer == True: 346 | playerSum -= 10 347 | usableAcePlayer = False 348 | else: 349 | # otherwise player loses 350 | return state, -1, playerTrajectory 351 | 352 | # dealer's turn 353 | while True: 354 | # get action based on current sum 355 | action = policyDealer[dealerSum] 356 | if action == ACTION_STAND: 357 | break 358 | # if hit, get a new card 359 | dealerSum += getCard() 360 | # dealer busts 361 | if dealerSum > 21: 362 | if usableAceDealer == True: 363 | # if dealer has a usable Ace, use it as 1 to avoid busting and continue 364 | dealerSum -= 10 365 | usableAceDealer = False 366 | else: 367 | # otherwise dealer loses 368 | return state, 1, playerTrajectory 369 | 370 | # compare the sum between player and dealer 371 | if playerSum > dealerSum: 372 | return state, 1, playerTrajectory 373 | elif playerSum == dealerSum: 374 | return state, 0, playerTrajectory 375 | else: 376 | return state, -1, playerTrajectory 377 | 378 | # Monte Carlo Sample with On-Policy 379 | def monteCarloOnPolicy(nEpisodes): 380 | statesUsableAce = np.zeros((10, 10)) 381 | # initialze counts to 1 to avoid 0 being divided 382 | statesUsableAceCount = np.ones((10, 10)) 383 | statesNoUsableAce = np.zeros((10, 10)) 384 | # initialze counts to 1 to avoid 0 being divided 385 | statesNoUsableAceCount = np.ones((10, 10)) 386 | for i in range(0, nEpisodes): 387 | state, reward, _ = play(targetPolicyPlayer) 388 | state[1] -= 12 389 | state[2] -= 1 390 | if state[0]: 391 | statesUsableAceCount[state[1], state[2]] += 1 392 | statesUsableAce[state[1], state[2]] += reward 393 | else: 394 | statesNoUsableAceCount[state[1], state[2]] += 1 395 | statesNoUsableAce[state[1], state[2]] += reward 396 | return statesUsableAce / statesUsableAceCount, statesNoUsableAce / statesNoUsableAceCount 397 | 398 | # Monte Carlo with Exploring Starts 399 | def monteCarloES(nEpisodes): 400 | # (playerSum, dealerCard, usableAce, action) 401 | stateActionValues = np.zeros((10, 10, 2, 2)) 402 | # initialze counts to 1 to avoid division by 0 403 | stateActionPairCount = np.ones((10, 10, 2, 2)) 404 | 405 | # behavior policy is greedy 406 | def behaviorPolicy(usableAce, playerSum, dealerCard): 407 | usableAce = int(usableAce) 408 | playerSum -= 12 409 | dealerCard -= 1 410 | # get argmax of the average returns(s, a) 411 | return np.argmax(stateActionValues[playerSum, dealerCard, usableAce, :] 412 | / stateActionPairCount[playerSum, dealerCard, usableAce, :]) 413 | 414 | # play for several episodes 415 | for episode in range(nEpisodes): 416 | if episode % 1000 == 0: 417 | print('episode:', episode) 418 | # for each episode, use a randomly initialized state and action 419 | initialState = [bool(np.random.choice([0, 1])), 420 | np.random.choice(range(12, 22)), 421 | np.random.choice(range(1, 11))] 422 | initialAction = np.random.choice(actions) 423 | _, reward, trajectory = play(behaviorPolicy, 
initialState, initialAction) 424 | for action, (usableAce, playerSum, dealerCard) in trajectory: 425 | usableAce = int(usableAce) 426 | playerSum -= 12 427 | dealerCard -= 1 428 | # update values of state-action pairs 429 | stateActionValues[playerSum, dealerCard, usableAce, action] += reward 430 | stateActionPairCount[playerSum, dealerCard, usableAce, action] += 1 431 | 432 | return stateActionValues / stateActionPairCount 433 | 434 | 435 | # print the state value 436 | figureIndex = 0 437 | def prettyPrint(data, tile, zlabel='reward'): 438 | global figureIndex 439 | fig = plt.figure(figureIndex) 440 | figureIndex += 1 441 | fig.suptitle(tile) 442 | ax = fig.add_subplot(111, projection='3d') 443 | x_axis = [] 444 | y_axis = [] 445 | z_axis = [] 446 | for i in range(12, 22): 447 | for j in range(1, 11): 448 | x_axis.append(i) 449 | y_axis.append(j) 450 | z_axis.append(data[i - 12, j - 1]) 451 | ax.scatter(x_axis, y_axis, z_axis,c='red') 452 | ax.set_xlabel('player sum') 453 | ax.set_ylabel('dealer showing') 454 | ax.set_zlabel(zlabel) 455 | 456 | 457 | 458 | # On-Policy results 459 | def onPolicy(): 460 | statesUsableAce1, statesNoUsableAce1 = monteCarloOnPolicy(10000) 461 | statesUsableAce2, statesNoUsableAce2 = monteCarloOnPolicy(500000) 462 | prettyPrint(statesUsableAce1, 'Usable Ace & 10000 Episodes') 463 | prettyPrint(statesNoUsableAce1, 'No Usable Ace & 10000 Episodes') 464 | prettyPrint(statesUsableAce2, 'Usable Ace & 500000 Episodes') 465 | prettyPrint(statesNoUsableAce2, 'No Usable Ace & 500000 Episodes') 466 | plt.show() 467 | 468 | 469 | # Optimized or Monte Calro Control 470 | def MC_ES_optimalPolicy(): 471 | stateActionValues = monteCarloES(500000) 472 | stateValueUsableAce = np.zeros((10, 10)) 473 | stateValueNoUsableAce = np.zeros((10, 10)) 474 | # get the optimal policy 475 | actionUsableAce = np.zeros((10, 10), dtype='int') 476 | actionNoUsableAce = np.zeros((10, 10), dtype='int') 477 | for i in range(10): 478 | for j in range(10): 479 | stateValueNoUsableAce[i, j] = np.max(stateActionValues[i, j, 0, :]) 480 | stateValueUsableAce[i, j] = np.max(stateActionValues[i, j, 1, :]) 481 | actionNoUsableAce[i, j] = np.argmax(stateActionValues[i, j, 0, :]) 482 | actionUsableAce[i, j] = np.argmax(stateActionValues[i, j, 1, :]) 483 | prettyPrint(stateValueUsableAce, 'Optimal state value with usable Ace') 484 | prettyPrint(stateValueNoUsableAce, 'Optimal state value with no usable Ace') 485 | prettyPrint(actionUsableAce, 'Optimal policy with usable Ace', 'Action (0 Hit, 1 Stick)') 486 | prettyPrint(actionNoUsableAce, 'Optimal policy with no usable Ace', 'Action (0 Hit, 1 Stick)') 487 | plt.show() 488 | 489 | 490 | 491 | # Run on policy function 492 | onPolicy() 493 | 494 | # Run Monte Carlo Control or Explored starts 495 | MC_ES_optimalPolicy() 496 | 497 | 498 | 499 | 500 | 501 | 502 | 503 | 504 | 505 | 506 | 507 | # Cliff-Walking - TD Learning - SARSA & Q-Learning 508 | from __future__ import print_function 509 | import numpy as np 510 | import matplotlib.pyplot as plt 511 | 512 | # Grid dimensions 513 | GRID_HEIGHT = 4 514 | GRID_WIDTH = 12 515 | 516 | # probability for exploration, step size,gamma 517 | EPSILON = 0.1 518 | ALPHA = 0.5 519 | GAMMA = 1 520 | 521 | # all possible actions 522 | ACTION_UP = 0; ACTION_DOWN = 1;ACTION_LEFT = 2;ACTION_RIGHT = 3 523 | actions = [ACTION_UP, ACTION_DOWN, ACTION_LEFT, ACTION_RIGHT] 524 | 525 | # initial state action pair values 526 | stateActionValues = np.zeros((GRID_HEIGHT, GRID_WIDTH, 4)) 527 | startState = [3, 0] 528 | goalState = [3, 11] 
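# Classic cliff-walking grid: 4 rows x 12 columns, start at the bottom-left cell (3, 0) and
# goal at the bottom-right cell (3, 11). The bottom-edge cells between them form the cliff:
# actions that step into it, as encoded below, earn a reward of -100 and send the agent back
# to the start, while every other move costs -1.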
529 | 530 | # reward for each action in each state 531 | actionRewards = np.zeros((GRID_HEIGHT, GRID_WIDTH, 4)) 532 | actionRewards[:, :, :] = -1.0 533 | actionRewards[2, 1:11, ACTION_DOWN] = -100.0 534 | actionRewards[3, 0, ACTION_RIGHT] = -100.0 535 | 536 | # set up destinations for each action in each state 537 | actionDestination = [] 538 | for i in range(0, GRID_HEIGHT): 539 | actionDestination.append([]) 540 | for j in range(0, GRID_WIDTH): 541 | destinaion = dict() 542 | destinaion[ACTION_UP] = [max(i - 1, 0), j] 543 | destinaion[ACTION_LEFT] = [i, max(j - 1, 0)] 544 | destinaion[ACTION_RIGHT] = [i, min(j + 1, GRID_WIDTH - 1)] 545 | if i == 2 and 1 <= j <= 10: 546 | destinaion[ACTION_DOWN] = startState 547 | else: 548 | destinaion[ACTION_DOWN] = [min(i + 1, GRID_HEIGHT - 1), j] 549 | actionDestination[-1].append(destinaion) 550 | actionDestination[3][0][ACTION_RIGHT] = startState 551 | 552 | # choose an action based on epsilon greedy algorithm 553 | def chooseAction(state, stateActionValues): 554 | if np.random.binomial(1, EPSILON) == 1: 555 | return np.random.choice(actions) 556 | else: 557 | return np.argmax(stateActionValues[state[0], state[1], :]) 558 | 559 | 560 | # SARSA update 561 | 562 | def sarsa(stateActionValues, expected=False, stepSize=ALPHA): 563 | currentState = startState 564 | currentAction = chooseAction(currentState, stateActionValues) 565 | rewards = 0.0 566 | while currentState != goalState: 567 | newState = actionDestination[currentState[0]][currentState[1]][currentAction] 568 | newAction = chooseAction(newState, stateActionValues) 569 | reward = actionRewards[currentState[0], currentState[1], currentAction] 570 | rewards += reward 571 | if not expected: 572 | valueTarget = stateActionValues[newState[0], newState[1], newAction] 573 | else: 574 | valueTarget = 0.0 575 | actionValues = stateActionValues[newState[0], newState[1], :] 576 | bestActions = np.argwhere(actionValues == np.max(actionValues)) 577 | for action in actions: 578 | if action in bestActions: 579 | valueTarget += ((1.0 - EPSILON) / len(bestActions) + EPSILON / len(actions)) * stateActionValues[newState[0], newState[1], action] 580 | else: 581 | valueTarget += EPSILON / len(actions) * stateActionValues[newState[0], newState[1], action] 582 | valueTarget *= GAMMA 583 | stateActionValues[currentState[0], currentState[1], currentAction] += stepSize * (reward + 584 | valueTarget - stateActionValues[currentState[0], currentState[1], currentAction]) 585 | currentState = newState 586 | currentAction = newAction 587 | return rewards 588 | 589 | # Q-Learning update 590 | def qLearning(stateActionValues, stepSize=ALPHA): 591 | currentState = startState 592 | rewards = 0.0 593 | while currentState != goalState: 594 | currentAction = chooseAction(currentState, stateActionValues) 595 | reward = actionRewards[currentState[0], currentState[1], currentAction] 596 | rewards += reward 597 | newState = actionDestination[currentState[0]][currentState[1]][currentAction] 598 | stateActionValues[currentState[0], currentState[1], currentAction] += stepSize * ( 599 | reward + GAMMA * np.max(stateActionValues[newState[0], newState[1], :]) - 600 | stateActionValues[currentState[0], currentState[1], currentAction]) 601 | currentState = newState 602 | return rewards 603 | 604 | # print optimal policy 605 | def printOptimalPolicy(stateActionValues): 606 | optimalPolicy = [] 607 | for i in range(0, GRID_HEIGHT): 608 | optimalPolicy.append([]) 609 | for j in range(0, GRID_WIDTH): 610 | if [i, j] == goalState: 611 | 
optimalPolicy[-1].append('G') 612 | continue 613 | bestAction = np.argmax(stateActionValues[i, j, :]) 614 | if bestAction == ACTION_UP: 615 | optimalPolicy[-1].append('U') 616 | elif bestAction == ACTION_DOWN: 617 | optimalPolicy[-1].append('D') 618 | elif bestAction == ACTION_LEFT: 619 | optimalPolicy[-1].append('L') 620 | elif bestAction == ACTION_RIGHT: 621 | optimalPolicy[-1].append('R') 622 | for row in optimalPolicy: 623 | print(row) 624 | 625 | def SARSAnQLPlot(): 626 | # averaging the reward sums from 10 successive episodes 627 | averageRange = 10 628 | 629 | # episodes of each run 630 | nEpisodes = 500 631 | 632 | # perform 20 independent runs 633 | runs = 20 634 | 635 | rewardsSarsa = np.zeros(nEpisodes) 636 | rewardsQLearning = np.zeros(nEpisodes) 637 | for run in range(0, runs): 638 | stateActionValuesSarsa = np.copy(stateActionValues) 639 | stateActionValuesQLearning = np.copy(stateActionValues) 640 | for i in range(0, nEpisodes): 641 | # cut off the value by -100 to draw the figure more elegantly 642 | rewardsSarsa[i] += max(sarsa(stateActionValuesSarsa), -100) 643 | rewardsQLearning[i] += max(qLearning(stateActionValuesQLearning), -100) 644 | 645 | # averaging over independt runs 646 | rewardsSarsa /= runs 647 | rewardsQLearning /= runs 648 | 649 | # averaging over successive episodes 650 | smoothedRewardsSarsa = np.copy(rewardsSarsa) 651 | smoothedRewardsQLearning = np.copy(rewardsQLearning) 652 | for i in range(averageRange, nEpisodes): 653 | smoothedRewardsSarsa[i] = np.mean(rewardsSarsa[i - averageRange: i + 1]) 654 | smoothedRewardsQLearning[i] = np.mean(rewardsQLearning[i - averageRange: i + 1]) 655 | 656 | # display optimal policy 657 | print('Sarsa Optimal Policy:') 658 | printOptimalPolicy(stateActionValuesSarsa) 659 | print('Q-Learning Optimal Policy:') 660 | printOptimalPolicy(stateActionValuesQLearning) 661 | 662 | # draw reward curves 663 | plt.figure(1) 664 | plt.plot(smoothedRewardsSarsa, label='Sarsa') 665 | plt.plot(smoothedRewardsQLearning, label='Q-Learning') 666 | plt.xlabel('Episodes') 667 | plt.ylabel('Sum of rewards during episode') 668 | plt.legend() 669 | 670 | 671 | # Sum of Rewards for SARSA vs. QLearning 672 | SARSAnQLPlot() 673 | 674 | 675 | 676 | 677 | 678 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Packt 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Statistics for Machine Learning 4 | This is the code repository for [Statistics for Machine Learning](https://www.packtpub.com/big-data-and-business-intelligence/statistics-machine-learning?utm_source=github&utm_medium=repository&utm_campaign=9781788295758), published by [Packt](https://www.packtpub.com/?utm_source=github). It contains all the supporting project files necessary to work through the book from start to finish. 5 | ## About the Book 6 | Complex statistics in Machine Learning worry a lot of developers. Knowing statistics helps you build strong Machine Learning models that are optimized for a given problem statement. This book will teach you all it takes to perform complex statistical computations required for Machine Learning. You will gain information on statistics behind supervised learning, unsupervised learning, reinforcement learning, and more. You will see real-world examples that discuss the statistical side of Machine Learning and familiarize yourself with it. You will come across programs for performing tasks such as model, parameter fitting, regression, classification, density collection, working with vectors, matrices, and more. By the end of the book, you will have mastered the required statistics for Machine Learning and will be able to apply your new skills to any sort of industry problem. 7 | ## Instructions and Navigation 8 | All of the code is organized into folders. Each folder starts with a number followed by the application name. For example, Chapter02. 9 | 10 | 11 | 12 | The code will look like the following: 13 | ``` 14 | >>> import numpy as np 15 | >>> from scipy import stats 16 | >>> data = np.array([4,5,1,2,7,2,6,9,3]) 17 | # Calculate Mean 18 | >>> dt_mean = np.mean(data) ; print ("Mean :",round(dt_mean,2)) 19 | # Calculate Median 20 | >>> dt_median = np.median(data) ; print ("Median :",dt_median) 21 | # Calculate Mode 22 | >>> dt_mode = stats.mode(data); print ("Mode :",dt_mode[0][0]) 23 | ``` 24 | 25 | This book assumes that you know the basics of Python and R and how to install the 26 | libraries. It does not assume that you are already equipped with the knowledge of advanced 27 | statistics and mathematics, like linear algebra and so on. 
28 | The following versions of software are used throughout this book, but it should run fine 29 | with any more recent ones as well: 30 | * Anaconda 3–4.3.1 (all Python and its relevant packages are included in 31 | Anaconda, Python 3.6.1, NumPy 1.12.1, Pandas 0.19.2, and scikit-learn 0.18.1) 32 | * R 3.4.0 and RStudio 1.0.143 33 | * Theano 0.9.0 34 | * Keras 2.0.2 35 | 36 | ## Related Products 37 | * [Machine Learning for Developers](https://www.packtpub.com/big-data-and-business-intelligence/machine-learning-developers?utm_source=github&utm_medium=repository&utm_campaign=9781786469878) 38 | 39 | * [Scala for Machine Learning](https://www.packtpub.com/big-data-and-business-intelligence/scala-machine-learning?utm_source=github&utm_medium=repository&utm_campaign=9781783558742) 40 | 41 | * [Machine Learning for OpenCV](https://www.packtpub.com/big-data-and-business-intelligence/machine-learning-opencv?utm_source=github&utm_medium=repository&utm_campaign=9781783980284) 42 | ### Download a free PDF 43 | 44 | If you have already purchased a print or Kindle version of this book, you can get a DRM-free PDF version at no cost.
Simply click on the link to claim your free PDF.
45 |
https://packt.link/free-ebook/9781788295758
--------------------------------------------------------------------------------