├── Chapter01 ├── Chapter 01_Journey from Statistics to Machine Learning.R ├── Chapter 01_Journey from Statistics to Machine Learning.py └── Data.zip ├── Chapter02 ├── Chapter 02_Parallelism of Statistics and Machine Learning.R └── Chapter 02_Parallelism of Statistics and Machine Learning.py ├── Chapter03 ├── Chapter 03_Logistic Regression vs Random Forest.R └── Chapter 03_Logistic Regression vs Random Forest.py ├── Chapter04 ├── Chapter 04_Tree based ML Models.R ├── Chapter 04_Tree based ML Models.py └── WA_Fn-UseC_-HR-Employee-Attrition.csv ├── Chapter05 ├── Chapter 05_KNN n Naive Bayes.R └── Chapter 05_KNN n Naive Bayes.py ├── Chapter06 ├── Chapter 06_SVM_n_NN.R ├── Chapter 06_SVM_n_NN.py ├── digitsdata.csv └── letterdata.csv ├── Chapter07 ├── Chapter 07_Recomm_Engine.R ├── Chapter 07_Recomm_Engine.py ├── movies.csv └── ratings.csv ├── Chapter08 ├── Chapter 08_Kmeans_PCA.R ├── Chapter 08_Kmeans_PCA.py ├── digitsdata.csv └── iris.csv ├── Chapter09 └── Chapter 09_RL.py ├── LICENSE └── README.md /Chapter01/Chapter 01_Journey from Statistics to Machine Learning.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | rm(list = ls()) 5 | 6 | # First change the following directory link to where all the input files do exist 7 | 8 | setwd("D:\\Book writing\\Codes\\Chapter 1") 9 | 10 | 11 | data <- c(4,5,1,2,7,2,6,9,3) 12 | 13 | # Calculate Mean 14 | dt_mean = mean(data) ; print(round(dt_mean,2)) 15 | 16 | # Calculate Median 17 | dt_median = median(data); print(dt_median) 18 | 19 | # Calculate Mode 20 | func_mode <- function(input_dt){ 21 | unq <- unique(input_dt) 22 | unq[which.max(tabulate(match(input_dt,unq)))] 23 | } 24 | 25 | dt_mode = func_mode(data); print(dt_mode) 26 | 27 | 28 | 29 | # Desriptive statistics - dispersion 30 | game_points <- c(35,56,43,59,63,79,35,41,64,43,93,60,77,24,82) 31 | 32 | # Calculation Variance 33 | dt_var = var(game_points); print(round(dt_var,2)) 34 | 35 | # Calculation Standard Deviation 36 | dt_std = sd(game_points); print(round(dt_std,2)) 37 | 38 | # Calculation Range 39 | range_val<-function(x) return(diff(range(x))) 40 | dt_range = range_val(game_points); print(dt_range) 41 | 42 | # Calculation Quantiles 43 | dt_quantile = quantile(game_points,probs = c(0.2,0.8,1.0)); print(dt_quantile) 44 | 45 | # Calculation Inter quartile range 46 | dt_iqr = IQR(game_points); print(dt_iqr) 47 | 48 | 49 | 50 | 51 | # Hypothesis testing 52 | 53 | xbar = 990; mu0 = 1000; s = 12.5 ; n = 30 54 | t_smple = (xbar - mu0)/(s/sqrt(n));print (round(t_smple,2)) 55 | 56 | alpha = 0.05 57 | t_alpha = qt(alpha,df= n-1);print (round(t_alpha,3)) 58 | 59 | p_val = pt(t_smple,df = n-1);print (p_val) 60 | 61 | 62 | 63 | # Normal Distribution 64 | xbar = 67; mu0 = 52; s = 16.3 65 | 66 | # Normal distribution 67 | # P (Z >= (x-mu)/sigma) 68 | # F(x) = P(X <= x) 69 | pr = 1- pnorm(67, mean=52, sd=16.3) 70 | print(paste("Prob. 
to score more than 67 is ",round(pr*100,2),"%")) 71 | 72 | 73 | 74 | # Chi-square independence test 75 | survey = read.csv("survey.csv",header=TRUE) 76 | 77 | tbl = table(survey$Smoke,survey$Exer) 78 | p_val = chisq.test(tbl) 79 | 80 | print(paste("P-value is :",round(p_val$p.value,3))) 81 | 82 | 83 | #ANOVA 84 | fetilizers = read.csv("fetilizers.csv",header=TRUE) 85 | 86 | # Concatenate data rows into single vector 87 | r = c(t(as.matrix(fetilizers))) 88 | f = c("fertilizer1","fertilizer2","fertilizer3") 89 | k = 3; n = 6 90 | 91 | tm = gl(k,1,n*k,factor(f)) 92 | blk = gl(n,k,k*n) 93 | av = aov(r ~ tm + blk) 94 | 95 | smry = summary(av) 96 | print(smry) 97 | 98 | 99 | 100 | 101 | 102 | # Linear Regression vs. Gradient descent 103 | train_data = read.csv("mtcars.csv",header=TRUE) 104 | 105 | attach(train_data) 106 | plot(hp, mpg, col = "blue", pch = 20) 107 | 108 | 109 | # Linear Regression 110 | model <- lm(mpg ~ hp, data = train_data) 111 | coef(model) 112 | 113 | abline(model) 114 | mtext(paste('y =', round(coef(model)[[2]],3), '* x', '+', round( coef(model)[[1]],2))) 115 | 116 | 117 | 118 | rm(list = ls()) 119 | 120 | # Linear Regression 121 | train_data = read.csv("mtcars.csv",header=TRUE) 122 | model <- lm(mpg ~ hp, data = train_data) 123 | print (coef(model)) 124 | 125 | 126 | # Gradient descent 127 | gradDesc <- function(x, y, learn_rate, conv_threshold, batch_size, max_iter) { 128 | m <- runif(1, 0, 1) 129 | c <- runif(1, 0, 1) 130 | ypred <- m * x + c 131 | MSE <- sum((y - ypred) ^ 2) / batch_size 132 | 133 | converged = F 134 | iterations = 0 135 | 136 | while(converged == F) { 137 | m_new <- m - learn_rate * ((1 / batch_size) * (sum((ypred - y) * x))) 138 | c_new <- c - learn_rate * ((1 / batch_size) * (sum(ypred - y))) 139 | 140 | m <- m_new 141 | c <- c_new 142 | ypred <- m * x + c 143 | MSE_new <- sum((y - ypred) ^ 2) / batch_size 144 | 145 | if(MSE - MSE_new <= conv_threshold) { 146 | converged = T 147 | return(paste("Iterations:",iterations,"Optimal intercept:", c, "Optimal slope:", m)) 148 | } 149 | 150 | iterations = iterations + 1 151 | 152 | if(iterations > max_iter) { 153 | converged = T 154 | return(paste("Iterations:",iterations,"Optimal intercept:", c, "Optimal slope:", m)) 155 | } 156 | 157 | MSE = MSE_new 158 | } 159 | } 160 | 161 | gradDesc(x = hp,y = mpg, learn_rate = 0.00003, conv_threshold = 1e-8, batch_size = 32, max_iter = 1500000) 162 | 163 | 164 | 165 | 166 | # Train & Test samples 167 | full_data = read.csv("mtcars.csv",header=TRUE) 168 | 169 | set.seed(123) 170 | numrow = nrow(full_data) 171 | trnind = sample(1:numrow,size = as.integer(0.7*numrow)) 172 | 173 | train_data = full_data[trnind,] 174 | test_data = full_data[-trnind,] 175 | 176 | 177 | # Train Validation & Test samples 178 | trvaltest <- function(dat,prop = c(0.5,0.25,0.25)){ 179 | nrw = nrow(dat) 180 | trnr = as.integer(nrw *prop[1]) 181 | vlnr = as.integer(nrw*prop[2]) 182 | set.seed(123) 183 | trni = sample(1:nrow(dat),trnr) 184 | trndata = dat[trni,] 185 | rmng = dat[-trni,] 186 | vlni = sample(1:nrow(rmng),vlnr) 187 | valdata = rmng[vlni,] 188 | tstdata = rmng[-vlni,] 189 | mylist = list("trn" = trndata,"val"= valdata,"tst" = tstdata) 190 | return(mylist) 191 | } 192 | 193 | outdata = trvaltest(mtcars,prop = c(0.5,0.25,0.25)) 194 | train_data = outdata$trn;valid_data = outdata$val;test_data = outdata$tst 195 | 196 | 197 | 198 | 199 | # Grid Search on Decision Trees 200 | library(rpart) 201 | input_data = read.csv("ad.csv",header=FALSE) 202 | input_data$V1559 = as.factor(input_data$V1559) 
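# --- Added illustrative sketch (not part of the original code). The nested
# --- loops further below search minsplit/minbucket by hand on a 70/30 split.
# --- Assuming the e1071 package is installed, a cross-validated version of the
# --- same search could be sketched with tune.rpart as follows; the 5-fold CV
# --- setting here is an assumption, not something the original code specifies.
library(e1071)
rpart_grid = tune.rpart(V1559 ~ ., data = input_data,
                        minsplit = c(2,3), minbucket = c(1,2,3),
                        tunecontrol = tune.control(cross = 5))
summary(rpart_grid)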
203 | set.seed(123) 204 | numrow = nrow(input_data) 205 | trnind = sample(1:numrow,size = as.integer(0.7*numrow)) 206 | 207 | train_data = input_data[trnind,];test_data = input_data[-trnind,] 208 | minspset = c(2,3);minobset = c(1,2,3) 209 | initacc = 0 210 | 211 | for (minsp in minspset){ 212 | for (minob in minobset){ 213 | tr_fit = rpart(V1559 ~.,data = train_data,method = "class",minsplit = minsp, minbucket = minob) 214 | tr_predt = predict(tr_fit,newdata = train_data,type = "class") 215 | tble = table(tr_predt,train_data$V1559) 216 | acc = (tble[1,1]+tble[2,2])/sum(tble) 217 | acc 218 | if (acc > initacc){ 219 | tr_predtst = predict(tr_fit,newdata = test_data,type = "class") 220 | tblet = table(test_data$V1559,tr_predtst) 221 | acct = (tblet[1,1]+tblet[2,2])/sum(tblet) 222 | acct 223 | print(paste("Best Score")) 224 | print( paste("Train Accuracy ",round(acc,3),"Test Accuracy",round(acct,3))) 225 | print( paste(" Min split ",minsp," Min obs per node ",minob)) 226 | print(paste("Confusion matrix on test data")) 227 | print(tblet) 228 | precsn_0 = (tblet[1,1])/(tblet[1,1]+tblet[2,1]) 229 | precsn_1 = (tblet[2,2])/(tblet[1,2]+tblet[2,2]) 230 | print(paste("Precision_0: ",round(precsn_0,3),"Precision_1: ",round(precsn_1,3))) 231 | rcall_0 = (tblet[1,1])/(tblet[1,1]+tblet[1,2]) 232 | rcall_1 = (tblet[2,2])/(tblet[2,1]+tblet[2,2]) 233 | print(paste("Recall_0: ",round(rcall_0,3),"Recall_1: ",round(rcall_1,3))) 234 | initacc = acc 235 | } 236 | } 237 | } 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | -------------------------------------------------------------------------------- /Chapter01/Chapter 01_Journey from Statistics to Machine Learning.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import os 4 | 5 | 6 | """ First change the following directory link to where all input files do exist """ 7 | 8 | os.chdir("D:\Book writing\Codes\Chapter 1") 9 | 10 | 11 | 12 | import numpy as np 13 | from scipy import stats 14 | 15 | 16 | data = np.array([4,5,1,2,7,2,6,9,3]) 17 | 18 | # Calculate Mean 19 | dt_mean = np.mean(data) ; print ("Mean :",round(dt_mean,2)) 20 | 21 | # Calculate Median 22 | dt_median = np.median(data) ; print ("Median :",dt_median) 23 | 24 | # Calculate Mode 25 | dt_mode = stats.mode(data); print ("Mode :",dt_mode[0][0]) 26 | 27 | 28 | # Deviance calculations 29 | 30 | import numpy as np 31 | from statistics import variance,stdev 32 | 33 | game_points = np.array([35,56,43,59,63,79,35,41,64,43,93,60,77,24,82]) 34 | 35 | # Calculate Variance 36 | dt_var = variance(game_points) ; print ("Sample variance:", round(dt_var,2)) 37 | 38 | # Calculate Standard Deviation 39 | dt_std = stdev(game_points) ; print ("Sample std.dev:",round(dt_std,2)) 40 | 41 | # Calculate Range 42 | dt_rng = np.max(game_points,axis=0) - np.min(game_points,axis=0) ; print ("Range:",dt_rng) 43 | 44 | 45 | #Calculate percentiles 46 | print ("Quantiles:") 47 | for val in [20,80,100]: 48 | dt_qntls = np.percentile(game_points,val) 49 | print (str(val)+"%" ,dt_qntls) 50 | 51 | # Calculate IQR 52 | q75, q25 = np.percentile(game_points, [75 ,25]); print ("Inter quartile range:",q75-q25 ) 53 | 54 | 55 | # Hypothesis testing 56 | #import scipy 57 | 58 | from scipy import stats 59 | 60 | xbar = 990; mu0 = 1000; s = 12.5; n = 30 61 | # Test Statistic 62 | t_smple = (xbar-mu0)/(s/np.sqrt(float(n))); print ("Test Statistic:",round(t_smple,2)) 63 | # Critical value from t-table 64 | alpha = 0.05 65 | t_alpha = 
stats.t.ppf(alpha,n-1); print ("Critical value from t-table:",round(t_alpha,3)) 66 | #Lower tail p-value from t-table 67 | p_val = stats.t.sf(np.abs(t_smple), n-1); print ("Lower tail p-value from t-table", p_val) 68 | 69 | 70 | # Normal Distribution 71 | from scipy import stats 72 | xbar = 67; mu0 = 52; s = 16.3 73 | 74 | # Calculating z-score 75 | z = (67-52)/16.3 76 | 77 | # Calculating probability under the curve 78 | p_val = 1- stats.norm.cdf(z) 79 | print ("Prob. to score more than 67 is ",round(p_val*100,2),"%") 80 | 81 | 82 | 83 | # Chi-square independence test 84 | import pandas as pd 85 | from scipy import stats 86 | 87 | survey = pd.read_csv("survey.csv") 88 | # Tabulating 2 variables with row & column variables respectively 89 | survey_tab = pd.crosstab(survey.Smoke, survey.Exer, margins = True) 90 | # Creating observed table for analysis 91 | observed = survey_tab.ix[0:4,0:3] 92 | 93 | contg = stats.chi2_contingency(observed= observed) 94 | p_value = round(contg[1],3) 95 | print ("P-value is: ",p_value) 96 | 97 | 98 | 99 | #ANOVA 100 | import pandas as pd 101 | from scipy import stats 102 | 103 | fetilizers = pd.read_csv("fetilizers.csv") 104 | 105 | one_way_anova = stats.f_oneway(fetilizers["fertilizer1"], fetilizers["fertilizer2"], fetilizers["fertilizer3"]) 106 | 107 | print ("Statistic :", round(one_way_anova[0],2),", p-value :",round(one_way_anova[1],3)) 108 | 109 | 110 | 111 | 112 | 113 | # Train & Test split 114 | import pandas as pd 115 | from sklearn.model_selection import train_test_split 116 | 117 | original_data = pd.read_csv("mtcars.csv") 118 | 119 | train_data,test_data = train_test_split(original_data,train_size = 0.7,random_state=42) 120 | 121 | 122 | # Linear Regressio vs. Gradient Descent 123 | 124 | import numpy as np 125 | import pandas as pd 126 | 127 | train_data = pd.read_csv("mtcars.csv") 128 | 129 | X = np.array(train_data["hp"]) ; y = np.array(train_data["mpg"]) 130 | X = X.reshape(32,1); y = y.reshape(32,1) 131 | 132 | from sklearn.linear_model import LinearRegression 133 | model = LinearRegression(fit_intercept = True) 134 | 135 | model.fit(X,y) 136 | print ("Linear Regression Results") 137 | print ("Intercept",model.intercept_[0] ,"Coefficient",model.coef_[0]) 138 | 139 | 140 | def gradient_descent(x, y,learn_rate, conv_threshold,batch_size,max_iter): 141 | converged = False 142 | iter = 0 143 | m = batch_size 144 | 145 | t0 = np.random.random(x.shape[1]) 146 | t1 = np.random.random(x.shape[1]) 147 | 148 | MSE = (sum([(t0 + t1*x[i] - y[i])**2 for i in range(m)])/ m) 149 | 150 | while not converged: 151 | grad0 = 1.0/m * sum([(t0 + t1*x[i] - y[i]) for i in range(m)]) 152 | grad1 = 1.0/m * sum([(t0 + t1*x[i] - y[i])*x[i] for i in range(m)]) 153 | 154 | temp0 = t0 - learn_rate * grad0 155 | temp1 = t1 - learn_rate * grad1 156 | 157 | t0 = temp0 158 | t1 = temp1 159 | 160 | MSE_New = (sum( [ (t0 + t1*x[i] - y[i])**2 for i in range(m)] ) / m) 161 | 162 | if abs(MSE - MSE_New ) <= conv_threshold: 163 | print ('Converged, iterations: ', iter) 164 | converged = True 165 | 166 | MSE = MSE_New 167 | iter += 1 168 | 169 | if iter == max_iter: 170 | print ('Max interactions reached') 171 | converged = True 172 | 173 | return t0,t1 174 | 175 | if __name__ == '__main__': 176 | Inter, Coeff = gradient_descent(x = X,y = y,learn_rate=0.00003 ,conv_threshold=1e-8, batch_size=32,max_iter=1500000) 177 | print ("Gradient Descent Results") 178 | print (('Intercept = %s Coefficient = %s') %(Inter, Coeff)) 179 | 180 | 181 | 182 | 183 | # Train Validation Test split 184 | 
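# (Added note, not in the original code.) For the 32-row mtcars data used below,
# the default 50/25/25 proportions give int(32*0.5) = 16 training rows and
# int(32*0.25) = 8 validation rows, leaving the remaining 8 rows for test.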
185 | import pandas as pd 186 | from sklearn.model_selection import train_test_split 187 | 188 | original_data = pd.read_csv("mtcars.csv") 189 | 190 | 191 | def data_split(dat,trf = 0.5,vlf=0.25,tsf = 0.25): 192 | nrows = dat.shape[0] 193 | trnr = int(nrows*trf) 194 | vlnr = int(nrows*vlf) 195 | 196 | tr_data,rmng = train_test_split(dat,train_size = trnr,random_state=42) 197 | vl_data, ts_data = train_test_split(rmng,train_size = vlnr,random_state=45) 198 | 199 | return (tr_data,vl_data,ts_data) 200 | 201 | 202 | train_data, validation_data, test_data = data_split(original_data,trf=0.5,vlf=0.25,tsf=0.25) 203 | 204 | 205 | 206 | 207 | 208 | # Grid search on Decision Trees 209 | import pandas as pd 210 | from sklearn.tree import DecisionTreeClassifier 211 | from sklearn.model_selection import train_test_split,GridSearchCV 212 | from sklearn.metrics import classification_report,confusion_matrix,accuracy_score 213 | from sklearn.pipeline import Pipeline 214 | 215 | 216 | 217 | input_data = pd.read_csv("ad.csv",header=None) 218 | 219 | X_columns = set(input_data.columns.values) 220 | y = input_data[len(input_data.columns.values)-1] 221 | X_columns.remove(len(input_data.columns.values)-1) 222 | X = input_data[list(X_columns)] 223 | 224 | X_train, X_test,y_train,y_test = train_test_split(X,y,train_size = 0.7,random_state=33) 225 | 226 | pipeline = Pipeline([ 227 | ('clf', DecisionTreeClassifier(criterion='entropy')) 228 | ]) 229 | parameters = { 230 | 'clf__max_depth': (50,100,150), 231 | 'clf__min_samples_split': (2, 3), 232 | 'clf__min_samples_leaf': (1, 2, 3) 233 | } 234 | 235 | grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, scoring='accuracy') 236 | grid_search.fit(X_train, y_train) 237 | 238 | y_pred = grid_search.predict(X_test) 239 | 240 | print ('\n Best score: \n', grid_search.best_score_) 241 | print ('\n Best parameters set: \n') 242 | best_parameters = grid_search.best_estimator_.get_params() 243 | for param_name in sorted(parameters.keys()): 244 | print ('\t%s: %r' % (param_name, best_parameters[param_name])) 245 | print ("\n Confusion Matrix on Test data \n",confusion_matrix(y_test,y_pred)) 246 | print ("\n Test Accuracy \n",accuracy_score(y_test,y_pred)) 247 | print ("\nPrecision Recall f1 table \n",classification_report(y_test, y_pred)) 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 | 264 | 265 | 266 | 267 | 268 | 269 | 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | -------------------------------------------------------------------------------- /Chapter01/Data.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Statistics-for-Machine-Learning/41e73f42da97c164859641c2add6e487cbc77402/Chapter01/Data.zip -------------------------------------------------------------------------------- /Chapter02/Chapter 02_Parallelism of Statistics and Machine Learning.R: -------------------------------------------------------------------------------- 1 | 2 | rm(list = ls()) 3 | 4 | # First change the following directory link to where all the input files do exist 5 | setwd("D:\\Book writing\\Codes\\Chapter 2") 6 | 7 | 8 | 9 | # Simple Linear Regression 10 | wine_quality = read.csv("winequality-red.csv",header=TRUE,sep = ";",check.names = FALSE) 11 | names(wine_quality) <- gsub(" ", "_", names(wine_quality)) 12 | 13 | set.seed(123) 14 | numrow = nrow(wine_quality) 15 | trnind = sample(1:numrow,size = as.integer(0.7*numrow)) 16 | train_data = 
wine_quality[trnind,] 17 | test_data = wine_quality[-trnind,] 18 | 19 | x_train = train_data$alcohol;y_train = train_data$quality 20 | x_test = test_data$alcohol; y_test = test_data$quality 21 | 22 | x_mean = mean(x_train); y_mean = mean(y_train) 23 | x_var = sum((x_train - x_mean)**2) ; y_var = sum((y_train-y_mean)**2) 24 | covariance = sum((x_train-x_mean)*(y_train-y_mean)) 25 | 26 | b1 = covariance/x_var 27 | b0 = y_mean - b1*x_mean 28 | 29 | pred_y = b0+b1*x_test 30 | 31 | R2 <- 1 - (sum((y_test-pred_y )^2)/sum((y_test-mean(y_test))^2)) 32 | print(paste("Test Adjusted R-squared :",round(R2,4))) 33 | 34 | 35 | 36 | 37 | library(usdm) 38 | 39 | # Multi linear Regression 40 | wine_quality = read.csv("winequality-red.csv",header=TRUE,sep = ";",check.names = FALSE) 41 | names(wine_quality) <- gsub(" ", "_", names(wine_quality)) 42 | 43 | set.seed(123) 44 | numrow = nrow(wine_quality) 45 | trnind = sample(1:numrow,size = as.integer(0.7*numrow)) 46 | train_data = wine_quality[trnind,] 47 | test_data = wine_quality[-trnind,] 48 | 49 | xvars = c("volatile_acidity","chlorides","free_sulfur_dioxide", 50 | "total_sulfur_dioxide","pH","sulphates","alcohol") 51 | yvar = "quality" 52 | 53 | frmla = paste(yvar,"~",paste(xvars,collapse = "+")) 54 | lr_fit = lm(as.formula(frmla),data = train_data) 55 | print(summary(lr_fit)) 56 | 57 | #VIF calculation 58 | wine_v2 = train_data[,xvars] 59 | print(vif(wine_v2)) 60 | 61 | #Test prediction 62 | pred_y = predict(lr_fit,newdata = test_data) 63 | R2 <- 1 - (sum((test_data[,yvar]-pred_y )^2)/sum((test_data[,yvar]-mean(test_data[,yvar]))^2)) 64 | print(paste("Test Adjusted R-squared :",R2)) 65 | 66 | 67 | 68 | 69 | # xvars = c("fixed_acidity","volatile_acidity","citric_acid","residual_sugar","chlorides","free_sulfur_dioxide", 70 | # "total_sulfur_dioxide","density","pH","sulphates","alcohol") 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | # Ridge regression 79 | library(glmnet) 80 | 81 | wine_quality = read.csv("winequality-red.csv",header=TRUE,sep = ";",check.names = FALSE) 82 | names(wine_quality) <- gsub(" ", "_", names(wine_quality)) 83 | 84 | set.seed(123) 85 | numrow = nrow(wine_quality) 86 | trnind = sample(1:numrow,size = as.integer(0.7*numrow)) 87 | train_data = wine_quality[trnind,]; test_data = wine_quality[-trnind,] 88 | 89 | xvars = c("fixed_acidity","volatile_acidity","citric_acid","residual_sugar","chlorides","free_sulfur_dioxide", 90 | "total_sulfur_dioxide","density","pH","sulphates","alcohol") 91 | yvar = "quality" 92 | 93 | x_train = as.matrix(train_data[,xvars]);y_train = as.double(as.matrix(train_data[,yvar])) 94 | x_test = as.matrix(test_data[,xvars]) 95 | 96 | print(paste("Ridge Regression")) 97 | lambdas = c(1e-4,1e-3,1e-2,0.1,0.5,1.0,5.0,10.0) 98 | initrsq = 0 99 | for (lmbd in lambdas){ 100 | ridge_fit = glmnet(x_train,y_train,alpha = 0,lambda = lmbd) 101 | pred_y = predict(ridge_fit,x_test) 102 | R2 <- 1 - (sum((test_data[,yvar]-pred_y )^2)/sum((test_data[,yvar]-mean(test_data[,yvar]))^2)) 103 | 104 | if (R2 > initrsq){ 105 | print(paste("Lambda:",lmbd,"Test Adjusted R-squared :",round(R2,4))) 106 | initrsq = R2 107 | } 108 | } 109 | 110 | 111 | 112 | # Lasso Regression 113 | print(paste("Lasso Regression")) 114 | lambdas = c(1e-4,1e-3,1e-2,0.1,0.5,1.0,5.0,10.0) 115 | initrsq = 0 116 | for (lmbd in lambdas){ 117 | lasso_fit = glmnet(x_train,y_train,alpha = 1,lambda = lmbd) 118 | pred_y = predict(lasso_fit,x_test) 119 | R2 <- 1 - (sum((test_data[,yvar]-pred_y )^2)/sum((test_data[,yvar]-mean(test_data[,yvar]))^2)) 120 | 121 | if (R2 > initrsq){ 
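# (Added comment, not in the original code.) Only lambda values that improve on
# the best test R-squared seen so far are printed; initrsq tracks the running
# best, so the last line printed corresponds to the best-performing lambda.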
122 | print(paste("Lambda:",lmbd,"Test Adjusted R-squared :",round(R2,4))) 123 | initrsq = R2 124 | } 125 | } 126 | 127 | 128 | -------------------------------------------------------------------------------- /Chapter02/Chapter 02_Parallelism of Statistics and Machine Learning.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | import os 5 | 6 | 7 | """ First change the following directory link to where all input files do exist """ 8 | os.chdir("D:\\Book writing\\Codes\\Chapter 2") 9 | 10 | 11 | import numpy as np 12 | import pandas as pd 13 | import statsmodels.api as sm 14 | import matplotlib.pyplot as plt 15 | import seaborn as sns 16 | #from sklearn.model_selection import train_test_split 17 | #from sklearn.metrics import r2_score 18 | 19 | 20 | wine_quality = pd.read_csv("winequality-red.csv",sep=';') 21 | # Step for converting white space in columns to _ value for better handling 22 | wine_quality.rename(columns=lambda x: x.replace(" ", "_"), inplace=True) 23 | 24 | # Simple Linear Regression - chart 25 | model = sm.OLS(wine_quality['quality'],sm.add_constant(wine_quality['alcohol'])).fit() 26 | 27 | print (model.summary()) 28 | 29 | plt.scatter(wine_quality['alcohol'],wine_quality['quality'],label = 'Actual Data') 30 | plt.plot(wine_quality['alcohol'],model.params[0]+model.params[1]*wine_quality['alcohol'], 31 | c ='r',label="Regression fit") 32 | plt.title('Wine Quality regressed on Alchohol') 33 | plt.xlabel('Alcohol') 34 | plt.ylabel('Quality') 35 | plt.show() 36 | 37 | 38 | # Simple Linear Regression - Model fit 39 | import pandas as pd 40 | from sklearn.model_selection import train_test_split 41 | from sklearn.metrics import r2_score 42 | 43 | 44 | wine_quality = pd.read_csv("winequality-red.csv",sep=';') 45 | wine_quality.rename(columns=lambda x: x.replace(" ", "_"), inplace=True) 46 | 47 | x_train,x_test,y_train,y_test = train_test_split(wine_quality['alcohol'],wine_quality["quality"],train_size = 0.7,random_state=42) 48 | 49 | x_train = pd.DataFrame(x_train);x_test = pd.DataFrame(x_test) 50 | y_train = pd.DataFrame(y_train);y_test = pd.DataFrame(y_test) 51 | 52 | def mean(values): 53 | return round(sum(values)/float(len(values)),2) 54 | 55 | alcohol_mean = mean(x_train['alcohol']) 56 | quality_mean = mean(y_train['quality']) 57 | 58 | alcohol_variance = round(sum((x_train['alcohol'] - alcohol_mean)**2),2) 59 | quality_variance = round(sum((y_train['quality'] - quality_mean)**2),2) 60 | 61 | covariance = round(sum((x_train['alcohol'] - alcohol_mean) * (y_train['quality'] - quality_mean )),2) 62 | b1 = covariance/alcohol_variance 63 | b0 = quality_mean - b1*alcohol_mean 64 | print ("\n\nIntercept (B0):",round(b0,4),"Co-efficient (B1):",round(b1,4)) 65 | y_test["y_pred"] = pd.DataFrame(b0+b1*x_test['alcohol']) 66 | R_sqrd = 1- ( sum((y_test['quality']-y_test['y_pred'])**2) / sum((y_test['quality'] - mean(y_test['quality']))**2 )) 67 | print ("Test R-squared value:",round(R_sqrd,4)) 68 | 69 | 70 | # Plots - pair plots 71 | eda_colnms = [ 'volatile_acidity', 'chlorides', 'sulphates', 'alcohol','quality'] 72 | sns.set(style='whitegrid',context = 'notebook') 73 | sns.pairplot(wine_quality[eda_colnms],size = 2.5,x_vars= eda_colnms,y_vars=eda_colnms) 74 | plt.show() 75 | 76 | 77 | 78 | # Correlation coefficients 79 | corr_mat = np.corrcoef(wine_quality[eda_colnms].values.T) 80 | sns.set(font_scale=1) 81 | full_mat = sns.heatmap(corr_mat, cbar=True, annot=True, square=True, fmt='.2f', 82 | annot_kws={'size': 15}, yticklabels=eda_colnms, 
xticklabels=eda_colnms) 83 | 84 | plt.show() 85 | 86 | 87 | 88 | # Multi linear regression model 89 | colnms = [ 'volatile_acidity', 'chlorides', 'free_sulfur_dioxide', 'total_sulfur_dioxide', 90 | 'pH', 'sulphates', 'alcohol'] 91 | 92 | 93 | pdx = wine_quality[colnms] 94 | pdy = wine_quality["quality"] 95 | 96 | x_train,x_test,y_train,y_test = train_test_split(pdx,pdy,train_size = 0.7,random_state=42) 97 | x_train_new = sm.add_constant(x_train) 98 | x_test_new = sm.add_constant(x_test) 99 | 100 | #random.seed(434) 101 | full_mod = sm.OLS(y_train,x_train_new) 102 | full_res = full_mod.fit() 103 | print ("\n \n",full_res.summary()) 104 | 105 | 106 | print ("\nVariance Inflation Factor") 107 | cnames = x_train.columns 108 | for i in np.arange(0,len(cnames)): 109 | xvars = list(cnames) 110 | yvar = xvars.pop(i) 111 | mod = sm.OLS(x_train[yvar],sm.add_constant(x_train_new[xvars])) 112 | res = mod.fit() 113 | vif = 1/(1-res.rsquared) 114 | print (yvar,round(vif,3)) 115 | 116 | # Predition of data 117 | y_pred = full_res.predict(x_test_new) 118 | y_pred_df = pd.DataFrame(y_pred) 119 | y_pred_df.columns = ['y_pred'] 120 | pred_data = pd.DataFrame(y_pred_df['y_pred']) 121 | y_test_new = pd.DataFrame(y_test) 122 | #y_test_new.reset_index(inplace=True) 123 | 124 | pred_data['y_test'] = pd.DataFrame(y_test_new['quality']) 125 | 126 | # R-square calculation 127 | rsqd = r2_score(y_test_new['quality'].tolist(), y_pred_df['y_pred'].tolist()) 128 | print ("\nTest R-squared value:",round(rsqd,4)) 129 | 130 | 131 | 132 | 133 | 134 | 135 | # Ridge Regression 136 | from sklearn.linear_model import Ridge 137 | 138 | wine_quality = pd.read_csv("winequality-red.csv",sep=';') 139 | wine_quality.rename(columns=lambda x: x.replace(" ", "_"), inplace=True) 140 | 141 | all_colnms = ['fixed_acidity', 'volatile_acidity', 'citric_acid', 'residual_sugar', 142 | 'chlorides', 'free_sulfur_dioxide', 'total_sulfur_dioxide', 'density', 143 | 'pH', 'sulphates', 'alcohol'] 144 | 145 | 146 | pdx = wine_quality[all_colnms] 147 | pdy = wine_quality["quality"] 148 | 149 | x_train,x_test,y_train,y_test = train_test_split(pdx,pdy,train_size = 0.7,random_state=42) 150 | 151 | alphas = [1e-4,1e-3,1e-2,0.1,0.5,1.0,5.0,10.0] 152 | 153 | initrsq = 0 154 | 155 | print ("\nRidge Regression: Best Parameters\n") 156 | for alph in alphas: 157 | ridge_reg = Ridge(alpha=alph) 158 | ridge_reg.fit(x_train,y_train) 159 | tr_rsqrd = ridge_reg.score(x_train,y_train) 160 | ts_rsqrd = ridge_reg.score(x_test,y_test) 161 | 162 | if ts_rsqrd > initrsq: 163 | print ("Lambda: ",alph,"Train R-Squared value:",round(tr_rsqrd,5),"Test R-squared value:",round(ts_rsqrd,5)) 164 | initrsq = ts_rsqrd 165 | 166 | # Coeffients of Ridge regression of best alpha value 167 | ridge_reg = Ridge(alpha=0.001) 168 | ridge_reg.fit(x_train,y_train) 169 | 170 | 171 | print ("\nRidge Regression coefficient values of Alpha = 0.001\n") 172 | for i in range(11): 173 | print (all_colnms[i],": ",ridge_reg.coef_[i]) 174 | 175 | # Lasso Regression 176 | from sklearn.linear_model import Lasso 177 | 178 | alphas = [1e-4,1e-3,1e-2,0.1,0.5,1.0,5.0,10.0] 179 | initrsq = 0 180 | print ("\nLasso Regression: Best Parameters\n") 181 | 182 | for alph in alphas: 183 | lasso_reg = Lasso(alpha=alph) 184 | lasso_reg.fit(x_train,y_train) 185 | tr_rsqrd = lasso_reg.score(x_train,y_train) 186 | ts_rsqrd = lasso_reg.score(x_test,y_test) 187 | 188 | if ts_rsqrd > initrsq: 189 | print ("Lambda: ",alph,"Train R-Squared value:",round(tr_rsqrd,5),"Test R-squared value:",round(ts_rsqrd,5)) 190 | initrsq = 
ts_rsqrd 191 | 192 | # Coeffients of Lasso regression of best alpha value 193 | lasso_reg = Lasso(alpha=0.001) 194 | lasso_reg.fit(x_train,y_train) 195 | 196 | print ("\nLasso Regression coefficient values of Alpha = 0.001\n") 197 | for i in range(11): 198 | print (all_colnms[i],": ",lasso_reg.coef_[i]) 199 | 200 | -------------------------------------------------------------------------------- /Chapter03/Chapter 03_Logistic Regression vs Random Forest.R: -------------------------------------------------------------------------------- 1 | 2 | rm(list = ls()) 3 | 4 | # First change the following directory link to where all the input files do exist 5 | setwd("D:\\Book writing\\Codes\\Chapter 3") 6 | 7 | library(mctest) 8 | library(dummies) 9 | library(Information) 10 | library(pROC) 11 | 12 | credit_data = read.csv("credit_data.csv") 13 | credit_data$class = credit_data$class-1 14 | 15 | # I.V Calculation 16 | IV <- create_infotables(data=credit_data, y="class", parallel=FALSE) 17 | for (i in 1:length(colnames(credit_data))-1){ 18 | seca = IV[[1]][i][1] 19 | sum(seca[[1]][5]) 20 | print(paste(colnames(credit_data)[i],",IV_Value:",round(sum(seca[[1]][5]),4))) 21 | } 22 | 23 | # Dummy variables creation 24 | dummy_stseca =data.frame(dummy(credit_data$Status_of_existing_checking_account)) 25 | dummy_ch = data.frame(dummy(credit_data$Credit_history)) 26 | dummy_purpose = data.frame(dummy(credit_data$Purpose)) 27 | dummy_savacc = data.frame(dummy(credit_data$Savings_Account)) 28 | dummy_presc = data.frame(dummy(credit_data$Present_Employment_since)) 29 | dummy_perssx = data.frame(dummy(credit_data$Personal_status_and_sex)) 30 | dummy_othdts = data.frame(dummy(credit_data$Other_debtors)) 31 | dummy_property = data.frame(dummy(credit_data$Property)) 32 | dummy_othinstpln = data.frame(dummy(credit_data$Other_installment_plans)) 33 | dummy_forgnwrkr = data.frame(dummy(credit_data$Foreign_worker)) 34 | 35 | # Cleaning the variables name from . 
to _ 36 | colClean <- function(x){ colnames(x) <- gsub("\\.", "_", colnames(x)); x } 37 | dummy_stseca = colClean(dummy_stseca) ;dummy_ch = colClean(dummy_ch) 38 | dummy_purpose = colClean(dummy_purpose); dummy_savacc= colClean(dummy_savacc) 39 | dummy_presc= colClean(dummy_presc);dummy_perssx= colClean(dummy_perssx); 40 | dummy_othdts= colClean(dummy_othdts);dummy_property= colClean(dummy_property); 41 | dummy_othinstpln= colClean(dummy_othinstpln);dummy_forgnwrkr= colClean(dummy_forgnwrkr); 42 | 43 | 44 | continuous_columns = c('Duration_in_month', 'Credit_amount','Installment_rate_in_percentage_of_disposable_income', 45 | 'Age_in_years','Number_of_existing_credits_at_this_bank') 46 | 47 | credit_continuous = credit_data[,continuous_columns] 48 | credit_data_new = cbind(dummy_stseca,dummy_ch,dummy_purpose,dummy_savacc,dummy_presc,dummy_perssx, 49 | dummy_othdts,dummy_property,dummy_othinstpln,dummy_forgnwrkr,credit_continuous,credit_data$class) 50 | 51 | colnames(credit_data_new)[51] <- "class" 52 | 53 | # Setting seed for repeatability of results of train & test split 54 | set.seed(123) 55 | numrow = nrow(credit_data_new) 56 | trnind = sample(1:numrow,size = as.integer(0.7*numrow)) 57 | train_data = credit_data_new[trnind,] 58 | test_data = credit_data_new[-trnind,] 59 | 60 | remove_cols_extra_dummy = c("Status_of_existing_checking_account_A11","Credit_history_A30", 61 | "Purpose_A40","Savings_Account_A61","Present_Employment_since_A71","Personal_status_and_sex_A91", 62 | "Other_debtors_A101","Property_A121","Other_installment_plans_A141","Foreign_worker_A201") 63 | 64 | # Removing insignificant variables one by one 65 | remove_cols_insig = c("Purpose_A46","Purpose_A45","Purpose_A44","Savings_Account_A63", "Other_installment_plans_A143", 66 | "Property_A123","Status_of_existing_checking_account_A12", 67 | "Present_Employment_since_A72","Present_Employment_since_A75", 68 | "Present_Employment_since_A73","Credit_history_A32","Credit_history_A33", 69 | "Purpose_A40","Present_Employment_since_A74","Purpose_A49","Purpose_A48", 70 | "Property_A122","Personal_status_and_sex_A92","Foreign_worker_A202", 71 | "Personal_status_and_sex_A94","Purpose_A42","Other_debtors_A102", 72 | "Age_in_years","Savings_Account_A64","Savings_Account_A62", 73 | "Savings_Account_A65", "Other_debtors_A103") 74 | 75 | remove_cols = c(remove_cols_extra_dummy,remove_cols_insig) 76 | 77 | glm_fit = glm(class ~.,family = "binomial",data = train_data[,!(names(train_data) %in% remove_cols)]) 78 | # Significance check - p_value 79 | summary(glm_fit) 80 | 81 | # Multi collinearity check - VIF 82 | remove_cols_vif = c(remove_cols,"class") 83 | vif_table = imcdiag(train_data[,!(names(train_data) %in% remove_cols_vif)],train_data$class,detr=0.001, conf=0.99) 84 | vif_table 85 | 86 | # Predicting probabilities 87 | train_data$glm_probs = predict(glm_fit,newdata = train_data,type = "response") 88 | test_data$glm_probs = predict(glm_fit,newdata = test_data,type = "response") 89 | 90 | # Area under ROC 91 | 92 | ROC1 <- roc(as.factor(train_data$class),train_data$glm_probs) 93 | plot(ROC1, col = "blue") 94 | print(paste("Area under the curve",round(auc(ROC1),4))) 95 | 96 | # Actual prediction based on threshold tuning 97 | threshold_vals = c(0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9) 98 | for (thld in threshold_vals){ 99 | train_data$glm_pred = 0 100 | train_data$glm_pred[train_data$glm_probs>thld]=1 101 | 102 | tble = table(train_data$glm_pred,train_data$class) 103 | acc = (tble[1,1]+tble[2,2])/sum(tble) 104 | 
print(paste("Threshold",thld,"Train accuracy",round(acc,4))) 105 | 106 | } 107 | 108 | # Best threshold from above search is 0.5 with accuracy as 0.7841 109 | best_threshold = 0.5 110 | 111 | # Train confusion matrix & accuracy 112 | train_data$glm_pred = 0 113 | train_data$glm_pred[train_data$glm_probs>best_threshold]=1 114 | tble = table(train_data$glm_pred,train_data$class) 115 | acc = (tble[1,1]+tble[2,2])/sum(tble) 116 | print(paste("Confusion Matrix - Train Data")) 117 | print(tble) 118 | print(paste("Train accuracy",round(acc,4))) 119 | 120 | # Test confusion matrix & accuracy 121 | test_data$glm_pred = 0 122 | test_data$glm_pred[test_data$glm_probs>best_threshold]=1 123 | tble_test = table(test_data$glm_pred,test_data$class) 124 | acc_test = (tble_test[1,1]+tble_test[2,2])/sum(tble_test) 125 | print(paste("Confusion Matrix - Test Data")) 126 | print(tble_test) 127 | print(paste("Test accuracy",round(acc_test,4))) 128 | 129 | 130 | 131 | # Random Forest 132 | library(randomForest) 133 | library(e1071) 134 | 135 | credit_data = read.csv("credit_data.csv") 136 | 137 | credit_data$class = credit_data$class-1 138 | credit_data$class = as.factor(credit_data$class) 139 | 140 | set.seed(123) 141 | numrow = nrow(credit_data) 142 | trnind = sample(1:numrow,size = as.integer(0.7*numrow)) 143 | train_data = credit_data[trnind,] 144 | test_data = credit_data[-trnind,] 145 | 146 | rf_fit = randomForest(class~.,data = train_data,mtry=4,maxnodes= 2000,ntree=1000,nodesize = 2) 147 | rf_pred = predict(rf_fit,data = train_data,type = "response") 148 | rf_predt = predict(rf_fit,newdata = test_data,type = "response") 149 | 150 | tble = table(train_data$class,rf_pred) 151 | tblet = table(test_data$class,rf_predt) 152 | 153 | acc = (tble[1,1]+tble[2,2])/sum(tble) 154 | acct = (tblet[1,1]+tblet[2,2])/sum(tblet) 155 | print(paste("Train acc",round(acc,4),"Test acc",round(acct,4))) 156 | 157 | # Grid Search 158 | rf_grid = tune(randomForest,class~.,data = train_data,ranges = list( 159 | mtry = c(4,5), 160 | maxnodes = c(700,1000), 161 | ntree = c(1000,2000,3000), 162 | nodesize = c(1,2) 163 | ), 164 | tunecontrol = tune.control(cross = 5) 165 | ) 166 | 167 | summary(rf_grid) 168 | 169 | best_model = rf_grid$best.model 170 | summary(best_model) 171 | 172 | y_pred_train = predict(best_model,data = train_data) 173 | train_conf_mat = table(train_data$class,y_pred_train) 174 | 175 | print(paste("Train Confusion Matrix - Grid Search:")) 176 | print(train_conf_mat) 177 | 178 | train_acc = (train_conf_mat[1,1]+train_conf_mat[2,2])/sum(train_conf_mat) 179 | print(paste("Train_accuracy-Grid Search:",round(train_acc,4))) 180 | 181 | y_pred_test = predict(best_model,newdata = test_data) 182 | test_conf_mat = table(test_data$class,y_pred_test) 183 | 184 | print(paste("Test Confusion Matrix - Grid Search:")) 185 | print(test_conf_mat) 186 | 187 | test_acc = (test_conf_mat[1,1]+test_conf_mat[2,2])/sum(test_conf_mat) 188 | print(paste("Test_accuracy-Grid Search:",round(test_acc,4))) 189 | 190 | # Variable Importance 191 | vari = varImpPlot(best_model) 192 | print(paste("Variable Importance - Table")) 193 | print(vari) 194 | 195 | 196 | 197 | -------------------------------------------------------------------------------- /Chapter03/Chapter 03_Logistic Regression vs Random Forest.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | import os 5 | 6 | """ First change the following directory link to where all input files do exist """ 7 | 8 | os.chdir("D:\\Book 
writing\\Codes\\Chapter 3") 9 | 10 | import numpy as np 11 | import pandas as pd 12 | 13 | from sklearn.model_selection import train_test_split 14 | from sklearn.metrics import accuracy_score,classification_report 15 | 16 | credit_data = pd.read_csv("credit_data.csv") 17 | 18 | 19 | print (credit_data.head()) 20 | credit_data['class'] = credit_data['class']-1 21 | 22 | 23 | # Calculation of IV metrics 24 | def IV_calc(data,var): 25 | if data[var].dtypes == "object": 26 | dataf = data.groupby([var])['class'].agg(['count','sum']) 27 | dataf.columns = ["Total","bad"] 28 | dataf["good"] = dataf["Total"] - dataf["bad"] 29 | dataf["bad_per"] = dataf["bad"]/dataf["bad"].sum() 30 | dataf["good_per"] = dataf["good"]/dataf["good"].sum() 31 | dataf["I_V"] = (dataf["good_per"] - dataf["bad_per"]) * np.log(dataf["good_per"]/dataf["bad_per"]) 32 | return dataf 33 | else: 34 | data['bin_var'] = pd.qcut(data[var].rank(method='first'),10) 35 | dataf = data.groupby(['bin_var'])['class'].agg(['count','sum']) 36 | dataf.columns = ["Total","bad"] 37 | dataf["good"] = dataf["Total"] - dataf["bad"] 38 | dataf["bad_per"] = dataf["bad"]/dataf["bad"].sum() 39 | dataf["good_per"] = dataf["good"]/dataf["good"].sum() 40 | dataf["I_V"] = (dataf["good_per"] - dataf["bad_per"]) * np.log(dataf["good_per"]/dataf["bad_per"]) 41 | return dataf 42 | 43 | 44 | IV_calc(credit_data,'Status_of_existing_checking_account') 45 | 46 | print ("\n\nCredit History - Information Value\n") 47 | print (IV_calc(credit_data,'Credit_history')) 48 | 49 | print ("\n\nCredit History - Duration in month\n") 50 | print (IV_calc(credit_data,'Duration_in_month')) 51 | 52 | 53 | print ("\n\nInformation Value by descending order\n") 54 | discrete_columns = ['Status_of_existing_checking_account','Credit_history','Purpose','Savings_Account', 55 | 'Present_Employment_since','Personal_status_and_sex','Other_debtors','Property', 56 | 'Other_installment_plans','Housing','Job','Telephone','Foreign_worker'] 57 | 58 | continuous_columns = ['Duration_in_month', 'Credit_amount','Installment_rate_in_percentage_of_disposable_income', 59 | 'Present_residence_since', 'Age_in_years','Number_of_existing_credits_at_this_bank', 60 | 'Number_of_People_being_liable_to_provide_maintenance_for'] 61 | 62 | total_columns = discrete_columns + continuous_columns 63 | 64 | # List of IV values 65 | Iv_list = [] 66 | for col in total_columns: 67 | assigned_data = IV_calc(data = credit_data,var = col) 68 | iv_val = round(assigned_data["I_V"].sum(),3) 69 | dt_type = credit_data[col].dtypes 70 | Iv_list.append((iv_val,col,dt_type)) 71 | 72 | Iv_list = sorted(Iv_list,reverse = True) 73 | 74 | for i in range(len(Iv_list)): 75 | print (Iv_list[i][0],",",Iv_list[i][1],",type =",Iv_list[i][2]) 76 | 77 | 78 | # Retaining top 15 variables 79 | dummy_stseca = pd.get_dummies(credit_data['Status_of_existing_checking_account'], prefix='status_exs_accnt') 80 | dummy_ch = pd.get_dummies(credit_data['Credit_history'], prefix='cred_hist') 81 | dummy_purpose = pd.get_dummies(credit_data['Purpose'], prefix='purpose') 82 | dummy_savacc = pd.get_dummies(credit_data['Savings_Account'], prefix='sav_acc') 83 | dummy_presc = pd.get_dummies(credit_data['Present_Employment_since'], prefix='pre_emp_snc') 84 | dummy_perssx = pd.get_dummies(credit_data['Personal_status_and_sex'], prefix='per_stat_sx') 85 | dummy_othdts = pd.get_dummies(credit_data['Other_debtors'], prefix='oth_debtors') 86 | 87 | 88 | dummy_property = pd.get_dummies(credit_data['Property'], prefix='property') 89 | dummy_othinstpln = 
pd.get_dummies(credit_data['Other_installment_plans'], prefix='oth_inst_pln') 90 | dummy_forgnwrkr = pd.get_dummies(credit_data['Foreign_worker'], prefix='forgn_wrkr') 91 | 92 | #dummy_housing = pd.get_dummies(credit_data['Housing'], prefix='housing') 93 | #dummy_job = pd.get_dummies(credit_data['Job'], prefix='job') 94 | #dummy_telephn = pd.get_dummies(credit_data['Telephone'], prefix='telephn') 95 | 96 | 97 | continuous_columns = ['Duration_in_month', 'Credit_amount','Installment_rate_in_percentage_of_disposable_income', 98 | 'Age_in_years','Number_of_existing_credits_at_this_bank' ] 99 | 100 | 101 | credit_continuous = credit_data[continuous_columns] 102 | credit_data_new = pd.concat([dummy_stseca,dummy_ch,dummy_purpose,dummy_savacc,dummy_presc,dummy_perssx, 103 | dummy_property,dummy_othinstpln,dummy_othdts, 104 | dummy_forgnwrkr,credit_continuous,credit_data['class']],axis=1) 105 | 106 | x_train,x_test,y_train,y_test = train_test_split(credit_data_new.drop(['class'],axis=1),credit_data_new['class'],train_size = 0.7,random_state=42) 107 | 108 | y_train = pd.DataFrame(y_train) 109 | y_test = pd.DataFrame(y_test) 110 | 111 | 112 | # Logistic Regression 113 | remove_cols_extra_dummy = ['status_exs_accnt_A11','cred_hist_A30','purpose_A40','sav_acc_A61','pre_emp_snc_A71', 114 | 'per_stat_sx_A91','oth_debtors_A101','property_A121','oth_inst_pln_A141','forgn_wrkr_A201'] 115 | 116 | #'housing_A151','job_A171','telephn_A191', 117 | 118 | 119 | 120 | 121 | remove_cols_insig = ['purpose_A46','purpose_A45','purpose_A44','sav_acc_A63','oth_inst_pln_A143', 122 | 'property_A123','status_exs_accnt_A12','pre_emp_snc_A72','pre_emp_snc_A75', 123 | 'pre_emp_snc_A73','cred_hist_A32','cred_hist_A33','purpose_A410','pre_emp_snc_A74', 124 | 'purpose_A49','purpose_A48','property_A122','per_stat_sx_A92','forgn_wrkr_A202', 125 | 'per_stat_sx_A94','purpose_A42','oth_debtors_A102','Age_in_years','sav_acc_A64', 126 | 'sav_acc_A62','sav_acc_A65','oth_debtors_A103'] 127 | 128 | remove_cols = list(set(remove_cols_extra_dummy+remove_cols_insig)) 129 | 130 | 131 | import statsmodels.api as sm 132 | logistic_model = sm.Logit(y_train,sm.add_constant(x_train.drop(remove_cols,axis=1))).fit() 133 | print (logistic_model.summary()) 134 | 135 | 136 | # Calculation of VIF 137 | print ("\nVariance Inflation Factor") 138 | cnames = x_train.drop(remove_cols,axis=1).columns 139 | for i in np.arange(0,len(cnames)): 140 | xvars = list(cnames) 141 | yvar = xvars.pop(i) 142 | mod = sm.OLS(x_train.drop(remove_cols,axis=1)[yvar],sm.add_constant(x_train.drop(remove_cols,axis=1)[xvars])) 143 | res = mod.fit() 144 | vif = 1/(1-res.rsquared) 145 | print (yvar,round(vif,3)) 146 | 147 | 148 | y_pred = pd.DataFrame(logistic_model.predict(sm.add_constant(x_train.drop(remove_cols,axis=1)))) 149 | y_pred.columns = ["probs"] 150 | #both = pd.concat([y_train.reset_index(drop=True),y_pred],axis=1) 151 | 152 | both = pd.concat([y_train,y_pred],axis=1) 153 | 154 | zeros = both[['class','probs']][both['class']==0] 155 | ones = both[['class','probs']][both['class']==1] 156 | 157 | def df_crossjoin(df1, df2, **kwargs): 158 | df1['_tmpkey'] = 1 159 | df2['_tmpkey'] = 1 160 | res = pd.merge(df1, df2, on='_tmpkey', **kwargs).drop('_tmpkey', axis=1) 161 | res.index = pd.MultiIndex.from_product((df1.index, df2.index)) 162 | df1.drop('_tmpkey', axis=1, inplace=True) 163 | df2.drop('_tmpkey', axis=1, inplace=True) 164 | return res 165 | 166 | joined_data = df_crossjoin(ones,zeros) 167 | 168 | joined_data['concordant_pair'] = 0 169 | 
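# (Added explanatory comments, not in the original code.) Each row of
# joined_data pairs one actual event (class 1, probability probs_x) with one
# actual non-event (class 0, probability probs_y). A pair is concordant when
# the event receives the higher predicted probability, discordant when it
# receives the lower one, and tied when the two probabilities are equal; the
# C-statistic computed below is 0.5 + (proportion concordant - proportion discordant)/2.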
joined_data.loc[joined_data['probs_x'] > joined_data['probs_y'],'concordant_pair'] =1 170 | joined_data['discordant_pair'] = 0 171 | joined_data.loc[joined_data['probs_x'] < joined_data['probs_y'],'discordant_pair'] =1 172 | joined_data['tied_pair'] = 0 173 | joined_data.loc[joined_data['probs_x'] == joined_data['probs_y'],'tied_pair'] =1 174 | p_conc = (sum(joined_data['concordant_pair'])*1.0 )/ (joined_data.shape[0]) 175 | p_disc = (sum(joined_data['discordant_pair'])*1.0 )/ (joined_data.shape[0]) 176 | 177 | 178 | c_statistic = 0.5 + (p_conc - p_disc)/2.0 179 | print ("\nC-statistic:",round(c_statistic,4)) 180 | 181 | 182 | 183 | # ROC & AUC 184 | import matplotlib.pyplot as plt 185 | from sklearn import metrics 186 | from sklearn.metrics import auc 187 | fpr, tpr, thresholds = metrics.roc_curve(both['class'],both['probs'], pos_label=1) 188 | 189 | roc_auc = auc(fpr,tpr) 190 | plt.figure() 191 | lw = 2 192 | plt.plot(fpr, tpr, color='darkorange', 193 | lw=lw, label='ROC curve (area = %0.2f)' % roc_auc) 194 | plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--') 195 | plt.xlim([0.0, 1.0]) 196 | plt.ylim([0.0, 1.05]) 197 | plt.xlabel('False Positive Rate (1-Specificity)') 198 | plt.ylabel('True Positive Rate') 199 | plt.title('ROC Curve - German Credit Data') 200 | plt.legend(loc="lower right") 201 | plt.show() 202 | 203 | # Tuning for threshold 204 | for i in list(np.arange(0,1,0.1)): 205 | both["y_pred"] = 0 206 | both.loc[both["probs"] > i, 'y_pred'] = 1 207 | print ("Threshold",i,"Train Accuracy:",round(accuracy_score(both['class'],both['y_pred']),4)) 208 | 209 | # Implement best threshold on train data 210 | both["y_pred"] = 0 211 | both.loc[both["probs"] > 0.5, 'y_pred'] = 1 212 | print ("\nTrain Confusion Matrix\n\n",pd.crosstab(both['class'],both['y_pred'],rownames = ["Actuall"],colnames = ["Predicted"])) 213 | print ("\nTrain Accuracy:",round(accuracy_score(both['class'],both['y_pred']),4)) 214 | 215 | # Predicting test output 216 | y_pred_test = pd.DataFrame(logistic_model.predict(sm.add_constant(x_test.drop(remove_cols,axis=1)))) 217 | y_pred_test.columns = ["probs"] 218 | 219 | #both_test = pd.concat([y_test.reset_index(drop=True),y_pred_test],axis=1) 220 | both_test = pd.concat([y_test,y_pred_test],axis=1) 221 | both_test["y_pred"] = 0 222 | both_test.loc[both_test["probs"] > 0.5, 'y_pred'] = 1 223 | print ("\nTest Confusion Matrix\n\n",pd.crosstab(both_test['class'],both_test['y_pred'],rownames = ["Actuall"],colnames = ["Predicted"])) 224 | print ("\nTest Accuracy:",round(accuracy_score(both_test['class'],both_test['y_pred']),4)) 225 | 226 | 227 | # Random Forest - Scikit Learn 228 | import pandas as pd 229 | from sklearn.ensemble import RandomForestClassifier 230 | 231 | credit_data = pd.read_csv("credit_data.csv") 232 | credit_data['class'] = credit_data['class']-1 233 | 234 | dummy_stseca = pd.get_dummies(credit_data['Status_of_existing_checking_account'], prefix='status_exs_accnt') 235 | dummy_ch = pd.get_dummies(credit_data['Credit_history'], prefix='cred_hist') 236 | dummy_purpose = pd.get_dummies(credit_data['Purpose'], prefix='purpose') 237 | dummy_savacc = pd.get_dummies(credit_data['Savings_Account'], prefix='sav_acc') 238 | dummy_presc = pd.get_dummies(credit_data['Present_Employment_since'], prefix='pre_emp_snc') 239 | dummy_perssx = pd.get_dummies(credit_data['Personal_status_and_sex'], prefix='per_stat_sx') 240 | dummy_othdts = pd.get_dummies(credit_data['Other_debtors'], prefix='oth_debtors') 241 | dummy_property = 
pd.get_dummies(credit_data['Property'], prefix='property') 242 | dummy_othinstpln = pd.get_dummies(credit_data['Other_installment_plans'], prefix='oth_inst_pln') 243 | dummy_housing = pd.get_dummies(credit_data['Housing'], prefix='housing') 244 | dummy_job = pd.get_dummies(credit_data['Job'], prefix='job') 245 | dummy_telephn = pd.get_dummies(credit_data['Telephone'], prefix='telephn') 246 | dummy_forgnwrkr = pd.get_dummies(credit_data['Foreign_worker'], prefix='forgn_wrkr') 247 | 248 | continuous_columns = ['Duration_in_month', 'Credit_amount','Installment_rate_in_percentage_of_disposable_income', 249 | 'Present_residence_since', 'Age_in_years','Number_of_existing_credits_at_this_bank', 250 | 'Number_of_People_being_liable_to_provide_maintenance_for'] 251 | 252 | credit_continuous = credit_data[continuous_columns] 253 | credit_data_new = pd.concat([dummy_stseca,dummy_ch,dummy_purpose,dummy_savacc,dummy_presc,dummy_perssx, 254 | dummy_othdts,dummy_property,dummy_othinstpln,dummy_housing,dummy_job, 255 | dummy_telephn,dummy_forgnwrkr,credit_continuous,credit_data['class']],axis=1) 256 | 257 | x_train,x_test,y_train,y_test = train_test_split(credit_data_new.drop(['class'],axis=1),credit_data_new['class'],train_size = 0.7,random_state=42) 258 | 259 | 260 | rf_fit = RandomForestClassifier(n_estimators=1000,criterion="gini",max_depth=100,min_samples_split=3,min_samples_leaf=2) 261 | rf_fit.fit(x_train,y_train) 262 | 263 | print ("\nRandom Forest - Train Confusion Matrix\n\n",pd.crosstab(y_train,rf_fit.predict(x_train),rownames = ["Actuall"],colnames = ["Predicted"])) 264 | print ("\nRandom Forest - Train accuracy",round(accuracy_score(y_train,rf_fit.predict(x_train)),3)) 265 | 266 | print ("\n\nRandom Forest - Test Confusion Matrix\n\n",pd.crosstab(y_test,rf_fit.predict(x_test),rownames = ["Actuall"],colnames = ["Predicted"])) 267 | print ("\nRandom Forest - Test accuracy",round(accuracy_score(y_test,rf_fit.predict(x_test)),3)) 268 | 269 | 270 | 271 | # Grid Search 272 | from sklearn.pipeline import Pipeline 273 | from sklearn.model_selection import train_test_split,GridSearchCV 274 | 275 | pipeline = Pipeline([ 276 | ('clf',RandomForestClassifier(criterion='gini')) ]) 277 | 278 | parameters = { 279 | 'clf__n_estimators':(1000,2000,3000), 280 | 'clf__max_depth':(100,200,300), 281 | 'clf__min_samples_split':(2,3), 282 | 'clf__min_samples_leaf':(1,2) } 283 | 284 | grid_search = GridSearchCV(pipeline,parameters,n_jobs=-1,cv=5,verbose=1,scoring='accuracy') 285 | grid_search.fit(x_train,y_train) 286 | 287 | 288 | print ('Best Training score: %0.3f' % grid_search.best_score_) 289 | print ('Best parameters set:') 290 | best_parameters = grid_search.best_estimator_.get_params() 291 | for param_name in sorted(parameters.keys()): 292 | print ('\t%s: %r' % (param_name, best_parameters[param_name])) 293 | 294 | predictions = grid_search.predict(x_test) 295 | 296 | print ("Testing accuracy:",round(accuracy_score(y_test, predictions),4)) 297 | print ("\nComplete report of Testing data\n",classification_report(y_test, predictions)) 298 | 299 | print ("\n\nRandom Forest Grid Search- Test Confusion Matrix\n\n",pd.crosstab(y_test, predictions,rownames = ["Actuall"],colnames = ["Predicted"])) 300 | 301 | 302 | # Variable Importance chart 303 | import matplotlib.pyplot as plt 304 | rf_fit = RandomForestClassifier(n_estimators=1000,criterion="gini",max_depth=300,min_samples_split=3,min_samples_leaf=1) 305 | rf_fit.fit(x_train,y_train) 306 | 307 | importances = rf_fit.feature_importances_ 308 | std = 
np.std([tree.feature_importances_ for tree in rf_fit.estimators_], axis=0) 309 | indices = np.argsort(importances)[::-1] 310 | 311 | colnames = list(x_train.columns) 312 | # Print the feature ranking 313 | print("\nFeature ranking:\n") 314 | for f in range(x_train.shape[1]): 315 | print ("Feature",indices[f],",",colnames[indices[f]],round(importances[indices[f]],4)) 316 | 317 | plt.figure() 318 | #plt.title("Variable importance") 319 | plt.bar(range(x_train.shape[1]), importances[indices], 320 | color="r", yerr=std[indices], align="center") 321 | plt.xticks(range(x_train.shape[1]), indices) 322 | plt.xlim([-1, x_train.shape[1]]) 323 | plt.show() 324 | 325 | 326 | indexi = list(indices) 327 | colnms = list(x_train.columns[indices]) 328 | impclnms = list(importances[indices]) 329 | 330 | 331 | print ("\nVariable Importance Values\n") 332 | for i in range(len(importances)): 333 | print ("Variable Index",indexi[i],",",colnms[i],",",round(impclnms[i],4)) 334 | 335 | -------------------------------------------------------------------------------- /Chapter04/Chapter 04_Tree based ML Models.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | rm(list = ls()) 5 | 6 | # First change the following directory link to where all the input files do exist 7 | setwd("D:\\Book writing\\Codes\\Chapter 4") 8 | 9 | 10 | library(randomForest) 11 | library(gbm) 12 | library(xgboost) 13 | library(C50) 14 | library(e1071) 15 | library(caret) 16 | 17 | hrattr_data = read.csv("WA_Fn-UseC_-HR-Employee-Attrition.csv") 18 | 19 | str(hrattr_data) 20 | summary(hrattr_data) 21 | 22 | hrattr_data$Attrition_ind = 0;hrattr_data$Attrition_ind[hrattr_data$Attrition=="Yes"]=1 23 | hrattr_data$Attrition_ind = as.factor(hrattr_data$Attrition_ind) 24 | 25 | remove_cols = c("EmployeeCount","EmployeeNumber","Over18","StandardHours","Attrition") 26 | hrattr_data_new = hrattr_data[,!(names(hrattr_data) %in% remove_cols)] 27 | 28 | set.seed(123) 29 | numrow = nrow(hrattr_data_new) 30 | trnind = sample(1:numrow,size = as.integer(0.7*numrow)) 31 | train_data = hrattr_data_new[trnind,] 32 | test_data = hrattr_data_new[-trnind,] 33 | 34 | frac_trzero = (table(train_data$Attrition_ind)[[1]])/nrow(train_data) 35 | frac_trone = (table(train_data$Attrition_ind)[[2]])/nrow(train_data) 36 | 37 | frac_tszero = (table(test_data$Attrition_ind)[[1]])/nrow(test_data) 38 | frac_tsone = (table(test_data$Attrition_ind)[[2]])/nrow(test_data) 39 | 40 | prec_zero <- function(act,pred){ tble = table(act,pred) 41 | return( round( tble[1,1]/(tble[1,1]+tble[2,1]),4) ) } 42 | 43 | prec_one <- function(act,pred){ tble = table(act,pred) 44 | return( round( tble[2,2]/(tble[2,2]+tble[1,2]),4) ) } 45 | 46 | recl_zero <- function(act,pred){tble = table(act,pred) 47 | return( round( tble[1,1]/(tble[1,1]+tble[1,2]),4) ) } 48 | 49 | recl_one <- function(act,pred){ tble = table(act,pred) 50 | return( round( tble[2,2]/(tble[2,2]+tble[2,1]),4) ) } 51 | 52 | accrcy <- function(act,pred){ tble = table(act,pred) 53 | return( round((tble[1,1]+tble[2,2])/sum(tble),4)) } 54 | 55 | 56 | # Decision Trees using C5.0 package 57 | library(C50) 58 | dtree_fit = C5.0(train_data[-31],train_data$Attrition_ind,costs = NULL, 59 | control = C5.0Control(minCases = 1)) 60 | 61 | summary(dtree_fit) 62 | 63 | tr_y_pred = predict(dtree_fit, train_data,type = "class") 64 | ts_y_pred = predict(dtree_fit,test_data,type = "class") 65 | 66 | tr_y_act = train_data$Attrition_ind;ts_y_act = test_data$Attrition_ind 67 | 68 | tr_tble = 
table(tr_y_act,tr_y_pred) 69 | print(paste("Train Confusion Matrix")) 70 | print(tr_tble) 71 | 72 | tr_acc = accrcy(tr_y_act,tr_y_pred) 73 | trprec_zero = prec_zero(tr_y_act,tr_y_pred); trrecl_zero = recl_zero(tr_y_act,tr_y_pred) 74 | trprec_one = prec_one(tr_y_act,tr_y_pred); trrecl_one = recl_one(tr_y_act,tr_y_pred) 75 | 76 | trprec_ovll = trprec_zero *frac_trzero + trprec_one*frac_trone 77 | trrecl_ovll = trrecl_zero *frac_trzero + trrecl_one*frac_trone 78 | 79 | print(paste("Decision Tree Train accuracy:",tr_acc)) 80 | print(paste("Decision Tree - Train Classification Report")) 81 | print(paste("Zero_Precision",trprec_zero,"Zero_Recall",trrecl_zero)) 82 | print(paste("One_Precision",trprec_one,"One_Recall",trrecl_one)) 83 | print(paste("Overall_Precision",round(trprec_ovll,4),"Overall_Recall",round(trrecl_ovll,4))) 84 | 85 | ts_tble = table(ts_y_act,ts_y_pred) 86 | print(paste("Test Confusion Matrix")) 87 | print(ts_tble) 88 | 89 | ts_acc = accrcy(ts_y_act,ts_y_pred) 90 | tsprec_zero = prec_zero(ts_y_act,ts_y_pred); tsrecl_zero = recl_zero(ts_y_act,ts_y_pred) 91 | tsprec_one = prec_one(ts_y_act,ts_y_pred); tsrecl_one = recl_one(ts_y_act,ts_y_pred) 92 | 93 | tsprec_ovll = tsprec_zero *frac_tszero + tsprec_one*frac_tsone 94 | tsrecl_ovll = tsrecl_zero *frac_tszero + tsrecl_one*frac_tsone 95 | 96 | print(paste("Decision Tree Test accuracy:",ts_acc)) 97 | print(paste("Decision Tree - Test Classification Report")) 98 | print(paste("Zero_Precision",tsprec_zero,"Zero_Recall",tsrecl_zero)) 99 | print(paste("One_Precision",tsprec_one,"One_Recall",tsrecl_one)) 100 | print(paste("Overall_Precision",round(tsprec_ovll,4),"Overall_Recall",round(tsrecl_ovll,4))) 101 | 102 | 103 | #Decision Trees using C5.0 package - Error Costs 104 | library(C50) 105 | 106 | class_zero_wgt = c(0.01,0.1,0.2,0.3,0.4,0.5) 107 | 108 | for (cwt in class_zero_wgt){ 109 | cwtz = cwt 110 | cwto = 1-cwtz 111 | cstvr = cwto/cwtz 112 | 113 | error_cost <- matrix(c(0, 1, cstvr, 0), nrow = 2) 114 | 115 | dtree_fit = C5.0(train_data[-31],train_data$Attrition_ind,costs = error_cost, 116 | control = C5.0Control(minCases = 1)) 117 | 118 | summary(dtree_fit) 119 | 120 | tr_y_pred = predict(dtree_fit, train_data,type = "class") 121 | ts_y_pred = predict(dtree_fit,test_data,type = "class") 122 | 123 | tr_y_act = train_data$Attrition_ind;ts_y_act = test_data$Attrition_ind 124 | tr_acc = accrcy(tr_y_act,tr_y_pred) 125 | ts_acc = accrcy(ts_y_act,ts_y_pred) 126 | 127 | print(paste("Class weights","{0:",cwtz,"1:",cwto,"}", 128 | "Decision Tree Train accuracy:",tr_acc, 129 | "Decision Tree Test accuracy:",ts_acc)) 130 | ts_tble = table(ts_y_act,ts_y_pred) 131 | print(paste("Test Confusion Matrix")) 132 | print(ts_tble) 133 | 134 | } 135 | 136 | 137 | # Bagging Classifier - using Random forest package but all variables selected 138 | library(randomForest) 139 | 140 | set.seed(43) 141 | rf_fit = randomForest(Attrition_ind~.,data = train_data,mtry=30,maxnodes= 64, 142 | classwt = c(0.3,0.7),ntree=5000,nodesize = 1) 143 | 144 | tr_y_pred = predict(rf_fit,data = train_data,type = "response") 145 | ts_y_pred = predict(rf_fit,newdata = test_data,type = "response") 146 | 147 | tr_y_act = train_data$Attrition_ind;ts_y_act = test_data$Attrition_ind 148 | 149 | tr_tble = table(tr_y_act,tr_y_pred) 150 | print(paste("Train Confusion Matrix")) 151 | print(tr_tble) 152 | 153 | tr_acc = accrcy(tr_y_act,tr_y_pred) 154 | trprec_zero = prec_zero(tr_y_act,tr_y_pred); trrecl_zero = recl_zero(tr_y_act,tr_y_pred) 155 | trprec_one = prec_one(tr_y_act,tr_y_pred); 
trrecl_one = recl_one(tr_y_act,tr_y_pred) 156 | 157 | trprec_ovll = trprec_zero *frac_trzero + trprec_one*frac_trone 158 | trrecl_ovll = trrecl_zero *frac_trzero + trrecl_one*frac_trone 159 | 160 | print(paste("Random Forest Train accuracy:",tr_acc)) 161 | print(paste("Random Forest - Train Classification Report")) 162 | print(paste("Zero_Precision",trprec_zero,"Zero_Recall",trrecl_zero)) 163 | print(paste("One_Precision",trprec_one,"One_Recall",trrecl_one)) 164 | print(paste("Overall_Precision",round(trprec_ovll,4),"Overall_Recall",round(trrecl_ovll,4))) 165 | 166 | ts_tble = table(ts_y_act,ts_y_pred) 167 | print(paste("Test Confusion Matrix")) 168 | print(ts_tble) 169 | 170 | ts_acc = accrcy(ts_y_act,ts_y_pred) 171 | tsprec_zero = prec_zero(ts_y_act,ts_y_pred); tsrecl_zero = recl_zero(ts_y_act,ts_y_pred) 172 | tsprec_one = prec_one(ts_y_act,ts_y_pred); tsrecl_one = recl_one(ts_y_act,ts_y_pred) 173 | 174 | tsprec_ovll = tsprec_zero *frac_tszero + tsprec_one*frac_tsone 175 | tsrecl_ovll = tsrecl_zero *frac_tszero + tsrecl_one*frac_tsone 176 | 177 | print(paste("Random Forest Test accuracy:",ts_acc)) 178 | print(paste("Random Forest - Test Classification Report")) 179 | print(paste("Zero_Precision",tsprec_zero,"Zero_Recall",tsrecl_zero)) 180 | print(paste("One_Precision",tsprec_one,"One_Recall",tsrecl_one)) 181 | print(paste("Overall_Precision",round(tsprec_ovll,4),"Overall_Recall",round(tsrecl_ovll,4))) 182 | 183 | 184 | # Random Forest 185 | library(randomForest) 186 | set.seed(43) 187 | rf_fit = randomForest(Attrition_ind~.,data = train_data,mtry=6,maxnodes= 64, 188 | classwt = c(0.3,0.7),ntree=5000,nodesize = 1) 189 | tr_y_pred = predict(rf_fit,data = train_data,type = "response") 190 | ts_y_pred = predict(rf_fit,newdata = test_data,type = "response") 191 | 192 | tr_y_act = train_data$Attrition_ind;ts_y_act = test_data$Attrition_ind 193 | 194 | tr_tble = table(tr_y_act,tr_y_pred) 195 | print(paste("Train Confusion Matrix")) 196 | print(tr_tble) 197 | 198 | tr_acc = accrcy(tr_y_act,tr_y_pred) 199 | trprec_zero = prec_zero(tr_y_act,tr_y_pred); trrecl_zero = recl_zero(tr_y_act,tr_y_pred) 200 | trprec_one = prec_one(tr_y_act,tr_y_pred); trrecl_one = recl_one(tr_y_act,tr_y_pred) 201 | 202 | trprec_ovll = trprec_zero *frac_trzero + trprec_one*frac_trone 203 | trrecl_ovll = trrecl_zero *frac_trzero + trrecl_one*frac_trone 204 | 205 | print(paste("Random Forest Train accuracy:",tr_acc)) 206 | print(paste("Random Forest - Train Classification Report")) 207 | print(paste("Zero_Precision",trprec_zero,"Zero_Recall",trrecl_zero)) 208 | print(paste("One_Precision",trprec_one,"One_Recall",trrecl_one)) 209 | print(paste("Overall_Precision",round(trprec_ovll,4),"Overall_Recall",round(trrecl_ovll,4))) 210 | 211 | ts_tble = table(ts_y_act,ts_y_pred) 212 | print(paste("Test Confusion Matrix")) 213 | print(ts_tble) 214 | 215 | ts_acc = accrcy(ts_y_act,ts_y_pred) 216 | tsprec_zero = prec_zero(ts_y_act,ts_y_pred); tsrecl_zero = recl_zero(ts_y_act,ts_y_pred) 217 | tsprec_one = prec_one(ts_y_act,ts_y_pred); tsrecl_one = recl_one(ts_y_act,ts_y_pred) 218 | 219 | tsprec_ovll = tsprec_zero *frac_tszero + tsprec_one*frac_tsone 220 | tsrecl_ovll = tsrecl_zero *frac_tszero + tsrecl_one*frac_tsone 221 | 222 | print(paste("Random Forest Test accuracy:",ts_acc)) 223 | print(paste("Random Forest - Test Classification Report")) 224 | print(paste("Zero_Precision",tsprec_zero,"Zero_Recall",tsrecl_zero)) 225 | print(paste("One_Precision",tsprec_one,"One_Recall",tsrecl_one)) 226 | 
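# Note: the Overall_* figures printed next are class-frequency-weighted averages of the
# per-class metrics, e.g. overall_precision = frac_tszero*precision_0 + frac_tsone*precision_1
# (the same idea as the "weighted avg" row of scikit-learn's classification_report).
# Illustration with made-up numbers: if precision_0 = 0.90, precision_1 = 0.60 and the test
# split is 84% zeros / 16% ones, the weighted overall precision is 0.84*0.90 + 0.16*0.60 = 0.852.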
print(paste("Overall_Precision",round(tsprec_ovll,4),"Overall_Recall",round(tsrecl_ovll,4))) 227 | 228 | 229 | 230 | # Grid Search - Random Forest 231 | library(e1071) 232 | library(randomForest) 233 | rf_grid = tune(randomForest,Attrition_ind~.,data = train_data,classwt = c(0.3,0.7),ranges = list( 234 | mtry = c(5,6), 235 | maxnodes = c(32,64), 236 | ntree = c(3000,5000), 237 | nodesize = c(1,2) 238 | ), 239 | tunecontrol = tune.control(cross = 5) 240 | ) 241 | 242 | print(paste("Best parameter from Grid Search")) 243 | print(summary(rf_grid)) 244 | 245 | best_model = rf_grid$best.model 246 | 247 | tr_y_pred = predict(best_model,data = train_data,type = "response") 248 | ts_y_pred = predict(best_model,newdata = test_data,type = "response") 249 | 250 | tr_y_act = train_data$Attrition_ind;ts_y_act = test_data$Attrition_ind 251 | 252 | tr_tble = table(tr_y_act,tr_y_pred) 253 | print(paste("Random Forest Grid search Train Confusion Matrix")) 254 | print(tr_tble) 255 | 256 | tr_acc = accrcy(tr_y_act,tr_y_pred) 257 | trprec_zero = prec_zero(tr_y_act,tr_y_pred); trrecl_zero = recl_zero(tr_y_act,tr_y_pred) 258 | trprec_one = prec_one(tr_y_act,tr_y_pred); trrecl_one = recl_one(tr_y_act,tr_y_pred) 259 | 260 | trprec_ovll = trprec_zero *frac_trzero + trprec_one*frac_trone 261 | trrecl_ovll = trrecl_zero *frac_trzero + trrecl_one*frac_trone 262 | 263 | print(paste("Random Forest Grid Search Train accuracy:",tr_acc)) 264 | print(paste("Random Forest Grid Search - Train Classification Report")) 265 | print(paste("Zero_Precision",trprec_zero,"Zero_Recall",trrecl_zero)) 266 | print(paste("One_Precision",trprec_one,"One_Recall",trrecl_one)) 267 | print(paste("Overall_Precision",round(trprec_ovll,4),"Overall_Recall",round(trrecl_ovll,4))) 268 | 269 | ts_tble = table(ts_y_act,ts_y_pred) 270 | print(paste("Random Forest Grid search Test Confusion Matrix")) 271 | print(ts_tble) 272 | 273 | ts_acc = accrcy(ts_y_act,ts_y_pred) 274 | tsprec_zero = prec_zero(ts_y_act,ts_y_pred); tsrecl_zero = recl_zero(ts_y_act,ts_y_pred) 275 | tsprec_one = prec_one(ts_y_act,ts_y_pred); tsrecl_one = recl_one(ts_y_act,ts_y_pred) 276 | 277 | tsprec_ovll = tsprec_zero *frac_tszero + tsprec_one*frac_tsone 278 | tsrecl_ovll = tsrecl_zero *frac_tszero + tsrecl_one*frac_tsone 279 | 280 | print(paste("Random Forest Grid Search Test accuracy:",ts_acc)) 281 | print(paste("Random Forest Grid Search - Test Classification Report")) 282 | print(paste("Zero_Precision",tsprec_zero,"Zero_Recall",tsrecl_zero)) 283 | print(paste("One_Precision",tsprec_one,"One_Recall",tsrecl_one)) 284 | print(paste("Overall_Precision",round(tsprec_ovll,4),"Overall_Recall",round(tsrecl_ovll,4))) 285 | 286 | 287 | # Adaboost classifier using C5.0 with trails included for boosting 288 | library(C50) 289 | 290 | class_zero_wgt = 0.3 291 | class_one_wgt = 1-class_zero_wgt 292 | cstvr = class_one_wgt/class_zero_wgt 293 | 294 | error_cost <- matrix(c(0, 1, cstvr, 0), nrow = 2) 295 | 296 | # Fitting Adaboost model 297 | ada_fit = C5.0(train_data[-31],train_data$Attrition_ind,costs = error_cost, 298 | trails = 5000,control = C5.0Control(minCases = 1)) 299 | 300 | summary(ada_fit) 301 | 302 | tr_y_pred = predict(ada_fit, train_data,type = "class") 303 | ts_y_pred = predict(ada_fit,test_data,type = "class") 304 | 305 | tr_y_act = train_data$Attrition_ind;ts_y_act = test_data$Attrition_ind 306 | 307 | tr_tble = table(tr_y_act,tr_y_pred) 308 | print(paste("AdaBoost - Train Confusion Matrix")) 309 | print(tr_tble) 310 | 311 | tr_acc = accrcy(tr_y_act,tr_y_pred) 312 | trprec_zero 
= prec_zero(tr_y_act,tr_y_pred); trrecl_zero = recl_zero(tr_y_act,tr_y_pred) 313 | trprec_one = prec_one(tr_y_act,tr_y_pred); trrecl_one = recl_one(tr_y_act,tr_y_pred) 314 | 315 | trprec_ovll = trprec_zero *frac_trzero + trprec_one*frac_trone 316 | trrecl_ovll = trrecl_zero *frac_trzero + trrecl_one*frac_trone 317 | 318 | print(paste("AdaBoost Train accuracy:",tr_acc)) 319 | print(paste("AdaBoost - Train Classification Report")) 320 | print(paste("Zero_Precision",trprec_zero,"Zero_Recall",trrecl_zero)) 321 | print(paste("One_Precision",trprec_one,"One_Recall",trrecl_one)) 322 | print(paste("Overall_Precision",round(trprec_ovll,4),"Overall_Recall",round(trrecl_ovll,4))) 323 | 324 | ts_tble = table(ts_y_act,ts_y_pred) 325 | print(paste("AdaBoost - Test Confusion Matrix")) 326 | print(ts_tble) 327 | 328 | ts_acc = accrcy(ts_y_act,ts_y_pred) 329 | tsprec_zero = prec_zero(ts_y_act,ts_y_pred); tsrecl_zero = recl_zero(ts_y_act,ts_y_pred) 330 | tsprec_one = prec_one(ts_y_act,ts_y_pred); tsrecl_one = recl_one(ts_y_act,ts_y_pred) 331 | 332 | tsprec_ovll = tsprec_zero *frac_tszero + tsprec_one*frac_tsone 333 | tsrecl_ovll = tsrecl_zero *frac_tszero + tsrecl_one*frac_tsone 334 | 335 | print(paste("AdaBoost Test accuracy:",ts_acc)) 336 | print(paste("AdaBoost - Test Classification Report")) 337 | print(paste("Zero_Precision",tsprec_zero,"Zero_Recall",tsrecl_zero)) 338 | print(paste("One_Precision",tsprec_one,"One_Recall",tsrecl_one)) 339 | print(paste("Overall_Precision",round(tsprec_ovll,4),"Overall_Recall",round(tsrecl_ovll,4))) 340 | 341 | 342 | # Gradient boosting 343 | library(gbm) 344 | library(caret) 345 | 346 | set.seed(43) 347 | 348 | # Giving weights to all the observations in a way that total weights will be euqal 1 349 | model_weights <- ifelse(train_data$Attrition_ind == "0", 350 | (1/table(train_data$Attrition_ind)[1]) * 0.3, 351 | (1/table(train_data$Attrition_ind)[2]) * 0.7) 352 | 353 | # Setting parameters for GBM 354 | grid <- expand.grid(n.trees = 5000, interaction.depth = 1, shrinkage = .04, n.minobsinnode = 1) 355 | 356 | # Fitting the GBM model 357 | gbm_fit <- train(Attrition_ind ~ ., data = train_data, method = "gbm", weights = model_weights, 358 | tuneGrid=grid,verbose = FALSE) 359 | # To print variable importance plot 360 | summary(gbm_fit) 361 | 362 | tr_y_pred = predict(gbm_fit, train_data,type = "raw") 363 | ts_y_pred = predict(gbm_fit,test_data,type = "raw") 364 | 365 | tr_y_act = train_data$Attrition_ind;ts_y_act = test_data$Attrition_ind 366 | 367 | tr_tble = table(tr_y_act,tr_y_pred) 368 | print(paste("Gradient Boosting - Train Confusion Matrix")) 369 | print(tr_tble) 370 | 371 | tr_acc = accrcy(tr_y_act,tr_y_pred) 372 | trprec_zero = prec_zero(tr_y_act,tr_y_pred); trrecl_zero = recl_zero(tr_y_act,tr_y_pred) 373 | trprec_one = prec_one(tr_y_act,tr_y_pred); trrecl_one = recl_one(tr_y_act,tr_y_pred) 374 | 375 | trprec_ovll = trprec_zero *frac_trzero + trprec_one*frac_trone 376 | trrecl_ovll = trrecl_zero *frac_trzero + trrecl_one*frac_trone 377 | 378 | print(paste("Gradient Boosting Train accuracy:",tr_acc)) 379 | print(paste("Gradient Boosting - Train Classification Report")) 380 | print(paste("Zero_Precision",trprec_zero,"Zero_Recall",trrecl_zero)) 381 | print(paste("One_Precision",trprec_one,"One_Recall",trrecl_one)) 382 | print(paste("Overall_Precision",round(trprec_ovll,4),"Overall_Recall",round(trrecl_ovll,4))) 383 | 384 | ts_tble = table(ts_y_act,ts_y_pred) 385 | print(paste("Gradient Boosting - Test Confusion Matrix")) 386 | print(ts_tble) 387 | 388 | ts_acc = 
accrcy(ts_y_act,ts_y_pred) 389 | tsprec_zero = prec_zero(ts_y_act,ts_y_pred); tsrecl_zero = recl_zero(ts_y_act,ts_y_pred) 390 | tsprec_one = prec_one(ts_y_act,ts_y_pred); tsrecl_one = recl_one(ts_y_act,ts_y_pred) 391 | 392 | tsprec_ovll = tsprec_zero *frac_tszero + tsprec_one*frac_tsone 393 | tsrecl_ovll = tsrecl_zero *frac_tszero + tsrecl_one*frac_tsone 394 | 395 | print(paste("Gradient Boosting Test accuracy:",ts_acc)) 396 | print(paste("Gradient Boosting - Test Classification Report")) 397 | print(paste("Zero_Precision",tsprec_zero,"Zero_Recall",tsrecl_zero)) 398 | print(paste("One_Precision",tsprec_one,"One_Recall",tsrecl_one)) 399 | print(paste("Overall_Precision",round(tsprec_ovll,4),"Overall_Recall",round(tsrecl_ovll,4))) 400 | 401 | 402 | # Use the following code for performing cross validation on data - At the moment commented though 403 | #fitControl <- trainControl(method = "repeatedcv", number = 4, repeats = 4) 404 | 405 | # gbmFit1 <- train(Attrition_ind ~ ., data = train_data, method = "gbm", 406 | # trControl = fitControl,tuneGrid=grid,verbose = FALSE) 407 | 408 | 409 | 410 | # Xgboost Classifier 411 | library(xgboost); library(caret) 412 | 413 | hrattr_data = read.csv("WA_Fn-UseC_-HR-Employee-Attrition.csv") 414 | str(hrattr_data); summary(hrattr_data) 415 | 416 | # Target variable creation 417 | hrattr_data$Attrition_ind = 0; 418 | hrattr_data$Attrition_ind[hrattr_data$Attrition=="Yes"]=1 419 | 420 | # Columns to be removed due to no change in its value across observations 421 | remove_cols = c("EmployeeCount","EmployeeNumber","Over18","StandardHours","Attrition") 422 | hrattr_data_new = hrattr_data[,!(names(hrattr_data) %in% remove_cols)] 423 | 424 | 425 | # List of variables with continuous values 426 | continuous_columns = c('Age','DailyRate','DistanceFromHome','Education','EnvironmentSatisfaction', 427 | 'HourlyRate', 'JobInvolvement', 'JobLevel','JobSatisfaction','MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 428 | 'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction','StockOptionLevel', 'TotalWorkingYears', 429 | 'TrainingTimesLastYear','WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 430 | 'YearsWithCurrManager') 431 | 432 | 433 | # list of categorical variables 434 | ohe_feats = c('BusinessTravel', 'Department', 'EducationField','Gender', 435 | 'JobRole','MaritalStatus','OverTime') 436 | 437 | # one-hot-encoding categorical features 438 | dummies <- dummyVars(~ BusinessTravel+Department+EducationField+Gender+JobRole+MaritalStatus+OverTime, data = hrattr_data_new) 439 | df_all_ohe <- as.data.frame(predict(dummies, newdata = hrattr_data_new)) 440 | 441 | # Cleaning column names & replace . 
with _ 442 | colClean <- function(x){ colnames(x) <- gsub("\\.", "_", colnames(x)); x } 443 | df_all_ohe = colClean(df_all_ohe) 444 | 445 | hrattr_data_new$Attrition_ind = as.integer(hrattr_data_new$Attrition_ind) 446 | 447 | # Combining both continuous & dummy variables from categories 448 | hrattr_data_v3 = cbind(df_all_ohe,hrattr_data_new[,(names(hrattr_data_new) %in% continuous_columns)], 449 | hrattr_data_new$Attrition_ind) 450 | 451 | names(hrattr_data_v3)[52] = "Attrition_ind" 452 | 453 | # Train & Test split based on 70% & 30% 454 | set.seed(123) 455 | numrow = nrow(hrattr_data_v3) 456 | trnind = sample(1:numrow,size = as.integer(0.7*numrow)) 457 | train_data = hrattr_data_v3[trnind,] 458 | test_data = hrattr_data_v3[-trnind,] 459 | 460 | # Custom functions for calculation of Precision & Recall 461 | frac_trzero = (table(train_data$Attrition_ind)[[1]])/nrow(train_data) 462 | frac_trone = (table(train_data$Attrition_ind)[[2]])/nrow(train_data) 463 | 464 | frac_tszero = (table(test_data$Attrition_ind)[[1]])/nrow(test_data) 465 | frac_tsone = (table(test_data$Attrition_ind)[[2]])/nrow(test_data) 466 | 467 | prec_zero <- function(act,pred){ tble = table(act,pred) 468 | return( round( tble[1,1]/(tble[1,1]+tble[2,1]),4) ) } 469 | 470 | prec_one <- function(act,pred){ tble = table(act,pred) 471 | return( round( tble[2,2]/(tble[2,2]+tble[1,2]),4) ) } 472 | 473 | recl_zero <- function(act,pred){tble = table(act,pred) 474 | return( round( tble[1,1]/(tble[1,1]+tble[1,2]),4) ) } 475 | 476 | recl_one <- function(act,pred){ tble = table(act,pred) 477 | return( round( tble[2,2]/(tble[2,2]+tble[2,1]),4) ) } 478 | 479 | accrcy <- function(act,pred){ tble = table(act,pred) 480 | return( round((tble[1,1]+tble[2,2])/sum(tble),4)) } 481 | 482 | y = train_data$Attrition_ind 483 | 484 | # XGBoost Classifier Training 485 | xgb <- xgboost(data = data.matrix(train_data[,-52]),label = y,eta = 0.04, 486 | max_depth = 2, nround=5000, subsample = 0.5, 487 | colsample_bytree = 0.5, seed = 1, eval_metric = "logloss", 488 | objective = "binary:logistic",nthread = 3 489 | ) 490 | 491 | # XGBoost value prediction on train & test data 492 | tr_y_pred_prob <- predict(xgb, data.matrix(train_data[,-52])) 493 | tr_y_pred <- as.numeric(tr_y_pred_prob > 0.5) 494 | 495 | ts_y_pred_prob <- predict(xgb, data.matrix(test_data[,-52])) 496 | ts_y_pred <- as.numeric(ts_y_pred_prob > 0.5) 497 | tr_y_act = train_data$Attrition_ind;ts_y_act = test_data$Attrition_ind 498 | tr_tble = table(tr_y_act,tr_y_pred) 499 | 500 | # XGBoost Metric predictions on Train Data 501 | print(paste("Xgboost - Train Confusion Matrix")) 502 | print(tr_tble) 503 | 504 | tr_acc = accrcy(tr_y_act,tr_y_pred) 505 | trprec_zero = prec_zero(tr_y_act,tr_y_pred); trrecl_zero = recl_zero(tr_y_act,tr_y_pred) 506 | trprec_one = prec_one(tr_y_act,tr_y_pred); trrecl_one = recl_one(tr_y_act,tr_y_pred) 507 | trprec_ovll = trprec_zero *frac_trzero + trprec_one*frac_trone 508 | trrecl_ovll = trrecl_zero *frac_trzero + trrecl_one*frac_trone 509 | 510 | print(paste("Xgboost Train accuracy:",tr_acc)) 511 | print(paste("Xgboost - Train Classification Report")) 512 | print(paste("Zero_Precision",trprec_zero,"Zero_Recall",trrecl_zero)) 513 | print(paste("One_Precision",trprec_one,"One_Recall",trrecl_one)) 514 | print(paste("Overall_Precision",round(trprec_ovll,4),"Overall_Recall",round(trrecl_ovll,4))) 515 | 516 | 517 | # XGBoost Metric predictions on Test Data 518 | ts_tble = table(ts_y_act,ts_y_pred) 519 | print(paste("Xgboost - Test Confusion Matrix")) 520 | print(ts_tble) 
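# Optional aside: a minimal sketch of pulling variable importance from the xgboost model fitted
# above. It assumes the 'xgb' model object and 'train_data' from the preceding code are still in
# the workspace; it only prints and plots, so it does not affect the metrics computed below.
imp_mat <- xgb.importance(feature_names = colnames(train_data[,-52]), model = xgb)
print(head(imp_mat, 10))               # top features ranked by gain
xgb.plot.importance(head(imp_mat, 10)) # quick bar plot of the same ranking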
521 | 522 | ts_acc = accrcy(ts_y_act,ts_y_pred) 523 | tsprec_zero = prec_zero(ts_y_act,ts_y_pred); tsrecl_zero = recl_zero(ts_y_act,ts_y_pred) 524 | tsprec_one = prec_one(ts_y_act,ts_y_pred); tsrecl_one = recl_one(ts_y_act,ts_y_pred) 525 | tsprec_ovll = tsprec_zero *frac_tszero + tsprec_one*frac_tsone 526 | tsrecl_ovll = tsrecl_zero *frac_tszero + tsrecl_one*frac_tsone 527 | 528 | print(paste("Xgboost Test accuracy:",ts_acc)) 529 | print(paste("Xgboost - Test Classification Report")) 530 | print(paste("Zero_Precision",tsprec_zero,"Zero_Recall",tsrecl_zero)) 531 | print(paste("One_Precision",tsprec_one,"One_Recall",tsrecl_one)) 532 | print(paste("Overall_Precision",round(tsprec_ovll,4),"Overall_Recall",round(tsrecl_ovll,4))) 533 | 534 | 535 | 536 | # Ensemble of Ensembles 537 | rm(list = ls()) 538 | 539 | setwd("D:\\Book writing\\Codes\\Chapter 4") 540 | 541 | hrattr_data = read.csv("WA_Fn-UseC_-HR-Employee-Attrition.csv") 542 | str(hrattr_data) 543 | summary(hrattr_data) 544 | 545 | hrattr_data$Attrition_ind = 0;hrattr_data$Attrition_ind[hrattr_data$Attrition=="Yes"]=1 546 | hrattr_data$Attrition_ind = as.factor(hrattr_data$Attrition_ind) 547 | 548 | remove_cols = c("EmployeeCount","EmployeeNumber","Over18","StandardHours","Attrition") 549 | hrattr_data_new = hrattr_data[,!(names(hrattr_data) %in% remove_cols)] 550 | 551 | set.seed(123) 552 | numrow = nrow(hrattr_data_new) 553 | trnind = sample(1:numrow,size = as.integer(0.7*numrow)) 554 | train_data = hrattr_data_new[trnind,] 555 | test_data = hrattr_data_new[-trnind,] 556 | 557 | frac_trzero = (table(train_data$Attrition_ind)[[1]])/nrow(train_data) 558 | frac_trone = (table(train_data$Attrition_ind)[[2]])/nrow(train_data) 559 | 560 | frac_tszero = (table(test_data$Attrition_ind)[[1]])/nrow(test_data) 561 | frac_tsone = (table(test_data$Attrition_ind)[[2]])/nrow(test_data) 562 | 563 | prec_zero <- function(act,pred){ tble = table(act,pred) 564 | return( round( tble[1,1]/(tble[1,1]+tble[2,1]),4) ) } 565 | 566 | prec_one <- function(act,pred){ tble = table(act,pred) 567 | return( round( tble[2,2]/(tble[2,2]+tble[1,2]),4) ) } 568 | 569 | recl_zero <- function(act,pred){tble = table(act,pred) 570 | return( round( tble[1,1]/(tble[1,1]+tble[1,2]),4) ) } 571 | 572 | recl_one <- function(act,pred){ tble = table(act,pred) 573 | return( round( tble[2,2]/(tble[2,2]+tble[2,1]),4) ) } 574 | 575 | accrcy <- function(act,pred){ tble = table(act,pred) 576 | return( round((tble[1,1]+tble[2,2])/sum(tble),4)) } 577 | 578 | 579 | # Ensemble of Ensembles with different type of Classifiers 580 | train_data$Attrition_ind = as.factor(train_data$Attrition_ind) 581 | 582 | # Classifier 1 - Logistic Regression 583 | glm_fit = glm(Attrition_ind ~.,family = "binomial",data = train_data) 584 | glm_probs = predict(glm_fit,newdata = train_data,type = "response") 585 | 586 | # Classifier 2 - Decision Tree classifier 587 | library(C50) 588 | dtree_fit = C5.0(train_data[-31],train_data$Attrition_ind, 589 | control = C5.0Control(minCases = 1)) 590 | dtree_probs = predict(dtree_fit,newdata = train_data,type = "prob")[,2] 591 | 592 | # Classifier 3 - Random Forest 593 | library(randomForest) 594 | rf_fit = randomForest(Attrition_ind~.,data = train_data,mtry=6,maxnodes= 64,ntree=5000,nodesize = 1) 595 | rf_probs = predict(rf_fit,newdata = train_data,type = "prob")[,2] 596 | 597 | 598 | # Classifier 4 - Adaboost 599 | ada_fit = C5.0(train_data[-31],train_data$Attrition_ind,trails = 5000,control = C5.0Control(minCases = 1)) 600 | ada_probs = predict(ada_fit,newdata = 
train_data,type = "prob")[,2] 601 | 602 | # Ensemble of Models 603 | ensemble = data.frame(glm_probs,dtree_probs,rf_probs,ada_probs) 604 | ensemble = cbind(ensemble,train_data$Attrition_ind) 605 | names(ensemble)[5] = "Attrition_ind" 606 | rownames(ensemble) <- 1:nrow(ensemble) 607 | 608 | # Meta-classifier on top of individual classifiers 609 | meta_clf = glm(Attrition_ind~.,data = ensemble,family = "binomial") 610 | meta_probs = predict(meta_clf, ensemble,type = "response") 611 | 612 | ensemble$pred_class = 0 613 | ensemble$pred_class[meta_probs>0.5]=1 614 | 615 | # Train confusion & accuracy metrics 616 | tr_y_pred = ensemble$pred_class 617 | tr_y_act = train_data$Attrition_ind;ts_y_act = test_data$Attrition_ind 618 | tr_tble = table(tr_y_act,tr_y_pred) 619 | print(paste("Ensemble - Train Confusion Matrix")) 620 | print(tr_tble) 621 | 622 | tr_acc = accrcy(tr_y_act,tr_y_pred) 623 | print(paste("Ensemble Train accuracy:",tr_acc)) 624 | 625 | # Now verifing on test data 626 | glm_probs = predict(glm_fit,newdata = test_data,type = "response") 627 | dtree_probs = predict(dtree_fit,newdata = test_data,type = "prob")[,2] 628 | rf_probs = predict(rf_fit,newdata = test_data,type = "prob")[,2] 629 | ada_probs = predict(ada_fit,newdata = test_data,type = "prob")[,2] 630 | 631 | ensemble_test = data.frame(glm_probs,dtree_probs,rf_probs,ada_probs) 632 | ensemble_test = cbind(ensemble_test,test_data$Attrition_ind) 633 | names(ensemble_test)[5] = "Attrition_ind" 634 | 635 | rownames(ensemble_test) <- 1:nrow(ensemble_test) 636 | meta_test_probs = predict(meta_clf,newdata = ensemble_test,type = "response") 637 | ensemble_test$pred_class = 0 638 | ensemble_test$pred_class[meta_test_probs>0.5]=1 639 | 640 | # Test confusion & accuracy metrics 641 | ts_y_pred = ensemble_test$pred_class 642 | ts_tble = table(ts_y_act,ts_y_pred) 643 | print(paste("Ensemble - Test Confusion Matrix")) 644 | print(ts_tble) 645 | 646 | ts_acc = accrcy(ts_y_act,ts_y_pred) 647 | print(paste("Ensemble Test accuracy:",ts_acc)) 648 | 649 | 650 | 651 | 652 | 653 | 654 | 655 | 656 | 657 | 658 | 659 | 660 | 661 | 662 | -------------------------------------------------------------------------------- /Chapter04/Chapter 04_Tree based ML Models.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import os 4 | """ First change the following directory link to where all input files do exist """ 5 | os.chdir("D:\\Book writing\\Codes\\Chapter 4") 6 | 7 | import numpy as np 8 | import pandas as pd 9 | from sklearn.model_selection import train_test_split 10 | from sklearn.metrics import accuracy_score,classification_report 11 | 12 | import matplotlib.pyplot as plt 13 | 14 | 15 | hrattr_data = pd.read_csv("WA_Fn-UseC_-HR-Employee-Attrition.csv") 16 | 17 | print (hrattr_data.head()) 18 | 19 | hrattr_data['Attrition_ind'] = 0 20 | hrattr_data.loc[hrattr_data['Attrition']=='Yes','Attrition_ind'] = 1 21 | 22 | dummy_busnstrvl = pd.get_dummies(hrattr_data['BusinessTravel'], prefix='busns_trvl') 23 | dummy_dept = pd.get_dummies(hrattr_data['Department'], prefix='dept') 24 | dummy_edufield = pd.get_dummies(hrattr_data['EducationField'], prefix='edufield') 25 | dummy_gender = pd.get_dummies(hrattr_data['Gender'], prefix='gend') 26 | dummy_jobrole = pd.get_dummies(hrattr_data['JobRole'], prefix='jobrole') 27 | dummy_maritstat = pd.get_dummies(hrattr_data['MaritalStatus'], prefix='maritalstat') 28 | dummy_overtime = pd.get_dummies(hrattr_data['OverTime'], prefix='overtime') 29 | 30 | continuous_columns = 
['Age','DailyRate','DistanceFromHome','Education','EnvironmentSatisfaction', 31 | 'HourlyRate', 'JobInvolvement', 'JobLevel','JobSatisfaction','MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 32 | 'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction','StockOptionLevel', 'TotalWorkingYears', 33 | 'TrainingTimesLastYear','WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 34 | 'YearsWithCurrManager'] 35 | 36 | hrattr_continuous = hrattr_data[continuous_columns] 37 | 38 | hrattr_continuous['Age'].describe() 39 | hrattr_data['BusinessTravel'].value_counts() 40 | 41 | hrattr_data_new = pd.concat([dummy_busnstrvl,dummy_dept,dummy_edufield,dummy_gender,dummy_jobrole, 42 | dummy_maritstat,dummy_overtime,hrattr_continuous,hrattr_data['Attrition_ind']],axis=1) 43 | 44 | # Train & Test split 45 | x_train,x_test,y_train,y_test = train_test_split(hrattr_data_new.drop(['Attrition_ind'],axis=1), 46 | hrattr_data_new['Attrition_ind'],train_size = 0.7,random_state=42) 47 | 48 | # Decision Tree Classifier 49 | from sklearn.tree import DecisionTreeClassifier 50 | dt_fit = DecisionTreeClassifier(criterion="gini",max_depth=5,min_samples_split=2,min_samples_leaf=1,random_state=42) 51 | dt_fit.fit(x_train,y_train) 52 | 53 | print ("\nDecision Tree - Train Confusion Matrix\n\n",pd.crosstab(y_train,dt_fit.predict(x_train),rownames = ["Actuall"],colnames = ["Predicted"])) 54 | print ("\nDecision Tree - Train accuracy:",round(accuracy_score(y_train,dt_fit.predict(x_train)),3)) 55 | print ("\nDecision Tree - Train Classification Report\n",classification_report(y_train,dt_fit.predict(x_train))) 56 | 57 | print ("\n\nDecision Tree - Test Confusion Matrix\n\n",pd.crosstab(y_test,dt_fit.predict(x_test),rownames = ["Actuall"],colnames = ["Predicted"])) 58 | print ("\nDecision Tree - Test accuracy:",round(accuracy_score(y_test,dt_fit.predict(x_test)),3)) 59 | print ("\nDecision Tree - Test Classification Report\n",classification_report(y_test,dt_fit.predict(x_test))) 60 | 61 | 62 | # Tuning class weights to analyze accuracy, precision & recall 63 | dummyarray = np.empty((6,10)) 64 | dt_wttune = pd.DataFrame(dummyarray) 65 | 66 | dt_wttune.columns = ["zero_wght","one_wght","tr_accuracy","tst_accuracy","prec_zero","prec_one", 67 | "prec_ovll","recl_zero","recl_one","recl_ovll"] 68 | 69 | zero_clwghts = [0.01,0.1,0.2,0.3,0.4,0.5] 70 | 71 | for i in range(len(zero_clwghts)): 72 | clwght = {0:zero_clwghts[i],1:1.0-zero_clwghts[i]} 73 | dt_fit = DecisionTreeClassifier(criterion="gini",max_depth=5,min_samples_split=2, 74 | min_samples_leaf=1,random_state=42,class_weight = clwght) 75 | dt_fit.fit(x_train,y_train) 76 | dt_wttune.loc[i, 'zero_wght'] = clwght[0] 77 | dt_wttune.loc[i, 'one_wght'] = clwght[1] 78 | dt_wttune.loc[i, 'tr_accuracy'] = round(accuracy_score(y_train,dt_fit.predict(x_train)),3) 79 | dt_wttune.loc[i, 'tst_accuracy'] = round(accuracy_score(y_test,dt_fit.predict(x_test)),3) 80 | 81 | clf_sp = classification_report(y_test,dt_fit.predict(x_test)).split() 82 | dt_wttune.loc[i, 'prec_zero'] = float(clf_sp[5]) 83 | dt_wttune.loc[i, 'prec_one'] = float(clf_sp[10]) 84 | dt_wttune.loc[i, 'prec_ovll'] = float(clf_sp[17]) 85 | 86 | dt_wttune.loc[i, 'recl_zero'] = float(clf_sp[6]) 87 | dt_wttune.loc[i, 'recl_one'] = float(clf_sp[11]) 88 | dt_wttune.loc[i, 'recl_ovll'] = float(clf_sp[18]) 89 | print ("\nClass Weights",clwght,"Train accuracy:",round(accuracy_score(y_train,dt_fit.predict(x_train)),3),"Test accuracy:",round(accuracy_score(y_test,dt_fit.predict(x_test)),3)) 
90 | print ("Test Confusion Matrix\n\n",pd.crosstab(y_test,dt_fit.predict(x_test),rownames = ["Actuall"],colnames = ["Predicted"])) 91 | 92 | 93 | # Bagging Classifier 94 | from sklearn.tree import DecisionTreeClassifier 95 | from sklearn.ensemble import BaggingClassifier 96 | 97 | dt_fit = DecisionTreeClassifier(criterion="gini",max_depth=5,min_samples_split=2,min_samples_leaf=1,random_state=42, 98 | class_weight = {0:0.3,1:0.7}) 99 | 100 | bag_fit = BaggingClassifier(base_estimator= dt_fit,n_estimators=5000,max_samples=0.67,max_features=1.0, 101 | bootstrap=True,bootstrap_features=True,n_jobs=-1,random_state=42) 102 | 103 | bag_fit.fit(x_train, y_train) 104 | 105 | print ("\nBagging - Train Confusion Matrix\n\n",pd.crosstab(y_train,bag_fit.predict(x_train),rownames = ["Actuall"],colnames = ["Predicted"])) 106 | print ("\nBagging- Train accuracy",round(accuracy_score(y_train,bag_fit.predict(x_train)),3)) 107 | print ("\nBagging - Train Classification Report\n",classification_report(y_train,bag_fit.predict(x_train))) 108 | 109 | print ("\n\nBagging - Test Confusion Matrix\n\n",pd.crosstab(y_test,bag_fit.predict(x_test),rownames = ["Actuall"],colnames = ["Predicted"])) 110 | print ("\nBagging - Test accuracy",round(accuracy_score(y_test,bag_fit.predict(x_test)),3)) 111 | print ("\nBagging - Test Classification Report\n",classification_report(y_test,bag_fit.predict(x_test))) 112 | 113 | 114 | 115 | # Random Forest Classifier 116 | from sklearn.ensemble import RandomForestClassifier 117 | 118 | rf_fit = RandomForestClassifier(n_estimators=5000,criterion="gini",max_depth=5,min_samples_split=2,bootstrap=True, 119 | max_features='auto',random_state=42,min_samples_leaf=1,class_weight = {0:0.3,1:0.7}) 120 | rf_fit.fit(x_train,y_train) 121 | 122 | print ("\nRandom Forest - Train Confusion Matrix\n\n",pd.crosstab(y_train,rf_fit.predict(x_train),rownames = ["Actuall"],colnames = ["Predicted"])) 123 | print ("\nRandom Forest - Train accuracy",round(accuracy_score(y_train,rf_fit.predict(x_train)),3)) 124 | print ("\nRandom Forest - Train Classification Report\n",classification_report(y_train,rf_fit.predict(x_train))) 125 | 126 | print ("\n\nRandom Forest - Test Confusion Matrix\n\n",pd.crosstab(y_test,rf_fit.predict(x_test),rownames = ["Actuall"],colnames = ["Predicted"])) 127 | print ("\nRandom Forest - Test accuracy",round(accuracy_score(y_test,rf_fit.predict(x_test)),3)) 128 | print ("\nRandom Forest - Test Classification Report\n",classification_report(y_test,rf_fit.predict(x_test))) 129 | 130 | 131 | # Plot of Variable importance by mean decrease in gini 132 | model_ranks = pd.Series(rf_fit.feature_importances_,index=x_train.columns, name='Importance').sort_values(ascending=False, inplace=False) 133 | model_ranks.index.name = 'Variables' 134 | top_features = model_ranks.iloc[:31].sort_values(ascending=True,inplace=False) 135 | plt.figure(figsize=(20,10)) 136 | ax = top_features.plot(kind='barh') 137 | _ = ax.set_title("Variable Importance Plot") 138 | _ = ax.set_xlabel('Mean decrease in Variance') 139 | _ = ax.set_yticklabels(top_features.index, fontsize=13) 140 | 141 | 142 | 143 | 144 | # Random Forest Classifier - Grid Search 145 | from sklearn.pipeline import Pipeline 146 | from sklearn.model_selection import train_test_split,GridSearchCV 147 | 148 | pipeline = Pipeline([ 149 | ('clf',RandomForestClassifier(criterion='gini',class_weight = {0:0.3,1:0.7}))]) 150 | 151 | parameters = { 152 | 'clf__n_estimators':(2000,3000,5000), 153 | 'clf__max_depth':(5,15,30), 154 | 
'clf__min_samples_split':(2,3), 155 | 'clf__min_samples_leaf':(1,2) } 156 | 157 | grid_search = GridSearchCV(pipeline,parameters,n_jobs=-1,cv=5,verbose=1,scoring='accuracy') 158 | grid_search.fit(x_train,y_train) 159 | 160 | print ('Best Training score: %0.3f' % grid_search.best_score_) 161 | print ('Best parameters set:') 162 | best_parameters = grid_search.best_estimator_.get_params() 163 | for param_name in sorted(parameters.keys()): 164 | print ('\t%s: %r' % (param_name, best_parameters[param_name])) 165 | 166 | predictions = grid_search.predict(x_test) 167 | 168 | print ("Testing accuracy:",round(accuracy_score(y_test, predictions),4)) 169 | print ("\nComplete report of Testing data\n",classification_report(y_test, predictions)) 170 | print ("\n\nRandom Forest Grid Search- Test Confusion Matrix\n\n",pd.crosstab(y_test, predictions,rownames = ["Actuall"],colnames = ["Predicted"])) 171 | 172 | 173 | # Adaboost Classifier 174 | from sklearn.tree import DecisionTreeClassifier 175 | from sklearn.ensemble import AdaBoostClassifier 176 | dtree = DecisionTreeClassifier(criterion='gini',max_depth=1) 177 | 178 | adabst_fit = AdaBoostClassifier(base_estimator= dtree, 179 | n_estimators=5000,learning_rate=0.05,random_state=42) 180 | 181 | adabst_fit.fit(x_train, y_train) 182 | 183 | print ("\nAdaBoost - Train Confusion Matrix\n\n",pd.crosstab(y_train,adabst_fit.predict(x_train),rownames = ["Actuall"],colnames = ["Predicted"])) 184 | print ("\nAdaBoost - Train accuracy",round(accuracy_score(y_train,adabst_fit.predict(x_train)),3)) 185 | print ("\nAdaBoost - Train Classification Report\n",classification_report(y_train,adabst_fit.predict(x_train))) 186 | 187 | print ("\n\nAdaBoost - Test Confusion Matrix\n\n",pd.crosstab(y_test,adabst_fit.predict(x_test),rownames = ["Actuall"],colnames = ["Predicted"])) 188 | print ("\nAdaBoost - Test accuracy",round(accuracy_score(y_test,adabst_fit.predict(x_test)),3)) 189 | print ("\nAdaBoost - Test Classification Report\n",classification_report(y_test,adabst_fit.predict(x_test))) 190 | 191 | # Gradientboost Classifier 192 | from sklearn.ensemble import GradientBoostingClassifier 193 | 194 | gbc_fit = GradientBoostingClassifier(loss='deviance',learning_rate=0.05,n_estimators=5000, 195 | min_samples_split=2,min_samples_leaf=1,max_depth=1,random_state=42 ) 196 | gbc_fit.fit(x_train,y_train) 197 | 198 | print ("\nGradient Boost - Train Confusion Matrix\n\n",pd.crosstab(y_train,gbc_fit.predict(x_train),rownames = ["Actuall"],colnames = ["Predicted"])) 199 | print ("\nGradient Boost - Train accuracy",round(accuracy_score(y_train,gbc_fit.predict(x_train)),3)) 200 | print ("\nGradient Boost - Train Classification Report\n",classification_report(y_train,gbc_fit.predict(x_train))) 201 | 202 | print ("\n\nGradient Boost - Test Confusion Matrix\n\n",pd.crosstab(y_test,gbc_fit.predict(x_test),rownames = ["Actuall"],colnames = ["Predicted"])) 203 | print ("\nGradient Boost - Test accuracy",round(accuracy_score(y_test,gbc_fit.predict(x_test)),3)) 204 | print ("\nGradient Boost - Test Classification Report\n",classification_report(y_test,gbc_fit.predict(x_test))) 205 | 206 | 207 | # Xgboost Classifier 208 | import xgboost as xgb 209 | 210 | xgb_fit = xgb.XGBClassifier(max_depth=2, n_estimators=5000, learning_rate=0.05) 211 | xgb_fit.fit(x_train, y_train) 212 | 213 | print ("\nXGBoost - Train Confusion Matrix\n\n",pd.crosstab(y_train,xgb_fit.predict(x_train),rownames = ["Actuall"],colnames = ["Predicted"])) 214 | print ("\nXGBoost - Train 
accuracy",round(accuracy_score(y_train,xgb_fit.predict(x_train)),3)) 215 | print ("\nXGBoost - Train Classification Report\n",classification_report(y_train,xgb_fit.predict(x_train))) 216 | 217 | print ("\n\nXGBoost - Test Confusion Matrix\n\n",pd.crosstab(y_test,xgb_fit.predict(x_test),rownames = ["Actuall"],colnames = ["Predicted"])) 218 | print ("\nXGBoost - Test accuracy",round(accuracy_score(y_test,xgb_fit.predict(x_test)),3)) 219 | print ("\nXGBoost - Test Classification Report\n",classification_report(y_test,xgb_fit.predict(x_test))) 220 | 221 | 222 | #Ensemble of Ensembles - by fitting various classifiers 223 | clwght = {0:0.3,1:0.7} 224 | 225 | # Classifier 1 226 | from sklearn.linear_model import LogisticRegression 227 | clf1_logreg_fit = LogisticRegression(fit_intercept=True,class_weight=clwght) 228 | clf1_logreg_fit.fit(x_train,y_train) 229 | 230 | print ("\nLogistic Regression for Ensemble - Train Confusion Matrix\n\n",pd.crosstab(y_train,clf1_logreg_fit.predict(x_train),rownames = ["Actuall"],colnames = ["Predicted"])) 231 | print ("\nLogistic Regression for Ensemble - Train accuracy",round(accuracy_score(y_train,clf1_logreg_fit.predict(x_train)),3)) 232 | print ("\nLogistic Regression for Ensemble - Train Classification Report\n",classification_report(y_train,clf1_logreg_fit.predict(x_train))) 233 | 234 | print ("\n\nLogistic Regression for Ensemble - Test Confusion Matrix\n\n",pd.crosstab(y_test,clf1_logreg_fit.predict(x_test),rownames = ["Actuall"],colnames = ["Predicted"])) 235 | print ("\nLogistic Regression for Ensemble - Test accuracy",round(accuracy_score(y_test,clf1_logreg_fit.predict(x_test)),3)) 236 | print ("\nLogistic Regression for Ensemble - Test Classification Report\n",classification_report(y_test,clf1_logreg_fit.predict(x_test))) 237 | 238 | 239 | # Classifier 2 240 | from sklearn.tree import DecisionTreeClassifier 241 | clf2_dt_fit = DecisionTreeClassifier(criterion="gini",max_depth=5,min_samples_split=2, 242 | min_samples_leaf=1,random_state=42,class_weight=clwght) 243 | clf2_dt_fit.fit(x_train,y_train) 244 | 245 | print ("\nDecision Tree for Ensemble - Train Confusion Matrix\n\n",pd.crosstab(y_train,clf2_dt_fit.predict(x_train),rownames = ["Actuall"],colnames = ["Predicted"])) 246 | print ("\nDecision Tree for Ensemble - Train accuracy",round(accuracy_score(y_train,clf2_dt_fit.predict(x_train)),3)) 247 | print ("\nDecision Tree for Ensemble - Train Classification Report\n",classification_report(y_train,clf2_dt_fit.predict(x_train))) 248 | 249 | print ("\n\nDecision Tree for Ensemble - Test Confusion Matrix\n\n",pd.crosstab(y_test,clf2_dt_fit.predict(x_test),rownames = ["Actuall"],colnames = ["Predicted"])) 250 | print ("\nDecision Tree for Ensemble - Test accuracy",round(accuracy_score(y_test,clf2_dt_fit.predict(x_test)),3)) 251 | print ("\nDecision Tree for Ensemble - Test Classification Report\n",classification_report(y_test,clf2_dt_fit.predict(x_test))) 252 | 253 | 254 | # Classifier 3 255 | from sklearn.ensemble import RandomForestClassifier 256 | clf3_rf_fit = RandomForestClassifier(n_estimators=10000,criterion="gini",max_depth=6, 257 | min_samples_split=2,min_samples_leaf=1,class_weight = clwght) 258 | clf3_rf_fit.fit(x_train,y_train) 259 | 260 | print ("\nRandom Forest for Ensemble - Train Confusion Matrix\n\n",pd.crosstab(y_train,clf3_rf_fit.predict(x_train),rownames = ["Actuall"],colnames = ["Predicted"])) 261 | print ("\nRandom Forest for Ensemble - Train accuracy",round(accuracy_score(y_train,clf3_rf_fit.predict(x_train)),3)) 262 | print 
("\nRandom Forest for Ensemble - Train Classification Report\n",classification_report(y_train,clf3_rf_fit.predict(x_train))) 263 | 264 | print ("\n\nRandom Forest for Ensemble - Test Confusion Matrix\n\n",pd.crosstab(y_test,clf3_rf_fit.predict(x_test),rownames = ["Actuall"],colnames = ["Predicted"])) 265 | print ("\nRandom Forest for Ensemble - Test accuracy",round(accuracy_score(y_test,clf3_rf_fit.predict(x_test)),3)) 266 | print ("\nRandom Forest for Ensemble - Test Classification Report\n",classification_report(y_test,clf3_rf_fit.predict(x_test))) 267 | 268 | 269 | # Classifier 4 270 | from sklearn.ensemble import AdaBoostClassifier 271 | clf4_dtree = DecisionTreeClassifier(criterion='gini',max_depth=1,class_weight = clwght) 272 | clf4_adabst_fit = AdaBoostClassifier(base_estimator= clf4_dtree, 273 | n_estimators=5000,learning_rate=0.05,random_state=42) 274 | 275 | clf4_adabst_fit.fit(x_train, y_train) 276 | 277 | print ("\nAdaBoost for Ensemble - Train Confusion Matrix\n\n",pd.crosstab(y_train,clf4_adabst_fit.predict(x_train),rownames = ["Actuall"],colnames = ["Predicted"])) 278 | print ("\nAdaBoost for Ensemble - Train accuracy",round(accuracy_score(y_train,clf4_adabst_fit.predict(x_train)),3)) 279 | print ("\nAdaBoost for Ensemble - Train Classification Report\n",classification_report(y_train,clf4_adabst_fit.predict(x_train))) 280 | 281 | print ("\n\nAdaBoost for Ensemble - Test Confusion Matrix\n\n",pd.crosstab(y_test,clf4_adabst_fit.predict(x_test),rownames = ["Actuall"],colnames = ["Predicted"])) 282 | print ("\nAdaBoost for Ensemble - Test accuracy",round(accuracy_score(y_test,clf4_adabst_fit.predict(x_test)),3)) 283 | print ("\nAdaBoost for Ensemble - Test Classification Report\n",classification_report(y_test,clf4_adabst_fit.predict(x_test))) 284 | 285 | 286 | ensemble = pd.DataFrame() 287 | 288 | ensemble["log_output_one"] = pd.DataFrame(clf1_logreg_fit.predict_proba(x_train))[1] 289 | ensemble["dtr_output_one"] = pd.DataFrame(clf2_dt_fit.predict_proba(x_train))[1] 290 | ensemble["rf_output_one"] = pd.DataFrame(clf3_rf_fit.predict_proba(x_train))[1] 291 | ensemble["adb_output_one"] = pd.DataFrame(clf4_adabst_fit.predict_proba(x_train))[1] 292 | 293 | ensemble = pd.concat([ensemble,pd.DataFrame(y_train).reset_index(drop = True )],axis=1) 294 | 295 | # Fitting meta-classifier 296 | meta_logit_fit = LogisticRegression(fit_intercept=False) 297 | meta_logit_fit.fit(ensemble[['log_output_one','dtr_output_one','rf_output_one','adb_output_one']],ensemble['Attrition_ind']) 298 | 299 | coefs = meta_logit_fit.coef_ 300 | print ("Co-efficients for LR, DT, RF & AB are:",coefs) 301 | 302 | ensemble_test = pd.DataFrame() 303 | ensemble_test["log_output_one"] = pd.DataFrame(clf1_logreg_fit.predict_proba(x_test))[1] 304 | ensemble_test["dtr_output_one"] = pd.DataFrame(clf2_dt_fit.predict_proba(x_test))[1] 305 | ensemble_test["rf_output_one"] = pd.DataFrame(clf3_rf_fit.predict_proba(x_test))[1] 306 | ensemble_test["adb_output_one"] = pd.DataFrame(clf4_adabst_fit.predict_proba(x_test))[1] 307 | 308 | ensemble_test["all_one"] = meta_logit_fit.predict(ensemble_test[['log_output_one','dtr_output_one','rf_output_one','adb_output_one']]) 309 | 310 | ensemble_test = pd.concat([ensemble_test,pd.DataFrame(y_test).reset_index(drop = True )],axis=1) 311 | 312 | print ("\n\nEnsemble of Models - Test Confusion Matrix\n\n",pd.crosstab(ensemble_test['Attrition_ind'],ensemble_test['all_one'],rownames = ["Actuall"],colnames = ["Predicted"])) 313 | print ("\nEnsemble of Models - Test 
accuracy",round(accuracy_score(ensemble_test['Attrition_ind'],ensemble_test['all_one']),3)) 314 | print ("\nEnsemble of Models - Test Classification Report\n",classification_report(ensemble_test['Attrition_ind'],ensemble_test['all_one'])) 315 | 316 | 317 | 318 | 319 | # Ensemble of Ensembles - by applying bagging on simple classifier 320 | from sklearn.tree import DecisionTreeClassifier 321 | from sklearn.ensemble import BaggingClassifier 322 | from sklearn.ensemble import AdaBoostClassifier 323 | 324 | clwght = {0:0.3,1:0.7} 325 | 326 | eoe_dtree = DecisionTreeClassifier(criterion='gini',max_depth=1,class_weight = clwght) 327 | eoe_adabst_fit = AdaBoostClassifier(base_estimator= eoe_dtree, 328 | n_estimators=500,learning_rate=0.05,random_state=42) 329 | eoe_adabst_fit.fit(x_train, y_train) 330 | 331 | print ("\nAdaBoost - Train Confusion Matrix\n\n",pd.crosstab(y_train,eoe_adabst_fit.predict(x_train),rownames = ["Actuall"],colnames = ["Predicted"])) 332 | print ("\nAdaBoost - Train accuracy",round(accuracy_score(y_train,eoe_adabst_fit.predict(x_train)),3)) 333 | print ("\nAdaBoost - Train Classification Report\n",classification_report(y_train,eoe_adabst_fit.predict(x_train))) 334 | 335 | print ("\n\nAdaBoost - Test Confusion Matrix\n\n",pd.crosstab(y_test,eoe_adabst_fit.predict(x_test),rownames = ["Actuall"],colnames = ["Predicted"])) 336 | print ("\nAdaBoost - Test accuracy",round(accuracy_score(y_test,eoe_adabst_fit.predict(x_test)),3)) 337 | print ("\nAdaBoost - Test Classification Report\n",classification_report(y_test,eoe_adabst_fit.predict(x_test))) 338 | 339 | 340 | bag_fit = BaggingClassifier(base_estimator= eoe_adabst_fit,n_estimators=50, 341 | max_samples=1.0,max_features=1.0, 342 | bootstrap=True, 343 | bootstrap_features=False, 344 | n_jobs=-1, 345 | random_state=42) 346 | 347 | bag_fit.fit(x_train, y_train) 348 | 349 | print ("\nEnsemble of AdaBoost - Train Confusion Matrix\n\n",pd.crosstab(y_train,bag_fit.predict(x_train),rownames = ["Actuall"],colnames = ["Predicted"])) 350 | print ("\nEnsemble of AdaBoost - Train accuracy",round(accuracy_score(y_train,bag_fit.predict(x_train)),3)) 351 | print ("\nEnsemble of AdaBoost - Train Classification Report\n",classification_report(y_train,bag_fit.predict(x_train))) 352 | 353 | print ("\n\nEnsemble of AdaBoost - Test Confusion Matrix\n\n",pd.crosstab(y_test,bag_fit.predict(x_test),rownames = ["Actuall"],colnames = ["Predicted"])) 354 | print ("\nEnsemble of AdaBoost - Test accuracy",round(accuracy_score(y_test,bag_fit.predict(x_test)),3)) 355 | print ("\nEnsemble of AdaBoost - Test Classification Report\n",classification_report(y_test,bag_fit.predict(x_test))) 356 | 357 | 358 | 359 | 360 | 361 | 362 | 363 | 364 | -------------------------------------------------------------------------------- /Chapter05/Chapter 05_KNN n Naive Bayes.R: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Statistics-for-Machine-Learning/41e73f42da97c164859641c2add6e487cbc77402/Chapter05/Chapter 05_KNN n Naive Bayes.R -------------------------------------------------------------------------------- /Chapter05/Chapter 05_KNN n Naive Bayes.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | import os 5 | """ First change the following directory link to where all input files do exist """ 6 | os.chdir("D:\\Book writing\\Codes\\Chapter 5") 7 | 8 | 9 | 10 | import numpy as np 11 | import pandas as pd 12 | 13 | # KNN Curse of 
Dimensionality 14 | import random,math 15 | 16 | def random_point_gen(dimension): 17 | return [random.random() for _ in range(dimension)] 18 | 19 | def distance(v,w): 20 | vec_sub = [v_i-w_i for v_i,w_i in zip(v,w)] 21 | sum_of_sqrs = sum(v_i*v_i for v_i in vec_sub) 22 | return math.sqrt(sum_of_sqrs) 23 | 24 | def random_distances_comparison(dimension,number_pairs): 25 | return [distance(random_point_gen(dimension),random_point_gen(dimension)) 26 | for _ in range(number_pairs)] 27 | 28 | def mean(x): 29 | return sum(x) / len(x) 30 | 31 | dimensions = range(1, 201, 5) 32 | 33 | avg_distances = [] 34 | min_distances = [] 35 | 36 | 37 | dummyarray = np.empty((20,4)) 38 | dist_vals = pd.DataFrame(dummyarray) 39 | dist_vals.columns = ["Dimension","Min_Distance","Avg_Distance","Min/Avg_Distance"] 40 | 41 | random.seed(34) 42 | i = 0 43 | for dims in dimensions: 44 | distances = random_distances_comparison(dims, 1000) 45 | avg_distances.append(mean(distances)) 46 | min_distances.append(min(distances)) 47 | 48 | dist_vals.loc[i,"Dimension"] = dims 49 | dist_vals.loc[i,"Min_Distance"] = min(distances) 50 | dist_vals.loc[i,"Avg_Distance"] = mean(distances) 51 | dist_vals.loc[i,"Min/Avg_Distance"] = min(distances)/mean(distances) 52 | 53 | print(dims, min(distances), mean(distances), min(distances)*1.0 / mean(distances)) 54 | i = i+1 55 | 56 | # Ploting Average distances for Various Dimensions 57 | import matplotlib.pyplot as plt 58 | plt.figure() 59 | #plt.title('Avg. Distance Change with Number of Dimensions for 1K Obs') 60 | plt.xlabel('Dimensions') 61 | plt.ylabel('Avg. Distance') 62 | plt.plot(dist_vals["Dimension"],dist_vals["Avg_Distance"]) 63 | plt.legend(loc='best') 64 | plt.show() 65 | 66 | 67 | 68 | # 1-Dimension Plot 69 | import numpy as np 70 | import pandas as pd 71 | import matplotlib.pyplot as plt 72 | 73 | one_d_data = np.random.rand(60,1) 74 | one_d_data_df = pd.DataFrame(one_d_data) 75 | one_d_data_df.columns = ["1D_Data"] 76 | one_d_data_df["height"] = 1 77 | 78 | plt.figure() 79 | plt.scatter(one_d_data_df['1D_Data'],one_d_data_df["height"]) 80 | plt.yticks([]) 81 | plt.xlabel("1-D points") 82 | plt.show() 83 | 84 | # 2- Dimensions Plot 85 | two_d_data = np.random.rand(60,2) 86 | two_d_data_df = pd.DataFrame(two_d_data) 87 | two_d_data_df.columns = ["x_axis","y_axis"] 88 | 89 | plt.figure() 90 | plt.scatter(two_d_data_df['x_axis'],two_d_data_df["y_axis"]) 91 | plt.xlabel("x_axis");plt.ylabel("y_axis") 92 | plt.show() 93 | 94 | # 3- Dimensions Plot 95 | three_d_data = np.random.rand(60,3) 96 | three_d_data_df = pd.DataFrame(three_d_data) 97 | three_d_data_df.columns = ["x_axis","y_axis","z_axis"] 98 | 99 | from mpl_toolkits.mplot3d import Axes3D 100 | fig = plt.figure() 101 | ax = fig.add_subplot(111, projection='3d') 102 | ax.scatter(three_d_data_df['x_axis'],three_d_data_df["y_axis"],three_d_data_df["z_axis"]) 103 | ax.set_xlabel('x_axis') 104 | ax.set_ylabel('y_axis') 105 | ax.set_zlabel('z_axis') 106 | plt.show() 107 | 108 | 109 | 110 | 111 | # KNN CLassifier - Breast Cancer 112 | import numpy as np 113 | import pandas as pd 114 | from sklearn.metrics import accuracy_score,classification_report 115 | 116 | breast_cancer = pd.read_csv("Breast_Cancer_Wisconsin.csv") 117 | 118 | print (breast_cancer.head()) 119 | 120 | breast_cancer['Bare_Nuclei'] = breast_cancer['Bare_Nuclei'].replace('?', np.NAN) 121 | breast_cancer['Bare_Nuclei'] = breast_cancer['Bare_Nuclei'].fillna(breast_cancer['Bare_Nuclei'].value_counts().index[0]) 122 | 123 | breast_cancer['Cancer_Ind'] = 0 124 | 
breast_cancer.loc[breast_cancer['Class']==4,'Cancer_Ind'] = 1 125 | 126 | x_vars = breast_cancer.drop(['ID_Number','Class','Cancer_Ind'],axis=1) 127 | y_var = breast_cancer['Cancer_Ind'] 128 | 129 | 130 | from sklearn.preprocessing import StandardScaler 131 | x_vars_stdscle = StandardScaler().fit_transform(x_vars.values) 132 | from sklearn.model_selection import train_test_split 133 | x_vars_stdscle_df = pd.DataFrame(x_vars_stdscle, index=x_vars.index, columns=x_vars.columns) 134 | x_train,x_test,y_train,y_test = train_test_split(x_vars_stdscle_df,y_var,train_size = 0.7,random_state=42) 135 | 136 | from sklearn.neighbors import KNeighborsClassifier 137 | knn_fit = KNeighborsClassifier(n_neighbors=3,p=2,metric='minkowski') 138 | knn_fit.fit(x_train,y_train) 139 | 140 | print ("\nK-Nearest Neighbors - Train Confusion Matrix\n\n",pd.crosstab(y_train,knn_fit.predict(x_train),rownames = ["Actuall"],colnames = ["Predicted"]) ) 141 | print ("\nK-Nearest Neighbors - Train accuracy:",round(accuracy_score(y_train,knn_fit.predict(x_train)),3)) 142 | print ("\nK-Nearest Neighbors - Train Classification Report\n",classification_report(y_train,knn_fit.predict(x_train))) 143 | 144 | print ("\n\nK-Nearest Neighbors - Test Confusion Matrix\n\n",pd.crosstab(y_test,knn_fit.predict(x_test),rownames = ["Actuall"],colnames = ["Predicted"])) 145 | print ("\nK-Nearest Neighbors - Test accuracy:",round(accuracy_score(y_test,knn_fit.predict(x_test)),3)) 146 | print ("\nK-Nearest Neighbors - Test Classification Report\n",classification_report(y_test,knn_fit.predict(x_test))) 147 | 148 | 149 | # Tuning of K- value for Train & Test data 150 | dummyarray = np.empty((5,3)) 151 | k_valchart = pd.DataFrame(dummyarray) 152 | k_valchart.columns = ["K_value","Train_acc","Test_acc"] 153 | 154 | k_vals = [1,2,3,4,5] 155 | for i in range(len(k_vals)): 156 | knn_fit = KNeighborsClassifier(n_neighbors=k_vals[i],p=2,metric='minkowski') 157 | knn_fit.fit(x_train,y_train) 158 | 159 | print ("\nK-value",k_vals[i]) 160 | 161 | tr_accscore = round(accuracy_score(y_train,knn_fit.predict(x_train)),3) 162 | print ("\nK-Nearest Neighbors - Train Confusion Matrix\n\n",pd.crosstab(y_train,knn_fit.predict(x_train),rownames = ["Actuall"],colnames = ["Predicted"]) ) 163 | print ("\nK-Nearest Neighbors - Train accuracy:",tr_accscore) 164 | print ("\nK-Nearest Neighbors - Train Classification Report\n",classification_report(y_train,knn_fit.predict(x_train))) 165 | 166 | ts_accscore = round(accuracy_score(y_test,knn_fit.predict(x_test)),3) 167 | print ("\n\nK-Nearest Neighbors - Test Confusion Matrix\n\n",pd.crosstab(y_test,knn_fit.predict(x_test),rownames = ["Actuall"],colnames = ["Predicted"])) 168 | print ("\nK-Nearest Neighbors - Test accuracy:",ts_accscore) 169 | print ("\nK-Nearest Neighbors - Test Classification Report\n",classification_report(y_test,knn_fit.predict(x_test))) 170 | 171 | k_valchart.loc[i, 'K_value'] = k_vals[i] 172 | k_valchart.loc[i, 'Train_acc'] = tr_accscore 173 | k_valchart.loc[i, 'Test_acc'] = ts_accscore 174 | 175 | 176 | # Ploting accuracies over varied K-values 177 | import matplotlib.pyplot as plt 178 | plt.figure() 179 | #plt.title('KNN Train & Test Accuracy change with K-value') 180 | 181 | plt.xlabel('K-value') 182 | plt.ylabel('Accuracy') 183 | plt.plot(k_valchart["K_value"],k_valchart["Train_acc"]) 184 | plt.plot(k_valchart["K_value"],k_valchart["Test_acc"]) 185 | 186 | plt.axis([0.9,5, 0.92, 1.005]) 187 | plt.xticks([1,2,3,4,5]) 188 | 189 | for a,b in zip(k_valchart["K_value"],k_valchart["Train_acc"]): 190 | 
plt.text(a, b, str(b),fontsize=10) 191 | 192 | for a,b in zip(k_valchart["K_value"],k_valchart["Test_acc"]): 193 | plt.text(a, b, str(b),fontsize=10) 194 | 195 | plt.legend(loc='upper right') 196 | 197 | plt.show() 198 | 199 | 200 | 201 | 202 | 203 | # Naive Bayes using NLP 204 | 205 | # USe following code if it wont work in first place with UTF-8 code error 206 | 207 | # import sys 208 | # reload(sys) 209 | # sys.setdefaultencoding('utf-8') 210 | 211 | import csv 212 | 213 | smsdata = open('SMSSpamCollection.txt','r') 214 | csv_reader = csv.reader(smsdata,delimiter='\t') 215 | 216 | smsdata_data = [] 217 | smsdata_labels = [] 218 | 219 | for line in csv_reader: 220 | smsdata_labels.append(line[0]) 221 | smsdata_data.append(line[1]) 222 | 223 | smsdata.close() 224 | 225 | # Printing top 5 lines 226 | for i in range(5): 227 | print (smsdata_data[i],smsdata_labels[i]) 228 | 229 | # Printing Spam & Ham count 230 | from collections import Counter 231 | c = Counter( smsdata_labels ) 232 | print(c) 233 | 234 | 235 | import nltk 236 | from nltk.corpus import stopwords 237 | from nltk.stem import WordNetLemmatizer 238 | import string 239 | import pandas as pd 240 | from nltk import pos_tag 241 | from nltk.stem import PorterStemmer 242 | 243 | def preprocessing(text): 244 | text2 = " ".join("".join([" " if ch in string.punctuation else ch for ch in text]).split()) 245 | 246 | tokens = [word for sent in nltk.sent_tokenize(text2) for word in 247 | nltk.word_tokenize(sent)] 248 | 249 | tokens = [word.lower() for word in tokens] 250 | 251 | stopwds = stopwords.words('english') 252 | tokens = [token for token in tokens if token not in stopwds] 253 | 254 | tokens = [word for word in tokens if len(word)>=3] 255 | 256 | stemmer = PorterStemmer() 257 | tokens = [stemmer.stem(word) for word in tokens] 258 | 259 | tagged_corpus = pos_tag(tokens) 260 | 261 | Noun_tags = ['NN','NNP','NNPS','NNS'] 262 | Verb_tags = ['VB','VBD','VBG','VBN','VBP','VBZ'] 263 | 264 | lemmatizer = WordNetLemmatizer() 265 | 266 | def prat_lemmatize(token,tag): 267 | if tag in Noun_tags: 268 | return lemmatizer.lemmatize(token,'n') 269 | elif tag in Verb_tags: 270 | return lemmatizer.lemmatize(token,'v') 271 | else: 272 | return lemmatizer.lemmatize(token,'n') 273 | 274 | pre_proc_text = " ".join([prat_lemmatize(token,tag) for token,tag in tagged_corpus]) 275 | 276 | return pre_proc_text 277 | 278 | 279 | smsdata_data_2 = [] 280 | 281 | for i in smsdata_data: 282 | smsdata_data_2.append(preprocessing(i)) 283 | 284 | 285 | import numpy as np 286 | 287 | 288 | trainset_size = int(round(len(smsdata_data_2)*0.70)) 289 | 290 | 291 | print ('The training set size for this classifier is ' + str(trainset_size) + '\n') 292 | 293 | x_train = np.array([''.join(rec) for rec in smsdata_data_2[0:trainset_size]]) 294 | y_train = np.array([rec for rec in smsdata_labels[0:trainset_size]]) 295 | x_test = np.array([''.join(rec) for rec in smsdata_data_2[trainset_size+1:len(smsdata_data_2)]]) 296 | y_test = np.array([rec for rec in smsdata_labels[trainset_size+1:len(smsdata_labels)]]) 297 | 298 | 299 | # building TFIDF vectorizer 300 | from sklearn.feature_extraction.text import TfidfVectorizer 301 | 302 | vectorizer = TfidfVectorizer(min_df=2, ngram_range=(1, 2), stop_words='english', 303 | max_features= 4000,strip_accents='unicode', norm='l2') 304 | 305 | x_train_2 = vectorizer.fit_transform(x_train).todense() 306 | x_test_2 = vectorizer.transform(x_test).todense() 307 | 308 | from sklearn.naive_bayes import MultinomialNB 309 | clf = 
MultinomialNB().fit(x_train_2, y_train) 310 | 311 | ytrain_nb_predicted = clf.predict(x_train_2) 312 | ytest_nb_predicted = clf.predict(x_test_2) 313 | 314 | from sklearn.metrics import classification_report,accuracy_score 315 | 316 | print ("\nNaive Bayes - Train Confusion Matrix\n\n",pd.crosstab(y_train,ytrain_nb_predicted,rownames = ["Actuall"],colnames = ["Predicted"])) 317 | print ("\nNaive Bayes- Train accuracy",round(accuracy_score(y_train,ytrain_nb_predicted),3)) 318 | print ("\nNaive Bayes - Train Classification Report\n",classification_report(y_train,ytrain_nb_predicted)) 319 | 320 | print ("\nNaive Bayes - Test Confusion Matrix\n\n",pd.crosstab(y_test,ytest_nb_predicted,rownames = ["Actuall"],colnames = ["Predicted"])) 321 | print ("\nNaive Bayes- Test accuracy",round(accuracy_score(y_test,ytest_nb_predicted),3)) 322 | print ("\nNaive Bayes - Test Classification Report\n",classification_report(y_test,ytest_nb_predicted)) 323 | 324 | 325 | # printing top features 326 | feature_names = vectorizer.get_feature_names() 327 | coefs = clf.coef_ 328 | intercept = clf.intercept_ 329 | coefs_with_fns = sorted(zip(clf.coef_[0], feature_names)) 330 | 331 | print ("\n\nTop 10 features - both first & last\n") 332 | n=10 333 | top_n_coefs = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1]) 334 | for (coef_1, fn_1), (coef_2, fn_2) in top_n_coefs: 335 | print('\t%.4f\t%-15s\t\t%.4f\t%-15s' % (coef_1, fn_1, coef_2, fn_2)) 336 | 337 | 338 | -------------------------------------------------------------------------------- /Chapter06/Chapter 06_SVM_n_NN.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | rm(list = ls()) 7 | 8 | # First change the following directory link to where all the input files do exist 9 | setwd("D:\\Book writing\\Codes\\Chapter 6") 10 | 11 | letter_data = read.csv("letterdata.csv") 12 | 13 | set.seed(123) 14 | numrow = nrow(letter_data) 15 | trnind = sample(1:numrow,size = as.integer(0.7*numrow)) 16 | train_data = letter_data[trnind,] 17 | test_data = letter_data[-trnind,] 18 | 19 | 20 | library(e1071) 21 | 22 | accrcy <- function(matrx){ 23 | return( sum(diag(matrx)/sum(matrx)))} 24 | 25 | precsn <- function(matrx){ 26 | return(diag(matrx) / rowSums(matrx)) 27 | } 28 | 29 | recll <- function(matrx){ 30 | return(diag(matrx) / colSums(matrx)) 31 | } 32 | 33 | 34 | 35 | 36 | # SVM - Linear Kernel 37 | svm_fit = svm(letter~.,data = train_data,kernel="linear",cost=1.0,scale = TRUE) 38 | 39 | tr_y_pred = predict(svm_fit, train_data) 40 | ts_y_pred = predict(svm_fit,test_data) 41 | 42 | tr_y_act = train_data$letter;ts_y_act = test_data$letter 43 | 44 | tr_tble = table(tr_y_act,tr_y_pred) 45 | print(paste("Train Confusion Matrix")) 46 | print(tr_tble) 47 | tr_acc = accrcy(tr_tble) 48 | print(paste("SVM Linear Kernel Train accuracy:",round(tr_acc,4))) 49 | 50 | tr_prec = precsn(tr_tble) 51 | print(paste("SVM Linear Kernel Train Precision:")) 52 | print(tr_prec) 53 | 54 | tr_rcl = recll(tr_tble) 55 | print(paste("SVM Linear Kernel Train Recall:")) 56 | print(tr_rcl) 57 | 58 | ts_tble = table(ts_y_act,ts_y_pred) 59 | print(paste("Test Confusion Matrix")) 60 | print(ts_tble) 61 | 62 | ts_acc = accrcy(ts_tble) 63 | print(paste("SVM Linear Kernel Test accuracy:",round(ts_acc,4))) 64 | 65 | ts_prec = precsn(ts_tble) 66 | print(paste("SVM Linear Kernel Test Precision:")) 67 | print(ts_prec) 68 | 69 | ts_rcl = recll(ts_tble) 70 | print(paste("SVM Linear Kernel Test Recall:")) 71 | print(ts_rcl) 72 | 73 | 74 | # SVM - Polynomial 
Kernel 75 | svm_poly_fit = svm(letter~.,data = train_data,kernel="poly",cost=1.0,degree = 2 ,scale = TRUE) 76 | 77 | tr_y_pred = predict(svm_poly_fit, train_data) 78 | ts_y_pred = predict(svm_poly_fit,test_data) 79 | 80 | tr_y_act = train_data$letter;ts_y_act = test_data$letter 81 | 82 | 83 | tr_tble = table(tr_y_act,tr_y_pred) 84 | print(paste("Train Confusion Matrix")) 85 | print(tr_tble) 86 | tr_acc = accrcy(tr_tble) 87 | print(paste("SVM Polynomial Kernel Train accuracy:",round(tr_acc,4))) 88 | 89 | tr_prec = precsn(tr_tble) 90 | print(paste("SVM Polynomial Kernel Train Precision:")) 91 | print(tr_prec) 92 | 93 | tr_rcl = recll(tr_tble) 94 | print(paste("SVM Polynomial Kernel Train Recall:")) 95 | print(tr_rcl) 96 | 97 | ts_tble = table(ts_y_act,ts_y_pred) 98 | print(paste("Test Confusion Matrix")) 99 | print(ts_tble) 100 | 101 | ts_acc = accrcy(ts_tble) 102 | print(paste("SVM Polynomial Kernel Test accuracy:",round(ts_acc,4))) 103 | 104 | ts_prec = precsn(ts_tble) 105 | print(paste("SVM Polynomial Kernel Test Precision:")) 106 | print(ts_prec) 107 | 108 | ts_rcl = recll(ts_tble) 109 | print(paste("SVM Polynomial Kernel Test Recall:")) 110 | print(ts_rcl) 111 | 112 | 113 | 114 | # SVM - RBF Kernel 115 | svm_rbf_fit = svm(letter~.,data = train_data,kernel="radial",cost=1.0,gamma = 0.2 ,scale = TRUE) 116 | 117 | tr_y_pred = predict(svm_rbf_fit, train_data) 118 | ts_y_pred = predict(svm_rbf_fit,test_data) 119 | 120 | tr_y_act = train_data$letter;ts_y_act = test_data$letter 121 | 122 | tr_tble = table(tr_y_act,tr_y_pred) 123 | print(paste("Train Confusion Matrix")) 124 | print(tr_tble) 125 | tr_acc = accrcy(tr_tble) 126 | print(paste("SVM RBF Kernel Train accuracy:",round(tr_acc,4))) 127 | 128 | tr_prec = precsn(tr_tble) 129 | print(paste("SVM RBF Kernel Train Precision:")) 130 | print(tr_prec) 131 | 132 | tr_rcl = recll(tr_tble) 133 | print(paste("SVM RBF Kernel Train Recall:")) 134 | print(tr_rcl) 135 | 136 | ts_tble = table(ts_y_act,ts_y_pred) 137 | print(paste("Test Confusion Matrix")) 138 | print(ts_tble) 139 | 140 | ts_acc = accrcy(ts_tble) 141 | print(paste("SVM RBF Kernel Test accuracy:",round(ts_acc,4))) 142 | 143 | ts_prec = precsn(ts_tble) 144 | print(paste("SVM RBF Kernel Test Precision:")) 145 | print(ts_prec) 146 | 147 | ts_rcl = recll(ts_tble) 148 | print(paste("SVM RBF Kernel Test Recall:")) 149 | print(ts_rcl) 150 | 151 | 152 | 153 | # Grid search - RBF Kernel 154 | library(e1071) 155 | svm_rbf_grid = tune(svm,letter~.,data = train_data,kernel="radial",scale=TRUE,ranges = list( 156 | cost = c(0.1,0.3,1,3,10,30), 157 | gamma = c(0.001,0.01,0.1,0.3,1) 158 | 159 | ), 160 | tunecontrol = tune.control(cross = 5) 161 | ) 162 | 163 | print(paste("Best parameter from Grid Search")) 164 | print(summary(svm_rbf_grid)) 165 | 166 | best_model = svm_rbf_grid$best.model 167 | 168 | tr_y_pred = predict(best_model,data = train_data,type = "response") 169 | ts_y_pred = predict(best_model,newdata = test_data,type = "response") 170 | 171 | tr_y_act = train_data$letter;ts_y_act = test_data$letter 172 | 173 | 174 | tr_tble = table(tr_y_act,tr_y_pred) 175 | print(paste("Train Confusion Matrix")) 176 | print(tr_tble) 177 | tr_acc = accrcy(tr_tble) 178 | print(paste("SVM RBF Kernel Train accuracy:",round(tr_acc,4))) 179 | 180 | tr_prec = precsn(tr_tble) 181 | print(paste("SVM RBF Kernel Train Precision:")) 182 | print(tr_prec) 183 | 184 | tr_rcl = recll(tr_tble) 185 | print(paste("SVM RBF Kernel Train Recall:")) 186 | print(tr_rcl) 187 | 188 | ts_tble = table(ts_y_act,ts_y_pred) 189 | 
print(paste("Test Confusion Matrix")) 190 | print(ts_tble) 191 | 192 | ts_acc = accrcy(ts_tble) 193 | print(paste("SVM RBF Kernel Test accuracy:",round(ts_acc,4))) 194 | 195 | ts_prec = precsn(ts_tble) 196 | print(paste("SVM RBF Kernel Test Precision:")) 197 | print(ts_prec) 198 | 199 | ts_rcl = recll(ts_tble) 200 | print(paste("SVM RBF Kernel Test Recall:")) 201 | print(ts_rcl) 202 | 203 | 204 | 205 | 206 | # Artificial Neural Networks 207 | setwd("D:\\Book writing\\Codes\\Chapter 6") 208 | digits_data = read.csv("digitsdata.csv") 209 | 210 | remove_cols = c("target") 211 | x_data = digits_data[,!(names(digits_data) %in% remove_cols)] 212 | y_data = digits_data[,c("target")] 213 | 214 | 215 | normalize <- function(x) {return((x - min(x)) / (max(x) - min(x)))} 216 | 217 | 218 | data_norm <- as.data.frame(lapply(x_data, normalize)) 219 | data_norm <- replace(data_norm, is.na(data_norm), 0.0) 220 | data_norm_v2 = data.frame(as.factor(y_data),data_norm) 221 | names(data_norm_v2)[1] = "target" 222 | 223 | 224 | set.seed(123) 225 | numrow = nrow(data_norm_v2) 226 | trnind = sample(1:numrow,size = as.integer(0.7*numrow)) 227 | train_data = data_norm_v2[trnind,] 228 | test_data = data_norm_v2[-trnind,] 229 | 230 | f <- as.formula(paste("target ~", paste(names(train_data)[!names(train_data) %in% "target"], collapse = " + "))) 231 | 232 | library(nnet) 233 | accuracy <- function(mat){return(sum(diag(mat)) / sum(mat))} 234 | 235 | nnet_fit = nnet(f,train_data,size=c(9),maxit=200) 236 | y_pred = predict(nnet_fit,newdata = test_data,type = "class") 237 | tble = table(test_data$target,y_pred) 238 | print(accuracy(tble)) 239 | 240 | 241 | #Plotting nnet from the github packages 242 | require(RCurl) 243 | root.url<-'https://gist.githubusercontent.com/fawda123' 244 | raw.fun<-paste( 245 | root.url, 246 | '5086859/raw/cc1544804d5027d82b70e74b83b3941cd2184354/nnet_plot_fun.r', 247 | sep='/') 248 | script<-getURL(raw.fun, ssl.verifypeer = FALSE) 249 | eval(parse(text = script)) 250 | rm('script','raw.fun') 251 | 252 | # Ploting the neural net 253 | plot(nnet_fit) 254 | 255 | 256 | # Grid Search - ANN 257 | neurons = c(1,2,3,4,5,6,7,8,9,10,11,12,13) 258 | iters = c(200,300,400,500,600,700,800,900) 259 | 260 | initacc = 0 261 | 262 | for(itr in iters){ 263 | for(nd in neurons){ 264 | nnet_fit = nnet(f,train_data,size=c(nd),maxit=itr,trace=FALSE) 265 | y_pred = predict(nnet_fit,newdata = test_data,type = "class") 266 | tble = table(test_data$target,y_pred) 267 | acc = accuracy(tble) 268 | 269 | if (acc>initacc){ 270 | print(paste("Neurons",nd,"Iterations",itr,"Test accuracy",acc)) 271 | initacc = acc 272 | } 273 | 274 | } 275 | } 276 | 277 | 278 | 279 | -------------------------------------------------------------------------------- /Chapter06/Chapter 06_SVM_n_NN.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import os 4 | """ First change the following directory link to where all input files do exist """ 5 | os.chdir("D:\\Book writing\\Codes\\Chapter 6") 6 | 7 | 8 | 9 | import pandas as pd 10 | letterdata = pd.read_csv("letterdata.csv") 11 | print (letterdata.head()) 12 | 13 | x_vars = letterdata.drop(['letter'],axis=1) 14 | y_var = letterdata["letter"] 15 | 16 | y_var = y_var.replace({'A':1,'B':2,'C':3,'D':4,'E':5,'F':6,'G':7,'H':8,'I':9,'J':10, 17 | 'K':11,'L':12,'M':13,'N':14,'O':15,'P':16,'Q':17,'R':18,'S':19,'T':20, 18 | 'U':21,'V':22,'W':23,'X':24,'Y':25,'Z':26}) 19 | 20 | from sklearn.metrics import accuracy_score,classification_report 21 | from 
sklearn.model_selection import train_test_split 22 | x_train,x_test,y_train,y_test = train_test_split(x_vars,y_var,train_size = 0.7,random_state=42) 23 | 24 | 25 | # Linear Classifier 26 | from sklearn.svm import SVC 27 | svm_fit = SVC(kernel='linear',C=1.0,random_state=43) 28 | svm_fit.fit(x_train,y_train) 29 | 30 | print ("\nSVM Linear Classifier - Train Confusion Matrix\n\n",pd.crosstab(y_train,svm_fit.predict(x_train),rownames = ["Actuall"],colnames = ["Predicted"]) ) 31 | print ("\nSVM Linear Classifier - Train accuracy:",round(accuracy_score(y_train,svm_fit.predict(x_train)),3)) 32 | print ("\nSVM Linear Classifier - Train Classification Report\n",classification_report(y_train,svm_fit.predict(x_train))) 33 | 34 | print ("\n\nSVM Linear Classifier - Test Confusion Matrix\n\n",pd.crosstab(y_test,svm_fit.predict(x_test),rownames = ["Actuall"],colnames = ["Predicted"])) 35 | print ("\nSVM Linear Classifier - Test accuracy:",round(accuracy_score(y_test,svm_fit.predict(x_test)),3)) 36 | print ("\nSVM Linear Classifier - Test Classification Report\n",classification_report(y_test,svm_fit.predict(x_test))) 37 | 38 | 39 | #Polynomial Kernel 40 | svm_poly_fit = SVC(kernel='poly',C=1.0,degree=2) 41 | svm_poly_fit.fit(x_train,y_train) 42 | 43 | print ("\nSVM Polynomial Kernel Classifier - Train Confusion Matrix\n\n",pd.crosstab(y_train,svm_poly_fit.predict(x_train),rownames = ["Actuall"],colnames = ["Predicted"]) ) 44 | print ("\nSVM Polynomial Kernel Classifier - Train accuracy:",round(accuracy_score(y_train,svm_poly_fit.predict(x_train)),3)) 45 | print ("\nSVM Polynomial Kernel Classifier - Train Classification Report\n",classification_report(y_train,svm_poly_fit.predict(x_train))) 46 | 47 | print ("\n\nSVM Polynomial Kernel Classifier - Test Confusion Matrix\n\n",pd.crosstab(y_test,svm_poly_fit.predict(x_test),rownames = ["Actuall"],colnames = ["Predicted"])) 48 | print ("\nSVM Polynomial Kernel Classifier - Test accuracy:",round(accuracy_score(y_test,svm_poly_fit.predict(x_test)),3)) 49 | print ("\nSVM Polynomial Kernel Classifier - Test Classification Report\n",classification_report(y_test,svm_poly_fit.predict(x_test))) 50 | 51 | 52 | #RBF Kernel 53 | svm_rbf_fit = SVC(kernel='rbf',C=1.0, gamma=0.1) 54 | svm_rbf_fit.fit(x_train,y_train) 55 | 56 | print ("\nSVM RBF Kernel Classifier - Train Confusion Matrix\n\n",pd.crosstab(y_train,svm_rbf_fit.predict(x_train),rownames = ["Actuall"],colnames = ["Predicted"]) ) 57 | print ("\nSVM RBF Kernel Classifier - Train accuracy:",round(accuracy_score(y_train,svm_rbf_fit.predict(x_train)),3)) 58 | print ("\nSVM RBF Kernel Classifier - Train Classification Report\n",classification_report(y_train,svm_rbf_fit.predict(x_train))) 59 | 60 | print ("\n\nSVM RBF Kernel Classifier - Test Confusion Matrix\n\n",pd.crosstab(y_test,svm_rbf_fit.predict(x_test),rownames = ["Actuall"],colnames = ["Predicted"])) 61 | print ("\nSVM RBF Kernel Classifier - Test accuracy:",round(accuracy_score(y_test,svm_rbf_fit.predict(x_test)),3)) 62 | print ("\nSVM RBF Kernel Classifier - Test Classification Report\n",classification_report(y_test,svm_rbf_fit.predict(x_test))) 63 | 64 | 65 | 66 | # Grid Search - RBF Kernel 67 | from sklearn.pipeline import Pipeline 68 | from sklearn.model_selection import train_test_split,GridSearchCV 69 | 70 | pipeline = Pipeline([('clf',SVC(kernel='rbf',C=1,gamma=0.1 ))]) 71 | 72 | parameters = {'clf__C':(0.1,0.3,1,3,10,30), 73 | 'clf__gamma':(0.001,0.01,0.1,0.3,1)} 74 | 75 | grid_search_rbf = 
GridSearchCV(pipeline,parameters,n_jobs=-1,cv=5,verbose=1,scoring='accuracy') 76 | grid_search_rbf.fit(x_train,y_train) 77 | 78 | 79 | print ('RBF Kernel Grid Search Best Training score: %0.3f' % grid_search_rbf.best_score_) 80 | print ('RBF Kernel Grid Search Best parameters set:') 81 | best_parameters = grid_search_rbf.best_estimator_.get_params() 82 | for param_name in sorted(parameters.keys()): 83 | print ('\t%s: %r' % (param_name, best_parameters[param_name])) 84 | 85 | predictions = grid_search_rbf.predict(x_test) 86 | 87 | print ("\nRBF Kernel Grid Search - Testing accuracy:",round(accuracy_score(y_test, predictions),4)) 88 | print ("\nRBF Kernel Grid Search - Test Classification Report\n",classification_report(y_test, predictions)) 89 | print ("\n\nRBF Kernel Grid Search- Test Confusion Matrix\n\n",pd.crosstab(y_test, predictions,rownames = ["Actuall"],colnames = ["Predicted"])) 90 | 91 | 92 | 93 | 94 | 95 | 96 | # Neural Networks - Classifying hand-written digits using Scikit Learn 97 | 98 | import pandas as pd 99 | from sklearn.datasets import load_digits 100 | from sklearn.model_selection import train_test_split 101 | from sklearn.pipeline import Pipeline 102 | from sklearn.preprocessing import StandardScaler 103 | 104 | from sklearn.neural_network import MLPClassifier 105 | digits = load_digits() 106 | X = digits.data 107 | y = digits.target 108 | 109 | 110 | # Checking dimensions 111 | print (X.shape) 112 | print (y.shape) 113 | 114 | # Plotting first digit 115 | import matplotlib.pyplot as plt 116 | plt.matshow(digits.images[0]) 117 | plt.show() 118 | 119 | #X_df = pd.DataFrame(X) 120 | #y_df = pd.DataFrame(y) 121 | #y_df.columns = ['target'] 122 | #digitdata = pd.concat([y_df,X_df],axis=1) 123 | #digitdata.to_csv("digitsdata.csv",index= False) 124 | 125 | from sklearn.model_selection import train_test_split 126 | x_vars_stdscle = StandardScaler().fit_transform(X) 127 | x_train,x_test,y_train,y_test = train_test_split(x_vars_stdscle,y,train_size = 0.7,random_state=42) 128 | 129 | 130 | # Grid Search - Neural Network 131 | from sklearn.pipeline import Pipeline 132 | from sklearn.model_selection import train_test_split,GridSearchCV 133 | from sklearn.metrics import accuracy_score,classification_report 134 | 135 | pipeline = Pipeline([('mlp',MLPClassifier(hidden_layer_sizes= (100,50,),activation='relu', 136 | solver='adam',alpha=0.0001,max_iter=300 )) ]) 137 | 138 | parameters = {'mlp__alpha':(0.001,0.01,0.1,0.3,0.5,1.0), 139 | 'mlp__max_iter':(100,200,300)} 140 | 141 | grid_search_nn = GridSearchCV(pipeline,parameters,n_jobs=-1,cv=5,verbose=1,scoring='accuracy') 142 | grid_search_nn.fit(x_train,y_train) 143 | 144 | print ('\n\nNeural Network Best Training score: %0.3f' % grid_search_nn.best_score_) 145 | print ('\nNeural Network Best parameters set:') 146 | best_parameters = grid_search_nn.best_estimator_.get_params() 147 | for param_name in sorted(parameters.keys()): 148 | print ('\t%s: %r' % (param_name, best_parameters[param_name])) 149 | 150 | predictions_train = grid_search_nn.predict(x_train) 151 | predictions_test = grid_search_nn.predict(x_test) 152 | 153 | print ("\nNeural Network Training accuracy:",round(accuracy_score(y_train, predictions_train),4)) 154 | print ("\nNeural Network Complete report of Training data\n",classification_report(y_train, predictions_train)) 155 | print ("\n\nNeural Network Grid Search- Train Confusion Matrix\n\n",pd.crosstab(y_train, predictions_train,rownames = ["Actuall"],colnames = ["Predicted"])) 156 | 157 | print ("\n\nNeural Network 
Testing accuracy:",round(accuracy_score(y_test, predictions_test),4)) 158 | print ("\nNeural Network Complete report of Testing data\n",classification_report(y_test, predictions_test)) 159 | print ("\n\nNeural Network Grid Search- Test Confusion Matrix\n\n",pd.crosstab(y_test, predictions_test,rownames = ["Actuall"],colnames = ["Predicted"])) 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | # Neural Networks - Classifying hand-written digits using Keras 168 | import numpy as np 169 | import pandas as pd 170 | import matplotlib.pyplot as plt 171 | 172 | from sklearn.datasets import load_digits 173 | from sklearn.model_selection import train_test_split 174 | from sklearn.preprocessing import StandardScaler 175 | from sklearn.metrics import accuracy_score,classification_report 176 | 177 | from keras.models import Sequential 178 | from keras.layers.core import Dense, Dropout, Activation 179 | from keras.optimizers import Adadelta,Adam,RMSprop 180 | from keras.utils import np_utils 181 | 182 | 183 | digits = load_digits() 184 | X = digits.data 185 | y = digits.target 186 | 187 | print (X.shape) 188 | print (y.shape) 189 | 190 | print ("\nPrinting first digit") 191 | plt.matshow(digits.images[0]) 192 | plt.show() 193 | 194 | 195 | x_vars_stdscle = StandardScaler().fit_transform(X) 196 | x_train,x_test,y_train,y_test = train_test_split(x_vars_stdscle,y,train_size = 0.7,random_state=42) 197 | 198 | # Definiting hyper parameters 199 | np.random.seed(1337) 200 | nb_classes = 10 201 | batch_size = 128 202 | nb_epochs = 200 203 | 204 | Y_train = np_utils.to_categorical(y_train, nb_classes) 205 | 206 | print (Y_train.shape) 207 | 208 | print (y_train[0]) 209 | print (Y_train[0]) 210 | 211 | 212 | #Deep Layer Model building in Keras 213 | 214 | model = Sequential() 215 | 216 | model.add(Dense(100,input_shape= (64,))) 217 | model.add(Activation('relu')) 218 | model.add(Dropout(0.5)) 219 | 220 | model.add(Dense(50)) 221 | model.add(Activation('relu')) 222 | model.add(Dropout(0.5)) 223 | 224 | model.add(Dense(nb_classes)) 225 | model.add(Activation('softmax')) 226 | 227 | model.compile(loss='categorical_crossentropy', optimizer='adam') 228 | 229 | 230 | # Model Training 231 | model.fit(x_train, Y_train, batch_size=batch_size, nb_epoch=nb_epochs,verbose=1) 232 | 233 | 234 | #Model Prediction 235 | y_train_predclass = model.predict_classes(x_train,batch_size=batch_size) 236 | y_test_predclass = model.predict_classes(x_test,batch_size=batch_size) 237 | 238 | print ("\n\nDeep Neural Network - Train accuracy:"),(round(accuracy_score(y_train,y_train_predclass),3)) 239 | 240 | print ("\nDeep Neural Network - Train Classification Report") 241 | print (classification_report(y_train,y_train_predclass)) 242 | 243 | print ("\nDeep Neural Network - Train Confusion Matrix\n") 244 | print (pd.crosstab(y_train,y_train_predclass,rownames = ["Actuall"],colnames = ["Predicted"]) ) 245 | 246 | 247 | print ("\nDeep Neural Network - Test accuracy:"),(round(accuracy_score(y_test,y_test_predclass),3)) 248 | 249 | print ("\nDeep Neural Network - Test Classification Report") 250 | print (classification_report(y_test,y_test_predclass)) 251 | 252 | print ("\nDeep Neural Network - Test Confusion Matrix\n") 253 | print (pd.crosstab(y_test,y_test_predclass,rownames = ["Actuall"],colnames = ["Predicted"]) ) 254 | 255 | 256 | 257 | 258 | 259 | -------------------------------------------------------------------------------- /Chapter07/Chapter 07_Recomm_Engine.R: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | rm(list = ls()) 4 | 5 | # First change the following directory link to where all the input files do exist 6 | setwd("D:\\Book writing\\Codes\\Chapter 7\\ml-latest-small\\ml-latest-small") 7 | 8 | ratings = read.csv("ratings.csv") 9 | movies = read.csv("movies.csv") 10 | 11 | ratings = ratings[,!names(ratings) %in% c("timestamp")] 12 | 13 | library(reshape2) 14 | 15 | # Creating Pivot table 16 | ratings_mat = acast(ratings,userId~movieId) 17 | ratings_mat[is.na(ratings_mat)] =0 18 | 19 | 20 | # Content based filtering 21 | library(lsa) 22 | 23 | a = c(2, 1, 0, 2, 0, 1, 1, 1) 24 | b = c(2, 1, 1, 1, 1, 0, 1, 1) 25 | 26 | print (paste("Cosine similarity between A and B is", round(cosine(a,b),4))) 27 | 28 | 29 | m = nrow(ratings_mat);n = ncol(ratings_mat) 30 | 31 | 32 | # User similarity matrix 33 | mat_users = matrix(nrow = m, ncol = m) 34 | 35 | for (i in 1:m){ 36 | for (j in 1:m){ 37 | if (i != j){ 38 | mat_users[i,j] = cosine(ratings_mat[i,],ratings_mat[j,]) 39 | } 40 | else { 41 | mat_users[i,j] = 0.0 42 | } 43 | } 44 | } 45 | 46 | 47 | colnames(mat_users) = rownames(ratings_mat); rownames(mat_users) = rownames(ratings_mat) 48 | df_users = as.data.frame(mat_users) 49 | 50 | 51 | 52 | # Finiding similar users 53 | topn_simusers <- function(uid=16,n=5){ 54 | sorted_df = sort(df_users[uid,],decreasing = TRUE)[1:n] 55 | print(paste("Similar users as user:",uid)) 56 | return(sorted_df) 57 | } 58 | 59 | print(topn_simusers(uid = 17,n=10)) 60 | 61 | 62 | # Finiding most rated movies of a user 63 | library(sqldf) 64 | 65 | ratings_withmovie = sqldf(" select a.*,b.title from 66 | ratings as a left join movies as b 67 | on a.movieId = b.movieId") 68 | 69 | 70 | # Finding most rated movies of a user 71 | topn_movieratings <- function(uid=355,n_ratings=10){ 72 | uid_ratings = ratings_withmovie[ratings_withmovie$userId==uid,] 73 | sorted_uidrtng = uid_ratings[order(-uid_ratings$rating),] 74 | return(head(sorted_uidrtng,n_ratings)) 75 | } 76 | 77 | print( topn_movieratings(uid = 596,n=10)) 78 | 79 | 80 | 81 | # Movies similarity matrix 82 | mat_movies = matrix(nrow = n, ncol = n) 83 | 84 | for (i in 1:n){ 85 | for (j in 1:n){ 86 | if (i != j){ 87 | mat_movies[i,j] = cosine(ratings_mat[,i],ratings_mat[,j]) 88 | } 89 | else { 90 | mat_movies[i,j] = 0.0 91 | } 92 | } 93 | } 94 | 95 | colnames(mat_movies) = colnames(ratings_mat); rownames(mat_movies) = colnames(ratings_mat) 96 | df_movies = as.data.frame(mat_movies) 97 | 98 | write.csv(df_movies,"df_movies.csv") 99 | 100 | df_movies = read.csv("df_movies.csv") 101 | rownames(df_movies) = df_movies$X 102 | colnames(df_movies) = c("aaa",df_movies$X) 103 | df_movies = subset(df_movies, select=-c(aaa)) 104 | 105 | 106 | 107 | # Finiding similar movies 108 | topn_simovies <- function(mid=588,n_movies=5){ 109 | sorted_df = sort(df_movies[mid,],decreasing = TRUE)[1:n_movies] 110 | sorted_df_t = as.data.frame(t(sorted_df)) 111 | colnames(sorted_df_t) = c("score") 112 | sorted_df_t$movieId = rownames(sorted_df_t) 113 | 114 | print(paste("Similar",n_movies, "movies as compared to the movie",mid,"are :")) 115 | sorted_df_t_wmovie = sqldf(" select a.*,b.title from sorted_df_t as a left join movies as b 116 | on a.movieId = b.movieId") 117 | return(sorted_df_t_wmovie) 118 | } 119 | 120 | print(topn_simovies(mid = 589,n_movies=15)) 121 | 122 | 123 | 124 | 125 | # Collaborative filtering 126 | ratings = read.csv("ratings.csv") 127 | movies = read.csv("movies.csv") 128 | 129 | 
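# The collaborative-filtering block below re-reads the raw ratings, drops the timestamp
# column, casts the data into a user x movie "realRatingMatrix", and fits two
# recommenderlab models - a user-based CF model (UBCF, cosine similarity on Z-score
# normalised ratings, 5 nearest neighbours) and a popularity baseline - before
# predicting ratings for every user.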
library(sqldf) 130 | library(reshape2) 131 | library(recommenderlab) 132 | 133 | ratings_v2 = ratings[,-c(4)] 134 | 135 | ratings_mat = acast(ratings_v2,userId~movieId) 136 | ratings_mat2 = as(ratings_mat, "realRatingMatrix") 137 | 138 | getRatingMatrix(ratings_mat2) 139 | 140 | #Plotting user-item complete matrix 141 | image(ratings_mat2, main = "Raw Ratings") 142 | 143 | # Fitting ALS method on Data 144 | rec=Recommender(ratings_mat2[1:nrow(ratings_mat2)],method="UBCF", param=list(normalize = "Z-score",method="Cosine",nn=5, minRating=1)) 145 | rec_2=Recommender(ratings_mat2[1:nrow(ratings_mat2)],method="POPULAR") 146 | 147 | print(rec) 148 | print(rec_2) 149 | 150 | names(getModel(rec)) 151 | getModel(rec)$nn 152 | 153 | 154 | # Create predictions for all the users 155 | recom_pred = predict(rec,ratings_mat2[1:nrow(ratings_mat2)],type="ratings") 156 | 157 | # Putting predicitons into list 158 | rec_list<-as(recom_pred,"list") 159 | head(summary(rec_list)) 160 | 161 | print_recommendations <- function(uid=586,top_nmovies=10){ 162 | recoms_list = rec_list[[uid]] 163 | sorted_df = as.data.frame(sort(recoms_list,decreasing = TRUE)[1:top_nmovies]) 164 | colnames(sorted_df) = c("score") 165 | 166 | sorted_df$movieId = rownames(sorted_df) 167 | print(paste("Movies recommended for the user",uid,"are follows:")) 168 | sorted_df_t_wmovie = sqldf(" select a.*,b.title from sorted_df as a left join movies as b 169 | on a.movieId = b.movieId") 170 | 171 | return(sorted_df_t_wmovie) 172 | } 173 | 174 | print(print_recommendations(uid = 580,top_nmovies = 15)) 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | -------------------------------------------------------------------------------- /Chapter07/Chapter 07_Recomm_Engine.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import os 4 | """ First change the following directory link to where all input files do exist """ 5 | os.chdir("D:\\Book writing\\Codes\\Chapter 7\\ml-latest-small\\ml-latest-small") 6 | 7 | 8 | import pandas as pd 9 | import numpy as np 10 | import matplotlib.pyplot as plt 11 | 12 | 13 | ratings = pd.read_csv("ratings.csv") 14 | print (ratings.head()) 15 | 16 | movies = pd.read_csv("movies.csv") 17 | print (movies.head()) 18 | 19 | 20 | #Combining movie ratings & movie names 21 | ratings = pd.merge(ratings[['userId','movieId','rating']],movies[['movieId','title']], 22 | how='left',left_on ='movieId' ,right_on = 'movieId') 23 | 24 | 25 | rp = ratings.pivot_table(columns = ['movieId'],index = ['userId'],values = 'rating') 26 | rp = rp.fillna(0) 27 | 28 | # Converting pandas dataframe to numpy for faster execution in loops etc. 29 | rp_mat = rp.as_matrix() 30 | 31 | 32 | from scipy.spatial.distance import cosine 33 | 34 | 35 | #The cosine of the angle between them is about 0.822. 36 | a= np.asarray( [2, 1, 0, 2, 0, 1, 1, 1]) 37 | b = np.asarray( [2, 1, 1, 1, 1, 0, 1, 1]) 38 | 39 | print("\n\n") 40 | print ("Cosine similarity between A and B is",round(1-cosine(a,b),4)) 41 | 42 | 43 | m, n = rp.shape 44 | 45 | # User similarity matrix 46 | mat_users = np.zeros((m, m)) 47 | for i in range(m): 48 | for j in range(m): 49 | if i != j: 50 | mat_users[i][j] = (1- cosine(rp_mat[i,:], rp_mat[j,:])) 51 | else: 52 | mat_users[i][j] = 0. 
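# Optional vectorised alternative (a sketch only, assuming scikit-learn is installed; it is
# not part of the original flow): the nested loop above issues O(m^2) scipy cosine() calls,
# whereas the same user-user similarity matrix can be built in a single call.
# from sklearn.metrics.pairwise import cosine_similarity
# mat_users = cosine_similarity(rp_mat)      # m x m pairwise cosine similarities
# np.fill_diagonal(mat_users, 0.0)           # zero the diagonal, matching the loop above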
53 | 54 | pd_users = pd.DataFrame(mat_users,index =rp.index ,columns= rp.index ) 55 | 56 | 57 | # Finding similar users 58 | def topn_simusers(uid = 16,n=5): 59 | users = pd_users.loc[uid,:].sort_values(ascending = False) 60 | topn_users = users.iloc[:n,] 61 | topn_users = topn_users.rename('score') 62 | print ("Similar users as user:",uid) 63 | return pd.DataFrame(topn_users) 64 | 65 | print (topn_simusers(uid=17,n=10)) 66 | 67 | 68 | # Finding most rated movies of a user 69 | def topn_movieratings(uid = 355,n_ratings=10): 70 | uid_ratings = ratings.loc[ratings['userId']==uid] 71 | uid_ratings = uid_ratings.sort_values(by='rating',ascending = [False]) 72 | print ("Top",n_ratings ,"movie ratings of user:",uid) 73 | return uid_ratings.iloc[:n_ratings,] 74 | 75 | print (topn_movieratings(uid=596,n_ratings=10)) 76 | 77 | 78 | # Movie similarity matrix 79 | import time 80 | start_time = time.time() 81 | mat_movies = np.zeros((n, n)) 82 | 83 | for i in range(n): 84 | for j in range(n): 85 | if i!=j: 86 | mat_movies[i,j] = (1- cosine(rp_mat[:,i], rp_mat[:,j])) 87 | else: 88 | mat_movies[i,j] = 0. 89 | print("--- %s seconds ---" % (time.time() - start_time)) 90 | 91 | 92 | pd_movies = pd.DataFrame(mat_movies,index =rp.columns ,columns= rp.columns ) 93 | 94 | 95 | #pd_movies.to_csv('pd_movies.csv',sep=',') 96 | pd_movies = pd.read_csv("pd_movies.csv",index_col='movieId') 97 | 98 | 99 | # Finding similar movies 100 | def topn_simovies(mid = 588,n=15): 101 | mid_ratings = pd_movies.loc[mid,:].sort_values(ascending = False) 102 | topn_movies = pd.DataFrame(mid_ratings.iloc[:n,]) 103 | topn_movies['index1'] = topn_movies.index 104 | topn_movies['index1'] = topn_movies['index1'].astype('int64') 105 | topn_movies = pd.merge(topn_movies,movies[['movieId','title']],how = 'left',left_on ='index1' ,right_on = 'movieId') 106 | print ("Movies similar to movie id:",mid,",",movies['title'][movies['movieId']==mid].to_string(index=False),",are") 107 | del topn_movies['index1'] 108 | return topn_movies 109 | 110 | 111 | print (topn_simovies(mid=589,n=15)) 112 | 113 | 114 | 115 | 116 | #Collaborative filtering 117 | 118 | import os 119 | """ First change the following directory link to where all input files do exist """ 120 | os.chdir("D:\\Book writing\\Codes\\Chapter 7\\ml-latest-small\\ml-latest-small") 121 | 122 | 123 | import pandas as pd 124 | import numpy as np 125 | import matplotlib.pyplot as plt 126 | 127 | 128 | ratings = pd.read_csv("ratings.csv") 129 | print (ratings.head()) 130 | 131 | movies = pd.read_csv("movies.csv") 132 | print (movies.head()) 133 | 134 | rp = ratings.pivot_table(columns = ['movieId'],index = ['userId'],values = 'rating') 135 | rp = rp.fillna(0) 136 | 137 | A = rp.values 138 | 139 | print ("\nShape of Original Sparse Matrix",A.shape) 140 | 141 | 142 | W = A>0.5 143 | W[W==True]=1 144 | W[W==False]=0 145 | W = W.astype(np.float64,copy=False) 146 | 147 | 148 | W_pred = A<0.5 149 | W_pred[W_pred==True]=1 150 | W_pred[W_pred==False]=0 151 | W_pred = W_pred.astype(np.float64,copy=False) 152 | np.fill_diagonal(W_pred,val=0) 153 | 154 | 155 | 156 | # Parameters 157 | m,n = A.shape 158 | 159 | n_iterations = 200 160 | n_factors = 100 161 | lmbda = 0.1 162 | 163 | X = 5 * np.random.rand(m,n_factors) 164 | Y = 5* np.random.rand(n_factors,n) 165 | 166 | def get_error(A, X, Y, W): 167 | return np.sqrt(np.sum((W * (A - np.dot(X, Y)))**2)/np.sum(W)) 168 | 169 | errors = [] 170 | for itr in range(n_iterations): 171 | X = np.linalg.solve(np.dot(Y,Y.T)+ lmbda * np.eye(n_factors),np.dot(Y,A.T)).T 
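# Alternating least squares: with the item factors Y held fixed, the update above solves the
# regularised normal equations (Y Y^T + lambda*I) x_u = Y a_u for every user row a_u in one
# np.linalg.solve call; the symmetric update for Y, with X held fixed, follows next.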
172 | Y = np.linalg.solve(np.dot(X.T,X)+ lmbda * np.eye(n_factors),np.dot(X.T,A)) 173 | 174 | if itr%10 == 0: 175 | print(itr," iterations completed","RMSError value is:",get_error(A,X,Y,W)) 176 | 177 | errors.append(get_error(A,X,Y,W)) 178 | 179 | A_hat = np.dot(X,Y) 180 | print ("RMSError of rated movies: ",get_error(A,X,Y,W)) 181 | 182 | 183 | plt.plot(errors); 184 | plt.ylim([0, 3.5]); 185 | plt.xlabel("Number of Iterations");plt.ylabel("RMSE") 186 | #plt.title("No.of Iterations vs. RMSE") 187 | plt.show() 188 | 189 | 190 | 191 | def print_recommovies(uid=315,n_movies=15,pred_mat = A_hat,wpred_mat = W_pred ): 192 | pred_recos = pred_mat*wpred_mat 193 | pd_predrecos = pd.DataFrame(pred_recos,index =rp.index ,columns= rp.columns ) 194 | pred_ratings = pd_predrecos.loc[uid,:].sort_values(ascending = False) 195 | pred_topratings = pred_ratings[:n_movies,] 196 | pred_topratings = pred_topratings.rename('pred_ratings') 197 | pred_topratings = pd.DataFrame(pred_topratings) 198 | pred_topratings['index1'] = pred_topratings.index 199 | pred_topratings['index1'] = pred_topratings['index1'].astype('int64') 200 | pred_topratings = pd.merge(pred_topratings,movies[['movieId','title']],how = 'left',left_on ='index1' ,right_on = 'movieId') 201 | del pred_topratings['index1'] 202 | print ("\nTop",n_movies,"movies predicted for the user:",uid," based on collaborative filtering\n") 203 | return pred_topratings 204 | 205 | 206 | predmtrx = print_recommovies(uid=355,n_movies=10,pred_mat=A_hat,wpred_mat=W_pred) 207 | print (predmtrx) 208 | 209 | 210 | 211 | 212 | # Grid Search on Collaborative Filtering 213 | def get_error(A, X, Y, W): 214 | return np.sqrt(np.sum((W * (A - np.dot(X, Y)))**2)/np.sum(W)) 215 | 216 | niters = [20,50,100,200] 217 | factors = [30,50,70,100] 218 | lambdas = [0.001,0.01,0.05,0.1] 219 | 220 | init_error = float("inf") 221 | 222 | 223 | print("\n\nGrid Search results of ALS Matrix Factorization:\n") 224 | for niter in niters: 225 | for facts in factors: 226 | for lmbd in lambdas: 227 | 228 | X = 5 * np.random.rand(m,facts) 229 | Y = 5* np.random.rand(facts,n) 230 | 231 | for itr in range(niter): 232 | X = np.linalg.solve(np.dot(Y,Y.T)+ lmbd * np.eye(facts),np.dot(Y,A.T)).T 233 | Y = np.linalg.solve(np.dot(X.T,X)+ lmbd * np.eye(facts),np.dot(X.T,A)) 234 | 235 | error = get_error(A,X,Y,W) 236 | 237 | if error best_score: 13 | best, best_score = x, x_score 14 | return best 15 | 16 | def vector_add(a, b): 17 | return tuple(map(operator.add, a, b)) 18 | 19 | 20 | orientations = [(1,0), (0, 1), (-1, 0), (0, -1)] 21 | 22 | def turn_right(orientation): 23 | return orientations[orientations.index(orientation)-1] 24 | 25 | def turn_left(orientation): 26 | return orientations[(orientations.index(orientation)+1) % len(orientations)] 27 | 28 | def isnumber(x): 29 | return hasattr(x, '__int__') 30 | 31 | 32 | 33 | """A Markov Decision Process, defined by an init_pos_posial state, transition model, 34 | and reward function. 
""" 35 | 36 | class MDP: 37 | 38 | def __init__(self, init_pos, actlist, terminals, transitions={}, states=None, gamma=0.99): 39 | if not (0 < gamma <= 1): 40 | raise ValueError("MDP should have 0 < gamma <= 1 values") 41 | 42 | if states: 43 | self.states = states 44 | else: 45 | self.states = set() 46 | self.init_pos = init_pos 47 | self.actlist = actlist 48 | self.terminals = terminals 49 | self.transitions = transitions 50 | self.gamma = gamma 51 | self.reward = {} 52 | 53 | """Returns a numeric reward for the state.""" 54 | def R(self, state): 55 | return self.reward[state] 56 | 57 | """Transition model. From a state and an action, return a list of (probability, result-state) pairs""" 58 | def T(self, state, action): 59 | if(self.transitions == {}): 60 | raise ValueError("Transition model is missing") 61 | else: 62 | return self.transitions[state][action] 63 | 64 | """Set of actions that can be performed for a particular state""" 65 | def actions(self, state): 66 | if state in self.terminals: 67 | return [None] 68 | else: 69 | return self.actlist 70 | 71 | 72 | 73 | """A two-dimensional grid MDP""" 74 | class GridMDP(MDP): 75 | 76 | def __init__(self, grid, terminals, init_pos=(0, 0), gamma=0.99): 77 | 78 | """ because we want row 0 on bottom, not on top """ 79 | grid.reverse() 80 | 81 | MDP.__init__(self, init_pos, actlist=orientations, 82 | terminals=terminals, gamma=gamma) 83 | self.grid = grid 84 | self.rows = len(grid) 85 | self.cols = len(grid[0]) 86 | for x in range(self.cols): 87 | for y in range(self.rows): 88 | self.reward[x, y] = grid[y][x] 89 | if grid[y][x] is not None: 90 | self.states.add((x, y)) 91 | 92 | def T(self, state, action): 93 | if action is None: 94 | return [(0.0, state)] 95 | else: 96 | return [(0.8, self.go(state, action)), 97 | (0.1, self.go(state, turn_right(action))), 98 | (0.1, self.go(state, turn_left(action)))] 99 | 100 | """Return the state that results from going in this direction.""" 101 | def go(self, state, direction): 102 | state1 = vector_add(state, direction) 103 | return state1 if state1 in self.states else state 104 | 105 | """Convert a mapping from (x, y) to v into a [[..., v, ...]] grid.""" 106 | def to_grid(self, mapping): 107 | return list(reversed([[mapping.get((x, y), None) 108 | for x in range(self.cols)] 109 | for y in range(self.rows)])) 110 | 111 | """Convert a mapping from (x, y) to v into a [[..., v, ...]] grid.""" 112 | def to_arrows(self, policy): 113 | chars = { 114 | (1, 0): '>', (0, 1): '^', (-1, 0): '<', (0, -1): 'v', None: '.'} 115 | return self.to_grid({s: chars[a] for (s, a) in policy.items()}) 116 | 117 | """Solving an MDP by value iteration and returns the optimum state values """ 118 | def value_iteration(mdp, epsilon=0.001): 119 | STSN = {s: 0 for s in mdp.states} 120 | R, T, gamma = mdp.R, mdp.T, mdp.gamma 121 | while True: 122 | STS = STSN.copy() 123 | delta = 0 124 | for s in mdp.states: 125 | STSN[s] = R(s) + gamma * max([sum([p * STS[s1] for (p, s1) in T(s, a)]) 126 | for a in mdp.actions(s)]) 127 | delta = max(delta, abs(STSN[s] - STS[s])) 128 | if delta < epsilon * (1 - gamma) / gamma: 129 | return STS 130 | 131 | """Given an MDP and a utility function STS, determine the best policy, 132 | as a mapping from state to action """ 133 | def best_policy(mdp, STS): 134 | pi = {} 135 | for s in mdp.states: 136 | pi[s] = argmax(mdp.actions(s), lambda a: expected_utility(a, s, STS, mdp)) 137 | return pi 138 | 139 | """The expected utility of doing a in state s, according to the MDP and STS.""" 140 | def 
expected_utility(a, s, STS, mdp): 141 | return sum([p * STS[s1] for (p, s1) in mdp.T(s, a)]) 142 | 143 | """Solve an MDP by policy iteration""" 144 | def policy_iteration(mdp): 145 | STS = {s: 0 for s in mdp.states} 146 | pi = {s: random.choice(mdp.actions(s)) for s in mdp.states} 147 | while True: 148 | STS = policy_evaluation(pi, STS, mdp) 149 | unchanged = True 150 | for s in mdp.states: 151 | a = argmax(mdp.actions(s),lambda a: expected_utility(a, s, STS, mdp)) 152 | if a != pi[s]: 153 | pi[s] = a 154 | unchanged = False 155 | if unchanged: 156 | return pi 157 | 158 | """Return an updated utility mapping U from each state in the MDP to its 159 | utility, using an approximation (modified policy iteration)""" 160 | def policy_evaluation(pi, STS, mdp, k=20): 161 | R, T, gamma = mdp.R, mdp.T, mdp.gamma 162 | for i in range(k): 163 | for s in mdp.states: 164 | STS[s] = R(s) + gamma * sum([p * STS[s1] for (p, s1) in T(s, pi[s])]) 165 | return STS 166 | 167 | 168 | def print_table(table, header=None, sep=' ', numfmt='{}'): 169 | justs = ['rjust' if isnumber(x) else 'ljust' for x in table[0]] 170 | if header: 171 | table.insert(0, header) 172 | table = [[numfmt.format(x) if isnumber(x) else x for x in row] 173 | for row in table] 174 | sizes = list( 175 | map(lambda seq: max(map(len, seq)), 176 | list(zip(*[map(str, row) for row in table])))) 177 | for row in table: 178 | print(sep.join(getattr( 179 | str(x), j)(size) for (j, size, x) in zip(justs, sizes, row))) 180 | 181 | 182 | 183 | """ A 4x3 grid environment that presents the agent with a sequential decision problem""" 184 | sequential_decision_environment = GridMDP([[-0.02, -0.02, -0.02, +1], 185 | [-0.02, None, -0.02, -1], 186 | [-0.02, -0.02, -0.02, -0.02]], 187 | terminals=[(3, 2), (3, 1)]) 188 | 189 | # Value Iteration 190 | value_iter = best_policy(sequential_decision_environment, value_iteration(sequential_decision_environment, .01)) 191 | print("\n Optimal Policy based on Value Iteration\n") 192 | print_table(sequential_decision_environment.to_arrows(value_iter)) 193 | 194 | 195 | #Policy Iteration 196 | policy_iter = policy_iteration(sequential_decision_environment) 197 | print("\n Optimal Policy based on Policy Iteration & Evaluation\n") 198 | print_table(sequential_decision_environment.to_arrows(policy_iter)) 199 | 200 | 201 | 202 | 203 | # Monte Carlo Methods 204 | 205 | from __future__ import print_function 206 | import numpy as np 207 | import matplotlib.pyplot as plt 208 | from mpl_toolkits.mplot3d import Axes3D 209 | 210 | 211 | #actions: hit or stand 212 | ACTION_HIT = 0 213 | ACTION_STAND = 1 214 | actions = [ACTION_HIT, ACTION_STAND] 215 | 216 | 217 | 218 | "policy for player" 219 | policyPlayer = np.zeros(22) 220 | 221 | for i in range(12, 20): 222 | policyPlayer[i] = ACTION_HIT 223 | 224 | policyPlayer[20] = ACTION_STAND 225 | policyPlayer[21] = ACTION_STAND 226 | 227 | "function form of target policy of player" 228 | def targetPolicyPlayer(usableAcePlayer, playerSum, dealerCard): 229 | return policyPlayer[playerSum] 230 | 231 | "function form of behavior policy of player" 232 | def behaviorPolicyPlayer(usableAcePlayer, playerSum, dealerCard): 233 | if np.random.binomial(1, 0.5) == 1: 234 | return ACTION_STAND 235 | return ACTION_HIT 236 | 237 | "policy for dealer" 238 | policyDealer = np.zeros(22) 239 | for i in range(12, 17): 240 | policyDealer[i] = ACTION_HIT 241 | for i in range(17, 22): 242 | policyDealer[i] = ACTION_STAND 243 | 244 | "get a new card" 245 | def getCard(): 246 | card = np.random.randint(1, 14) 247 
| card = min(card, 10) 248 | return card 249 | 250 | # play a game 251 | 252 | def play(policyPlayerFn, initialState=None, initialAction=None): 253 | # player status 254 | 255 | # sum of player 256 | playerSum = 0 257 | 258 | # trajectory of player 259 | playerTrajectory = [] 260 | 261 | # whether player uses Ace as 11 262 | usableAcePlayer = False 263 | 264 | # dealer status 265 | dealerCard1 = 0 266 | dealerCard2 = 0 267 | usableAceDealer = False 268 | 269 | if initialState is None: 270 | # generate a random initial state 271 | 272 | numOfAce = 0 273 | 274 | # initialize cards of player 275 | while playerSum < 12: 276 | # if sum of player is less than 12, always hit 277 | card = getCard() 278 | 279 | # if get an Ace, use it as 11 280 | if card == 1: 281 | numOfAce += 1 282 | card = 11 283 | usableAcePlayer = True 284 | playerSum += card 285 | 286 | # if player's sum is larger than 21, he must hold at least one Ace, two Aces are possible 287 | if playerSum > 21: 288 | # use the Ace as 1 rather than 11 289 | playerSum -= 10 290 | 291 | # if the player only has one Ace, then he doesn't have usable Ace any more 292 | if numOfAce == 1: 293 | usableAcePlayer = False 294 | 295 | # initialize cards of dealer, suppose dealer will show the first card he gets 296 | dealerCard1 = getCard() 297 | dealerCard2 = getCard() 298 | 299 | else: 300 | # use specified initial state 301 | usableAcePlayer = initialState[0] 302 | playerSum = initialState[1] 303 | dealerCard1 = initialState[2] 304 | dealerCard2 = getCard() 305 | 306 | # initial state of the game 307 | state = [usableAcePlayer, playerSum, dealerCard1] 308 | 309 | # initialize dealer's sum 310 | dealerSum = 0 311 | if dealerCard1 == 1 and dealerCard2 != 1: 312 | dealerSum += 11 + dealerCard2 313 | usableAceDealer = True 314 | elif dealerCard1 != 1 and dealerCard2 == 1: 315 | dealerSum += dealerCard1 + 11 316 | usableAceDealer = True 317 | elif dealerCard1 == 1 and dealerCard2 == 1: 318 | dealerSum += 1 + 11 319 | usableAceDealer = True 320 | else: 321 | dealerSum += dealerCard1 + dealerCard2 322 | 323 | # game starts! 
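# The episode runs in two phases: the player draws cards according to policyPlayerFn,
# recording (action, state) pairs in playerTrajectory, then the dealer plays the fixed
# dealer policy. play() returns (initial state, reward in {-1, 0, +1}, playerTrajectory).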
324 | 325 | # player's turn 326 | while True: 327 | if initialAction is not None: 328 | action = initialAction 329 | initialAction = None 330 | else: 331 | # get action based on current sum 332 | action = policyPlayerFn(usableAcePlayer, playerSum, dealerCard1) 333 | 334 | # track player's trajectory for importance sampling 335 | playerTrajectory.append([action, (usableAcePlayer, playerSum, dealerCard1)]) 336 | 337 | if action == ACTION_STAND: 338 | break 339 | # if hit, get new card 340 | playerSum += getCard() 341 | 342 | # player busts 343 | if playerSum > 21: 344 | # if player has a usable Ace, use it as 1 to avoid busting and continue 345 | if usableAcePlayer == True: 346 | playerSum -= 10 347 | usableAcePlayer = False 348 | else: 349 | # otherwise player loses 350 | return state, -1, playerTrajectory 351 | 352 | # dealer's turn 353 | while True: 354 | # get action based on current sum 355 | action = policyDealer[dealerSum] 356 | if action == ACTION_STAND: 357 | break 358 | # if hit, get a new card 359 | dealerSum += getCard() 360 | # dealer busts 361 | if dealerSum > 21: 362 | if usableAceDealer == True: 363 | # if dealer has a usable Ace, use it as 1 to avoid busting and continue 364 | dealerSum -= 10 365 | usableAceDealer = False 366 | else: 367 | # otherwise dealer loses 368 | return state, 1, playerTrajectory 369 | 370 | # compare the sum between player and dealer 371 | if playerSum > dealerSum: 372 | return state, 1, playerTrajectory 373 | elif playerSum == dealerSum: 374 | return state, 0, playerTrajectory 375 | else: 376 | return state, -1, playerTrajectory 377 | 378 | # Monte Carlo Sample with On-Policy 379 | def monteCarloOnPolicy(nEpisodes): 380 | statesUsableAce = np.zeros((10, 10)) 381 | # initialze counts to 1 to avoid 0 being divided 382 | statesUsableAceCount = np.ones((10, 10)) 383 | statesNoUsableAce = np.zeros((10, 10)) 384 | # initialze counts to 1 to avoid 0 being divided 385 | statesNoUsableAceCount = np.ones((10, 10)) 386 | for i in range(0, nEpisodes): 387 | state, reward, _ = play(targetPolicyPlayer) 388 | state[1] -= 12 389 | state[2] -= 1 390 | if state[0]: 391 | statesUsableAceCount[state[1], state[2]] += 1 392 | statesUsableAce[state[1], state[2]] += reward 393 | else: 394 | statesNoUsableAceCount[state[1], state[2]] += 1 395 | statesNoUsableAce[state[1], state[2]] += reward 396 | return statesUsableAce / statesUsableAceCount, statesNoUsableAce / statesNoUsableAceCount 397 | 398 | # Monte Carlo with Exploring Starts 399 | def monteCarloES(nEpisodes): 400 | # (playerSum, dealerCard, usableAce, action) 401 | stateActionValues = np.zeros((10, 10, 2, 2)) 402 | # initialze counts to 1 to avoid division by 0 403 | stateActionPairCount = np.ones((10, 10, 2, 2)) 404 | 405 | # behavior policy is greedy 406 | def behaviorPolicy(usableAce, playerSum, dealerCard): 407 | usableAce = int(usableAce) 408 | playerSum -= 12 409 | dealerCard -= 1 410 | # get argmax of the average returns(s, a) 411 | return np.argmax(stateActionValues[playerSum, dealerCard, usableAce, :] 412 | / stateActionPairCount[playerSum, dealerCard, usableAce, :]) 413 | 414 | # play for several episodes 415 | for episode in range(nEpisodes): 416 | if episode % 1000 == 0: 417 | print('episode:', episode) 418 | # for each episode, use a randomly initialized state and action 419 | initialState = [bool(np.random.choice([0, 1])), 420 | np.random.choice(range(12, 22)), 421 | np.random.choice(range(1, 11))] 422 | initialAction = np.random.choice(actions) 423 | _, reward, trajectory = play(behaviorPolicy, 
initialState, initialAction) 424 | for action, (usableAce, playerSum, dealerCard) in trajectory: 425 | usableAce = int(usableAce) 426 | playerSum -= 12 427 | dealerCard -= 1 428 | # update values of state-action pairs 429 | stateActionValues[playerSum, dealerCard, usableAce, action] += reward 430 | stateActionPairCount[playerSum, dealerCard, usableAce, action] += 1 431 | 432 | return stateActionValues / stateActionPairCount 433 | 434 | 435 | # print the state value 436 | figureIndex = 0 437 | def prettyPrint(data, tile, zlabel='reward'): 438 | global figureIndex 439 | fig = plt.figure(figureIndex) 440 | figureIndex += 1 441 | fig.suptitle(tile) 442 | ax = fig.add_subplot(111, projection='3d') 443 | x_axis = [] 444 | y_axis = [] 445 | z_axis = [] 446 | for i in range(12, 22): 447 | for j in range(1, 11): 448 | x_axis.append(i) 449 | y_axis.append(j) 450 | z_axis.append(data[i - 12, j - 1]) 451 | ax.scatter(x_axis, y_axis, z_axis,c='red') 452 | ax.set_xlabel('player sum') 453 | ax.set_ylabel('dealer showing') 454 | ax.set_zlabel(zlabel) 455 | 456 | 457 | 458 | # On-Policy results 459 | def onPolicy(): 460 | statesUsableAce1, statesNoUsableAce1 = monteCarloOnPolicy(10000) 461 | statesUsableAce2, statesNoUsableAce2 = monteCarloOnPolicy(500000) 462 | prettyPrint(statesUsableAce1, 'Usable Ace & 10000 Episodes') 463 | prettyPrint(statesNoUsableAce1, 'No Usable Ace & 10000 Episodes') 464 | prettyPrint(statesUsableAce2, 'Usable Ace & 500000 Episodes') 465 | prettyPrint(statesNoUsableAce2, 'No Usable Ace & 500000 Episodes') 466 | plt.show() 467 | 468 | 469 | # Optimized or Monte Calro Control 470 | def MC_ES_optimalPolicy(): 471 | stateActionValues = monteCarloES(500000) 472 | stateValueUsableAce = np.zeros((10, 10)) 473 | stateValueNoUsableAce = np.zeros((10, 10)) 474 | # get the optimal policy 475 | actionUsableAce = np.zeros((10, 10), dtype='int') 476 | actionNoUsableAce = np.zeros((10, 10), dtype='int') 477 | for i in range(10): 478 | for j in range(10): 479 | stateValueNoUsableAce[i, j] = np.max(stateActionValues[i, j, 0, :]) 480 | stateValueUsableAce[i, j] = np.max(stateActionValues[i, j, 1, :]) 481 | actionNoUsableAce[i, j] = np.argmax(stateActionValues[i, j, 0, :]) 482 | actionUsableAce[i, j] = np.argmax(stateActionValues[i, j, 1, :]) 483 | prettyPrint(stateValueUsableAce, 'Optimal state value with usable Ace') 484 | prettyPrint(stateValueNoUsableAce, 'Optimal state value with no usable Ace') 485 | prettyPrint(actionUsableAce, 'Optimal policy with usable Ace', 'Action (0 Hit, 1 Stick)') 486 | prettyPrint(actionNoUsableAce, 'Optimal policy with no usable Ace', 'Action (0 Hit, 1 Stick)') 487 | plt.show() 488 | 489 | 490 | 491 | # Run on policy function 492 | onPolicy() 493 | 494 | # Run Monte Carlo Control or Explored starts 495 | MC_ES_optimalPolicy() 496 | 497 | 498 | 499 | 500 | 501 | 502 | 503 | 504 | 505 | 506 | 507 | # Cliff-Walking - TD Learning - SARSA & Q-Learning 508 | from __future__ import print_function 509 | import numpy as np 510 | import matplotlib.pyplot as plt 511 | 512 | # Grid dimensions 513 | GRID_HEIGHT = 4 514 | GRID_WIDTH = 12 515 | 516 | # probability for exploration, step size,gamma 517 | EPSILON = 0.1 518 | ALPHA = 0.5 519 | GAMMA = 1 520 | 521 | # all possible actions 522 | ACTION_UP = 0; ACTION_DOWN = 1;ACTION_LEFT = 2;ACTION_RIGHT = 3 523 | actions = [ACTION_UP, ACTION_DOWN, ACTION_LEFT, ACTION_RIGHT] 524 | 525 | # initial state action pair values 526 | stateActionValues = np.zeros((GRID_HEIGHT, GRID_WIDTH, 4)) 527 | startState = [3, 0] 528 | goalState = [3, 11] 
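# Classic cliff-walking grid: 4 rows x 12 columns, start at the bottom-left cell (3, 0) and
# goal at the bottom-right cell (3, 11). The bottom-edge cells between them form the cliff:
# actions that step into it, as encoded below, earn a reward of -100 and send the agent back
# to the start, while every other move costs -1.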
529 | 530 | # reward for each action in each state 531 | actionRewards = np.zeros((GRID_HEIGHT, GRID_WIDTH, 4)) 532 | actionRewards[:, :, :] = -1.0 533 | actionRewards[2, 1:11, ACTION_DOWN] = -100.0 534 | actionRewards[3, 0, ACTION_RIGHT] = -100.0 535 | 536 | # set up destinations for each action in each state 537 | actionDestination = [] 538 | for i in range(0, GRID_HEIGHT): 539 | actionDestination.append([]) 540 | for j in range(0, GRID_WIDTH): 541 | destinaion = dict() 542 | destinaion[ACTION_UP] = [max(i - 1, 0), j] 543 | destinaion[ACTION_LEFT] = [i, max(j - 1, 0)] 544 | destinaion[ACTION_RIGHT] = [i, min(j + 1, GRID_WIDTH - 1)] 545 | if i == 2 and 1 <= j <= 10: 546 | destinaion[ACTION_DOWN] = startState 547 | else: 548 | destinaion[ACTION_DOWN] = [min(i + 1, GRID_HEIGHT - 1), j] 549 | actionDestination[-1].append(destinaion) 550 | actionDestination[3][0][ACTION_RIGHT] = startState 551 | 552 | # choose an action based on epsilon greedy algorithm 553 | def chooseAction(state, stateActionValues): 554 | if np.random.binomial(1, EPSILON) == 1: 555 | return np.random.choice(actions) 556 | else: 557 | return np.argmax(stateActionValues[state[0], state[1], :]) 558 | 559 | 560 | # SARSA update 561 | 562 | def sarsa(stateActionValues, expected=False, stepSize=ALPHA): 563 | currentState = startState 564 | currentAction = chooseAction(currentState, stateActionValues) 565 | rewards = 0.0 566 | while currentState != goalState: 567 | newState = actionDestination[currentState[0]][currentState[1]][currentAction] 568 | newAction = chooseAction(newState, stateActionValues) 569 | reward = actionRewards[currentState[0], currentState[1], currentAction] 570 | rewards += reward 571 | if not expected: 572 | valueTarget = stateActionValues[newState[0], newState[1], newAction] 573 | else: 574 | valueTarget = 0.0 575 | actionValues = stateActionValues[newState[0], newState[1], :] 576 | bestActions = np.argwhere(actionValues == np.max(actionValues)) 577 | for action in actions: 578 | if action in bestActions: 579 | valueTarget += ((1.0 - EPSILON) / len(bestActions) + EPSILON / len(actions)) * stateActionValues[newState[0], newState[1], action] 580 | else: 581 | valueTarget += EPSILON / len(actions) * stateActionValues[newState[0], newState[1], action] 582 | valueTarget *= GAMMA 583 | stateActionValues[currentState[0], currentState[1], currentAction] += stepSize * (reward + 584 | valueTarget - stateActionValues[currentState[0], currentState[1], currentAction]) 585 | currentState = newState 586 | currentAction = newAction 587 | return rewards 588 | 589 | # Q-Learning update 590 | def qLearning(stateActionValues, stepSize=ALPHA): 591 | currentState = startState 592 | rewards = 0.0 593 | while currentState != goalState: 594 | currentAction = chooseAction(currentState, stateActionValues) 595 | reward = actionRewards[currentState[0], currentState[1], currentAction] 596 | rewards += reward 597 | newState = actionDestination[currentState[0]][currentState[1]][currentAction] 598 | stateActionValues[currentState[0], currentState[1], currentAction] += stepSize * ( 599 | reward + GAMMA * np.max(stateActionValues[newState[0], newState[1], :]) - 600 | stateActionValues[currentState[0], currentState[1], currentAction]) 601 | currentState = newState 602 | return rewards 603 | 604 | # print optimal policy 605 | def printOptimalPolicy(stateActionValues): 606 | optimalPolicy = [] 607 | for i in range(0, GRID_HEIGHT): 608 | optimalPolicy.append([]) 609 | for j in range(0, GRID_WIDTH): 610 | if [i, j] == goalState: 611 | 
optimalPolicy[-1].append('G') 612 | continue 613 | bestAction = np.argmax(stateActionValues[i, j, :]) 614 | if bestAction == ACTION_UP: 615 | optimalPolicy[-1].append('U') 616 | elif bestAction == ACTION_DOWN: 617 | optimalPolicy[-1].append('D') 618 | elif bestAction == ACTION_LEFT: 619 | optimalPolicy[-1].append('L') 620 | elif bestAction == ACTION_RIGHT: 621 | optimalPolicy[-1].append('R') 622 | for row in optimalPolicy: 623 | print(row) 624 | 625 | def SARSAnQLPlot(): 626 | # averaging the reward sums from 10 successive episodes 627 | averageRange = 10 628 | 629 | # episodes of each run 630 | nEpisodes = 500 631 | 632 | # perform 20 independent runs 633 | runs = 20 634 | 635 | rewardsSarsa = np.zeros(nEpisodes) 636 | rewardsQLearning = np.zeros(nEpisodes) 637 | for run in range(0, runs): 638 | stateActionValuesSarsa = np.copy(stateActionValues) 639 | stateActionValuesQLearning = np.copy(stateActionValues) 640 | for i in range(0, nEpisodes): 641 | # cut off the value by -100 to draw the figure more elegantly 642 | rewardsSarsa[i] += max(sarsa(stateActionValuesSarsa), -100) 643 | rewardsQLearning[i] += max(qLearning(stateActionValuesQLearning), -100) 644 | 645 | # averaging over independt runs 646 | rewardsSarsa /= runs 647 | rewardsQLearning /= runs 648 | 649 | # averaging over successive episodes 650 | smoothedRewardsSarsa = np.copy(rewardsSarsa) 651 | smoothedRewardsQLearning = np.copy(rewardsQLearning) 652 | for i in range(averageRange, nEpisodes): 653 | smoothedRewardsSarsa[i] = np.mean(rewardsSarsa[i - averageRange: i + 1]) 654 | smoothedRewardsQLearning[i] = np.mean(rewardsQLearning[i - averageRange: i + 1]) 655 | 656 | # display optimal policy 657 | print('Sarsa Optimal Policy:') 658 | printOptimalPolicy(stateActionValuesSarsa) 659 | print('Q-Learning Optimal Policy:') 660 | printOptimalPolicy(stateActionValuesQLearning) 661 | 662 | # draw reward curves 663 | plt.figure(1) 664 | plt.plot(smoothedRewardsSarsa, label='Sarsa') 665 | plt.plot(smoothedRewardsQLearning, label='Q-Learning') 666 | plt.xlabel('Episodes') 667 | plt.ylabel('Sum of rewards during episode') 668 | plt.legend() 669 | 670 | 671 | # Sum of Rewards for SARSA vs. QLearning 672 | SARSAnQLPlot() 673 | 674 | 675 | 676 | 677 | 678 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Packt 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Statistics for Machine Learning 4 | This is the code repository for [Statistics for Machine Learning](https://www.packtpub.com/big-data-and-business-intelligence/statistics-machine-learning?utm_source=github&utm_medium=repository&utm_campaign=9781788295758), published by [Packt](https://www.packtpub.com/?utm_source=github). It contains all the supporting project files necessary to work through the book from start to finish. 5 | ## About the Book 6 | Complex statistics in Machine Learning worry a lot of developers. Knowing statistics helps you build strong Machine Learning models that are optimized for a given problem statement. This book will teach you all it takes to perform complex statistical computations required for Machine Learning. You will gain information on statistics behind supervised learning, unsupervised learning, reinforcement learning, and more. You will see real-world examples that discuss the statistical side of Machine Learning and familiarize yourself with it. You will come across programs for performing tasks such as model, parameter fitting, regression, classification, density collection, working with vectors, matrices, and more. By the end of the book, you will have mastered the required statistics for Machine Learning and will be able to apply your new skills to any sort of industry problem. 7 | ## Instructions and Navigation 8 | All of the code is organized into folders. Each folder starts with a number followed by the application name. For example, Chapter02. 9 | 10 | 11 | 12 | The code will look like the following: 13 | ``` 14 | >>> import numpy as np 15 | >>> from scipy import stats 16 | >>> data = np.array([4,5,1,2,7,2,6,9,3]) 17 | # Calculate Mean 18 | >>> dt_mean = np.mean(data) ; print ("Mean :",round(dt_mean,2)) 19 | # Calculate Median 20 | >>> dt_median = np.median(data) ; print ("Median :",dt_median) 21 | # Calculate Mode 22 | >>> dt_mode = stats.mode(data); print ("Mode :",dt_mode[0][0]) 23 | ``` 24 | 25 | This book assumes that you know the basics of Python and R and how to install the 26 | libraries. It does not assume that you are already equipped with the knowledge of advanced 27 | statistics and mathematics, like linear algebra and so on. 
28 | The following versions of software are used throughout this book, but it should run fine 29 | with any more recent ones as well: 30 | * Anaconda 3–4.3.1 (all Python and its relevant packages are included in 31 | Anaconda, Python 3.6.1, NumPy 1.12.1, Pandas 0.19.2, and scikit-learn 0.18.1) 32 | * R 3.4.0 and RStudio 1.0.143 33 | * Theano 0.9.0 34 | * Keras 2.0.2 35 | 36 | ## Related Products 37 | * [Machine Learning for Developers](https://www.packtpub.com/big-data-and-business-intelligence/machine-learning-developers?utm_source=github&utm_medium=repository&utm_campaign=9781786469878) 38 | 39 | * [Scala for Machine Learning](https://www.packtpub.com/big-data-and-business-intelligence/scala-machine-learning?utm_source=github&utm_medium=repository&utm_campaign=9781783558742) 40 | 41 | * [Machine Learning for OpenCV](https://www.packtpub.com/big-data-and-business-intelligence/machine-learning-opencv?utm_source=github&utm_medium=repository&utm_campaign=9781783980284) 42 | ### Download a free PDF 43 | 44 | If you have already purchased a print or Kindle version of this book, you can get a DRM-free PDF version at no cost.
Simply click on the link to claim your free PDF.
45 |
https://packt.link/free-ebook/9781788295758
--------------------------------------------------------------------------------