├── Predict Customer Churn.ipynb
├── README.md
├── customer_churn_data.csv
├── final_model.model
└── predict_customer_churn.py


/README.md:
--------------------------------------------------------------------------------
1 | # predict-churn-py
2 | 
3 | This repository contains the source code and the raw data that I used to build the machine learning classifiers, provided for reference. Hope this helps!
4 | 


--------------------------------------------------------------------------------
/final_model.model:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/srees1988/predict-churn-py/8cfd95f64a2ff48edd21af48a706a23227e76616/final_model.model


--------------------------------------------------------------------------------
/predict_customer_churn.py:
--------------------------------------------------------------------------------
   1 | 
   2 | # -*- coding: utf-8 -*-
   3 | """
   4 | Created on Fri Oct 16 06:30:42 2020
   5 | 
   6 | @author: srees
   7 | 
   8 | Title: Predict Customer Churn using Supervised ML
   9 | """
  10 | 
  11 | 
  12 | 
  13 | #Step 0: Restarting the session and clearing all temporary variables----------------------------
  14 | 
  15 | 
  16 | try:
  17 |     from IPython import get_ipython
  18 |     get_ipython().magic('clear')
  19 |     get_ipython().magic('reset -f')
  20 | except:
  21 |     pass
  22 | 
  23 | 
  24 | #----------------------------------------------------------------------------------------
  25 | #-----------------Section A: Data Preprocessing------------------------------------------
  26 | #----------------------------------------------------------------------------------------
  27 | 
  28 | # Step 1: Import relevant libraries---------------------------------------------------------
  29 | 
  30 | #Standard libraries for data analysis:----------------------
  31 |     
  32 | import numpy as np
  33 | import matplotlib.pyplot as plt
  34 | import pandas as pd
  35 | from scipy.stats import norm, skew
  36 | from scipy import stats
  37 | import statsmodels.api as sm
  38 | 
  39 | 
  40 | # sklearn modules for data preprocessing-------------------------------------
  41 | 
  42 | from sklearn.impute import SimpleImputer
  43 | from sklearn.preprocessing import LabelEncoder, OneHotEncoder
  44 | from sklearn.compose import ColumnTransformer
  45 | from sklearn.preprocessing import OneHotEncoder
  46 | from sklearn.model_selection import train_test_split
  47 | from sklearn.preprocessing import StandardScaler 
  48 | 
  49 | 
  50 | #sklearn modules for Model Selection--------------------------------------
  51 | 
  52 | from sklearn import svm, tree, linear_model, neighbors
  53 | from sklearn import naive_bayes, ensemble, discriminant_analysis, gaussian_process
  54 | from sklearn.neighbors import KNeighborsClassifier
  55 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
  56 | from xgboost import XGBClassifier
  57 | 
  58 | from sklearn.linear_model import LogisticRegression
  59 | from sklearn.svm import SVC
  60 | from sklearn.neighbors import KNeighborsClassifier
  61 | from sklearn.naive_bayes import GaussianNB
  62 | from sklearn.tree import DecisionTreeClassifier
  63 | from sklearn.ensemble import RandomForestClassifier
  64 | 
  65 | 
  66 | #sklearn modules for Model Evaluation & Improvement---------------------------
  67 |     
  68 | from sklearn.metrics import confusion_matrix, accuracy_score 
  69 | from sklearn.metrics import f1_score, precision_score, recall_score, fbeta_score
  70 | from statsmodels.stats.outliers_influence import variance_inflation_factor
  71 | from sklearn.model_selection import cross_val_score
  72 | from sklearn.model_selection import GridSearchCV
  73 | 
  74 | from sklearn.model_selection import ShuffleSplit
  75 | from sklearn.model_selection import KFold
  76 | 
  77 | from sklearn import feature_selection
  78 | from sklearn import model_selection
  79 | from sklearn import metrics
  80 | 
  81 | from sklearn.metrics import classification_report, precision_recall_curve
  82 | from sklearn.metrics import auc, roc_auc_score, roc_curve
  83 | from sklearn.metrics import make_scorer, recall_score, log_loss
  84 | from sklearn.metrics import average_precision_score
  85 |   
  86 | 
  87 | #Standard libraries for data visualization---------------------
  88 | 
  89 | import seaborn as sn
  90 | from matplotlib import pyplot
  91 | import matplotlib.pyplot as plt
  92 | import matplotlib.pylab as pylab
  93 | import matplotlib 
  94 | %matplotlib inline
  95 | color = sn.color_palette()
  96 | import matplotlib.ticker as mtick
  97 | from IPython.display import display
  98 | pd.options.display.max_columns = None
  99 | from pandas.plotting import scatter_matrix
 100 | from sklearn.metrics import roc_curve
 101 | 
 102 | 
 103 | #Miscellaneous Utility Libraries--------------------------------------
 104 |     
 105 | import random
 106 | import os
 107 | import re
 108 | import sys
 109 | import timeit
 110 | import string
 111 | import time
 112 | from datetime import datetime
 113 | from time import time
 114 | from dateutil.parser import parse
 115 | import joblib
 116 | 
 117 | 
 118 | 
 119 | 
 120 | #Step 2: Set up current working directory------------------------------------------
 121 | 
 122 | os.chdir(r"C:/Users/srees/Desktop/Blogs and Projects/Upcoming Blogs and Projects/2. Propensity Scoring Models/3. Predict Customer Churn/")
 123 | 
 124 | 
 125 | # Step 3: Import the dataset--------------------------------------------------------
 126 | 
 127 | dataset = pd.read_csv('1.Input/customer_churn_data.csv')
 128 | 
 129 | 
 130 | # Step 4: Evaluate Datastructure --------------------------------------------------------
 131 | 
 132 | dataset.head()
 133 | 
 134 | dataset.columns
 135 | 
 136 | dataset.describe()
 137 | 
 138 | dataset.dtypes
 139 | 
 140 | 
 141 | 
 142 | #Recheck Column Datatypes and Missing Values:
 143 |     
 144 | dataset.columns.to_series().groupby(dataset.dtypes).groups
 145 | 
 146 | dataset.info()
 147 | 
 148 | dataset.isna().any()
 149 | 
 150 | #Unique values in each categorical variable:
 151 | 
 152 | dataset["PaymentMethod"].nunique()
 153 | 
 154 | dataset["PaymentMethod"].unique()
 155 | 
 156 | 
 157 | dataset["Contract"].nunique()
 158 | 
 159 | dataset["Contract"].unique()
 160 | 
 161 | #Step 5: Check Target Variable Distribution ----------------------------------------------- 
 162 | 
 163 | dataset["Churn"].value_counts()
 164 | 
 165 | # ======================================================================================================
 166 | #In this case, we have class imbalance with few negatives. In our business challenge, 
 167 | #false negatives are costly. Hence let's keep an eye on the Precision, Recall & F2 score besides accuracy
 168 | # ======================================================================================================
 169 | 
 170 | 
 171 | #Step 6: Clean the Dataset----------------------------------------------------------------------
 172 | 
 173 | 
 174 | dataset['TotalCharges'] = pd.to_numeric(dataset['TotalCharges'],errors='coerce')
 175 | 
 176 | dataset['TotalCharges'] = dataset['TotalCharges'].astype("float")
 177 | 
 178 | 
 179 | 
 180 | #Step 7: Take care of missing data---------------------------------------------------------------
 181 | 
 182 | dataset.info()
 183 | 
 184 | dataset.isna().any()
 185 | 
 186 | #*Find the average and fill missing values of each columns programmatically.
 187 | 
 188 | na_cols = dataset.isna().any()
 189 | 
 190 | na_cols = na_cols[na_cols == True].reset_index()
 191 | 
 192 | na_cols = na_cols["index"].tolist()
 193 | 
 194 | for col in dataset.columns[1:]:
 195 |      if col in na_cols:
 196 |         if dataset[col].dtype != 'object':
 197 |              dataset[col] =  dataset[col].fillna(dataset[col].mean()).round(0)
 198 | 
 199 | #Revalidate:
 200 |   
 201 | dataset.isna().any()  
 202 | 
 203 | 
 204 | #Step 8: label Encode Binary data----------------------------------------------------------------
 205 | 
 206 | #Create a label encoder object
 207 | le = LabelEncoder()
 208 | 
 209 | # Label Encoding will be used for columns with 2 or less unique values
 210 | le_count = 0
 211 | for col in dataset.columns[1:]:
 212 |     if dataset[col].dtype == 'object':
 213 |         if len(list(dataset[col].unique())) <= 2:
 214 |             le.fit(dataset[col])
 215 |             dataset[col] = le.transform(dataset[col])
 216 |             le_count += 1
 217 | print('{} columns were label encoded.'.format(le_count))
 218 | 
 219 | 
 220 | #----------------------------------------------------------------------------------------
 221 | #-----------------Section B: Data Evaluation------------------------------------------
 222 | #----------------------------------------------------------------------------------------
 223 | 
 224 | #Step 9: Exploratory Data Analysis----------------------------------------------------------------------
 225 |   
 226 | #Step 9.1. Plot Histogram of numeric Columns--------------------------------------
 227 | 
 228 | dataset2 = dataset[['gender', 'SeniorCitizen', 'Partner', 'Dependents',
 229 |        'tenure', 'PhoneService', 'PaperlessBilling',
 230 |         'MonthlyCharges', 'TotalCharges']]
 231 | 
 232 | #Histogram:
 233 |     
 234 | fig = plt.figure(figsize=(15, 12))
 235 | plt.suptitle('Histograms of Numerical Columns\n',horizontalalignment="center",fontstyle = "normal", fontsize = 24, fontfamily = "sans-serif")
 236 | for i in range(dataset2.shape[1]):
 237 |     plt.subplot(6, 3, i + 1)
 238 |     f = plt.gca()
 239 |     f.set_title(dataset2.columns.values[i])
 240 | 
 241 |     vals = np.size(dataset2.iloc[:, i].unique())
 242 |     if vals >= 100:
 243 |         vals = 100
 244 |     
 245 |     plt.hist(dataset2.iloc[:, i], bins=vals, color = '#ec838a')
 246 | plt.tight_layout(rect=[0, 0.03, 1, 0.95])
 247 | 
 248 | 
 249 | #Step 9.2. Analyze distribution of Key Categorical Variables---------------------------------------------
 250 |  
 251 |     
 252 | #(1) Distribution of Contract Type----------------------------------------------------------------------------------------
 253 | 
 254 | contract_split = dataset[[ "customerID", "Contract"]]
 255 | sectors = contract_split .groupby ("Contract")
 256 | contract_split = pd.DataFrame(sectors["customerID"].count())
 257 | contract_split.rename(columns={'customerID':'No. of customers'}, inplace=True)
 258 | 
 259 | 
 260 | ax =  contract_split[["No. of customers"]].plot.bar(title = 'Customers by Contract Type', legend =True, table = False, grid = False,  subplots = False,  figsize =(12, 7), color ='#ec838a', fontsize = 15, stacked=False)
 261 | 
 262 | plt.ylabel('No. of Customers\n',horizontalalignment="center",fontstyle = "normal", fontsize = "large", fontfamily = "sans-serif")
 263 | plt.xlabel('\n Contract Type',horizontalalignment="center",fontstyle = "normal", fontsize = "large", fontfamily = "sans-serif")
 264 | plt.title('Customers by Contract Type \n',horizontalalignment="center", fontstyle = "normal", fontsize = "22", fontfamily = "sans-serif")
 265 | plt.legend(loc='top right', fontsize = "medium")
 266 | plt.xticks(rotation=0, horizontalalignment="center")
 267 | plt.yticks(rotation=0, horizontalalignment="right")
 268 | 
 269 | x_labels = np.array(contract_split[["No. of customers"]])
 270 | 
 271 | def add_value_labels(ax, spacing=5):   
 272 |     for rect in ax.patches:      
 273 |         y_value = rect.get_height()
 274 |         x_value = rect.get_x() + rect.get_width() / 2       
 275 |         space = spacing        
 276 |         va = 'bottom'      
 277 |         if y_value < 0:           
 278 |             space *= -1            
 279 |             va = 'top'       
 280 |         label = "{:.0f}".format(y_value)      
 281 |         ax.annotate(
 282 |             label,                      
 283 |             (x_value, y_value),         
 284 |             xytext=(0, space),          
 285 |             textcoords="offset points", 
 286 |             ha='center',                
 287 |             va=va)                                                             
 288 | add_value_labels(ax)
 289 | 
 290 | 
 291 | #(2) Distribution of Payment Method Type---------------------------------------------------------------------------------------
 292 | 
 293 | payment_method_split = dataset[[ "customerID", "PaymentMethod"]]
 294 | sectors = payment_method_split  .groupby ("PaymentMethod")
 295 | payment_method_split  = pd.DataFrame(sectors["customerID"].count())
 296 | payment_method_split.rename(columns={'customerID':'No. of customers'}, inplace=True)
 297 | 
 298 | 
 299 | ax =  payment_method_split [["No. of customers"]].plot.bar(title = 'Customers by Payment Method', legend =True, table = False, grid = False,  subplots = False,  figsize =(15, 10), color ='#ec838a', fontsize = 15, stacked=False)
 300 | 
 301 | plt.ylabel('No. of Customers\n',horizontalalignment="center",fontstyle = "normal", fontsize = "large", fontfamily = "sans-serif")
 302 | plt.xlabel('\n Contract Type',horizontalalignment="center",fontstyle = "normal", fontsize = "large", fontfamily = "sans-serif")
 303 | plt.title('Customers by Payment Method \n',horizontalalignment="center", fontstyle = "normal", fontsize = "22", fontfamily = "sans-serif")
 304 | plt.legend(loc='top right', fontsize = "medium")
 305 | plt.xticks(rotation=0, horizontalalignment="center")
 306 | plt.yticks(rotation=0, horizontalalignment="right")
 307 | 
 308 | x_labels = np.array(payment_method_split [["No. of customers"]])
 309 | 
 310 | def add_value_labels(ax, spacing=5):   
 311 |     for rect in ax.patches:      
 312 |         y_value = rect.get_height()
 313 |         x_value = rect.get_x() + rect.get_width() / 2       
 314 |         space = spacing        
 315 |         va = 'bottom'      
 316 |         if y_value < 0:           
 317 |             space *= -1            
 318 |             va = 'top'       
 319 |         label = "{:.0f}".format(y_value)      
 320 |         ax.annotate(
 321 |             label,                      
 322 |             (x_value, y_value),         
 323 |             xytext=(0, space),          
 324 |             textcoords="offset points", 
 325 |             ha='center',                
 326 |             va=va)                                                             
 327 | add_value_labels(ax)
 328 | 
 329 | 
 330 | #(3) Distribution of various Label Encoded Categorical Variables---------------------------------------------------------------------------------------
 331 | 
 332 | services = ['PhoneService','MultipleLines','InternetService','OnlineSecurity',
 333 |            'OnlineBackup','DeviceProtection','TechSupport','StreamingTV','StreamingMovies']
 334 | 
 335 | fig, axes = plt.subplots(nrows = 3,ncols = 3,figsize = (15,12))
 336 | for i, item in enumerate(services):
 337 |     if i < 3:
 338 |         ax = dataset[item].value_counts().plot(kind = 'bar',ax=axes[i,0],rot = 0, color ='#f3babc' )
 339 |         
 340 |     elif i >=3 and i < 6:
 341 |         ax = dataset[item].value_counts().plot(kind = 'bar',ax=axes[i-3,1],rot = 0,color ='#9b9c9a')
 342 |         
 343 |     elif i < 9:
 344 |         ax = dataset[item].value_counts().plot(kind = 'bar',ax=axes[i-6,2],rot = 0,color = '#ec838a')
 345 |     ax.set_title(item)
 346 | 
 347 | 
 348 |     
 349 | #Step 9.3: Analyze Churn Rate by Categorical variables:   -------------------------------------------------------------
 350 | 
 351 | #(1) Overall Churn Rate------------------------------------------------------------------------------------------
 352 | 
 353 | import matplotlib.ticker as mtick
 354 | churn_rate = dataset[["Churn", "customerID"]]
 355 | churn_rate ["churn_label"] = pd.Series(np.where((churn_rate["Churn"] == 0), "No", "Yes"))
 356 | sectors = churn_rate .groupby ("churn_label")
 357 | churn_rate = pd.DataFrame(sectors["customerID"].count())
 358 | churn_rate ["Churn Rate"] = (churn_rate ["customerID"] / sum(churn_rate ["customerID"]) )*100
 359 | ax =  churn_rate[["Churn Rate"]].plot.bar(title = 'Overall Churn Rate', legend =True, table = False, grid = False,  subplots = False,  figsize =(12, 7), color = '#ec838a', fontsize = 15, stacked=False, ylim =(0,100))
 360 | 
 361 | plt.ylabel('Proportion of Customers',horizontalalignment="center",fontstyle = "normal", fontsize = "large", fontfamily = "sans-serif")
 362 | plt.xlabel('Churn',horizontalalignment="center",fontstyle = "normal", fontsize = "large", fontfamily = "sans-serif")
 363 | plt.title('Overall Churn Rate \n',horizontalalignment="center", fontstyle = "normal", fontsize = "22", fontfamily = "sans-serif")
 364 | plt.legend(loc='top right', fontsize = "medium")
 365 | plt.xticks(rotation=0, horizontalalignment="center")
 366 | plt.yticks(rotation=0, horizontalalignment="right")
 367 | ax.yaxis.set_major_formatter(mtick.PercentFormatter())
 368 | x_labels = np.array(churn_rate[["customerID"]])
 369 | 
 370 | def add_value_labels(ax, spacing=5):   
 371 |     for rect in ax.patches:     
 372 |         y_value = rect.get_height()
 373 |         x_value = rect.get_x() + rect.get_width() / 2       
 374 |         space = spacing
 375 |         va = 'bottom'        
 376 |         if y_value < 0:           
 377 |             space *= -1          
 378 |             va = 'top'
 379 |         
 380 |         label = "{:.1f}%".format(y_value)    
 381 |         ax.annotate(
 382 |             label,                      
 383 |             (x_value, y_value),        
 384 |             xytext=(0, space),         
 385 |             textcoords="offset points", 
 386 |             ha='center',                
 387 |             va=va)                                                            
 388 | add_value_labels(ax)
 389 | ax.autoscale(enable=False, axis='both', tight=False)  
 390 | 
 391 | 
 392 | #(2) Churn Rate by Contract Type ----------------------------------------------------------------------------------------
 393 | 
 394 | 
 395 | import matplotlib.ticker as mtick
 396 | 
 397 | contract_churn = dataset.groupby(['Contract','Churn']).size().unstack()
 398 | 
 399 | contract_churn.rename(columns={0:'No', 1:'Yes'}, inplace=True)
 400 | 
 401 | colors  = ['#ec838a','#9b9c9a']
 402 | 
 403 | ax = (contract_churn.T*100.0 / contract_churn.T.sum()).T.plot(kind='bar',
 404 |                                                                 width = 0.3,
 405 |                                                                 stacked = True,
 406 |                                                                 rot = 0, 
 407 |                                                                 figsize = (12,7),
 408 |                                                                 color = colors)
 409 | 
 410 | 
 411 | 
 412 | 
 413 | 
 414 | plt.ylabel('Proportion of Customers\n',horizontalalignment="center",fontstyle = "normal", fontsize = "large", fontfamily = "sans-serif")
 415 | plt.xlabel('Contract Type\n',horizontalalignment="center",fontstyle = "normal", fontsize = "large", fontfamily = "sans-serif")
 416 | plt.title('Churn Rate by Contract type \n',horizontalalignment="center", fontstyle = "normal", fontsize = "22", fontfamily = "sans-serif")
 417 | plt.legend(loc='top right', fontsize = "medium")
 418 | plt.xticks(rotation=0, horizontalalignment="center")
 419 | plt.yticks(rotation=0, horizontalalignment="right")
 420 | ax.yaxis.set_major_formatter(mtick.PercentFormatter())
 421 | for p in ax.patches:
 422 |     width, height = p.get_width(), p.get_height()
 423 |     x, y = p.get_xy() 
 424 |     ax.text(x+width/2, 
 425 |             y+height/2, 
 426 |             '{:.1f}%'.format(height), 
 427 |             horizontalalignment='center', 
 428 |             verticalalignment='center')
 429 | ax.autoscale(enable=False, axis='both', tight=False)   
 430 | 
 431 | 
 432 | 
 433 | #(3) Churn Rate by Payment Method Type----------------------------------------------------------------------------------------
 434 | 
 435 | 
 436 | import matplotlib.ticker as mtick
 437 | 
 438 | contract_churn = dataset.groupby(['Contract','PaymentMethod']).size().unstack()
 439 | 
 440 | contract_churn.rename(columns={0:'No', 1:'Yes'}, inplace=True)
 441 | 
 442 | colors  = ['#ec838a','#9b9c9a', '#f3babc' , '#4d4f4c']
 443 | 
 444 | ax = (contract_churn.T*100.0 / contract_churn.T.sum()).T.plot(kind='bar',
 445 |                                                                 width = 0.3,
 446 |                                                                 stacked = True,
 447 |                                                                 rot = 0, 
 448 |                                                                 figsize = (12,7),
 449 |                                                                 color = colors)
 450 | 
 451 | 
 452 | 
 453 | 
 454 | 
 455 | plt.ylabel('Proportion of Customers\n',horizontalalignment="center",fontstyle = "normal", fontsize = "large", fontfamily = "sans-serif")
 456 | plt.xlabel('Contract Type\n',horizontalalignment="center",fontstyle = "normal", fontsize = "large", fontfamily = "sans-serif")
 457 | plt.title('Churn Rate by Payment Method \n',horizontalalignment="center", fontstyle = "normal", fontsize = "22", fontfamily = "sans-serif")
 458 | plt.legend(loc='best', fontsize = "medium")
 459 | plt.xticks(rotation=0, horizontalalignment="center")
 460 | plt.yticks(rotation=0, horizontalalignment="right")
 461 | ax.yaxis.set_major_formatter(mtick.PercentFormatter())
 462 | for p in ax.patches:
 463 |     width, height = p.get_width(), p.get_height()
 464 |     x, y = p.get_xy() 
 465 |     ax.text(x+width/2, 
 466 |             y+height/2, 
 467 |             '{:.1f}%'.format(height), 
 468 |             horizontalalignment='center', 
 469 |             verticalalignment='center')
 470 | ax.autoscale(enable=False, axis='both', tight=False)   
 471 | 
 472 | 
 473 | # Step 9.4. Find positive and negative correlations with the Response Variable--------------------
 474 | 
 475 | dataset2 = dataset[['SeniorCitizen', 'Partner', 'Dependents',
 476 |        'tenure', 'PhoneService', 'PaperlessBilling',
 477 |         'MonthlyCharges', 'TotalCharges']]
 478 | 
 479 | correlations = dataset2.corrwith(dataset.Churn)
 480 | correlations = correlations[correlations!=1]
 481 | positive_correlations = correlations[correlations >0].sort_values(ascending = False)
 482 | negative_correlations = correlations[correlations<0].sort_values(ascending = False)
 483 | 
 484 | print('Most Positive Correlations: \n', positive_correlations)
 485 | print('\nMost Negative Correlations: \n', negative_correlations)
 486 | 
 487 | 
 488 | #Step 9.4 (continued): Plot positive & negative correlation with Response Variable-----------------------------
 489 | 
 490 | correlations = dataset2.corrwith(dataset.Churn)
 491 | correlations = correlations[correlations!=1]
 492 | 
 493 | correlations.plot.bar(
 494 |         figsize = (18, 10), fontsize = 15, color = '#ec838a',
 495 |         rot = 45, grid = True)
 496 | 
 497 | plt.title('Correlation with Churn Rate \n',horizontalalignment="center", fontstyle = "normal", fontsize = "22", fontfamily = "sans-serif")
 498 | 
 499 | 
 500 | 
 501 | 
 502 | #Step 9.5. Plot Correlation Matrix of all independent variables------------------------
 503 | 
 504 | ## Set and compute the Correlation Matrix
 505 | sn.set(style="white")
 506 | corr = dataset2.corr()
 507 | 
 508 | # Generate a mask for the upper triangle
 509 | mask = np.zeros_like(corr, dtype=np.bool)
 510 | mask[np.triu_indices_from(mask)] = True
 511 | 
 512 | # Set up the matplotlib figure and a diverging colormap
 513 | f, ax = plt.subplots(figsize=(18, 15))
 514 | cmap = sn.diverging_palette(220, 10, as_cmap=True)
 515 | 
 516 | # Draw the heatmap with the mask and correct aspect ratio
 517 | sn.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
 518 |             square=True, linewidths=.5, cbar_kws={"shrink": .5})
 519 | 
 520 | 
 521 | 
 522 | 
 523 | #Step 9.6: Check Multicollinearity using VIF----------------------------------------------
 524 | 
 525 | def calc_vif(X):
 526 | 
 527 |     # Calculating VIF
 528 |     vif = pd.DataFrame()
 529 |     vif["variables"] = X.columns
 530 |     vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
 531 | 
 532 |     return(vif)
 533 | 
 534 | dataset2 = dataset[['gender', 'SeniorCitizen', 'Partner', 'Dependents',
 535 |        'tenure', 'PhoneService', 'PaperlessBilling','MonthlyCharges','TotalCharges']]
 536 | 
 537 | calc_vif(dataset2)
 538 | 
 539 | #Total Charges seem to be collinear with Monthly Charges.
 540 | 
 541 | #check colinearity:
 542 |     
 543 | dataset2[['MonthlyCharges', 'TotalCharges']].plot.scatter(figsize = (15, 10), x = 'MonthlyCharges',
 544 |                                                               y='TotalCharges', color =  '#ec838a')
 545 | 
 546 | 
 547 | plt.title('Co-linearity of Monthly Charges and Total Charges \n',horizontalalignment="center", fontstyle = "normal", fontsize = "22", fontfamily = "sans-serif")
 548 | 
 549 | #dropping TotalCharges:
 550 |     
 551 | dataset2 = dataset2.drop(columns = "TotalCharges")
 552 | 
 553 | #Revalidate Colinearity:
 554 | 
 555 | dataset2 = dataset[['gender', 'SeniorCitizen', 'Partner', 'Dependents',
 556 |        'tenure', 'PhoneService', 'PaperlessBilling','MonthlyCharges']]
 557 | 
 558 | calc_vif(dataset2)
 559 | 
 560 | #Applying changes in the main dataset:
 561 |     
 562 | dataset = dataset.drop(columns = "TotalCharges")   
 563 | 
 564 | 
 565 | 
 566 | #Step 10: Encode Categorical data----------------------------------------------------------------
 567 | 
 568 | #In case customerID is an object:
 569 |     
 570 | identity = dataset["customerID"]
 571 | 
 572 | dataset = dataset.drop(columns="customerID")
 573 | 
 574 | # convert rest of categorical variable into dummy
 575 | 
 576 | dataset= pd.get_dummies(dataset)
 577 | 
 578 | #Rejoin userid to dataset (column concatenation)
 579 | 
 580 | dataset = pd.concat([dataset, identity], axis = 1)
 581 |  
 582 | 
 583 | 
 584 | #Step 11: Split dataset into dependent and independent variables-----------------------------------
 585 | 
 586 | #identify response variable:
 587 |     
 588 | response = dataset["Churn"]
 589 | 
 590 | dataset = dataset.drop(columns="Churn")
 591 | 
 592 | #Step 12: Generate training and test datasets of dependent and independent variables-----------------
 593 | 
 594 | 
 595 | X_train, X_test, y_train, y_test = train_test_split(dataset, response,
 596 |                                                     stratify=response, 
 597 |                                                     test_size = 0.2, #use 0.9 if data is huge.
 598 |                                                     random_state = 0)
 599 | 
 600 | #to resolve any class imbalance - use stratify parameter.
 601 | 
 602 | print("Number transactions X_train dataset: ", X_train.shape)
 603 | print("Number transactions y_train dataset: ", y_train.shape)
 604 | print("Number transactions X_test dataset: ", X_test.shape)
 605 | print("Number transactions y_test dataset: ", y_test.shape)
 606 | 
 607 | 
 608 | 
 609 | # Step 13: Removing Identifiers-------------------------------------------------------------------
 610 | 
 611 | train_identity = X_train['customerID']
 612 | X_train = X_train.drop(columns = ['customerID'])
 613 | 
 614 | test_identity = X_test['customerID']
 615 | X_test = X_test.drop(columns = ['customerID'])
 616 | 
 617 | 
 618 | # Step 14: Feature Scaling-----------------------------------------------------------------------
 619 | 
 620 | sc_X = StandardScaler()
 621 | X_train2 = pd.DataFrame(sc_X.fit_transform(X_train))
 622 | X_train2.columns = X_train.columns.values
 623 | X_train2.index = X_train.index.values
 624 | X_train = X_train2
 625 | 
 626 | X_test2 = pd.DataFrame(sc_X.transform(X_test))
 627 | X_test2.columns = X_test.columns.values
 628 | X_test2.index = X_test.index.values
 629 | X_test = X_test2
 630 | 
 631 | 
 632 | 
 633 | #----------------------------------------------------------------------------------------
 634 | #-----------------Section C: Model Selection------------------------------------------
 635 | #----------------------------------------------------------------------------------------
 636 | 
 637 | #Step 15.1: Compare Baseline Classification Algorithms - First Iteration
 638 | #Using Accuracy and ROC AUC Mean Metrics
 639 | 
 640 | 
 641 | models = []
 642 | 
 643 | models.append(('Logistic Regression', LogisticRegression(solver='liblinear', random_state = 0,
 644 |                                                          class_weight='balanced')))
 645 | 
 646 | models.append(('SVC', SVC(kernel = 'linear', random_state = 0)))
 647 | 
 648 | 
 649 | models.append(('Kernel SVM', SVC(kernel = 'rbf', random_state = 0)))
 650 | 
 651 | 
 652 | models.append(('KNN', KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)))
 653 | 
 654 | 
 655 | models.append(('Gaussian NB', GaussianNB()))
 656 | 
 657 | 
 658 | models.append(('Decision Tree Classifier',
 659 |                DecisionTreeClassifier(criterion = 'entropy', random_state = 0)))
 660 | 
 661 | 
 662 | models.append(('Random Forest', RandomForestClassifier(
 663 |     n_estimators=100, criterion = 'entropy', random_state = 0)))
 664 | 
 665 | 
 666 | 
 667 | #Evaluating Model Results: 
 668 | 
 669 |     
 670 | acc_results = []
 671 | auc_results = []
 672 | names = []
 673 | # set table to table to populate with performance results
 674 | col = ['Algorithm', 'ROC AUC Mean', 'ROC AUC STD', 
 675 |        'Accuracy Mean', 'Accuracy STD']
 676 | 
 677 | model_results = pd.DataFrame(columns=col)
 678 | i = 0
 679 | # evaluate each model using k-fold cross-validation
 680 | for name, model in models:
 681 |     kfold = model_selection.KFold(
 682 |         n_splits=10, random_state=0)  # 10-fold cross-validation
 683 | 
 684 |     cv_acc_results = model_selection.cross_val_score(  # accuracy scoring
 685 |         model, X_train, y_train, cv=kfold, scoring='accuracy')
 686 | 
 687 |     cv_auc_results = model_selection.cross_val_score(  # roc_auc scoring
 688 |         model, X_train, y_train, cv=kfold, scoring='roc_auc')
 689 | 
 690 |     acc_results.append(cv_acc_results)
 691 |     auc_results.append(cv_auc_results)
 692 |     names.append(name)
 693 |     model_results.loc[i] = [name,
 694 |                          round(cv_auc_results.mean()*100, 2),
 695 |                          round(cv_auc_results.std()*100, 2),
 696 |                          round(cv_acc_results.mean()*100, 2),
 697 |                          round(cv_acc_results.std()*100, 2)
 698 |                          ]
 699 |     i += 1
 700 |     
 701 | model_results.sort_values(by=['ROC AUC Mean'], ascending=False)
 702 | 
 703 | 
 704 | #Step 15.2.  Visualize Classification Algorithms Accuracy Comparisons:-----------------------------------
 705 | 
 706 |   
 707 |   
 708 | #Using Accuracy Mean:
 709 |     
 710 | fig = plt.figure(figsize=(15, 7))
 711 | ax = fig.add_subplot(111)
 712 | plt.boxplot(acc_results)
 713 | ax.set_xticklabels(names)
 714 | 
 715 | #plt.ylabel('ROC AUC Score\n',horizontalalignment="center",fontstyle = "normal", fontsize = "large", fontfamily = "sans-serif")
 716 | #plt.xlabel('\n Baseline Classification Algorithms\n',horizontalalignment="center",fontstyle = "normal", fontsize = "large", fontfamily = "sans-serif")
 717 | plt.title('Accuracy Score Comparison \n',horizontalalignment="center", fontstyle = "normal", fontsize = "22", fontfamily = "sans-serif")
 718 | #plt.legend(loc='top right', fontsize = "medium")
 719 | plt.xticks(rotation=0, horizontalalignment="center")
 720 | plt.yticks(rotation=0, horizontalalignment="right")
 721 | 
 722 | plt.show()
 723 | 
 724 | 
 725 | #using Area under ROC Curve:
 726 | 
 727 | fig = plt.figure(figsize=(15, 7))
 728 | ax = fig.add_subplot(111)
 729 | plt.boxplot(auc_results)
 730 | ax.set_xticklabels(names)
 731 | 
 732 | #plt.ylabel('ROC AUC Score\n',horizontalalignment="center",fontstyle = "normal", fontsize = "large", fontfamily = "sans-serif")
 733 | #plt.xlabel('\n Baseline Classification Algorithms\n',horizontalalignment="center",fontstyle = "normal", fontsize = "large", fontfamily = "sans-serif")
 734 | plt.title('ROC AUC Comparison \n',horizontalalignment="center", fontstyle = "normal", fontsize = "22", fontfamily = "sans-serif")
 735 | #plt.legend(loc='top right', fontsize = "medium")
 736 | plt.xticks(rotation=0, horizontalalignment="center")
 737 | plt.yticks(rotation=0, horizontalalignment="right")
 738 | 
 739 | 
 740 | plt.show()
 741 | 
 742 | 
 743 | #------------------------------------------------------------------------------------------
 744 | #Compare Baseline Classification Algorithms - Second Iteration
 745 | #Using Accuracy, Precision, Recall, F1 and F2 Score Metrics
 746 | #-------------------------------------------------------------------------------------------
 747 | 
 748 | 
 749 | #Step 15.3. Get the right parameters for the baseline models:---------------------------
 750 | 
 751 | #Identify optimal number of K neighbors for KNN Model:
 752 | 
 753 | 
 754 | score_array = []
 755 | for each in range(1,25):
 756 |     knn_loop = KNeighborsClassifier(n_neighbors = each) #set K neighbor as 3
 757 |     knn_loop.fit(X_train,y_train)
 758 |     score_array.append(knn_loop.score(X_test,y_test))
 759 | 
 760 | fig = plt.figure(figsize=(15, 7))
 761 | plt.plot(range(1,25),score_array, color = '#ec838a')
 762 | 
 763 | 
 764 | plt.ylabel('Range\n',horizontalalignment="center",fontstyle = "normal", fontsize = "large", fontfamily = "sans-serif")
 765 | plt.xlabel('Score\n',horizontalalignment="center",fontstyle = "normal", fontsize = "large", fontfamily = "sans-serif")
 766 | plt.title('Optimal Number of K Neighbors \n',horizontalalignment="center", fontstyle = "normal", fontsize = "22", fontfamily = "sans-serif")
 767 | #plt.legend(loc='top right', fontsize = "medium")
 768 | plt.xticks(rotation=0, horizontalalignment="center")
 769 | plt.yticks(rotation=0, horizontalalignment="right")
 770 | 
 771 | 
 772 | plt.show()
 773 | 
 774 | #optimal number of K neigbors = 22
 775 | 
 776 | #Identify optimal number of trees for Random Forest Model:
 777 |  
 778 | score_array = []
 779 | for each in range(1,100):
 780 |     rf_loop = RandomForestClassifier(n_estimators = each, random_state = 1) 
 781 |     rf_loop.fit(X_train,y_train)
 782 |     score_array.append(rf_loop.score(X_test,y_test))
 783 |  
 784 | fig = plt.figure(figsize=(15, 7))
 785 | plt.plot(range(1,100),score_array, color = '#ec838a')
 786 | 
 787 | 
 788 | plt.ylabel('Range\n',horizontalalignment="center",fontstyle = "normal", fontsize = "large", fontfamily = "sans-serif")
 789 | plt.xlabel('Score\n',horizontalalignment="center",fontstyle = "normal", fontsize = "large", fontfamily = "sans-serif")
 790 | plt.title('Optimal Number of Trees for Random Forest Model \n',horizontalalignment="center", fontstyle = "normal", fontsize = "22", fontfamily = "sans-serif")
 791 | #plt.legend(loc='top right', fontsize = "medium")
 792 | plt.xticks(rotation=0, horizontalalignment="center")
 793 | plt.yticks(rotation=0, horizontalalignment="right")
 794 | 
 795 | 
 796 | plt.show()
 797 |  
 798 |  
 799 | #Optimal number of decision trees = 72
 800 | 
 801 |  
 802 | #Step 15.4. Compare Baseline Classification Algorithms - Second Iteration-----------------------------
 803 | 
 804 | #--Step 15.4.1. Logistic Regression-----------------
 805 | 
 806 | # Fitting Logistic Regression to the Training set 
 807 | classifier = LogisticRegression(random_state = 0)
 808 | classifier.fit(X_train, y_train)
 809 | 
 810 | # Predicting the Test set results
 811 | y_pred = classifier.predict(X_test)
 812 | 
 813 | #Evaluate results
 814 | 
 815 | acc = accuracy_score(y_test, y_pred )
 816 | prec = precision_score(y_test, y_pred )
 817 | rec = recall_score(y_test, y_pred )
 818 | f1 = f1_score(y_test, y_pred )
 819 | f2 = fbeta_score(y_test, y_pred, beta=2.0)
 820 | 
 821 | results = pd.DataFrame([['Logistic Regression', acc, prec, rec, f1, f2]],
 822 |                columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score', 'F2 Score'])
 823 | 
 824 | 
 825 | 
 826 | #Step 15.4.2. . Support Vector Machine (linear classifier)------------------------
 827 | 
 828 | 
 829 | # Fitting SVM (SVC class) to the Training set:
 830 | 
 831 | classifier = SVC(kernel = 'linear', random_state = 0)
 832 | classifier.fit(X_train, y_train)
 833 | 
 834 | # Predicting the Test set results 
 835 | y_pred = classifier.predict(X_test)
 836 | 
 837 | #Evaluate results
 838 | 
 839 | acc = accuracy_score(y_test, y_pred )
 840 | prec = precision_score(y_test, y_pred )
 841 | rec = recall_score(y_test, y_pred)
 842 | f1 = f1_score(y_test, y_pred )
 843 | f2 = fbeta_score(y_test, y_pred, beta=2.0)
 844 | 
 845 | model_results = pd.DataFrame([['SVM (Linear)', acc, prec, rec, f1, f2]],
 846 |                columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score', 'F2 Score'])
 847 | 
 848 | results = results.append(model_results, ignore_index = True)
 849 | 
 850 | 
 851 | #Step 15.4.3. K-Nearest Neighbours------------------------
 852 | 
 853 | 
 854 | # Fitting KNN to the Training set:
 855 | 
 856 | classifier = KNeighborsClassifier(n_neighbors = 22, metric = 'minkowski', p = 2)
 857 | classifier.fit(X_train, y_train)
 858 | 
 859 | # Predicting the Test set results 
 860 | y_pred  = classifier.predict(X_test)
 861 | 
 862 | #Evaluate results
 863 | acc = accuracy_score(y_test, y_pred )
 864 | prec = precision_score(y_test, y_pred )
 865 | rec = recall_score(y_test, y_pred )
 866 | f1 = f1_score(y_test, y_pred )
 867 | f2 = fbeta_score(y_test, y_pred, beta=2.0)
 868 | 
 869 | model_results = pd.DataFrame([['K-Nearest Neighbours', acc, prec, rec, f1, f2]],
 870 |                columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score', 'F2 Score'])
 871 | 
 872 | results = results.append(model_results, ignore_index = True)
 873 | 
 874 | 
 875 | 
 876 | #Step 15.4.4.  Kernel SVM------------------------
 877 | 
 878 | # Fitting Kernel SVM to the Training set:
 879 | 
 880 | classifier = SVC(kernel = 'rbf', random_state = 0)
 881 | classifier.fit(X_train, y_train)
 882 | 
 883 | # Predicting the Test set results 
 884 | y_pred = classifier.predict(X_test)
 885 | 
 886 | #Evaluate results
 887 | 
 888 | acc = accuracy_score(y_test, y_pred )
 889 | prec = precision_score(y_test, y_pred )
 890 | rec = recall_score(y_test, y_pred )
 891 | f1 = f1_score(y_test, y_pred )
 892 | f2 = fbeta_score(y_test, y_pred, beta=2.0)
 893 | 
 894 | model_results = pd.DataFrame([['Kernel SVM', acc, prec, rec, f1, f2]],
 895 |                columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score', 'F2 Score'])
 896 | 
 897 | results = results.append(model_results, ignore_index = True)
 898 | 
 899 | 
 900 | #Step 15.4.5.  Naive Byes------------------------------------------------
 901 | 
 902 | # Fitting Naive Byes to the Training set:
 903 |     
 904 | classifier = GaussianNB()
 905 | classifier.fit(X_train, y_train)
 906 | 
 907 | # Predicting the Test set results 
 908 | y_pred = classifier.predict(X_test)
 909 | 
 910 | #Evaluate results
 911 | acc = accuracy_score(y_test, y_pred )
 912 | prec = precision_score(y_test, y_pred )
 913 | rec = recall_score(y_test, y_pred )
 914 | f1 = f1_score(y_test, y_pred )
 915 | f2 = fbeta_score(y_test, y_pred, beta=2.0)
 916 | 
 917 | model_results = pd.DataFrame([['Naive Byes', acc, prec, rec, f1, f2]],
 918 |                 columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score', 'F2 Score'])
 919 | 
 920 | results = results.append(model_results, ignore_index = True)
 921 | 
 922 | 
 923 | 
 924 | #Step 15.4.6. Decision Tree---------------------------------------------
 925 | 
 926 | 
 927 | # Fitting Decision Tree to the Training set:
 928 | 
 929 | classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
 930 | classifier.fit(X_train, y_train)
 931 | 
 932 | 
 933 | # Predicting the Test set results 
 934 | y_pred = classifier.predict(X_test)
 935 | 
 936 | #Evaluate results
 937 | acc = accuracy_score(y_test, y_pred )
 938 | prec = precision_score(y_test, y_pred )
 939 | rec = recall_score(y_test, y_pred )
 940 | f1 = f1_score(y_test, y_pred )
 941 | f2 = fbeta_score(y_test, y_pred, beta=2.0)
 942 | 
 943 | model_results = pd.DataFrame([['Decision Tree', acc, prec, rec, f1, f2]],
 944 |                columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score', 'F2 Score'])
 945 | 
 946 | results = results.append(model_results, ignore_index = True)
 947 | 
 948 | 
 949 | #Step 15.4.7. Random Forest--------------------------------------------
 950 | 
 951 | 
 952 | # Fitting Random Forest to the Training set:
 953 |     
 954 | classifier = RandomForestClassifier(n_estimators = 72, criterion = 'entropy', random_state = 0)
 955 | classifier.fit(X_train, y_train)
 956 | 
 957 | 
 958 | 
 959 | # Predicting the Test set results 
 960 | y_pred = classifier.predict(X_test)
 961 | 
 962 | #Evaluate results
 963 | 
 964 | from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
 965 | acc = accuracy_score(y_test, y_pred )
 966 | prec = precision_score(y_test, y_pred )
 967 | rec = recall_score(y_test, y_pred )
 968 | f1 = f1_score(y_test, y_pred )
 969 | f2 = fbeta_score(y_test, y_pred, beta=2.0)
 970 | 
 971 | model_results = pd.DataFrame([['Random Forest', acc, prec, rec, f1, f2]],
 972 |                columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score', 'F2 Score'])
 973 | 
 974 | results = results.append(model_results, ignore_index = True)
 975 | 
 976 | #Step 15.5. Visualize the results and compare the baseline algorithms----------------------------------
 977 | 
 978 | # =======================================================================================================================
 979 | #Sort results based on the right classification metric:
 980 | #(Accuracy/ROC_AUC / Precision/Recall/F1/F2 scores)
 981 | 
 982 | #Since we have class imbalance. When we look into the business challenge, 
 983 | # our false negatives will be costly and hence we need to Keep an eye onto the Precision, Recall & F2 score besides accuracy
 984 | # =======================================================================================================================
 985 | 
 986 | results = results.sort_values(["Precision", "Recall", "F2 Score"], ascending = False)
 987 |     
 988 | 
 989 | print (results)
 990 | 
 991 | 
 992 | #----------------------------------------------------------------------------------------
 993 | #-----------------Section D: Model Evaluation (Logistic Regression)----------------------
 994 | #----------------------------------------------------------------------------------------
 995 | 
 996 | 
 997 | #Step 16: Train & evaluate Chosen Model---------------------------------------------
 998 |     
 999 |     
1000 | # Fit Logistic Regression on the Training dataset:
1001 |     
1002 | classifier = LogisticRegression(random_state = 0, penalty = 'l2')
1003 | classifier.fit(X_train, y_train)
1004 | 
1005 | 
1006 | # Predict the Test set results
1007 | 
1008 | y_pred = classifier.predict(X_test)
1009 | 
1010 | 
1011 | #Evaluate Model Results on Test Set:
1012 | 
1013 | acc = accuracy_score(y_test, y_pred )
1014 | prec = precision_score(y_test, y_pred )
1015 | rec = recall_score(y_test, y_pred )
1016 | f1 = f1_score(y_test, y_pred )
1017 | f2 = fbeta_score(y_test, y_pred, beta=2.0)
1018 | 
1019 | results = pd.DataFrame([['Logistic Regression', acc, prec, rec, f1, f2]],
1020 |                columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score', 'F2 Score'])
1021 | 
1022 | print (results)
1023 | 
1024 | 
1025 | 
1026 | # Re-check k-Fold Cross Validation:
1027 | 
1028 | accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 10)
1029 | print("Logistic Regression Classifier Accuracy: %0.2f (+/- %0.2f)"  % (accuracies.mean(), accuracies.std() * 2))
1030 | 
1031 | 
#Visualize results on a Confusion Matrix:

cm = confusion_matrix(y_test, y_pred)
df_cm = pd.DataFrame(cm, index = (0, 1), columns = (0, 1))

# The heatmap is drawn on the figure created by plt.subplots(). The separate
# plt.figure(figsize = (28,20)) call that used to precede it only produced an
# extra, empty figure and has been dropped.
fig, ax = plt.subplots()
sn.set(font_scale=1.4)
sn.heatmap(df_cm, annot=True, fmt='g')

class_names = [0, 1]
# Heatmap cells span [0, 1] and [1, 2]; offset by 0.5 so that each class
# label sits at the centre of its cell rather than on the cell edge.
tick_marks = np.arange(len(class_names)) + 0.5
plt.tight_layout()
plt.title('Confusion matrix\n', y=1.1)
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)
ax.xaxis.set_label_position("top")
plt.ylabel('Actual label\n')
plt.xlabel('Predicted label\n')
1052 | 
1053 | 
1054 | 
# Evaluate the model using ROC Graph

# Imported locally so this section does not depend on the top-of-file imports.
from sklearn.metrics import roc_auc_score

# The classifier was already fitted on the training set in Step 16, so it is
# not refitted here. Column 1 holds the predicted probability of class 1.
probs = classifier.predict_proba(X_test)[:, 1]

# Area under the ROC curve, computed from the probabilities. Previously the
# legend's "area" was filled with the accuracy score.
classifier_roc_auc = roc_auc_score(y_test, probs)

rf_fpr, rf_tpr, rf_thresholds = roc_curve(y_test, probs)
plt.figure(figsize=(14, 6))

# Plot Logistic Regression ROC
plt.plot(rf_fpr, rf_tpr, label='Logistic Regression (area = %0.2f)' % classifier_roc_auc)
# Plot Base Rate ROC as a dashed black diagonal. 'k--' is the line format; a
# missing comma used to concatenate it into the legend label.
plt.plot([0,1], [0,1], 'k--', label='Base Rate')

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])


plt.ylabel('True Positive Rate \n',horizontalalignment="center",fontstyle = "normal", fontsize = "medium", fontfamily = "sans-serif")
plt.xlabel('\nFalse Positive Rate \n',horizontalalignment="center",fontstyle = "normal", fontsize = "medium", fontfamily = "sans-serif")
plt.title('ROC Graph \n',horizontalalignment="center", fontstyle = "normal", fontsize = "22", fontfamily = "sans-serif")
plt.legend(loc="lower right", fontsize = "medium")
plt.xticks(rotation=0, horizontalalignment="center")
plt.yticks(rotation=0, horizontalalignment="right")

plt.show()
1082 | 
1083 | 
1084 | 
1085 | 
#Step 17:Predict Feature Importance------------------------------------------------------


# Analyzing Coefficients: pair each feature name with the coefficient the
# fitted logistic regression assigned to it.
# NOTE(review): this assumes the columns of `dataset` (minus customerID) are
# the same features, in the same order, as the columns the classifier was
# trained on -- confirm against the preprocessing steps.
feature_names = pd.DataFrame(dataset.drop(columns = 'customerID').columns,
                             columns = ["features"])
coefficients = pd.DataFrame(np.transpose(classifier.coef_), columns = ["coef"])
feature_importances = pd.concat([feature_names, coefficients], axis = 1)

# sort_values returns a new frame; keep it and show it, because a bare
# expression is silently discarded when this file runs as a script.
feature_importances = feature_importances.sort_values("coef", ascending = False)
print(feature_importances)
1095 | 
1096 | 
1097 | 
1098 | #----------------------------------------------------------------------------------------
1099 | #-----------------Section E: Model Improvement (Logistic Regression)----------------------------
1100 | #----------------------------------------------------------------------------------------
1101 | 
1102 | 
1103 | 
#Step 18:Hyper parameter Tuning  --------------------------------------


# Round 1: -----------------------------------------------------------------

import time

# Create regularization hyperparameter space
C = [0.001, 0.01, 0.1, 1, 10, 100, 1000]

# Select Regularization Method and combine the parameters.
# `classifier` was built without an explicit solver, and scikit-learn's
# default solver in recent releases (lbfgs) rejects penalty='l1'; those
# candidates would fail to fit and score NaN. The l1 candidates therefore
# get a solver that supports them, while the l2 candidates keep the
# estimator's own solver.
parameters = [
    {'C': C, 'penalty': ['l2']},
    {'C': C, 'penalty': ['l1'], 'solver': ['liblinear']},
]

lr_classifier = GridSearchCV(estimator = classifier,
                           param_grid = parameters,
                           scoring = "balanced_accuracy",
                           cv = 10,
                           n_jobs = -1)

# Time the exhaustive search
t0 = time.time()
lr_classifier = lr_classifier.fit(X_train, y_train)
t1 = time.time()
print("Took %0.2f seconds" % (t1 - t0))

# Best cross-validated balanced accuracy and the parameters that produced it.
# They are printed because a bare expression is discarded in a script.
lr_best_accuracy = lr_classifier.best_score_
lr_best_parameters = lr_classifier.best_params_
print(lr_best_accuracy, lr_best_parameters)

#Author's verdict from the original run: No accuracy lift post hyperparameter tuning (round 1)
1134 | 
# Round 2: -----------------------------------------------------------------

# `time` is already imported by Round 1 above.

# Select Regularization Method
penalty = ['l2']

# Create regularization hyperparameter space (finer grid of small C values)
C = [ 0.0001, 0.001, 0.01, 0.02, 0.05]

# Combine Parameters
parameters = dict(C=C, penalty=penalty)

lr_classifier = GridSearchCV(estimator = classifier,
                           param_grid = parameters,
                           scoring = "balanced_accuracy",
                           cv = 10,
                           n_jobs = -1)

# Time the exhaustive search
t0 = time.time()
lr_classifier = lr_classifier.fit(X_train, y_train)
t1 = time.time()
print("Took %0.2f seconds" % (t1 - t0))

# Best cross-validated balanced accuracy and the parameters that produced it.
# They are printed because a bare expression is discarded in a script.
lr_best_accuracy = lr_classifier.best_score_
lr_best_parameters = lr_classifier.best_params_
print(lr_best_accuracy, lr_best_parameters)

#verdict: No accuracy lift post hyperparameter tuning (round 2)
1163 | 
#Step 18.3:Final Hyper parameter tuning and selection --------------------------------------

# Final model: L2-penalised Logistic Regression trained on the training set
# (fit returns the fitted estimator itself).
lr_classifier = LogisticRegression(penalty = 'l2', random_state = 0).fit(X_train, y_train)

# Predicted labels for the test set
y_pred = lr_classifier.predict(X_test)

# Probability score: keep only the second column of predict_proba,
# i.e. the probability of class 1.
y_pred_probs = lr_classifier.predict_proba(X_test)[:, 1]
1178 | 
1179 | #----------------------------------------------------------------------------------------
1180 | #-----------------Section F: Comparing Model Predictions against test set----------------
1181 | #----------------------------------------------------------------------------------------
1182 | 
1183 | 
# Step 19: Compare predictions against test set -------------------------------------------------------


#Revalidate final results with Confusion Matrix:

cm = confusion_matrix(y_test, y_pred)
print(cm)

#Confusion Matrix as a quick Crosstab:
# y_pred is passed as the plain array so it is paired with y_test row by row.
# Wrapping it in pd.Series gave it a fresh 0..n-1 index, and crosstab aligns
# pandas objects on their index.
# NOTE(review): assumes y_test is a pandas Series that still carries the row
# index from the train/test split -- confirm.
print(pd.crosstab(y_test, y_pred, rownames=['ACTUAL'], colnames=['PRED']))

#visualize Confusion Matrix (reuses cm computed above):

df_cm = pd.DataFrame(cm, index = (0, 1), columns = (0, 1))

# The heatmap is drawn on the figure created by plt.subplots(). The separate
# plt.figure(figsize = (28,20)) call that used to precede it only produced an
# extra, empty figure and has been dropped.
fig, ax = plt.subplots()
sn.set(font_scale=1.4)
sn.heatmap(df_cm, annot=True, fmt='g')

class_names = [0, 1]
# Heatmap cells span [0, 1] and [1, 2]; offset by 0.5 so that each class
# label sits at the centre of its cell rather than on the cell edge.
tick_marks = np.arange(len(class_names)) + 0.5
plt.tight_layout()
plt.title('Confusion matrix\n', y=1.1)
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)
ax.xaxis.set_label_position("top")
plt.ylabel('Actual label\n')
plt.xlabel('Predicted label\n')
print("Test Data Accuracy: %0.4f" % accuracy_score(y_test, y_pred))
1217 | 
1218 | 
1219 | 
# Step 20: Format Final Results:-------------------------------------------------------

propensity_col = "propensity_to_convert(%)"

# Put the test-set identifiers next to the actual labels, dropping rows with
# missing values, then attach the predicted labels.
final_results = pd.concat([test_identity, y_test], axis = 1).dropna()
final_results['predictions'] = y_pred

# Store the predicted probabilities, then express them as percentages
# rounded to two decimals.
final_results[propensity_col] = y_pred_probs
final_results[propensity_col] = (final_results[propensity_col] * 100).round(2)

# Keep the report columns in their final order.
final_results = final_results[['customerID', 'Churn', 'predictions', propensity_col]]

# Split the rows into ten equal-sized bins by propensity rank; ties are
# broken by order of appearance and the highest-propensity bin is labelled 1.
propensity_rank = final_results[propensity_col].rank(method = 'first')
final_results['Ranking'] = pd.qcut(propensity_rank, 10, labels = range(10, 0, -1))

print(final_results)
1238 | 
1239 | 
1240 | #----------------------------------------------------------------------------------------
1241 | #-----------------Section G: Deploy the Model and Predict Future Datapoints---------------
1242 | #----------------------------------------------------------------------------------------
1243 | 
#Step 21: Save the model-------------------------------------------------------------------

# Persist the final classifier, wrapped in a one-element list, to disk.
filename = 'final_model.model'
joblib.dump([lr_classifier], filename)


--------------------------------------------------------------------------------