├── Predict Customer Churn.ipynb ├── README.md ├── customer_churn_data.csv ├── final_model.model └── predict_customer_churn.py /README.md: -------------------------------------------------------------------------------- 1 | # predict-churn-py 2 | 3 | Source code and the raw data that I have used to build the machine learning classifiers can be found in this repository for reference. Hope this helps! 4 | -------------------------------------------------------------------------------- /final_model.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srees1988/predict-churn-py/8cfd95f64a2ff48edd21af48a706a23227e76616/final_model.model -------------------------------------------------------------------------------- /predict_customer_churn.py: -------------------------------------------------------------------------------- 1 | 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Fri Oct 16 06:30:42 2020 5 | 6 | @author: srees 7 | 8 | Title: Predict Customer Churn using Supervised ML 9 | """ 10 | 11 | 12 | 13 | #Step 0: Restarting the session and Clearing all temporary variables---------------------------- 14 | 15 | 16 | try: 17 | from IPython import get_ipython 18 | get_ipython().magic('clear') 19 | get_ipython().magic('reset -f') 20 | except: 21 | pass 22 | 23 | 24 | #---------------------------------------------------------------------------------------- 25 | #-----------------Section A: Data Preprocessing------------------------------------------ 26 | #---------------------------------------------------------------------------------------- 27 | 28 | # Step 1: Import relevant libraries--------------------------------------------------------- 29 | 30 | #Standard libraries for data analysis:---------------------- 31 | 32 | import numpy as np 33 | import matplotlib.pyplot as plt 34 | import pandas as pd 35 | from scipy.stats import norm, skew 36 | from scipy import stats 37 | import statsmodels.api as sm 
# sklearn modules for data preprocessing-------------------------------------

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


#sklearn modules for Model Selection--------------------------------------

from sklearn import svm, tree, linear_model, neighbors
from sklearn import naive_bayes, ensemble, discriminant_analysis, gaussian_process
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from xgboost import XGBClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier


#sklearn modules for Model Evaluation & Improvement---------------------------

from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import f1_score, precision_score, recall_score, fbeta_score
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import KFold

from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics

from sklearn.metrics import classification_report, precision_recall_curve
from sklearn.metrics import auc, roc_auc_score, roc_curve
from sklearn.metrics import make_scorer, recall_score, log_loss
from sklearn.metrics import average_precision_score


#Standard libraries for data visualization---------------------

import seaborn as sn
from matplotlib import pyplot
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import matplotlib
from IPython import get_ipython

# "%matplotlib inline" is IPython-only syntax and a SyntaxError when this file
# is run by the plain Python interpreter. Ask for the inline backend through
# the IPython API instead, and only when an interactive shell is present
# (get_ipython() returns None outside IPython).
_ipython_shell = get_ipython()
if _ipython_shell is not None:
    _ipython_shell.run_line_magic('matplotlib', 'inline')
color = sn.color_palette()
import matplotlib.ticker as mtick
from IPython.display import display
pd.options.display.max_columns = None
from pandas.plotting import scatter_matrix
from sklearn.metrics import roc_curve


#Miscellaneous Utility Libraries--------------------------------------

import random
import os
import re
import sys
import timeit
import string
import time
from datetime import datetime
# NOTE: this rebinds the name ``time`` from the module imported above to the
# function time.time(); kept as in the original script.
from time import time
from dateutil.parser import parse
import joblib


#Step 2: Set up current working directory------------------------------------------

# Machine-specific project folder of the original author; the relative data
# path in Step 3 is resolved against it.
os.chdir(r"C:/Users/srees/Desktop/Blogs and Projects/Upcoming Blogs and Projects/2. Propensity Scoring Models/3. Predict Customer Churn/")


# Step 3: Import the dataset--------------------------------------------------------

dataset = pd.read_csv('1.Input/customer_churn_data.csv')


# Step 4: Evaluate Datastructure --------------------------------------------------------

# The bare expressions below only display something when run cell-by-cell in
# an interactive session; in a plain script run they are no-ops.
dataset.head()

dataset.columns

dataset.describe()

dataset.dtypes

#Recheck Column Datatypes and Missing Values:

dataset.columns.to_series().groupby(dataset.dtypes).groups

dataset.info()

dataset.isna().any()

#Unique values in each categorical variable:

dataset["PaymentMethod"].nunique()

dataset["PaymentMethod"].unique()

dataset["Contract"].nunique()

dataset["Contract"].unique()

#Step 5: Check Target Variable Distribution -----------------------------------------------

dataset["Churn"].value_counts()

# ======================================================================================================
# The target classes are imbalanced (see the counts above). In this business
# problem false negatives are costly, so keep an eye on Precision, Recall and
# the F2 score besides accuracy.
# ======================================================================================================


#Step 6: Clean the Dataset----------------------------------------------------------------------

# Values that cannot be parsed as numbers become NaN and are imputed in Step 7.
dataset['TotalCharges'] = pd.to_numeric(dataset['TotalCharges'],errors='coerce')

dataset['TotalCharges'] = dataset['TotalCharges'].astype("float")


#Step 7: Take care of missing data---------------------------------------------------------------

dataset.info()

dataset.isna().any()

# Names of all columns that contain at least one missing value.
na_cols = dataset.columns[dataset.isna().any()].tolist()

# Fill numeric columns with their mean; the first column is skipped.
# NOTE(review): .round(0) is applied to the whole column, not only to the
# imputed cells, so every value of an affected column is rounded. Behaviour
# kept as in the original - confirm whether that is intended.
for col in dataset.columns[1:]:
    if col in na_cols and dataset[col].dtype != 'object':
        dataset[col] = dataset[col].fillna(dataset[col].mean()).round(0)

#Revalidate:

dataset.isna().any()


#Step 8: label Encode Binary data----------------------------------------------------------------

#Create a label encoder object
le = LabelEncoder()

# Label Encoding will be used for object columns with 2 or less unique values
# (NaN counts as a value, exactly as Series.unique() reported it before).
le_count = 0
for col in dataset.columns[1:]:
    if dataset[col].dtype == 'object' and dataset[col].nunique(dropna=False) <= 2:
        dataset[col] = le.fit_transform(dataset[col])
        le_count += 1
print('{} columns were label encoded.'.format(le_count))


#----------------------------------------------------------------------------------------
#-----------------Section B: Data Evaluation------------------------------------------
#----------------------------------------------------------------------------------------

#Step 9: Exploratory Data Analysis----------------------------------------------------------------------

# Keyword arguments shared by every axis label / title in this section.
_LABEL_STYLE = dict(horizontalalignment="center", fontstyle="normal",
                    fontsize="large", fontfamily="sans-serif")
_TITLE_STYLE = dict(horizontalalignment="center", fontstyle="normal",
                    fontsize="22", fontfamily="sans-serif")


def add_value_labels(ax, spacing=5):
    """Annotate every bar of *ax* with its height, formatted as an integer.

    ax      -- matplotlib Axes whose ``patches`` are the bars to label.
    spacing -- vertical offset, in points, between the bar end and the label;
               the offset is mirrored for bars with a negative height.
    """
    for rect in ax.patches:
        y_value = rect.get_height()
        x_value = rect.get_x() + rect.get_width() / 2
        space = spacing
        va = 'bottom'
        if y_value < 0:
            # Put the label below the bar instead of above it.
            space *= -1
            va = 'top'
        label = "{:.0f}".format(y_value)
        ax.annotate(
            label,
            (x_value, y_value),
            xytext=(0, space),
            textcoords="offset points",
            ha='center',
            va=va)


def _plot_customer_counts(frame, group_column, chart_title, x_label, figsize):
    """Bar-chart the number of customers per value of *group_column*.

    Returns the one-column count table and the Axes that was drawn on.
    """
    counts = pd.DataFrame(frame.groupby(group_column)["customerID"].count())
    counts.rename(columns={'customerID':'No. of customers'}, inplace=True)

    ax = counts[["No. of customers"]].plot.bar(
        title=chart_title, legend=True, table=False, grid=False,
        subplots=False, figsize=figsize, color='#ec838a', fontsize=15,
        stacked=False)

    plt.ylabel('No. of Customers\n', **_LABEL_STYLE)
    plt.xlabel(x_label, **_LABEL_STYLE)
    plt.title(chart_title + ' \n', **_TITLE_STYLE)
    # 'top right' is not a valid matplotlib legend location; the valid
    # spelling of the intended corner is 'upper right'.
    plt.legend(loc='upper right', fontsize = "medium")
    plt.xticks(rotation=0, horizontalalignment="center")
    plt.yticks(rotation=0, horizontalalignment="right")
    add_value_labels(ax)
    return counts, ax


#Step 9.1. Plot Histogram of numeric Columns--------------------------------------

dataset2 = dataset[['gender', 'SeniorCitizen', 'Partner', 'Dependents',
                    'tenure', 'PhoneService', 'PaperlessBilling',
                    'MonthlyCharges', 'TotalCharges']]

#Histogram:

fig = plt.figure(figsize=(15, 12))
plt.suptitle('Histograms of Numerical Columns\n', horizontalalignment="center",
             fontstyle="normal", fontsize=24, fontfamily="sans-serif")
for i, column_name in enumerate(dataset2.columns, start=1):
    plt.subplot(6, 3, i)
    plt.gca().set_title(column_name)

    # One bin per distinct value, capped at 100 bins.
    vals = min(dataset2[column_name].nunique(dropna=False), 100)

    plt.hist(dataset2[column_name], bins=vals, color = '#ec838a')
plt.tight_layout(rect=[0, 0.03, 1, 0.95])


#Step 9.2. Analyze distribution of Key Categorical Variables---------------------------------------------

#(1) Distribution of Contract Type

contract_split, ax = _plot_customer_counts(
    dataset, "Contract", 'Customers by Contract Type', '\n Contract Type', (12, 7))
x_labels = np.array(contract_split[["No. of customers"]])


#(2) Distribution of Payment Method Type

# The x-axis label used to read 'Contract Type' (copy-paste from chart (1));
# this chart groups customers by payment method.
payment_method_split, ax = _plot_customer_counts(
    dataset, "PaymentMethod", 'Customers by Payment Method', '\n Payment Method', (15, 10))
x_labels = np.array(payment_method_split[["No. of customers"]])


#(3) Distribution of various Label Encoded Categorical Variables

services = ['PhoneService','MultipleLines','InternetService','OnlineSecurity',
            'OnlineBackup','DeviceProtection','TechSupport','StreamingTV','StreamingMovies']

# One colour per grid column; the 3x3 grid is filled column by column.
_grid_column_colors = ['#f3babc', '#9b9c9a', '#ec838a']

fig, axes = plt.subplots(nrows = 3,ncols = 3,figsize = (15,12))
for i, item in enumerate(services):
    grid_row, grid_col = i % 3, i // 3
    ax = dataset[item].value_counts().plot(kind='bar', ax=axes[grid_row, grid_col],
                                           rot=0, color=_grid_column_colors[grid_col])
    ax.set_title(item)
#Step 9.3: Analyze Churn Rate by Categorical variables: -------------------------------------------------------------

import matplotlib.ticker as mtick

# Keyword arguments shared by every axis label / title in this section.
_LABEL_STYLE = dict(horizontalalignment="center", fontstyle="normal",
                    fontsize="large", fontfamily="sans-serif")
_TITLE_STYLE = dict(horizontalalignment="center", fontstyle="normal",
                    fontsize="22", fontfamily="sans-serif")


def add_value_labels(ax, spacing=5):
    """Annotate every bar of *ax* with its height as a percentage ("12.3%").

    ax      -- matplotlib Axes whose ``patches`` are the bars to label.
    spacing -- vertical offset, in points, between the bar end and the label;
               the offset is mirrored for bars with a negative height.
    """
    for rect in ax.patches:
        y_value = rect.get_height()
        x_value = rect.get_x() + rect.get_width() / 2
        space = spacing
        va = 'bottom'
        if y_value < 0:
            space *= -1
            va = 'top'

        label = "{:.1f}%".format(y_value)
        ax.annotate(
            label,
            (x_value, y_value),
            xytext=(0, space),
            textcoords="offset points",
            ha='center',
            va=va)


def _plot_stacked_share(counts, colors, chart_title, legend_loc):
    """Draw *counts* as a 100%-stacked bar chart (one bar per row).

    Each row of *counts* is converted to percentages of its row total and
    every segment is labelled with its share. Returns the Axes.
    """
    # Row-wise percentages: multiply first, then divide by the row totals.
    shares = (counts * 100.0).div(counts.sum(axis=1), axis=0)
    ax = shares.plot(kind='bar', width=0.3, stacked=True, rot=0,
                     figsize=(12, 7), color=colors)

    plt.ylabel('Proportion of Customers\n', **_LABEL_STYLE)
    plt.xlabel('Contract Type\n', **_LABEL_STYLE)
    plt.title(chart_title, **_TITLE_STYLE)
    plt.legend(loc=legend_loc, fontsize = "medium")
    plt.xticks(rotation=0, horizontalalignment="center")
    plt.yticks(rotation=0, horizontalalignment="right")
    ax.yaxis.set_major_formatter(mtick.PercentFormatter())
    # Write each segment's percentage at the centre of the segment.
    for p in ax.patches:
        width, height = p.get_width(), p.get_height()
        x, y = p.get_xy()
        ax.text(x+width/2,
                y+height/2,
                '{:.1f}%'.format(height),
                horizontalalignment='center',
                verticalalignment='center')
    ax.autoscale(enable=False, axis='both', tight=False)
    return ax


#(1) Overall Churn Rate------------------------------------------------------------------------------------------

# Work on an explicit copy so that adding a column does not write into a
# slice of ``dataset`` (pandas SettingWithCopyWarning).
churn_rate = dataset[["Churn", "customerID"]].copy()
churn_rate["churn_label"] = np.where(churn_rate["Churn"] == 0, "No", "Yes")
sectors = churn_rate.groupby("churn_label")
churn_rate = pd.DataFrame(sectors["customerID"].count())
churn_rate["Churn Rate"] = (churn_rate["customerID"] / sum(churn_rate["customerID"])) * 100
ax = churn_rate[["Churn Rate"]].plot.bar(
    title='Overall Churn Rate', legend=True, table=False, grid=False,
    subplots=False, figsize=(12, 7), color='#ec838a', fontsize=15,
    stacked=False, ylim=(0, 100))

plt.ylabel('Proportion of Customers', **_LABEL_STYLE)
plt.xlabel('Churn', **_LABEL_STYLE)
plt.title('Overall Churn Rate \n', **_TITLE_STYLE)
# 'top right' is not a valid matplotlib legend location; use 'upper right'.
plt.legend(loc='upper right', fontsize = "medium")
plt.xticks(rotation=0, horizontalalignment="center")
plt.yticks(rotation=0, horizontalalignment="right")
ax.yaxis.set_major_formatter(mtick.PercentFormatter())
x_labels = np.array(churn_rate[["customerID"]])

add_value_labels(ax)
ax.autoscale(enable=False, axis='both', tight=False)


#(2) Churn Rate by Contract Type ----------------------------------------------------------------------------------------

contract_churn = dataset.groupby(['Contract','Churn']).size().unstack()

# Churn was label-encoded earlier in the script, so the unstacked columns are 0 and 1.
contract_churn.rename(columns={0:'No', 1:'Yes'}, inplace=True)

colors = ['#ec838a','#9b9c9a']

ax = _plot_stacked_share(contract_churn, colors,
                         'Churn Rate by Contract type \n', 'upper right')


#(3) Churn Rate by Payment Method Type----------------------------------------------------------------------------------------

# NOTE(review): despite the heading and the chart title, this groups by
# Contract and PaymentMethod, so the chart shows the payment-method mix per
# contract type and never looks at Churn. The rename below is therefore a
# no-op (the columns are payment-method names, not 0/1). Plotted data is
# left unchanged - confirm whether groupby(['PaymentMethod','Churn']) was
# intended.
contract_churn = dataset.groupby(['Contract','PaymentMethod']).size().unstack()

contract_churn.rename(columns={0:'No', 1:'Yes'}, inplace=True)

colors = ['#ec838a','#9b9c9a', '#f3babc' , '#4d4f4c']

ax = _plot_stacked_share(contract_churn, colors,
                         'Churn Rate by Payment Method \n', 'best')


# Step 9.4.
# (Step 9.4, continued) Find positive and negative correlations with the Response Variable

dataset2 = dataset[['SeniorCitizen', 'Partner', 'Dependents',
                    'tenure', 'PhoneService', 'PaperlessBilling',
                    'MonthlyCharges', 'TotalCharges']]

# Correlation of every selected feature with the target; entries equal to 1
# (perfect correlation) are dropped.
correlations = dataset2.corrwith(dataset.Churn)
correlations = correlations[correlations!=1]
positive_correlations = correlations[correlations > 0].sort_values(ascending=False)
# Ascending order lists the strongest negative correlation first, matching
# the "Most Negative" heading (it used to be sorted the other way round).
negative_correlations = correlations[correlations < 0].sort_values(ascending=True)

print('Most Positive Correlations: \n', positive_correlations)
print('\nMost Negative Correlations: \n', negative_correlations)


# Step 9.4 (continued; was mislabelled "Step 10.4"): plot positive & negative
# correlation with the Response Variable. ``correlations`` from above is
# reused instead of being recomputed.

correlations.plot.bar(
        figsize = (18, 10), fontsize = 15, color = '#ec838a',
        rot = 45, grid = True)

plt.title('Correlation with Churn Rate \n', horizontalalignment="center",
          fontstyle="normal", fontsize="22", fontfamily="sans-serif")


#Step 9.5. Plot Correlation Matrix of all independent variables------------------------

## Set and compute the Correlation Matrix
sn.set(style="white")
corr = dataset2.corr()

# Mask for the upper triangle (diagonal included). The builtin ``bool`` is
# used because the ``np.bool`` alias was removed in NumPy 1.24.
mask = np.triu(np.ones_like(corr, dtype=bool))

# Set up the matplotlib figure and a diverging colormap
f, ax = plt.subplots(figsize=(18, 15))
cmap = sn.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sn.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
           square=True, linewidths=.5, cbar_kws={"shrink": .5})


#Step 9.6: Check Multicolinearity using VIF----------------------------------------------

def calc_vif(X):
    """Return a DataFrame with the variance inflation factor of each column.

    X -- DataFrame of numeric features.

    The result has one row per column of *X*, with the column name under
    "variables" and its VIF under "VIF".
    """
    feature_matrix = X.values  # extracted once instead of once per column
    vif = pd.DataFrame()
    vif["variables"] = X.columns
    vif["VIF"] = [variance_inflation_factor(feature_matrix, i)
                  for i in range(X.shape[1])]

    return vif

dataset2 = dataset[['gender', 'SeniorCitizen', 'Partner', 'Dependents',
                    'tenure', 'PhoneService', 'PaperlessBilling','MonthlyCharges','TotalCharges']]

calc_vif(dataset2)

#Total Charges seem to be colinear with Monthly Charges.
# Visual check of the collinearity between the two charge columns.

charge_columns = ['MonthlyCharges', 'TotalCharges']
dataset2[charge_columns].plot.scatter(x='MonthlyCharges', y='TotalCharges',
                                      figsize=(15, 10), color='#ec838a')

plt.title('Co-linearity of Monthly Charges and Total Charges \n',
          horizontalalignment="center", fontstyle="normal",
          fontsize="22", fontfamily="sans-serif")

# Remove TotalCharges from the working subset ...

dataset2 = dataset2.drop("TotalCharges", axis=1)

# ... and recompute the VIF table on the remaining features.

vif_features = ['gender', 'SeniorCitizen', 'Partner', 'Dependents',
                'tenure', 'PhoneService', 'PaperlessBilling', 'MonthlyCharges']
dataset2 = dataset[vif_features]

calc_vif(dataset2)

# Drop the column from the main dataset as well.

dataset = dataset.drop("TotalCharges", axis=1)


#Step 10: Encode Categorical data----------------------------------------------------------------

# customerID is set aside first so that it is not expanded into dummy columns.

identity = dataset["customerID"]
dataset = dataset.drop("customerID", axis=1)

# One-hot encode the remaining categorical columns.

dataset = pd.get_dummies(dataset)

# Re-attach the identifier as the last column.

dataset = pd.concat([dataset, identity], axis=1)


#Step 11: Split dataset into dependent and independent variables-----------------------------------

# Churn is the response; everything else stays in ``dataset`` as features.

response = dataset["Churn"]
dataset = dataset.drop("Churn", axis=1)


#Step 12: Generate training and test datasets of dependent and independent variables-----------------

# stratify=response is passed so the split is stratified on the Churn labels.
X_train, X_test, y_train, y_test = train_test_split(
    dataset,
    response,
    test_size=0.2,
    random_state=0,
    stratify=response,
)
print("Number transactions X_train dataset: ", X_train.shape)
print("Number transactions y_train dataset: ", y_train.shape)
print("Number transactions X_test dataset: ", X_test.shape)
print("Number transactions y_test dataset: ", y_test.shape)


# Step 13: Removing Identifiers-------------------------------------------------------------------

# Keep the customer IDs of each split aside; they are not model features.
train_identity = X_train['customerID']
X_train = X_train.drop(columns = ['customerID'])

test_identity = X_test['customerID']
X_test = X_test.drop(columns = ['customerID'])


# Step 14: Feature Scaling-----------------------------------------------------------------------

# The scaler is fitted on the training split only and then applied to the
# test split; column names and row index are restored on both frames.
sc_X = StandardScaler()
X_train2 = pd.DataFrame(sc_X.fit_transform(X_train),
                        columns=X_train.columns.values,
                        index=X_train.index.values)
X_train = X_train2

X_test2 = pd.DataFrame(sc_X.transform(X_test),
                       columns=X_test.columns.values,
                       index=X_test.index.values)
X_test = X_test2


#----------------------------------------------------------------------------------------
#-----------------Section C: Model Selection------------------------------------------
#----------------------------------------------------------------------------------------

#Step 15.1: Compare Baseline Classification Algorithms - First Iteration
#Using Accuracy and ROC AUC Mean Metrics

models = [
    ('Logistic Regression', LogisticRegression(solver='liblinear', random_state = 0,
                                               class_weight='balanced')),
    ('SVC', SVC(kernel = 'linear', random_state = 0)),
    ('Kernel SVM', SVC(kernel = 'rbf', random_state = 0)),
    ('KNN', KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)),
    ('Gaussian NB', GaussianNB()),
    ('Decision Tree Classifier',
     DecisionTreeClassifier(criterion = 'entropy', random_state = 0)),
    ('Random Forest', RandomForestClassifier(
        n_estimators=100, criterion = 'entropy', random_state = 0)),
]


#Evaluating Model Results:

acc_results = []
auc_results = []
names = []
# column layout of the performance table
col = ['Algorithm', 'ROC AUC Mean', 'ROC AUC STD',
       'Accuracy Mean', 'Accuracy STD']

# 10-fold cross-validation. The splitter does not shuffle, so random_state
# must not be passed: it has no effect without shuffle=True and scikit-learn
# >= 0.24 raises ValueError for that combination. The folds are the same
# consecutive, unshuffled folds as before. The splitter is built once
# because it does not depend on the model.
kfold = model_selection.KFold(n_splits=10)

summary_rows = []
for i, (name, model) in enumerate(models):
    cv_acc_results = model_selection.cross_val_score(  # accuracy scoring
        model, X_train, y_train, cv=kfold, scoring='accuracy')

    cv_auc_results = model_selection.cross_val_score(  # roc_auc scoring
        model, X_train, y_train, cv=kfold, scoring='roc_auc')

    acc_results.append(cv_acc_results)
    auc_results.append(cv_auc_results)
    names.append(name)
    # Mean and standard deviation over the folds, as percentages.
    summary_rows.append([name,
                         round(cv_auc_results.mean()*100, 2),
                         round(cv_auc_results.std()*100, 2),
                         round(cv_acc_results.mean()*100, 2),
                         round(cv_acc_results.std()*100, 2)
                         ])

model_results = pd.DataFrame(summary_rows, columns=col)

model_results.sort_values(by=['ROC AUC Mean'], ascending=False)


#Step 15.2.
# Visualize Classification Algorithms Accuracy Comparisons

# One box plot per metric, each showing the spread of the per-fold
# cross-validation scores of every baseline model: accuracy first, then
# area under the ROC curve.
_comparison_charts = (
    ('Accuracy Score Comparison \n', acc_results),
    ('ROC AUC Comparison \n', auc_results),
)

for chart_title, fold_scores in _comparison_charts:
    fig = plt.figure(figsize=(15, 7))
    ax = fig.add_subplot(111)
    plt.boxplot(fold_scores)
    ax.set_xticklabels(names)

    plt.title(chart_title, horizontalalignment="center", fontstyle="normal",
              fontsize="22", fontfamily="sans-serif")
    plt.xticks(rotation=0, horizontalalignment="center")
    plt.yticks(rotation=0, horizontalalignment="right")

    plt.show()


#------------------------------------------------------------------------------------------
#Compare Baseline Classification Algorithms - Second Iteration
#Using Accuracy, Precision, Recall, F1 and F2 Score Metrics
#-------------------------------------------------------------------------------------------


#Step 15.3. Get the right parameters for the baseline models:---------------------------

def _plot_score_curve(param_values, scores, chart_title):
    """Plot *scores* (y-axis) against the tried *param_values* (x-axis).

    Returns the Figure that was created.
    """
    fig = plt.figure(figsize=(15, 7))
    plt.plot(param_values, scores, color = '#ec838a')

    label_style = dict(horizontalalignment="center", fontstyle="normal",
                       fontsize="large", fontfamily="sans-serif")
    # The two labels used to be swapped: the score is on the y-axis and the
    # range of tried parameter values is on the x-axis.
    plt.ylabel('Score\n', **label_style)
    plt.xlabel('Range\n', **label_style)
    plt.title(chart_title, horizontalalignment="center", fontstyle="normal",
              fontsize="22", fontfamily="sans-serif")
    plt.xticks(rotation=0, horizontalalignment="center")
    plt.yticks(rotation=0, horizontalalignment="right")

    plt.show()
    return fig


# NOTE(review): both searches below score each candidate on the held-out
# test set, so the chosen hyper-parameters have seen the test data and the
# test metrics reported later are optimistic. Left unchanged because the
# values 22 and 72 are hard-coded further down; consider cross-validating on
# the training split instead.

#Identify optimal number of K neighbors for KNN Model:

score_array = []
for each in range(1,25):
    knn_loop = KNeighborsClassifier(n_neighbors = each)  # try K = each
    knn_loop.fit(X_train,y_train)
    score_array.append(knn_loop.score(X_test,y_test))

fig = _plot_score_curve(range(1,25), score_array, 'Optimal Number of K Neighbors \n')

#optimal number of K neigbors = 22

#Identify optimal number of trees for Random Forest Model:

score_array = []
for each in range(1,100):
    rf_loop = RandomForestClassifier(n_estimators = each, random_state = 1)
    rf_loop.fit(X_train,y_train)
    score_array.append(rf_loop.score(X_test,y_test))

fig = _plot_score_curve(range(1,100), score_array,
                        'Optimal Number of Trees for Random Forest Model \n')

#Optimal number of decision trees = 72


#Step 15.4. Compare Baseline Classification Algorithms - Second Iteration-----------------------------

#--Step 15.4.1. Logistic Regression-----------------

# Fitting Logistic Regression to the Training set
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

#Evaluate results

acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
f2 = fbeta_score(y_test, y_pred, beta=2.0)  # beta=2 weights recall higher than precision

# First row of the comparison table; later steps add one row per model.
results = pd.DataFrame([['Logistic Regression', acc, prec, rec, f1, f2]],
                       columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score', 'F2 Score'])


#Step 15.4.2.
# Step 15.4.2. Support Vector Machine (linear classifier)------------------------
# Step 15.4.3. K-Nearest Neighbours------------------------

# Each (label, estimator) pair is fitted on the training set, scored on the
# test set and added as one row to the `results` comparison table.
for model_name, classifier in [
        ('SVM (Linear)', SVC(kernel='linear', random_state=0)),
        ('K-Nearest Neighbours',
         KNeighborsClassifier(n_neighbors=22, metric='minkowski', p=2)),
]:
    # Fitting the model to the Training set
    classifier.fit(X_train, y_train)

    # Predicting the Test set results
    y_pred = classifier.predict(X_test)

    # Evaluate results
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    f2 = fbeta_score(y_test, y_pred, beta=2.0)

    model_results = pd.DataFrame(
        [[model_name, acc, prec, rec, f1, f2]],
        columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score', 'F2 Score'])

    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
    results = pd.concat([results, model_results], ignore_index=True)
# Step 15.4.4. Kernel SVM------------------------
# Step 15.4.5. Naive Bayes------------------------------------------------

# Each (label, estimator) pair is fitted on the training set, scored on the
# test set and added as one row to the `results` comparison table.
# The 'Naive Byes' label is kept as-is because it is the value written into
# the results table.
for model_name, classifier in [
        ('Kernel SVM', SVC(kernel='rbf', random_state=0)),
        ('Naive Byes', GaussianNB()),
]:
    # Fitting the model to the Training set
    classifier.fit(X_train, y_train)

    # Predicting the Test set results
    y_pred = classifier.predict(X_test)

    # Evaluate results
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    f2 = fbeta_score(y_test, y_pred, beta=2.0)

    model_results = pd.DataFrame(
        [[model_name, acc, prec, rec, f1, f2]],
        columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score', 'F2 Score'])

    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
    results = pd.concat([results, model_results], ignore_index=True)
# Step 15.4.6. Decision Tree---------------------------------------------
# Step 15.4.7. Random Forest--------------------------------------------

# Kept from the original script, which imported these metrics at this point.
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score

# Each (label, estimator) pair is fitted on the training set, scored on the
# test set and added as one row to the `results` comparison table.
for model_name, classifier in [
        ('Decision Tree',
         DecisionTreeClassifier(criterion='entropy', random_state=0)),
        ('Random Forest',
         RandomForestClassifier(n_estimators=72, criterion='entropy', random_state=0)),
]:
    # Fitting the model to the Training set
    classifier.fit(X_train, y_train)

    # Predicting the Test set results
    y_pred = classifier.predict(X_test)

    # Evaluate results
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    f2 = fbeta_score(y_test, y_pred, beta=2.0)

    model_results = pd.DataFrame(
        [[model_name, acc, prec, rec, f1, f2]],
        columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score', 'F2 Score'])

    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
    results = pd.concat([results, model_results], ignore_index=True)
# Step 15.5. Visualize the results and compare the baseline algorithms----------------------------------

# =======================================================================================================================
# Sort results based on the right classification metric:
# (Accuracy / ROC_AUC / Precision / Recall / F1 / F2 scores)
#
# Since we have class imbalance and, from the business point of view, false
# negatives are costly, we keep an eye on the Precision, Recall & F2 score
# besides accuracy.
# =======================================================================================================================

results = results.sort_values(["Precision", "Recall", "F2 Score"], ascending=False)

print(results)


#----------------------------------------------------------------------------------------
#-----------------Section D: Model Evaluation (Logistic Regression)----------------------
#----------------------------------------------------------------------------------------


# Step 16: Train & evaluate Chosen Model---------------------------------------------

# Fit Logistic Regression on the Training dataset:
classifier = LogisticRegression(random_state=0, penalty='l2')
classifier.fit(X_train, y_train)

# Predict the Test set results
y_pred = classifier.predict(X_test)

# Evaluate Model Results on Test Set:
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
f2 = fbeta_score(y_test, y_pred, beta=2.0)

results = pd.DataFrame(
    [['Logistic Regression', acc, prec, rec, f1, f2]],
    columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score', 'F2 Score'])

print(results)


# Re-check k-Fold Cross Validation:
accuracies = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=10)
print("Logistic Regression Classifier Accuracy: %0.2f (+/- %0.2f)" % (accuracies.mean(), accuracies.std() * 2))


# Visualize results on a Confusion Matrix:
cm = confusion_matrix(y_test, y_pred)
df_cm = pd.DataFrame(cm, index=(0, 1), columns=(0, 1))

# A single figure is created here; the extra empty plt.figure() that used to
# precede it only produced a blank canvas.
fig, ax = plt.subplots()
sn.set(font_scale=1.4)
sn.heatmap(df_cm, annot=True, fmt='g', ax=ax)
class_names = [0, 1]
# Heatmap cells are centred at 0.5, 1.5, ... so the tick labels go there
# rather than on the cell edges.
tick_marks = np.arange(len(class_names)) + 0.5
plt.tight_layout()
plt.title('Confusion matrix\n', y=1.1)
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)
ax.xaxis.set_label_position("top")
plt.ylabel('Actual label\n')
plt.xlabel('Predicted label\n')


# Evaluate the model using ROC Graph

# Imported locally so this section does not depend on the file-level imports.
from sklearn.metrics import roc_auc_score

# Probability of the positive class, reused for both the curve and its area.
probs = classifier.predict_proba(X_test)[:, 1]
# The legend reports the area under the ROC curve, so it must come from the
# predicted probabilities (it was previously the plain accuracy).
classifier_roc_auc = roc_auc_score(y_test, probs)

rf_fpr, rf_tpr, rf_thresholds = roc_curve(y_test, probs)
plt.figure(figsize=(14, 6))

# Plot Logistic Regression ROC
plt.plot(rf_fpr, rf_tpr, label='Logistic Regression (area = %0.2f)' % classifier_roc_auc)
# Plot Base Rate ROC: 'k--' is the line format (black dashed), not part of the label.
plt.plot([0, 1], [0, 1], 'k--', label='Base Rate')

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])

plt.ylabel('True Positive Rate \n', horizontalalignment="center", fontstyle="normal",
           fontsize="medium", fontfamily="sans-serif")
plt.xlabel('\nFalse Positive Rate \n', horizontalalignment="center", fontstyle="normal",
           fontsize="medium", fontfamily="sans-serif")
plt.title('ROC Graph \n', horizontalalignment="center", fontstyle="normal",
          fontsize="22", fontfamily="sans-serif")
plt.legend(loc="lower right", fontsize="medium")
plt.xticks(rotation=0, horizontalalignment="center")
plt.yticks(rotation=0, horizontalalignment="right")

plt.show()


# Step 17: Predict Feature Importance------------------------------------------------------

# Analyzing Coefficients
# NOTE(review): assumes the columns of `dataset` minus 'customerID' are in the
# same order as the features the classifier was trained on -- TODO confirm.
feature_importances = pd.concat(
    [pd.DataFrame(dataset.drop(columns='customerID').columns, columns=["features"]),
     pd.DataFrame(np.transpose(classifier.coef_), columns=["coef"])],
    axis=1)

# Print the sorted view so it is visible when run as a script; the frame
# itself keeps its original row order.
print(feature_importances.sort_values("coef", ascending=False))


#----------------------------------------------------------------------------------------
#-----------------Section E: Model Improvement (Logistic Regression)----------------------------
#----------------------------------------------------------------------------------------


# Step 18: Hyper parameter Tuning --------------------------------------

import time

# Round 1: -----------------------------------------------------------------

# Select Regularization Method
penalty = ['l1', 'l2']

# Create regularization hyperparameter space
C = [0.001, 0.01, 0.1, 1, 10, 100, 1000]

# Combine Parameters
parameters = dict(C=C, penalty=penalty)

# The default 'lbfgs' solver does not support the l1 penalty, so those grid
# candidates would fail to fit; 'liblinear' handles both l1 and l2.
lr_classifier = GridSearchCV(estimator=LogisticRegression(random_state=0, solver='liblinear'),
                             param_grid=parameters,
                             scoring="balanced_accuracy",
                             cv=10,
                             n_jobs=-1)
t0 = time.time()
lr_classifier = lr_classifier.fit(X_train, y_train)
t1 = time.time()
print("Took %0.2f seconds" % (t1 - t0))

lr_best_accuracy = lr_classifier.best_score_
lr_best_parameters = lr_classifier.best_params_
print(lr_best_accuracy, lr_best_parameters)

# verdict: No accuracy lift post hyperparameter tuning (round1)

# Round 2: -----------------------------------------------------------------

# Select Regularization Method
penalty = ['l2']

# Create regularization hyperparameter space
C = [0.0001, 0.001, 0.01, 0.02, 0.05]

# Combine Parameters
parameters = dict(C=C, penalty=penalty)

lr_classifier = GridSearchCV(estimator=classifier,
                             param_grid=parameters,
                             scoring="balanced_accuracy",
                             cv=10,
                             n_jobs=-1)
t0 = time.time()
lr_classifier = lr_classifier.fit(X_train, y_train)
t1 = time.time()
print("Took %0.2f seconds" % (t1 - t0))

lr_best_accuracy = lr_classifier.best_score_
lr_best_parameters = lr_classifier.best_params_
print(lr_best_accuracy, lr_best_parameters)

# verdict: No accuracy lift post hyperparameter tuning (round2)

# Step 18.3: Final Hyper parameter tuning and selection --------------------------------------

lr_classifier = LogisticRegression(random_state=0, penalty='l2')
lr_classifier.fit(X_train, y_train)

# Predict the Test set results
y_pred = lr_classifier.predict(X_test)

# Probability score of the positive class
y_pred_probs = lr_classifier.predict_proba(X_test)
y_pred_probs = y_pred_probs[:, 1]

#----------------------------------------------------------------------------------------
#-----------------Section F: Comparing Model Predictions against test set----------------
#----------------------------------------------------------------------------------------


# Step 19: Compare predictions against test set -------------------------------------------------------
# Revalidate final results with Confusion Matrix:
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Confusion Matrix as a quick Crosstab:
# Both inputs are passed as plain arrays so rows are compared by position.
# Wrapping y_pred in a fresh Series gives it a 0..n-1 index that does not
# match y_test's index, and crosstab would align on those labels.
print(pd.crosstab(np.asarray(y_test), np.asarray(y_pred),
                  rownames=['ACTUAL'], colnames=['PRED']))

# Visualize Confusion Matrix (reuses `cm` computed above):
df_cm = pd.DataFrame(cm, index=(0, 1), columns=(0, 1))

# A single figure is created here; the extra empty plt.figure() that used to
# precede it only produced a blank canvas.
fig, ax = plt.subplots()
sn.set(font_scale=1.4)
sn.heatmap(df_cm, annot=True, fmt='g', ax=ax)
class_names = [0, 1]
# Heatmap cells are centred at 0.5, 1.5, ... so the tick labels go there
# rather than on the cell edges.
tick_marks = np.arange(len(class_names)) + 0.5
plt.tight_layout()
plt.title('Confusion matrix\n', y=1.1)
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)
ax.xaxis.set_label_position("top")
plt.ylabel('Actual label\n')
plt.xlabel('Predicted label\n')
print("Test Data Accuracy: %0.4f" % accuracy_score(y_test, y_pred))


# Step 20: Format Final Results:-------------------------------------------------------

# Put the customer identifiers next to the actual labels, then attach the
# model output column by column.
final_results = pd.concat([test_identity, y_test], axis=1).dropna()

final_results['predictions'] = y_pred

# Positive-class probability expressed as a percentage with two decimals.
final_results["propensity_to_convert(%)"] = y_pred_probs
final_results["propensity_to_convert(%)"] = final_results["propensity_to_convert(%)"] * 100
final_results["propensity_to_convert(%)"] = final_results["propensity_to_convert(%)"].round(2)

final_results = final_results[['customerID', 'Churn', 'predictions', 'propensity_to_convert(%)']]

# Decile ranking: rank(method='first') breaks ties so qcut can form 10 bins;
# labels run 10..1, so the highest propensities get Ranking 1.
final_results['Ranking'] = pd.qcut(
    final_results['propensity_to_convert(%)'].rank(method='first'),
    10,
    labels=range(10, 0, -1))

print(final_results)

#-----------------Section G: Deploy the Model and Predict Future Datapoints---------------
#----------------------------------------------------------------------------------------

# Step 21: Save the model-------------------------------------------------------------------

# The fitted classifier is stored inside a one-element list, so whoever loads
# the file gets a list back and must take its first item.
filename = 'final_model.model'
saved_objects = [lr_classifier]
joblib.dump(saved_objects, filename)