├── Classification Using Pyspark_Home_Quote - v3.ipynb ├── Classification_Using _Pyspark.py ├── Image ├── Decision_Tree.png ├── Decision_Tree_Gini_LogLoss.png ├── Decision_Tree_ROC.png ├── Decision_Tree_confusion_matrix.png ├── Decision_Tree_ev1.png ├── EDA1.jpg ├── EDA2.jpg ├── EDA3.jpg ├── EDA4.jpg ├── EDA5.png ├── EDA6.png ├── Random_Forest.png ├── Random_Forest_Gini_LogLoss.png ├── Random_Forest_ROC.png ├── Random_Forest_confusion_matrix.png ├── Random_Forest_ev1.png ├── call_function_feature_engineering.png ├── call_insignificant_categories_function.jpg ├── callfunction_compare_categorical_variables.jpg ├── check_data.png ├── check_missing_values.png ├── check_missing_values2.png ├── check_missing_values3.png ├── define_categorical_numerical_variables1.png ├── define_categorical_numerical_variables2.png ├── feature_engineering.png ├── feature_engineering2.png ├── function_compare_categorical_variables.jpg ├── gradient_boosting.png ├── gradient_boosting_Gini_LogLoss.png ├── gradient_boosting_ROC.png ├── gradient_boosting_ROC_confusion_matrix.png ├── gradient_boosting_confusion_matrix.png ├── gradient_boosting_ev1.png ├── handle_missing_values.jpg ├── handle_missing_values2.jpg ├── handle_outlier.png ├── handle_outlier2.png ├── handle_outlier3.png ├── hyper_parameter_Random_Forest.png ├── hyper_parameter_tuning_DecisionTree.png ├── hyper_parameter_tuning_GradientBoost.png ├── hyper_parameter_tuning_LogisticRegression.png ├── implement_to_data_test.png ├── implement_to_data_test2.png ├── insignificant_categories_function.jpg ├── insignificant_categories_function3.jpg ├── insignificant_categories_function4.jpg ├── load_dataset_function.png ├── load_libraries.png ├── logistic_regression.png ├── logistic_regression_Gini_LogLoss.png ├── logistic_regression_ROC.png ├── logistic_regression_ROC_confusion_matrix.png ├── logistic_regression_confusion_matrix.png ├── logistic_regression_ev1.png ├── split_data_train.png └── test.txt ├── README.md ├── my_submission.csv ├── my_submission2.csv └── sample_submission.csv /Classification_Using _Pyspark.py: -------------------------------------------------------------------------------- 1 | #Classification Using Pyspark 2 | 3 | #Pyspark Initializasing 4 | # to make pyspark importable as a regular library 5 | import findspark 6 | findspark.init() 7 | 8 | import pyspark 9 | 10 | from pyspark import SparkContext 11 | sc = SparkContext.getOrCreate() 12 | 13 | #initializasing SparkSession for creating Spark DataFrame 14 | from pyspark.sql import SparkSession 15 | spark = SparkSession.builder.getOrCreate() 16 | 17 | 18 | #Load Libraries 19 | # Data Frame spark profiling 20 | from pyspark.sql.types import IntegerType, StringType, DoubleType, ShortType, DecimalType 21 | import pyspark.sql.functions as func 22 | from pyspark.sql.functions import isnull 23 | from pyspark.sql.functions import isnan, when, count, col, round 24 | from pyspark.sql.functions import mean 25 | from pyspark.sql.types import Row 26 | import matplotlib.pyplot as plt 27 | from pyspark.sql.functions import udf 28 | 29 | # Pandas DF operation 30 | import pandas as pd 31 | import numpy as np 32 | import matplotlib.pyplot as plt 33 | import seaborn as sns 34 | from numpy import array 35 | 36 | # Modeling + Evaluation 37 | from pyspark.ml.feature import VectorAssembler, VectorIndexer, OneHotEncoder, StringIndexer 38 | from pyspark.sql.functions import when 39 | from pyspark.sql import functions as F 40 | from pyspark.sql.functions import avg 41 | from pyspark.ml import Pipeline 42 | from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier 43 | from pyspark.ml.classification import DecisionTreeClassifier 44 | from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator 45 | from pyspark.mllib.evaluation import BinaryClassificationMetrics 46 | from pyspark.ml.tuning import CrossValidator, ParamGridBuilder 47 | from sklearn.metrics import roc_curve, auc 48 | from sklearn.metrics import log_loss 49 | from pyspark.sql import Window 50 | from pyspark.sql.functions import rank,sum,col 51 | from pyspark.ml.linalg import Vectors 52 | from pyspark.ml.feature import VectorSlicer 53 | 54 | window = Window.rowsBetween(Window.unboundedPreceding,Window.unboundedFollowing) 55 | 56 | 57 | #Load Data to Spark DataFrame 58 | #Initializing File Type and path for data train 59 | file_type = 'text' 60 | path=r'train.csv' 61 | delimeter=',' 62 | 63 | def load_data(file_type): 64 | """input type of file "text" or "parquet" and Return pyspark dataframe""" 65 | if file_type =="text": # use text as file type input 66 | df = spark.read.option("header", "true") \ 67 | .option("delimeter",delimeter)\ 68 | .option("inferSchema", "true") \ 69 | .csv(path) #path file that you want import 70 | else: 71 | df= spark.read.parquet("example.parquet") #path file that you want import 72 | return df 73 | 74 | #call function load_data 75 | df = load_data(file_type) 76 | 77 | #Initializing File Type and path for data test 78 | file_type = 'text' 79 | path=r'test.csv' 80 | delimeter=',' 81 | 82 | #call function load_data 83 | test_data = load_data(file_type) 84 | 85 | 86 | #Check data 87 | #check type of data train and data test 88 | type(df) 89 | type(test_data) 90 | 91 | #show 5 observation in data train 92 | df.show(5) 93 | 94 | #show 5 observation in data test 95 | test_data.show(5) 96 | 97 | #Print Schema and count number of columns from data train 98 | len(df.columns), df.printSchema() 99 | 100 | #Print Schema and count number of columns from data test 101 | len(test_data.columns), test_data.printSchema() 102 | 103 | #rename Target to 'label in data train 104 | df = df.withColumnRenamed('QuoteConversion_Flag','label') 105 | #rename Id number ('QuoteNumber') to 'Id' in data train 106 | df = df.withColumnRenamed('QuoteNumber','Id') 107 | 108 | #rename Id number ('QuoteNumber') to 'Id' in data test 109 | test_data = test_data.withColumnRenamed('QuoteNumber','Id') 110 | 111 | #drop column Original_Quote_Date from data train 112 | df_final=df.drop('Original_Quote_Date') 113 | 114 | #count number of observation in data train 115 | df_final.count() 116 | 117 | #drop column Original_Quote_Date from data test 118 | test_data=test_data.drop('Original_Quote_Date') 119 | 120 | #calculate percentage of target and save in dataframe called target_percent 121 | target_percent=df_final.groupBy('label').count().sort(col("count").desc())\ 122 | .withColumn('total',sum(col('count')).over(window))\ 123 | .withColumn('Percent',col('count')*100/col('total')) 124 | 125 | #show dataframe terget_percent to check the proportion 126 | target_percent.show() 127 | 128 | 129 | #Define categorical and nummerical variable in df_final (data train) 130 | #Categorical and numerical variable 131 | #just will select string data type 132 | cat_cols = [item[0] for item in df_final.dtypes if item[1].startswith('string')] 133 | print("cat_cols:", cat_cols) 134 | 135 | #just will select integer or double data type 136 | num_cols = [item[0] for item in df_final.dtypes if item[1].startswith('int') | item[1].startswith('double')] 137 | print("num_cols:", num_cols) 138 | 139 | #Select column 'Id' from num_cols 140 | num_id=num_cols.pop(0) 141 | print("num_id:", num_id) 142 | 143 | #save column 'Id' in num_id variable 144 | num_id=[num_id] 145 | #print num_id 146 | print(num_id) 147 | 148 | #Remove column 'label' from numerical columns group 149 | num_cols.remove('label') #label is removed because it's the target to validate the model 150 | 151 | #print num_cols variable 152 | print("num_cols:", num_cols) 153 | 154 | #count number of numerical and categorical columns in data train 155 | len(num_cols), len(cat_cols) 156 | 157 | #Define categorical and nummerical variable in test_data (data test) 158 | #Categorical and numerical variable 159 | #just will select string data type 160 | cat_cols_test = [item[0] for item in test_data.dtypes if item[1].startswith('string')] 161 | print("cat_cols_test:", cat_cols_test) 162 | 163 | #just will select integer or double data type 164 | num_cols_test = [item[0] for item in test_data.dtypes if item[1].startswith('int') | item[1].startswith('double')] 165 | print("num_cols_test:", num_cols_test) 166 | 167 | #Select 'Id' from num_cols_test and save in variable called 'num_id_test' 168 | num_id_test=num_cols_test.pop(0) 169 | print("num_id_test:", num_id_test) 170 | 171 | #save num_id_test to list called 'num_id_test' 172 | num_id_test=[num_id_test] 173 | print(num_id_test) 174 | print(num_cols_test) 175 | 176 | #count observation in data test 177 | test_data.count() 178 | 179 | #count number of numerical and categorical columns in data test 180 | len(num_cols_test), len(cat_cols_test) 181 | 182 | 183 | #Sample data 184 | #define ratio that want to sample 185 | ratio=0.1 #will take 10% from data 186 | 187 | #take 10% sample from data train with replacing false and seed 42 and save in df_sample 188 | df_sample=df_final.sample(False, ratio, 42) 189 | 190 | #count observation from df_sample 191 | df_sample.count() 192 | 193 | #take 10% sample from data test with replacing false and seed 42 and save in test_sample 194 | test_sample=test_data.sample(False, ratio, 42) 195 | 196 | #count observation from test_sample 197 | test_sample.count() 198 | 199 | 200 | #Check Missing Value in data train 201 | #Check Missing Value in Pyspark Dataframe 202 | def count_nulls(c): 203 | """Input pyspark dataframe and return list of columns with missing value and it's total value""" 204 | null_counts = [] #make an empty list to hold our results 205 | for col in c.dtypes: #iterate through the column data types we saw above, e.g. ('C0', 'bigint') 206 | cname = col[0] #splits out the column name, e.g. 'C0' 207 | ctype = col[1] #splits out the column type, e.g. 'bigint' 208 | nulls = c.where( c[cname].isNull()).count() #check count of null in column name 209 | result = tuple([cname, nulls]) #new tuple, (column name, null count) 210 | null_counts.append(result) #put the new tuple in our result list 211 | null_counts=[(x,y) for (x,y) in null_counts if y!=0] #view just columns that have missing values 212 | return null_counts 213 | 214 | #Call function count_nulls and apply it to data train (df_final) 215 | null_counts = count_nulls(df_final) 216 | null_counts 217 | 218 | #From null_counts, we just take information of columns name and save in list "list_cols_miss", like in the script below: 219 | list_cols_miss=[x[0] for x in null_counts] 220 | list_cols_miss 221 | 222 | #Create dataframe which just has list_cols_miss 223 | df_miss= df_final.select(*list_cols_miss) 224 | df_miss.dtypes 225 | 226 | #Define categorical columns and numerical columns which have missing value. 227 | ### for categorical columns 228 | catcolums_miss=[item[0] for item in df_miss.dtypes if item[1].startswith('string')] #will select name of column with string data type 229 | print("catcolums_miss:", catcolums_miss) 230 | 231 | ### for numerical columns 232 | numcolumns_miss = [item[0] for item in df_miss.dtypes if item[1].startswith('int') | item[1].startswith('double')] #will select name of column with integer or double data type 233 | print("numcolumns_miss:", numcolumns_miss) 234 | 235 | #Drop missing value 236 | df_Nomiss=df_final.na.drop() 237 | 238 | #fill missing value in categorical variable with most frequent 239 | for x in catcolums_miss: 240 | mode=df_Nomiss.groupBy(x).count().sort(col("count").desc()).collect()[0][0] #group by based on categories and count each categories and sort descending then take the first value in column 241 | print(x, mode) #print name of columns and it's most categories 242 | df_final = df_final.na.fill({x:mode}) #fill missing value in each columns with most frequent 243 | 244 | #fill missing value in numerical variable with average 245 | for i in numcolumns_miss: 246 | meanvalue = df_final.select(round(mean(i))).collect()[0][0] #calculate average in each numerical column 247 | print(i, meanvalue) #print name of columns and it's average value 248 | df_final=df_final.na.fill({i:meanvalue}) #fill missing value in each columns with it's average value 249 | 250 | #Check Missing value after filling 251 | null_counts = count_nulls(df_final) 252 | null_counts 253 | 254 | 255 | #Check Missing Value in data test 256 | #We will cleansing missing values in pyspark dataframe. 257 | #Call function to count missing values in test_data 258 | null_test= count_nulls(test_data) 259 | null_test 260 | 261 | #take just name of columns that have missing values 262 | list_miss_test=[x[0] for x in null_test] 263 | list_miss_test 264 | 265 | #Create dataframe which just has list_cols_miss 266 | test_miss= test_data.select(*list_miss_test) 267 | 268 | #view data types in df_miss 269 | test_miss.dtypes 270 | 271 | #Define categorical columns and numerical columns which have missing value. 272 | ### for categorical columns 273 | catcolums_miss_test=[item[0] for item in test_miss.dtypes if item[1].startswith('string')] #will select name of column with string data type 274 | print("catcolums_miss_test:", catcolums_miss_test) 275 | 276 | ### for numerical columns 277 | numcolumns_miss_test = [item[0] for item in test_miss.dtypes if item[1].startswith('int') | item[1].startswith('double')] #will select name of column with integer or double data type 278 | print("numcolumns_miss_test:", numcolumns_miss_test) 279 | 280 | #Drop missing value 281 | test_Nomiss=test_data.na.drop() 282 | 283 | #fill missing value in categorical variable with most frequent 284 | for x in catcolums_miss_test: 285 | mode=test_Nomiss.groupBy(x).count().sort(col("count").desc()).collect()[0][0] #group by based on categories and count each categories and sort descending then take the first value in column 286 | print(x, mode) #print name of columns and it's most categories 287 | test_data = test_data.na.fill({x:mode}) #fill missing value in each columns with most frequent 288 | 289 | #fill missing value in numerical variable with average 290 | for i in numcolumns_miss_test: 291 | meanvalue_test = test_data.select(round(mean(i))).collect()[0][0] #calculate average in each numerical column 292 | print(i, meanvalue_test) #print name of columns and it's average value 293 | test_data=test_data.na.fill({i:meanvalue_test}) #fill missing value in each columns with it's average value 294 | 295 | #Check Missing value after filling 296 | %time null_test = count_nulls(test_data) 297 | null_test 298 | 299 | 300 | #Compare categorical columns in df_final and test_data 301 | #Function to check categorical columns in both data train and data test 302 | def check_category2(a1,a2,y): 303 | """input are two dataframe you want to compare categorical variables and the colomn category name""" 304 | print('column:',y) 305 | #distinct1=a1.select([y]).distinct().count() #count distinct column in dataframe1 306 | #distinct2=a2.select([y]).distinct().count() #count distinct column in dataframe2 307 | #if distinct1 == distinct2: 308 | var1=a1.select([y]).distinct() #define distinct category in column in dataframe1 309 | var2=a2.select([y]).distinct() #define distinct category in column in dataframe2 310 | diff2=var2.subtract(var1).collect() #define the different category in dataframe2, return is list 311 | diff2=[r[y] for r in diff2] #just take the values 312 | diff1=var1.subtract(var2).collect() #define the different category in dataframe1, return is list 313 | diff1=[r[y] for r in diff1] #just take the values 314 | if diff1 == diff2: 315 | print('diff2:', diff2) 316 | print('diff1:', diff1) 317 | print('Columns match!!') 318 | else: 319 | if len(diff1)!=0 and len(diff2)==len(diff1): 320 | print('diff2:', diff2) 321 | print('diff1:', diff1) 322 | a2=a2.replace(diff2, diff1, y) #replace the different category in dataframe2 with category in dataframe1 323 | print('Columns match now!!') 324 | else: 325 | if len(diff2)!=len(diff1) and len(diff2)!=0: 326 | print('diff2:', diff2) 327 | print('diff1:', diff1) 328 | dominant1=a1.groupBy(y).count().sort(col("count").desc()).collect()[0][0] 329 | dominant2=a2.groupBy(y).count().sort(col("count").desc()).collect()[0][0] #define category dominant in dataframe2 330 | print('dominant2:', dominant2) 331 | print('dominant1:', dominant1) 332 | a2=a2.replace(diff2, dominant1, y) #replace different category in dataframe2 with dominant category 333 | print('Columns match now!!') 334 | else: 335 | print('diff1:', diff1) 336 | print('diff2:', diff2) 337 | return a2 338 | 339 | #call function to check catgories in data train and test, whether same or not, if not, the different categories will be replaced. 340 | for y in cat_cols_test: 341 | test_data=check_category2(df_final,test_data,y) 342 | 343 | 344 | #EDA 345 | #Check distribution in each variables 346 | #Pyspark dataframe has limitation in visualization. Then to create visualization we have to convert pyspark dataframe to pandas dataframe. 347 | # convert spark dataframe to pandas for visualization 348 | df_pd=df_final.toPandas() 349 | 350 | #Barchart for categorical variable 351 | plt.figure(figsize=(20,10)) 352 | plt.subplot(221) 353 | sns.countplot(x='label', data=df_pd, order=df_pd['label'].value_counts().index) 354 | plt.title('TARGET', fontsize=15) 355 | plt.subplot(222) 356 | sns.countplot(y='Field6', data=df_pd, order=df_pd['Field6'].value_counts().index) 357 | plt.title('Field6', fontsize=15) 358 | plt.subplot(223) 359 | sns.countplot(x='Field12', data=df_pd, order=df_pd['Field12'].value_counts().index) 360 | plt.title('Field12', fontsize=15) 361 | plt.show() 362 | 363 | #Barchart for categorical variable 364 | plt.figure(figsize=(20,10)) 365 | plt.subplot(221) 366 | sns.countplot(y='CoverageField8', data=df_pd, order=df_pd['CoverageField8'].value_counts().index) 367 | plt.title('CoverageField8', fontsize=15) 368 | plt.subplot(222) 369 | sns.countplot(y='CoverageField9', data=df_pd, order=df_pd['CoverageField9'].value_counts().index) 370 | plt.title('CoverageField9', fontsize=15) 371 | plt.subplot(223) 372 | sns.countplot(y='SalesField7', data=df_pd, order=df_pd['SalesField7'].value_counts().index) 373 | plt.title('SalesField7', fontsize=15) 374 | plt.show() 375 | 376 | #Categorical vs Target visualization 377 | pd.crosstab(df_pd['Field6'], df_pd['label'], normalize='index').plot.bar(rot=0, stacked=True, 378 | color=['green', 'red'], figsize=(4,4), title="Field6 VS label") 379 | plt.legend(loc='upper right', bbox_to_anchor=(1.2, 1)) 380 | 381 | pd.crosstab(df_pd['Field12'], df_pd['label'], normalize='index').plot.bar(rot=0, stacked=True, 382 | color=['green', 'red'], figsize=(4,4), title="Field12 VS label") 383 | plt.legend(loc='upper right', bbox_to_anchor=(1.2, 1)) 384 | plt.show() 385 | 386 | #Numerical Variables 387 | #We have 260 numerical variables, and we will plot just some variables. 388 | #density plot Field7 389 | #plt.figure(figsize=(24,5)) 390 | sns.distplot(df_pd['Field7']) 391 | plt.show() 392 | 393 | #Numerical vs Target visualization 394 | #show distribution 'Field7' vs 'label' 395 | #plt.figure(figsize=(20,8)) 396 | sns.kdeplot(df_pd[df_pd["label"]==0]["Field7"], label="0", color="green") 397 | sns.kdeplot(df_pd[df_pd["label"]==1]["Field7"], label="1", color="red") 398 | plt.title("Field7 VS label") 399 | plt.show() 400 | 401 | #Check outlier in numerical variable 402 | df_pd[["Field7"]].boxplot(sym='g-*', grid=True) 403 | plt.show() 404 | 405 | 406 | #Insignificant Categories in Data train 407 | #Define the threshold for insignificant categories 408 | threshold=98 409 | threshold2=0.7 410 | 411 | #function to replace insignificant categories in data train 412 | def replace_cat2(f,cols): 413 | """input are dataframe and categorical variables, replace insignificant categories (percentage <=0.7) with largest number 414 | of catgories and output is new dataframe """ 415 | df_percent=f.groupBy(cols).count().sort(col("count").desc())\ 416 | .withColumn('total',sum(col('count')).over(window))\ 417 | .withColumn('Percent',col('count')*100/col('total')) #calculate the percentage-save in Percent columns from each categories 418 | dominant_cat=df_percent.select(df_percent['Percent']).collect()[0][0] #calculate the highest percentage of category 419 | count_dist=f.select([cols]).distinct().count() #calculate distinct values in that columns 420 | if count_dist > 2 and dominant_cat <= threshold : 421 | print('column:', cols) 422 | cols_names.append(cols) #combine with previous list 423 | replacement=f.groupBy(cols).count().sort(col("count").desc()).collect()[0][0] #define dominant category 424 | print("replacement:",replacement) 425 | replacing.append(replacement) #combine with previous list 426 | insign_cat=df_percent.filter(df_percent['Percent']< threshold2).select(df_percent[cols]).collect() #calculate insignificant categories 427 | insign_cat=[r[cols] for r in insign_cat] #just take the values 428 | category.append(insign_cat) #combine with previous list 429 | print("insign_cat:",insign_cat) 430 | f=f.replace(insign_cat,replacement, cols) #replace insignificant categories with dominant categories 431 | return f 432 | 433 | #call function replacing insignificant categories in data train 434 | replacing=[] 435 | cols_names=[] 436 | category=[] 437 | for cols in cat_cols: 438 | df_final=replace_cat2(df_final,cols) 439 | 440 | #check length in list cols_names, category and replacing 441 | len(cols_names), len(category), len(replacing) 442 | 443 | #Create dataframe of replaced categories 444 | g=spark.createDataFrame(list(zip(cols_names, replacing, category)),['cols_names', 'replacing', 'category']) 445 | g.show(9) 446 | 447 | #Replacing Insignificant Categories in data test 448 | #We already have a dataframe containing any categories that need to be replaced, 449 | #we got it when the process of replacing the insignificant categories in the data train, the data frame is called g. 450 | #Based on those information, insignificant categories on data test will be replaced. 451 | cols_names_list=g.select('cols_names').collect() #select just cols_names from dataframe g 452 | cols_names_list=[r['cols_names'] for r in cols_names_list] #take just the values 453 | 454 | #function to replace insignificant categories in data test 455 | for z in cols_names_list: 456 | print('cols_names:',z) 457 | replacement_cat=g.filter(g['cols_names']== z).select(g['replacing']).collect()[0][0] #select values of replacing columns accoring to z in cols_names 458 | print('replacement_cat:', replacement_cat) 459 | insignificant_cat=g.filter(g['cols_names']== z).select(g['category']).collect()[0][0] #select values of category columns accoring to z in cols_names 460 | print('insignificant_cat:',insignificant_cat) 461 | test_data=test_data.replace(insignificant_cat,replacement_cat, z) #replace insignificant cat with replacement value 462 | 463 | #Handle of outlier in data train 464 | #Calculate Upper&Lower side in pandas dataframe 465 | df_describe=df_pd.describe() 466 | df_describe 467 | 468 | #Calculate Upper&Lower side in pyspark dataframe 469 | #create quantile dataframe 470 | def quantile(e): 471 | """Input is dataframe and return new dataframe with value of quantile from numerical columns""" 472 | percentiles = [0.25, 0.5, 0.75] 473 | quant=spark.createDataFrame(zip(percentiles, *e.approxQuantile(num_cols, percentiles, 0.0)), 474 | ['percentile']+num_cols) #calculate quantile from pyspark dataframe, 0.0 is relativeError, 475 | #The relative target precision to achieve (>= 0). If set to zero, 476 | #the exact quantiles are computed, which could be very expensive 477 | #and aggregate the result with percentiles variable, 478 | #then create pyspark dataframe 479 | return quant 480 | 481 | #call quantile function 482 | %time quantile=quantile(df_sample) 483 | 484 | #function to calculate uppler side 485 | def upper_value(b,c): 486 | """Input is quantile dataframe and name of numerical column and Retrun upper value from the column""" 487 | q1 = b.select(c).collect()[0][0] #select value of q1 from the column 488 | q2 = b.select(c).collect()[1][0] #select value of q2 from the column 489 | q3 = b.select(c).collect()[2][0] #select value of q3 from the column 490 | IQR=q3-q1 #calculate the value of IQR 491 | upper= q3 + (IQR*1.5) #calculate the value of upper side 492 | return upper 493 | 494 | #function to calculate lower side 495 | def lower_value(b,c): 496 | """Input is quantile dataframe and name of numerical column and Retrun lower value from the column""" 497 | q1 = b.select(c).collect()[0][0] #select value of q1 from the column 498 | q2 = b.select(c).collect()[1][0] #select value of q2 from the column 499 | q3 = b.select(c).collect()[2][0] #select value of q3 from the column 500 | IQR=q3-q1 #calculate the value of IQR 501 | lower= q1 - (IQR*1.5) #calculate the value of lower side 502 | return lower 503 | 504 | #function for replacing outlier by upper side 505 | def replce_outlier_up2(d,col, value): 506 | """Input is name of numerical column and it's upper side value""" 507 | d=d.withColumn(col, F.when(d[col] > value , value).otherwise(d[col])) 508 | return d 509 | 510 | #function for replacing outlier with lower side 511 | def replce_outlier_low2(d,col, value): 512 | """Input is name of numerical column and it's lower side value""" 513 | d=d.withColumn(col, F.when(d[col] < value , value).otherwise(d[col])) 514 | return d 515 | 516 | #call function to calculate lower side and replace value under lower side with value lower side at all numerical variables 517 | for i in num_cols: 518 | lower=lower_value(quantile,i) 519 | df_final=replce_outlier_low2(df_final, i, lower) 520 | 521 | #call function to calculate upper side and replace value above upper side with value upper side at all numerical variables 522 | for x in num_cols: 523 | upper=upper_value(quantile,x) 524 | df_final=replce_outlier_up2(df_final, x, upper) 525 | 526 | #Handle of outlier in data test 527 | #create quantile dataframe 528 | def quantile(e): 529 | """Input is dataframe and return new dataframe with value of quantile from numerical columns""" 530 | percentiles = [0.25, 0.5, 0.75] 531 | quant=spark.createDataFrame(zip(percentiles, *e.approxQuantile(num_cols_test, percentiles, 0.0)), 532 | ['percentile']+num_cols_test) #calculate quantile from pyspark dataframe, 0.0 is relativeError, 533 | #The relative target precision to achieve (>= 0). If set to zero, 534 | #the exact quantiles are computed, which could be very expensive 535 | #and aggregate the result with percentiles variable, 536 | #then create pyspark dataframe 537 | return quant 538 | 539 | #call funtion quantile 540 | quantile=quantile(test_sample) 541 | 542 | #call function to calculate lower side and replace value under lower side with value lower side at all numerical variables 543 | for i in num_cols_test: 544 | lower=lower_value(quantile,i) 545 | test_data=replce_outlier_low2(test_data, i, lower) 546 | 547 | #call function to calculate upper side and replace value above upper side with value upper side at all numerical variables 548 | for x in num_cols_test: 549 | upper=upper_value(quantile,x) 550 | test_data=replce_outlier_up2(test_data, x, upper) 551 | 552 | #Feature Engineering 553 | #function to check distinct categories in data train and data test 554 | def check_distinct(a1,a2): 555 | """input are two dataframe that you want to compare categorical variables and the output is 556 | total distinct categories in both dataframe""" 557 | total1=0 558 | total2=0 559 | for y in cat_cols: 560 | distinct1=a1.select([y]).distinct().count() #count distinct column in dataframe1 561 | distinct2=a2.select([y]).distinct().count() #count distinct column in dataframe2 562 | var1=a1.select([y]).distinct().collect() #define distinct category in column in dataframe1 563 | var1=[r[y] for r in var1] 564 | var2=a2.select([y]).distinct().collect() 565 | var2=[r[y] for r in var2] 566 | total1=total1+distinct1 567 | total2=total2+distinct2 568 | return total1, total2 569 | 570 | #function to execute feature engineering 571 | def feature_engineering(a1): 572 | """Function for feature engineering (StringIndexer and OneHotEncoder process)""" 573 | cat_columns_string_vec = [] 574 | for c in cat_cols: 575 | cat_columns_string= c+"_vec" 576 | cat_columns_string_vec.append(cat_columns_string) 577 | stringIndexer = [StringIndexer(inputCol=x, outputCol=x+"_Index") 578 | for x in cat_cols] 579 | #use oneHotEncoder to convert categorical variable to binary 580 | encoder = [OneHotEncoder(inputCol=x+"_Index", outputCol=y) 581 | for x,y in zip(cat_cols, cat_columns_string_vec)] 582 | #create list of stringIndexer and encoder with 2 dimension 583 | tmp = [[i,j] for i,j in zip(stringIndexer, encoder)] 584 | tmp = [i for sublist in tmp for i in sublist] 585 | cols_assember=num_id + num_cols + cat_columns_string_vec 586 | assembler=VectorAssembler(inputCols=cols_assember, outputCol='features') 587 | tmp += [assembler] 588 | pipeline=Pipeline(stages=tmp) 589 | df_final_feat=pipeline.fit(a1).transform(a1) 590 | return df_final_feat 591 | 592 | #fucntion to call fucntion feature_engineering and check_distinct 593 | def Main_feature_engineering(df,df2): 594 | """Function for calling check_distinct and feature_engineering. Then Join data train and data test if distinct categories 595 | between data train and data test not same then do feature engineering, If distinct same will do feature engineering data train 596 | and data test separately""" 597 | dist_total1, dist_total2=check_distinct(df,df2) 598 | if dist_total1!=dist_total2: 599 | Label_df=df.select('Id', 'label') 600 | df_final2=df.drop('label') 601 | all_df =df_final2.union(df2) 602 | all_df_feat=feature_engineering(all_df) 603 | id_train=df.select('Id').collect() 604 | id_train=[r['Id'] for r in id_train] 605 | id_test=df2.select('Id').collect() 606 | id_test=[r['Id'] for r in id_test] 607 | a=all_df_feat.filter(all_df['Id'].isin(id_train)) 608 | b=all_df_feat.filter(all_df['Id'].isin(id_test)) 609 | a=a.join(Label_df, 'Id') 610 | else: 611 | a=feature_engineering(df) 612 | b=feature_engineering(df2) 613 | return a,b 614 | 615 | #call function feature engineering 616 | %time data2, test2=Main_feature_engineering(df_final, test_data) 617 | 618 | #view result of feature engineering in data train 619 | data2.select('Id', 'features').show(5) 620 | 621 | #view result of feature engineering in data test 622 | test2.select('Id', 'features').show(5) 623 | 624 | #Split Data train to train and test 625 | #Split df_final to train and test, train 70% and test 30%. Define seed 24 so the random data that we split will not change. 626 | #we can define seed with any value 627 | data_train, data_test=data2.randomSplit([0.7,0.3], 24) 628 | 629 | 630 | #Modelling & Evaluation 631 | #Logistic Regression 632 | #Create logistic regression model to data train 633 | lr=LogisticRegression(featuresCol='features', labelCol='label') 634 | lr_model = lr.fit(data_train) 635 | 636 | #Transform model to data test 637 | lr_result = lr_model.transform(data_test) 638 | 639 | #view id, label, prediction and probability from result of modelling 640 | lr_result.select('Id', 'label', 'prediction', 'probability').show(5) 641 | 642 | #Logistic Regression Evaluation 643 | #Evaluate model by checking accuracy and AUC value 644 | lr_eval = BinaryClassificationEvaluator(rawPredictionCol="probability", labelCol="label") 645 | lr_eval2= MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="label") 646 | lr_AUC = lr_eval.evaluate(lr_result) 647 | lr_ACC = lr_eval2.evaluate(lr_result, {lr_eval2.metricName:"accuracy"}) 648 | 649 | print("Logistic Regression Performance Measure") 650 | print("Accuracy = %0.2f" % lr_ACC) 651 | print("AUC = %.2f" % lr_AUC) 652 | 653 | #ROC Grafik 654 | #Create ROC grafik from lr_result 655 | PredAndLabels = lr_result.select("probability", "label") 656 | PredAndLabels_collect = PredAndLabels.collect() 657 | PredAndLabels_list = [(float(i[0][0]), 1.0-float(i[1])) for i in PredAndLabels_collect] 658 | PredAndLabels = sc.parallelize(PredAndLabels_list) 659 | 660 | metrics = BinaryClassificationMetrics(PredAndLabels) 661 | 662 | # Area under ROC 663 | print("Logistic Regression Area Under ROC") 664 | print("Area under ROC = %.2f" % metrics.areaUnderROC) 665 | 666 | # Visualization 667 | FPR = dict() # FPR: False Positive Rate 668 | tpr = dict() # TPR: True Positive Rate 669 | roc_auc = dict() 670 | 671 | y_test = [i[1] for i in PredAndLabels_list] 672 | y_score = [i[0] for i in PredAndLabels_list] 673 | 674 | fpr, tpr, _ = roc_curve(y_test, y_score) 675 | roc_auc = auc(fpr, tpr) 676 | 677 | plt.figure(figsize=(5,4)) 678 | plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc) 679 | plt.plot([0, 1], [0, 1], 'k--') 680 | plt.xlim([0.0, 1.0]) 681 | plt.ylim([0.0, 1.05]) 682 | plt.xlabel('False Positive Rate') 683 | plt.ylabel('True Positive Rate') 684 | plt.title('ROC Curve - Logistic Regression') 685 | plt.legend(loc="lower right") 686 | plt.show() 687 | 688 | #confusion Matrix 689 | cm_lr_result = lr_result.crosstab("prediction", "label") 690 | cm_lr_result = cm_lr_result.toPandas() 691 | cm_lr_result 692 | 693 | #calculate Accuracy, Sensitivity, Specificity, Precision 694 | TP = cm_lr_result["1"][0] 695 | FP = cm_lr_result["0"][0] 696 | TN = cm_lr_result["0"][1] 697 | FN = cm_lr_result["1"][1] 698 | Accuracy = (TP+TN)/(TP+FP+TN+FN) 699 | Sensitivity = TP/(TP+FN) 700 | Specificity = TN/(TN+FP) 701 | Precision = TP/(TP+FP) 702 | 703 | print ("Accuracy = %0.2f" %Accuracy ) 704 | print ("Sensitivity = %0.2f" %Sensitivity ) 705 | print ("Specificity = %0.2f" %Specificity ) 706 | print ("Precision = %0.2f" %Precision ) 707 | 708 | #Calculate Gini Coefficient from AUC 709 | AUC = lr_AUC 710 | Gini = (2 * AUC - 1) 711 | print("AUC=%.2f" % AUC) 712 | print("GINI ~=%.2f" % Gini) 713 | 714 | #Calculate Log Loss in pandas dataframe 715 | #Create Dataframe to Calculate Log Loss 716 | y_test= data_test.select('label') 717 | lr_proba=lr_result.select('probability') 718 | 719 | #Convert lr_probaspark dataframe to numpy array 720 | lr_proba= np.array(lr_result.select('probability').collect()) 721 | 722 | #Convert numpy array 3 dimentional to 2 dimentional 723 | lr_proba=lr_proba.reshape(-1, lr_proba.shape[-1]) 724 | 725 | #Convert y_test dataframe to pandas dataframe 726 | y_test=y_test.toPandas() 727 | 728 | #Convert y_test pandas dataframe to pandas series 729 | y_test=pd.Series(y_test['label'].values) 730 | 731 | #Calculate log loss from logistic regression 732 | LogLoss = log_loss(y_test, lr_proba) 733 | 734 | print("Log Loss Linear Regression:%.4f" % LogLoss) 735 | 736 | #Logistic Regression With Hyper-Parameter Tuning 737 | #define logistic regression model 738 | lr_hyper=LogisticRegression(featuresCol='features', labelCol='label') 739 | 740 | 741 | #Hyper-Parameter Tuning 742 | paramGrid_lr = ParamGridBuilder() \ 743 | .addGrid(lr_hyper.regParam, [0.1, 0.01]) \ 744 | .addGrid(lr_hyper.elasticNetParam, [0.8, 0.7]) \ 745 | .build() 746 | crossval_lr = CrossValidator(estimator=lr_hyper, 747 | estimatorParamMaps=paramGrid_lr, 748 | evaluator=BinaryClassificationEvaluator(), 749 | numFolds=3) 750 | #fit model to data train 751 | lr_model_hyper = crossval_lr.fit(data_train) 752 | 753 | #Transform model to data test 754 | lr_result_hyper = lr_model_hyper.transform(data_test) 755 | 756 | #view id, label, prediction and probability from result of modelling 757 | lr_result_hyper.select('Id', 'label', 'prediction', 'probability').show(5) 758 | 759 | #Logistic Regression With Hyper-Parameter Tuning Evaluation 760 | #Evaluate model by checking accuracy and AUC value 761 | lr_hyper_eval = BinaryClassificationEvaluator(rawPredictionCol="probability", labelCol="label") 762 | lr_hyper_eval2= MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="label") 763 | lr_hyper_AUC = lr_hyper_eval.evaluate(lr_result_hyper) 764 | lr_hyper_ACC = lr_hyper_eval2.evaluate(lr_result_hyper, {lr_hyper_eval2.metricName:"accuracy"}) 765 | 766 | print("Logistic Regression Performance Measure") 767 | print("Accuracy = %0.2f" % lr_hyper_ACC) 768 | print("AUC = %.2f" % lr_hyper_AUC) 769 | 770 | #ROC Grafik 771 | PredAndLabels = lr_result_hyper.select("probability", "label") 772 | PredAndLabels_collect = PredAndLabels.collect() 773 | PredAndLabels_list = [(float(i[0][0]), 1.0-float(i[1])) for i in PredAndLabels_collect] 774 | PredAndLabels = sc.parallelize(PredAndLabels_list) 775 | 776 | metrics = BinaryClassificationMetrics(PredAndLabels) 777 | 778 | # Area under ROC 779 | print("Logistic Regression Area Under ROC") 780 | print("Area under ROC = %.2f" % metrics.areaUnderROC) 781 | 782 | # Visualization 783 | FPR = dict() # FPR: False Positive Rate 784 | tpr = dict() # TPR: True Positive Rate 785 | roc_auc = dict() 786 | 787 | y_test = [i[1] for i in PredAndLabels_list] 788 | y_score = [i[0] for i in PredAndLabels_list] 789 | 790 | fpr, tpr, _ = roc_curve(y_test, y_score) 791 | roc_auc = auc(fpr, tpr) 792 | 793 | plt.figure(figsize=(5,4)) 794 | plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc) 795 | plt.plot([0, 1], [0, 1], 'k--') 796 | plt.xlim([0.0, 1.0]) 797 | plt.ylim([0.0, 1.05]) 798 | plt.xlabel('False Positive Rate') 799 | plt.ylabel('True Positive Rate') 800 | plt.title('ROC Curve - Logistic Regression') 801 | plt.legend(loc="lower right") 802 | plt.show() 803 | 804 | #confusion matrix 805 | cm_lr_result_hyper = lr_result_hyper.crosstab("prediction", "label") 806 | cm_lr_result_hyper = cm_lr_result_hyper.toPandas() 807 | cm_lr_result_hyper 808 | 809 | #calculate Accuracy, Sensitivity, Specificity, Precision 810 | TP = cm_lr_result_hyper["1"][0] 811 | FP = cm_lr_result_hyper["0"][0] 812 | TN = cm_lr_result_hyper["0"][1] 813 | FN = cm_lr_result_hyper["1"][1] 814 | Accuracy = (TP+TN)/(TP+FP+TN+FN) 815 | Sensitivity = TP/(TP+FN) 816 | Specificity = TN/(TN+FP) 817 | Precision = TP/(TP+FP) 818 | 819 | print ("Accuracy = %0.2f" %Accuracy ) 820 | print ("Sensitivity = %0.2f" %Sensitivity ) 821 | print ("Specificity = %0.2f" %Specificity ) 822 | print ("Precision = %0.2f" %Precision ) 823 | 824 | #Calculate Gini Coefisient from AUC 825 | AUC = lr_hyper_AUC 826 | Gini_lr_hyper = (2 * AUC - 1) 827 | print("AUC=%.2f" % AUC) 828 | print("GINI ~=%.2f" % Gini_lr_hyper) 829 | 830 | #Calculate Log Loss in pandas dataframe 831 | #Create Dataframe to Calculate Log Loss 832 | y_test= titanic_test.select('label') 833 | lr_hyper_proba=lr_result_hyper.select('probability') 834 | 835 | #Convert lr_probaspark dataframe to numpy array 836 | lr_hyper_proba= np.array(lr_hyper_proba.select('probability').collect()) 837 | 838 | #Convert numpy array 3 dimentional to 2 dimentional 839 | lr_hyper_proba=lr_hyper_proba.reshape(-1, lr_hyper_proba.shape[-1]) 840 | 841 | #Convert y_test dataframe to pandas dataframe 842 | y_test=y_test.toPandas() 843 | 844 | #Convert y_test pandas dataframe to pandas series 845 | y_test=pd.Series(y_test['label'].values) 846 | 847 | #Calculate log loss from logistic regression hyper parameter 848 | LogLoss = log_loss(y_test, lr_hyper_proba) 849 | 850 | print("Log Loss Linear Regression:%.4f" % LogLoss) 851 | 852 | 853 | #Decision Tree 854 | #Create decision tree model to data train 855 | dt=DecisionTreeClassifier(featuresCol = 'features', labelCol = 'label', maxDepth = 3) 856 | dt_model = dt.fit(data_train) 857 | 858 | ##Transform model to data test 859 | dt_result = dt_model.transform(data_test) 860 | 861 | #view id, label, prediction and probability from result of modelling 862 | dt_result.select('Id', 'label', 'prediction', 'probability').show(5) 863 | 864 | # Decision Tree Evaluation 865 | #Evaluate model by calculating accuracy and area under curve (AUC) 866 | dt_eval = BinaryClassificationEvaluator(rawPredictionCol="probability", labelCol="label") 867 | dt_eval2= MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="label") 868 | dt_AUC = dt_eval.evaluate(dt_result) 869 | dt_ACC = dt_eval2.evaluate(dt_result, {dt_eval2.metricName:"accuracy"}) 870 | 871 | print("Decision Tree Performance Measure") 872 | print("Accuracy = %0.2f" % dt_ACC) 873 | print("AUC = %.2f" % dt_AUC) 874 | 875 | #ROC Grafik 876 | PredAndLabels = dt_result.select("probability", "label") 877 | PredAndLabels_collect = PredAndLabels.collect() 878 | PredAndLabels_list = [(float(i[0][0]), 1.0-float(i[1])) for i in PredAndLabels_collect] 879 | PredAndLabels = sc.parallelize(PredAndLabels_list) 880 | 881 | metrics = BinaryClassificationMetrics(PredAndLabels) 882 | 883 | # Area under ROC 884 | print("Decision Tree Area Under ROC") 885 | print("Area under ROC = %.2f" % metrics.areaUnderROC) 886 | 887 | # Visualization 888 | FPR = dict() # FPR: False Positive Rate 889 | tpr = dict() # TPR: True Positive Rate 890 | roc_auc = dict() 891 | 892 | y_test = [i[1] for i in PredAndLabels_list] 893 | y_score = [i[0] for i in PredAndLabels_list] 894 | 895 | fpr, tpr, _ = roc_curve(y_test, y_score) 896 | roc_auc = auc(fpr, tpr) 897 | 898 | plt.figure(figsize=(5,4)) 899 | plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc) 900 | plt.plot([0, 1], [0, 1], 'k--') 901 | plt.xlim([0.0, 1.0]) 902 | plt.ylim([0.0, 1.05]) 903 | plt.xlabel('False Positive Rate') 904 | plt.ylabel('True Positive Rate') 905 | plt.title('ROC Curve - Decision Tree') 906 | plt.legend(loc="lower right") 907 | plt.show() 908 | 909 | #confusion matrix 910 | cm_dt_result = dt_result.crosstab("prediction", "label") 911 | cm_dt_result = cm_dt_result.toPandas() 912 | cm_dt_result 913 | 914 | #calculate accuracy, sensitivity, specificity and precision 915 | TP = cm_dt_result["1"][0] 916 | FP = cm_dt_result["0"][0] 917 | TN = cm_dt_result["0"][1] 918 | FN = cm_dt_result["1"][1] 919 | Accuracy = (TP+TN)/(TP+FP+TN+FN) 920 | Sensitivity = TP/(TP+FN) 921 | Specificity = TN/(TN+FP) 922 | Precision = TP/(TP+FP) 923 | 924 | print ("Accuracy = %0.2f" %Accuracy ) 925 | print ("Sensitivity = %0.2f" %Sensitivity ) 926 | print ("Specificity = %0.2f" %Specificity ) 927 | print ("Precision = %0.2f" %Precision ) 928 | 929 | #Calculate Gini Coeffiecient from AUC 930 | AUC = dt_AUC 931 | Gini_dt = (2 * AUC - 1) 932 | print("AUC=%.2f" % AUC) 933 | print("GINI ~=%.2f" % Gini_dt) 934 | 935 | #Calculate Log Loss in pandas dataframe 936 | #Create Dataframe to Calculate Log Loss 937 | y_test= data_test.select('label') 938 | dt_proba=dt_result.select('probability') 939 | 940 | ##Convert lr_probaspark dataframe to numpy array 941 | dt_proba= np.array(dt_proba.select('probability').collect()) 942 | 943 | #Convert numpy array 3 dimentional to 2 dimentional 944 | dt_proba=dt_proba.reshape(-1, dt_proba.shape[-1]) 945 | 946 | #Convert y_test dataframe to pandas dataframe 947 | y_test=y_test.toPandas() 948 | 949 | #Convert y_test pandas dataframe to pandas series 950 | y_test=pd.Series(y_test['label'].values) 951 | 952 | #Calculate log loss from Decision Tree 953 | LogLoss = log_loss(y_test, dt_proba) 954 | 955 | print("Log Loss Decision Tree:%.4f" % LogLoss) 956 | 957 | #Decision Tree With Hyper-Parameter Tuning 958 | #define decision tree model 959 | dt_hyper=DecisionTreeClassifier(featuresCol = 'features', labelCol = 'label', impurity='gini') 960 | 961 | #Hyper-Parameter Tuning 962 | paramGrid_dt = ParamGridBuilder() \ 963 | .addGrid(dt_hyper.maxDepth, [5, 7]) \ 964 | .addGrid(dt_hyper.maxBins, [10,20]) \ 965 | .build() 966 | crossval_dt = CrossValidator(estimator=dt_hyper, 967 | estimatorParamMaps=paramGrid_dt, 968 | evaluator=BinaryClassificationEvaluator(), 969 | numFolds=5) 970 | #fit model to data train 971 | dt_model_hyper = crossval_dt.fit(data_train) 972 | 973 | #transform model to data test 974 | dt_result_hyper = dt_model_hyper.transform(data_test) 975 | 976 | #view id, label, prediction and probability from result of modelling 977 | dt_result_hyper.select('Id', 'label', 'prediction', 'probability').show(5) 978 | 979 | #Decision Tree With Hyper-Parameter Tuning Evaluation 980 | #Evaluate model by calculating accuracy and area under curve (AUC) 981 | dt_hyper_eval = BinaryClassificationEvaluator(rawPredictionCol="probability", labelCol="label") 982 | dt_hyper_eval2= MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="label") 983 | dt_hyper_AUC = dt_hyper_eval.evaluate(dt_result_hyper) 984 | dt_hyper_ACC = dt_hyper_eval2.evaluate(dt_result_hyper, {dt_hyper_eval2.metricName:"accuracy"}) 985 | 986 | print("Decision Tree Performance Measure") 987 | print("Accuracy = %0.2f" % dt_hyper_ACC) 988 | print("AUC = %.2f" % dt_hyper_AUC) 989 | 990 | #ROC Grafik 991 | PredAndLabels = dt_result_hyper.select("probability", "label") 992 | PredAndLabels_collect = PredAndLabels.collect() 993 | PredAndLabels_list = [(float(i[0][0]), 1.0-float(i[1])) for i in PredAndLabels_collect] 994 | PredAndLabels = sc.parallelize(PredAndLabels_list) 995 | 996 | metrics = BinaryClassificationMetrics(PredAndLabels) 997 | 998 | # Area under ROC 999 | print("Decision Tree Area Under ROC") 1000 | print("Area under ROC = %.2f" % metrics.areaUnderROC) 1001 | 1002 | # Visualization 1003 | FPR = dict() # FPR: False Positive Rate 1004 | tpr = dict() # TPR: True Positive Rate 1005 | roc_auc = dict() 1006 | 1007 | y_test = [i[1] for i in PredAndLabels_list] 1008 | y_score = [i[0] for i in PredAndLabels_list] 1009 | 1010 | fpr, tpr, _ = roc_curve(y_test, y_score) 1011 | roc_auc = auc(fpr, tpr) 1012 | 1013 | plt.figure(figsize=(5,4)) 1014 | plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc) 1015 | plt.plot([0, 1], [0, 1], 'k--') 1016 | plt.xlim([0.0, 1.0]) 1017 | plt.ylim([0.0, 1.05]) 1018 | plt.xlabel('False Positive Rate') 1019 | plt.ylabel('True Positive Rate') 1020 | plt.title('ROC Curve - Decision Tree') 1021 | plt.legend(loc="lower right") 1022 | plt.show() 1023 | 1024 | #Confusion Matrix 1025 | cm_dt_result_hyper = dt_result_hyper.crosstab("prediction", "label") 1026 | cm_dt_result_hyper = cm_dt_result_hyper.toPandas() 1027 | cm_dt_result_hyper 1028 | 1029 | #calculate accuracy, sensitivity, specificity and precision 1030 | TP = cm_dt_result_hyper["1"][0] 1031 | FP = cm_dt_result_hyper["0"][0] 1032 | TN = cm_dt_result_hyper["0"][1] 1033 | FN = cm_dt_result_hyper["1"][1] 1034 | Accuracy = (TP+TN)/(TP+FP+TN+FN) 1035 | Sensitivity = TP/(TP+FN) 1036 | Specificity = TN/(TN+FP) 1037 | Precision = TP/(TP+FP) 1038 | 1039 | print ("Accuracy = %0.2f" %Accuracy ) 1040 | print ("Sensitivity = %0.2f" %Sensitivity ) 1041 | print ("Specificity = %0.2f" %Specificity ) 1042 | print ("Precision = %0.2f" %Precision ) 1043 | 1044 | #Calculate Gini Coefficient from AUC 1045 | AUC = dt_hyper_AUC 1046 | Gini_dt_hyper= (2 * AUC -1) 1047 | 1048 | print("AUC=%.2f" % AUC) 1049 | print("GINI ~=%.2f" % Gini_dt_hyper) 1050 | 1051 | #Calculate Log Loss in pandas dataframe 1052 | #Create Dataframe to Calculate Log Loss 1053 | y_test= data_test.select('label') 1054 | dt_hyper_proba=dt_result_hyper.select('probability') 1055 | 1056 | #Convert lr_probaspark dataframe to numpy array 1057 | dt_hyper_proba= np.array(dt_hyper_proba.select('probability').collect()) 1058 | 1059 | #Convert numpy array 3 dimentional to 2 dimentional 1060 | dt_hyper_proba=dt_hyper_proba.reshape(-1, dt_hyper_proba.shape[-1]) 1061 | 1062 | #Convert y_test dataframe to pandas dataframe 1063 | y_test=y_test.toPandas() 1064 | 1065 | #Convert y_test pandas dataframe to pandas series 1066 | y_test=pd.Series(y_test['label'].values) 1067 | 1068 | #Calculate log loss from Decision Tree hyper parameter 1069 | LogLoss = log_loss(y_test, dt_hyper_proba) 1070 | 1071 | print("Log Loss Decision Tree:%.4f" % LogLoss) 1072 | 1073 | #Random Forest 1074 | #Create decision tree model to data train 1075 | rf = RandomForestClassifier(featuresCol='features', labelCol="label") 1076 | rf_model = rf.fit(data_train) 1077 | 1078 | #transform model to data test 1079 | rf_result = rf_model.transform(data_test) 1080 | 1081 | #view id, label, prediction and probability from result of modelling 1082 | rf_result.select('Id', 'label', 'prediction', 'probability').show(5) 1083 | 1084 | #Random Forest Evaluation 1085 | #Evaluate model by calculatin accuracy and area under curve (AUC) 1086 | rf_eval = BinaryClassificationEvaluator(rawPredictionCol="probability", labelCol="label") 1087 | rf_eval2= MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="label") 1088 | rf_AUC = rf_eval.evaluate(rf_result) 1089 | rf_ACC = rf_eval2.evaluate(rf_result, {rf_eval2.metricName:"accuracy"}) 1090 | 1091 | print("Decision Tree Performance Measure") 1092 | print("Accuracy = %0.2f" % rf_ACC) 1093 | print("AUC = %.2f" % rf_AUC) 1094 | 1095 | #ROC Grafik 1096 | PredAndLabels = rf_result.select("probability", "label") 1097 | PredAndLabels_collect = PredAndLabels.collect() 1098 | PredAndLabels_list = [(float(i[0][0]), 1.0-float(i[1])) for i in PredAndLabels_collect] 1099 | PredAndLabels = sc.parallelize(PredAndLabels_list) 1100 | 1101 | metrics = BinaryClassificationMetrics(PredAndLabels) 1102 | 1103 | # Area under ROC 1104 | print("Random Forest Area Under ROC") 1105 | print("Area under ROC = %.2f" % metrics.areaUnderROC) 1106 | 1107 | # Visualization 1108 | FPR = dict() # FPR: False Positive Rate 1109 | tpr = dict() # TPR: True Positive Rate 1110 | roc_auc = dict() 1111 | 1112 | y_test = [i[1] for i in PredAndLabels_list] 1113 | y_score = [i[0] for i in PredAndLabels_list] 1114 | 1115 | fpr, tpr, _ = roc_curve(y_test, y_score) 1116 | roc_auc = auc(fpr, tpr) 1117 | 1118 | plt.figure(figsize=(5,4)) 1119 | plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc) 1120 | plt.plot([0, 1], [0, 1], 'k--') 1121 | plt.xlim([0.0, 1.0]) 1122 | plt.ylim([0.0, 1.05]) 1123 | plt.xlabel('False Positive Rate') 1124 | plt.ylabel('True Positive Rate') 1125 | plt.title('ROC Curve - Random Forest') 1126 | plt.legend(loc="lower right") 1127 | plt.show() 1128 | 1129 | #Confusion Matrix 1130 | cm_rf_result = rf_result.crosstab("prediction", "label") 1131 | cm_rf_result = cm_rf_result.toPandas() 1132 | cm_rf_result 1133 | 1134 | #calculate accurary,sensitivity, specificity and precision 1135 | TP = cm_rf_result["1"][0] 1136 | FP = cm_rf_result["0"][0] 1137 | TN = cm_rf_result["0"][1] 1138 | FN = cm_rf_result["1"][1] 1139 | Accuracy = (TP+TN)/(TP+FP+TN+FN) 1140 | Sensitivity = TP/(TP+FN) 1141 | Specificity = TN/(TN+FP) 1142 | Precision = TP/(TP+FP) 1143 | 1144 | print ("Accuracy = %0.2f" %Accuracy ) 1145 | print ("Sensitivity = %0.2f" %Sensitivity ) 1146 | print ("Specificity = %0.2f" %Specificity ) 1147 | print ("Precision = %0.2f" %Precision ) 1148 | 1149 | #Calculate Gini Coefficient from AUC 1150 | AUC = rf_AUC 1151 | Gini_rf= (2 * AUC -1) 1152 | 1153 | print("AUC=%.2f" % AUC) 1154 | print("GINI ~=%.2f" % Gini_rf) 1155 | 1156 | #Calculate Log Loss in pandas dataframe 1157 | #Create Dataframe to Calculate Log Loss 1158 | y_test= data_test.select('label') 1159 | rf_proba=rf_result.select('probability') 1160 | 1161 | #Convert rf_probaspark dataframe to numpy array 1162 | rf_proba= np.array(rf_proba.select('probability').collect()) 1163 | 1164 | #Convert numpy array 3 dimentional to 2 dimentional 1165 | rf_proba=rf_proba.reshape(-1, rf_proba.shape[-1]) 1166 | 1167 | #Convert y_test dataframe to pandas dataframe 1168 | y_test=y_test.toPandas() 1169 | 1170 | #Convert y_test pandas dataframe to pandas series 1171 | y_test=pd.Series(y_test['label'].values) 1172 | 1173 | #Calculate log loss from Random Forest 1174 | LogLoss = log_loss(y_test, rf_proba) 1175 | 1176 | print("Log Loss Random Forest:%.4f" % LogLoss) 1177 | 1178 | #Random Forest With Hyper-Parameter 1179 | #define random forest model 1180 | rf_hyper= RandomForestClassifier(featuresCol='features', labelCol="label") 1181 | 1182 | # Hyper-Parameter Tuning 1183 | paramGrid_rf = ParamGridBuilder() \ 1184 | .addGrid(rf_hyper.numTrees, [40, 60, 80, 100]) \ 1185 | .build() 1186 | crossval_rf = CrossValidator(estimator=rf_hyper, 1187 | estimatorParamMaps=paramGrid_rf, 1188 | evaluator=BinaryClassificationEvaluator(), 1189 | numFolds=3) 1190 | #fit model to data train 1191 | rf_model_hyper=crossval_rf.fit(data_train) 1192 | 1193 | #transfrom model to data test 1194 | rf_result_hyper = rf_model_hyper.transform(data_test) 1195 | 1196 | #view id, label, prediction and probability from result of modelling 1197 | rf_result_hyper.select('Id', 'label', 'prediction', 'probability').show(5) 1198 | 1199 | #Random Forest With Hyper-Parameter Evaluation 1200 | #Evaluate model by calculating accuracy and area under curve (AUC) 1201 | rf_hyper_eval = BinaryClassificationEvaluator(rawPredictionCol="probability", labelCol="label") 1202 | rf_hyper_eval2= MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="label") 1203 | rf_hyper_AUC = rf_hyper_eval.evaluate(rf_result_hyper) 1204 | rf_hyper_ACC = rf_hyper_eval2.evaluate(rf_result_hyper, {rf_hyper_eval2.metricName:"accuracy"}) 1205 | 1206 | print("Decision Tree Performance Measure") 1207 | print("Accuracy = %0.2f" % rf_hyper_ACC) 1208 | print("AUC = %.2f" % rf_hyper_AUC) 1209 | 1210 | #ROC Grafik 1211 | PredAndLabels = rf_result_hyper.select("probability", "label") 1212 | PredAndLabels_collect = PredAndLabels.collect() 1213 | PredAndLabels_list = [(float(i[0][0]), 1.0-float(i[1])) for i in PredAndLabels_collect] 1214 | PredAndLabels = sc.parallelize(PredAndLabels_list) 1215 | 1216 | metrics = BinaryClassificationMetrics(PredAndLabels) 1217 | 1218 | # Area under ROC 1219 | print("Random Forest Area Under ROC") 1220 | print("Area under ROC = %.2f" % metrics.areaUnderROC) 1221 | 1222 | # Visualization 1223 | FPR = dict() # FPR: False Positive Rate 1224 | tpr = dict() # TPR: True Positive Rate 1225 | roc_auc = dict() 1226 | 1227 | y_test = [i[1] for i in PredAndLabels_list] 1228 | y_score = [i[0] for i in PredAndLabels_list] 1229 | 1230 | fpr, tpr, _ = roc_curve(y_test, y_score) 1231 | roc_auc = auc(fpr, tpr) 1232 | 1233 | plt.figure(figsize=(5,4)) 1234 | plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc) 1235 | plt.plot([0, 1], [0, 1], 'k--') 1236 | plt.xlim([0.0, 1.0]) 1237 | plt.ylim([0.0, 1.05]) 1238 | plt.xlabel('False Positive Rate') 1239 | plt.ylabel('True Positive Rate') 1240 | plt.title('ROC Curve - Random Forest') 1241 | plt.legend(loc="lower right") 1242 | plt.show() 1243 | 1244 | #Confusion Matrix 1245 | cm_rf_result_hyper = rf_result_hyper.crosstab("prediction", "label") 1246 | cm_rf_result_hyper = cm_rf_result_hyper.toPandas() 1247 | cm_rf_result_hyper 1248 | 1249 | #calculate accuracy, sensitivity, specificity and precision 1250 | TP = cm_rf_result_hyper["1"][0] 1251 | FP = cm_rf_result_hyper["0"][0] 1252 | TN = cm_rf_result_hyper["0"][1] 1253 | FN = cm_rf_result_hyper["1"][1] 1254 | Accuracy = (TP+TN)/(TP+FP+TN+FN) 1255 | Sensitivity = TP/(TP+FN) 1256 | Specificity = TN/(TN+FP) 1257 | Precision = TP/(TP+FP) 1258 | 1259 | print ("Accuracy = %0.2f" %Accuracy ) 1260 | print ("Sensitivity = %0.2f" %Sensitivity ) 1261 | print ("Specificity = %0.2f" %Specificity ) 1262 | print ("Precision = %0.2f" %Precision ) 1263 | 1264 | #Calculate Gini Coefficient from AUC 1265 | AUC = rf_hyper_AUC 1266 | Gini_rf_hyper= (2 * AUC -1) 1267 | 1268 | print("AUC=%.2f" % AUC) 1269 | print("GINI ~=%.2f" % Gini_rf_hyper) 1270 | 1271 | #Calculate Log Loss in pandas dataframe 1272 | #Create Dataframe to Calculate Log Loss 1273 | y_test= data_test.select('label') 1274 | rf_hyper_proba=rf_result_hyper.select('probability') 1275 | 1276 | #Convert pyspark dataframe to numpy array 1277 | rf_hyper_proba= np.array(rf_hyper_proba.select('probability').collect()) 1278 | 1279 | #Convert numpy array 3 dimentional to 2 dimentional 1280 | rf_hyper_proba=rf_hyper_proba.reshape(-1, rf_hyper_proba.shape[-1]) 1281 | 1282 | #Convert y_test dataframe to pandas dataframe 1283 | y_test=y_test.toPandas() 1284 | 1285 | #Convert y_test pandas dataframe to pandas series 1286 | y_test=pd.Series(y_test['label'].values) 1287 | 1288 | #Calculate log loss from Random Forest hyper parameter 1289 | LogLoss = log_loss(y_test, rf_hyper_proba) 1290 | 1291 | print("Log Loss Random Forest:%.4f" % LogLoss) 1292 | 1293 | #Gradient Boosting 1294 | #create gradient boosting model in data train 1295 | gbt = GBTClassifier(featuresCol="features", labelCol="label", maxIter=10) 1296 | gbt_model = gbt.fit(data_train) 1297 | 1298 | #transfrom model to data test 1299 | gbt_result = gbt_model.transform(data_test) 1300 | 1301 | #view id, label, prediction and probability from result of modelling 1302 | gbt_result.select('Id', 'label', 'prediction', 'probability').show(5) 1303 | 1304 | #Gradient Boosting Evaluation 1305 | #Evaluate model by calculating accuracy and area under curve (AUC) 1306 | gbt_eval = BinaryClassificationEvaluator(rawPredictionCol="probability",labelCol="label") 1307 | gbt_eval2= MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="label") 1308 | gbt_AUC = gbt_eval.evaluate(gbt_result) 1309 | gbt_ACC = gbt_eval2.evaluate(gbt_result, {gbt_eval2.metricName:"accuracy"}) 1310 | 1311 | print("Gradient Boosted Tree Performance Measure") 1312 | print("Accuracy = %0.2f" % gbt_ACC) 1313 | print("AUC = %.2f" % gbt_AUC) 1314 | 1315 | #ROC Grafik 1316 | PredAndLabels = gbt_result.select("probability", "label") 1317 | PredAndLabels_collect = PredAndLabels.collect() 1318 | PredAndLabels_list = [(float(i[0][0]), 1.0-float(i[1])) for i in PredAndLabels_collect] 1319 | PredAndLabels = sc.parallelize(PredAndLabels_list) 1320 | 1321 | metrics = BinaryClassificationMetrics(PredAndLabels) 1322 | 1323 | # Area under ROC 1324 | print("Gradient Boosting Area Under ROC") 1325 | print("Area under ROC = %.2f" % metrics.areaUnderROC) 1326 | 1327 | # Visualization 1328 | FPR = dict() # FPR: False Positive Rate 1329 | tpr = dict() # TPR: True Positive Rate 1330 | roc_auc = dict() 1331 | 1332 | y_test = [i[1] for i in PredAndLabels_list] 1333 | y_score = [i[0] for i in PredAndLabels_list] 1334 | 1335 | fpr, tpr, _ = roc_curve(y_test, y_score) 1336 | roc_auc = auc(fpr, tpr) 1337 | 1338 | plt.figure(figsize=(5,4)) 1339 | plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc) 1340 | plt.plot([0, 1], [0, 1], 'k--') 1341 | plt.xlim([0.0, 1.0]) 1342 | plt.ylim([0.0, 1.05]) 1343 | plt.xlabel('False Positive Rate') 1344 | plt.ylabel('True Positive Rate') 1345 | plt.title('ROC Curve - Gradient Boosting') 1346 | plt.legend(loc="lower right") 1347 | plt.show() 1348 | 1349 | #Confusion Matrix 1350 | cm_gbt_result = gbt_result.crosstab("prediction", "label") 1351 | cm_gbt_result = cm_gbt_result.toPandas() 1352 | cm_gbt_result 1353 | 1354 | #calculate accuracy, sensitivity, specificity and precision 1355 | TP = cm_gbt_result["1"][0] 1356 | FP = cm_gbt_result["0"][0] 1357 | TN = cm_gbt_result["0"][1] 1358 | FN = cm_gbt_result["1"][1] 1359 | Accuracy = (TP+TN)/(TP+FP+TN+FN) 1360 | Sensitivity = TP/(TP+FN) 1361 | Specificity = TN/(TN+FP) 1362 | Precision = TP/(TP+FP) 1363 | 1364 | print ("Accuracy = %0.2f" %Accuracy ) 1365 | print ("Sensitivity = %0.2f" %Sensitivity ) 1366 | print ("Specificity = %0.2f" %Specificity ) 1367 | print ("Precision = %0.2f" %Precision ) 1368 | 1369 | #Calculate Gini Coefficient from AUC 1370 | AUC = gbt_AUC 1371 | Gini_gbt= (2 * AUC -1) 1372 | 1373 | print("AUC=%.2f" % AUC) 1374 | print("GINI ~=%.2f" % Gini_gbt) 1375 | 1376 | #Calculate Log Loss in pandas dataframe 1377 | #Create Dataframe to Calculate Log Loss 1378 | y_test= data_test.select('label') 1379 | gbt_proba=gbt_result.select('probability') 1380 | 1381 | #Convert pyspark dataframe to numpy array 1382 | gbt_proba= np.array(gbt_proba.select('probability').collect()) 1383 | 1384 | #Convert numpy array 3 dimentional to 2 dimentional 1385 | gbt_proba=gbt_proba.reshape(-1, gbt_proba.shape[-1]) 1386 | 1387 | #Convert y_test dataframe to pandas dataframe 1388 | y_test=y_test.toPandas() 1389 | 1390 | #Convert y_test pandas dataframe to pandas series 1391 | y_test=pd.Series(y_test['label'].values) 1392 | 1393 | #Calculate log loss from Gradient Boosting 1394 | LogLoss = log_loss(y_test, gbt_proba) 1395 | 1396 | print("Log Loss Gradient Boosting:%.4f" % LogLoss) 1397 | 1398 | #Gradient Boosting With Hyper-Parameter 1399 | #define gradient boosting model 1400 | gbt_hyper= GBTClassifier(featuresCol="features", labelCol="label") 1401 | 1402 | # Hyper-Parameter Tuning 1403 | paramGrid_gbt = ParamGridBuilder() \ 1404 | .addGrid(gbt_hyper.maxIter, [10])\ 1405 | .addGrid(gbt_hyper.maxDepth, [6, 7,10]) \ 1406 | .build() 1407 | crossval_gbt = CrossValidator(estimator=gbt_hyper, 1408 | estimatorParamMaps=paramGrid_gbt, 1409 | evaluator=BinaryClassificationEvaluator(), 1410 | numFolds=3) 1411 | #fit model to data train 1412 | gbt_model_hyper = crossval_gbt.fit(data_train) 1413 | 1414 | #transfrom model to data test 1415 | gbt_result_hyper = gbt_model_hyper.transform(data_test) 1416 | 1417 | #view id, label, prediction and probability from result of modelling 1418 | gbt_result_hyper.select('Id', 'label', 'prediction', 'probability').show(5) 1419 | 1420 | #Gradient Boosting With Hyper-Parameter Evaluation 1421 | #Evaluate model by calculating accuracy and area under curve (AUC) 1422 | gbt_eval_hyper = BinaryClassificationEvaluator(rawPredictionCol="probability", labelCol="label") 1423 | gbt_eval_hyper2= MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="label") 1424 | gbt_hyper_AUC = gbt_eval_hyper.evaluate(gbt_result_hyper) 1425 | gbt_hyper_ACC = gbt_eval_hyper2.evaluate(gbt_result_hyper, {gbt_eval_hyper2.metricName:"accuracy"}) 1426 | 1427 | 1428 | print("Gradient Boosted Tree Performance Measure") 1429 | print("Accuracy = %0.2f" % gbt_hyper_ACC) 1430 | print("AUC = %.2f" % gbt_hyper_AUC) 1431 | 1432 | #ROC Grafik 1433 | PredAndLabels = gbt_result_hyper.select("probability", "label") 1434 | PredAndLabels_collect = PredAndLabels.collect() 1435 | PredAndLabels_list = [(float(i[0][0]), 1.0-float(i[1])) for i in PredAndLabels_collect] 1436 | PredAndLabels = sc.parallelize(PredAndLabels_list) 1437 | 1438 | metrics = BinaryClassificationMetrics(PredAndLabels) 1439 | 1440 | # Area under ROC 1441 | print("Gradient Boosting Area Under ROC") 1442 | print("Area under ROC = %.2f" % metrics.areaUnderROC) 1443 | 1444 | # Visualization 1445 | FPR = dict() # FPR: False Positive Rate 1446 | tpr = dict() # TPR: True Positive Rate 1447 | roc_auc = dict() 1448 | 1449 | y_test = [i[1] for i in PredAndLabels_list] 1450 | y_score = [i[0] for i in PredAndLabels_list] 1451 | 1452 | fpr, tpr, _ = roc_curve(y_test, y_score) 1453 | roc_auc = auc(fpr, tpr) 1454 | 1455 | plt.figure(figsize=(5,4)) 1456 | plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc) 1457 | plt.plot([0, 1], [0, 1], 'k--') 1458 | plt.xlim([0.0, 1.0]) 1459 | plt.ylim([0.0, 1.05]) 1460 | plt.xlabel('False Positive Rate') 1461 | plt.ylabel('True Positive Rate') 1462 | plt.title('ROC Curve - Gradient Boosting') 1463 | plt.legend(loc="lower right") 1464 | plt.show() 1465 | 1466 | #confusion Matrix 1467 | cm_gbt_result_hyper = gbt_result_hyper.crosstab("prediction", "label") 1468 | cm_gbt_result_hyper = cm_gbt_result_hyper.toPandas() 1469 | cm_gbt_result_hyper 1470 | 1471 | #calculate accuracy, sensitivity, specificity and precision 1472 | TP = cm_gbt_result_hyper["1"][0] 1473 | FP = cm_gbt_result_hyper["0"][0] 1474 | TN = cm_gbt_result_hyper["0"][1] 1475 | FN = cm_gbt_result_hyper["1"][1] 1476 | Accuracy = (TP+TN)/(TP+FP+TN+FN) 1477 | Sensitivity = TP/(TP+FN) 1478 | Specificity = TN/(TN+FP) 1479 | Precision = TP/(TP+FP) 1480 | 1481 | print ("Accuracy = %0.2f" %Accuracy ) 1482 | print ("Sensitivity = %0.2f" %Sensitivity ) 1483 | print ("Specificity = %0.2f" %Specificity ) 1484 | print ("Precision = %0.2f" %Precision ) 1485 | 1486 | #Calculate Gini Coefficient from AUC 1487 | AUC = gbt_hyper_AUC 1488 | Gini_gbt_hyper= (2 * AUC -1) 1489 | 1490 | print("AUC=%.2f" % AUC) 1491 | print("GINI ~=%.2f" % Gini_gbt_hyper) 1492 | 1493 | #Calculate Log Loss in pandas dataframe 1494 | #Create Dataframe to Calculate Log Loss 1495 | y_test= data_test.select('label') 1496 | gbt_hyper_proba=gbt_result_hyper.select('probability') 1497 | 1498 | #Convert pyspark dataframe to numpy array 1499 | gbt_hyper_proba= np.array(gbt_hyper_proba.select('probability').collect()) 1500 | 1501 | #Convert numpy array 3 dimentional to 2 dimentional 1502 | gbt_hyper_proba=gbt_hyper_proba.reshape(-1, gbt_hyper_proba.shape[-1]) 1503 | 1504 | #Convert y_test dataframe to pandas dataframe 1505 | y_test=y_test.toPandas() 1506 | 1507 | #Convert y_test pandas dataframe to pandas series 1508 | y_test=pd.Series(y_test['label'].values) 1509 | 1510 | #Calculate log loss from Gradient Boosting hyper parameter 1511 | LogLoss = log_loss(y_test, gbt_hyper_proba) 1512 | 1513 | print("Log Loss Gradient Boosting:%.4f" % LogLoss) 1514 | 1515 | 1516 | #Implementation Modelling to data test 1517 | #Prediction using Logistic Regression 1518 | #transform logistic regression to data test 1519 | lr_predict = lr_model.transform(test2) 1520 | 1521 | #view id, label, prediction and probability from result of modelling 1522 | lr_predict.select('Id', 'prediction', 'probability').show(5) 1523 | 1524 | #select id and prediction from result of modelling and save in data frame called my_submission 1525 | my_submission=lr_predict.select("Id","prediction") 1526 | 1527 | #convert to Pandas dataframe 1528 | my_submission=my_submission.toPandas() 1529 | 1530 | #save to csv 1531 | my_submission.to_csv('E:/my_submission.csv', index = False, header = True) 1532 | 1533 | 1534 | #Prediction using Gradient Boosting 1535 | #transfrom gradient boosting model to data test 1536 | gbt_predict = gbt_model.transform(test_data_feat) 1537 | 1538 | #view id, label, prediction and probability from result of modelling 1539 | gbt_predict.select('Id', 'prediction', 'probability').show(5) 1540 | 1541 | #select id and prediction from result of modelling and save in data frame called my_submission 1542 | my_submission2=gbt_predict.select("Id","prediction") 1543 | 1544 | #convert to Pandas dataframe 1545 | my_submission2=my_submission2.toPandas() 1546 | 1547 | #save to csv 1548 | my_submission2.to_csv('E:/my_submission2.csv', index = False, header = True) 1549 | -------------------------------------------------------------------------------- /Image/Decision_Tree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/Decision_Tree.png -------------------------------------------------------------------------------- /Image/Decision_Tree_Gini_LogLoss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/Decision_Tree_Gini_LogLoss.png -------------------------------------------------------------------------------- /Image/Decision_Tree_ROC.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/Decision_Tree_ROC.png -------------------------------------------------------------------------------- /Image/Decision_Tree_confusion_matrix.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/Decision_Tree_confusion_matrix.png -------------------------------------------------------------------------------- /Image/Decision_Tree_ev1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/Decision_Tree_ev1.png -------------------------------------------------------------------------------- /Image/EDA1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/EDA1.jpg -------------------------------------------------------------------------------- /Image/EDA2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/EDA2.jpg -------------------------------------------------------------------------------- /Image/EDA3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/EDA3.jpg -------------------------------------------------------------------------------- /Image/EDA4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/EDA4.jpg -------------------------------------------------------------------------------- /Image/EDA5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/EDA5.png -------------------------------------------------------------------------------- /Image/EDA6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/EDA6.png -------------------------------------------------------------------------------- /Image/Random_Forest.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/Random_Forest.png -------------------------------------------------------------------------------- /Image/Random_Forest_Gini_LogLoss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/Random_Forest_Gini_LogLoss.png -------------------------------------------------------------------------------- /Image/Random_Forest_ROC.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/Random_Forest_ROC.png -------------------------------------------------------------------------------- /Image/Random_Forest_confusion_matrix.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/Random_Forest_confusion_matrix.png -------------------------------------------------------------------------------- /Image/Random_Forest_ev1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/Random_Forest_ev1.png -------------------------------------------------------------------------------- /Image/call_function_feature_engineering.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/call_function_feature_engineering.png -------------------------------------------------------------------------------- /Image/call_insignificant_categories_function.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/call_insignificant_categories_function.jpg -------------------------------------------------------------------------------- /Image/callfunction_compare_categorical_variables.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/callfunction_compare_categorical_variables.jpg -------------------------------------------------------------------------------- /Image/check_data.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/check_data.png -------------------------------------------------------------------------------- /Image/check_missing_values.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/check_missing_values.png -------------------------------------------------------------------------------- /Image/check_missing_values2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/check_missing_values2.png -------------------------------------------------------------------------------- /Image/check_missing_values3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/check_missing_values3.png -------------------------------------------------------------------------------- /Image/define_categorical_numerical_variables1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/define_categorical_numerical_variables1.png -------------------------------------------------------------------------------- /Image/define_categorical_numerical_variables2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/define_categorical_numerical_variables2.png -------------------------------------------------------------------------------- /Image/feature_engineering.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/feature_engineering.png -------------------------------------------------------------------------------- /Image/feature_engineering2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/feature_engineering2.png -------------------------------------------------------------------------------- /Image/function_compare_categorical_variables.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/function_compare_categorical_variables.jpg -------------------------------------------------------------------------------- /Image/gradient_boosting.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/gradient_boosting.png -------------------------------------------------------------------------------- /Image/gradient_boosting_Gini_LogLoss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/gradient_boosting_Gini_LogLoss.png -------------------------------------------------------------------------------- /Image/gradient_boosting_ROC.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/gradient_boosting_ROC.png -------------------------------------------------------------------------------- /Image/gradient_boosting_ROC_confusion_matrix.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/gradient_boosting_ROC_confusion_matrix.png -------------------------------------------------------------------------------- /Image/gradient_boosting_confusion_matrix.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/gradient_boosting_confusion_matrix.png -------------------------------------------------------------------------------- /Image/gradient_boosting_ev1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/gradient_boosting_ev1.png -------------------------------------------------------------------------------- /Image/handle_missing_values.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/handle_missing_values.jpg -------------------------------------------------------------------------------- /Image/handle_missing_values2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/handle_missing_values2.jpg -------------------------------------------------------------------------------- /Image/handle_outlier.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/handle_outlier.png -------------------------------------------------------------------------------- /Image/handle_outlier2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/handle_outlier2.png -------------------------------------------------------------------------------- /Image/handle_outlier3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/handle_outlier3.png -------------------------------------------------------------------------------- /Image/hyper_parameter_Random_Forest.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/hyper_parameter_Random_Forest.png -------------------------------------------------------------------------------- /Image/hyper_parameter_tuning_DecisionTree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/hyper_parameter_tuning_DecisionTree.png -------------------------------------------------------------------------------- /Image/hyper_parameter_tuning_GradientBoost.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/hyper_parameter_tuning_GradientBoost.png -------------------------------------------------------------------------------- /Image/hyper_parameter_tuning_LogisticRegression.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/hyper_parameter_tuning_LogisticRegression.png -------------------------------------------------------------------------------- /Image/implement_to_data_test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/implement_to_data_test.png -------------------------------------------------------------------------------- /Image/implement_to_data_test2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/implement_to_data_test2.png -------------------------------------------------------------------------------- /Image/insignificant_categories_function.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/insignificant_categories_function.jpg -------------------------------------------------------------------------------- /Image/insignificant_categories_function3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/insignificant_categories_function3.jpg -------------------------------------------------------------------------------- /Image/insignificant_categories_function4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/insignificant_categories_function4.jpg -------------------------------------------------------------------------------- /Image/load_dataset_function.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/load_dataset_function.png -------------------------------------------------------------------------------- /Image/load_libraries.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/load_libraries.png -------------------------------------------------------------------------------- /Image/logistic_regression.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/logistic_regression.png -------------------------------------------------------------------------------- /Image/logistic_regression_Gini_LogLoss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/logistic_regression_Gini_LogLoss.png -------------------------------------------------------------------------------- /Image/logistic_regression_ROC.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/logistic_regression_ROC.png -------------------------------------------------------------------------------- /Image/logistic_regression_ROC_confusion_matrix.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/logistic_regression_ROC_confusion_matrix.png -------------------------------------------------------------------------------- /Image/logistic_regression_confusion_matrix.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/logistic_regression_confusion_matrix.png -------------------------------------------------------------------------------- /Image/logistic_regression_ev1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/logistic_regression_ev1.png -------------------------------------------------------------------------------- /Image/split_data_train.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/split_data_train.png -------------------------------------------------------------------------------- /Image/test.txt: -------------------------------------------------------------------------------- 1 | test 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Classification-Pyspark 2 | This is a repository of classification template using pyspark. 3 | 4 | I tried to make a template of classification machine learning using pyspark. I will try to explain step by step from load data, data cleansing and making a prediction. I created some functions in pyspark to make an automation, so user just need to update or replace the dataset. 5 | 6 | To test my template, I used data Home_Quote_Conversion from Kaggle https://www.kaggle.com/c/homesite-quote-conversion. This dataset represent the activity who are interested in buying policies from Homesite. QuoteConversion_Flag indicates whether the customer purchased a policy and the task is to predict QuoteConversion_Flag for each QuoteNumber in the test set. 7 | 8 | In general, the steps of classification machine learning are: 9 | 10 | * Load libraries 11 | 12 | The first step in applying classification model is we have to load all libraries are needed. The basic libraries for classification are LogisticRegression, RandomForestClassifier, GBTClassifier, etc. Below the capture of all libraries are needed in classification: 13 | ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/load_libraries.png) 14 | 15 | 16 | * Load Data into Spark Dataframe. 17 | 18 | Because we will work on spark environment so the dataset must be in spark dataframe. In this step, I created function to load data into spark dataframe. To run this function, first we have to define type of file of dataset (text or parquet) and path where dataset is stored and delimeter like ',' for example or other. 19 | ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/load_dataset_function.png) 20 | 21 | 22 | * Check the data. 23 | 24 | After load data, lets do some check of the dataset such as numbers of columns, numbers of observations, names of columns, type of columns, etc. In this part, we also do some changes like rename columns name if the column name too long, change the data type if data type not in accordance or drop unnecessary column and check the proportion of target. Those changes apply in both data train and data test. 25 | ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/check_data.png) 26 | 27 | 28 | * Define categorical and numerical variables. 29 | 30 | In this step, I tried to split the variables based on it's data types. If data types of variables is string will be saved in list called **cat_cols** and if data types of variables is integer or double will be saved in list called **num_cols**. This split applied on data train and data test. This step applied to make easier in the following step so I don't need to define categorical and numerical variables manually. 31 | Pictures below is example of code of define categorical and numerical variables in data train. 32 | ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/define_categorical_numerical_variables1.png) 33 | ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/define_categorical_numerical_variables2.png) 34 | 35 | 36 | * Sample data 37 | 38 | If the dataset is too large, we can take sample of data. 39 | Note: this step is optional. 40 | 41 | * Check Missing Values. 42 | 43 | Sometimes the data received is not clean. So, we need to check whether there are missing values or not. Output from this step is the name of columns which have missing values and the number of missing values. To check missing values, actually I created two method: 44 | - Using pandas dataframe, 45 | - Using pyspark dataframe. 46 | But the prefer method is method using pyspark dataframe so if dataset is too large we can still calculate / check missing values. 47 | Both data train and data test has to apply this step. 48 | This function refer to https://github.com/UrbanInstitute/pyspark-tutorials/blob/master/04_missing-data.ipynb. 49 | 50 | Pictures below are example check missing values using pyspark dataframe in data train. 51 | ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/check_missing_values.png) 52 | ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/check_missing_values2.png) 53 | ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/check_missing_values3.png) 54 | 55 | 56 | * Handle Missing Values. 57 | 58 | The approach that used to handle missing values between numerical and categorical variables is different. For numerical variables I fill the missing values with average in it's columns. While for categorical values I fill missing values use most frequent category in that column, therefore count categories which has max values in each columns is needed. 59 | ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/handle_missing_values.jpg) 60 | ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/handle_missing_values2.jpg) 61 | 62 | 63 | * Compare categorical variables in data train and data test. 64 | 65 | In this step, we check whether categories between data train and data test same or not. If not, categories in data test will be equated with data train. This step is needed to avoid error in feature engineering, if there are differences categories between data train and data test the error will appear at feature engineering process in data test so the modelling process cannot be applied in data test. 66 | ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/function_compare_categorical_variables.jpg) 67 | ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/callfunction_compare_categorical_variables.jpg) 68 | 69 | 70 | * EDA 71 | 72 | Create distribution visualization in each variables to get some insight of dataset. Pictures below are example of visualization of data train. 73 | ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/EDA1.jpg) 74 | ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/EDA2.jpg) 75 | ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/EDA3.jpg) 76 | ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/EDA4.jpg) 77 | ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/EDA5.png) 78 | ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/EDA6.png) 79 | 80 | 81 | * Handle insignificant categories in data train. 82 | 83 | Sometimes there are categories with fewest amount, those categories I called insignificant categories. Those insignificant categories will be replaced with the largest numbers of categories in each categorical columns. Sometimes this replacing will make better modelling. 84 | 85 | Note: the determination of threshold that category have fewest amount is based on trial n error. In this case I used threshold 98% for maximum amount and 0.7% for minimum amount. Each categories in a column that have percentage under 0.7% will be replaced with category that has percentage equal or lower than 98%. 86 | ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/insignificant_categories_function.jpg) 87 | ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/call_insignificant_categories_function.jpg) 88 | ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/insignificant_categories_function3.jpg) 89 | 90 | 91 | * Handle insignificant categories in data test. 92 | 93 | To handle insignificant categories in data test, I refer to insignificant categories in data train. Categories that replaced will be equated with data train to avoid differences categories between data train and data test. As known those differences will trigger error in feature angineering and modelling process. 94 | ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/insignificant_categories_function4.jpg) 95 | 96 | 97 | * Handle outlier. 98 | 99 | Outlier is observations that fall below lower side or above upper side. 100 | 101 | To handle outlier the approach is by replacing the value greater than upper side with upper side value and replacing the value lower than lower side with lower side value. So, we need calculate upper and lower side from quantile value, quantile is probability distribution of variable. In General, there are three quantile: 102 | 103 | - Q1 = the value that cut off 25% of the first data when it is sorted in ascending order. 104 | - Q2 = cut off data, or median, it's 50 % of the data 105 | - Q3 = the value that cut off 75% of the first data when it is sorted in ascending order. 106 | - IQR or interquartile range is range between Q1 and Q3. IQR = Q3 - Q1. 107 | 108 | Upper side = Q3 + 1.5 * IQR 109 | Lower side = Q1 - 1.5 * IQR 110 | 111 | To calculate quantile in pyspark dataframe I created a function and then created function to calculate uper side, lower side, replacing upper side and replacing lower side. function of replacing upper side and lower side will looping as much as numbers of numerical variables in dataset (data train or data test). This step also apply in both data train and data test. 112 | 113 | Pictures below are example of handle outlier in data train, for data test the treatment is the same just call the function and apply it to data test. 114 | ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/handle_outlier.png) 115 | ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/handle_outlier2.png) 116 | ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/handle_outlier3.png) 117 | 118 | * Feature Engineering. 119 | 120 | Before splitting the data train, all categorical variables must be made numerical. There are several approaches to categorical variables in SparkML, including: 121 | - StringIndexer, which is to encode the string label into the index label by sequencing the string frequency descending and giving the smallest index (0) at most string frequency. 122 | - One-hot Encoding, which is mapping the label column (string label) on the binary column. 123 | - Vector assembler, which is mapping all columns in vector. 124 | 125 | In this step, first I check the distinct values in each categorical columns between data train and data test. If data train has distinct values more than data test in one or more categorical column, data train and data test will be joined then apply feature engineering on that data combination - this merger is needed to avoid error in modelling due to differences length of vector between data train and data test- length of vector (result of feature engineering of data combination) must be same between data train and data test so we can move to the next step, modelling and prediction. But if distinct values between data train and data test same, we will apply feature angineering on data train and data test separately then move to the next step modelling and prediction. 126 | ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/feature_engineering.png) 127 | ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/feature_engineering2.png) 128 | ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/call_function_feature_engineering.png) 129 | 130 | * Split Data train to train and test. 131 | 132 | This step just apply on data train. In order to make validation on the model that we are used, we need to split data train into train and test data. Data train will be split with percentage: train 70% and test 30% and define seed 24 so the random data that we split will not change. We can define seed with any value. 133 | ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/split_data_train.png) 134 | 135 | * Modelling. 136 | 137 | Algorithm that I used to make a model and prediction are: 138 | - Logistic Regression Logistic regression used logit function in prediction the probability. 139 | - Decision Tree This algorithm will find the most significant independent variable to create a group. 140 | - Random Forest This algorithm build multiple decision trees and merges them together and use bagging method. 141 | - Gradient Boosting This algorithm use boosting ensemble technic. This technique employs the logic in which the subsequent predictors learn from the mistakes of the previous predictors. 142 | 143 | * Evaluation. 144 | 145 | To evaluate model I used four metrics, they are: 146 | - ROC 147 | ROC (Receiver Operating Characteristic) The graph shows the true positive rate versus the false positive rate. This metric is between 0 and 1 with a better model scoring higher. An area of 1 represents a perfect test; an area of .5 represents a worthless test. 148 | So, The model is said to be good enaught if the value of the area under the curve is above 0.5. 149 | 150 | - Gini Coefficient 151 | Gini is ratio between the ROC curve and the diagnol line & the area of the above triangle. So, we can calculate Gini by this formula: Gini = 2*AUC - 1 Such as AUC ROC, Gini above 50% or 60% is good model. 152 | 153 | - Confusion Matrix 154 | Confusion Matrix is a table is used to describe performance of a classification model. Some definition are: 155 | - Accuracy = Proportion of total number of predictions that were correct 156 | - Precision (Positive Predictive Value) : Proportion of positive cases that were correctly identified. 157 | - Negative Predictive Value : Proportion of negative cases that were correctly identified. 158 | - Sensitivity (Recall) : Proportion of actual positive cases which are correctly identified. 159 | - Specificity : Proportion of actual negative cases which are correctly identified. 160 | 161 | - Log Loss 162 | Log Loss is one of model performance evaluation in classification model. The purpose of model is to minimize log loss value. 163 | A perfect model would have of log loss of 0. Log Loss increase when predicted probability diverges from actual label. 164 | Pictures below will explain how to create model and make a prediction and also evaluate those model with those four metrics. 165 | ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/logistic_regression.png) 166 | ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/logistic_regression_ev1.png) 167 | ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/logistic_regression_ROC.png) 168 | ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/logistic_regression_ROC_confusion_matrix.png) 169 | ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/logistic_regression_confusion_matrix.png) 170 | ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/logistic_regression_Gini_LogLoss.png) 171 | 172 | ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/Decision_Tree.png) 173 | ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/Decision_Tree_ev1.png) 174 | ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/Decision_Tree_ROC.png) 175 | ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/Decision_Tree_confusion_matrix.png) 176 | ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/Decision_Tree_Gini_LogLoss.png) 177 | 178 | ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/Random_Forest.png) 179 | ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/Random_Forest_ev1.png) 180 | ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/Random_Forest_ROC.png) 181 | ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/Random_Forest_confusion_matrix.png) 182 | ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/Random_Forest_Gini_LogLoss.png) 183 | 184 | ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/gradient_boosting.png) 185 | ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/gradient_boosting_ev1.png) 186 | ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/gradient_boosting_ROC_confusion_matrix.png) 187 | ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/gradient_boosting_confusion_matrix.png) 188 | ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/gradient_boosting_Gini_LogLoss.png) 189 | 190 | 191 | * Hyper-Parameter Tuning. 192 | 193 | In this step, I provided hyper-parameter tuning script for all those model above. So could be compared the model evaluation between model with and without hyper parameter tuning. From those result we can choose model with the best evaluation to make prediction in data test. 194 | ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/hyper_parameter_tuning_LogisticRegression.png) 195 | ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/hyper_parameter_tuning_DecisionTree.png) 196 | ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/hyper_parameter_Random_Forest.png) 197 | ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/hyper_parameter_tuning_GradientBoost.png) 198 | 199 | * Implementation Modelling to data test. 200 | 201 | After all the steps above are executed, now we know which one model that has best evaluation. And that is the perfect model to make prediction our data test. We can choose the top two model from four model then transform that model to data test. In this case, I choose Logistic Regression and Gradient Boosting to make prediction. Then save the prediction into csv file. 202 | ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/implement_to_data_test.png) 203 | ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/implement_to_data_test2.png) 204 | 205 | **VIOLAAAAAA,, we got our prediction!!!!!** 206 | 207 | For more details please see my code. 208 | --------------------------------------------------------------------------------