├── Classification Using Pyspark_Home_Quote - v3.ipynb
├── Classification_Using _Pyspark.py
├── Image
    ├── Decision_Tree.png
    ├── Decision_Tree_Gini_LogLoss.png
    ├── Decision_Tree_ROC.png
    ├── Decision_Tree_confusion_matrix.png
    ├── Decision_Tree_ev1.png
    ├── EDA1.jpg
    ├── EDA2.jpg
    ├── EDA3.jpg
    ├── EDA4.jpg
    ├── EDA5.png
    ├── EDA6.png
    ├── Random_Forest.png
    ├── Random_Forest_Gini_LogLoss.png
    ├── Random_Forest_ROC.png
    ├── Random_Forest_confusion_matrix.png
    ├── Random_Forest_ev1.png
    ├── call_function_feature_engineering.png
    ├── call_insignificant_categories_function.jpg
    ├── callfunction_compare_categorical_variables.jpg
    ├── check_data.png
    ├── check_missing_values.png
    ├── check_missing_values2.png
    ├── check_missing_values3.png
    ├── define_categorical_numerical_variables1.png
    ├── define_categorical_numerical_variables2.png
    ├── feature_engineering.png
    ├── feature_engineering2.png
    ├── function_compare_categorical_variables.jpg
    ├── gradient_boosting.png
    ├── gradient_boosting_Gini_LogLoss.png
    ├── gradient_boosting_ROC.png
    ├── gradient_boosting_ROC_confusion_matrix.png
    ├── gradient_boosting_confusion_matrix.png
    ├── gradient_boosting_ev1.png
    ├── handle_missing_values.jpg
    ├── handle_missing_values2.jpg
    ├── handle_outlier.png
    ├── handle_outlier2.png
    ├── handle_outlier3.png
    ├── hyper_parameter_Random_Forest.png
    ├── hyper_parameter_tuning_DecisionTree.png
    ├── hyper_parameter_tuning_GradientBoost.png
    ├── hyper_parameter_tuning_LogisticRegression.png
    ├── implement_to_data_test.png
    ├── implement_to_data_test2.png
    ├── insignificant_categories_function.jpg
    ├── insignificant_categories_function3.jpg
    ├── insignificant_categories_function4.jpg
    ├── load_dataset_function.png
    ├── load_libraries.png
    ├── logistic_regression.png
    ├── logistic_regression_Gini_LogLoss.png
    ├── logistic_regression_ROC.png
    ├── logistic_regression_ROC_confusion_matrix.png
    ├── logistic_regression_confusion_matrix.png
    ├── logistic_regression_ev1.png
    ├── split_data_train.png
    └── test.txt
├── README.md
├── my_submission.csv
├── my_submission2.csv
└── sample_submission.csv


/Classification_Using _Pyspark.py:
--------------------------------------------------------------------------------
   1 | #Classification Using Pyspark
   2 | 
   3 | #PySpark initialization
   4 | # to make pyspark importable as a regular library
   5 | import findspark
   6 | findspark.init()
   7 | 
   8 | import pyspark
   9 | 
  10 | from pyspark import SparkContext
  11 | sc = SparkContext.getOrCreate()
  12 | 
  13 | #initialize a SparkSession for creating Spark DataFrames
  14 | from pyspark.sql import SparkSession
  15 | spark = SparkSession.builder.getOrCreate()
  16 | 
  17 | 
  18 | #Load Libraries
  19 | # Data Frame spark profiling 
  20 | from pyspark.sql.types import IntegerType, StringType, DoubleType, ShortType, DecimalType
  21 | import pyspark.sql.functions as func
  22 | from pyspark.sql.functions import isnull
  23 | from pyspark.sql.functions import isnan, when, count, col, round
  24 | from pyspark.sql.functions import mean
  25 | from pyspark.sql.types import Row
  26 | import matplotlib.pyplot as plt
  27 | from pyspark.sql.functions import udf
  28 | 
  29 | # Pandas DF operation
  30 | import pandas as pd
  31 | import numpy as np
  32 | import matplotlib.pyplot as plt
  33 | import seaborn as sns
  34 | from numpy import array
  35 | 
  36 | # Modeling + Evaluation
  37 | from pyspark.ml.feature import VectorAssembler, VectorIndexer, OneHotEncoder, StringIndexer
  38 | from pyspark.sql.functions import when
  39 | from pyspark.sql import functions as F
  40 | from pyspark.sql.functions import avg
  41 | from pyspark.ml import Pipeline
  42 | from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier
  43 | from pyspark.ml.classification import DecisionTreeClassifier
  44 | from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
  45 | from pyspark.mllib.evaluation import BinaryClassificationMetrics
  46 | from pyspark.ml.tuning import CrossValidator, ParamGridBuilder 
  47 | from sklearn.metrics import roc_curve, auc
  48 | from sklearn.metrics import log_loss
  49 | from pyspark.sql import Window
  50 | from pyspark.sql.functions import rank,sum,col
  51 | from pyspark.ml.linalg import Vectors
  52 | from pyspark.ml.feature import VectorSlicer
  53 | 
  54 | window = Window.rowsBetween(Window.unboundedPreceding,Window.unboundedFollowing)
  55 | 
  56 | 
  57 | #Load Data to Spark DataFrame
  58 | #Initializing File Type and path for data train
  59 | file_type = 'text'
  60 | path=r'train.csv'
  61 | delimeter=','
  62 | 
  63 | def load_data(file_type):
  64 |     """input type of file "text" or "parquet" and Return pyspark dataframe"""
  65 |     if file_type =="text": # use text as file type input
  66 |         df = spark.read.option("header", "true") \
  67 |                        .option("delimeter",delimeter)\
  68 |                        .option("inferSchema", "true") \
  69 |                        .csv(path)  #path file that you want import
  70 |     else:  
  71 |         df= spark.read.parquet("example.parquet") #path file that you want import
  72 |     return df
  73 | 
  74 | #call function load_data
  75 | df = load_data(file_type)
  76 | 
  77 | #Initializing File Type and path for data test
  78 | file_type = 'text'
  79 | path=r'test.csv'
  80 | delimeter=','
  81 | 
  82 | #call function load_data
  83 | test_data = load_data(file_type)
  84 | 
  85 | 
  86 | #Check data
  87 | #check type of data train and data test
  88 | type(df)
  89 | type(test_data)
  90 | 
  91 | #show 5 observation in data train
  92 | df.show(5)
  93 | 
  94 | #show 5 observation in data test
  95 | test_data.show(5)
  96 | 
  97 | #Print Schema and count number of columns from data train
  98 | len(df.columns), df.printSchema()
  99 | 
 100 | #Print Schema and count number of columns from data test
 101 | len(test_data.columns), test_data.printSchema()
 102 | 
 103 | #rename Target to 'label in data train
 104 | df = df.withColumnRenamed('QuoteConversion_Flag','label')
 105 | #rename Id number ('QuoteNumber') to 'Id' in data train
 106 | df = df.withColumnRenamed('QuoteNumber','Id')
 107 | 
 108 | #rename Id number ('QuoteNumber') to 'Id' in data test
 109 | test_data = test_data.withColumnRenamed('QuoteNumber','Id')
 110 | 
 111 | #drop column Original_Quote_Date from data train
 112 | df_final=df.drop('Original_Quote_Date')
 113 | 
 114 | #count number of observation in data train
 115 | df_final.count()
 116 | 
 117 | #drop column Original_Quote_Date from data test
 118 | test_data=test_data.drop('Original_Quote_Date')
 119 | 
 120 | #calculate percentage of target and save in dataframe called target_percent
 121 | target_percent=df_final.groupBy('label').count().sort(col("count").desc())\
 122 |                         .withColumn('total',sum(col('count')).over(window))\
 123 |                         .withColumn('Percent',col('count')*100/col('total')) 
 124 | 
 125 | #show dataframe target_percent to check the proportion of each label
 126 | target_percent.show()
 127 | 
 128 | 
 129 | #Define categorical and numerical variables in df_final (data train)
 130 | #Categorical and numerical variable
 131 | #just will select string data type
 132 | cat_cols = [item[0] for item in df_final.dtypes if item[1].startswith('string')] 
 133 | print("cat_cols:", cat_cols)
 134 | 
 135 | #just will select integer or double data type
 136 | num_cols = [item[0] for item in df_final.dtypes if item[1].startswith('int') | item[1].startswith('double')] 
 137 | print("num_cols:", num_cols)
 138 | 
 139 | #Select column 'Id' from num_cols
 140 | num_id=num_cols.pop(0)
 141 | print("num_id:", num_id)
 142 | 
 143 | #save column 'Id' in num_id variable
 144 | num_id=[num_id]
 145 | #print num_id
 146 | print(num_id)
 147 | 
 148 | #Remove column 'label' from numerical columns group
 149 | num_cols.remove('label') #label is removed because it's the target to validate the model
 150 | 
 151 | #print num_cols variable
 152 | print("num_cols:", num_cols)
 153 | 
 154 | #count number of numerical and categorical columns in data train
 155 | len(num_cols), len(cat_cols)
 156 | 
 157 | #Define categorical and nummerical variable in test_data (data test)
 158 | #Categorical and numerical variable
 159 | #just will select string data type
 160 | cat_cols_test = [item[0] for item in test_data.dtypes if item[1].startswith('string')] 
 161 | print("cat_cols_test:", cat_cols_test)
 162 | 
 163 | #just will select integer or double data type
 164 | num_cols_test = [item[0] for item in test_data.dtypes if item[1].startswith('int') | item[1].startswith('double')] 
 165 | print("num_cols_test:", num_cols_test)
 166 | 
 167 | #Select 'Id' from num_cols_test and save in variable called 'num_id_test'
 168 | num_id_test=num_cols_test.pop(0)
 169 | print("num_id_test:", num_id_test)
 170 | 
 171 | #save num_id_test to list called 'num_id_test'
 172 | num_id_test=[num_id_test]
 173 | print(num_id_test)
 174 | print(num_cols_test)
 175 | 
 176 | #count observation in data test
 177 | test_data.count()
 178 | 
 179 | #count number of numerical and categorical columns in data test
 180 | len(num_cols_test), len(cat_cols_test)
 181 | 
 182 | 
 183 | #Sample data
 184 | #define ratio that want to sample
 185 | ratio=0.1 #will take 10% from data
 186 | 
 187 | #take 10% sample from data train with replacing false and seed 42 and save in df_sample
 188 | df_sample=df_final.sample(False, ratio, 42)
 189 | 
 190 | #count observation from df_sample
 191 | df_sample.count()
 192 | 
 193 | #take 10% sample from data test with replacing false and seed 42 and save in test_sample
 194 | test_sample=test_data.sample(False, ratio, 42)
 195 | 
 196 | #count observation from test_sample
 197 | test_sample.count()
 198 | 
 199 | 
 200 | #Check Missing Value in data train
 201 | #Check Missing Value in Pyspark Dataframe
 202 | def count_nulls(c):
 203 |     """Input pyspark dataframe and return list of columns with missing value and it's total value"""
 204 |     null_counts = []          #make an empty list to hold our results
 205 |     for col in c.dtypes:     #iterate through the column data types we saw above, e.g. ('C0', 'bigint')
 206 |         cname = col[0]        #splits out the column name, e.g. 'C0'    
 207 |         ctype = col[1]        #splits out the column type, e.g. 'bigint'
 208 |         nulls = c.where( c[cname].isNull()).count() #check count of null in column name
 209 |         result = tuple([cname, nulls])  #new tuple, (column name, null count)
 210 |         null_counts.append(result)      #put the new tuple in our result list
 211 |     null_counts=[(x,y) for (x,y) in null_counts if y!=0]  #view just columns that have missing values
 212 |     return null_counts
 213 | 
 214 | #Call function count_nulls and apply it to data train (df_final)
 215 | null_counts = count_nulls(df_final)
 216 | null_counts
 217 | 
 218 | #From null_counts, we just take information of columns name and save in list "list_cols_miss", like in the script below:
 219 | list_cols_miss=[x[0] for x in null_counts]
 220 | list_cols_miss
 221 | 
 222 | #Create dataframe which just has list_cols_miss
 223 | df_miss= df_final.select(*list_cols_miss)
 224 | df_miss.dtypes
 225 | 
 226 | #Define categorical columns and numerical columns which have missing value.
 227 | ### for categorical columns
 228 | catcolums_miss=[item[0] for item in df_miss.dtypes if item[1].startswith('string')]  #will select name of column with string data type
 229 | print("catcolums_miss:", catcolums_miss)
 230 | 
 231 | ### for numerical columns
 232 | numcolumns_miss = [item[0] for item in df_miss.dtypes if item[1].startswith('int') | item[1].startswith('double')] #will select name of column with integer or double data type
 233 | print("numcolumns_miss:", numcolumns_miss)
 234 | 
 235 | #Drop missing value
 236 | df_Nomiss=df_final.na.drop()
 237 | 
 238 | #fill missing value in categorical variable with most frequent
 239 | for x in catcolums_miss:
 240 |     mode=df_Nomiss.groupBy(x).count().sort(col("count").desc()).collect()[0][0] #group by based on categories and count each categories and sort descending then take the first value in column
 241 |     print(x, mode) #print name of columns and it's most categories 
 242 |     df_final = df_final.na.fill({x:mode}) #fill missing value in each columns with most frequent
 243 | 
 244 | #fill missing value in numerical variable with average
 245 | for i in numcolumns_miss:
 246 |     meanvalue = df_final.select(round(mean(i))).collect()[0][0] #calculate average in each numerical column
 247 |     print(i, meanvalue) #print name of columns and it's average value
 248 |     df_final=df_final.na.fill({i:meanvalue}) #fill missing value in each columns with it's average value
 249 |     
 250 | #Check Missing value after filling
 251 | null_counts = count_nulls(df_final)
 252 | null_counts
 253 | 
 254 | 
 255 | #Check Missing Value in data test
 256 | #We will cleansing missing values in pyspark dataframe.
 257 | #Call function to count missing values in test_data
 258 | null_test= count_nulls(test_data)
 259 | null_test
 260 | 
 261 | #take just name of columns that have missing values
 262 | list_miss_test=[x[0] for x in null_test]
 263 | list_miss_test
 264 | 
 265 | #Create dataframe which just has list_cols_miss
 266 | test_miss= test_data.select(*list_miss_test)
 267 | 
 268 | #view data types in df_miss
 269 | test_miss.dtypes
 270 | 
 271 | #Define categorical columns and numerical columns which have missing value.
 272 | ### for categorical columns
 273 | catcolums_miss_test=[item[0] for item in test_miss.dtypes if item[1].startswith('string')]  #will select name of column with string data type
 274 | print("catcolums_miss_test:", catcolums_miss_test)
 275 | 
 276 | ### for numerical columns
 277 | numcolumns_miss_test = [item[0] for item in test_miss.dtypes if item[1].startswith('int') | item[1].startswith('double')] #will select name of column with integer or double data type
 278 | print("numcolumns_miss_test:", numcolumns_miss_test)
 279 | 
 280 | #Drop missing value
 281 | test_Nomiss=test_data.na.drop()
 282 | 
 283 | #fill missing value in categorical variable with most frequent
 284 | for x in catcolums_miss_test:
 285 |     mode=test_Nomiss.groupBy(x).count().sort(col("count").desc()).collect()[0][0] #group by based on categories and count each categories and sort descending then take the first value in column
 286 |     print(x, mode) #print name of columns and it's most categories 
 287 |     test_data = test_data.na.fill({x:mode}) #fill missing value in each columns with most frequent
 288 | 
 289 | #fill missing value in numerical variable with average
 290 | for i in numcolumns_miss_test:
 291 |     meanvalue_test = test_data.select(round(mean(i))).collect()[0][0] #calculate average in each numerical column
 292 |     print(i, meanvalue_test) #print name of columns and it's average value
 293 |     test_data=test_data.na.fill({i:meanvalue_test}) #fill missing value in each columns with it's average value
 294 |     
 295 | #Check Missing value after filling
 296 | %time null_test = count_nulls(test_data)
 297 | null_test
 298 | 
 299 | 
 300 | #Compare categorical columns in df_final and test_data
 301 | #Function to check categorical columns in both data train and data test
 302 | def check_category2(a1,a2,y):
 303 |     """input are two dataframe you want to compare categorical variables and the colomn category name"""
 304 |     print('column:',y)
 305 |     #distinct1=a1.select([y]).distinct().count() #count distinct column in dataframe1
 306 |     #distinct2=a2.select([y]).distinct().count() #count distinct column in dataframe2
 307 |     #if distinct1 == distinct2:
 308 |     var1=a1.select([y]).distinct() #define distinct category in column in dataframe1
 309 |     var2=a2.select([y]).distinct() #define distinct category in column in dataframe2
 310 |     diff2=var2.subtract(var1).collect() #define the different category in dataframe2, return is list
 311 |     diff2=[r[y] for r in diff2] #just take the values
 312 |     diff1=var1.subtract(var2).collect() #define the different category in dataframe1, return is list
 313 |     diff1=[r[y] for r in diff1] #just take the values
 314 |     if diff1 == diff2:
 315 |         print('diff2:', diff2)
 316 |         print('diff1:', diff1)
 317 |         print('Columns match!!')
 318 |     else:
 319 |         if len(diff1)!=0 and len(diff2)==len(diff1):
 320 |             print('diff2:', diff2)
 321 |             print('diff1:', diff1)
 322 |             a2=a2.replace(diff2, diff1, y) #replace the different category in dataframe2 with category in dataframe1
 323 |             print('Columns match now!!')
 324 |         else:
 325 |             if len(diff2)!=len(diff1) and len(diff2)!=0:
 326 |                 print('diff2:', diff2)
 327 |                 print('diff1:', diff1)
 328 |                 dominant1=a1.groupBy(y).count().sort(col("count").desc()).collect()[0][0]
 329 |                 dominant2=a2.groupBy(y).count().sort(col("count").desc()).collect()[0][0] #define category dominant in dataframe2
 330 |                 print('dominant2:', dominant2)
 331 |                 print('dominant1:', dominant1)
 332 |                 a2=a2.replace(diff2, dominant1, y) #replace different category in dataframe2 with dominant category
 333 |                 print('Columns match now!!')
 334 |             else:     
 335 |                 print('diff1:', diff1)
 336 |                 print('diff2:', diff2)
 337 |     return a2
 338 | 
 339 | #call function to check catgories in data train and test, whether same or not, if not, the different categories will be replaced.
 340 | for y in cat_cols_test:
 341 |     test_data=check_category2(df_final,test_data,y)
 342 |   
 343 | 
 344 | #EDA
 345 | #Check distribution in each variables
 346 | #Pyspark dataframe has limitation in visualization. Then to create visualization we have to convert pyspark dataframe to pandas dataframe.
 347 | # convert spark dataframe to pandas for visualization
 348 | df_pd=df_final.toPandas()
 349 | 
 350 | #Barchart for categorical variable
 351 | plt.figure(figsize=(20,10))
 352 | plt.subplot(221)
 353 | sns.countplot(x='label', data=df_pd, order=df_pd['label'].value_counts().index)
 354 | plt.title('TARGET', fontsize=15)
 355 | plt.subplot(222)
 356 | sns.countplot(y='Field6', data=df_pd, order=df_pd['Field6'].value_counts().index)
 357 | plt.title('Field6', fontsize=15)
 358 | plt.subplot(223)
 359 | sns.countplot(x='Field12', data=df_pd, order=df_pd['Field12'].value_counts().index)
 360 | plt.title('Field12', fontsize=15)
 361 | plt.show()
 362 | 
 363 | #Barchart for categorical variable
 364 | plt.figure(figsize=(20,10))
 365 | plt.subplot(221)
 366 | sns.countplot(y='CoverageField8', data=df_pd, order=df_pd['CoverageField8'].value_counts().index)
 367 | plt.title('CoverageField8', fontsize=15)
 368 | plt.subplot(222)
 369 | sns.countplot(y='CoverageField9', data=df_pd, order=df_pd['CoverageField9'].value_counts().index)
 370 | plt.title('CoverageField9', fontsize=15)
 371 | plt.subplot(223)
 372 | sns.countplot(y='SalesField7', data=df_pd, order=df_pd['SalesField7'].value_counts().index)
 373 | plt.title('SalesField7', fontsize=15)
 374 | plt.show()
 375 | 
 376 | #Categorical vs Target visualization
 377 | pd.crosstab(df_pd['Field6'], df_pd['label'], normalize='index').plot.bar(rot=0, stacked=True,
 378 |             color=['green', 'red'], figsize=(4,4), title="Field6 VS label")
 379 | plt.legend(loc='upper right', bbox_to_anchor=(1.2, 1))
 380 | 
 381 | pd.crosstab(df_pd['Field12'], df_pd['label'], normalize='index').plot.bar(rot=0, stacked=True,                            
 382 |             color=['green', 'red'], figsize=(4,4), title="Field12 VS label")
 383 | plt.legend(loc='upper right', bbox_to_anchor=(1.2, 1))
 384 | plt.show()
 385 | 
 386 | #Numerical Variables
 387 | #We have 260 numerical variables, and we will plot just some variables.
 388 | #density plot Field7
 389 | #plt.figure(figsize=(24,5))
 390 | sns.distplot(df_pd['Field7'])
 391 | plt.show()
 392 | 
 393 | #Numerical vs Target visualization
 394 | #show distribution 'Field7' vs 'label'
 395 | #plt.figure(figsize=(20,8))
 396 | sns.kdeplot(df_pd[df_pd["label"]==0]["Field7"], label="0", color="green")
 397 | sns.kdeplot(df_pd[df_pd["label"]==1]["Field7"], label="1", color="red")
 398 | plt.title("Field7 VS label")
 399 | plt.show()
 400 | 
 401 | #Check outlier in numerical variable
 402 | df_pd[["Field7"]].boxplot(sym='g-*', grid=True)
 403 | plt.show()
 404 | 
 405 | 
 406 | #Insignificant Categories in Data train
 407 | #Define the threshold for insignificant categories
 408 | threshold=98
 409 | threshold2=0.7
 410 | 
 411 | #function to replace insignificant categories in data train
 412 | def replace_cat2(f,cols):
 413 |     """input are dataframe and categorical variables, replace insignificant categories (percentage <=0.7) with largest number
 414 |     of catgories and output is new dataframe """
 415 |     df_percent=f.groupBy(cols).count().sort(col("count").desc())\
 416 |                 .withColumn('total',sum(col('count')).over(window))\
 417 |                 .withColumn('Percent',col('count')*100/col('total')) #calculate the percentage-save in Percent columns from each categories
 418 |     dominant_cat=df_percent.select(df_percent['Percent']).collect()[0][0] #calculate the highest percentage of category
 419 |     count_dist=f.select([cols]).distinct().count() #calculate distinct values in that columns
 420 |     if count_dist > 2 and dominant_cat <= threshold :
 421 |         print('column:', cols)
 422 |         cols_names.append(cols)  #combine with previous list
 423 |         replacement=f.groupBy(cols).count().sort(col("count").desc()).collect()[0][0] #define dominant category 
 424 |         print("replacement:",replacement)
 425 |         replacing.append(replacement) #combine with previous list
 426 |         insign_cat=df_percent.filter(df_percent['Percent']< threshold2).select(df_percent[cols]).collect() #calculate insignificant categories
 427 |         insign_cat=[r[cols] for r in insign_cat] #just take the values
 428 |         category.append(insign_cat) #combine with previous list
 429 |         print("insign_cat:",insign_cat)
 430 |         f=f.replace(insign_cat,replacement, cols) #replace insignificant categories with dominant categories
 431 |     return f
 432 | 
 433 | #call function replacing insignificant categories in data train
 434 | replacing=[]
 435 | cols_names=[]
 436 | category=[]
 437 | for cols in cat_cols:
 438 |     df_final=replace_cat2(df_final,cols)
 439 | 
 440 | #check length in list cols_names, category and replacing
 441 | len(cols_names), len(category), len(replacing)
 442 | 
 443 | #Create dataframe of replaced categories
 444 | g=spark.createDataFrame(list(zip(cols_names, replacing, category)),['cols_names', 'replacing', 'category'])
 445 | g.show(9)
 446 | 
 447 | #Replacing Insignificant Categories in data test
 448 | #We already have a dataframe containing any categories that need to be replaced, 
 449 | #we got it when the process of replacing the insignificant categories in the data train, the data frame is called g. 
 450 | #Based on those information, insignificant categories on data test will be replaced.
 451 | cols_names_list=g.select('cols_names').collect() #select just cols_names from dataframe g
 452 | cols_names_list=[r['cols_names'] for r in cols_names_list] #take just the values
 453 | 
 454 | #function to replace insignificant categories in data test
 455 | for z in cols_names_list:
 456 |     print('cols_names:',z)
 457 |     replacement_cat=g.filter(g['cols_names']== z).select(g['replacing']).collect()[0][0] #select values of replacing columns accoring to z in cols_names 
 458 |     print('replacement_cat:', replacement_cat)
 459 |     insignificant_cat=g.filter(g['cols_names']== z).select(g['category']).collect()[0][0] #select values of category columns accoring to z in cols_names
 460 |     print('insignificant_cat:',insignificant_cat)
 461 |     test_data=test_data.replace(insignificant_cat,replacement_cat, z) #replace insignificant cat with replacement value
 462 |     
 463 | #Handle of outlier in data train
 464 | #Calculate Upper&Lower side in pandas dataframe
 465 | df_describe=df_pd.describe()
 466 | df_describe
 467 | 
 468 | #Calculate Upper&Lower side in pyspark dataframe
 469 | #create quantile dataframe
 470 | def quantile(e):
 471 |     """Input is dataframe and return new dataframe with value of quantile from numerical columns"""
 472 |     percentiles = [0.25, 0.5, 0.75]
 473 |     quant=spark.createDataFrame(zip(percentiles, *e.approxQuantile(num_cols, percentiles, 0.0)),
 474 |                                ['percentile']+num_cols) #calculate quantile from pyspark dataframe, 0.0 is relativeError,
 475 |                                                         #The relative target precision to achieve (>= 0). If set to zero, 
 476 |                                                         #the exact quantiles are computed, which could be very expensive
 477 |                                                         #and aggregate the result with percentiles variable, 
 478 |                                                         #then create pyspark dataframe
 479 |     return quant
 480 | 
 481 | #call quantile function 
 482 | %time quantile=quantile(df_sample)
 483 | 
 484 | #function to calculate uppler side
 485 | def upper_value(b,c):
 486 |     """Input is quantile dataframe and name of numerical column and Retrun upper value from the column"""
 487 |     q1 = b.select(c).collect()[0][0] #select value of q1 from the column
 488 |     q2 = b.select(c).collect()[1][0] #select value of q2 from the column
 489 |     q3 = b.select(c).collect()[2][0] #select value of q3 from the column
 490 |     IQR=q3-q1  #calculate the value of IQR
 491 |     upper= q3 + (IQR*1.5)   #calculate the value of upper side
 492 |     return upper
 493 | 
 494 | #function to calculate lower side
 495 | def lower_value(b,c):
 496 |     """Input is quantile dataframe and name of numerical column and Retrun lower value from the column"""
 497 |     q1 = b.select(c).collect()[0][0] #select value of q1 from the column
 498 |     q2 = b.select(c).collect()[1][0] #select value of q2 from the column
 499 |     q3 = b.select(c).collect()[2][0] #select value of q3 from the column
 500 |     IQR=q3-q1                   #calculate the value of IQR
 501 |     lower= q1 - (IQR*1.5)       #calculate the value of lower side
 502 |     return lower
 503 | 
 504 | #function for replacing outlier by upper side
 505 | def replce_outlier_up2(d,col, value):
 506 |     """Input is name of numerical column and it's upper side value"""
 507 |     d=d.withColumn(col, F.when(d[col] > value , value).otherwise(d[col]))
 508 |     return d
 509 | 
 510 | #function for replacing outlier with lower side
 511 | def replce_outlier_low2(d,col, value):
 512 |     """Input is name of numerical column and it's lower side value"""
 513 |     d=d.withColumn(col, F.when(d[col] < value , value).otherwise(d[col]))
 514 |     return d
 515 | 
 516 | #call function to calculate lower side and replace value under lower side with value lower side at all numerical variables
 517 | for i in num_cols:
 518 |     lower=lower_value(quantile,i)
 519 |     df_final=replce_outlier_low2(df_final, i, lower)
 520 |     
 521 | #call function to calculate upper side and replace value above upper side with value upper side at all numerical variables
 522 | for x in num_cols:
 523 |     upper=upper_value(quantile,x)
 524 |     df_final=replce_outlier_up2(df_final, x, upper)
 525 |     
 526 | #Handle of outlier in data test
 527 | #create quantile dataframe
 528 | def quantile(e):
 529 |     """Input is dataframe and return new dataframe with value of quantile from numerical columns"""
 530 |     percentiles = [0.25, 0.5, 0.75]
 531 |     quant=spark.createDataFrame(zip(percentiles, *e.approxQuantile(num_cols_test, percentiles, 0.0)),
 532 |                                ['percentile']+num_cols_test) #calculate quantile from pyspark dataframe, 0.0 is relativeError,
 533 |                                                         #The relative target precision to achieve (>= 0). If set to zero, 
 534 |                                                         #the exact quantiles are computed, which could be very expensive
 535 |                                                         #and aggregate the result with percentiles variable, 
 536 |                                                         #then create pyspark dataframe
 537 |     return quant
 538 | 
 539 | #call funtion quantile
 540 | quantile=quantile(test_sample)
 541 | 
 542 | #call function to calculate lower side and replace value under lower side with value lower side at all numerical variables
 543 | for i in num_cols_test:
 544 |     lower=lower_value(quantile,i)
 545 |     test_data=replce_outlier_low2(test_data, i, lower)
 546 |     
 547 | #call function to calculate upper side and replace value above upper side with value upper side at all numerical variables
 548 | for x in num_cols_test:
 549 |     upper=upper_value(quantile,x)
 550 |     test_data=replce_outlier_up2(test_data, x, upper)
 551 |     
 552 | #Feature Engineering
 553 | #function to check distinct categories in data train and data test
 554 | def check_distinct(a1,a2):
 555 |     """input are two dataframe that you want to compare categorical variables and the output is 
 556 |     total distinct categories in both dataframe"""
 557 |     total1=0
 558 |     total2=0
 559 |     for y in cat_cols:
 560 |         distinct1=a1.select([y]).distinct().count() #count distinct column in dataframe1
 561 |         distinct2=a2.select([y]).distinct().count() #count distinct column in dataframe2
 562 |         var1=a1.select([y]).distinct().collect() #define distinct category in column in dataframe1
 563 |         var1=[r[y] for r in var1]
 564 |         var2=a2.select([y]).distinct().collect()
 565 |         var2=[r[y] for r in var2]
 566 |         total1=total1+distinct1
 567 |         total2=total2+distinct2   
 568 |     return total1, total2  
 569 | 
 570 | #function to execute feature engineering
 571 | def feature_engineering(a1):    
 572 |     """Function for feature engineering (StringIndexer and OneHotEncoder process)"""
 573 |     cat_columns_string_vec = []
 574 |     for c in cat_cols:
 575 |         cat_columns_string= c+"_vec"
 576 |         cat_columns_string_vec.append(cat_columns_string)
 577 |     stringIndexer = [StringIndexer(inputCol=x, outputCol=x+"_Index")
 578 |                   for x in cat_cols]
 579 |     #use oneHotEncoder to convert categorical variable to binary
 580 |     encoder = [OneHotEncoder(inputCol=x+"_Index", outputCol=y)
 581 |            for x,y in zip(cat_cols, cat_columns_string_vec)]
 582 |     #create list of stringIndexer and encoder with 2 dimension
 583 |     tmp = [[i,j] for i,j in zip(stringIndexer, encoder)]
 584 |     tmp = [i for sublist in tmp for i in sublist]
 585 |     cols_assember=num_id + num_cols + cat_columns_string_vec
 586 |     assembler=VectorAssembler(inputCols=cols_assember, outputCol='features')
 587 |     tmp += [assembler]
 588 |     pipeline=Pipeline(stages=tmp)
 589 |     df_final_feat=pipeline.fit(a1).transform(a1)
 590 |     return df_final_feat
 591 | 
 592 | #fucntion to call fucntion feature_engineering and check_distinct
 593 | def Main_feature_engineering(df,df2): 
 594 |     """Function for calling check_distinct and feature_engineering. Then Join data train and data test if distinct categories 
 595 |     between data train and data test not same then do feature engineering, If distinct same will do feature engineering data train
 596 |     and data test separately"""
 597 |     dist_total1, dist_total2=check_distinct(df,df2)   
 598 |     if dist_total1!=dist_total2:
 599 |         Label_df=df.select('Id', 'label')
 600 |         df_final2=df.drop('label')
 601 |         all_df =df_final2.union(df2)
 602 |         all_df_feat=feature_engineering(all_df)
 603 |         id_train=df.select('Id').collect()
 604 |         id_train=[r['Id'] for r in id_train]
 605 |         id_test=df2.select('Id').collect()
 606 |         id_test=[r['Id'] for r in id_test]
 607 |         a=all_df_feat.filter(all_df['Id'].isin(id_train))
 608 |         b=all_df_feat.filter(all_df['Id'].isin(id_test))
 609 |         a=a.join(Label_df, 'Id')
 610 |     else:
 611 |         a=feature_engineering(df)
 612 |         b=feature_engineering(df2)        
 613 |     return a,b
 614 | 
 615 | #call function feature engineering
 616 | %time data2, test2=Main_feature_engineering(df_final, test_data)
 617 | 
 618 | #view result of feature engineering in data train
 619 | data2.select('Id', 'features').show(5)
 620 | 
 621 | #view result of feature engineering in data test
 622 | test2.select('Id', 'features').show(5)
 623 | 
 624 | #Split Data train to train and test
 625 | #Split df_final to train and test, train 70% and test 30%. Define seed 24 so the random data that we split will not change.
 626 | #we can define seed with any value
 627 | data_train, data_test=data2.randomSplit([0.7,0.3], 24)
 628 | 
 629 | 
 630 | #Modelling & Evaluation
 631 | #Logistic Regression
 632 | #Create logistic regression model to data train
 633 | lr=LogisticRegression(featuresCol='features', labelCol='label')
 634 | lr_model = lr.fit(data_train)
 635 | 
 636 | #Transform model to data test
 637 | lr_result = lr_model.transform(data_test)
 638 | 
 639 | #view id, label, prediction and probability from result of modelling
 640 | lr_result.select('Id', 'label', 'prediction', 'probability').show(5)
 641 | 
 642 | #Logistic Regression Evaluation
 643 | #Evaluate model by checking accuracy and AUC value
 644 | lr_eval = BinaryClassificationEvaluator(rawPredictionCol="probability", labelCol="label")
 645 | lr_eval2= MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="label")
 646 | lr_AUC  = lr_eval.evaluate(lr_result)
 647 | lr_ACC  = lr_eval2.evaluate(lr_result, {lr_eval2.metricName:"accuracy"})
 648 | 
 649 | print("Logistic Regression Performance Measure")
 650 | print("Accuracy = %0.2f" % lr_ACC)
 651 | print("AUC = %.2f" % lr_AUC)
 652 | 
 653 | #ROC Grafik
 654 | #Create ROC grafik from lr_result
 655 | PredAndLabels           = lr_result.select("probability", "label")
 656 | PredAndLabels_collect   = PredAndLabels.collect()
 657 | PredAndLabels_list      = [(float(i[0][0]), 1.0-float(i[1])) for i in PredAndLabels_collect]
 658 | PredAndLabels           = sc.parallelize(PredAndLabels_list)
 659 | 
 660 | metrics = BinaryClassificationMetrics(PredAndLabels)
 661 | 
 662 | # Area under ROC
 663 | print("Logistic Regression Area Under ROC")
 664 | print("Area under ROC = %.2f" % metrics.areaUnderROC)
 665 | 
 666 | # Visualization
 667 | FPR = dict()                                                        # FPR: False Positive Rate
 668 | tpr = dict()                                                        # TPR: True Positive Rate
 669 | roc_auc = dict()
 670 |  
 671 | y_test = [i[1] for i in PredAndLabels_list]
 672 | y_score = [i[0] for i in PredAndLabels_list]
 673 |  
 674 | fpr, tpr, _ = roc_curve(y_test, y_score)
 675 | roc_auc = auc(fpr, tpr)
 676 |  
 677 | plt.figure(figsize=(5,4))
 678 | plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
 679 | plt.plot([0, 1], [0, 1], 'k--')
 680 | plt.xlim([0.0, 1.0])
 681 | plt.ylim([0.0, 1.05])
 682 | plt.xlabel('False Positive Rate')
 683 | plt.ylabel('True Positive Rate')
 684 | plt.title('ROC Curve - Logistic Regression')
 685 | plt.legend(loc="lower right")
 686 | plt.show()
 687 | 
 688 | #confusion Matrix
 689 | cm_lr_result = lr_result.crosstab("prediction", "label")
 690 | cm_lr_result = cm_lr_result.toPandas()
 691 | cm_lr_result
 692 | 
 693 | #calculate Accuracy, Sensitivity, Specificity, Precision
 694 | TP = cm_lr_result["1"][0]
 695 | FP = cm_lr_result["0"][0]
 696 | TN = cm_lr_result["0"][1]
 697 | FN = cm_lr_result["1"][1]
 698 | Accuracy = (TP+TN)/(TP+FP+TN+FN)
 699 | Sensitivity = TP/(TP+FN)
 700 | Specificity = TN/(TN+FP)
 701 | Precision = TP/(TP+FP)
 702 | 
 703 | print ("Accuracy = %0.2f" %Accuracy )
 704 | print ("Sensitivity = %0.2f" %Sensitivity )
 705 | print ("Specificity = %0.2f" %Specificity )
 706 | print ("Precision = %0.2f" %Precision )
 707 | 
 708 | #Calculate Gini Coefficient from AUC
 709 | AUC = lr_AUC
 710 | Gini = (2 * AUC - 1)
 711 | print("AUC=%.2f" % AUC)
 712 | print("GINI ~=%.2f" % Gini)
 713 | 
 714 | #Calculate Log Loss in pandas dataframe
 715 | #Create Dataframe to Calculate Log Loss
 716 | y_test= data_test.select('label')
 717 | lr_proba=lr_result.select('probability')
 718 | 
 719 | #Convert lr_probaspark dataframe to numpy array
 720 | lr_proba= np.array(lr_result.select('probability').collect())
 721 | 
 722 | #Convert numpy array 3 dimentional to 2 dimentional
 723 | lr_proba=lr_proba.reshape(-1, lr_proba.shape[-1])
 724 | 
 725 | #Convert y_test dataframe to pandas dataframe
 726 | y_test=y_test.toPandas()
 727 | 
 728 | #Convert y_test pandas dataframe to pandas series
 729 | y_test=pd.Series(y_test['label'].values)
 730 | 
 731 | #Calculate log loss from logistic regression
 732 | LogLoss = log_loss(y_test, lr_proba) 
 733 | 
 734 | print("Log Loss Linear Regression:%.4f" % LogLoss)
 735 | 
 736 | #Logistic Regression With Hyper-Parameter Tuning
 737 | #define logistic regression model
 738 | lr_hyper=LogisticRegression(featuresCol='features', labelCol='label')
 739 | 
 740 | 
 741 | #Hyper-Parameter Tuning
 742 | paramGrid_lr = ParamGridBuilder() \
 743 |     .addGrid(lr_hyper.regParam, [0.1, 0.01]) \
 744 |     .addGrid(lr_hyper.elasticNetParam, [0.8, 0.7]) \
 745 |     .build()
 746 | crossval_lr = CrossValidator(estimator=lr_hyper,
 747 |                              estimatorParamMaps=paramGrid_lr,
 748 |                              evaluator=BinaryClassificationEvaluator(),
 749 |                              numFolds=3)
 750 | #fit model to data train
 751 | lr_model_hyper = crossval_lr.fit(data_train)
 752 | 
 753 | #Transform model to data test
 754 | lr_result_hyper = lr_model_hyper.transform(data_test)
 755 | 
 756 | #view id, label, prediction and probability from result of modelling
 757 | lr_result_hyper.select('Id', 'label', 'prediction', 'probability').show(5)
 758 | 
 759 | #Logistic Regression With Hyper-Parameter Tuning Evaluation
 760 | #Evaluate model by checking accuracy and AUC value
 761 | lr_hyper_eval = BinaryClassificationEvaluator(rawPredictionCol="probability", labelCol="label")
 762 | lr_hyper_eval2= MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="label")
 763 | lr_hyper_AUC  = lr_hyper_eval.evaluate(lr_result_hyper)
 764 | lr_hyper_ACC  = lr_hyper_eval2.evaluate(lr_result_hyper, {lr_hyper_eval2.metricName:"accuracy"})
 765 | 
 766 | print("Logistic Regression Performance Measure")
 767 | print("Accuracy = %0.2f" % lr_hyper_ACC)
 768 | print("AUC = %.2f" % lr_hyper_AUC)
 769 | 
 770 | #ROC Grafik
 771 | PredAndLabels           = lr_result_hyper.select("probability", "label")
 772 | PredAndLabels_collect   = PredAndLabels.collect()
 773 | PredAndLabels_list      = [(float(i[0][0]), 1.0-float(i[1])) for i in PredAndLabels_collect]
 774 | PredAndLabels           = sc.parallelize(PredAndLabels_list)
 775 | 
 776 | metrics = BinaryClassificationMetrics(PredAndLabels)
 777 | 
 778 | # Area under ROC
 779 | print("Logistic Regression Area Under ROC")
 780 | print("Area under ROC = %.2f" % metrics.areaUnderROC)
 781 | 
 782 | # Visualization
 783 | FPR = dict()                                                        # FPR: False Positive Rate
 784 | tpr = dict()                                                        # TPR: True Positive Rate
 785 | roc_auc = dict()
 786 |  
 787 | y_test = [i[1] for i in PredAndLabels_list]
 788 | y_score = [i[0] for i in PredAndLabels_list]
 789 |  
 790 | fpr, tpr, _ = roc_curve(y_test, y_score)
 791 | roc_auc = auc(fpr, tpr)
 792 |  
 793 | plt.figure(figsize=(5,4))
 794 | plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
 795 | plt.plot([0, 1], [0, 1], 'k--')
 796 | plt.xlim([0.0, 1.0])
 797 | plt.ylim([0.0, 1.05])
 798 | plt.xlabel('False Positive Rate')
 799 | plt.ylabel('True Positive Rate')
 800 | plt.title('ROC Curve - Logistic Regression')
 801 | plt.legend(loc="lower right")
 802 | plt.show()
 803 | 
 804 | #confusion matrix
 805 | cm_lr_result_hyper = lr_result_hyper.crosstab("prediction", "label")
 806 | cm_lr_result_hyper = cm_lr_result_hyper.toPandas()
 807 | cm_lr_result_hyper
 808 | 
 809 | #calculate Accuracy, Sensitivity, Specificity, Precision
 810 | TP = cm_lr_result_hyper["1"][0]
 811 | FP = cm_lr_result_hyper["0"][0]
 812 | TN = cm_lr_result_hyper["0"][1]
 813 | FN = cm_lr_result_hyper["1"][1]
 814 | Accuracy = (TP+TN)/(TP+FP+TN+FN)
 815 | Sensitivity = TP/(TP+FN)
 816 | Specificity = TN/(TN+FP)
 817 | Precision = TP/(TP+FP)
 818 | 
 819 | print ("Accuracy = %0.2f" %Accuracy )
 820 | print ("Sensitivity = %0.2f" %Sensitivity )
 821 | print ("Specificity = %0.2f" %Specificity )
 822 | print ("Precision = %0.2f" %Precision )
 823 | 
 824 | #Calculate Gini Coefisient from AUC
 825 | AUC = lr_hyper_AUC
 826 | Gini_lr_hyper = (2 * AUC - 1)
 827 | print("AUC=%.2f" % AUC)
 828 | print("GINI ~=%.2f" % Gini_lr_hyper)
 829 | 
 830 | #Calculate Log Loss in pandas dataframe
 831 | #Create Dataframe to Calculate Log Loss
 832 | y_test= titanic_test.select('label')
 833 | lr_hyper_proba=lr_result_hyper.select('probability')
 834 | 
 835 | #Convert lr_probaspark dataframe to numpy array
 836 | lr_hyper_proba= np.array(lr_hyper_proba.select('probability').collect())
 837 | 
 838 | #Convert numpy array 3 dimentional to 2 dimentional
 839 | lr_hyper_proba=lr_hyper_proba.reshape(-1, lr_hyper_proba.shape[-1])
 840 | 
 841 | #Convert y_test dataframe to pandas dataframe
 842 | y_test=y_test.toPandas()
 843 | 
 844 | #Convert y_test pandas dataframe to pandas series
 845 | y_test=pd.Series(y_test['label'].values)
 846 | 
 847 | #Calculate log loss from logistic regression hyper parameter
 848 | LogLoss = log_loss(y_test, lr_hyper_proba) 
 849 | 
 850 | print("Log Loss Linear Regression:%.4f" % LogLoss)
 851 | 
 852 | 
 853 | #Decision Tree
 854 | #Create decision tree model to data train
 855 | dt=DecisionTreeClassifier(featuresCol = 'features', labelCol = 'label', maxDepth = 3)
 856 | dt_model = dt.fit(data_train)
 857 | 
 858 | ##Transform model to data test
 859 | dt_result = dt_model.transform(data_test)
 860 | 
 861 | #view id, label, prediction and probability from result of modelling
 862 | dt_result.select('Id', 'label', 'prediction', 'probability').show(5)
 863 | 
 864 | # Decision Tree Evaluation
 865 | #Evaluate model by calculating accuracy and area under curve (AUC)
 866 | dt_eval = BinaryClassificationEvaluator(rawPredictionCol="probability", labelCol="label")
 867 | dt_eval2= MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="label")
 868 | dt_AUC  = dt_eval.evaluate(dt_result)
 869 | dt_ACC  = dt_eval2.evaluate(dt_result, {dt_eval2.metricName:"accuracy"})
 870 | 
 871 | print("Decision Tree Performance Measure")
 872 | print("Accuracy = %0.2f" % dt_ACC)
 873 | print("AUC = %.2f" % dt_AUC)
 874 | 
 875 | #ROC Grafik
 876 | PredAndLabels           = dt_result.select("probability", "label")
 877 | PredAndLabels_collect   = PredAndLabels.collect()
 878 | PredAndLabels_list      = [(float(i[0][0]), 1.0-float(i[1])) for i in PredAndLabels_collect]
 879 | PredAndLabels           = sc.parallelize(PredAndLabels_list)
 880 | 
 881 | metrics = BinaryClassificationMetrics(PredAndLabels)
 882 | 
 883 | # Area under ROC
 884 | print("Decision Tree Area Under ROC")
 885 | print("Area under ROC = %.2f" % metrics.areaUnderROC)
 886 | 
 887 | # Visualization
 888 | FPR = dict()                                                        # FPR: False Positive Rate
 889 | tpr = dict()                                                        # TPR: True Positive Rate
 890 | roc_auc = dict()
 891 |  
 892 | y_test = [i[1] for i in PredAndLabels_list]
 893 | y_score = [i[0] for i in PredAndLabels_list]
 894 |  
 895 | fpr, tpr, _ = roc_curve(y_test, y_score)
 896 | roc_auc = auc(fpr, tpr)
 897 |  
 898 | plt.figure(figsize=(5,4))
 899 | plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
 900 | plt.plot([0, 1], [0, 1], 'k--')
 901 | plt.xlim([0.0, 1.0])
 902 | plt.ylim([0.0, 1.05])
 903 | plt.xlabel('False Positive Rate')
 904 | plt.ylabel('True Positive Rate')
 905 | plt.title('ROC Curve - Decision Tree')
 906 | plt.legend(loc="lower right")
 907 | plt.show()
 908 | 
 909 | #confusion matrix
 910 | cm_dt_result = dt_result.crosstab("prediction", "label")
 911 | cm_dt_result = cm_dt_result.toPandas()
 912 | cm_dt_result
 913 | 
 914 | #calculate accuracy, sensitivity, specificity and precision
 915 | TP = cm_dt_result["1"][0]
 916 | FP = cm_dt_result["0"][0]
 917 | TN = cm_dt_result["0"][1]
 918 | FN = cm_dt_result["1"][1]
 919 | Accuracy = (TP+TN)/(TP+FP+TN+FN)
 920 | Sensitivity = TP/(TP+FN)
 921 | Specificity = TN/(TN+FP)
 922 | Precision = TP/(TP+FP)
 923 | 
 924 | print ("Accuracy = %0.2f" %Accuracy )
 925 | print ("Sensitivity = %0.2f" %Sensitivity )
 926 | print ("Specificity = %0.2f" %Specificity )
 927 | print ("Precision = %0.2f" %Precision )
 928 | 
 929 | #Calculate Gini Coeffiecient from AUC
 930 | AUC = dt_AUC
 931 | Gini_dt = (2 * AUC - 1)
 932 | print("AUC=%.2f" % AUC)
 933 | print("GINI ~=%.2f" % Gini_dt)
 934 | 
 935 | #Calculate Log Loss in pandas dataframe
 936 | #Create Dataframe to Calculate Log Loss
 937 | y_test= data_test.select('label')
 938 | dt_proba=dt_result.select('probability')
 939 | 
 940 | ##Convert lr_probaspark dataframe to numpy array
 941 | dt_proba= np.array(dt_proba.select('probability').collect())
 942 | 
 943 | #Convert numpy array 3 dimentional to 2 dimentional
 944 | dt_proba=dt_proba.reshape(-1, dt_proba.shape[-1])
 945 | 
 946 | #Convert y_test dataframe to pandas dataframe
 947 | y_test=y_test.toPandas()
 948 | 
 949 | #Convert y_test pandas dataframe to pandas series
 950 | y_test=pd.Series(y_test['label'].values)
 951 | 
 952 | #Calculate log loss from Decision Tree
 953 | LogLoss = log_loss(y_test, dt_proba) 
 954 | 
 955 | print("Log Loss Decision Tree:%.4f" % LogLoss)
 956 | 
 957 | #Decision Tree With Hyper-Parameter Tuning
 958 | #define decision tree model
 959 | dt_hyper=DecisionTreeClassifier(featuresCol = 'features', labelCol = 'label', impurity='gini')
 960 | 
 961 | #Hyper-Parameter Tuning
 962 | paramGrid_dt = ParamGridBuilder() \
 963 |     .addGrid(dt_hyper.maxDepth, [5, 7]) \
 964 |     .addGrid(dt_hyper.maxBins, [10,20]) \
 965 |     .build()
 966 | crossval_dt = CrossValidator(estimator=dt_hyper,
 967 |                              estimatorParamMaps=paramGrid_dt,
 968 |                              evaluator=BinaryClassificationEvaluator(),
 969 |                              numFolds=5)
 970 | #fit model to data train
 971 | dt_model_hyper = crossval_dt.fit(data_train)
 972 | 
 973 | #transform model to data test
 974 | dt_result_hyper = dt_model_hyper.transform(data_test)
 975 | 
 976 | #view id, label, prediction and probability from result of modelling 
 977 | dt_result_hyper.select('Id', 'label', 'prediction', 'probability').show(5)
 978 | 
 979 | #Decision Tree With Hyper-Parameter Tuning Evaluation
 980 | #Evaluate model by calculating accuracy and area under curve (AUC)
 981 | dt_hyper_eval = BinaryClassificationEvaluator(rawPredictionCol="probability", labelCol="label")
 982 | dt_hyper_eval2= MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="label")
 983 | dt_hyper_AUC  = dt_hyper_eval.evaluate(dt_result_hyper)
 984 | dt_hyper_ACC  = dt_hyper_eval2.evaluate(dt_result_hyper, {dt_hyper_eval2.metricName:"accuracy"})
 985 | 
 986 | print("Decision Tree Performance Measure")
 987 | print("Accuracy = %0.2f" % dt_hyper_ACC)
 988 | print("AUC = %.2f" % dt_hyper_AUC)
 989 | 
 990 | #ROC Grafik
 991 | PredAndLabels           = dt_result_hyper.select("probability", "label")
 992 | PredAndLabels_collect   = PredAndLabels.collect()
 993 | PredAndLabels_list      = [(float(i[0][0]), 1.0-float(i[1])) for i in PredAndLabels_collect]
 994 | PredAndLabels           = sc.parallelize(PredAndLabels_list)
 995 | 
 996 | metrics = BinaryClassificationMetrics(PredAndLabels)
 997 | 
 998 | # Area under ROC
 999 | print("Decision Tree Area Under ROC")
1000 | print("Area under ROC = %.2f" % metrics.areaUnderROC)
1001 | 
1002 | # Visualization
1003 | FPR = dict()                                                        # FPR: False Positive Rate
1004 | tpr = dict()                                                        # TPR: True Positive Rate
1005 | roc_auc = dict()
1006 |  
1007 | y_test = [i[1] for i in PredAndLabels_list]
1008 | y_score = [i[0] for i in PredAndLabels_list]
1009 |  
1010 | fpr, tpr, _ = roc_curve(y_test, y_score)
1011 | roc_auc = auc(fpr, tpr)
1012 |  
1013 | plt.figure(figsize=(5,4))
1014 | plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
1015 | plt.plot([0, 1], [0, 1], 'k--')
1016 | plt.xlim([0.0, 1.0])
1017 | plt.ylim([0.0, 1.05])
1018 | plt.xlabel('False Positive Rate')
1019 | plt.ylabel('True Positive Rate')
1020 | plt.title('ROC Curve - Decision Tree')
1021 | plt.legend(loc="lower right")
1022 | plt.show()
1023 | 
1024 | #Confusion Matrix
1025 | cm_dt_result_hyper = dt_result_hyper.crosstab("prediction", "label")
1026 | cm_dt_result_hyper = cm_dt_result_hyper.toPandas()
1027 | cm_dt_result_hyper
1028 | 
1029 | #calculate accuracy, sensitivity, specificity and precision
1030 | TP = cm_dt_result_hyper["1"][0]
1031 | FP = cm_dt_result_hyper["0"][0]
1032 | TN = cm_dt_result_hyper["0"][1]
1033 | FN = cm_dt_result_hyper["1"][1]
1034 | Accuracy = (TP+TN)/(TP+FP+TN+FN)
1035 | Sensitivity = TP/(TP+FN)
1036 | Specificity = TN/(TN+FP)
1037 | Precision = TP/(TP+FP)
1038 | 
1039 | print ("Accuracy = %0.2f" %Accuracy )
1040 | print ("Sensitivity = %0.2f" %Sensitivity )
1041 | print ("Specificity = %0.2f" %Specificity )
1042 | print ("Precision = %0.2f" %Precision )
1043 | 
1044 | #Calculate Gini Coefficient from AUC
1045 | AUC = dt_hyper_AUC
1046 | Gini_dt_hyper= (2 * AUC -1)
1047 | 
1048 | print("AUC=%.2f" % AUC)
1049 | print("GINI ~=%.2f" % Gini_dt_hyper)
1050 | 
1051 | #Calculate Log Loss in pandas dataframe
1052 | #Create Dataframe to Calculate Log Loss
1053 | y_test= data_test.select('label')
1054 | dt_hyper_proba=dt_result_hyper.select('probability')
1055 | 
1056 | #Convert lr_probaspark dataframe to numpy array
1057 | dt_hyper_proba= np.array(dt_hyper_proba.select('probability').collect())
1058 | 
1059 | #Convert numpy array 3 dimentional to 2 dimentional
1060 | dt_hyper_proba=dt_hyper_proba.reshape(-1, dt_hyper_proba.shape[-1])
1061 | 
1062 | #Convert y_test dataframe to pandas dataframe
1063 | y_test=y_test.toPandas()
1064 | 
1065 | #Convert y_test pandas dataframe to pandas series
1066 | y_test=pd.Series(y_test['label'].values)
1067 | 
1068 | #Calculate log loss from Decision Tree hyper parameter
1069 | LogLoss = log_loss(y_test, dt_hyper_proba) 
1070 | 
1071 | print("Log Loss Decision Tree:%.4f" % LogLoss)
1072 | 
# Random Forest
# Fit a random forest model on the training split.
rf = RandomForestClassifier(featuresCol='features', labelCol="label")
rf_model = rf.fit(data_train)

# Score the hold-out split.
rf_result = rf_model.transform(data_test)

# Preview id, label, prediction and probability of the scored rows.
rf_result.select('Id', 'label', 'prediction', 'probability').show(5)

# Random Forest Evaluation
# AUC comes from the binary evaluator, accuracy from the multiclass evaluator.
rf_eval = BinaryClassificationEvaluator(rawPredictionCol="probability", labelCol="label")
rf_eval2 = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="label")
rf_AUC = rf_eval.evaluate(rf_result)
rf_ACC = rf_eval2.evaluate(rf_result, {rf_eval2.metricName: "accuracy"})

# Heading corrected: these figures belong to the random forest, not the decision tree.
print("Random Forest Performance Measure")
print("Accuracy = %0.2f" % rf_ACC)
print("AUC = %.2f" % rf_AUC)

# ROC curve
# Collect (probability, label) once; the same rows feed the ROC curve and the log loss below.
PredAndLabels = rf_result.select("probability", "label")
PredAndLabels_collect = PredAndLabels.collect()
# The score is the first entry of the probability vector, so the label is flipped
# (1 - label) to stay consistent with it.
PredAndLabels_list = [(float(i[0][0]), 1.0 - float(i[1])) for i in PredAndLabels_collect]
PredAndLabels = sc.parallelize(PredAndLabels_list)

metrics = BinaryClassificationMetrics(PredAndLabels)

# Area under ROC
print("Random Forest Area Under ROC")
print("Area under ROC = %.2f" % metrics.areaUnderROC)

# Visualization
y_test = [i[1] for i in PredAndLabels_list]
y_score = [i[0] for i in PredAndLabels_list]

# fpr: false positive rate, tpr: true positive rate
fpr, tpr, _ = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(5, 4))
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve - Random Forest')
plt.legend(loc="lower right")
plt.show()

# Confusion matrix
# crosstab() does not promise a row order, so the rows are sorted to put prediction 1.0
# at index 0 and prediction 0.0 at index 1, the layout the lookups below rely on.
cm_rf_result = rf_result.crosstab("prediction", "label").toPandas()
cm_rf_result = cm_rf_result.sort_values("prediction_label", ascending=False).reset_index(drop=True)
print(cm_rf_result)

# Accuracy, sensitivity, specificity and precision from the confusion matrix
TP = cm_rf_result["1"][0]
FP = cm_rf_result["0"][0]
TN = cm_rf_result["0"][1]
FN = cm_rf_result["1"][1]
Accuracy = (TP + TN) / (TP + FP + TN + FN)
Sensitivity = TP / (TP + FN)
Specificity = TN / (TN + FP)
Precision = TP / (TP + FP)

print("Accuracy = %0.2f" % Accuracy)
print("Sensitivity = %0.2f" % Sensitivity)
print("Specificity = %0.2f" % Specificity)
print("Precision = %0.2f" % Precision)

# Gini coefficient derived from AUC
AUC = rf_AUC
Gini_rf = (2 * AUC - 1)

print("AUC=%.2f" % AUC)
print("GINI ~=%.2f" % Gini_rf)

# Log loss
# Labels and probabilities are taken from the same collected rows so they stay aligned
# row by row; two independent collect() calls are not guaranteed to return the same order.
rf_proba = np.array([row["probability"].toArray() for row in PredAndLabels_collect])
y_test = pd.Series([row["label"] for row in PredAndLabels_collect])

LogLoss = log_loss(y_test, rf_proba)

print("Log Loss Random Forest:%.4f" % LogLoss)
1177 | 
# Random Forest With Hyper-Parameter Tuning
# Define the random forest model to be tuned.
rf_hyper = RandomForestClassifier(featuresCol='features', labelCol="label")

# Hyper-Parameter Tuning
# Grid over numTrees only, scored with 3-fold cross validation.
paramGrid_rf = (
    ParamGridBuilder()
    .addGrid(rf_hyper.numTrees, [40, 60, 80, 100])
    .build()
)
crossval_rf = CrossValidator(estimator=rf_hyper,
                             estimatorParamMaps=paramGrid_rf,
                             evaluator=BinaryClassificationEvaluator(),
                             numFolds=3)
# Fit the cross-validated model on the training split.
rf_model_hyper = crossval_rf.fit(data_train)

# Score the hold-out split.
rf_result_hyper = rf_model_hyper.transform(data_test)

# Preview id, label, prediction and probability of the scored rows.
rf_result_hyper.select('Id', 'label', 'prediction', 'probability').show(5)

# Random Forest With Hyper-Parameter Tuning Evaluation
# AUC comes from the binary evaluator, accuracy from the multiclass evaluator.
rf_hyper_eval = BinaryClassificationEvaluator(rawPredictionCol="probability", labelCol="label")
rf_hyper_eval2 = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="label")
rf_hyper_AUC = rf_hyper_eval.evaluate(rf_result_hyper)
rf_hyper_ACC = rf_hyper_eval2.evaluate(rf_result_hyper, {rf_hyper_eval2.metricName: "accuracy"})

# Heading corrected: these figures belong to the random forest, not the decision tree.
print("Random Forest Performance Measure")
print("Accuracy = %0.2f" % rf_hyper_ACC)
print("AUC = %.2f" % rf_hyper_AUC)

# ROC curve
# Collect (probability, label) once; the same rows feed the ROC curve and the log loss below.
PredAndLabels = rf_result_hyper.select("probability", "label")
PredAndLabels_collect = PredAndLabels.collect()
# The score is the first entry of the probability vector, so the label is flipped
# (1 - label) to stay consistent with it.
PredAndLabels_list = [(float(i[0][0]), 1.0 - float(i[1])) for i in PredAndLabels_collect]
PredAndLabels = sc.parallelize(PredAndLabels_list)

metrics = BinaryClassificationMetrics(PredAndLabels)

# Area under ROC
print("Random Forest Area Under ROC")
print("Area under ROC = %.2f" % metrics.areaUnderROC)

# Visualization
y_test = [i[1] for i in PredAndLabels_list]
y_score = [i[0] for i in PredAndLabels_list]

# fpr: false positive rate, tpr: true positive rate
fpr, tpr, _ = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(5, 4))
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve - Random Forest')
plt.legend(loc="lower right")
plt.show()

# Confusion matrix
# crosstab() does not promise a row order, so the rows are sorted to put prediction 1.0
# at index 0 and prediction 0.0 at index 1, the layout the lookups below rely on.
cm_rf_result_hyper = rf_result_hyper.crosstab("prediction", "label").toPandas()
cm_rf_result_hyper = cm_rf_result_hyper.sort_values("prediction_label", ascending=False).reset_index(drop=True)
print(cm_rf_result_hyper)

# Accuracy, sensitivity, specificity and precision from the confusion matrix
TP = cm_rf_result_hyper["1"][0]
FP = cm_rf_result_hyper["0"][0]
TN = cm_rf_result_hyper["0"][1]
FN = cm_rf_result_hyper["1"][1]
Accuracy = (TP + TN) / (TP + FP + TN + FN)
Sensitivity = TP / (TP + FN)
Specificity = TN / (TN + FP)
Precision = TP / (TP + FP)

print("Accuracy = %0.2f" % Accuracy)
print("Sensitivity = %0.2f" % Sensitivity)
print("Specificity = %0.2f" % Specificity)
print("Precision = %0.2f" % Precision)

# Gini coefficient derived from AUC
AUC = rf_hyper_AUC
Gini_rf_hyper = (2 * AUC - 1)

print("AUC=%.2f" % AUC)
print("GINI ~=%.2f" % Gini_rf_hyper)

# Log loss
# Labels and probabilities are taken from the same collected rows so they stay aligned
# row by row; two independent collect() calls are not guaranteed to return the same order.
rf_hyper_proba = np.array([row["probability"].toArray() for row in PredAndLabels_collect])
y_test = pd.Series([row["label"] for row in PredAndLabels_collect])

LogLoss = log_loss(y_test, rf_hyper_proba)

print("Log Loss Random Forest:%.4f" % LogLoss)
1292 | 
# Gradient Boosting
# Fit a gradient boosted tree model (10 iterations) on the training split.
gbt = GBTClassifier(featuresCol="features", labelCol="label", maxIter=10)
gbt_model = gbt.fit(data_train)

# Score the hold-out split.
gbt_result = gbt_model.transform(data_test)

# Preview id, label, prediction and probability of the scored rows.
gbt_result.select('Id', 'label', 'prediction', 'probability').show(5)

# Gradient Boosting Evaluation
# AUC comes from the binary evaluator, accuracy from the multiclass evaluator.
gbt_eval = BinaryClassificationEvaluator(rawPredictionCol="probability", labelCol="label")
gbt_eval2 = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="label")
gbt_AUC = gbt_eval.evaluate(gbt_result)
gbt_ACC = gbt_eval2.evaluate(gbt_result, {gbt_eval2.metricName: "accuracy"})

print("Gradient Boosted Tree Performance Measure")
print("Accuracy = %0.2f" % gbt_ACC)
print("AUC = %.2f" % gbt_AUC)

# ROC curve
# Collect (probability, label) once; the same rows feed the ROC curve and the log loss below.
PredAndLabels = gbt_result.select("probability", "label")
PredAndLabels_collect = PredAndLabels.collect()
# The score is the first entry of the probability vector, so the label is flipped
# (1 - label) to stay consistent with it.
PredAndLabels_list = [(float(i[0][0]), 1.0 - float(i[1])) for i in PredAndLabels_collect]
PredAndLabels = sc.parallelize(PredAndLabels_list)

metrics = BinaryClassificationMetrics(PredAndLabels)

# Area under ROC
print("Gradient Boosting Area Under ROC")
print("Area under ROC = %.2f" % metrics.areaUnderROC)

# Visualization
y_test = [i[1] for i in PredAndLabels_list]
y_score = [i[0] for i in PredAndLabels_list]

# fpr: false positive rate, tpr: true positive rate
fpr, tpr, _ = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(5, 4))
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve - Gradient Boosting')
plt.legend(loc="lower right")
plt.show()

# Confusion matrix
# crosstab() does not promise a row order, so the rows are sorted to put prediction 1.0
# at index 0 and prediction 0.0 at index 1, the layout the lookups below rely on.
cm_gbt_result = gbt_result.crosstab("prediction", "label").toPandas()
cm_gbt_result = cm_gbt_result.sort_values("prediction_label", ascending=False).reset_index(drop=True)
print(cm_gbt_result)

# Accuracy, sensitivity, specificity and precision from the confusion matrix
TP = cm_gbt_result["1"][0]
FP = cm_gbt_result["0"][0]
TN = cm_gbt_result["0"][1]
FN = cm_gbt_result["1"][1]
Accuracy = (TP + TN) / (TP + FP + TN + FN)
Sensitivity = TP / (TP + FN)
Specificity = TN / (TN + FP)
Precision = TP / (TP + FP)

print("Accuracy = %0.2f" % Accuracy)
print("Sensitivity = %0.2f" % Sensitivity)
print("Specificity = %0.2f" % Specificity)
print("Precision = %0.2f" % Precision)

# Gini coefficient derived from AUC
AUC = gbt_AUC
Gini_gbt = (2 * AUC - 1)

print("AUC=%.2f" % AUC)
print("GINI ~=%.2f" % Gini_gbt)

# Log loss
# Labels and probabilities are taken from the same collected rows so they stay aligned
# row by row; two independent collect() calls are not guaranteed to return the same order.
gbt_proba = np.array([row["probability"].toArray() for row in PredAndLabels_collect])
y_test = pd.Series([row["label"] for row in PredAndLabels_collect])

LogLoss = log_loss(y_test, gbt_proba)

print("Log Loss Gradient Boosting:%.4f" % LogLoss)
1397 | 
1398 | #Gradient Boosting with hyper-parameter tuning (parameter grid + 3-fold cross-validation)
1399 | #define gradient boosting model reading the "features" and "label" columns
1400 | gbt_hyper= GBTClassifier(featuresCol="features", labelCol="label")
1401 | 
1402 | # Hyper-parameter grid: maxIter fixed at 10, maxDepth tried at 6, 7 and 10
1403 | paramGrid_gbt = ParamGridBuilder() \
1404 |     .addGrid(gbt_hyper.maxIter, [10])\
1405 |     .addGrid(gbt_hyper.maxDepth, [6, 7,10]) \
1406 |     .build()
1407 | crossval_gbt = CrossValidator(estimator=gbt_hyper,
1408 |                              estimatorParamMaps=paramGrid_gbt,
1409 |                              evaluator=BinaryClassificationEvaluator(),
1410 |                              numFolds=3)
1411 | #fit the cross-validator on data train
1412 | gbt_model_hyper = crossval_gbt.fit(data_train)
1413 | 
1414 | #apply the fitted cross-validated model to data test
1415 | gbt_result_hyper = gbt_model_hyper.transform(data_test)
1416 | 
1417 | #view id, label, prediction and probability from result of modelling
1418 | gbt_result_hyper.select('Id', 'label', 'prediction', 'probability').show(5)
1419 | 
1420 | #Gradient Boosting With Hyper-Parameter Evaluation
1421 | #Evaluate the tuned model on data test by calculating accuracy and area under curve (AUC)
1422 | gbt_eval_hyper = BinaryClassificationEvaluator(rawPredictionCol="probability", labelCol="label")  # scores the "probability" column
1423 | gbt_eval_hyper2= MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="label")  # compares "prediction" with "label"
1424 | gbt_hyper_AUC  = gbt_eval_hyper.evaluate(gbt_result_hyper)  # no metricName given; presumably the evaluator default (area under ROC) -- confirm
1425 | gbt_hyper_ACC  = gbt_eval_hyper2.evaluate(gbt_result_hyper, {gbt_eval_hyper2.metricName:"accuracy"})  # metric overridden to accuracy
1426 | 
1427 | 
1428 | print("Gradient Boosted Tree Performance Measure")
1429 | print("Accuracy = %0.2f" % gbt_hyper_ACC)
1430 | print("AUC = %.2f" % gbt_hyper_AUC)
1431 | 
1432 | #ROC Grafik
1433 | PredAndLabels           = gbt_result_hyper.select("probability", "label")
1434 | PredAndLabels_collect   = PredAndLabels.collect()
1435 | PredAndLabels_list      = [(float(i[0][0]), 1.0-float(i[1])) for i in PredAndLabels_collect]
1436 | PredAndLabels           = sc.parallelize(PredAndLabels_list)
1437 | 
1438 | metrics = BinaryClassificationMetrics(PredAndLabels)
1439 | 
1440 | # Area under ROC
1441 | print("Gradient Boosting Area Under ROC")
1442 | print("Area under ROC = %.2f" % metrics.areaUnderROC)
1443 | 
1444 | # Visualization
1445 | FPR = dict()                                                        # FPR: False Positive Rate
1446 | tpr = dict()                                                        # TPR: True Positive Rate
1447 | roc_auc = dict()
1448 |  
1449 | y_test = [i[1] for i in PredAndLabels_list]
1450 | y_score = [i[0] for i in PredAndLabels_list]
1451 |  
1452 | fpr, tpr, _ = roc_curve(y_test, y_score)
1453 | roc_auc = auc(fpr, tpr)
1454 |  
1455 | plt.figure(figsize=(5,4))
1456 | plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
1457 | plt.plot([0, 1], [0, 1], 'k--')
1458 | plt.xlim([0.0, 1.0])
1459 | plt.ylim([0.0, 1.05])
1460 | plt.xlabel('False Positive Rate')
1461 | plt.ylabel('True Positive Rate')
1462 | plt.title('ROC Curve - Gradient Boosting')
1463 | plt.legend(loc="lower right")
1464 | plt.show()
1465 | 
1466 | #confusion Matrix
1467 | cm_gbt_result_hyper = gbt_result_hyper.crosstab("prediction", "label")
1468 | cm_gbt_result_hyper = cm_gbt_result_hyper.toPandas()
1469 | cm_gbt_result_hyper
1470 | 
1471 | #calculate accuracy, sensitivity, specificity and precision
1472 | TP = cm_gbt_result_hyper["1"][0]
1473 | FP = cm_gbt_result_hyper["0"][0]
1474 | TN = cm_gbt_result_hyper["0"][1]
1475 | FN = cm_gbt_result_hyper["1"][1]
1476 | Accuracy = (TP+TN)/(TP+FP+TN+FN)
1477 | Sensitivity = TP/(TP+FN)
1478 | Specificity = TN/(TN+FP)
1479 | Precision = TP/(TP+FP)
1480 | 
1481 | print ("Accuracy = %0.2f" %Accuracy )
1482 | print ("Sensitivity = %0.2f" %Sensitivity )
1483 | print ("Specificity = %0.2f" %Specificity )
1484 | print ("Precision = %0.2f" %Precision )
1485 | 
1486 | #Calculate Gini Coefficient from the tuned model's AUC (Gini = 2 * AUC - 1)
1487 | AUC = gbt_hyper_AUC
1488 | Gini_gbt_hyper= (2 * AUC -1)
1489 | 
1490 | print("AUC=%.2f" % AUC)
1491 | print("GINI ~=%.2f" % Gini_gbt_hyper)
1492 | 
1493 | #Calculate Log Loss in pandas dataframe
1494 | #Create Dataframe to Calculate Log Loss
1495 | y_test= data_test.select('label')
1496 | gbt_hyper_proba=gbt_result_hyper.select('probability')
1497 | 
1498 | #Convert pyspark dataframe to numpy array
1499 | gbt_hyper_proba= np.array(gbt_hyper_proba.select('probability').collect())
1500 | 
1501 | #Convert numpy array 3 dimentional to 2 dimentional
1502 | gbt_hyper_proba=gbt_hyper_proba.reshape(-1, gbt_hyper_proba.shape[-1])
1503 | 
1504 | #Convert y_test dataframe to pandas dataframe
1505 | y_test=y_test.toPandas()
1506 | 
1507 | #Convert y_test pandas dataframe to pandas series
1508 | y_test=pd.Series(y_test['label'].values)
1509 | 
1510 | #Calculate log loss from Gradient Boosting hyper parameter
1511 | LogLoss = log_loss(y_test, gbt_hyper_proba) 
1512 | 
1513 | print("Log Loss Gradient Boosting:%.4f" % LogLoss)
1514 | 
1515 | 
1516 | #Apply the fitted models to the test data and write the predictions to CSV
1517 | #Prediction using Logistic Regression
1518 | #apply the logistic regression model to test2 (NOTE(review): the gradient boosting section scores test_data_feat instead -- confirm which frame is intended)
1519 | lr_predict = lr_model.transform(test2)
1520 | 
1521 | #view id, prediction and probability from result of modelling (no label column is selected here)
1522 | lr_predict.select('Id', 'prediction', 'probability').show(5)
1523 | 
1524 | #select id and prediction from result of modelling and save in data frame called my_submission
1525 | my_submission=lr_predict.select("Id","prediction")
1526 | 
1527 | #convert to Pandas dataframe
1528 | my_submission=my_submission.toPandas()
1529 | 
1530 | #save to csv without the index column (NOTE(review): hard-coded absolute path on drive E: -- adjust per machine)
1531 | my_submission.to_csv('E:/my_submission.csv', index = False, header = True)
1532 | 
1533 | 
1534 | #Prediction using Gradient Boosting
1535 | #apply gbt_model (not the cross-validated gbt_model_hyper) to test_data_feat -- NOTE(review): confirm this is the intended model and input frame
1536 | gbt_predict = gbt_model.transform(test_data_feat)
1537 | 
1538 | #view id, prediction and probability from result of modelling (no label column is selected here)
1539 | gbt_predict.select('Id', 'prediction', 'probability').show(5)
1540 | 
1541 | #select id and prediction from result of modelling and save in data frame called my_submission2
1542 | my_submission2=gbt_predict.select("Id","prediction")
1543 | 
1544 | #convert to Pandas dataframe
1545 | my_submission2=my_submission2.toPandas()
1546 | 
1547 | #save to csv without the index column (NOTE(review): hard-coded absolute path on drive E: -- adjust per machine)
1548 | my_submission2.to_csv('E:/my_submission2.csv', index = False, header = True)
1549 | 


--------------------------------------------------------------------------------
/Image/Decision_Tree.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/Decision_Tree.png


--------------------------------------------------------------------------------
/Image/Decision_Tree_Gini_LogLoss.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/Decision_Tree_Gini_LogLoss.png


--------------------------------------------------------------------------------
/Image/Decision_Tree_ROC.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/Decision_Tree_ROC.png


--------------------------------------------------------------------------------
/Image/Decision_Tree_confusion_matrix.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/Decision_Tree_confusion_matrix.png


--------------------------------------------------------------------------------
/Image/Decision_Tree_ev1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/Decision_Tree_ev1.png


--------------------------------------------------------------------------------
/Image/EDA1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/EDA1.jpg


--------------------------------------------------------------------------------
/Image/EDA2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/EDA2.jpg


--------------------------------------------------------------------------------
/Image/EDA3.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/EDA3.jpg


--------------------------------------------------------------------------------
/Image/EDA4.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/EDA4.jpg


--------------------------------------------------------------------------------
/Image/EDA5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/EDA5.png


--------------------------------------------------------------------------------
/Image/EDA6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/EDA6.png


--------------------------------------------------------------------------------
/Image/Random_Forest.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/Random_Forest.png


--------------------------------------------------------------------------------
/Image/Random_Forest_Gini_LogLoss.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/Random_Forest_Gini_LogLoss.png


--------------------------------------------------------------------------------
/Image/Random_Forest_ROC.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/Random_Forest_ROC.png


--------------------------------------------------------------------------------
/Image/Random_Forest_confusion_matrix.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/Random_Forest_confusion_matrix.png


--------------------------------------------------------------------------------
/Image/Random_Forest_ev1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/Random_Forest_ev1.png


--------------------------------------------------------------------------------
/Image/call_function_feature_engineering.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/call_function_feature_engineering.png


--------------------------------------------------------------------------------
/Image/call_insignificant_categories_function.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/call_insignificant_categories_function.jpg


--------------------------------------------------------------------------------
/Image/callfunction_compare_categorical_variables.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/callfunction_compare_categorical_variables.jpg


--------------------------------------------------------------------------------
/Image/check_data.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/check_data.png


--------------------------------------------------------------------------------
/Image/check_missing_values.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/check_missing_values.png


--------------------------------------------------------------------------------
/Image/check_missing_values2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/check_missing_values2.png


--------------------------------------------------------------------------------
/Image/check_missing_values3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/check_missing_values3.png


--------------------------------------------------------------------------------
/Image/define_categorical_numerical_variables1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/define_categorical_numerical_variables1.png


--------------------------------------------------------------------------------
/Image/define_categorical_numerical_variables2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/define_categorical_numerical_variables2.png


--------------------------------------------------------------------------------
/Image/feature_engineering.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/feature_engineering.png


--------------------------------------------------------------------------------
/Image/feature_engineering2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/feature_engineering2.png


--------------------------------------------------------------------------------
/Image/function_compare_categorical_variables.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/function_compare_categorical_variables.jpg


--------------------------------------------------------------------------------
/Image/gradient_boosting.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/gradient_boosting.png


--------------------------------------------------------------------------------
/Image/gradient_boosting_Gini_LogLoss.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/gradient_boosting_Gini_LogLoss.png


--------------------------------------------------------------------------------
/Image/gradient_boosting_ROC.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/gradient_boosting_ROC.png


--------------------------------------------------------------------------------
/Image/gradient_boosting_ROC_confusion_matrix.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/gradient_boosting_ROC_confusion_matrix.png


--------------------------------------------------------------------------------
/Image/gradient_boosting_confusion_matrix.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/gradient_boosting_confusion_matrix.png


--------------------------------------------------------------------------------
/Image/gradient_boosting_ev1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/gradient_boosting_ev1.png


--------------------------------------------------------------------------------
/Image/handle_missing_values.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/handle_missing_values.jpg


--------------------------------------------------------------------------------
/Image/handle_missing_values2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/handle_missing_values2.jpg


--------------------------------------------------------------------------------
/Image/handle_outlier.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/handle_outlier.png


--------------------------------------------------------------------------------
/Image/handle_outlier2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/handle_outlier2.png


--------------------------------------------------------------------------------
/Image/handle_outlier3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/handle_outlier3.png


--------------------------------------------------------------------------------
/Image/hyper_parameter_Random_Forest.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/hyper_parameter_Random_Forest.png


--------------------------------------------------------------------------------
/Image/hyper_parameter_tuning_DecisionTree.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/hyper_parameter_tuning_DecisionTree.png


--------------------------------------------------------------------------------
/Image/hyper_parameter_tuning_GradientBoost.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/hyper_parameter_tuning_GradientBoost.png


--------------------------------------------------------------------------------
/Image/hyper_parameter_tuning_LogisticRegression.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/hyper_parameter_tuning_LogisticRegression.png


--------------------------------------------------------------------------------
/Image/implement_to_data_test.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/implement_to_data_test.png


--------------------------------------------------------------------------------
/Image/implement_to_data_test2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/implement_to_data_test2.png


--------------------------------------------------------------------------------
/Image/insignificant_categories_function.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/insignificant_categories_function.jpg


--------------------------------------------------------------------------------
/Image/insignificant_categories_function3.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/insignificant_categories_function3.jpg


--------------------------------------------------------------------------------
/Image/insignificant_categories_function4.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/insignificant_categories_function4.jpg


--------------------------------------------------------------------------------
/Image/load_dataset_function.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/load_dataset_function.png


--------------------------------------------------------------------------------
/Image/load_libraries.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/load_libraries.png


--------------------------------------------------------------------------------
/Image/logistic_regression.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/logistic_regression.png


--------------------------------------------------------------------------------
/Image/logistic_regression_Gini_LogLoss.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/logistic_regression_Gini_LogLoss.png


--------------------------------------------------------------------------------
/Image/logistic_regression_ROC.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/logistic_regression_ROC.png


--------------------------------------------------------------------------------
/Image/logistic_regression_ROC_confusion_matrix.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/logistic_regression_ROC_confusion_matrix.png


--------------------------------------------------------------------------------
/Image/logistic_regression_confusion_matrix.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/logistic_regression_confusion_matrix.png


--------------------------------------------------------------------------------
/Image/logistic_regression_ev1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/logistic_regression_ev1.png


--------------------------------------------------------------------------------
/Image/split_data_train.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elsyifa/Classification-Pyspark/36e34ffd289e01ebc872c33677f2297b5193bf82/Image/split_data_train.png


--------------------------------------------------------------------------------
/Image/test.txt:
--------------------------------------------------------------------------------
1 | test
2 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Classification-Pyspark
  2 | This is a repository of classification template using pyspark.
  3 | 
  4 | I built a classification machine-learning template using PySpark. It walks step by step through loading the data, cleansing it, and making predictions. I wrote several PySpark functions to automate the workflow, so users only need to update or replace the dataset.
  5 | 
  6 | To test my template, I used the Home_Quote_Conversion data from Kaggle https://www.kaggle.com/c/homesite-quote-conversion. This dataset represents the activity of customers who are interested in buying policies from Homesite. QuoteConversion_Flag indicates whether the customer purchased a policy, and the task is to predict QuoteConversion_Flag for each QuoteNumber in the test set.
  7 | 
  8 | In general, the steps of classification machine learning are:
  9 | 
 10 | * Load libraries
 11 |   
 12 |   The first step in applying a classification model is to load all the libraries that are needed. The basic libraries for classification are LogisticRegression, RandomForestClassifier, GBTClassifier, etc. Below is a capture of all the libraries needed for classification:
 13 |   ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/load_libraries.png)
 14 | 
 15 | 
 16 | * Load Data into Spark Dataframe.
 17 |   
 18 |   Because we work in a Spark environment, the dataset must be a Spark dataframe. In this step, I created a function to load data into a Spark dataframe. To run this function, we first have to define the file type of the dataset (text or parquet), the path where the dataset is stored, and the delimiter (for example ',').
 19 |   ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/load_dataset_function.png)
 20 |   
 21 |   
 22 | * Check the data.
 23 |   
 24 |   After loading the data, let's run some checks on the dataset, such as the number of columns, the number of observations, the column names, the column types, etc. In this part, we also make some changes, such as renaming columns whose names are too long, changing data types that are not appropriate, dropping unnecessary columns, and checking the proportion of the target. These changes apply to both the train data and the test data.
 25 |   ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/check_data.png)
 26 |   
 27 |   
 28 | * Define categorical and numerical variables.
 29 |   
 30 |   In this step, I split the variables based on their data types. Variables whose data type is string are saved in a list called **cat_cols**, and variables whose data type is integer or double are saved in a list called **num_cols**. This split is applied to both the train data and the test data. It makes the following steps easier, because the categorical and numerical variables do not need to be defined manually.
 31 |   The pictures below are an example of the code that defines the categorical and numerical variables in the train data.
 32 |   ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/define_categorical_numerical_variables1.png)
 33 |   ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/define_categorical_numerical_variables2.png)
 34 |   
 35 |   
 36 | * Sample data
 37 |   
 38 |   If the dataset is too large, we can take sample of data. 
 39 |   Note: this step is optional.
 40 |    
 41 | * Check Missing Values.
 42 |   
 43 |   Sometimes the data received is not clean, so we need to check whether there are missing values. The output of this step is the names of the columns that have missing values and the number of missing values in each. To check for missing values, I created two methods:
 44 |    - Using pandas dataframe, 
 45 |    - Using pyspark dataframe.
 46 |   The preferred method is the one using a PySpark dataframe, because missing values can still be counted even when the dataset is too large for pandas.
 47 |   This step has to be applied to both the train data and the test data.
 48 |   This function is based on https://github.com/UrbanInstitute/pyspark-tutorials/blob/master/04_missing-data.ipynb.
 49 |   
 50 |     Pictures below are example check missing values using pyspark dataframe in data train.
 51 |   ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/check_missing_values.png)
 52 |   ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/check_missing_values2.png)
 53 |   ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/check_missing_values3.png)
 54 |   
 55 |   
 56 | * Handle Missing Values.
 57 |   
 58 |   The approach used to handle missing values differs between numerical and categorical variables. For numerical variables, I fill the missing values with the average of the column. For categorical variables, I fill the missing values with the most frequent category in that column; therefore, we first need to find the category with the highest count in each column.
 59 |  ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/handle_missing_values.jpg)
 60 |  ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/handle_missing_values2.jpg)
 61 |  
 62 | 
 63 | * Compare categorical variables in data train and data test.
 64 |  
 65 |  In this step, we check whether the categories in data train and data test are the same or not. If not, the categories in data test will be equated with those in data train. This step is needed to avoid errors in feature engineering: if the categories differ between data train and data test, an error will appear in the feature engineering process on data test, so the model cannot be applied to data test.
 66 |   ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/function_compare_categorical_variables.jpg)
 67 |   ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/callfunction_compare_categorical_variables.jpg)
 68 |   
 69 |   
 70 | * EDA 
 71 |   
 72 |   Create a distribution visualization for each variable to get some insight into the dataset. The pictures below are examples of visualizations of data train.
 73 |   ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/EDA1.jpg)
 74 |   ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/EDA2.jpg)
 75 |   ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/EDA3.jpg)
 76 |   ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/EDA4.jpg)
 77 |   ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/EDA5.png)
 78 |   ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/EDA6.png)
 79 |   
 80 |   
 81 | * Handle insignificant categories in data train.
 82 |   
 83 |   Sometimes there are categories with very few observations; I call those insignificant categories. Those insignificant categories will be replaced with the most frequent category in each categorical column. Sometimes this replacement leads to a better model.
 84 |   
 85 |   Note: the threshold that determines which categories have too few observations is based on trial and error. In this case I used a threshold of 98% for the maximum amount and 0.7% for the minimum amount. Each category in a column that has a percentage under 0.7% will be replaced with a category that has a percentage equal to or lower than 98%.
 86 |   ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/insignificant_categories_function.jpg)
 87 |   ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/call_insignificant_categories_function.jpg)
 88 |   ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/insignificant_categories_function3.jpg)
 89 |   
 90 |   
 91 | * Handle insignificant categories in data test.
 92 |   
 93 |   To handle insignificant categories in data test, I refer to the insignificant categories in data train. The replaced categories will be equated with those in data train to avoid differences in categories between data train and data test. As noted, those differences would trigger errors in the feature engineering and modelling processes.
 94 |   ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/insignificant_categories_function4.jpg)
 95 | 
 96 | 
 97 | * Handle outlier.
 98 |   
 99 |   Outliers are observations that fall below the lower side or above the upper side.
100 |   
101 |   To handle outliers, the approach is to replace values greater than the upper side with the upper side value and values lower than the lower side with the lower side value. So, we need to calculate the upper and lower sides from the quantile values; quantiles are cut points that divide the distribution of a variable into intervals. In general, three quartiles are used:
102 | 
103 |    - Q1 = the value that cuts off the first 25% of the data when it is sorted in ascending order.
104 |    - Q2 = the median, the value that cuts off the first 50% of the data when it is sorted in ascending order.
105 |    - Q3 = the value that cuts off the first 75% of the data when it is sorted in ascending order.
106 |    - IQR, or interquartile range, is the range between Q1 and Q3: IQR = Q3 - Q1.
107 | 
108 |   Upper side = Q3 + 1.5 * IQR
109 |   Lower side = Q1 - 1.5 * IQR
110 | 
111 |   To calculate quantiles in a pyspark dataframe, I created a function, and then created functions to calculate the upper side and lower side and to replace values beyond the upper side and the lower side. The replacement functions loop once for each numerical variable in the dataset (data train or data test). This step is also applied to both data train and data test.
112 |   
113 |   The pictures below show an example of handling outliers in data train. For data test the treatment is the same: just call the function and apply it to data test.
114 |   ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/handle_outlier.png)
115 |   ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/handle_outlier2.png)
116 |   ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/handle_outlier3.png)
117 | 
118 | * Feature Engineering.
119 |   
120 |   Before splitting the data train, all categorical variables must be converted to numerical values. There are several tools for handling categorical variables in SparkML, including:
121 |   - StringIndexer, which encodes string labels into label indices ordered by descending frequency, so the most frequent string gets the smallest index (0).
122 |   - One-hot Encoding, which maps the label column (string label) to a binary vector column.
123 |   - Vector assembler, which combines all columns into a single vector column.
124 |   
125 |   In this step, I first check the distinct values in each categorical column of data train and data test. If data train has more distinct values than data test in one or more categorical columns, data train and data test will be joined and feature engineering will be applied to the combined data. This merger is needed to avoid errors in modelling caused by different vector lengths between data train and data test: the length of the vector (the result of feature engineering on the combined data) must be the same for data train and data test so that we can move on to the next steps, modelling and prediction. But if the distinct values in data train and data test are the same, we apply feature engineering to data train and data test separately and then move on to modelling and prediction.
126 |   ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/feature_engineering.png)
127 |   ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/feature_engineering2.png)
128 |   ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/call_function_feature_engineering.png)
129 | 
130 | * Split Data train to train and test.
131 |   
132 |   This step applies only to data train. In order to validate the model that we use, we need to split data train into train and test data. Data train will be split with these percentages: train 70% and test 30%. We define seed 24 so that the random split will not change between runs; the seed can be any value.
133 |   ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/split_data_train.png)
134 |   
135 | * Modelling.
136 |   
137 |   The algorithms that I used to build models and make predictions are:
138 |    - Logistic Regression: logistic regression uses the logit function to predict the probability.
139 |    - Decision Tree: this algorithm finds the most significant independent variable to create a group.
140 |    - Random Forest: this algorithm builds multiple decision trees and merges them together, using the bagging method.
141 |    - Gradient Boosting: this algorithm uses the boosting ensemble technique, in which the subsequent predictors learn from the mistakes of the previous predictors.
142 |    
143 | * Evaluation.
144 |   
145 |   To evaluate the models I used four metrics:
146 |     - ROC
147 |       ROC (Receiver Operating Characteristic): the graph shows the true positive rate versus the false positive rate. The area under this curve is between 0 and 1, with a better model scoring higher. An area of 1 represents a perfect test; an area of 0.5 represents a worthless test.
148 |       So, the model is said to be good enough if the value of the area under the curve is above 0.5.
149 | 
150 |     - Gini Coefficient
151 |       Gini is the ratio between the area between the ROC curve and the diagonal line and the area of the triangle above the diagonal. So, we can calculate Gini with this formula: `Gini = 2*AUC - 1`. As with AUC ROC, a Gini above 50% or 60% indicates a good model.
152 | 
153 |     - Confusion Matrix
154 |       A Confusion Matrix is a table that is used to describe the performance of a classification model. Some definitions are:
155 |           - Accuracy : Proportion of the total number of predictions that were correct.
156 |           - Precision (Positive Predictive Value) : Proportion of predicted positive cases that are actually positive.
157 |           - Negative Predictive Value : Proportion of predicted negative cases that are actually negative.
158 |           - Sensitivity (Recall) : Proportion of actual positive cases which are correctly identified.
159 |           - Specificity : Proportion of actual negative cases which are correctly identified.
160 | 
161 |     - Log Loss
162 |       Log Loss is one of the performance evaluation metrics for classification models. The goal of the model is to minimize the log loss value.
163 |       A perfect model would have a log loss of 0. Log loss increases as the predicted probability diverges from the actual label.
164 |   The pictures below show how to create each model, make predictions, and evaluate the model with those four metrics.
165 |     ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/logistic_regression.png)
166 |     ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/logistic_regression_ev1.png)
167 |     ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/logistic_regression_ROC.png)
168 |     ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/logistic_regression_ROC_confusion_matrix.png)
169 |     ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/logistic_regression_confusion_matrix.png)
170 |     ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/logistic_regression_Gini_LogLoss.png)
171 |     
172 |     ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/Decision_Tree.png)
173 |     ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/Decision_Tree_ev1.png)
174 |     ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/Decision_Tree_ROC.png)
175 |     ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/Decision_Tree_confusion_matrix.png)
176 |     ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/Decision_Tree_Gini_LogLoss.png)
177 |     
178 |     ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/Random_Forest.png)
179 |     ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/Random_Forest_ev1.png)
180 |     ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/Random_Forest_ROC.png)
181 |     ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/Random_Forest_confusion_matrix.png)
182 |     ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/Random_Forest_Gini_LogLoss.png)
183 |     
184 |     ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/gradient_boosting.png)
185 |     ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/gradient_boosting_ev1.png)
186 |     ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/gradient_boosting_ROC_confusion_matrix.png)
187 |     ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/gradient_boosting_confusion_matrix.png)
188 |     ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/gradient_boosting_Gini_LogLoss.png)
189 |     
190 |   
191 | * Hyper-Parameter Tuning.
192 |   
193 |   In this step, I provide hyper-parameter tuning scripts for all of the models above, so that the evaluation of each model with and without hyper-parameter tuning can be compared. From those results we can choose the model with the best evaluation to make predictions on data test.
194 |   ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/hyper_parameter_tuning_LogisticRegression.png)
195 |   ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/hyper_parameter_tuning_DecisionTree.png)
196 |   ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/hyper_parameter_Random_Forest.png)
197 |   ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/hyper_parameter_tuning_GradientBoost.png)
198 | 
199 | * Implementation Modelling to data test.
200 |   
201 |   After all the steps above have been executed, we know which model has the best evaluation, and that is the best model to make predictions on our data test. We can choose the top two of the four models and then apply those models to data test. In this case, I chose Logistic Regression and Gradient Boosting to make predictions, and then saved the predictions to a csv file.
202 |   ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/implement_to_data_test.png)
203 |   ![alt text](https://github.com/elsyifa/Classification-Pyspark/blob/master/Image/implement_to_data_test2.png)
204 |   
205 |   **VOILAAAAAA, we got our prediction!!!!!**
206 |   
207 |   For more details please see my code.
208 | 


--------------------------------------------------------------------------------