├── images └── ERD.png ├── ML models └── .ipynb_checkpoints │ └── Untitled-checkpoint.ipynb ├── Imbalanced Strategies └── Readme.md ├── main.py ├── README.md ├── environment_hc.yml ├── model.py └── Feature Engineering Strategies └── XGBoost_Automated Features.ipynb /images/ERD.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ali-ghorbani-k/Credit-Risk-Management/HEAD/images/ERD.png -------------------------------------------------------------------------------- /ML models/.ipynb_checkpoints/Untitled-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 4 6 | } 7 | -------------------------------------------------------------------------------- /Imbalanced Strategies/Readme.md: -------------------------------------------------------------------------------- 1 | # Different strategies for balancing dataset: 2 | 1. Clustering Undersampling of majority class (HClusteringUnderSampling.ipynb) 3 | 2. Random undersampling of majority class & Oversampling of minority class (RUndersample_SMOTE.ipynb) 4 | 3. 
Clustering undersampling of majority class & Oversampling of minority class (HClustering_USample_SMOTE.ipynb) 5 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from model import classifier 3 | 4 | parser = argparse.ArgumentParser(description='') 5 | parser.add_argument('--use_ftools', dest='use_ftools', default=False, action='store_true', help='use automated feature extraction with ft') 6 | parser.add_argument('--ft_maxdep', dest='ft_maxdep', default=1, type= int, help='Max depth of deep feature synthesis in feature tools') 7 | parser.add_argument('--use_cnnft', dest='use_cnnft', default=False, action='store_true', help='use cnn feature extraction method') 8 | parser.add_argument('--cnn_bsize', dest='cnn_bsize', default=256, type= int, help='batch_size when training cnn for feature extraction') 9 | parser.add_argument('--cnn_epoch', dest='cnn_epoch', default=100, type= int, help='number of epochs when training cnn for feature extraction') 10 | parser.add_argument('--use_rnnft', dest='use_rnnft', default=False, action='store_true', help='use rnn feature extraction method') 11 | parser.add_argument('--rnn_bsize', dest='rnn_bsize', default=256, type= int, help='batch_size when training cnn for feature extraction') 12 | parser.add_argument('--rnn_epoch', dest='rnn_epoch', default=100, type= int, help='number of epochs when training cnn for feature extraction') 13 | parser.add_argument('--resample', dest='resample', default=False, action='store_true', help='resample training dataset to get balanced positive/negative label ratio') 14 | parser.add_argument('--use_hclstr', dest='use_hclstr', default=False, action='store_true', help='use hierarchical clustering (undersampling) of majority class') 15 | parser.add_argument('--use_hclstrsmote', dest='use_hclstrsmote', default=False, action='store_true', help='hierarchical 
clustering (undersampling) of majority class & Oversampling of minority class') 16 | parser.add_argument('--nfolds', dest='nfolds', type=int, default=5, help='# of folds for cross-validation') 17 | parser.add_argument('--test_size', dest='test_size', type=float, default=0.05, help='test to train data ratio') 18 | parser.add_argument('--pca_n', dest='pca_n', type=int, default=150, help='number of pca components considered for training xgb') 19 | parser.add_argument('--lgbm', dest='lgbm', default=True, action='store_true', help='use lightGBM algorithm') 20 | parser.add_argument('--xgb', dest='xgb' , default=False, action='store_true', help='use XGBoost algorithm') 21 | parser.add_argument('--catb', dest='catb', default=False, action='store_true', help='use Catboost algorithm') 22 | parser.add_argument('--fcnn', dest='fcnn', default=False, action='store_true', help='use fully connected neural network') 23 | parser.add_argument('--batch_size', dest='batch_size', type=int, default=256, help='batch size for FCNN algorithm') 24 | parser.add_argument('--epoch', dest='epoch', type=int, default=10, help='number of epochs for FCNN algorithm') 25 | 26 | args = parser.parse_args() 27 | 28 | if __name__ == '__main__': 29 | model = classifier(args) 30 | model.train(args) 31 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Credit-Risk-Management 2 | Building an end-to-end machine learning model to predict the probability of paying back a loan by an applicant. 3 | 4 | # Problem Statement 5 | This is a supervised binary classification problem since the labels are provided in the application_train table (supervised), and the label is a binary variable with 0 (repaying the loan) and 1 (having difficulty repaying the loan). 6 | 7 | # Setup 8 | This code can be run with the following steps. This setup assumes you already have conda installed. 9 | 1. 
Create the conda environment: conda env create -f environment_hc.yml 10 | 2. Activate the environment: conda activate hc 11 | 12 | The required files to run this project are main.py and model.py. Jupyter notebooks for each algorithm are provided for reference. 13 | 14 | # Data Source 15 | I have collected the data from kaggle that was provided by [Home Credit financial institution](https://www.kaggle.com/c/home-credit-default-risk/data). 16 | 17 | There are two main tables related to the current credit application: 18 | 19 | * __application_train__: This table includes the information for each loan application represented by an id of loan (__SK_ID_CURR__). The application_train table includes a TARGET column (1 : client with payment difficulties: he/she had late payment more than X days on at least one of the first Y installments of the loan, 0 : the loan was repaid) 20 | 21 | * __application_test__ : This table has the same columns as the application_train, but does not have the TARGET column. The TARGET column will be predicted by the Machine learning model and could be used in kaggle competition. Application_test is not used in this project. 22 | 23 | In addition to application_train that includes the current application information, there are two other sources of data related to each customer's historical transactions and records obtained from 1) Bureau 2) Home Credit, presented in the following tables: 24 | 1. __Bureau__: 25 | 26 | 1. __Bureau__ : This table includes information for all client's previous credits provided by other financial institutions that were reported to the Credit Bureau. Each credit in the bureau table is represented by a bureau id (__SK_ID_BUREAU__) which is related to the one id of loan application (__SK_ID_CURR__). One SK_ID_CURR can have 0,1,2 or more related previous credits (SK_ID_BUREAU) in a bureau table showing a one-to-many relationship. 27 | 2. 
__Bureau_balance__ : This table includes information related to the monthly balance of previous credits in Credit Bureau. This table has one row for each month of history of every previous credit reported to Credit Bureau – i.e., the table has (#loans in sample × #of relative previous credits × #of months where we have some history observable for the previous credits) rows. 28 | 29 | 2. __Home Credit__: 30 | 31 | 1. __previous_application__: This table includes all previous applications at Home Credit, each represented by an id of loan (__SK_ID_PREV__). One SK_ID_CURR can have 0,1,2 or more related previous credits (SK_ID_PREV) in previous_application table showing a one-to-many relationship. 32 | 33 | 2. __POS_CASH_BALANCE__: This table includes the monthly balance of previous point of sale (POS) loans with Home Credit. 34 | 35 | 3. __credit_card_balance__ : This table includes the monthly balance snapshots of previous credit cards that the applicant has with Home Credit. 36 | 37 | 4. __installments_payments__ : This table includes repayment history for the previously disbursed credits related to the loans in the Home Credit database. 38 | 39 | A description of all the columns can be found in HomeCredit_columns_description.csv, which is available at the provided link. 40 | The following Entity Relation Diagram (ERD) shows how different tables are related: 41 | 42 | ![ERD](images/ERD.png) 43 | 44 | # Preprocessing of data 45 | As shown in the above ERD, each SK_ID_CURR is related to multiple SK_ID_BUREAU in Bureau tables and to multiple SK_ID_PREV in Home Credit tables. 46 | In order to develop a machine learning model, first we need to flatten out the database. This means aggregating the information from 2 bureau and 4 Home Credit tables to have one merged table. Each row in the final merged table represents one loan application (SK_ID_CURR). 47 | 48 | Three different strategies have been used in this project to flatten out the database: 49 | 50 | 1. 
__Manual feature engineering__: Manual feature engineering involves leveraging domain knowledge to extract useful features from data. With this strategy, we analyze each customer journey using bureau and Home Credit sources and extract useful information from the life cycle of previous loans. 51 | 52 | 2. __Automated feature engineering__: Automated feature engineering uses the [Featuretools](https://community.alteryx.com/t5/Data-Science/Feature-Engineering-Secret-to-Data-Science-Success/ba-p/545041) library to generate hundreds or thousands of new features. We have used one level of depth since this is computationally expensive work. 53 | 54 | 3. __Deep learning__: Deep learning strategy employs Convolutional Neural Network (CNN) and Recurrent Neural Network (RNN) to extract new features from the data. The concept of using the power of CNN for feature engineering is discussed [here](https://towardsdatascience.com/convolutional-neural-network-on-a-structured-bank-customer-data-358e6b8aa759). 55 | 56 | # Imbalanced Dataset 57 | The dataset of this problem is highly imbalanced with 91% of data not-defaulted and 9% being defaulted. The challenge of working with an imbalanced dataset is that most machine learning algorithms perform poorly on the minority class that is more important to detect in credit risk management. Two different strategies have been used to balance the positive and negative labels and their model performances are compared: 58 | 59 | 1. __Hierarchical clustering (undersampling) of majority class__: The idea is to undersample the majority class so that we end up having balanced data. We have ~27k positive training data (Minority class) and ~300k negative data (Majority class). Undersampling needs to be done in such a fashion that the resulting majority class has a similar distribution to the original 300k, so that we do not lose information from data. Hierarchical clustering (Agglomerative Clustering) has been conducted on majority class with 27k clusters. 
In the end, the resampled dataset had a 1:1 ratio with 27k samples each for the positive and negative classes. 60 | 61 | 2. __Hierarchical clustering (undersampling) of majority class & Oversampling of minority class__: Combination of undersampling of majority class (method one) to 50% of total ratio with oversampling of the minority class (up to 1:1 minority/majority ratio) to get better performance. In this way, the majority class ratio decreases from 91% to 50% whereas minority class ratio increases from 9% to 50%. Hierarchical clustering is used for undersampling and SMOTE is used for oversampling. 62 | 63 | # Machine Learning Models: 64 | We have tried boosted algorithms (XGBoost, LightGBM, Catboost) and fully connected neural network (FCNN) in this project. 65 | There are some technical differences in the application of different algorithms that need to be noted: 66 | 67 | * __Handling missing data__: XGBoost, LightGBM, and Catboost can handle missing data, but for FCNN the missing values need to be imputed. A missing categorical variable is imputed with a new 'Not Available' category and a missing numerical feature is imputed with the average of that column in the training data (to avoid data leakage). 68 | 69 | * __Categorical variables__: XGBoost and FCNN cannot handle categorical variables, therefore, one-hot encoding is performed on the categorical features. On the other hand, LightGBM and Catboost can handle categorical features (using the Fisher method), but the categorical features should be declared to the algorithm to avoid errors. This is accomplished by encoding each category as a non-negative integer and saving the column with dtype 'category' in [pandas](https://medium.com/swlh/dealing-with-categorical-variables-in-machine-learning-4401b949b093). 70 | 71 | The hyperparameters of the boosted algorithms are chosen using Bayesian hyperparameter optimization (Hyperopt). 
72 | 73 | # Performance Metric: 74 | 75 | * __Precision, Recall, F1-Score__: In credit risk management the cost of misclassification of a 'defaulted applicant' as 'non defaulted' is very high and may cause significant loss of money for the financial institution. Therefore, we need to significantly reduce the probability of approving a defaulted applicant (False Negative (type 2 error)). As a result, Recall is the more important metric in this project due to the high risk of losing money. Low precision might result in losing a customer (False Positive), but does not harm the business drastically. I also have presented F1-score that considers both precision and recall. However, when we are dealing with an imbalanced dataset such as in our case, if we don't balance positive and negative labels, we need to be careful to choose a proper threshold for the classifier, different from the default 0.5 (0.09 could be a good choice in this problem since the positive/negative labels volume ratio equals 0.09 in the training dataset). 76 | 77 | * __Area Under ROC Curve (AUC)__: If we use the original imbalanced dataset, the ROC Curve is the appropriate metric to visualize the performance of a binary classifier and Area under ROC (AUC) is the best way to summarize the classifier performance in just one number. In this way, the True positive rate and False positive rate for different thresholds are presented and the area under this curve is the AUC. 78 | 79 | * __Cohen's kappa__: is a more conservative metric that is used in industry as a performance metric of a classifier. Cohen’s kappa is a measure of the agreement between two raters and is typically between 0 and 1 (it can be negative when agreement is worse than chance). Cohen’s kappa of 1 indicates perfect agreement between the raters and 0 indicates that any agreement is totally due to chance. 
80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | -------------------------------------------------------------------------------- /environment_hc.yml: -------------------------------------------------------------------------------- 1 | name: hc 2 | channels: 3 | - anaconda 4 | - conda-forge 5 | - defaults 6 | dependencies: 7 | - _anaconda_depends=2020.07=py37_0 8 | - _ipyw_jlab_nb_ext_conf=0.1.0=py37_0 9 | - _py-xgboost-mutex=2.0=cpu_0 10 | - _tflow_select=2.2.0=eigen 11 | - absl-py=0.10.0=py37_0 12 | - aiohttp=3.6.2=py37he774522_0 13 | - alabaster=0.7.12=py37_0 14 | - anaconda=custom=py37_1 15 | - anaconda-client=1.7.2=py37_0 16 | - anaconda-navigator=1.9.12=py37_0 17 | - anaconda-project=0.8.4=py_0 18 | - argh=0.26.2=py37_0 19 | - argon2-cffi=20.1.0=py37he774522_1 20 | - asn1crypto=1.4.0=py_0 21 | - astor=0.8.1=py37_0 22 | - astroid=2.4.2=py37_0 23 | - astropy=4.0.1.post1=py37he774522_1 24 | - async-timeout=3.0.1=py37_0 25 | - async_generator=1.10=py37h28b3542_0 26 | - atomicwrites=1.4.0=py_0 27 | - autopep8=1.5.4=py_0 28 | - babel=2.8.0=py_0 29 | - backcall=0.2.0=py_0 30 | - backports=1.0=py_2 31 | - backports.shutil_get_terminal_size=1.0.0=py37_2 32 | - basemap=1.2.0=py37h4e5d7af_0 33 | - basemap-data-hires=1.2.0=0 34 | - bcrypt=3.2.0=py37he774522_0 35 | - beautifulsoup4=4.9.3=pyhb0f4dca_0 36 | - bitarray=1.5.3=py37he774522_0 37 | - bkcharts=0.2=py37_0 38 | - blas=1.0=mkl 39 | - blinker=1.4=py37_0 40 | - blosc=1.20.0=h7bd577a_0 41 | - bokeh=2.2.1=py37_0 42 | - boto=2.49.0=py37_0 43 | - bottleneck=1.3.2=py37h2a96729_1 44 | - brotlipy=0.7.0=py37he774522_1000 45 | - bzip2=1.0.8=he774522_0 46 | - ca-certificates=2020.6.20=hecda079_0 47 | - cachetools=4.1.1=py_0 48 | - catboost=0.24=py37hc8dfbb8_0 49 | - certifi=2020.6.20=py37hf50a25e_2 50 | - cffi=1.14.3=py37h7a1dbc1_0 51 | - chardet=3.0.4=py37_1003 52 | - click=7.1.2=py_0 53 | - cloudpickle=1.6.0=py_0 54 | - clyent=1.2.2=py37_1 55 | - colorama=0.4.3=py_0 56 | - comtypes=1.1.7=py37_1001 57 | - 
conda=4.8.5=py37hf50a25e_2 58 | - conda-build=3.17.6=py37_0 59 | - conda-env=2.6.0=1 60 | - conda-package-handling=1.6.1=py37h62dcd97_0 61 | - conda-verify=3.1.1=py37_0 62 | - console_shortcut=0.1.1=4 63 | - contextlib2=0.6.0.post1=py_0 64 | - convertdate=2.1.3=py_1000 65 | - cryptography=3.1.1=py37h7a1dbc1_0 66 | - curl=7.71.1=h2a8f88b_1 67 | - cycler=0.10.0=py37_0 68 | - cython=0.29.21=py37ha925a31_0 69 | - cytoolz=0.11.0=py37he774522_0 70 | - dask=2.30.0=py_0 71 | - dask-core=2.30.0=py_0 72 | - decorator=4.4.2=py_0 73 | - defusedxml=0.6.0=py_0 74 | - diff-match-patch=20200713=py_0 75 | - distributed=2.30.0=py37_0 76 | - django=2.2.5=py37_1 77 | - docutils=0.16=py37_1 78 | - entrypoints=0.3=py37_0 79 | - ephem=3.7.7.1=py37he774522_0 80 | - et_xmlfile=1.0.1=py_1001 81 | - fastcache=1.1.0=py37he774522_0 82 | - fbprophet=0.6=py37h6538335_0 83 | - ffmpeg=4.2=ha925a31_0 84 | - filelock=3.0.12=py_0 85 | - findspark=1.3.0=py_1 86 | - flake8=3.8.4=py_0 87 | - flask=1.1.2=py_0 88 | - freetype=2.10.2=hd328e21_0 89 | - fsspec=0.8.0=py_0 90 | - future=0.18.2=py37_1 91 | - gast=0.2.2=py37_0 92 | - geos=3.6.2=h9ef7328_2 93 | - get_terminal_size=1.0.0=h38e98db_0 94 | - gevent=20.9.0=py37he774522_0 95 | - glob2=0.7=py_0 96 | - gmpy2=2.0.8=py37h0964b28_3 97 | - google-auth=1.22.1=py_0 98 | - google-auth-oauthlib=0.4.1=py_2 99 | - google-pasta=0.2.0=py_0 100 | - greenlet=0.4.17=py37he774522_0 101 | - grpcio=1.31.0=py37he7da953_0 102 | - h5py=2.10.0=py37h5e291fa_0 103 | - hdf5=1.10.4=h7ebc959_0 104 | - heapdict=1.0.1=py_0 105 | - holidays=0.10.1=py_0 106 | - html5lib=1.1=py_0 107 | - icc_rt=2019.0.0=h0cc432a_1 108 | - icu=58.2=ha925a31_3 109 | - idna=2.10=py_0 110 | - imageio=2.9.0=py_0 111 | - imagesize=1.2.0=py_0 112 | - imbalanced-learn=0.6.2=py_0 113 | - importlib_metadata=1.7.0=0 114 | - iniconfig=1.0.1=py_0 115 | - intel-openmp=2020.2=254 116 | - intervaltree=3.1.0=py_0 117 | - ipykernel=5.3.4=py37h5ca1d4c_0 118 | - ipython=7.18.1=py37h5ca1d4c_0 119 | - 
ipython_genutils=0.2.0=py37_0 120 | - ipywidgets=7.5.1=py_1 121 | - isort=5.5.4=py37_0 122 | - itsdangerous=1.1.0=py37_0 123 | - jdcal=1.4.1=py_0 124 | - jedi=0.17.1=py37_0 125 | - jinja2=2.11.2=py_0 126 | - joblib=0.17.0=py_0 127 | - jpeg=9b=hb83a4c4_2 128 | - json5=0.9.5=py_0 129 | - jsonschema=3.2.0=py37_1 130 | - jupyter=1.0.0=py37_7 131 | - jupyter_client=6.1.7=py_0 132 | - jupyter_console=6.2.0=py_0 133 | - jupyter_core=4.6.3=py37_0 134 | - jupyterlab=2.2.6=py_0 135 | - jupyterlab_pygments=0.1.2=py_0 136 | - jupyterlab_server=1.2.0=py_0 137 | - keras-applications=1.0.8=py_1 138 | - keras-preprocessing=1.1.0=py_1 139 | - keyring=21.4.0=py37_1 140 | - kiwisolver=1.2.0=py37h74a9793_0 141 | - krb5=1.18.2=hc04afaa_0 142 | - lazy-object-proxy=1.4.3=py37he774522_0 143 | - libarchive=3.4.2=h5e25573_0 144 | - libcurl=7.71.1=h2a8f88b_1 145 | - libiconv=1.15=h1df5818_7 146 | - liblief=0.10.1=ha925a31_0 147 | - libllvm9=9.0.1=h21ff451_0 148 | - libpng=1.6.37=h2a8f88b_0 149 | - libprotobuf=3.13.0=h200bbdf_0 150 | - libpython=2.1=py37_0 151 | - libsodium=1.0.18=h62dcd97_0 152 | - libspatialindex=1.9.3=h33f27b4_0 153 | - libssh2=1.9.0=h7a1dbc1_1 154 | - libtiff=4.1.0=h56a325e_1 155 | - libxcb=1.13=h301d43c_1002 156 | - libxgboost=0.90=1 157 | - libxml2=2.9.10=h464c3ec_1 158 | - libxslt=1.1.34=he774522_0 159 | - lightgbm=3.0.0=py37h1834ac0_1 160 | - llvmlite=0.34.0=py37h1a82afc_4 161 | - locket=0.2.0=py37_1 162 | - lunarcalendar=0.0.9=py_0 163 | - lxml=4.5.2=py37h1350720_0 164 | - lz4-c=1.9.2=h62dcd97_1 165 | - lzo=2.10=he774522_2 166 | - m2w64-binutils=2.25.1=5 167 | - m2w64-bzip2=1.0.6=6 168 | - m2w64-crt-git=5.0.0.4636.2595836=2 169 | - m2w64-gcc=5.3.0=6 170 | - m2w64-gcc-ada=5.3.0=6 171 | - m2w64-gcc-fortran=5.3.0=6 172 | - m2w64-gcc-libgfortran=5.3.0=6 173 | - m2w64-gcc-libs=5.3.0=7 174 | - m2w64-gcc-libs-core=5.3.0=7 175 | - m2w64-gcc-objc=5.3.0=6 176 | - m2w64-gmp=6.1.0=2 177 | - m2w64-headers-git=5.0.0.4636.c0ad18a=2 178 | - m2w64-isl=0.16.1=2 179 | - 
m2w64-libiconv=1.14=6 180 | - m2w64-libmangle-git=5.0.0.4509.2e5a9a2=2 181 | - m2w64-libwinpthread-git=5.0.0.4634.697f757=2 182 | - m2w64-make=4.1.2351.a80a8b8=2 183 | - m2w64-mpc=1.0.3=3 184 | - m2w64-mpfr=3.1.4=4 185 | - m2w64-pkg-config=0.29.1=2 186 | - m2w64-toolchain=5.3.0=7 187 | - m2w64-tools-git=5.0.0.4592.90b8472=2 188 | - m2w64-windows-default-manifest=6.4=3 189 | - m2w64-winpthreads-git=5.0.0.4634.697f757=2 190 | - m2w64-zlib=1.2.8=10 191 | - markdown=3.3=py37_0 192 | - markupsafe=1.1.1=py37hfa6e2cd_1 193 | - matplotlib=3.3.1=0 194 | - matplotlib-base=3.3.1=py37hba9282a_0 195 | - mccabe=0.6.1=py37_1 196 | - menuinst=1.4.16=py37he774522_1 197 | - mistune=0.8.4=py37hfa6e2cd_1001 198 | - mkl=2020.2=256 199 | - mkl-service=2.3.0=py37hb782905_0 200 | - mkl_fft=1.2.0=py37h45dec08_0 201 | - mkl_random=1.1.1=py37h47e9c7a_0 202 | - mock=4.0.2=py_0 203 | - mongodb=4.0.3=he170510_0 204 | - more-itertools=8.5.0=py_0 205 | - mpc=1.1.0=h7edee0f_1 206 | - mpfr=4.0.2=h62dcd97_1 207 | - mpir=3.0.0=hec2e145_1 208 | - mpmath=1.1.0=py37_0 209 | - msgpack-python=1.0.0=py37h74a9793_1 210 | - msys2-conda-epoch=20160418=1 211 | - multidict=4.7.6=py37he774522_1 212 | - multipledispatch=0.6.0=py37_0 213 | - navigator-updater=0.2.1=py37_0 214 | - nbclient=0.5.0=py_0 215 | - nbconvert=6.0.7=py37_0 216 | - nbformat=5.0.7=py_0 217 | - nest-asyncio=1.4.1=py_0 218 | - networkx=2.5=py_0 219 | - nltk=3.5=py_0 220 | - nose=1.3.7=py37_1004 221 | - notebook=6.1.4=py37_0 222 | - numba=0.51.2=py37hf9181ef_1 223 | - numexpr=2.7.1=py37h25d0782_0 224 | - numpy=1.19.1=py37h5510c5b_0 225 | - numpy-base=1.19.1=py37ha3acd2a_0 226 | - numpydoc=1.1.0=py_0 227 | - oauthlib=3.1.0=py_0 228 | - olefile=0.46=py37_0 229 | - openpyxl=3.0.5=py_0 230 | - openssl=1.1.1h=he774522_0 231 | - opt_einsum=3.1.0=py_0 232 | - packaging=20.4=py_0 233 | - pandas=1.1.3=py37ha925a31_0 234 | - pandoc=2.10.1=0 235 | - pandocfilters=1.4.2=py37_1 236 | - paramiko=2.7.2=py_0 237 | - parso=0.7.0=py_0 238 | - partd=1.1.0=py_0 239 
| - path=15.0.0=py37_0 240 | - path.py=12.5.0=0 241 | - pathlib2=2.3.5=py37_1 242 | - pathtools=0.1.2=py_1 243 | - patsy=0.5.1=py37_0 244 | - pep8=1.7.1=py37_0 245 | - pexpect=4.8.0=py37_1 246 | - pickleshare=0.7.5=py37_1001 247 | - pillow=7.2.0=py37hcc1f983_0 248 | - pip=20.2.3=py37_0 249 | - pkginfo=1.5.0.1=py37_0 250 | - plotly=4.5.2=py_0 251 | - pluggy=0.13.1=py37_0 252 | - ply=3.11=py37_0 253 | - powershell_shortcut=0.0.1=3 254 | - proj4=5.2.0=ha925a31_1 255 | - prometheus_client=0.8.0=py_0 256 | - prompt-toolkit=3.0.7=py_0 257 | - prompt_toolkit=3.0.7=0 258 | - protobuf=3.13.0=py37h33f27b4_1 259 | - psutil=5.7.2=py37he774522_0 260 | - pthread-stubs=0.3=h3c9f919_1 261 | - py=1.9.0=py_0 262 | - py-lief=0.10.1=py37ha925a31_0 263 | - py-xgboost=0.90=py37_1 264 | - pyasn1=0.4.8=py_0 265 | - pyasn1-modules=0.2.8=py_0 266 | - pycodestyle=2.6.0=py_0 267 | - pycosat=0.6.3=py37he774522_0 268 | - pycparser=2.20=py_2 269 | - pycrypto=2.6.1=py37he774522_10 270 | - pycurl=7.43.0.5=py37h7a1dbc1_0 271 | - pydocstyle=5.1.1=py_0 272 | - pydotplus=2.0.2=py37_1 273 | - pyflakes=2.2.0=py_0 274 | - pygments=2.7.1=py_0 275 | - pyjwt=1.7.1=py37_0 276 | - pylint=2.6.0=py37_0 277 | - pymongo=3.9.0=py37ha925a31_0 278 | - pynacl=1.4.0=py37h62dcd97_1 279 | - pyodbc=4.0.30=py37ha925a31_0 280 | - pyopenssl=19.1.0=py_1 281 | - pyparsing=2.4.7=py_0 282 | - pyproj=1.9.6=py37h6782396_0 283 | - pyqt=5.9.2=py37h6538335_2 284 | - pyreadline=2.1=py37_1 285 | - pyrsistent=0.17.3=py37he774522_0 286 | - pyshp=2.1.2=py_0 287 | - pysocks=1.7.1=py37_1 288 | - pystan=2.19.0.0=py37hcce6980_0 289 | - pytables=3.6.1=py37h1da0976_0 290 | - pytest=6.1.1=py37_0 291 | - python=3.7.7=h60c2a47_0_cpython 292 | - python-dateutil=2.8.1=py_0 293 | - python-jsonrpc-server=0.4.0=py_0 294 | - python-language-server=0.35.1=py_0 295 | - python-libarchive-c=2.9=py_0 296 | - python_abi=3.7=1_cp37m 297 | - pytz=2019.3=py_0 298 | - pywavelets=1.1.1=py37he774522_2 299 | - pywin32=227=py37he774522_1 300 | - 
pywin32-ctypes=0.2.0=py37_1001 301 | - pywinpty=0.5.7=py37_0 302 | - pyyaml=5.3.1=py37he774522_1 303 | - pyzmq=19.0.2=py37ha925a31_1 304 | - qdarkstyle=2.8.1=py_0 305 | - qt=5.9.7=vc14h73c81de_0 306 | - qtawesome=0.7.2=py_0 307 | - qtconsole=4.7.7=py_0 308 | - qtpy=1.9.0=py_0 309 | - regex=2020.7.14=py37he774522_0 310 | - requests=2.24.0=py_0 311 | - requests-oauthlib=1.3.0=py_0 312 | - retrying=1.3.3=py37_2 313 | - rope=0.17.0=py_0 314 | - rsa=4.6=py_0 315 | - rtree=0.9.4=py37h21ff451_1 316 | - ruamel_yaml=0.15.87=py37he774522_1 317 | - scikit-image=0.16.2=py37h47e9c7a_0 318 | - scikit-learn=0.23.2=py37h47e9c7a_0 319 | - scipy=1.5.2=py37h9439919_0 320 | - seaborn=0.11.0=py_0 321 | - send2trash=1.5.0=py37_0 322 | - setuptools=50.3.0=py37h9490d1a_1 323 | - simplegeneric=0.8.1=py37_2 324 | - singledispatch=3.4.0.3=py37_1000 325 | - sip=4.19.8=py37h6538335_0 326 | - six=1.15.0=py_0 327 | - snappy=1.1.8=h33f27b4_0 328 | - snowballstemmer=2.0.0=py_0 329 | - sortedcollections=1.2.1=py_0 330 | - sortedcontainers=2.2.2=py_0 331 | - soupsieve=2.0.1=py_0 332 | - sphinx=3.2.1=py_0 333 | - sphinxcontrib=1.0=py37_1 334 | - sphinxcontrib-applehelp=1.0.2=py_0 335 | - sphinxcontrib-devhelp=1.0.2=py_0 336 | - sphinxcontrib-htmlhelp=1.0.3=py_0 337 | - sphinxcontrib-jsmath=1.0.1=py_0 338 | - sphinxcontrib-qthelp=1.0.3=py_0 339 | - sphinxcontrib-serializinghtml=1.1.4=py_0 340 | - sphinxcontrib-websupport=1.2.4=py_0 341 | - spyder=4.1.5=py37_0 342 | - spyder-kernels=1.9.4=py37_0 343 | - sqlalchemy=1.3.19=py37he774522_0 344 | - sqlite=3.33.0=h2a8f88b_0 345 | - sqlparse=0.4.1=py_0 346 | - statsmodels=0.11.1=py37he774522_0 347 | - sympy=1.6.2=py37_0 348 | - tbb=2020.3=h74a9793_0 349 | - tblib=1.7.0=py_0 350 | - tensorboard=2.2.1=pyh532a8cf_0 351 | - tensorboard-plugin-wit=1.6.0=py_0 352 | - tensorflow=2.1.0=eigen_py37hd727fc0_0 353 | - tensorflow-base=2.1.0=eigen_py37h49b2757_0 354 | - tensorflow-estimator=2.1.0=pyhd54b08b_0 355 | - termcolor=1.1.0=py37_1 356 | - terminado=0.8.3=py37_0 
357 | - testpath=0.4.4=py_0 358 | - threadpoolctl=2.1.0=pyh5ca1d4c_0 359 | - tk=8.6.10=he774522_0 360 | - toml=0.10.1=py_0 361 | - toolz=0.11.1=py_0 362 | - tornado=6.0.4=py37he774522_1 363 | - tqdm=4.50.2=py_0 364 | - traitlets=5.0.4=py_0 365 | - typed-ast=1.4.1=py37he774522_0 366 | - typing_extensions=3.7.4.3=py_0 367 | - ujson=4.0.0=py37ha925a31_0 368 | - unicodecsv=0.14.1=py37_0 369 | - unidecode=1.1.1=py_0 370 | - urllib3=1.25.10=py_0 371 | - vc=14.1=h0510ff6_4 372 | - vs2015_runtime=14.16.27012=hf0eaf9b_3 373 | - watchdog=0.10.3=py37_0 374 | - wcwidth=0.2.5=py_0 375 | - webencodings=0.5.1=py37_1 376 | - werkzeug=0.16.1=py_0 377 | - wheel=0.35.1=py_0 378 | - widgetsnbextension=3.5.1=py37_0 379 | - win_inet_pton=1.1.0=py37_0 380 | - win_unicode_console=0.5=py37_0 381 | - wincertstore=0.2=py37_0 382 | - winpty=0.4.3=4 383 | - wrapt=1.11.2=py37he774522_0 384 | - xlrd=1.2.0=py37_0 385 | - xlsxwriter=1.3.6=py_0 386 | - xlwings=0.20.7=py37_0 387 | - xlwt=1.3.0=py37_0 388 | - xmltodict=0.12.0=py_0 389 | - xorg-kbproto=1.0.7=h301d43c_1002 390 | - xorg-libice=1.0.10=h301d43c_0 391 | - xorg-libsm=1.2.3=h301d43c_1000 392 | - xorg-libx11=1.6.12=h301d43c_0 393 | - xorg-libxau=1.0.9=h301d43c_0 394 | - xorg-libxdmcp=1.1.3=h301d43c_0 395 | - xorg-libxext=1.3.4=h301d43c_0 396 | - xorg-libxpm=3.5.13=h301d43c_0 397 | - xorg-libxrender=0.9.10=h301d43c_1002 398 | - xorg-libxt=1.1.5=h301d43c_1003 399 | - xorg-renderproto=0.11.1=h301d43c_1002 400 | - xorg-xextproto=7.3.0=h301d43c_1002 401 | - xorg-xproto=7.0.31=h301d43c_1007 402 | - xz=5.2.5=h62dcd97_0 403 | - yaml=0.2.5=he774522_0 404 | - yapf=0.30.0=py_0 405 | - yarl=1.6.0=py37he774522_0 406 | - zeromq=4.3.2=ha925a31_3 407 | - zict=2.0.0=py_0 408 | - zipp=3.3.0=py_0 409 | - zlib=1.2.11=h62dcd97_4 410 | - zope=1.0=py37_1 411 | - zope.event=4.4=py37_0 412 | - zope.interface=5.1.2=py37he774522_0 413 | - zstd=1.4.5=h04227a9_0 414 | - pip: 415 | - appdirs==1.4.4 416 | - attrs==18.1.0 417 | - bleach==3.1.5 418 | - configargparse==1.2.3 
419 | - coursera-dl==0.11.5 420 | - edx-dl==0.1.13 421 | - featuretools==0.16.0 422 | - gower==0.0.5 423 | - hyperopt==0.2.4 424 | - importlib-metadata==2.1.1 425 | - mlxtend==0.17.2 426 | - notebook-as-pdf==0.4.0 427 | - py4j==0.10.9 428 | - pyee==8.1.0 429 | - pynisher==0.5.0 430 | - pypdf2==1.26.0 431 | - pyppeteer==0.2.5 432 | - pyprind==2.11.2 433 | - pyspark==3.0.1 434 | - swifter==0.305 435 | - websockets==8.1 436 | - youtube-dl==2020.3.8 437 | prefix: C:\Users\New User\Anaconda3 438 | -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from matplotlib import pyplot as plt 4 | from functools import reduce 5 | import gc 6 | import featuretools as ft 7 | from sklearn.preprocessing import LabelEncoder 8 | from sklearn.model_selection import train_test_split 9 | from sklearn.pipeline import make_pipeline, Pipeline 10 | from sklearn.preprocessing import StandardScaler 11 | from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, cross_val_score 12 | from sklearn.decomposition import PCA 13 | from xgboost import XGBClassifier 14 | from lightgbm import LGBMClassifier 15 | from catboost import CatBoostClassifier 16 | from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score, precision_recall_curve, cohen_kappa_score 17 | from tensorflow.keras.models import Model, Sequential 18 | from tensorflow.keras.layers import Dense, Dropout, Activation, BatchNormalization, Conv2D, MaxPooling2D, Flatten, LSTM 19 | from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint 20 | from tensorflow.keras.utils import to_categorical 21 | from sklearn.cluster import AgglomerativeClustering 22 | from imblearn.over_sampling import SMOTE 23 | import warnings 24 | from sklearn.exceptions import ConvergenceWarning 25 | warnings.simplefilter(action='ignore', category=ConvergenceWarning) 
26 | from pdb import set_trace as st 27 | 28 | 29 | class classifier: 30 | 31 | def __init__(self, args): 32 | self.ft_maxdep = args.ft_maxdep 33 | self.cnn_bsize = args.cnn_bsize 34 | self.cnn_epoch = args.cnn_epoch 35 | self.rnn_bsize = args.rnn_bsize 36 | self.rnn_epoch = args.rnn_epoch 37 | self.pca_n = args.pca_n 38 | self.nfolds = args.nfolds 39 | self.test_size = args.test_size 40 | self.batch_size = args.batch_size 41 | self.epoch = args.epoch 42 | 43 | if args.use_ftools: 44 | self.data = self.automated_features 45 | elif args.use_cnnft: 46 | self.data = self.bureau_cnn_features() 47 | elif args.use_rnnft: 48 | self.data = self.bureau_rnn_features() 49 | else: 50 | self.data = self.merge_tables() 51 | 52 | def hc_prv_app(self): 53 | """ 54 | This method reads previous_application table that includes the recorded 55 | previous credits at Home Credit financial institution, perform manually feature engineering, 56 | flatten multiple loans, and returns the statistics related to each application SK_ID_CURR. 57 | """ 58 | 59 | print('Processing previous_application table related to Home Credit source...') 60 | prev = pd.read_csv('data/previous_application.csv') 61 | 62 | # ------------------------------Feature Engineering (1): General --------------------------- 63 | # When was the last application applied and contract status? 64 | prev1 = prev.sort_values('DAYS_DECISION', ascending=False). \ 65 | groupby(['SK_ID_CURR']).agg( 66 | {'DAYS_DECISION': 'first', 'NAME_CONTRACT_STATUS': 'first', 'AMT_CREDIT': 'first'}) 67 | # last credit amount, interest rates, ... 
the most recent approved 68 | df = prev[prev['NAME_CONTRACT_STATUS'] == 'Approved'] \ 69 | .sort_values('DAYS_DECISION', ascending=False).groupby('SK_ID_CURR').first() 70 | df = df[['NAME_CONTRACT_TYPE', 'AMT_APPLICATION', 'AMT_CREDIT', 'AMT_DOWN_PAYMENT', 71 | 'NAME_YIELD_GROUP', 'NFLAG_INSURED_ON_APPROVAL', 'RATE_INTEREST_PRIMARY', 'RATE_INTEREST_PRIVILEGED']] 72 | df['Diff_applied_apprved'] = df['AMT_APPLICATION'] - df['AMT_CREDIT'] 73 | prev1 = prev1.merge(df, on='SK_ID_CURR', how='outer') 74 | 75 | # ----Feature Engineering (2): Ratios of Approved, Refused, Canceled, and Unused offer for each application----- 76 | df = prev.groupby(['SK_ID_CURR', 'NAME_CONTRACT_STATUS']).agg({'SK_ID_PREV': 'count'}) 77 | df = df.groupby(level='SK_ID_CURR').apply(lambda x: x / x.sum()) 78 | approved = df[df.index.isin(['Approved'], level='NAME_CONTRACT_STATUS')]['SK_ID_PREV'] 79 | approved.index = approved.index.droplevel('NAME_CONTRACT_STATUS') 80 | refused = df[df.index.isin(['Refused'], level='NAME_CONTRACT_STATUS')]['SK_ID_PREV'] 81 | refused.index = refused.index.droplevel('NAME_CONTRACT_STATUS') 82 | canceled = df[df.index.isin(['Canceled'], level='NAME_CONTRACT_STATUS')]['SK_ID_PREV'] 83 | canceled.index = canceled.index.droplevel('NAME_CONTRACT_STATUS') 84 | unused = df[df.index.isin(['Unused offer'], level='NAME_CONTRACT_STATUS')]['SK_ID_PREV'] 85 | unused.index = unused.index.droplevel('NAME_CONTRACT_STATUS') 86 | data_frames = [approved, refused, canceled, unused] 87 | df = reduce(lambda left, right: pd.merge(left, right, on='SK_ID_CURR', how='outer'), data_frames) 88 | df.columns = ['ratio_approved', 'ratio_refused', 'ratio_canceled', 'ratio_unused'] 89 | prev1 = prev1.merge(df, on='SK_ID_CURR', how='outer') 90 | 91 | # -----Feature Engineering (3): Latest credit time and amount for the approved different NAME_CONTRACT_TYPE ---- 92 | df = prev[prev['NAME_CONTRACT_STATUS'] == 'Approved'].sort_values('DAYS_DECISION', ascending=False). 
\ 93 | groupby(['SK_ID_CURR', 'NAME_CONTRACT_TYPE']).agg({'DAYS_DECISION': 'first', 'AMT_CREDIT': 'first'}) 94 | consumer_loans = df[df.index.isin(['Consumer loans'], level='NAME_CONTRACT_TYPE')][ 95 | ['DAYS_DECISION', 'AMT_CREDIT']] 96 | consumer_loans.index = consumer_loans.index.droplevel('NAME_CONTRACT_TYPE') 97 | cash_loans = df[df.index.isin(['Cash loans'], level='NAME_CONTRACT_TYPE')][['DAYS_DECISION', 'AMT_CREDIT']] 98 | cash_loans.index = cash_loans.index.droplevel('NAME_CONTRACT_TYPE') 99 | revolving_loans = df[df.index.isin(['Revolving loans'], level='NAME_CONTRACT_TYPE')][ 100 | ['DAYS_DECISION', 'AMT_CREDIT']] 101 | revolving_loans.index = revolving_loans.index.droplevel('NAME_CONTRACT_TYPE') 102 | dfs = [consumer_loans, cash_loans, revolving_loans] 103 | df = reduce(lambda left, right: pd.merge(left, right, on='SK_ID_CURR', how='outer'), dfs) 104 | df.columns = ['Days_consumerloans', 'AMT_credit_consumerloans', 'Days_cashloans', 'AMT_credit_cashloans', 105 | 'Days_revolvingloans', 'AMT_credit_revolvingloans'] 106 | prev1 = prev1.merge(df, on='SK_ID_CURR', how='outer') 107 | del prev, df 108 | gc.collect() 109 | return prev1 110 | 111 | def hc_credit_card(self): 112 | """ 113 | This method reads credit_card_balance table that includes the recorded 114 | previous credit card transactions at Home Credit financial institution, perform manually feature engineering, 115 | flatten many transactions, and returns the statistics related to each application SK_ID_CURR. 
116 | """ 117 | 118 | print('Processing credit_card_balance table related to Home Credit source...') 119 | ccb = pd.read_csv('data/credit_card_balance.csv') 120 | ccb['Beyond_limit'] = np.where(ccb['AMT_BALANCE'] > ccb['AMT_CREDIT_LIMIT_ACTUAL'], 1, 0) 121 | ccb['Delinquent'] = np.where(ccb['AMT_PAYMENT_CURRENT'] < ccb['AMT_INST_MIN_REGULARITY'], 1, 0) 122 | 123 | # Percentage change of credit card balance between two successive months 124 | def _pct_diff(group): 125 | group['balance_pct_change'] = group['AMT_BALANCE'].diff() / (group['AMT_BALANCE'].shift(1) + 1) 126 | group['balance_pct_change'] = np.where(group['balance_pct_change'] > 30, 30, group['balance_pct_change']) 127 | return group 128 | 129 | ccb = ccb.sort_values('MONTHS_BALANCE').groupby('SK_ID_PREV').apply(_pct_diff) 130 | # how many times credit card reached 80% and above? 131 | ccb['times_bal80'] = np.where(ccb['AMT_BALANCE'] / ccb['AMT_CREDIT_LIMIT_ACTUAL'] >= 0.8, 1, 0) 132 | # How many credit card do applicant have at the moment? 133 | # What is the total balance on them? 
(outstanding debt of credit card) 134 | ccb['has_cc_now'] = np.where(ccb['MONTHS_BALANCE'] == -1, 1, 0) 135 | ccb['balance_now'] = ccb['has_cc_now'] * ccb['AMT_BALANCE'] 136 | stats = ccb.groupby('SK_ID_CURR', as_index=False).agg( 137 | {'has_cc_now': 'sum', 'balance_now': 'sum', 'times_bal80': 'sum'}) 138 | # Flatten last four months balance and percentage change for each applicant 139 | cols = ['MONTHS_BALANCE', 'Beyond_limit', 'Delinquent', 'balance_pct_change'] 140 | cclast = ccb.sort_values('MONTHS_BALANCE', ascending=False).groupby(['SK_ID_CURR'])[cols].nth(0) 141 | cclag1 = ccb.sort_values('MONTHS_BALANCE', ascending=False).groupby(['SK_ID_CURR'])[cols].nth(1) 142 | cclag2 = ccb.sort_values('MONTHS_BALANCE', ascending=False).groupby(['SK_ID_CURR'])[cols].nth(2) 143 | cclag3 = ccb.sort_values('MONTHS_BALANCE', ascending=False).groupby(['SK_ID_CURR'])[cols].nth(3) 144 | dfs = [cclast, cclag1, cclag2, cclag3] 145 | lags = reduce(lambda left, right: pd.concat([left, right], axis=1, sort=False), dfs) 146 | columns = [] 147 | for i in range(4): 148 | columns += ['MONTHS_BALANCE' + str(i), 'Beyond_limit' + str(i), 'Delinquent' + str(i), 149 | 'balance_pct_change' + str(i)] 150 | lags.columns = columns 151 | stats = stats.merge(lags, on='SK_ID_CURR', how='outer') 152 | del lags, ccb 153 | gc.collect() 154 | return stats 155 | 156 | def hc_installment(self): 157 | """ 158 | This method reads installments_payments table that includes the recorded 159 | previous installments at Home Credit financial institution, perform manually feature engineering, 160 | flatten many transactions, and returns the statistics related to each application SK_ID_CURR. 161 | """ 162 | 163 | print('Processing installments table related to Home Credit source...') 164 | insta = pd.read_csv('data/installments_payments.csv') 165 | # 'DAYS_INSTALMENT': days before credit card supposed to be paid, 166 | # 'DAYS_ENTRY_PAYMENT': days that amount was acutually paid. 
167 | insta['insta_delinquency'] = np.where(insta['DAYS_INSTALMENT'] >= insta['DAYS_ENTRY_PAYMENT'], 0, 1) 168 | insta['insta_debt'] = insta['AMT_INSTALMENT'] - insta['AMT_PAYMENT'] 169 | stats = insta.sort_values('DAYS_INSTALMENT', ascending=False).groupby(['SK_ID_CURR']).agg( 170 | {'DAYS_INSTALMENT': 'first', 'insta_debt': ['sum', 'mean', 'first'], 'insta_delinquency': ['sum', 'first']}) 171 | stats.columns = stats.columns.map('_'.join) 172 | del insta 173 | gc.collect() 174 | return stats 175 | 176 | def hc_pos_cash(self): 177 | """ 178 | This method reads POS_CASH_balance table that includes the recorded 179 | previous point of sale (POS) at Home Credit financial institution, perform manually feature engineering, 180 | flatten many transactions, and returns the statistics related to each application SK_ID_CURR. 181 | """ 182 | 183 | print('Processing POS_CASH_balance table related to Home Credit source...') 184 | pc = pd.read_csv('data/POS_CASH_balance.csv') 185 | # Flatten all the columns for the latest 4 POS data for each application 186 | pc = pc.sort_values('MONTHS_BALANCE', ascending=False) 187 | cols = ['MONTHS_BALANCE', 'SK_DPD', 'SK_DPD_DEF'] 188 | pos0 = pc.groupby(['SK_ID_CURR'])[cols].first() 189 | pos1 = pc.groupby(['SK_ID_CURR'])[cols].nth(1) 190 | pos2 = pc.groupby(['SK_ID_CURR'])[cols].nth(2) 191 | pos3 = pc.groupby(['SK_ID_CURR'])[cols].nth(3) 192 | data_frames = [pos0, pos1, pos2, pos3] 193 | poslag = reduce(lambda left, right: pd.concat([left, right], axis=1, sort=False), data_frames) 194 | columns = [] 195 | for i in range(4): 196 | columns += ['MONTHS_BALANCE' + str(i), 'SK_DPD' + str(i), 'SK_DPD_DEF' + str(i)] 197 | poslag.columns = columns 198 | del pc, pos0, pos1, pos2, pos3 199 | gc.collect() 200 | return poslag 201 | 202 | def application_train(self): 203 | """ 204 | This method reads application_train table that includes all the current applications, cleans it, and 205 | performs manually feature engineering 206 | """ 207 | 208 | 
print('Processing application_train table for the current loan application') 209 | train = pd.read_csv('data/application_train.csv') 210 | # Delete four applications with XNA CODE_GENDER (train set) 211 | train = train[train['CODE_GENDER'] != 'XNA'] 212 | # Replace DAYS_EMPLOYED = 365243 by nan 213 | train['DAYS_EMPLOYED'].replace({365243: np.nan}, inplace=True) 214 | # Feature engineering 215 | train['Days_employed_age'] = train['DAYS_EMPLOYED'] / train['DAYS_BIRTH'] 216 | train['Credit_income_ratio'] = train['AMT_CREDIT'] / train['AMT_INCOME_TOTAL'] 217 | train['Anuity_income_ratio'] = train['AMT_ANNUITY'] / train['AMT_INCOME_TOTAL'] 218 | train['Income_per_person'] = train['AMT_INCOME_TOTAL'] / train['CNT_FAM_MEMBERS'] 219 | # length of the payment in months since the annuity is the monthly amount due 220 | train['Credit_term'] = train['AMT_ANNUITY'] / train['AMT_CREDIT'] 221 | return train 222 | 223 | def bureau(self): 224 | 225 | """ 226 | This method reads bureau and bureau_balance tables that includes the recorded 227 | at Bureau, perform manually feature engineering,flatten transactions, and 228 | returns the statistics related to each application SK_ID_CURR. 
229 | """ 230 | 231 | print('Processing bureau and bureau balance tables...') 232 | bureau = pd.read_csv('data/bureau.csv') 233 | 234 | # ------------------------------------Feature Engineering (1): General --------------------------------------- 235 | bureau['Days_early_paidoff'] = bureau['DAYS_CREDIT_ENDDATE'] - bureau['DAYS_ENDDATE_FACT'] 236 | bureau['Duration_real'] = bureau['DAYS_ENDDATE_FACT'] - bureau['DAYS_CREDIT'] 237 | bureau['Duration_planned'] = bureau['DAYS_CREDIT_ENDDATE'] - bureau['DAYS_CREDIT'] 238 | # Replace data with Duration_planned = 0 with 1 to avoide devision by zero 239 | bureau['Duration_planned'].replace({0: 1}, inplace=True) 240 | # Weighted sum of credit a person borrowed (per days) 241 | bureau['AMT_weightdebt_duration'] = bureau['AMT_CREDIT_SUM_DEBT'] / bureau['Duration_planned'] 242 | # 'AMT_CREDIT_SUM_OVERDUE': Current amount overdue on credit 243 | bureau['AMT_Overdue_duration'] = bureau['AMT_CREDIT_SUM_OVERDUE'] / bureau['Duration_planned'] 244 | # Maximal amount overdue so far 245 | bureau['AMT_Maxoverdue_duration'] = bureau['AMT_CREDIT_MAX_OVERDUE'] / bureau['Duration_planned'] 246 | # Defaulted: CREDIT_DAY_OVERDUE > 270 days is considered defaluted 247 | bureau['Defaulted'] = np.where(bureau['CREDIT_DAY_OVERDUE'] > 270, 1, 0) 248 | bureau['AMT_defaulted'] = bureau['Defaulted'] * bureau['AMT_CREDIT_SUM_DEBT'] 249 | # Encoding CREDIT_ACTIVE ('Closed','Active') to (0,1) 250 | mapping = {'Closed': 0, 'Active': 1} 251 | bureau['CREDIT_ACTIVE'] = bureau['CREDIT_ACTIVE'].map(mapping) 252 | # Flatten manual features with aggregations 253 | stats = bureau.sort_values('DAYS_CREDIT', ascending=False).groupby('SK_ID_CURR') \ 254 | .agg({'AMT_CREDIT_SUM_DEBT': ['count', 'sum', 'mean'], 255 | 'AMT_weightdebt_duration': ['sum', 'mean'], 256 | 'AMT_Overdue_duration': ['sum', 'mean'], 257 | 'AMT_Maxoverdue_duration': ['mean'], 258 | 'Days_early_paidoff': ['sum', 'mean'], 259 | 'Defaulted': ['sum', 'mean'], 260 | 'AMT_defaulted': ['sum', 
'mean'], 261 | 'CREDIT_ACTIVE': 'sum'}) 262 | stats.columns = stats.columns.map('_'.join) 263 | # Flatten last four stats for each applicant (nth() method does not work with .agg method) 264 | columns = ['SK_ID_CURR', 'DAYS_CREDIT', 'AMT_CREDIT_SUM_DEBT', 'AMT_weightdebt_duration', 265 | 'AMT_Overdue_duration', \ 266 | 'Days_early_paidoff', 'Defaulted', 'AMT_defaulted'] 267 | stats0 = bureau.sort_values('DAYS_CREDIT', ascending=False)[columns].groupby('SK_ID_CURR').nth(0) 268 | stats1 = bureau.sort_values('DAYS_CREDIT', ascending=False)[columns].groupby('SK_ID_CURR').nth(1) 269 | stats2 = bureau.sort_values('DAYS_CREDIT', ascending=False)[columns].groupby('SK_ID_CURR').nth(2) 270 | stats3 = bureau.sort_values('DAYS_CREDIT', ascending=False)[columns].groupby('SK_ID_CURR').nth(3) 271 | data_frames = [stats0, stats1, stats2, stats3] 272 | lags = reduce(lambda left, right: pd.concat([left, right], axis=1, sort=False), data_frames) 273 | col = [] 274 | for i in range(4): 275 | col += ['DAYS_CREDIT' + str(i), 'AMT_CREDIT_SUM_DEBT' + str(i), 'AMT_weightdebt_duration' + str(i), \ 276 | 'AMT_Overdue_duration' + str(i), 'Days_early_paidoff' + str(i), 'Defaulted' + str(i), 277 | 'AMT_defaulted' + str(i)] 278 | lags.columns = col 279 | stats = pd.merge(stats, lags, on='SK_ID_CURR', how='outer') 280 | 281 | # ------------------------Feature Engineering (2): Loan cycle life for different CREDIT_TYPE-------------------- 282 | # Select 6 categories of loans with the highest frequency 283 | filter = (bureau['CREDIT_TYPE'] == 'Mortgage') | (bureau['CREDIT_TYPE'] == 'Real estate loan') \ 284 | | (bureau['CREDIT_TYPE'] == 'Car loan') \ 285 | | (bureau['CREDIT_TYPE'] == 'Loan for business development') \ 286 | | (bureau['CREDIT_TYPE'] == 'Loan for the purchase of equipment') \ 287 | | (bureau['CREDIT_TYPE'] == 'Cash loan (non-earmarked)') \ 288 | | (bureau['CREDIT_TYPE'] == 'Loan for working capital replenishment') 289 | btype = bureau[filter].copy() 290 | # first(DAYS_CREDIT) is when 
the last credit of each credit type applied and 291 | # last(DAYS_CREDIT) is when the first onces applied. 292 | bt_stats = btype.sort_values('DAYS_CREDIT', ascending=False).groupby(['SK_ID_CURR', 'CREDIT_TYPE']) \ 293 | .agg({'DAYS_CREDIT': ['first', 'last'], 'AMT_CREDIT_SUM_DEBT': ['count', 'sum', 'mean'], \ 294 | 'CREDIT_ACTIVE': 'sum', 'Defaulted': 'sum', 'AMT_defaulted': ['sum']}) 295 | bt_stats.columns = bt_stats.columns.map('_'.join) 296 | mortgage = bt_stats[bt_stats.index.isin(['Mortgage'], level='CREDIT_TYPE')] 297 | mortgage.index = mortgage.index.droplevel('CREDIT_TYPE') 298 | realestate = bt_stats[bt_stats.index.isin(['Real estate loan'], level='CREDIT_TYPE')] 299 | realestate.index = realestate.index.droplevel('CREDIT_TYPE') 300 | carloan = bt_stats[bt_stats.index.isin(['Car loan'], level='CREDIT_TYPE')] 301 | carloan.index = carloan.index.droplevel('CREDIT_TYPE') 302 | loanbusiness = bt_stats[bt_stats.index.isin(['Loan for business development'], level='CREDIT_TYPE')] 303 | loanbusiness.index = loanbusiness.index.droplevel('CREDIT_TYPE') 304 | loanpurchase = bt_stats[bt_stats.index.isin(['Loan for the purchase of equipment'], level='CREDIT_TYPE')] 305 | loanpurchase.index = loanpurchase.index.droplevel('CREDIT_TYPE') 306 | cashloan = bt_stats[bt_stats.index.isin(['Cash loan (non-earmarked)'], level='CREDIT_TYPE')] 307 | cashloan.index = cashloan.index.droplevel('CREDIT_TYPE') 308 | workingloan = bt_stats[bt_stats.index.isin(['Loan for working capital replenishment'], level='CREDIT_TYPE')] 309 | workingloan.index = workingloan.index.droplevel('CREDIT_TYPE') 310 | dataframes = [mortgage, realestate, carloan, loanbusiness, loanpurchase, cashloan, workingloan] 311 | credit_type = reduce(lambda left, right: pd.merge(left, right, on='SK_ID_CURR', how='outer'), dataframes) 312 | types = ['mortgage', 'realestate', 'carloan', 'loanbusiness', 'loanpurchase', 'cashloan', 'workingloan'] 313 | columns = [] 314 | for s in types: 315 | columns += 
['DAYS_CREDIT_first_' + s, 'DAYS_CREDIT_last_' + s, 'AMT_CREDIT_SUM_DEBT_count_' + s, \ 316 | 'AMT_CREDIT_SUM_DEBT_sum_' + s, 'AMT_CREDIT_SUM_DEBT_mean_' + s, \ 317 | 'CREDIT_ACTIVE_sum_x_' + s, 'Defaulted_sum_x_' + s, 'AMT_defaulted_sum_' + s] 318 | credit_type.columns = columns 319 | stats.merge(credit_type, on='SK_ID_CURR', how='outer') 320 | 321 | # -----------------------------Feature Engineering (3): Loan cycle life for Credit Card------------------------- 322 | ccdebt_bureau = bureau[bureau['CREDIT_TYPE'] == 'Credit card'].sort_values('DAYS_CREDIT', ascending=False) \ 323 | .groupby('SK_ID_CURR').agg({'AMT_CREDIT_SUM_DEBT': ['count', 'sum', 'mean', 'first']}) 324 | ccdebt_bureau.columns = ccdebt_bureau.columns.map('_'.join) 325 | stats = stats.merge(ccdebt_bureau, on='SK_ID_CURR', how='outer') 326 | # ------------------------------Feature Engineering (4): Bureau Balance Table----------------------------------- 327 | bureaubal = pd.read_csv('data/bureau_balance.csv') 328 | # When did the credit closed? When was the last delinquency for each bureau credit? 329 | # Last close is obtained by first 0 in MONTHS_BALANCE, last delinquency is obtained from first 1. 330 | bbalance = bureaubal.groupby(['SK_ID_BUREAU', 'STATUS'], as_index=False).first() 331 | left = bbalance[(bbalance['STATUS'] == '0')][['SK_ID_BUREAU', 'MONTHS_BALANCE']] 332 | right = bbalance[(bbalance['STATUS'] == '1')][['SK_ID_BUREAU', 'MONTHS_BALANCE']] 333 | bbalance = pd.merge(left, right, on='SK_ID_BUREAU', how='left') 334 | bbalance.columns = ['SK_ID_BUREAU', 'Months_latest_open', 'Months_latest_delin'] 335 | # Delinquency ratios: how often each bureau delayed? 
336 | delinquency = pd.get_dummies(bureaubal[(bureaubal['STATUS'] != 'X') & (bureaubal['STATUS'] != 'C')]) 337 | delinquency = delinquency.groupby('SK_ID_BUREAU').agg( 338 | {'STATUS_0': 'mean', 'STATUS_1': 'mean', 'STATUS_2': 'mean', 'STATUS_3': 'mean', 339 | 'STATUS_4': 'mean', 'STATUS_5': 'mean'}) 340 | bbalance = bbalance.merge(delinquency, on='SK_ID_BUREAU', how='inner') 341 | # Add SK_ID_CURR to bbalance dataframe 342 | bbalance = bureau[['SK_ID_CURR', 'SK_ID_BUREAU']].merge(bbalance, on='SK_ID_BUREAU', how='inner') 343 | # pick the latest open SK_ID_Bureau 344 | bbalance = bbalance.sort_values('Months_latest_open', ascending=False).groupby('SK_ID_CURR').first() 345 | # merge with stats 346 | stats = stats.merge(bbalance, on='SK_ID_CURR', how='outer') 347 | del bbalance, bureau, bureaubal, bt_stats, ccdebt_bureau 348 | gc.collect() 349 | return stats 350 | 351 | def merge_tables(self): 352 | ''' 353 | This method merges the tables from bureau and home credit sources with the application_train table, 354 | and return one row for each SK_ID_CURR. Automated feature engineering and deep learning feature extraction 355 | are not included in this method. 356 | ''' 357 | 358 | prev_home = self.hc_prv_app() 359 | prev_home = prev_home.merge(self.hc_credit_card(), on='SK_ID_CURR', how='outer') 360 | prev_home = prev_home.merge(self.hc_installment(), on='SK_ID_CURR', how='outer') 361 | prev_home = prev_home.merge(self.hc_pos_cash(), on='SK_ID_CURR', how='outer') 362 | train = self.application_train() 363 | train = train.merge(self.bureau(), on='SK_ID_CURR', how='left') 364 | train = train.merge(prev_home, on='SK_ID_CURR', how='left') 365 | # train.to_csv('merged_tables.csv') 366 | del prev_home 367 | gc.collect() 368 | return train 369 | 370 | def automated_features(self): 371 | ''' 372 | This method performs automated feature engineering using feature tools package and returns a dataframe 373 | with added new features from all the tables. 
Important paramater of feature tools is max_depth of 374 | deep feature synthesis (typically set to 1 or 2). 375 | ''' 376 | 377 | print('Performing automated feature engineering using feature tools') 378 | train = pd.read_csv('data/application_train.csv') 379 | bureau = pd.read_csv('data/bureau.csv') 380 | bureaubal = pd.read_csv('data/bureau_balance.csv') 381 | prev = pd.read_csv('data/previous_application.csv') 382 | ccb = pd.read_csv('data/credit_card_balance.csv') 383 | insta = pd.read_csv('data/installments_payments.csv') 384 | pc = pd.read_csv('data/POS_CASH_balance.csv') 385 | 386 | # Choosing nrows data from all datasets 387 | train = train.sample(frac=1) 388 | ids = train['SK_ID_CURR'].values 389 | bureau = bureau.loc[bureau['SK_ID_CURR'].isin(ids)] 390 | idsb = bureau['SK_ID_BUREAU'].values 391 | bureaubal = bureaubal.loc[bureaubal['SK_ID_BUREAU'].isin(idsb)] 392 | prev = prev.loc[prev['SK_ID_CURR'].isin(ids)] 393 | ccb = ccb.loc[ccb['SK_ID_CURR'].isin(ids)] 394 | insta = insta.loc[insta['SK_ID_CURR'].isin(ids)] 395 | pc = pc.loc[pc['SK_ID_CURR'].isin(ids)] 396 | 397 | # creating EntitySet (collection of tables) 398 | es = ft.EntitySet(id='applications') 399 | # adding Entity (table) to EntitySet 400 | es = es.entity_from_dataframe(entity_id='train', dataframe=train, index='SK_ID_CURR') 401 | es = es.entity_from_dataframe(entity_id='bureau', dataframe=bureau, index='SK_ID_BUREAU') 402 | es = es.entity_from_dataframe(entity_id='bureaubal', dataframe=bureaubal, make_index=True, index='bb_id') 403 | es = es.entity_from_dataframe(entity_id='prev', dataframe=prev, index='SK_ID_PREV') 404 | es = es.entity_from_dataframe(entity_id='ccb', dataframe=ccb, make_index=True, index='cc_id') 405 | es = es.entity_from_dataframe(entity_id='insta', dataframe=insta, make_index=True, index='installment.id') 406 | es = es.entity_from_dataframe(entity_id='pc', dataframe=pc, make_index=True, index='pos_cash_id') 407 | # Creating relation between Entities 408 | # Relationship 
between application training and bureau 409 | r_applications_bureau = ft.Relationship(es['train']['SK_ID_CURR'], es['bureau']['SK_ID_CURR']) 410 | es = es.add_relationship(r_applications_bureau) 411 | # Relationship between bureau and bureau balance 412 | r_bureau_bureaubal = ft.Relationship(es['bureau']['SK_ID_BUREAU'], es['bureaubal']['SK_ID_BUREAU']) 413 | es = es.add_relationship(r_bureau_bureaubal) 414 | # Relationship between application training and previous applications 415 | r_app_prev = ft.Relationship(es['train']['SK_ID_CURR'], es['prev']['SK_ID_CURR']) 416 | es = es.add_relationship(r_app_prev) 417 | # Relationship between previous applications with credit card balance, pos cash, and installments 418 | r_prev_cc = ft.Relationship(es['prev']['SK_ID_PREV'], es['ccb']['SK_ID_PREV']) 419 | es = es.add_relationship(r_prev_cc) 420 | r_prev_insta = ft.Relationship(es['prev']['SK_ID_PREV'], es['insta']['SK_ID_PREV']) 421 | es = es.add_relationship(r_prev_insta) 422 | r_prev_pc2 = ft.Relationship(es['prev']['SK_ID_PREV'], es['pc']['SK_ID_PREV']) 423 | es = es.add_relationship(r_prev_pc2) 424 | 425 | # Deep feature synthesis with depth of 2 by stacking feature primitives (aggregations and transformations) 426 | # Automated features are concatenated to the original features; Therefore, 427 | train_ft, feature_names = ft.dfs(entityset=es, target_entity='train', max_depth=self.ft_maxdep) 428 | train_ft = train_ft.reset_index() 429 | 430 | print('\nTotal number of features after adding automated features: ', train_ft.shape[1]) 431 | del train, bureau, bureaubal, prev, ccb, insta, pc 432 | gc.collect() 433 | return train_ft 434 | 435 | def bureau_cnn_features(self): 436 | ''' 437 | Convolution Neural Network (CNN) is used to extract new feature from sequential data from bureau and 438 | bureaubal tables. The method concatenats new features to applicaiton_train table and returns final dataframe. 
439 | ''' 440 | 441 | print('Extracting features using Convolutional Neural Network (CNN) ...') 442 | train = pd.read_csv('data/application_train.csv') 443 | idl = train['SK_ID_CURR'].values 444 | bureau = pd.read_csv('data/bureau.csv') 445 | # Imputating the missing data in bureau table 446 | # Missing categorical features are imputed with 'Not_applicable' 447 | # Missing numeric features are imputed with Zero (logical choice for this dataset) 448 | cols = bureau.select_dtypes(include=object).columns 449 | bureau[cols] = bureau[cols].fillna('Not_Applicable') 450 | cols = bureau.select_dtypes(exclude=object).columns 451 | bureau[cols] = bureau[cols].fillna(0) 452 | # One-hot encoding of categorical features 453 | bureau = pd.get_dummies(bureau, drop_first=True) 454 | bureau = bureau.sort_values('DAYS_CREDIT', ascending=False) 455 | lst = bureau['SK_ID_CURR'].values 456 | lst = list(set(lst)) 457 | lst.sort() 458 | 459 | # Making bureau table data structure similar to an image 460 | # Applications are grouoped by SK_ID_CURR and for each SK_ID_CURR, the 5 most recent SK_ID_BUREAU is considered. 461 | # If an SK_ID_CURR did not have 5 records, empty rows added and filled with -99 (to avoid confusion with zero). 
462 | group = bureau.groupby('SK_ID_CURR') 463 | b = [] # b is the reshaped data structure of bureau table, suitable for use in CNN 464 | j = 0 465 | for sk in idl: 466 | if sk in lst: 467 | a = group.get_group(lst[j]) 468 | if a.shape[0] >= 5: 469 | a = a[:5] 470 | else: 471 | # m99 represents rows having value of -99 472 | m99 = np.ones((5 - a.shape[0], a.shape[1])) * -99 473 | m99 = pd.DataFrame(m99, columns=a.columns) 474 | a = a.append(m99) 475 | a = a.drop(['SK_ID_CURR', 'SK_ID_BUREAU'], axis=1) 476 | a = a.values.flatten().tolist() 477 | b.extend(a) 478 | j += 1 479 | else: 480 | m99 = np.ones((5, bureau.shape[1])) * -99 481 | m99 = pd.DataFrame(m99, columns=bureau.columns) 482 | m99 = m99.drop(['SK_ID_CURR', 'SK_ID_BUREAU'], axis=1) 483 | m99 = m99.values.flatten().tolist() 484 | b.extend(m99) 485 | b = np.array(b) 486 | b = np.reshape(b, (idl.shape[0], 5, bureau.shape[1] - 2, 1)) 487 | print('shape of channel(bureau):', b.shape) 488 | y = train['TARGET'] 489 | y = to_categorical(y, 2) 490 | 491 | # Deep CNN implementation 492 | # CNN architecture includes 2 convolution layer followed by two fully connected layer 493 | np.random.seed(5) 494 | model = Sequential() 495 | 496 | # 1st conv layer 497 | model.add(Conv2D(filters=32, kernel_size=(5, 5), padding="same", 498 | input_shape=(b.shape[1], b.shape[2], 1), data_format="channels_last" 499 | )) 500 | model.add(Activation("relu")) 501 | model.add(MaxPooling2D(pool_size=(2, 2))) 502 | # 2nd conv layer 503 | model.add(Conv2D(32, (5, 5), padding="same")) 504 | model.add(Activation("relu")) 505 | model.add(MaxPooling2D(pool_size=(2, 2))) 506 | model.add(Flatten()) 507 | # FC1 508 | model.add(Dense(units=128)) 509 | model.add(Activation("relu")) 510 | model.add(Dropout(0.5)) 511 | # FC2 512 | model.add(Dense(units=100, name='feature_extract')) 513 | model.add(Activation("relu")) 514 | # output FC 515 | model.add(Dense(units=2, activation='sigmoid')) 516 | model.build() 517 | model.compile(optimizer='Adam', 
loss='binary_crossentropy', metrics=['AUC']) 518 | model.summary() 519 | 520 | # Train deep neural network 521 | early_stops = EarlyStopping(patience=5, monitor='val_auc') 522 | mc = ModelCheckpoint('best_model.h5', 523 | monitor='val_loss', 524 | verbose=0, 525 | save_best_only=True) 526 | model.fit(b, y, validation_split=0.05, 527 | callbacks=[early_stops, mc], batch_size=self.cnn_bsize, epochs=self.cnn_epoch, verbose=1) 528 | 529 | # Extract the useful featuer from CNN after training the deep nerual network 530 | intermediate_layer_model = Model(inputs=model.input, 531 | outputs=model.get_layer('feature_extract').output) 532 | intermediate_layer_model.summary() 533 | 534 | # predict to get featured data 535 | feauture_engg_data = intermediate_layer_model.predict(b) 536 | feauture_engg_data = pd.DataFrame(feauture_engg_data) 537 | print('feauture_engg_data shape:', feauture_engg_data.shape) 538 | 539 | # Renaming columns 540 | new_col = [] 541 | for i in range(100): 542 | new_col.append('bfeat_%d' % (i + 1)) 543 | feauture_engg_data.columns = new_col 544 | feauture_engg_data['SK_ID_CURR'] = idl 545 | train = train.merge(feauture_engg_data, on='SK_ID_CURR', how='left') 546 | del feauture_engg_data, bureau 547 | gc.collect() 548 | return train 549 | 550 | def bureau_rnn_features(self): 551 | ''' 552 | Recurrent Neural Network (RNN) is used to extract new feature from sequential data from bureau and 553 | bureaubal tables. The method concatenats new features to applicaiton_train table and returns final dataframe. 
554 | ''' 555 | 556 | print('Extracting features using Recurrent Neural Network (RNN) ...') 557 | train = pd.read_csv('data/application_train.csv') 558 | idl = train['SK_ID_CURR'].values 559 | bureau = pd.read_csv('data/bureau.csv') 560 | # Imputating the missing data in bureau table 561 | # Missing categorical features are imputed with 'Not_applicable' 562 | # Missing numeric features are imputed with Zero (logical choice for this dataset) 563 | cols = bureau.select_dtypes(include=object).columns 564 | bureau[cols] = bureau[cols].fillna('Not_Applicable') 565 | cols = bureau.select_dtypes(exclude=object).columns 566 | bureau[cols] = bureau[cols].fillna(0) 567 | 568 | # One-hot encoding of categorical features 569 | bureau = pd.get_dummies(bureau, drop_first=True) 570 | bureau = bureau.sort_values('DAYS_CREDIT', ascending=False) 571 | lst = bureau['SK_ID_CURR'].values 572 | lst = list(set(lst)) 573 | lst.sort() 574 | 575 | # Making bureau table data structure similar to an image 576 | # Applications are grouoped by SK_ID_CURR and for each SK_ID_CURR, the 5 most recent SK_ID_BUREAU is considered. 577 | # If an SK_ID_CURR did not have 5 records, empty rows added and filled with -99 (to avoid confusion with zero). 
578 | group = bureau.groupby('SK_ID_CURR') 579 | b = [] # b is the reshaped data structure of bureau table, suitable for use in RNN 580 | j = 0 581 | for sk in idl: 582 | if sk in lst: 583 | a = group.get_group(lst[j]) 584 | if a.shape[0] >= 5: 585 | a = a[:5] 586 | else: 587 | # m99 represents rows having value of -99 588 | m99 = np.ones((5 - a.shape[0], a.shape[1])) * -99 589 | m99 = pd.DataFrame(m99, columns=a.columns) 590 | a = a.append(m99) 591 | a = a.drop(['SK_ID_CURR', 'SK_ID_BUREAU'], axis=1) 592 | a = a.values.flatten().tolist() 593 | b.extend(a) 594 | j += 1 595 | else: 596 | m99 = np.ones((5, bureau.shape[1])) * -99 597 | m99 = pd.DataFrame(m99, columns=bureau.columns) 598 | m99 = m99.drop(['SK_ID_CURR', 'SK_ID_BUREAU'], axis=1) 599 | m99 = m99.values.flatten().tolist() 600 | b.extend(m99) 601 | b = np.array(b) 602 | b = np.reshape(b, (idl.shape[0], 5, bureau.shape[1] - 2)) 603 | print('shape of channel(bureau):', b.shape) 604 | y = train['TARGET'] 605 | y = to_categorical(y, 2) 606 | 607 | # Deep RNN implementation 608 | # RNN architecture includes 2 Long Short Term Memory (LSTM) units followed by two fully connected layer 609 | np.random.seed(5) 610 | model = Sequential() 611 | # 1st LSTM layer 612 | model.add(LSTM(units=50, input_shape=(b.shape[1], b.shape[2]), return_sequences=True)) 613 | model.add(BatchNormalization()) 614 | model.add(Activation("relu")) 615 | model.add(Dropout(0.2)) 616 | # 2nd LSTM layer 617 | model.add(LSTM(50, return_sequences=True)) 618 | model.add(BatchNormalization()) 619 | model.add(Activation("relu")) 620 | model.add(Dropout(0.2)) 621 | model.add(Flatten()) 622 | # FC1 623 | model.add(Dense(units=128)) 624 | model.add(BatchNormalization()) 625 | model.add(Activation("relu")) 626 | model.add(Dropout(0.5)) 627 | # FC2 628 | model.add(Dense(units=100, name='RNN_feature_extract')) 629 | model.add(BatchNormalization()) 630 | model.add(Activation("relu")) 631 | # output FC 632 | model.add(Dense(units=2, activation='sigmoid')) 
def XGBoost(self):
    '''
    This method trains a machine learning model using the XGBoost algorithm. Before that, it drops highly
    correlated columns, imputes the empty cells in self.data, encodes categorical features using one-hot
    encoding, and searches hyper-parameters of a StandardScaler -> PCA (self.pca_n components) -> XGBClassifier
    pipeline with a randomized search.

    Returns:
        Nothing is returned; the results are stored on the instance.
        self.pred_class: Binary class prediction of the target variable (test split).
        self.pred: Probability prediction of the positive class (test split).
        self.y_test: y_test taken from the train/test split of self.data.
    '''

    print('Preprocessing final table one-hot encoding categorical features...')
    # Drop the columns with correlation > 0.98.
    # Only numeric/boolean columns are correlated: recent pandas versions raise on object columns
    # instead of silently skipping them.
    corr = self.data.select_dtypes(include=['number', 'bool']).corr()
    # Strict upper triangle, so every pair of columns is inspected exactly once.
    # The builtin bool is used because the np.bool alias no longer exists in current NumPy.
    upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if any(upper[column] > 0.98)]
    self.data = self.data.drop(to_drop, axis=1)

    # Imputing the missing data, PCA can not handle missing data
    # Missing categorical features are imputed with 'Not_Applicable'
    # Missing numeric features are imputed with zero
    object_cols = self.data.select_dtypes(include=object).columns
    self.data[object_cols] = self.data[object_cols].fillna('Not_Applicable')
    other_cols = self.data.select_dtypes(exclude=object).columns
    self.data[other_cols] = self.data[other_cols].fillna(0)

    # One-hot encoding categorical features for XGBoost algorithm.
    self.data = pd.get_dummies(self.data, drop_first=True)

    # Train and test set split
    y = self.data['TARGET']
    X = self.data.drop('TARGET', axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=self.test_size, random_state=1234)
    self.y_test = y_test

    # Diagnostic only: report how much variance the first pca_n components explain on the
    # standardized training split. The model below uses its own scaler/PCA inside the pipeline.
    pca = PCA(n_components=self.pca_n)
    pca.fit(StandardScaler().fit_transform(X_train))
    print('\nRatio of variance explained by {} principal components: '.format(self.pca_n),
          sum(pca.explained_variance_ratio_))

    pipeline = make_pipeline(StandardScaler(), PCA(n_components=self.pca_n), XGBClassifier())
    params = {
        'xgbclassifier__learning_rate': [0.05, 0.1, 0.15, 0.2],
        'xgbclassifier__max_depth': [3, 4, 5, 6, 8, 10],
        'xgbclassifier__min_child_weight': [1, 3, 5, 7],
        'xgbclassifier__gamma': [0, 0.1, 0.2, 0.3, 0.4],
        'xgbclassifier__colsample_bytree': [0.5, 0.7, 1]
    }
    print('\nApplying XGBoost classifier... \n')
    model = RandomizedSearchCV(pipeline, params, n_iter=1, scoring='roc_auc', cv=self.nfolds, n_jobs=-1, verbose=3)
    model.fit(X_train, y_train)
    print('\nCross validation best score(AUC) is:', model.best_score_)
    # Hyperparameters of the model with the best performance
    print('\nModel best hyperparamters are:', model.best_params_)
    # Binary class prediction
    self.pred_class = model.predict(X_test)
    # Probability prediction; column 1 holds the probability of the positive class
    self.pred = [p[1] for p in model.predict_proba(X_test)]
def lightGBM(self):
    '''
    This method trains a machine learning model using the LightGBM algorithm with a fixed set of
    hyper-parameters (found beforehand with Bayesian optimization). It drops highly correlated columns,
    encodes categorical features as integers and stores them with the 'category' dtype.

    Returns:
        Nothing is returned; the results are stored on the instance.
        self.pred_class: Binary class prediction of the target variable (test split).
        self.pred: Probability prediction of the positive class (test split).
        self.y_test: y_test taken from the train/test split of self.data.
    '''

    print('Preprocessing final table and label encoding categorical features...')
    # Drop the columns with correlation > 0.98.
    # Only numeric/boolean columns are correlated: recent pandas versions raise on object columns
    # instead of silently skipping them.
    corr = self.data.select_dtypes(include=['number', 'bool']).corr()
    # Strict upper triangle, so every pair of columns is inspected exactly once.
    # The builtin bool is used because the np.bool alias no longer exists in current NumPy.
    upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if any(upper[column] > 0.98)]
    self.data = self.data.drop(to_drop, axis=1)

    # Integer-encode every object column; missing values become the string 'nan' and get their own code.
    class_le = LabelEncoder()
    for col in self.data.select_dtypes(include=object).columns:
        self.data[col] = class_le.fit_transform(self.data[col].values.astype(str))
        self.data[col] = self.data[col].astype('category')

    print('Applying LightGBM algorithm...')
    y = self.data['TARGET']
    X = self.data.drop('TARGET', axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=self.test_size, random_state=1234)
    self.y_test = y_test

    # Lighgbm parameters was found using Bayesian Optimization
    model_params = {
        'colsample_bytree': 0.45544541538547634,
        'learning_rate': 0.09712737568777673,
        'max_depth': 10,
        'min_child_weight': 44.81416318834993,
        'min_split_gain': 0.47913323843650946,
        'num_leaves': 44,
        'reg_alpha': 8.507126649843658,
        'reg_lambda': 2.2113739093853257,
        'subsample': 0.43342993037373423
    }
    # NOTE(review): StandardScaler hands a plain numeric array to the classifier, so the 'category'
    # dtype set above presumably never reaches LightGBM - confirm whether that is intended.
    model = make_pipeline(StandardScaler(), LGBMClassifier(**model_params))
    # cross validation scores
    scores = cross_val_score(model, X_train, y_train, scoring='roc_auc', cv=self.nfolds, n_jobs=-1, verbose=100)
    print('max cross_val AUC: ', np.max(scores))
    model.fit(X_train, y_train)
    # Binary class prediction
    self.pred_class = model.predict(X_test)
    # Probability prediction; column 1 holds the probability of the positive class
    self.pred = [p[1] for p in model.predict_proba(X_test)]
def Catboost(self):
    '''
    This method trains a machine learning model using the CatBoost algorithm. It drops highly correlated
    columns, encodes categorical features as integers, stores them with the 'category' dtype and runs a
    randomized hyper-parameter search over a StandardScaler -> CatBoostClassifier pipeline.

    Returns:
        Nothing is returned; the results are stored on the instance.
        self.pred_class: Binary class prediction of the target variable (test split).
        self.pred: Probability prediction of the positive class (test split).
        self.y_test: y_test taken from the train/test split of self.data.
    '''

    # The message names label encoding because that is what the loop below performs.
    print('Preprocessing final table and label encoding categorical features...')
    # Drop the columns with correlation > 0.98.
    # Only numeric/boolean columns are correlated: recent pandas versions raise on object columns
    # instead of silently skipping them.
    corr = self.data.select_dtypes(include=['number', 'bool']).corr()
    # Strict upper triangle, so every pair of columns is inspected exactly once.
    # The builtin bool is used because the np.bool alias no longer exists in current NumPy.
    upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if any(upper[column] > 0.98)]
    self.data = self.data.drop(to_drop, axis=1)

    # Integer-encode every object column; missing values become the string 'nan' and get their own code.
    class_le = LabelEncoder()
    for col in self.data.select_dtypes(include=object).columns:
        self.data[col] = class_le.fit_transform(self.data[col].values.astype(str))
        self.data[col] = self.data[col].astype('category')

    print('Applying CatBoost algorithm...')
    y = self.data['TARGET']
    X = self.data.drop('TARGET', axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=self.test_size, random_state=1234)
    self.y_test = y_test

    pipeline = Pipeline(steps=[('sc', StandardScaler()), ('catboost', CatBoostClassifier())])
    # Search space; the "catboost__" prefix routes each entry to the classifier step of the pipeline
    params = {
        "catboost__depth": [5, 6],
        "catboost__iterations": [500, 1000],
        "catboost__learning_rate": [0.001, 0.01, 0.1],
        "catboost__l2_leaf_reg": [5, 100]
    }
    model = RandomizedSearchCV(pipeline, params, n_iter=4, scoring='roc_auc', cv=self.nfolds, n_jobs=-1, verbose=3)
    model.fit(X_train, y_train)

    # Binary class prediction
    self.pred_class = model.predict(X_test)
    # Probability prediction; column 1 holds the probability of the positive class
    self.pred = [p[1] for p in model.predict_proba(X_test)]
def FCNN(self):
    '''
    This method employs a fully connected neural network as the binary classifier.
    It drops highly correlated columns, imputes the missing data, applies one-hot encoding on
    categorical features and standardizes the features before training.

    Returns:
        Nothing is returned; the results are stored on the instance.
        self.pred_class: Binary class prediction of the target variable (probability > 0.5), 1-D array.
        self.pred: Probability prediction of the positive class, 1-D array.
        self.y_test: y_test taken from the train/test split of self.data.
    '''

    print('Preprocessing final table, imputing missing values, and One-hot encoding of categorical features...')
    # Drop the columns with correlation > 0.98.
    # Only numeric/boolean columns are correlated: recent pandas versions raise on object columns
    # instead of silently skipping them.
    corr = self.data.select_dtypes(include=['number', 'bool']).corr()
    # Strict upper triangle, so every pair of columns is inspected exactly once.
    # The builtin bool is used because the np.bool alias no longer exists in current NumPy.
    upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if any(upper[column] > 0.98)]
    self.data = self.data.drop(to_drop, axis=1)

    # Missing categorical features are imputed with 'Not_Applicable'
    # Missing numeric features are imputed with zero
    object_cols = self.data.select_dtypes(include=object).columns
    self.data[object_cols] = self.data[object_cols].fillna('Not_Applicable')
    other_cols = self.data.select_dtypes(exclude=object).columns
    self.data[other_cols] = self.data[other_cols].fillna(0)

    # One-hot encoding of categorical features
    self.data = pd.get_dummies(self.data, drop_first=True)

    print('Applying Fully Connected Neural Network (FCNN) ...')
    y = self.data['TARGET']
    X = self.data.drop('TARGET', axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=self.test_size, random_state=1234)
    self.y_test = y_test
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    # Reuse the statistics learned on the training split; re-fitting on the test split would scale
    # the two splits differently.
    X_test = sc.transform(X_test)

    # Deep FCNN implementation
    # FCNN architecture includes 3 fully connected units having 150, 75, 25 neurons, respectively.
    np.random.seed(5)
    model = Sequential()
    # FC1
    model.add(Dense(input_shape=(X_train.shape[1],), units=150))
    model.add(BatchNormalization())
    model.add(Activation("relu"))
    model.add(Dropout(0.2))
    # FC2
    model.add(Dense(units=75))
    model.add(BatchNormalization())
    model.add(Activation("relu"))
    model.add(Dropout(0.2))
    # FC3
    model.add(Dense(units=25))
    model.add(BatchNormalization())
    model.add(Activation("relu"))
    model.add(Dropout(0.2))
    # Output layer: a single sigmoid unit giving the probability of the positive class
    model.add(Dense(units=1, kernel_initializer='uniform', activation='sigmoid'))
    model.build()
    model.compile(optimizer='Adam', loss='binary_crossentropy', metrics=['AUC'])
    model.summary()

    # Train deep neural network
    # mode is explicit so that a higher validation AUC always counts as an improvement, whatever the
    # installed Keras version infers for the automatic mode.
    early_stops = EarlyStopping(patience=10, monitor='val_auc', mode='max')
    # NOTE(review): the checkpoint written to best_model.h5 is not loaded back in this method, so the
    # predictions below come from the weights at the end of training - confirm that is intended.
    mc = ModelCheckpoint('best_model.h5', monitor='val_loss', verbose=0, save_best_only=True)
    model.fit(X_train, y_train, validation_split=self.test_size, callbacks=[early_stops, mc],
              batch_size=self.batch_size, epochs=self.epoch, verbose=1)

    # Keras predict returns the sigmoid output with shape (n_samples, 1); flatten it once and reuse it.
    proba = model.predict(X_test).ravel()
    # Binary class prediction. argmax over a single output column would always give class 0, so the
    # probability is thresholded at 0.5 instead.
    self.pred_class = (proba > 0.5).astype(int)
    # Probability prediction
    self.pred = proba
def Hclustering(self):
    '''
    This method resamples the training data to have balanced positive to negative labels. It undersamples the
    majority class (negative) using hierarchical clustering, with the number of clusters equal to the number of
    positive samples, then randomly chooses one sample from each cluster as the representative of that cluster.
    An XGBoost pipeline is finally tuned on the balanced training set and evaluated on the untouched test split.

    Returns:
        Nothing is returned; the results are stored on the instance.
        self.pred_class: Binary class prediction of the target variable (test split).
        self.pred: Probability prediction of the positive class (test split).
        self.y_test: y_test taken from the train/test split of self.data.
    '''

    print('Preprocessing final table and one-hot encoding categorical features... \n')
    # Drop the columns with correlation > 0.98.
    # Only numeric/boolean columns are correlated: recent pandas versions raise on object columns
    # instead of silently skipping them.
    corr = self.data.select_dtypes(include=['number', 'bool']).corr()
    # Strict upper triangle, so every pair of columns is inspected exactly once.
    # The builtin bool is used because the np.bool alias no longer exists in current NumPy.
    upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if any(upper[column] > 0.98)]
    self.data = self.data.drop(to_drop, axis=1)
    del corr

    # Impute the missing data which is required for calculating euclidean distance in clustering
    # Missing values in categorical columns are imputed by 'NA', missing numeric values by zero.
    cols = self.data.select_dtypes(include=object).columns
    self.data[cols] = self.data[cols].fillna('NA')
    cols = self.data.select_dtypes(exclude=object).columns
    self.data[cols] = self.data[cols].fillna(0)

    # One-hot encoding categorical features for clustering (euclidean distance) and XGBoost algorithm.
    self.data = pd.get_dummies(self.data, drop_first=True)
    y = self.data['TARGET']
    X = self.data.drop('TARGET', axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=self.test_size, random_state=1234)
    self.y_test = y_test

    print('Undersampling the majority class using hierarchical clustering .... \n')
    # Resample the training split only: building the balanced set from the full table would put
    # test rows into the training data. Labels are attached positionally (same row order as X_train).
    train_part = X_train.assign(TARGET=y_train.to_numpy())
    # defining positive and negative classes
    negative = train_part.loc[train_part['TARGET'] == 0]
    positive = train_part.loc[train_part['TARGET'] == 1]
    # One cluster per positive sample, never more clusters than there are negatives to cluster
    nclusters = min(len(positive), len(negative))
    # Standardize negative class for the clustering
    sc = StandardScaler()
    transform = sc.fit_transform(negative)
    negative = pd.DataFrame(transform, columns=negative.columns)
    # Clustering the majority class with ward linkage, which always works on euclidean distances
    # (so no distance argument is passed; its keyword name differs between scikit-learn versions).
    # In Ward's linkage, two clusters that lead to the minimum increase of the total within-cluster SSE are merged.
    ac = AgglomerativeClustering(n_clusters=nclusters, linkage='ward')
    clustering = ac.fit(negative)
    # add a new feature for each row to show what cluster they belong
    negative['cluster'] = clustering.labels_

    # A function for choosing one sample from each cluster
    def sampling(group):
        return group.sample(1, random_state=1)

    # Grouping the train data based on the cluster and select one sample from each cluster
    negative = negative.groupby('cluster', as_index=False).apply(sampling)
    negative = negative.droplevel(level=1)
    negative = negative.drop('cluster', axis=1)
    # Back to the original feature scale so negatives and positives are comparable again
    negative = pd.DataFrame(sc.inverse_transform(negative), columns=negative.columns)

    # Merging negative and positive class to form balanced train set, then shuffling the rows
    train = pd.concat([negative, positive], axis=0, sort=False)
    train = train.sample(frac=1)
    train = train.reset_index(drop=True)
    # X, y of the balanced training dataset
    y_train = train['TARGET']
    X_train = train.drop('TARGET', axis=1)
    X_train = X_train.to_numpy()

    print('Applying XGBoost algorithm on balanced dataset... \n')
    pipeline = make_pipeline(StandardScaler(), PCA(n_components=self.pca_n), XGBClassifier())
    params = {
        'xgbclassifier__learning_rate': [0.05, 0.1, 0.15, 0.2],
        'xgbclassifier__max_depth': [3, 4, 5, 6, 8, 10],
        'xgbclassifier__min_child_weight': [1, 3, 5, 7],
        'xgbclassifier__gamma': [0, 0.1, 0.2, 0.3, 0.4],
        'xgbclassifier__colsample_bytree': [0.5, 0.7, 1]
    }
    model = RandomizedSearchCV(pipeline, params, n_iter=1, scoring='roc_auc', cv=self.nfolds, n_jobs=-1, verbose=3)
    model.fit(X_train, y_train)
    print('\nCross validation best score(AUC) is:', model.best_score_)
    # Hyperparameters of the model with the best performance
    print('\nModel best hyperparamters are:', model.best_params_)
    # Binary class prediction
    self.pred_class = model.predict(X_test)
    # Probability prediction; column 1 holds the probability of the positive class
    self.pred = [p[1] for p in model.predict_proba(X_test)]
    # Free the large intermediates before returning
    del negative, positive, train, train_part, clustering, transform
    gc.collect()
def Hclustering_smote(self):
    '''
    This method resamples the training data to have balanced positive to negative labels. It undersamples the
    majority class (negative) using hierarchical clustering down to about half the size of the training split
    and then oversamples the minority class with SMOTE. An XGBoost pipeline is finally tuned on the balanced
    training set and evaluated on the untouched test split.

    Returns:
        Nothing is returned; the results are stored on the instance.
        self.pred_class: Binary class prediction of the target variable (test split).
        self.pred: Probability prediction of the positive class (test split).
        self.y_test: y_test taken from the train/test split of self.data.
    '''

    print('Preprocessing final table and one-hot encoding categorical features... \n')
    # Drop the columns with correlation > 0.98.
    # Only numeric/boolean columns are correlated: recent pandas versions raise on object columns
    # instead of silently skipping them.
    corr = self.data.select_dtypes(include=['number', 'bool']).corr()
    # Strict upper triangle, so every pair of columns is inspected exactly once.
    # The builtin bool is used because the np.bool alias no longer exists in current NumPy.
    upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if any(upper[column] > 0.98)]
    self.data = self.data.drop(to_drop, axis=1)
    del corr

    # Impute the missing data which is required for calculating euclidean distance in clustering
    # Missing values in categorical columns are imputed by 'NA', missing numeric values by zero.
    cols = self.data.select_dtypes(include=object).columns
    self.data[cols] = self.data[cols].fillna('NA')
    cols = self.data.select_dtypes(exclude=object).columns
    self.data[cols] = self.data[cols].fillna(0)

    # One-hot encoding categorical features for clustering (euclidean distance) and XGBoost algorithm.
    self.data = pd.get_dummies(self.data, drop_first=True)

    # Agglomerative Clustering is computationally expensive,
    # In this experiment only a fraction of application train file is considered (nrows= 30000).
    y = self.data['TARGET']
    X = self.data.drop('TARGET', axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=self.test_size, random_state=1234)
    self.y_test = y_test

    print('Undersample majority class and oversampling minority class ...\n')

    # Resample the training split only: building the balanced set from the full table would put
    # test rows into the training data. Labels are attached positionally (same row order as X_train).
    train_part = X_train.assign(TARGET=y_train.to_numpy())
    # defining positive and negative classes
    negative = train_part.loc[train_part['TARGET'] == 0]
    positive = train_part.loc[train_part['TARGET'] == 1]

    # Number of clusters: half of the training data, never more than the negatives available to cluster
    nclusters = min(int(np.ceil(len(train_part) / 2)), len(negative))
    # Standardize negative class for the clustering
    sc = StandardScaler()
    transform = sc.fit_transform(negative)
    negative = pd.DataFrame(transform, columns=negative.columns)
    del transform

    # Clustering the majority class with ward linkage, which always works on euclidean distances
    # (so no distance argument is passed; its keyword name differs between scikit-learn versions).
    # In Ward's linkage, two clusters that lead to the minimum increase of the total within-cluster SSE are merged.
    ac = AgglomerativeClustering(n_clusters=nclusters, linkage='ward')
    clustering = ac.fit(negative)
    # add a new feature for each row to show what cluster they belong
    negative['cluster'] = clustering.labels_

    # A function for choosing one sample from each cluster
    def sampling(group):
        return group.sample(1, random_state=1)

    # Grouping the train data based on the cluster and select one sample from each cluster
    negative = negative.groupby('cluster', as_index=False).apply(sampling)
    negative = negative.droplevel(level=1)
    negative = negative.drop('cluster', axis=1)
    # Back to the original feature scale so negatives and positives are comparable again
    negative = pd.DataFrame(sc.inverse_transform(negative), columns=negative.columns)

    # Merging negative and positive class, then shuffling the rows
    train = pd.concat([negative, positive], axis=0, sort=False)
    train = train.sample(frac=1)
    train = train.reset_index(drop=True)

    # X, y of the undersampled training dataset
    y_train = train['TARGET']
    X_train = train.drop('TARGET', axis=1)

    # SMOTE oversampling of minority class
    oversample = SMOTE()
    X_train, y_train = oversample.fit_resample(X_train, y_train)

    print('Applying XGBoost algorithm on the balanced training data... \n')
    pipeline = make_pipeline(StandardScaler(), PCA(n_components=self.pca_n), XGBClassifier())
    params = {
        'xgbclassifier__learning_rate': [0.05, 0.1, 0.15, 0.2],
        'xgbclassifier__max_depth': [3, 4, 5, 6, 8, 10],
        'xgbclassifier__min_child_weight': [1, 3, 5, 7],
        'xgbclassifier__gamma': [0, 0.1, 0.2, 0.3, 0.4],
        'xgbclassifier__colsample_bytree': [0.5, 0.7, 1]
    }
    model = RandomizedSearchCV(pipeline, params, n_iter=1, scoring='roc_auc', cv=self.nfolds, n_jobs=-1, verbose=3)
    model.fit(X_train, y_train)
    print('\nCross validation best score(AUC) is:', model.best_score_)
    # Hyperparameters of the model with the best performance
    print('\nModel best hyperparamters are:', model.best_params_)
    # Binary class prediction
    self.pred_class = model.predict(X_test)
    # Probability prediction; column 1 holds the probability of the positive class
    self.pred = [p[1] for p in model.predict_proba(X_test)]
def train(self, args):
    '''
    This method trains the model using the selected algorithm (default: lightGBM) and then reports
    the classification metrics and plots.
    If a resampling method is used, separate methods are implemented due to different pipeline structure.

    Parameters:
        args: parsed command line options; the flags resample, use_hclstr, xgb, catb and fcnn are read.
    '''

    if args.resample:
        # Resampling strategies: clustering undersampling alone, otherwise combined with SMOTE
        if args.use_hclstr:
            self.Hclustering()
        else:
            self.Hclustering_smote()
    elif args.xgb:
        print('using XGBoost for training ...')
        self.XGBoost()
    elif args.catb:
        print('using Catboost for training ...')
        self.Catboost()
    elif args.fcnn:
        print('Using Fully connected neural network for training ...')
        self.FCNN()
    else:
        print('using LightGBM for training ...')
        self.lightGBM()

    # Evaluate ROC_AUC, Precision, Recall, F1-Score, Cohen-Kappa metrics
    self.calculate_metrics()
    # Plot ROC curve
    self.plot_ROC()
    # Plot Precision/Recall curve
    self.plot_precision_recall()


def calculate_metrics(self):
    '''
    This method prints the classification metrics computed from self.y_test and self.pred: AUC_ROC and,
    at a probability threshold of 0.1, the confusion matrix, precision, recall, F1-Score and
    Cohen's kappa coefficient. Ratios whose denominator is zero are reported as 0.0.
    '''

    # Flatten so that column-shaped (n, 1) probability arrays are handled like 1-D ones
    proba = np.ravel(self.pred)
    # ROC_AUC score
    print('ROC_AUC:', roc_auc_score(self.y_test, proba))
    # Precision/Recall (0.1 Threshold)
    pred_class_2 = (proba > 0.1).astype(int)
    # labels fixes the matrix to 2x2 even when only one class shows up in y_test / predictions
    cm = confusion_matrix(self.y_test, pred_class_2, labels=[0, 1])
    print('\nConfusion_metrix (0.1 Threshold): \n', cm)
    # False Positives (FP), False Negatives (FN), True Positives (TP); row = true class, column = predicted
    fp = cm[0][1]
    fn = cm[1][0]
    tp = cm[1][1]
    # Guard the ratios against empty denominators (no predicted / no actual positives)
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    print('Precision (0.1 Threshold): ', precision)
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    print('Recall (0.1 Threshold): ', recall)
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
    print('F1-score ( 0.1 Threshold):', f1)
    cohen_kappa = cohen_kappa_score(self.y_test, pred_class_2)
    print('\nCohen_kappa (0.1 Threshold): ', cohen_kappa)
def plot_ROC(self):
    '''
    This method draws the ROC curve from self.y_test and the predicted probability of the positive
    class stored in self.pred, together with the 45 degree reference line.
    '''

    # New 9x9 figure with its title
    plt.figure(figsize=(9, 9))
    plt.title('Receiver Operating Characteristic')
    # Curve points; the thresholds returned alongside them are not needed for the plot
    false_pos_rate, true_pos_rate, _ = roc_curve(self.y_test, self.pred)
    plt.plot(false_pos_rate, true_pos_rate)
    # Dashed diagonal reference line
    plt.plot([0, 1], [0, 1], 'k--')
    # Axis labels, then limits slightly wider than [0, 1]
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.xlim([-0.1, 1.1])
    plt.ylim([-0.1, 1.1])
    plt.show()


def plot_precision_recall(self):
    '''
    This method draws the precision-recall curve from self.y_test and the predicted probability of
    the positive class stored in self.pred.
    '''

    # Curve points; the thresholds returned alongside them are not needed for the plot
    precision_values, recall_values, _ = precision_recall_curve(self.y_test, self.pred)
    plt.figure(figsize=(9, 9))
    plt.title('Precision_Recall')
    # Recall on the x axis, precision on the y axis
    plt.plot(recall_values, precision_values)
    # Axis labels and limits
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.show()
matplotlib import pyplot as plt\n", 26 | "%matplotlib inline\n", 27 | "import seaborn as sns\n", 28 | "from sklearn.model_selection import train_test_split\n", 29 | "from sklearn.pipeline import make_pipeline\n", 30 | "from sklearn.preprocessing import StandardScaler\n", 31 | "from sklearn.model_selection import RandomizedSearchCV, GridSearchCV\n", 32 | "from sklearn.decomposition import PCA \n", 33 | "import featuretools as ft\n", 34 | "from xgboost import XGBClassifier\n", 35 | "# Classification metrics\n", 36 | "from sklearn.metrics import confusion_matrix\n", 37 | "from sklearn.metrics import roc_curve, roc_auc_score\n", 38 | "from sklearn.metrics import precision_recall_curve\n", 39 | "from sklearn.metrics import cohen_kappa_score\n", 40 | "# Ignore ConvergenceWarning messages\n", 41 | "import warnings\n", 42 | "from sklearn.exceptions import ConvergenceWarning\n", 43 | "warnings.simplefilter(action='ignore', category=ConvergenceWarning)" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 30, 49 | "metadata": { 50 | "executionInfo": { 51 | "elapsed": 3496, 52 | "status": "ok", 53 | "timestamp": 1601504229625, 54 | "user": { 55 | "displayName": "Ali Ghorbani", 56 | "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhWTYg2QaNPZd4GGNiDkkHX8r9t7BRVHYGz3JwSKA=s64", 57 | "userId": "10869472433171243113" 58 | }, 59 | "user_tz": 240 60 | }, 61 | "id": "mMmpzLtirfYi" 62 | }, 63 | "outputs": [], 64 | "source": [ 65 | "def train_application():\n", 66 | " \n", 67 | " \"\"\"\n", 68 | " This function reads application_train.csv, cleans it and perform manual feature engineering for each\n", 69 | " application (SK_ID_CURR). \n", 70 | " \n", 71 | " Parameters:\n", 72 | " None\n", 73 | "\n", 74 | " Returns:\n", 75 | " train: training dataFrame which includes hand engineered features from \n", 76 | " just application_train table. 
\n", 77 | " \n", 78 | " \"\"\"\n", 79 | " \n", 80 | " print('Processing application_train and application_test tables')\n", 81 | " train = pd.read_csv('application_train.csv')\n", 82 | " # Delete four applications with XNA CODE_GENDER (train set)\n", 83 | " train = train[train['CODE_GENDER'] != 'XNA']\n", 84 | " # Replace DAYS_EMPLOYED = 365243 by nan\n", 85 | " train['DAYS_EMPLOYED'].replace({365243: np.nan}, inplace = True) \n", 86 | " # Feature engineering\n", 87 | " train['Days_employed_age'] = train['DAYS_EMPLOYED'] / train['DAYS_BIRTH']\n", 88 | " train['Credit_income_ratio'] = train['AMT_CREDIT'] / train['AMT_INCOME_TOTAL']\n", 89 | " train['Anuity_income_ratio'] = train['AMT_ANNUITY'] / train['AMT_INCOME_TOTAL'] \n", 90 | " train['Income_per_person'] = train['AMT_INCOME_TOTAL'] / train['CNT_FAM_MEMBERS']\n", 91 | " #length of the payment in months since the annuity is the monthly amount due\n", 92 | " train['Credit_term'] = train['AMT_ANNUITY']/train['AMT_CREDIT'] \n", 93 | " \n", 94 | " return train" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 31, 100 | "metadata": { 101 | "executionInfo": { 102 | "elapsed": 784, 103 | "status": "ok", 104 | "timestamp": 1601507091521, 105 | "user": { 106 | "displayName": "Ali Ghorbani", 107 | "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhWTYg2QaNPZd4GGNiDkkHX8r9t7BRVHYGz3JwSKA=s64", 108 | "userId": "10869472433171243113" 109 | }, 110 | "user_tz": 240 111 | }, 112 | "id": "Ow3vmIJZrwr_" 113 | }, 114 | "outputs": [], 115 | "source": [ 116 | "def Automated_features(train, nrows, max_depth):\n", 117 | " \n", 118 | " ''' \n", 119 | " Automated feature engineering is conducted on all the tables using feature tools.\n", 120 | " \n", 121 | " Parameters:\n", 122 | " train: preprocessed training data \n", 123 | " nrows: number of rows considered in train data for the model due to the computational power limitation.\n", 124 | " max_depth: depth of a deep feature is the number of primitives 
required to make the feature.\n", 125 | "\n", 126 | " Returns:\n", 127 | " train_ft: train dataframe with added new features from automated feature engineering. \n", 128 | " \n", 129 | " '''\n", 130 | " bureau = pd.read_csv('bureau.csv')\n", 131 | " bureaubal = pd.read_csv('bureau_balance.csv')\n", 132 | " prev = pd.read_csv('previous_application.csv')\n", 133 | " ccb = pd.read_csv('credit_card_balance.csv')\n", 134 | " insta = pd.read_csv('installments_payments.csv')\n", 135 | " pc = pd.read_csv('POS_CASH_balance.csv')\n", 136 | "\n", 137 | " # Choosing nrows data from all datasets\n", 138 | " train = train.sample(frac=1)\n", 139 | " train = train[:nrows]\n", 140 | " ids = train['SK_ID_CURR'].values\n", 141 | " bureau = bureau.loc[bureau['SK_ID_CURR'].isin(ids)]\n", 142 | " idsb = bureau['SK_ID_BUREAU'].values\n", 143 | " bureaubal = bureaubal.loc[bureaubal['SK_ID_BUREAU'].isin(idsb)]\n", 144 | " prev = prev.loc[prev['SK_ID_CURR'].isin(ids)]\n", 145 | " ccb = ccb.loc[ccb['SK_ID_CURR'].isin(ids)]\n", 146 | " insta = insta.loc[insta['SK_ID_CURR'].isin(ids)]\n", 147 | " pc = pc.loc[pc['SK_ID_CURR'].isin(ids)]\n", 148 | "\n", 149 | " # creating EntitySet (collection of tables)\n", 150 | " es = ft.EntitySet(id = 'applications')\n", 151 | " # adding Entity (table) to EntitySet\n", 152 | " es = es.entity_from_dataframe(entity_id = 'train' , dataframe = train , index= 'SK_ID_CURR')\n", 153 | " es = es.entity_from_dataframe(entity_id = 'bureau' , dataframe = bureau , index= 'SK_ID_BUREAU')\n", 154 | " es = es.entity_from_dataframe(entity_id = 'bureaubal', dataframe = bureaubal, make_index = True, index = 'bb_id')\n", 155 | " es = es.entity_from_dataframe(entity_id = 'prev' , dataframe = prev , index = 'SK_ID_PREV')\n", 156 | " es = es.entity_from_dataframe(entity_id = 'ccb' , dataframe = ccb , make_index = True, index = 'cc_id')\n", 157 | " es = es.entity_from_dataframe(entity_id = 'insta' , dataframe = insta , make_index = True, index = 'installment.id')\n", 158 | " es = 
es.entity_from_dataframe(entity_id = 'pc' , dataframe = pc , make_index = True, index = 'pos_cash_id')\n", 159 | " # Creating relation between Entities\n", 160 | " # Relationship between application training and bureau\n", 161 | " r_applications_bureau = ft.Relationship(es['train']['SK_ID_CURR'], es['bureau']['SK_ID_CURR'])\n", 162 | " es = es.add_relationship(r_applications_bureau)\n", 163 | " # Relationship between bureau and bureau balance\n", 164 | " r_bureau_bureaubal = ft.Relationship(es['bureau']['SK_ID_BUREAU'], es['bureaubal']['SK_ID_BUREAU'])\n", 165 | " es = es.add_relationship(r_bureau_bureaubal)\n", 166 | " # Relationship between application training and previous applications\n", 167 | " r_app_prev = ft.Relationship(es['train']['SK_ID_CURR'], es['prev']['SK_ID_CURR'])\n", 168 | " es = es.add_relationship(r_app_prev)\n", 169 | " # Relationship between previous applications with credit card balance, pos cash, and installments\n", 170 | " r_prev_cc = ft.Relationship(es['prev']['SK_ID_PREV'], es['ccb']['SK_ID_PREV'])\n", 171 | " es = es.add_relationship(r_prev_cc)\n", 172 | " r_prev_insta = ft.Relationship(es['prev']['SK_ID_PREV'], es['insta']['SK_ID_PREV'])\n", 173 | " es = es.add_relationship(r_prev_insta)\n", 174 | " r_prev_pc2 = ft.Relationship(es['prev']['SK_ID_PREV'], es['pc']['SK_ID_PREV'])\n", 175 | " es = es.add_relationship(r_prev_pc2)\n", 176 | " print('EntitySet with Relationships', es)\n", 177 | " # Deep feature synthesis with depth of 2 by stacking feature primitives (agregations and tranformations)\n", 178 | " # Automated features are concatenated to the original features; Therefore, \n", 179 | " train_ft, feature_names = ft.dfs(entityset = es, target_entity = 'train', max_depth = max_depth)\n", 180 | " train_ft = train_ft.reset_index()\n", 181 | " print('\\nTotal number of features after adding automated features: ', train_ft.shape[1])\n", 182 | " return train_ft" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": 
def preprocessing(train):
    '''
    This function removes highly correlated features, imputes missing values and one-hot encodes
    the categorical features so the table can be fed to the scaler/PCA/XGBoost pipeline.

    Parameters:
    train: pandas.DataFrame, training set including the automated features from feature tools.

    Returns:
    train: pandas.DataFrame which includes the preprocessed training dataset. The input frame
           itself is not modified (a new frame is created by the column drop).
    '''

    print('Preprocessing final table and one-hot encoding categorical features...')

    # Drop the columns with correlation > 0.98.
    # Correlation is computed on numeric/boolean columns only; they are selected explicitly
    # because pandas >= 2.0 no longer skips object columns silently in DataFrame.corr().
    corr = train.select_dtypes(include=['number', 'bool']).corr()
    # Keep the strict upper triangle so every pair is inspected once and only the later column
    # of a correlated pair is dropped. The builtin bool is used because the np.bool alias was
    # removed in NumPy 1.24.
    upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if (upper[column] > 0.98).any()]
    train = train.drop(to_drop, axis=1)

    # Imputing the missing data, PCA can not handle missing data
    # Missing categorical features are imputed with 'Not_Applicable'
    # Missing numeric features are imputed with zero (logical choice for this dataset)
    cols = train.select_dtypes(include=object).columns
    train[cols] = train[cols].fillna('Not_Applicable')
    cols = train.select_dtypes(exclude=object).columns
    train[cols] = train[cols].fillna(0)

    # One-hot encoding categorical features for XGBoost algorithm.
    # drop_first=True removes one level per feature to avoid a redundant indicator column.
    train = pd.get_dummies(train, drop_first=True)

    return train


def XGBoost(train, num_folds, test_size):
    '''
    This function trains a machine learning model using the XGBoost algorithm. A hold-out test set
    is split off, then a randomized hyperparameter search is run over a
    StandardScaler -> PCA(200 components) -> XGBClassifier pipeline, scored by ROC AUC.

    Parameters:
    train: preprocessed training data, must contain the 'TARGET' column
    num_folds: number of folds for cross-validation in the hyperparameter search
    test_size: share of the data held out as test set (passed to train_test_split)

    Returns:
    pred_class: Binary class prediction of the target variable on the test set.
    pred: list with the predicted probability of the positive class on the test set.
    y_test: true target values of the test set.
    '''

    # Train and test set split (fixed seed so the split is reproducible)
    y = train['TARGET']
    X = train.drop('TARGET', axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=1234)

    # First 200 principal components are used for XGBoost.
    # PCA requires standardization of features, hence the scaler in front of it.
    # NOTE: PCA(n_components=200) needs at least 200 features and 200 rows in every CV fold.
    pipeline = make_pipeline(StandardScaler(), PCA(n_components=200), XGBClassifier())

    # Search space; keys are prefixed with the pipeline step name generated by make_pipeline.
    params = {
        'xgbclassifier__learning_rate': [0.05, 0.1, 0.15, 0.2],
        'xgbclassifier__max_depth': [3, 4, 5, 6, 8, 10],
        'xgbclassifier__min_child_weight': [1, 3, 5, 7],
        'xgbclassifier__gamma': [0, 0.1, 0.2, 0.3, 0.4],
        'xgbclassifier__colsample_bytree': [0.5, 0.7, 1]
    }
    print('\nApplying XGBoost classifier... \n')

    # cv uses the caller supplied num_folds (it was previously hard-coded to 5 and the
    # parameter was ignored).
    model = RandomizedSearchCV(pipeline, params, n_iter=4, scoring='roc_auc', cv=num_folds,
                               n_jobs=-1, verbose=3)
    model.fit(X_train, y_train)

    # Report the explained variance of the PCA step that is part of the refitted best pipeline,
    # instead of fitting a second, separate scaler + PCA only for this diagnostic.
    pca = model.best_estimator_.named_steps['pca']
    print('\nRatio of variance explained by 200 principal components: ', sum(pca.explained_variance_ratio_))

    print('\nCross validation best score(AUC) is:', model.best_score_)
    # Hyperparameters of the model with the best performance
    print('\nModel best hyperparamters are:', model.best_params_)
    # Binary class prediction
    pred_class = model.predict(X_test)
    # Probability prediction; column 1 holds the probability of the positive class
    pred = model.predict_proba(X_test)
    pred = [p[1] for p in pred]
    return pred_class, pred, y_test


def calculate_metrics(pred_class, pred, y_test, threshold=0.1):
    '''
    This function prints the classification metrics including ROC_AUC, confusion matrix, precision,
    recall, F1-Score, and cohen-kappa coefficient. All metrics except ROC_AUC are computed on the
    class labels obtained by thresholding the predicted probabilities.

    Parameters:
    pred_class: Binary class prediction of the target variable (not used, kept for compatibility).
    pred: Probability prediction of the target variable.
    y_test: true binary (0/1) target values of the test set.
    threshold: probability above which a sample is labelled positive (default is 0.1).

    Returns:
    None
    '''

    # ROC_AUC score
    print('ROC_AUC:', roc_auc_score(y_test, pred))
    # Precision/Recall at the given threshold
    pred_class_2 = (np.array(pred) > threshold).astype(int)
    # labels=[0, 1] guarantees a 2x2 matrix even if one class is missing from the data
    cm = confusion_matrix(y_test, pred_class_2, labels=[0, 1])
    print('\nConfusion_metrix ({} Threshold): \n'.format(threshold), cm)
    # Row-major order of the 2x2 matrix: TN, FP, FN, TP
    _, fp, fn, tp = cm.ravel()
    # Metrics with an empty denominator are reported as 0.0 instead of nan
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    print('Precision ({} Threshold): '.format(threshold), precision)
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    print('Recall ({} Threshold): '.format(threshold), recall)
    if (precision + recall) > 0:
        f1_score = 2 * precision * recall / (precision + recall)
    else:
        f1_score = 0.0
    print('F1-score ( {} Threshold):'.format(threshold), f1_score)
    cohen_kappa = cohen_kappa_score(y_test, pred_class_2)
    print('\nCohen_kappa ({} Threshold): '.format(threshold), cohen_kappa)


def plot_ROC(y_test, pred):
    '''
    This function plots the ROC curve based on y_test and the predicted probability of the
    positive class.

    Parameters:
    y_test: true target values of the test set.
    pred: Probability prediction of the target variable.

    Returns:
    None
    '''
    # Initialize figure
    plt.figure(figsize=(9, 9))
    plt.title('Receiver Operating Characteristic')
    # Plot ROC curve (the thresholds returned by roc_curve are not needed)
    fpr, tpr, _ = roc_curve(y_test, pred)
    plt.plot(fpr, tpr)
    # Diagonal 45 degree line, the reference of a random classifier
    plt.plot([0, 1], [0, 1], 'k--')
    # Axes limits and labels
    plt.xlim([-0.1, 1.1])
    plt.ylim([-0.1, 1.1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()


def plot_precision_recall(y_test, pred):
    '''
    This function plots the precision_recall curve based on y_test and the predicted probability
    of the positive class.

    Parameters:
    y_test: true target values of the test set.
    pred: Probability prediction of the target variable.

    Returns:
    None
    '''
    # The thresholds returned by precision_recall_curve are not needed for the plot
    precision, recall, _ = precision_recall_curve(y_test, pred)
    plt.figure(figsize=(9, 9))
    plt.title('Precision_Recall')
    # Plot Precision-Recall curve
    plt.plot(recall, precision)
    # Axes limits and labels
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('Precision')
    plt.xlabel('Recall')
    plt.show()
[Rows: 84931, Columns: 24]\n", 448 | " insta [Rows: 1125445, Columns: 9]\n", 449 | " pc [Rows: 86293, Columns: 9]\n", 450 | " Relationships:\n", 451 | " bureau.SK_ID_CURR -> train.SK_ID_CURR\n", 452 | " bureaubal.SK_ID_BUREAU -> bureau.SK_ID_BUREAU\n", 453 | " prev.SK_ID_CURR -> train.SK_ID_CURR\n", 454 | " ccb.SK_ID_PREV -> prev.SK_ID_PREV\n", 455 | " insta.SK_ID_PREV -> prev.SK_ID_PREV\n", 456 | " pc.SK_ID_PREV -> prev.SK_ID_PREV\n", 457 | "\n", 458 | "Total number of features after adding automated features: 2226\n", 459 | "Preprocessing final table and one-hot encoding categorical features...\n", 460 | "\n", 461 | "Ratio of variance explained by 200 principal components: 0.7023963403648208\n", 462 | "\n", 463 | "Applying XGBoost classifier... \n", 464 | "\n", 465 | "Fitting 5 folds for each of 4 candidates, totalling 20 fits\n" 466 | ] 467 | }, 468 | { 469 | "name": "stderr", 470 | "output_type": "stream", 471 | "text": [ 472 | "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.\n", 473 | "[Parallel(n_jobs=-1)]: Done 20 out of 20 | elapsed: 8.2min remaining: 0.0s\n", 474 | "[Parallel(n_jobs=-1)]: Done 20 out of 20 | elapsed: 8.2min finished\n" 475 | ] 476 | }, 477 | { 478 | "name": "stdout", 479 | "output_type": "stream", 480 | "text": [ 481 | "\n", 482 | "Cross validation best score(AUC) is: 0.6735569039652113\n", 483 | "\n", 484 | "Model best hyperparamters are: {'xgbclassifier__min_child_weight': 5, 'xgbclassifier__max_depth': 3, 'xgbclassifier__learning_rate': 0.1, 'xgbclassifier__gamma': 0.4, 'xgbclassifier__colsample_bytree': 0.7}\n", 485 | "ROC_AUC: 0.6674257906376042\n", 486 | "\n", 487 | "Confusion_metrix (0.1 Threshold): \n", 488 | " [[1025 354]\n", 489 | " [ 59 62]]\n", 490 | "Precision (0.1 Threshold): 0.14903846153846154\n", 491 | "Recall (0.1 Threshold): 0.512396694214876\n", 492 | "F1-score ( 0.1 Threshold): 0.23091247672253257\n", 493 | "\n", 494 | "Cohen_kappa (0.1 Threshold): 0.12106215984609003\n" 495 | ] 496 | }, 497 
| { 498 | "data": { 499 | "image/png": "\n", 500 | "text/plain": [ 501 | "
" 502 | ] 503 | }, 504 | "metadata": { 505 | "needs_background": "light" 506 | }, 507 | "output_type": "display_data" 508 | }, 509 | { 510 | "data": { 511 | "image/png": "\n", 512 | "text/plain": [ 513 | "
# ------------------------------- Main ---------------------------
# Runs the whole workflow in order: load the application table, add automated
# features with feature tools, preprocess, train XGBoost on a hold-out split,
# then print and plot the classification metrics.

# --- Run configuration ---
# max_depth: depth of a deep feature is the number of primitives required to make the feature.
max_depth = 2
# nrows: number of rows considered in train data for the model due to the
# computational power limitation.
nrows = 30000
# Cross-validation folds and hold-out share handed to XGBoost().
nfolds = 5
test_size = 0.05

# --- Feature engineering ---
# Load the application table, then extract new features using feature tools.
train = train_application()
train = Automated_features(train, nrows, max_depth)

# Preprocessing, including one-hot encoding of categorical variables for XGBoost
train = preprocessing(train)

# --- Training and evaluation ---
pred_class, pred, y_test = XGBoost(train, nfolds, test_size)
# Evaluate ROC_AUC, Precision, Recall, F1-Score, Cohen-Kappa metrics
calculate_metrics(pred_class, pred, y_test)
# Plot ROC curve
plot_ROC(y_test, pred)
# Plot Precision/Recall curve
plot_precision_recall(y_test, pred)
1594841666146 572 | }, 573 | { 574 | "file_id": "1W2dSR-Ua7USBX0gTUcVxEtEvVlEbZCsq", 575 | "timestamp": 1594734735941 576 | } 577 | ] 578 | }, 579 | "kernelspec": { 580 | "display_name": "Python 3", 581 | "language": "python", 582 | "name": "python3" 583 | }, 584 | "language_info": { 585 | "codemirror_mode": { 586 | "name": "ipython", 587 | "version": 3 588 | }, 589 | "file_extension": ".py", 590 | "mimetype": "text/x-python", 591 | "name": "python", 592 | "nbconvert_exporter": "python", 593 | "pygments_lexer": "ipython3", 594 | "version": "3.7.7" 595 | } 596 | }, 597 | "nbformat": 4, 598 | "nbformat_minor": 1 599 | } 600 | --------------------------------------------------------------------------------