├── .gitignore
├── .DS_Store
├── models
│   ├── .DS_Store
│   ├── reference
│   │   └── lightgbm_final_model.pkl
│   ├── dsmlbc2
│   │   ├── ATILLA_MUHAMMET.py
│   │   └── merve_betul.ipynb
│   └── dsmlbc1
│       └── homeCreditRiskFinal.ipynb
├── outputs
│   ├── features
│   │   ├── fold_auc_best_df.pkl
│   │   ├── lgbm_importances.png
│   │   ├── feature_importance_df.pkl
│   │   ├── features.py
│   │   └── features.ipynb
│   └── hyperparameters
│       ├── lightgbm_model.pkl
│       └── hyperparameters.pkl
├── scripts
│   ├── __pycache__
│   │   ├── train.cpython-37.pyc
│   │   ├── pre_processing.cpython-37.pyc
│   │   └── helper_functions.cpython-37.pyc
│   ├── predict.py
│   ├── model_tuning.py
│   ├── helper_functions.py
│   ├── feature_selection.py
│   ├── train.py
│   └── pre_processing.py
├── .idea
│   ├── vcs.xml
│   ├── other.xml
│   ├── .gitignore
│   ├── misc.xml
│   ├── inspectionProfiles
│   │   ├── profiles_settings.xml
│   │   └── Project_Default.xml
│   ├── modules.xml
│   ├── home_credit.iml
│   └── datalore.xml
├── README.md
├── Makefile
├── main.py
└── requirements.txt

/.gitignore: -------------------------------------------------------------------------------- 1 | data 2 | --------------------------------------------------------------------------------
/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mvahit/home_credit/HEAD/.DS_Store --------------------------------------------------------------------------------
/models/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mvahit/home_credit/HEAD/models/.DS_Store --------------------------------------------------------------------------------
/outputs/features/fold_auc_best_df.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mvahit/home_credit/HEAD/outputs/features/fold_auc_best_df.pkl --------------------------------------------------------------------------------
/outputs/features/lgbm_importances.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mvahit/home_credit/HEAD/outputs/features/lgbm_importances.png --------------------------------------------------------------------------------
/models/reference/lightgbm_final_model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mvahit/home_credit/HEAD/models/reference/lightgbm_final_model.pkl --------------------------------------------------------------------------------
/outputs/features/feature_importance_df.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mvahit/home_credit/HEAD/outputs/features/feature_importance_df.pkl --------------------------------------------------------------------------------
/outputs/hyperparameters/lightgbm_model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mvahit/home_credit/HEAD/outputs/hyperparameters/lightgbm_model.pkl --------------------------------------------------------------------------------
/scripts/__pycache__/train.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mvahit/home_credit/HEAD/scripts/__pycache__/train.cpython-37.pyc --------------------------------------------------------------------------------
/outputs/hyperparameters/hyperparameters.pkl:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/mvahit/home_credit/HEAD/outputs/hyperparameters/hyperparameters.pkl --------------------------------------------------------------------------------
/scripts/__pycache__/pre_processing.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mvahit/home_credit/HEAD/scripts/__pycache__/pre_processing.cpython-37.pyc --------------------------------------------------------------------------------
/scripts/__pycache__/helper_functions.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mvahit/home_credit/HEAD/scripts/__pycache__/helper_functions.cpython-37.pyc --------------------------------------------------------------------------------
/.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | --------------------------------------------------------------------------------
/.idea/other.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | --------------------------------------------------------------------------------
/.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /shelf/ 3 | /workspace.xml 4 | # Datasource local storage ignored files 5 | /dataSources/ 6 | /dataSources.local.xml 7 | # Editor-based HTTP Client requests 8 | /httpRequests/ 9 | --------------------------------------------------------------------------------
/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | --------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | --------------------------------------------------------------------------------
/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | --------------------------------------------------------------------------------
/.idea/home_credit.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | --------------------------------------------------------------------------------
/README.md: -------------------------------------------------------------------------------- 1 | # production_ready_home_credit_default_risk_model 2 | 3 | 4 | # Kaggle setup: 5 | 6 | 1. pip install kaggle 7 | 2. Go to https://www.kaggle.com//account and click "Create API". 8 | 3. A json file will be downloaded. Create a folder named ".kaggle" under your home directory and place the file inside it. 9 | 4. Run "chmod 600 /Users/mvahit/.kaggle/kaggle.json" to block access by other users. 10 | 5. go! 11 | 12 | 13 | # Ignoring a folder 14 | 15 | 1. Open the desktop app 16 | 2. Open the relevant project 17 | 3. Go to the repository section in the top menu 18 | 4. Repository settings 19 | 5. Ignored files
20 | 21 | 22 | --------------------------------------------------------------------------------
/outputs/features/features.py: -------------------------------------------------------------------------------- 1 | # TODO: pull the feature names out cleanly and dump them to a txt file 2 | 3 | 4 | import pandas as pd 5 | pd.set_option('display.max_columns', None) 6 | df = pd.read_pickle("/Users/mvahit/Documents/GitHub/home_credit/outputs/features/feature_importance_df.pkl") 7 | df.head() 8 | df = df.groupby("feature")["importance"].agg(["mean"]).sort_values(by="mean", ascending=False) 9 | df.head() 10 | 11 | df[df["mean"] > 0] 12 | df.shape 13 | 14 | df2 = pd.read_pickle("/Users/mvahit/Documents/GitHub/home_credit/outputs/features/fold_auc_best_df.pkl") 15 | 16 | 17 | # FINAL DF 18 | 19 | import pandas as pd 20 | pd.set_option('display.max_columns', None) 21 | df = pd.read_pickle("/Users/mvahit/Documents/GitHub/home_credit/data/final_train_df.pkl") 22 | df.head() 23 | df.shape 24 | 25 | [col for col in df.columns if col.startswith("APP")] 26 | a = df[[col for col in df.columns if col.startswith("APP")]].head() --------------------------------------------------------------------------------
/Makefile: -------------------------------------------------------------------------------- 1 | # GITHUB 2 | 3 | commit: 4 | git commit -am "commit from make file" 5 | 6 | push: 7 | git push origin master 8 | 9 | pull: 10 | git pull origin master 11 | 12 | fetch: 13 | git fetch origin master 14 | 15 | reset: 16 | rm -f .git/index 17 | git reset 18 | 19 | req: 20 | pip freeze > requirements.txt 21 | 22 | compush: req commit push 23 | 24 | 25 | 26 | # CONSOLE RUN 27 | run_no_debug: 28 | python main.py --no-debug 29 | 30 | run: 31 | python main.py 32 | 33 | 34 | # MODEL TUNING 35 | 36 | tuning: 37 | python scripts/model_tuning.py 38 | 39 | 40 | # predict the train-set values with predict.py and report the AUC score
41 | predict: 42 | python scripts/predict.py 43 | 44 | # predict the test-set values with predict.py
45 | predict_test: 46 | python scripts/predict.py --test 47 | 48 | # submit the predictions produced by predict.py to Kaggle
49 | kaggle_submit_predict: 50 | kaggle competitions submit -c home-credit-default-risk -f outputs/predictions/sub_from_prediction_py.csv -m "Message" 51 | 52 | muhat: 53 | python models/dsmlbc2/muhat.py --------------------------------------------------------------------------------
/.idea/inspectionProfiles/Project_Default.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 26 | --------------------------------------------------------------------------------
/.idea/datalore.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 38 | 39 | --------------------------------------------------------------------------------
/scripts/predict.py: -------------------------------------------------------------------------------- 1 | """Score the train or the test set's independent-variable values with the saved final model.""" 2 | 3 | import os 4 | import pickle 5 | import pandas as pd 6 | from sklearn.metrics import roc_auc_score 7 | import argparse 8 | 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--train', dest='prediction_type', action='store_true') 11 | parser.add_argument('--test', dest='prediction_type', action='store_false') 12 | parser.set_defaults(prediction_type=True) 13 | args = parser.parse_args() 14 | 15 | final_train =
pd.read_pickle("data/final_train_df.pkl") 16 | final_test = pd.read_pickle("data/final_test_df.pkl") 17 | 18 | feats = [f for f in final_test.columns if f not in ['TARGET', 'SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_PREV', 'index', 19 | "APP_index", "BURO_index", "PREV_index", "INSTAL_index", 20 | "CC_index", "POS_index"]] 21 | 22 | if args.prediction_type: 23 | y_train = final_train["TARGET"] 24 | x_train = final_train[feats] 25 | 26 | cur_dir = os.getcwd() 27 | os.chdir('models/reference/') 28 | model = pickle.load(open('lightgbm_final_model.pkl', 'rb')) 29 | os.chdir(cur_dir) 30 | 31 | y_pred = model.predict_proba(x_train)[:, 1] 32 | print("TRAIN AUC SCORE:", roc_auc_score(y_train, y_pred)) 33 | else: 34 | x_test = final_test[feats] 35 | cur_dir = os.getcwd() 36 | os.chdir('models/reference/') 37 | model = pickle.load(open('lightgbm_final_model.pkl', 'rb')) 38 | os.chdir(cur_dir) 39 | y_pred = model.predict_proba(x_test)[:, 1] 40 | ids = final_test['SK_ID_CURR'] 41 | submission = pd.DataFrame({'SK_ID_CURR': ids, 'TARGET': y_pred}) 42 | os.chdir('outputs/predictions/') 43 | submission.to_csv("sub_from_prediction_py.csv", index=False) 44 | print("Submission file has been created in:", "outputs/predictions/") 45 | 46 | # to run: 47 | # python scripts/predict.py --train 48 | --------------------------------------------------------------------------------
/scripts/model_tuning.py: -------------------------------------------------------------------------------- 1 | """When the model tuning script runs, it will produce two outputs in the hyperparameters folder: 2 | 3 | hyperparameters.pkl 4 | lightgbm_model.pkl 5 | 6 | """ 7 | 8 | # TODO: feed the selected feature names into the model and tune over those features 9 | 10 | import os 11 | import pickle 12 | from lightgbm import LGBMClassifier 13 | import pandas as pd 14 | from sklearn.model_selection import GridSearchCV 15 | 16 | lgbm = LGBMClassifier() 17 | 18 | lgbm_params = {"learning_rate": [0.01, 0.1], 19 | "n_estimators": [200, 100]} 20 | 21 | df = pd.read_pickle("data/final_train_df.pkl") 22 | 23 | 24 | y_train = df["TARGET"] 25 | 26 | X_train = df.drop("TARGET", axis=1) 27 | 28 | lgbm_cv_model = GridSearchCV(lgbm, 29 | lgbm_params, 30 | cv=5, 31 | n_jobs=-1, 32 | verbose=2).fit(X_train, y_train) 33 | 34 | dir(lgbm_cv_model) 35 | params = lgbm_cv_model.best_params_ 36 | 37 | # saving hyperparameters and model 38 | cur_dir = os.getcwd() 39 | os.chdir('outputs/hyperparameters/') 40 | pickle.dump(params, open("hyperparameters.pkl", 'wb')) # hyperparameters 41 | pickle.dump(lgbm_cv_model, open("lightgbm_model.pkl", 'wb')) # model 42 | os.chdir(cur_dir) 43 | 44 | print("Best hyperparameters", params) 45 | 46 | 47 | # loading and prediction with model 48 | 49 | # del lgbm_cv_model 50 | cur_dir = os.getcwd() 51 | os.chdir('/Users/mvahit/Documents/GitHub/home_credit/outputs/hyperparameters/') 52 | model = pickle.load(open('lightgbm_model.pkl', 'rb')) 53 | os.chdir(cur_dir) 54 | model.predict(X_train.head()) 55 | 56 | # loading hyperparameters 57 | del model 58 | del params 59 | cur_dir = os.getcwd() 60 | os.chdir('/Users/mvahit/Documents/GitHub/home_credit/outputs/hyperparameters/') 61 | params = pickle.load(open('hyperparameters.pkl', 'rb')) 62 | final_lgbm = LGBMClassifier(**params).fit(X_train, y_train) 63 | final_lgbm.get_params() 64 | final_lgbm.predict(X_train.head()) 65 | 66 | --------------------------------------------------------------------------------
/scripts/helper_functions.py:
-------------------------------------------------------------------------------- 1 | import pandas as pd 2 | # One-hot encoding for categorical columns with get_dummies 3 | def one_hot_encoder(df, nan_as_category=True): 4 | import pandas as pd 5 | original_columns = list(df.columns) 6 | categorical_columns = [col for col in df.columns if df[col].dtype == 'object'] 7 | df = pd.get_dummies(df, columns=categorical_columns, dummy_na=nan_as_category) 8 | new_columns = [c for c in df.columns if c not in original_columns] 9 | return df, new_columns 10 | 11 | 12 | # command line access for debuging 13 | def get_namespace(): 14 | import argparse 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument('--debug', dest='debug', action='store_true') 17 | parser.add_argument('--no-debug', dest='debug', action='store_false') 18 | parser.set_defaults(debug=True) 19 | return parser.parse_args() 20 | 21 | 22 | # i love data science 23 | def i_love_ds(): 24 | print('\n'.join([''.join([(' I_Love_Data_Science_'[(x - y) % len('I_Love_Data_Science_')] 25 | if ((x * 0.05) ** 2 + (y * 0.1) ** 2 - 1) ** 3 - (x * 0.05) ** 2 * ( 26 | y * 0.1) ** 3 <= 0 else ' ') 27 | for x in range(-30, 30)]) for y in range(15, -15, -1)])) 28 | 29 | 30 | # Display/plot feature importance 31 | def display_importances(feature_importance_df_): 32 | import seaborn as sns 33 | import matplotlib.pyplot as plt 34 | cols = (feature_importance_df_[["feature", "importance"]] 35 | .groupby("feature") 36 | .mean() 37 | .sort_values(by="importance", ascending=False)[:100].index) 38 | best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)] 39 | plt.figure(figsize=(10, 20)) 40 | sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False)) 41 | plt.title('LightGBM Features (avg over folds)') 42 | plt.tight_layout() 43 | plt.savefig('outputs/features/lgbm_importances.png') 44 | 45 | 46 | # missing values 47 | # 48 | # def missing_values(df): 49 | # 50 | # cols_with_na = [col for col in df.columns if df[col].isnull().sum() > 0] 51 | # for col in cols_with_na: 52 | # print(col, np.round(df[cols_with_na].isnull().mean(), 3), " % missing values") 53 | 54 | 55 | 56 | # # saving models 57 | # def saving_models(): 58 | # import os 59 | # cur_dir = os.getcwd() 60 | # os.chdir('/models/reference/') 61 | # model_name = "lightgbm_fold_" + str(n_fold + 1) + "." 
+ "pkl" 62 | # pickle.dump(model, open(model_name, 'wb')) # model 63 | # os.chdir(cur_dir) 64 | 65 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | # HOME CREDIT DEFAULT RISK RUNNER FUNCTION 2 | 3 | # Linux 4 | # Context Manager 5 | # Decorator 6 | # requirements.txt 7 | # virtual env 8 | # Makefile 9 | # git github 10 | # CLIs 11 | 12 | 13 | import gc 14 | import time 15 | from contextlib import contextmanager 16 | import warnings 17 | 18 | 19 | from scripts.helper_functions import get_namespace, i_love_ds 20 | 21 | from scripts.pre_processing import application_train_test, bureau_and_balance, previous_applications, pos_cash, \ 22 | installments_payments, credit_card_balance 23 | 24 | from scripts.train import kfold_lightgbm 25 | 26 | warnings.simplefilter(action='ignore', category=FutureWarning) 27 | 28 | 29 | @contextmanager 30 | def timer(title): 31 | t0 = time.time() 32 | yield 33 | print("{} - done in {:.0f}s".format(title, time.time() - t0)) 34 | 35 | 36 | def main(debug=False): 37 | num_rows = 10000 if debug else None 38 | 39 | with timer("Pre-Processing"): 40 | 41 | i_love_ds() 42 | 43 | # application_train_test 44 | df = application_train_test(num_rows) 45 | # bureau & bureau_balance 46 | bureau = bureau_and_balance(num_rows) 47 | df = df.join(bureau, how='left', on='SK_ID_CURR') 48 | del bureau 49 | # previous_applications 50 | prev = previous_applications(num_rows) 51 | df = df.join(prev, how='left', on='SK_ID_CURR') 52 | del prev 53 | # posh_cash 54 | pos = pos_cash(num_rows) 55 | df = df.join(pos, how='left', on='SK_ID_CURR') 56 | del pos 57 | # installments_payments 58 | ins = installments_payments(num_rows) 59 | df = df.join(ins, how='left', on='SK_ID_CURR') 60 | del ins 61 | # credit_card_balance 62 | cc = credit_card_balance(num_rows) 63 | df = df.join(cc, how='left', on='SK_ID_CURR') 64 | del cc 65 | 66 | # saving final dataframes 67 | train_df = df[df['TARGET'].notnull()] 68 | test_df = df[df['TARGET'].isnull()] 69 | train_df.to_pickle("data/final_train_df.pkl") 70 | test_df.to_pickle("data/final_test_df.pkl") 71 | 72 | del train_df, test_df 73 | gc.collect() 74 | 75 | with timer("Run LightGBM"): 76 | feat_importance = kfold_lightgbm(df, debug=debug) 77 | 78 | 79 | if __name__ == "__main__": 80 | namespace = get_namespace() 81 | with timer("Full model run"): 82 | main(debug=namespace.debug) 83 | 84 | # kaggle model run: 7879s 85 | # server: 8290s 86 | # mac: 5073s 87 | # google 8: 3189s 88 | # workstation: 1987s 89 | # submission public score: 0.79186 90 | 91 | # 0.79557 mehmet_okan_kasim 92 | 93 | 94 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | appnope==0.1.0 2 | attrs @ file:///tmp/build/80754af9/attrs_1598374659300/work 3 | backcall==0.2.0 4 | bleach==3.1.5 5 | certifi==2020.6.20 6 | chardet==3.0.4 7 | colorama==0.4.3 8 | cycler==0.10.0 9 | decorator==4.4.2 10 | defusedxml==0.6.0 11 | entrypoints==0.3 12 | feature-engine==0.6.0 13 | idna==2.10 14 | importlib-metadata @ file:///opt/concourse/worker/volumes/live/84197498-cbc0-4436-7ce0-03c4490b7a28/volume/importlib-metadata_1593446431408/work 15 | iniconfig==1.0.1 16 | ipykernel @ file:///opt/concourse/worker/volumes/live/73e8766c-12c3-4f76-62a6-3dea9a7da5b7/volume/ipykernel_1596206701501/work/dist/ipykernel-5.3.4-py3-none-any.whl 17 | ipython @ 
file:///opt/concourse/worker/volumes/live/bb221eaa-cc1a-4ab2-40f7-74a2020a44b1/volume/ipython_1599056234390/work 18 | ipython-genutils==0.2.0 19 | ipywidgets==7.5.1 20 | jedi @ file:///opt/concourse/worker/volumes/live/152cd167-7b79-4fbd-5c97-d7b338805c2b/volume/jedi_1598371617305/work 21 | Jinja2==2.11.2 22 | joblib @ file:///tmp/build/80754af9/joblib_1594236160679/work 23 | jsonschema==3.2.0 24 | jupyter==1.0.0 25 | jupyter-client @ file:///tmp/build/80754af9/jupyter_client_1594826976318/work 26 | jupyter-console @ file:///tmp/build/80754af9/jupyter_console_1598884538475/work 27 | jupyter-core==4.6.3 28 | kaggle==1.5.8 29 | keyring==21.4.0 30 | kiwisolver==1.2.0 31 | lightgbm==2.3.0 32 | MarkupSafe==1.1.1 33 | matplotlib==3.3.1 34 | missingno==0.4.2 35 | mistune==0.8.4 36 | mkl-fft==1.1.0 37 | mkl-random==1.1.1 38 | mkl-service==2.3.0 39 | mlxtend==0.17.3 40 | more-itertools==8.5.0 41 | nbconvert==5.6.1 42 | nbformat==5.0.7 43 | notebook==6.0.3 44 | numpy==1.19.2 45 | packaging==20.4 46 | pandas==1.1.2 47 | pandocfilters==1.4.2 48 | parso==0.7.0 49 | patsy==0.5.1 50 | pexpect==4.8.0 51 | pickleshare==0.7.5 52 | Pillow==7.2.0 53 | pkginfo==1.5.0.1 54 | pluggy==0.13.1 55 | prometheus-client==0.8.0 56 | prompt-toolkit @ file:///tmp/build/80754af9/prompt-toolkit_1598885458782/work 57 | ptyprocess==0.6.0 58 | py==1.9.0 59 | Pygments==2.6.1 60 | PyMySQL==0.10.1 61 | pyparsing==2.4.7 62 | pyrsistent==0.16.0 63 | pytest==6.0.2 64 | python-dateutil==2.8.1 65 | python-slugify==4.0.1 66 | pytz==2020.1 67 | pyzmq==19.0.1 68 | qtconsole @ file:///tmp/build/80754af9/qtconsole_1598374667791/work 69 | QtPy==1.9.0 70 | readme-renderer==26.0 71 | requests==2.24.0 72 | requests-toolbelt==0.9.1 73 | rfc3986==1.4.0 74 | scikit-learn @ file:///opt/concourse/worker/volumes/live/2dacdc11-21e7-44f5-57b4-6b8eb6ceb626/volume/scikit-learn_1598376924598/work 75 | scipy @ file:///opt/concourse/worker/volumes/live/9698578f-91da-4d5f-6fce-b26b1f42eb5a/volume/scipy_1597686637948/work 76 | seaborn==0.11.0 77 | Send2Trash==1.5.0 78 | six==1.15.0 79 | slugify==0.0.1 80 | statsmodels==0.12.0 81 | terminado==0.8.3 82 | testpath==0.4.4 83 | text-unidecode==1.3 84 | threadpoolctl @ file:///tmp/tmp9twdgx9k/threadpoolctl-2.1.0-py3-none-any.whl 85 | toml==0.10.1 86 | tornado==6.0.4 87 | tqdm==4.49.0 88 | traitlets==4.3.3 89 | twine==3.2.0 90 | urllib3==1.24.3 91 | wcwidth @ file:///tmp/build/80754af9/wcwidth_1593447189090/work 92 | webencodings==0.5.1 93 | widgetsnbextension==3.5.1 94 | xlrd==1.2.0 95 | zipp==3.1.0 96 | -------------------------------------------------------------------------------- /outputs/features/features.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true, 8 | "pycharm": { 9 | "name": "#%% feature importance\n" 10 | } 11 | }, 12 | "outputs": [], 13 | "source": [ 14 | "import pandas as pd\n", 15 | "pd.set_option('display.max_columns', None)\n", 16 | "df = pd.read_pickle(\"/Users/mvahit/Documents/GitHub/home_credit/outputs/features/feature_importance_df.pkl\")\n", 17 | "df = df.groupby(\"feature\")[\"importance\"].agg({\"mean\"}).sort_values(by=\"mean\", ascending=False)" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 10, 23 | "outputs": [], 24 | "source": [ 25 | "df = pd.read_pickle(\"/Users/mvahit/Documents/GitHub/home_credit/outputs/features/fold_auc_best_df.pkl\")" 26 | ], 27 | "metadata": { 28 | "collapsed": false, 29 | 
"pycharm": { 30 | "name": "#%% best iteration\n" 31 | } 32 | } 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 11, 37 | "outputs": [ 38 | { 39 | "data": { 40 | "text/plain": " FOLD AUC BEST_ITER\n0 1.0 0.764298 393.0\n1 2.0 0.818620 115.0\n2 3.0 0.754580 283.0\n3 4.0 0.780804 283.0\n4 5.0 0.797317 197.0", 41 | "text/html": "
" 42 | }, 43 | "execution_count": 11, 44 | "metadata": {}, 45 | "output_type": "execute_result" 46 | } 47 | ], 48 | "source": [ 49 | "df.head()\n" 50 | ], 51 | "metadata": { 52 | "collapsed": false, 53 | "pycharm": { 54 | "name": "#%%\n" 55 | } 56 | } 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "outputs": [], 62 | "source": [], 63 | "metadata": { 64 | "collapsed": false, 65 | "pycharm": { 66 | "name": "#%%\n" 67 | } 68 | } 69 | } 70 | ], 71 | "metadata": { 72 | "kernelspec": { 73 | "display_name": "Python 3", 74 | "language": "python", 75 | "name": "python3" 76 | }, 77 | "language_info": { 78 | "codemirror_mode": { 79 | "name": "ipython", 80 | "version": 2 81 | }, 82 | "file_extension": ".py", 83 | "mimetype": "text/x-python", 84 | "name": "python", 85 | "nbconvert_exporter": "python", 86 | "pygments_lexer": "ipython2", 87 | "version": "2.7.6" 88 | } 89 | }, 90 | "nbformat": 4, 91 | "nbformat_minor": 0 92 | } -------------------------------------------------------------------------------- /scripts/feature_selection.py: -------------------------------------------------------------------------------- 1 | # EKSİK DEĞER, AYKIRI DEGER, FEATURE SCALING: 2 | # DOĞRUSAL MODELLERDE, SVM, YSA, KNN YA DA UZAKLIK TEMELLIK YONTEMLERDE ONEMLIDIR. 3 | 4 | # AĞAÇ YÖNTEMLERİNDE ÖNEMİ DÜZEYLERİ ÇOK AZDIR. 5 | 6 | # Filter Methods (Statistical methods: korelasyon, ki-kare) 7 | # Wrapper Methods (backward selection, forward selection, stepwise) 8 | # Embeded (Tree Based Methods, Ridge, Lasso) 9 | 10 | # Tree Based Methods 11 | # korelasyon, ki-kare 12 | 13 | # TODO TREE BASED SELECTION 14 | 15 | # TODO: tum değişkenleri, sayısal değişkenleri, kategorik değişkenleri (iki sınıflı ya da çok sınıflı), 16 | # yeni türetilen değişkenlerin isimlerini ayrı listelerde tut. 17 | 18 | all_cols = [] # target burada olmamalı 19 | num_cols = [col for col in df.columns if df[col].dtypes != 'O'] 20 | cat_cols = [] 21 | new_cols = [] 22 | target = [] 23 | 24 | # TODO: random forests, lightgbm, xgboost ve catboost modelleri geliştir. 25 | # Bu modellere orta şekerli hiperparametre optimizasyonu yap. Final modelleri kur. 26 | # Bu modellerin her birisine feature importance sor. Gelen feature importance'ların hepsini bir df'te topla. 27 | # Bu df'in sütunları aşağıdaki şekilde olsun: 28 | 29 | # model_name feature_name feature_importance 30 | 31 | # TODO: oluşacak df'i analiz et. Grupby ile importance'in ortalamasını alıp, değişken önemlerini küçükten büyüğe sırala. 32 | # En önemli değişkenleri bul. Sıfırdan küçük olan importance'a sahip değişkenleri sil. 33 | # Nihayi olarak karar verdiğin değişkenlerin adını aşağıdaki şekilde sakla: 34 | 35 | features_based_trees = [] 36 | 37 | # TODO: Önemli not. Yukarıdaki işlemler neticesinde catboost'un sonuçlarına özellikle odaklanıp 38 | # kategorik değişkenlerin incelenmesi gerekmektedir. 39 | # Çalışmanın başında tutulmuş olan cat_cols listesini kullanarak 40 | # sadece categorik değişkenler için hangi ağacın nasıl bir önem düzeyi verdiğini inceleyiniz 41 | # ve diğer algoritmalarca önemsiz catboost tarafından önemli olan değerlendirilen değişkenleri bulunuz 42 | # ve aşağıdaki şekilde kaydediniz: 43 | 44 | features_catboost_cat = [] 45 | 46 | # TODO: features_based_trees listesinde yer ALMAYIP catboost_cat_imp listesinde YER ALAN değişkenleri bulunuz 47 | # ve bu değişkenleri features_based_trees listesine ekleyiniz. 
48 | 49 | 50 | # TODO STATISTICAL SELECTION 51 | 52 | # TODO: look at the pairwise correlations between the independent variables and, among variables that are 53 | # correlated with each other above 75 percent, pick one variable at random; 54 | # record the variable names as below, 55 | # and record the names of the eliminated variables as below too: 56 | 57 | features_based_correlation = [] 58 | features_dropped_based_correlation = [] 59 | 60 | 61 | # TODO: focus on the features that are in the features_based_trees list and at the same time in the 62 | # features_dropped_based_correlation list; inspect them and either delete the variables you see fit from 63 | # features_based_trees or move them from the drop list to the tree list. 64 | 65 | # TODO: apply a chi-squared test between the categorical variables in the data set and the dependent variable, 66 | # and store the variables that the test shows to have a dependency with the target like this: 67 | 68 | cat_cols_chi = [] 69 | 70 | # TODO: compare the variables coming from above with the features_based_trees list and analyze the situation. 71 | # Consider adding the variables that are in cat_cols_chi but missing from features_based_trees, 72 | # or consider removing the variables that are in features_based_trees but not in cat_cols_chi. 73 | # What to make of "consider" is left to your own judgment. 74 | 75 | 76 | # TODO: as the end result, save the selected features under the following name: 77 | 78 | 79 | features_selected = [] 80 | 81 | # TODO: run hyperparameter optimization for lightgbm by tuning the model with the selected features. 82 | # TODO: build the final model with the new hyperparameters. --------------------------------------------------------------------------------
/scripts/train.py: -------------------------------------------------------------------------------- 1 | # LightGBM GBDT with KFold 2 | # Parameters from Tilii kernel: https://www.kaggle.com/tilii7/olivier-lightgbm-parameters-by-bayesian-opt/code 3 | import gc 4 | import os 5 | import pickle 6 | import pandas as pd 7 | 8 | from lightgbm import LGBMClassifier 9 | import numpy as np 10 | from sklearn.metrics import roc_auc_score 11 | from sklearn.model_selection import KFold 12 | 13 | from scripts.helper_functions import display_importances 14 | 15 | 16 | def kfold_lightgbm(df, debug=False): 17 | # Divide in training/validation and test data 18 | 19 | train_df = df[df['TARGET'].notnull()] 20 | test_df = df[df['TARGET'].isnull()] 21 | print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape)) 22 | 23 | 24 | del df 25 | gc.collect() 26 | 27 | folds = KFold(n_splits=10, shuffle=True, random_state=1001) 28 | 29 | # Create arrays and dataframes to store results 30 | 31 | oof_preds = np.zeros(train_df.shape[0]) # predicted valid_y 32 | sub_preds = np.zeros(test_df.shape[0]) # submission preds 33 | feature_importance_df = pd.DataFrame() # feature importance 34 | 35 | fold_auc_best_df = pd.DataFrame(columns=["FOLD", "AUC", "BEST_ITER"]) # holding best iter to save model 36 | 37 | feats = [f for f in train_df.columns if f not in ['TARGET', 'SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_PREV', 'index', 38 | "APP_index", "BURO_index", "PREV_index", "INSTAL_index", 39 | "CC_index", "POS_index"]] 40 | 41 | # X and y are passed to folds.split together and it is told to split the data: ten train-validation index pairs are produced. 42 | # enumerate makes it possible to pick up each generated index pair together with its fold number, as illustrated just below.
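    # Editor's illustration (a hypothetical, standalone snippet -- not part of this pipeline):
    #
    #   from sklearn.model_selection import KFold
    #   import numpy as np
    #   for i, (tr_idx, va_idx) in enumerate(KFold(n_splits=3).split(np.arange(6))):
    #       print(i, tr_idx, va_idx)
    #
    # prints 0 [2 3 4 5] [0 1], then 1 [0 1 4 5] [2 3], then 2 [0 1 2 3] [4 5]:
    # one (train_idx, valid_idx) pair per fold, numbered by enumerate.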
43 | 44 | for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])): 45 | train_x, train_y = train_df[feats].iloc[train_idx], train_df['TARGET'].iloc[train_idx] 46 | valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx] 47 | 48 | # LightGBM parameters found by Bayesian optimization 49 | clf = LGBMClassifier( 50 | n_jobs=-1, 51 | n_estimators=10000, 52 | learning_rate=0.02, 53 | num_leaves=34, 54 | colsample_bytree=0.9497036, 55 | subsample=0.8715623, 56 | max_depth=8, 57 | reg_alpha=0.041545473, 58 | reg_lambda=0.0735294, 59 | min_split_gain=0.0222415, 60 | min_child_weight=39.3259775, 61 | silent=-1, 62 | verbose=-1, ) 63 | 64 | clf.fit(train_x, 65 | train_y, 66 | eval_set=[(train_x, train_y), 67 | (valid_x, valid_y)], 68 | eval_metric='auc', 69 | verbose=200, 70 | early_stopping_rounds=200) 71 | 72 | # predicted valid_y 73 | oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1] 74 | 75 | # submission preds: predicts the test set in every fold and averages over all the folds. 76 | sub_preds += clf.predict_proba(test_df[feats], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits 77 | 78 | # fold, auc and best iteration 79 | print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx]))) 80 | 81 | 82 | 83 | # best auc & iteration 84 | fold_auc_best_df = fold_auc_best_df.append({'FOLD': int(n_fold + 1), 85 | 'AUC': roc_auc_score(valid_y, oof_preds[valid_idx]), 86 | "BEST_ITER": clf.best_iteration_}, ignore_index=True) 87 | 88 | 89 | 90 | fold_importance_df = pd.DataFrame() 91 | fold_importance_df["feature"] = feats 92 | fold_importance_df["importance"] = clf.feature_importances_ 93 | fold_importance_df["fold"] = n_fold + 1 94 | feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0) 95 | 96 | del clf, train_x, train_y, valid_x, valid_y 97 | gc.collect() 98 | 99 | # OUTPUTS 100 | print(fold_auc_best_df) 101 | print(feature_importance_df) 102 | 103 | # save the feature importances as a df 104 | feature_importance_df.to_pickle("outputs/features/feature_importance_df.pkl") 105 | fold_auc_best_df.to_pickle("outputs/features/fold_auc_best_df.pkl") 106 | 107 | # Final Model 108 | best_iter_1 = int(fold_auc_best_df.sort_values(by="AUC", ascending=False)[:1]["BEST_ITER"].values) 109 | 110 | # sort by AUC, take the mean of the best-iteration counts of the top 3 folds, with no decimal places. 111 | # best_iter_3 = round(fold_auc_best_df.sort_values(by="AUC", ascending=False)[:3]["BEST_ITER"].mean(), 0) 112 | 113 | y_train = train_df["TARGET"] 114 | x_train = train_df[feats] 115 | 116 | final_model = LGBMClassifier( 117 | n_jobs=-1, 118 | n_estimators=best_iter_1, 119 | learning_rate=0.02, 120 | num_leaves=34, 121 | colsample_bytree=0.9497036, 122 | subsample=0.8715623, 123 | max_depth=8, 124 | reg_alpha=0.041545473, 125 | reg_lambda=0.0735294, 126 | min_split_gain=0.0222415, 127 | min_child_weight=39.3259775, 128 | silent=-1, 129 | verbose=-1).fit(x_train, y_train) 130 | 131 | cur_dir = os.getcwd() 132 | os.chdir('models/reference/') 133 | pickle.dump(final_model, open("lightgbm_final_model.pkl", 'wb')) # model 134 | os.chdir(cur_dir) 135 | 136 | # the valid_y values predicted in each fold are really predictions of the train set's y values, one disjoint piece at a time.
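    # Editor's note (added): because KFold partitions the row indices, the fold-wise
    # valid_idx sets are disjoint and together cover every training row exactly once,
    # so oof_preds now holds one out-of-fold probability per row and a single AUC over
    # the whole vector is a fair cross-validated estimate. E.g. with 2 folds over 4 rows:
    #   oof_preds[[0, 2]] <- fold 1 probabilities, oof_preds[[1, 3]] <- fold 2 probabilities.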
137 | print('Full Train (Validation) AUC score %.6f' % roc_auc_score(train_df['TARGET'], oof_preds)) 138 | 139 | # Write submission file and plot feature importance 140 | if not debug: 141 | test_df['TARGET'] = sub_preds 142 | test_df[['SK_ID_CURR', 'TARGET']].to_csv("predictions/reference_submission.csv", index=False) 143 | 144 | display_importances(feature_importance_df) 145 | del x_train, y_train 146 | 147 | return feature_importance_df 148 | 149 | --------------------------------------------------------------------------------
/scripts/pre_processing.py: -------------------------------------------------------------------------------- 1 | """This script contains the pre-processing functions.""" 2 | import gc 3 | 4 | import pandas as pd 5 | import numpy as np 6 | from scripts.helper_functions import one_hot_encoder 7 | 8 | 9 | # Preprocess application_train.csv and application_test.csv 10 | 11 | def application_train_test(num_rows=None, nan_as_category=False): 12 | # Read data and merge 13 | df = pd.read_csv('data/application_train.csv', nrows=num_rows) 14 | test_df = pd.read_csv('data/application_test.csv', nrows=num_rows) 15 | print("Train samples: {}, test samples: {}".format(len(df), len(test_df))) 16 | df = df.append(test_df).reset_index() 17 | 18 | # Optional: Remove 4 applications with XNA CODE_GENDER (train set) 19 | df = df[df['CODE_GENDER'] != 'XNA'] 20 | 21 | # Categorical features with Binary encode (0 or 1; two categories) 22 | for bin_feature in ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY']: 23 | df[bin_feature], uniques = pd.factorize(df[bin_feature]) 24 | 25 | # Categorical features with One-Hot encode 26 | df, cat_cols = one_hot_encoder(df, nan_as_category) 27 | 28 | # NaN values for DAYS_EMPLOYED: 365243 -> nan 29 | df['DAYS_EMPLOYED'].replace(365243, np.nan, inplace=True) 30 | 31 | # Some simple new features (percentages) 32 | df['NEW_DAYS_EMPLOYED_PERC'] = df['DAYS_EMPLOYED'] / df['DAYS_BIRTH'] 33 | df['NEW_INCOME_CREDIT_PERC'] = df['AMT_INCOME_TOTAL'] / df['AMT_CREDIT'] 34 | df['NEW_INCOME_PER_PERSON'] = df['AMT_INCOME_TOTAL'] / df['CNT_FAM_MEMBERS'] 35 | df['NEW_ANNUITY_INCOME_PERC'] = df['AMT_ANNUITY'] / df['AMT_INCOME_TOTAL'] 36 | df['NEW_PAYMENT_RATE'] = df['AMT_ANNUITY'] / df['AMT_CREDIT'] 37 | 38 | df.columns = pd.Index(["APP_" + col for col in df.columns.tolist()]) 39 | df.rename(columns={"APP_SK_ID_CURR": "SK_ID_CURR"}, inplace=True) 40 | df.rename(columns={"APP_TARGET": "TARGET"}, inplace=True) 41 | 42 | del test_df 43 | gc.collect() 44 | return df 45 | 46 | 47 | # Preprocess bureau.csv and bureau_balance.csv 48 | def bureau_and_balance(num_rows=None, nan_as_category=True): 49 | # Preprocessing 50 | bureau = pd.read_csv('data/bureau.csv', nrows=num_rows) 51 | bb = pd.read_csv('data/bureau_balance.csv', nrows=num_rows) 52 | bb, bb_cat = one_hot_encoder(bb, nan_as_category) 53 | bureau, bureau_cat = one_hot_encoder(bureau, nan_as_category) 54 | 55 | # Bureau balance: Perform aggregations and merge with bureau.csv 56 | bb_aggregations = {'MONTHS_BALANCE': ['min', 'max', 'size']} 57 | 58 | for col in bb_cat: 59 | bb_aggregations[col] = ['mean'] 60 | 61 | bb_agg = bb.groupby('SK_ID_BUREAU').agg(bb_aggregations) 62 | bb_agg.columns = pd.Index([e[0] + "_" + e[1].upper() for e in bb_agg.columns.tolist()]) 63 | bureau = bureau.join(bb_agg, how='left', on='SK_ID_BUREAU') 64 | bureau.drop(['SK_ID_BUREAU'], axis=1, inplace=True) 65 | 66 | del bb, bb_agg 67 | gc.collect() 68 | 69 | # Bureau and bureau_balance numeric features 70 | num_aggregations = { 71 |
'DAYS_CREDIT': ['min', 'max', 'mean', 'var'], 72 | 'DAYS_CREDIT_ENDDATE': ['min', 'max', 'mean'], 73 | 'DAYS_CREDIT_UPDATE': ['mean'], 74 | 'CREDIT_DAY_OVERDUE': ['max', 'mean'], 75 | 'AMT_CREDIT_MAX_OVERDUE': ['mean'], 76 | 'AMT_CREDIT_SUM': ['max', 'mean', 'sum'], 77 | 'AMT_CREDIT_SUM_DEBT': ['max', 'mean', 'sum'], 78 | 'AMT_CREDIT_SUM_OVERDUE': ['mean'], 79 | 'AMT_CREDIT_SUM_LIMIT': ['mean', 'sum'], 80 | 'AMT_ANNUITY': ['max', 'mean'], 81 | 'CNT_CREDIT_PROLONG': ['sum'], 82 | 'MONTHS_BALANCE_MIN': ['min'], 83 | 'MONTHS_BALANCE_MAX': ['max'], 84 | 'MONTHS_BALANCE_SIZE': ['mean', 'sum'] 85 | } 86 | 87 | # Bureau and bureau_balance categorical features 88 | cat_aggregations = {} 89 | for cat in bureau_cat: 90 | cat_aggregations[cat] = ['mean'] 91 | 92 | for cat in bb_cat: 93 | cat_aggregations[cat + "_MEAN"] = ['mean'] 94 | 95 | bureau_agg = bureau.groupby('SK_ID_CURR').agg({**num_aggregations, **cat_aggregations}) 96 | bureau_agg.columns = pd.Index(['BURO_' + e[0] + "_" + e[1].upper() for e in bureau_agg.columns.tolist()]) 97 | 98 | # Bureau: Active credits - using only numerical aggregations 99 | active = bureau[bureau['CREDIT_ACTIVE_Active'] == 1] 100 | active_agg = active.groupby('SK_ID_CURR').agg(num_aggregations) 101 | active_agg.columns = pd.Index(['BURO_NEW_ACTIVE_' + e[0] + "_" + e[1].upper() for e in active_agg.columns.tolist()]) 102 | bureau_agg = bureau_agg.join(active_agg, how='left', on='SK_ID_CURR') 103 | del active, active_agg 104 | gc.collect() 105 | 106 | # Bureau: Closed credits - using only numerical aggregations 107 | closed = bureau[bureau['CREDIT_ACTIVE_Closed'] == 1] 108 | closed_agg = closed.groupby('SK_ID_CURR').agg(num_aggregations) 109 | closed_agg.columns = pd.Index(['BURO_NEW_CLOSED_' + e[0] + "_" + e[1].upper() for e in closed_agg.columns.tolist()]) 110 | bureau_agg = bureau_agg.join(closed_agg, how='left', on='SK_ID_CURR') 111 | del closed, closed_agg, bureau 112 | gc.collect() 113 | return bureau_agg 114 | 115 | 116 | # Preprocess previous_applications.csv 117 | def previous_applications(num_rows=None, nan_as_category=True): 118 | prev = pd.read_csv('data/previous_application.csv', nrows=num_rows) 119 | prev, cat_cols = one_hot_encoder(prev, nan_as_category) 120 | 121 | # Days 365.243 values -> nan 122 | prev['DAYS_FIRST_DRAWING'].replace(365243, np.nan, inplace=True) 123 | prev['DAYS_FIRST_DUE'].replace(365243, np.nan, inplace=True) 124 | prev['DAYS_LAST_DUE_1ST_VERSION'].replace(365243, np.nan, inplace=True) 125 | prev['DAYS_LAST_DUE'].replace(365243, np.nan, inplace=True) 126 | prev['DAYS_TERMINATION'].replace(365243, np.nan, inplace=True) 127 | 128 | # Add feature: value ask / value received percentage 129 | prev['NEW_APP_CREDIT_PERC'] = prev['AMT_APPLICATION'] / prev['AMT_CREDIT'] 130 | 131 | # Previous applications numeric features 132 | num_aggregations = { 133 | 'AMT_ANNUITY': ['min', 'max', 'mean'], 134 | 'AMT_APPLICATION': ['min', 'max', 'mean'], 135 | 'AMT_CREDIT': ['min', 'max', 'mean'], 136 | 'NEW_APP_CREDIT_PERC': ['min', 'max', 'mean', 'var'], 137 | 'AMT_DOWN_PAYMENT': ['min', 'max', 'mean'], 138 | 'AMT_GOODS_PRICE': ['min', 'max', 'mean'], 139 | 'HOUR_APPR_PROCESS_START': ['min', 'max', 'mean'], 140 | 'RATE_DOWN_PAYMENT': ['min', 'max', 'mean'], 141 | 'DAYS_DECISION': ['min', 'max', 'mean'], 142 | 'CNT_PAYMENT': ['mean', 'sum'], 143 | } 144 | 145 | # Previous applications categorical features 146 | cat_aggregations = {} 147 | for cat in cat_cols: 148 | cat_aggregations[cat] = ['mean'] 149 | 150 | prev_agg = 
prev.groupby('SK_ID_CURR').agg({**num_aggregations, **cat_aggregations}) 151 | prev_agg.columns = pd.Index(['PREV_' + e[0] + "_" + e[1].upper() for e in prev_agg.columns.tolist()]) 152 | 153 | # Previous Applications: Approved Applications - only numerical features 154 | approved = prev[prev['NAME_CONTRACT_STATUS_Approved'] == 1] 155 | approved_agg = approved.groupby('SK_ID_CURR').agg(num_aggregations) 156 | approved_agg.columns = pd.Index( 157 | ['PREV_NEW_APPROVED_' + e[0] + "_" + e[1].upper() for e in approved_agg.columns.tolist()]) 158 | prev_agg = prev_agg.join(approved_agg, how='left', on='SK_ID_CURR') 159 | 160 | # Previous Applications: Refused Applications - only numerical features 161 | refused = prev[prev['NAME_CONTRACT_STATUS_Refused'] == 1] 162 | refused_agg = refused.groupby('SK_ID_CURR').agg(num_aggregations) 163 | refused_agg.columns = pd.Index( 164 | ['PREV_NEW_REFUSED_' + e[0] + "_" + e[1].upper() for e in refused_agg.columns.tolist()]) 165 | prev_agg = prev_agg.join(refused_agg, how='left', on='SK_ID_CURR') 166 | del refused, refused_agg, approved, approved_agg, prev 167 | gc.collect() 168 | return prev_agg 169 | 170 | 171 | # Preprocess POS_CASH_balance.csv 172 | def pos_cash(num_rows=None, nan_as_category=True): 173 | pos = pd.read_csv('data/POS_CASH_balance.csv', nrows=num_rows) 174 | pos, cat_cols = one_hot_encoder(pos, nan_as_category) 175 | 176 | # Features 177 | aggregations = { 178 | 'MONTHS_BALANCE': ['max', 'mean', 'size'], 179 | 'SK_DPD': ['max', 'mean'], 180 | 'SK_DPD_DEF': ['max', 'mean'] 181 | } 182 | 183 | for cat in cat_cols: 184 | aggregations[cat] = ['mean'] 185 | 186 | pos_agg = pos.groupby('SK_ID_CURR').agg(aggregations) 187 | pos_agg.columns = pd.Index(['POS_' + e[0] + "_" + e[1].upper() for e in pos_agg.columns.tolist()]) 188 | 189 | # Count pos cash accounts 190 | pos_agg['POS_NEW_COUNT'] = pos.groupby('SK_ID_CURR').size() 191 | del pos 192 | gc.collect() 193 | return pos_agg 194 | 195 | 196 | # Preprocess installments_payments.csv 197 | def installments_payments(num_rows=None, nan_as_category=True): 198 | ins = pd.read_csv('data/installments_payments.csv', nrows=num_rows) 199 | ins, cat_cols = one_hot_encoder(ins, nan_as_category) 200 | 201 | # Percentage and difference paid in each installment (amount paid and installment value) 202 | ins['NEW_PAYMENT_PERC'] = ins['AMT_PAYMENT'] / ins['AMT_INSTALMENT'] 203 | ins['NEW_PAYMENT_DIFF'] = ins['AMT_INSTALMENT'] - ins['AMT_PAYMENT'] 204 | # Days past due and days before due (no negative values) 205 | ins['NEW_DPD'] = ins['DAYS_ENTRY_PAYMENT'] - ins['DAYS_INSTALMENT'] 206 | ins['NEW_DPD'] = ins['NEW_DPD'].apply(lambda x: x if x > 0 else 0) 207 | 208 | ins['NEW_DBD'] = ins['DAYS_INSTALMENT'] - ins['DAYS_ENTRY_PAYMENT'] 209 | ins['NEW_DBD'] = ins['NEW_DBD'].apply(lambda x: x if x > 0 else 0) 210 | 211 | # Features: Perform aggregations 212 | aggregations = { 213 | 'NUM_INSTALMENT_VERSION': ['nunique'], 214 | 'NEW_DPD': ['max', 'mean', 'sum'], 215 | 'NEW_DBD': ['max', 'mean', 'sum'], 216 | 'NEW_PAYMENT_PERC': ['max', 'mean', 'sum', 'var'], 217 | 'NEW_PAYMENT_DIFF': ['max', 'mean', 'sum', 'var'], 218 | 'AMT_INSTALMENT': ['max', 'mean', 'sum'], 219 | 'AMT_PAYMENT': ['min', 'max', 'mean', 'sum'], 220 | 'DAYS_ENTRY_PAYMENT': ['max', 'mean', 'sum'] 221 | } 222 | 223 | for cat in cat_cols: 224 | aggregations[cat] = ['mean'] 225 | 226 | ins_agg = ins.groupby('SK_ID_CURR').agg(aggregations) 227 | ins_agg.columns = pd.Index(['INSTAL_' + e[0] + "_" + e[1].upper() for e in ins_agg.columns.tolist()]) 228 | 229 | # 
Count installments accounts 230 | ins_agg['INSTAL_NEW_COUNT'] = ins.groupby('SK_ID_CURR').size() 231 | del ins 232 | gc.collect() 233 | return ins_agg 234 | 235 | 236 | # Preprocess credit_card_balance.csv 237 | def credit_card_balance(num_rows=None, nan_as_category=True): 238 | cc = pd.read_csv('data/credit_card_balance.csv', nrows=num_rows) 239 | cc, cat_cols = one_hot_encoder(cc, nan_as_category) 240 | 241 | # General aggregations 242 | cc.drop(['SK_ID_PREV'], axis=1, inplace=True) 243 | cc_agg = cc.groupby('SK_ID_CURR').agg(['min', 'max', 'mean', 'sum', 'var']) 244 | cc_agg.columns = pd.Index(['CC_' + e[0] + "_" + e[1].upper() for e in cc_agg.columns.tolist()]) 245 | 246 | # Count credit card lines 247 | cc_agg['CC_NEW_COUNT'] = cc.groupby('SK_ID_CURR').size() 248 | del cc 249 | gc.collect() 250 | return cc_agg 251 | -------------------------------------------------------------------------------- /models/dsmlbc2/ATILLA_MUHAMMET.py: -------------------------------------------------------------------------------- 1 | import gc 2 | import re 3 | import time 4 | import warnings 5 | from contextlib import contextmanager 6 | 7 | import matplotlib.pyplot as plt 8 | import numpy as np 9 | import pandas as pd 10 | import pymysql 11 | import seaborn as sns 12 | from feature_engine import categorical_encoders as ce 13 | from lightgbm import LGBMClassifier 14 | from sklearn.metrics import roc_auc_score 15 | from sklearn.model_selection import KFold 16 | from sklearn.preprocessing import LabelEncoder 17 | 18 | warnings.filterwarnings("ignore", category=DeprecationWarning) 19 | warnings.filterwarnings("ignore", category=FutureWarning) 20 | warnings.filterwarnings("ignore", category=UserWarning) 21 | warnings.simplefilter(action='ignore') 22 | 23 | 24 | @contextmanager 25 | def timer(title): 26 | t0 = time.time() 27 | yield 28 | print("{} - done in {:.0f}s".format(title, time.time() - t0)) 29 | 30 | 31 | # Display plot feature importance 32 | def display_importances(feature_importance_df_): 33 | cols = feature_importance_df_[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", 34 | ascending=False)[ 35 | :100].index 36 | best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)] 37 | print(best_features) 38 | plt.figure(figsize=(15, 20)) 39 | sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False)) 40 | plt.title('LightGBM Features (avg over folds)') 41 | plt.tight_layout() 42 | plt.savefig('lgbm_importances01.png') 43 | 44 | 45 | def load_dataset(file_path, index=0): 46 | df = pd.read_csv(file_path, index_col=index) 47 | return df 48 | 49 | 50 | def get_categoric_columns(df): 51 | cols = df.select_dtypes(include=['object', 'category']).columns 52 | return cols 53 | 54 | 55 | def apply_label_encoding(l_df, columns): 56 | lbe = LabelEncoder() 57 | for col in columns: 58 | l_df[col] = lbe.fit_transform(l_df[col]) 59 | return l_df 60 | 61 | 62 | def apply_one_hot_encoding(l_df): 63 | original_columns = list(l_df) # col names as string in a list 64 | categorical_columns = get_categoric_columns(l_df) # categorical col names 65 | l_df = pd.get_dummies(l_df, columns=categorical_columns, drop_first=True) # creating dummies 66 | new_columns = [c for c in l_df.columns if c not in original_columns] # new col names 67 | return l_df, new_columns 68 | 69 | 70 | def rare_encoding(data, variables, rare_threshold=0.05, n_rare_categories=4): 71 | encoder = ce.RareLabelCategoricalEncoder(tol=rare_threshold, 
n_categories=n_rare_categories, variables=variables, 72 | replace_with='Rare') 73 | # fit the encoder 74 | encoder.fit(data) 75 | # transform the data 76 | data = encoder.transform(data) 77 | return data 78 | 79 | 80 | # One-hot encoding for categorical columns with get_dummies 81 | def one_hot_encoder(df, nan_as_category=True): 82 | original_columns = list(df.columns) 83 | categorical_columns = [col for col in df.columns if df[col].dtype == 'object'] 84 | df = pd.get_dummies(df, columns=categorical_columns, dummy_na=nan_as_category) 85 | new_columns = [c for c in df.columns if c not in original_columns] 86 | return df, new_columns 87 | 88 | 89 | 90 | 91 | 92 | def reduce_mem_usage(df): 93 | """ iterate through all the columns of a dataframe and modify the data type 94 | to reduce memory usage. 95 | """ 96 | start_mem = df.memory_usage().sum() / 1024 ** 2 97 | print('Memory usage of dataframe is {:.2f} MB'.format(start_mem)) 98 | 99 | for col in df.columns: 100 | col_type = df[col].dtype 101 | 102 | if col_type != object: 103 | c_min = df[col].min() 104 | c_max = df[col].max() 105 | if str(col_type)[:3] == 'int': 106 | if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max: 107 | df[col] = df[col].astype(np.int8) 108 | elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max: 109 | df[col] = df[col].astype(np.int16) 110 | elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max: 111 | df[col] = df[col].astype(np.int32) 112 | elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max: 113 | df[col] = df[col].astype(np.int64) 114 | else: 115 | if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max: 116 | df[col] = df[col].astype(np.float16) 117 | elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max: 118 | df[col] = df[col].astype(np.float32) 119 | else: 120 | df[col] = df[col].astype(np.float64) 121 | end_mem = df.memory_usage().sum() / 1024 ** 2 122 | print('Memory usage after optimization is: {:.2f} MB'.format(end_mem)) 123 | print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem)) 124 | # code takenn from https://www.kaggle.com/gemartin/load-data-reduce-memory-usage 125 | return df 126 | 127 | 128 | def feature_early_shutdown(row): 129 | early_shutdown = 0 130 | if row.CREDIT_ACTIVE == "Active" and row.DAYS_CREDIT_ENDDATE < 0: 131 | early_shutdown = 1 132 | return early_shutdown 133 | 134 | 135 | def buro_add_feature(df_breau): 136 | df_bureau_new = pd.DataFrame() 137 | # kredi başvuru sayısı 138 | df_bureau_new["BURO_CREDIT_APPLICATION_COUNT"] = df_breau.groupby("SK_ID_CURR").count()["SK_ID_BUREAU"] 139 | 140 | # aktif kredi sayısı 141 | df_bureau_new["BURO_ACTIVE_CREDIT_APPLICATION_COUNT"] = \ 142 | df_breau[df_breau["CREDIT_ACTIVE"] == "Active"].groupby("SK_ID_CURR").count()["CREDIT_ACTIVE"] 143 | df_bureau_new["BURO_ACTIVE_CREDIT_APPLICATION_COUNT"].fillna(0, inplace=True) 144 | 145 | # pasif kredi sayısı 146 | df_bureau_new["BURO_CLOSED_CREDIT_APPLICATION_COUNT"] = \ 147 | df_breau[df_breau["CREDIT_ACTIVE"] == "Closed"].groupby("SK_ID_CURR").count()["CREDIT_ACTIVE"] 148 | df_bureau_new["BURO_CLOSED_CREDIT_APPLICATION_COUNT"].fillna(0, inplace=True) 149 | 150 | # erken kredi kapama 151 | df_bureau_new["BURO_EARLY_SHUTDOWN_NEW"] = df_breau.apply(lambda x: feature_early_shutdown(x), axis=1) 152 | 153 | # geciktirilmiş ödeme sayısı 154 | df_bureau_new["BURO_NUMBER_OF_DELAYED_PAYMENTS"] = \ 155 | df_breau[df_breau["AMT_CREDIT_MAX_OVERDUE"] != 
0].groupby("SK_ID_CURR")["AMT_CREDIT_MAX_OVERDUE"].count() 156 | df_bureau_new["BURO_NUMBER_OF_DELAYED_PAYMENTS"].fillna(0, inplace=True) 157 | 158 | # son kapanmış başvurusu üzerinden geçen max süre 159 | df_bureau_new["BURO_MAX_TIME_PASSED_CREDIT_APPLICATION"] = \ 160 | df_breau[df_breau["CREDIT_ACTIVE"] == "Closed"].groupby("SK_ID_CURR")["DAYS_ENDDATE_FACT"].max() 161 | df_bureau_new["BURO_MAX_TIME_PASSED_CREDIT_APPLICATION"].fillna(0, inplace=True) 162 | 163 | # geciktirilmiş max ödeme tutari 164 | df_bureau_new["BURO_MAX_DELAYED_PAYMENTS"] = df_breau.groupby("SK_ID_CURR")["AMT_CREDIT_MAX_OVERDUE"].max() 165 | df_bureau_new["BURO_MAX_DELAYED_PAYMENTS"].fillna(0, inplace=True) 166 | 167 | # geciktirilmiş ödeyenlerden oluşan top liste - en yüksek 100 168 | # gecikme olan (80302, 12) 169 | df_bureau_new["BURO_DELAYED_PAYMENTS_TOP_100_NEW"] = \ 170 | df_bureau_new.sort_values("BURO_MAX_DELAYED_PAYMENTS", ascending=False)["BURO_MAX_DELAYED_PAYMENTS"].rank() 171 | df_bureau_new["BURO_DELAYED_PAYMENTS_TOP_100_NEW"].fillna(0, inplace=True) 172 | 173 | # kredi uzatma yapilmis mi 174 | df_bureau_new["BURO_IS_CREDIT_EXTENSION_NEW"] = df_breau.groupby("SK_ID_CURR")["CNT_CREDIT_PROLONG"].count().apply( 175 | lambda x: 1 if x > 0 else 0) 176 | 177 | # max yapilan kredi uzatmasi 178 | df_bureau_new["BURO_CREDIT_EXTENSION_MAX"] = df_breau.groupby("SK_ID_CURR")["CNT_CREDIT_PROLONG"].max() 179 | df_bureau_new["BURO_CREDIT_EXTENSION_MAX"].fillna(0, inplace=True) 180 | 181 | # unsuccessful credit payment - borç takarak kapanmış kredi ödemeleri tespit et 182 | df_bureau_new["BURO_IS_UNSUCCESSFUL_CREDIT_PAYMENT_NEW"] = \ 183 | df_breau[(df_breau["CREDIT_ACTIVE"] == "Closed") & (df_breau["AMT_CREDIT_SUM_DEBT"] > 0)].groupby( 184 | "SK_ID_CURR").all()["AMT_CREDIT_SUM_DEBT"].apply(lambda x: 1 if x == True else 0) 185 | df_bureau_new["BURO_IS_UNSUCCESSFUL_CREDIT_PAYMENT_NEW"].fillna(0, inplace=True) 186 | 187 | return df_bureau_new 188 | 189 | 190 | def load_data_with_application_train(num_rows=None): 191 | df_app_train = application_train() 192 | print("application_train df shape:", df_app_train.shape) 193 | bureau, bureau_add_features = bureau_and_balance() 194 | print("Bureau df shape:", bureau.shape) 195 | bureau = bureau.fillna(0) 196 | return df_app_train, bureau, bureau_add_features 197 | 198 | 199 | def load_data_only_bureau_and_bureau_balance(num_rows=None): 200 | bureau, bureau_add_features = bureau_and_balance() 201 | print("Bureau df shape:", bureau.shape) 202 | bureau = bureau.fillna(0) 203 | return bureau, bureau_add_features 204 | 205 | 206 | def app_train_bureau_merge(num_rows=None): 207 | df_app_train, bureau, bureau_add_features = load_data_with_application_train(num_rows) 208 | # df_merge = pd.merge(df_app_train, bureau, on=['SK_ID_CURR'],how='inner') 209 | df_merge = bureau 210 | # print("app_train, bureau merge shape:", df_merge.shape) 211 | print("bureau merge shape:", df_merge.shape) 212 | df_final = pd.merge(df_merge, bureau_add_features, on=['SK_ID_CURR'], how='inner') 213 | print("Bureau add features df shape:", bureau_add_features.shape) 214 | del df_app_train, bureau, bureau_add_features, df_merge 215 | gc.collect() 216 | return df_final 217 | 218 | 219 | def bureau_and_bureau_balance_features(num_rows=None): 220 | bureau, bureau_add_features = load_data_only_bureau_and_bureau_balance(num_rows) 221 | df_final = pd.merge(bureau, bureau_add_features, on=['SK_ID_CURR'], how='inner') 222 | print("Bureau add features df shape:", bureau_add_features.shape) 223 | del bureau, 
bureau_add_features 224 | gc.collect() 225 | return df_final 226 | 227 | 228 | def application_train(): 229 | conn = pymysql.connect(host='35.228.28.142', port=int(63306), user='group2', passwd='123654', db='home_credit') 230 | df_app_train = pd.read_sql_query("SELECT * FROM application_train", conn) 231 | df_app_train = df_app_train[["TARGET", "SK_ID_CURR"]] 232 | # df_app_train = df_app_train.dropna() 233 | df_app_train.reset_index(drop=True, inplace=True) 234 | gc.collect() 235 | return df_app_train 236 | 237 | 238 | # Preprocess bureau.csv and bureau_balance.csv 239 | def bureau_and_balance(nan_as_category=True): 240 | conn = pymysql.connect(host='35.228.28.142', port=int(63306), user='group2', passwd='123654', db='home_credit') 241 | bureau = pd.read_sql_query("SELECT * FROM bureau", conn) 242 | bb = pd.read_sql_query("SELECT * FROM bureau_balance", conn) 243 | bureau["AMT_CREDIT_SUM_DEBT"] = bureau["AMT_CREDIT_SUM_DEBT"].fillna(0) 244 | bureau.fillna(0, inplace=True) 245 | bb.fillna(0, inplace=True) 246 | # bureau = bureau.dropna() 247 | bureau.reset_index(drop=True, inplace=True) 248 | # bb = bb.dropna() 249 | bb.reset_index(drop=True, inplace=True) 250 | 251 | # add_features 252 | bureau_add_features = buro_add_feature(df_breau=bureau) 253 | 254 | # sum agg b_balance 255 | # Status_sum ile ilgili yeni bir degisken olusturma 256 | bb_dummy = pd.get_dummies(bb, dummy_na=True) 257 | agg_list = {"MONTHS_BALANCE": "count", 258 | "STATUS_0": ["sum"], 259 | "STATUS_1": ["sum"], 260 | "STATUS_2": ["sum"], 261 | "STATUS_3": ["sum"], 262 | "STATUS_4": ["sum"], 263 | "STATUS_5": ["sum"], 264 | "STATUS_C": ["sum"], 265 | "STATUS_X": ["sum"]} 266 | bb_sum_agg = bb_dummy.groupby("SK_ID_BUREAU").agg(agg_list) 267 | # Degisken isimlerinin yeniden adlandirilmasi 268 | bb_sum_agg.columns = pd.Index(["BURO_" + col[0] + "_" + col[1].upper() for col in bb_sum_agg.columns.tolist()]) 269 | # Status_sum ile ilgili yeni bir degisken olusturma 270 | bb_sum_agg['BURO_NEW_STATUS_SCORE'] = bb_sum_agg['BURO_STATUS_1_SUM'] + bb_sum_agg['BURO_STATUS_2_SUM'] ^ 2 + \ 271 | bb_sum_agg['BURO_STATUS_3_SUM'] ^ 3 + bb_sum_agg['BURO_STATUS_4_SUM'] ^ 4 + \ 272 | bb_sum_agg['BURO_STATUS_5_SUM'] ^ 5 273 | bb_sum_agg.drop( 274 | ['BURO_STATUS_1_SUM', 'BURO_STATUS_2_SUM', 'BURO_STATUS_3_SUM', 'BURO_STATUS_4_SUM', 'BURO_STATUS_5_SUM'], 275 | axis=1, inplace=True) 276 | 277 | # CREDIT_TYPE degiskeninin sinif sayisini 3'e düsürmek 278 | bureau['CREDIT_TYPE'] = bureau['CREDIT_TYPE'].replace(['Car loan', 279 | 'Mortgage', 280 | 'Microloan', 281 | 'Loan for business development', 282 | 'Another type of loan', 283 | 'Unknown type of loan', 284 | 'Loan for working capital replenishment', 285 | "Loan for purchase of shares (margin lending)", 286 | 'Cash loan (non-earmarked)', 287 | 'Real estate loan', 288 | "Loan for the purchase of equipment", 289 | "Interbank credit", 290 | "Mobile operator loan"], 'Rare') 291 | 292 | # CREDIT_ACTIVE degiskeninin sinif sayisini 2'ye düsürmek (Sold' u Closed a dahil etmek daha mi uygun olur ???) 
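    # Editor's illustration (added) of the multi-label replace pattern used on the
    # next line and in the CREDIT_TYPE block above -- every listed label maps to the
    # single replacement value:
    #   pd.Series(['Active', 'Sold', 'Bad debt']).replace(['Bad debt', 'Sold'], 'Active')
    #   # -> all three entries become 'Active'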
293 | bureau['CREDIT_ACTIVE'] = bureau['CREDIT_ACTIVE'].replace(['Bad debt', 'Sold'], 'Active') 294 | 295 | # one hot encoding start 296 | bb, bb_cat = one_hot_encoder(bb, nan_as_category) 297 | bureau, bureau_cat = one_hot_encoder(bureau, nan_as_category) 298 | # one hot encoding end 299 | 300 | # Bureau balance: Perform aggregations and merge with bureau.csv 301 | bb_aggregations = {'MONTHS_BALANCE': ['min', 'max', 'size'], 302 | "STATUS_0": ["mean"], 303 | "STATUS_C": ["mean"], 304 | "STATUS_X": ["mean"]} 305 | for col in bb_cat: 306 | bb_aggregations[col] = ['mean'] 307 | 308 | bb_agg = bb.groupby('SK_ID_BUREAU').agg(bb_aggregations) 309 | bb_agg.columns = pd.Index([e[0] + "_" + e[1].upper() for e in bb_agg.columns.tolist()]) 310 | 311 | # Add the bureau_balance sum variables 312 | bb_agg["BURO_MONTHS_BALANCE_COUNT"] = bb_sum_agg["BURO_MONTHS_BALANCE_COUNT"] 313 | bb_agg["BURO_STATUS_0_SUM"] = bb_sum_agg["BURO_STATUS_0_SUM"] 314 | bb_agg["BURO_STATUS_C_SUM"] = bb_sum_agg["BURO_STATUS_C_SUM"] 315 | bb_agg["BURO_STATUS_X_SUM"] = bb_sum_agg["BURO_STATUS_X_SUM"] 316 | bb_agg["BURO_NEW_STATUS_SCORE"] = bb_sum_agg["BURO_NEW_STATUS_SCORE"] 317 | 318 | bureau = bureau.join(bb_agg, how='left', on='SK_ID_BUREAU') 319 | 320 | bureau["BURO_MONTHS_BALANCE_COUNT"].fillna(0, inplace=True) 321 | bureau["BURO_STATUS_0_SUM"].fillna(0, inplace=True) 322 | bureau["BURO_STATUS_C_SUM"].fillna(0, inplace=True) 323 | bureau["BURO_STATUS_X_SUM"].fillna(0, inplace=True) 324 | bureau["BURO_NEW_STATUS_SCORE"].fillna(0, inplace=True) 325 | 326 | ## final additional variables 327 | # new variable: the average credit term in months 328 | bureau["BURO_NEW_MONTHS_CREDIT"] = round((bureau.DAYS_CREDIT_ENDDATE - bureau.DAYS_CREDIT) / 30) 329 | 330 | bureau.drop(columns='SK_ID_BUREAU', inplace=True) 331 | 332 | del bb, bb_agg 333 | gc.collect() 334 | 335 | # Bureau and bureau_balance numeric features 336 | num_aggregations = { 337 | 'DAYS_CREDIT': ['min', 'max', 'mean', 'var'], 338 | 'CREDIT_DAY_OVERDUE': ['max', 'mean'], 339 | 'DAYS_CREDIT_ENDDATE': ['min', 'max', 'mean'], 340 | 'AMT_CREDIT_MAX_OVERDUE': ['mean'], 341 | 'CNT_CREDIT_PROLONG': ['sum'], 342 | 'AMT_CREDIT_SUM': ['max', 'mean', 'sum', 'std'], 343 | 'AMT_CREDIT_SUM_DEBT': ['max', 'mean', 'sum', 'std', 'median'], 344 | 'AMT_CREDIT_SUM_OVERDUE': ['mean'], 345 | 'AMT_CREDIT_SUM_LIMIT': ['mean', 'sum'], 346 | 'DAYS_CREDIT_UPDATE': ['min', 'max', 'mean'], 347 | 'AMT_ANNUITY': ['max', 'mean'], 348 | 'MONTHS_BALANCE_MIN': ['min'], 349 | 'MONTHS_BALANCE_MAX': ['max'], 350 | 'MONTHS_BALANCE_SIZE': ['mean', 'sum'] 351 | } 352 | # Bureau and bureau_balance categorical features 353 | cat_aggregations = {} 354 | for cat in bureau_cat: cat_aggregations[cat] = ['mean'] 355 | for cat in bb_cat: cat_aggregations[cat + "_MEAN"] = ['mean'] 356 | 357 | bureau_agg = bureau.groupby('SK_ID_CURR').agg({**num_aggregations, **cat_aggregations}) 358 | bureau_agg.columns = pd.Index(['BURO_' + e[0] + "_" + e[1].upper() for e in bureau_agg.columns.tolist()]) 359 | # Bureau: Active credits - using only numerical aggregations 360 | active = bureau[bureau['CREDIT_ACTIVE_Active'] == 1] 361 | active_agg = active.groupby('SK_ID_CURR').agg(num_aggregations) 362 | active_agg.columns = pd.Index(['BURO_ACT_' + e[0] + "_" + e[1].upper() for e in active_agg.columns.tolist()]) 363 | bureau_agg = bureau_agg.join(active_agg, how='left', on='SK_ID_CURR') 364 | del active, active_agg 365 | gc.collect() 366 | # Bureau: Closed credits - using only numerical aggregations 367 | closed =
bureau[bureau['CREDIT_ACTIVE_Closed'] == 1] 368 | closed_agg = closed.groupby('SK_ID_CURR').agg(num_aggregations) 369 | closed_agg.columns = pd.Index(['BURO_CLS_' + e[0] + "_" + e[1].upper() for e in closed_agg.columns.tolist()]) 370 | bureau_agg = bureau_agg.join(closed_agg, how='left', on='SK_ID_CURR') 371 | del closed, closed_agg, bureau 372 | gc.collect() 373 | return bureau_agg, bureau_add_features 374 | 375 | 376 | def application_train_g(): 377 | conn = pymysql.connect(host='35.228.28.142', port=int(63306), user='group2', passwd='123654', db='home_credit') 378 | df = pd.read_sql_query("SELECT * FROM application_train", conn) 379 | test_df = pd.read_sql_query("SELECT * FROM application_test", conn) 380 | 381 | df = reduce_mem_usage(df) 382 | test_df = reduce_mem_usage(test_df) 383 | 384 | df = df.append(test_df).reset_index() 385 | 386 | pd.set_option('display.max_columns', 500) 387 | pd.set_option('display.max_rows', 500) 388 | 389 | le = LabelEncoder() 390 | 391 | df["NAME_EDUCATION_TYPE"] = le.fit_transform(df["NAME_EDUCATION_TYPE"]) 392 | df.loc[(df["NAME_EDUCATION_TYPE"] == 1), "NAME_EDUCATION_TYPE"] = 0 393 | 394 | df.loc[(df["CNT_FAM_MEMBERS"] > 3), "CNT_FAM_MEMBERS"] = 4 395 | 396 | df = df[df['CODE_GENDER'] != 'XNA'] 397 | 398 | lbe = LabelEncoder() 399 | 400 | for col in ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY']: 401 | df[col] = lbe.fit_transform(df[col]) 402 | 403 | # df = pd.get_dummies(df, dummy_na = True) 404 | 405 | nom_list = [ 406 | 'EMERGENCYSTATE_MODE', 407 | 'FONDKAPREMONT_MODE', 408 | 'HOUSETYPE_MODE', 409 | 'NAME_CONTRACT_TYPE', 410 | 'NAME_FAMILY_STATUS', 411 | 'NAME_HOUSING_TYPE', 412 | 'NAME_INCOME_TYPE', 413 | 'NAME_TYPE_SUITE', 414 | 'OCCUPATION_TYPE', 415 | 'ORGANIZATION_TYPE', 416 | 'WALLSMATERIAL_MODE', 417 | 'WEEKDAY_APPR_PROCESS_START'] 418 | 419 | df = rare_encoding(df, nom_list) 420 | df = pd.get_dummies(df, columns=nom_list, drop_first=True) 421 | 422 | # new_features 423 | # 1 424 | df["APP_NEW_GOODSPRICE/CREDIT"] = df["AMT_GOODS_PRICE"] / df["AMT_CREDIT"] 425 | # 2 426 | df["APP_NEW_ANNUITY/CREDIT"] = (df["AMT_ANNUITY"] / df["AMT_CREDIT"]) 427 | # 3 428 | df["APP_NEW_INCOME/ANNUITY"] = df["AMT_INCOME_TOTAL"] / df["AMT_ANNUITY"] 429 | # 4 430 | # flag: 1 if the phone number was never changed (DAYS_LAST_PHONE_CHANGE == 0), else 0 431 | df["APP_NEW_DAYS_LAST_PHONE_CHANGE"] = (df["DAYS_LAST_PHONE_CHANGE"] == 0).astype(int) 432 | 433 | # 5 434 | df["DAYS_BIRTH"] = df["DAYS_BIRTH"] / 365 435 | df["APP_NEW_DAYS_BIRTH"] = -df["DAYS_BIRTH"]  # DAYS_BIRTH is negative in the raw data; negating gives age in years, so the <= 41 split below can fire 436 | df.loc[(df["APP_NEW_DAYS_BIRTH"] <= 41), "APP_NEW_DAYS_BIRTH"] = 1 437 | df.loc[(df["APP_NEW_DAYS_BIRTH"] > 41), "APP_NEW_DAYS_BIRTH"] = 0 438 | # 6 439 | df["APP_NEW_CREDIT/INCOME"] = df["AMT_CREDIT"] / df["AMT_INCOME_TOTAL"] 440 | # 7 441 | df["APP_NEW_WORK/NOTWORK"] = df["DAYS_EMPLOYED"] 442 | df.loc[(df["APP_NEW_WORK/NOTWORK"] == 0), "APP_NEW_WORK/NOTWORK"] = 0  # not working 443 | df.loc[(df["APP_NEW_WORK/NOTWORK"] != 0), "APP_NEW_WORK/NOTWORK"] = 1  # working 444 | # 8 445 | df["APP_NEW_INCOME/CREDIT"] = df["AMT_INCOME_TOTAL"] / df["AMT_CREDIT"] 446 | # 9 447 | # Recency of credit bureau enquiries (0 = no enquiry, 1 = hour/day/week/month, 2 = quarter/year) 448 | df["APP_NEW_REQ"] = df["AMT_REQ_CREDIT_BUREAU_WEEK"] 449 | # enquiries in the near and mid term 450 | df.loc[(df["AMT_REQ_CREDIT_BUREAU_HOUR"] > 0), "APP_NEW_REQ"] = 1 451 | df.loc[(df["AMT_REQ_CREDIT_BUREAU_DAY"] > 0), "APP_NEW_REQ"] = 1 452 | 453 |
df.loc[(df["AMT_REQ_CREDIT_BUREAU_HOUR"] == 0) & (df["AMT_REQ_CREDIT_BUREAU_DAY"] == 0) & ( 454 | df["AMT_REQ_CREDIT_BUREAU_WEEK"] > 0), "APP_NEW_REQ"] = 1 455 | df.loc[(df["AMT_REQ_CREDIT_BUREAU_HOUR"] == 0) & (df["AMT_REQ_CREDIT_BUREAU_DAY"] == 0) & ( 456 | df["AMT_REQ_CREDIT_BUREAU_MON"] > 0), "APP_NEW_REQ"] = 1 457 | # uzak zaman soruşturma 458 | df.loc[(df["AMT_REQ_CREDIT_BUREAU_HOUR"] == 0) & (df["AMT_REQ_CREDIT_BUREAU_DAY"] == 0) & 459 | (df["AMT_REQ_CREDIT_BUREAU_WEEK"] == 0) & (df["AMT_REQ_CREDIT_BUREAU_MON"] == 0) & 460 | (df["AMT_REQ_CREDIT_BUREAU_QRT"] > 0), "APP_NEW_REQ"] = 2 461 | 462 | df.loc[(df["AMT_REQ_CREDIT_BUREAU_HOUR"] == 0) & (df["AMT_REQ_CREDIT_BUREAU_DAY"] == 0) & 463 | (df["AMT_REQ_CREDIT_BUREAU_WEEK"] == 0) & (df["AMT_REQ_CREDIT_BUREAU_MON"] == 0) & 464 | (df["AMT_REQ_CREDIT_BUREAU_YEAR"] > 0), "APP_NEW_REQ"] = 2 465 | # soruşturma olmayanlar 466 | df.loc[(pd.isna(df["APP_NEW_REQ"])), "APP_NEW_REQ"] = 0 467 | 468 | # eski grup yeni feature ları 469 | df['DAYS_EMPLOYED'].replace(365243, np.nan, inplace=True) 470 | df['NEW_DAYS_EMPLOYED_PERC'] = df['DAYS_EMPLOYED'] / df['DAYS_BIRTH'] 471 | df['NEW_INCOME_PER_PERSON'] = df['AMT_INCOME_TOTAL'] / df['CNT_FAM_MEMBERS'] 472 | 473 | df['NEW_EXT_RESOURCE_3_CREDIT_TO_GOODS_RATIO'] = df['EXT_SOURCE_3'] / (df['AMT_CREDIT'] / df['AMT_GOODS_PRICE']) 474 | df['NEW_EXT_RESOURCE_2_CREDIT_TO_GOODS_RATIO'] = df['EXT_SOURCE_2'] / (df['AMT_CREDIT'] / df['AMT_GOODS_PRICE']) 475 | df['NEW_EXT_RESOURCE_1_CREDIT_TO_GOODS_RATIO'] = df['EXT_SOURCE_1'] / (df['AMT_CREDIT'] / df['AMT_GOODS_PRICE']) 476 | 477 | df.drop("index", axis=1, inplace=True) 478 | 479 | df.columns = pd.Index(["APP_" + col for col in df.columns.tolist()]) 480 | 481 | df.rename(columns={"APP_SK_ID_CURR": "SK_ID_CURR"}, inplace=True) 482 | 483 | df.rename(columns={"APP_TARGET": "TARGET"}, inplace=True) 484 | 485 | return df 486 | 487 | 488 | def previous_application(): 489 | conn = pymysql.connect(host='35.228.28.142', port=int(63306), user='group2', passwd='123654', db='home_credit') 490 | df_prev = pd.read_sql_query("SELECT * FROM previous_application", conn) 491 | df_prev = reduce_mem_usage(df_prev) 492 | pd.set_option('display.max_columns', 500) 493 | pd.set_option('display.max_rows', 500) 494 | df_prev = df_prev.sample(1000) 495 | 496 | # Features that has outliers 497 | feat_outlier = ["AMT_ANNUITY", "AMT_APPLICATION", "AMT_CREDIT", "AMT_DOWN_PAYMENT", "AMT_GOODS_PRICE", 498 | "SELLERPLACE_AREA"] 499 | 500 | # Replacing the outliers of the features with their own upper values 501 | for var in feat_outlier: 502 | Q1 = df_prev[var].quantile(0.01) 503 | Q3 = df_prev[var].quantile(0.99) 504 | IQR = Q3 - Q1 505 | lower = Q1 - 1.5 * IQR 506 | upper = Q3 + 1.5 * IQR 507 | 508 | df_prev[var][(df_prev[var] > upper)] = upper 509 | 510 | # 365243 value will be replaced by NaN in the following features 511 | feature_replace = ['DAYS_FIRST_DRAWING', 'DAYS_FIRST_DUE', 'DAYS_LAST_DUE_1ST_VERSION', 'DAYS_LAST_DUE', 512 | 'DAYS_TERMINATION'] 513 | 514 | for var in feature_replace: 515 | df_prev[var].replace(365243, np.nan, inplace=True) 516 | 517 | # One hot encoding 518 | categorical_columns = [col for col in df_prev.columns if df_prev[col].dtype == 'object'] 519 | df_prev = pd.get_dummies(df_prev, columns=categorical_columns, dummy_na=True) 520 | 521 | # Creating new features 522 | 523 | df_prev['APP_CREDIT_PERC'] = df_prev['AMT_APPLICATION'] / df_prev['AMT_CREDIT'] 524 | df_prev['NEW_CREDIT_TO_ANNUITY_RATIO'] = df_prev['AMT_CREDIT'] / df_prev['AMT_ANNUITY'] 525 | 
df_prev['NEW_DOWN_PAYMENT_TO_CREDIT'] = df_prev['AMT_DOWN_PAYMENT'] / df_prev['AMT_CREDIT'] 526 | df_prev['NEW_TOTAL_PAYMENT'] = df_prev['AMT_ANNUITY'] * df_prev['CNT_PAYMENT'] 527 | df_prev['NEW_TOTAL_PAYMENT_TO_AMT_CREDIT'] = df_prev['NEW_TOTAL_PAYMENT'] / df_prev['AMT_CREDIT'] 528 | # Simplified interest rate of the previous application: total repaid over principal, minus 1, spread over the number of payments 529 | 530 | df_prev['SIMPLE_INTERESTS'] = (df_prev['NEW_TOTAL_PAYMENT'] / df_prev['AMT_CREDIT'] - 1) / df_prev['CNT_PAYMENT'] 531 | 532 | # Previous applications numeric features 533 | num_aggregations = {} 534 | num_cols = df_prev.select_dtypes(exclude=['object']) 535 | num_cols.drop(['SK_ID_PREV', 'SK_ID_CURR'], axis=1, inplace=True) 536 | 537 | for num in num_cols: 538 | num_aggregations[num] = ['min', 'max', 'mean', 'var', 'sum'] 539 | 540 | # Previous applications categoric features 541 | cat_aggregations = {} 542 | for i in df_prev.columns: 543 | if df_prev[i].dtypes == "O": 544 | cat_aggregations[i] = ['mean'] 545 | 546 | prev_agg = df_prev.groupby('SK_ID_CURR').agg({**num_aggregations, **cat_aggregations}) 547 | prev_agg.columns = pd.Index(['PREV_' + e[0] + "_" + e[1].upper() for e in prev_agg.columns.tolist()]) 548 | 549 | # Dropping features with small variance 550 | features_with_small_variance = prev_agg.columns[(prev_agg.std(axis=0) < .1).values] 551 | prev_agg.drop(features_with_small_variance, axis=1, inplace=True) 552 | prev_agg.reset_index(inplace=True) 553 | 554 | return prev_agg 555 | 556 | 557 | def credit_card_balance(): 558 | conn = pymysql.connect(host='35.228.28.142', port=int(63306), user='group2', passwd='123654', db='home_credit') 559 | ccb = pd.read_sql_query("SELECT * FROM credit_card_balance", conn) 560 | 561 | ccb = reduce_mem_usage(ccb) 562 | ccb = ccb.sample(1000)  # note: the rest of the function runs on a 1,000-row sample 563 | 564 | ccb = ccb.groupby('SK_ID_CURR').agg(['mean']) 565 | # flatten the MultiIndex columns produced by .agg(['mean']) 566 | ccb.columns = pd.Index( 567 | ['CC_' + e[0] + "_" + e[1].upper() for e in ccb.columns.tolist()]) 568 | 569 | # new feature1: calculating the rate of balance (loan) to the credit card limit 570 | ccb["CC_NEW_LOAN_TO_CREDIT_LIMIT_RATE"] = (ccb["CC_AMT_BALANCE_MEAN"] + 1) / ( 571 | ccb["CC_AMT_CREDIT_LIMIT_ACTUAL_MEAN"] + 1) 572 | 573 | # new feature2: at what rate the customer paid the loan: CC_AMT_PAYMENT_TOTAL_CURRENT_MEAN / 574 | # CC_AMT_TOTAL_RECEIVABLE_MEAN: CC_PAID_AMOUNT_RATE 575 | ccb["CC_NEW_PAID_AMOUNT_RATE"] = (ccb["CC_AMT_PAYMENT_TOTAL_CURRENT_MEAN"] + 1) / ( 576 | ccb["CC_AMT_TOTAL_RECEIVABLE_MEAN"] + 1) * 100 577 | 578 | # new feature3: how much money the customer withdrew on average from the ATM per drawing: AMOUNT PER ATM DRAWING 579 | ccb["CC_NEW_AMT_PER_ATM_DRAWING_MEAN"] = (ccb["CC_AMT_DRAWINGS_ATM_CURRENT_MEAN"] + 1) / ( 580 | ccb["CC_CNT_DRAWINGS_ATM_CURRENT_MEAN"] + 1) 581 | 582 | # new feature4: how much money the customer withdrew on average from POS per drawing: AMOUNT PER POS DRAWING 583 | ccb["CC_NEW_AMT_PER_POS_DRAWING_MEAN"] = (ccb["CC_AMT_DRAWINGS_POS_CURRENT_MEAN"] + 1) / ( 584 | ccb["CC_CNT_DRAWINGS_POS_CURRENT_MEAN"] + 1) 585 | 586 | ccb = pd.concat([ccb.loc[:, "CC_NEW_LOAN_TO_CREDIT_LIMIT_RATE"], 587 | ccb.loc[:, "CC_NEW_PAID_AMOUNT_RATE"], 588 | ccb.loc[:, "CC_AMT_CREDIT_LIMIT_ACTUAL_MEAN"], 589 | ccb.loc[:, "CC_AMT_PAYMENT_CURRENT_MEAN"], 590 | ccb.loc[:, "CC_MONTHS_BALANCE_MEAN"], 591 | ccb.loc[:, "CC_CNT_INSTALMENT_MATURE_CUM_MEAN"], 592 | ccb.loc[:, "CC_AMT_INST_MIN_REGULARITY_MEAN"], 593 | ccb.loc[:, "CC_AMT_DRAWINGS_ATM_CURRENT_MEAN"], 594 | ccb.loc[:, "CC_AMT_DRAWINGS_POS_CURRENT_MEAN"], 595 | ccb.loc[:,
"CC_CNT_DRAWINGS_ATM_CURRENT_MEAN"], 596 | ccb.loc[:, "CC_CNT_DRAWINGS_POS_CURRENT_MEAN"], 597 | ccb.loc[:, "CC_NEW_AMT_PER_ATM_DRAWING_MEAN"], 598 | ccb.loc[:, "CC_NEW_AMT_PER_POS_DRAWING_MEAN"]], axis=1) 599 | 600 | return ccb 601 | 602 | 603 | def prepare_instalment_payment(): 604 | conn = pymysql.connect(host='35.228.28.142', port=int(63306), user='group2', passwd='123654', db='home_credit') 605 | df_installments_payments = pd.read_sql_query("SELECT * FROM installments_payments", conn) 606 | 607 | df_installments_payments = reduce_mem_usage(df_installments_payments) 608 | 609 | # O anki taksitin yuzde kaci odendi 610 | df_installments_payments[['AMT_PAYMENT']] = df_installments_payments[['AMT_PAYMENT']].fillna(value=0) 611 | df_installments_payments['NEW_INSTALMENT_PAYMENT_RATE'] = df_installments_payments['AMT_PAYMENT'] / \ 612 | df_installments_payments['AMT_INSTALMENT'] * 100 613 | 614 | # O anki taksit son odeme gununden kac gun once odenmis. Bu degisken "NEW_INSTALMENT_PAYMENT_STATUS" degerini bulabilmek icin gecici olusturulur. 615 | df_installments_payments['NEW_DAY_BEFORE_END_DATE'] = df_installments_payments['DAYS_INSTALMENT'] - \ 616 | df_installments_payments['DAYS_ENTRY_PAYMENT'] 617 | 618 | df_installments_payments["NEW_INSTALMENT_PAYMENT_STATUS"] = "No Payment" 619 | df_installments_payments.loc[ 620 | df_installments_payments['NEW_DAY_BEFORE_END_DATE'] == 0, "NEW_INSTALMENT_PAYMENT_STATUS"] = "In Time" 621 | df_installments_payments.loc[ 622 | df_installments_payments['NEW_DAY_BEFORE_END_DATE'] > 0, "NEW_INSTALMENT_PAYMENT_STATUS"] = "Early" 623 | df_installments_payments.loc[ 624 | df_installments_payments['NEW_DAY_BEFORE_END_DATE'] < 0, "NEW_INSTALMENT_PAYMENT_STATUS"] = "Late" 625 | 626 | df_installments_payments["NEW_INS_IS_LATE"] = "No" 627 | df_installments_payments.loc[df_installments_payments['NEW_DAY_BEFORE_END_DATE'] < 0, "NEW_INS_IS_LATE"] = "Yes" 628 | # Iki siniftan olustugu icin LabelEncoding yapilir. 
629 | df_installments_payments = apply_label_encoding(df_installments_payments, ["NEW_INS_IS_LATE"]) 630 | 631 | df_installments_payments.drop(columns=['NEW_DAY_BEFORE_END_DATE'], inplace=True) 632 | 633 | df_installments_payments, ip_cat = apply_one_hot_encoding(df_installments_payments) 634 | 635 | ip_aggregations = { 636 | 'NUM_INSTALMENT_VERSION': ['max'], 637 | 'NUM_INSTALMENT_NUMBER': ['max'], 638 | 'AMT_INSTALMENT': ['sum'], 639 | 'AMT_PAYMENT': ['sum'], 640 | 'NEW_INSTALMENT_PAYMENT_RATE': ['min', 'max', 'mean'], 641 | 'NEW_INS_IS_LATE': ['mean', 'sum'] 642 | } 643 | 644 | for col in ip_cat: 645 | ip_aggregations[col] = ['mean'] 646 | 647 | df_ip_agg = df_installments_payments.groupby(['SK_ID_CURR']).agg(ip_aggregations) 648 | 649 | df_ip_agg.columns = pd.Index([e[0] + "_" + e[1].upper() for e in df_ip_agg.columns.tolist()]) 650 | 651 | return df_ip_agg 652 | 653 | 654 | def prepare_pos_cash_balance(): 655 | conn = pymysql.connect(host='35.228.28.142', port=int(63306), user='group2', passwd='123654', db='home_credit') 656 | df_pos_cash_balance = pd.read_sql_query("SELECT * FROM POS_CASH_balance", conn) 657 | 658 | df_pos_cash_balance, pcb_cat = apply_one_hot_encoding(df_pos_cash_balance) 659 | 660 | pcb_aggregations = { 661 | 'SK_ID_PREV': ['min', 'max', 'mean', 'count'], 662 | 'MONTHS_BALANCE': ['min', 'max'], 663 | 'CNT_INSTALMENT': ['min', 'max', 'mean'], 664 | 'CNT_INSTALMENT_FUTURE': ['min', 'max', 'mean'], 665 | 'SK_DPD': ['max', 'mean'], 666 | 'SK_DPD_DEF': ['max', 'mean'] 667 | } 668 | 669 | for col in pcb_cat: 670 | pcb_aggregations[col] = ['mean'] 671 | 672 | df_pcb_agg = df_pos_cash_balance.groupby(['SK_ID_CURR']).agg(pcb_aggregations) 673 | df_pcb_agg.columns = pd.Index([e[0] + "_" + e[1].upper() for e in df_pcb_agg.columns.tolist()]) 674 | 675 | return df_pcb_agg 676 | 677 | 678 | def installment_payment_main(): 679 | df_ip_agg = prepare_instalment_payment()  # final installments_payments table 680 | 681 | df_pcb_agg = prepare_pos_cash_balance()  # final POS_CASH_balance table 682 | 683 | df_pos_ins = df_ip_agg.join(df_pcb_agg, how='inner', 684 | on=['SK_ID_CURR'])  # installments_payments joined with POS_CASH_balance 685 | 686 | return df_ip_agg, df_pcb_agg, df_pos_ins 687 | 688 | 689 | def pre_processing_and_combine(): 690 | with timer("Process application train"): 691 | df = application_train_g() 692 | print("application train & test shape:", df.shape) 693 | 694 | with timer("Bureau and Bureau Balance"): 695 | df_final = bureau_and_bureau_balance_features() 696 | print("Bureau and Bureau Balance:", df_final.shape) 697 | 698 | with timer("Installment Payments"): 699 | df_ip_agg, df_pcb_agg, df_pos_ins = installment_payment_main() 700 | print("Installment Payments", df_ip_agg.shape) 701 | 702 | with timer("Pos Cash Balance"): 703 | print("Pos Cash Balance:", df_pcb_agg.shape) 704 | 705 | with timer("Credit Card Balance"): 706 | ccb = credit_card_balance() 707 | print("Credit Card Balance:", ccb.shape) 708 | 709 | with timer("previous_application"): 710 | prev_agg = previous_application() 711 | print("previous_application:", prev_agg.shape) 712 | 713 | with timer("Combining all tables"): 714 | df = df.merge(df_final, how="left", on="SK_ID_CURR") 715 | df1 = df.merge(df_ip_agg, how='left', on='SK_ID_CURR') 716 | df2 = df1.merge(df_pcb_agg, how='left', on='SK_ID_CURR') 717 | df3 = df2.merge(ccb, how='left', on='SK_ID_CURR') 718 | all_df = df3.merge(prev_agg, how='left', on='SK_ID_CURR') 719 | 720 | print("all_df process:", all_df.shape) 721 | 722 | return all_df 723 |
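# (Editor's note) The `modeling` function below fills exactly one out-of-fold (OOF) prediction per training
# row and averages the test-set predictions over the folds; the "Full AUC" it prints is the AUC of those OOF
# predictions. A minimal, self-contained sketch of that pattern on synthetic data, assuming only scikit-learn;
# every name in it is illustrative and not part of this pipeline:
def _oof_auc_sketch():
    import numpy as np
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import roc_auc_score
    from sklearn.model_selection import KFold

    # synthetic binary-classification data
    rng = np.random.RandomState(0)
    X = rng.normal(size=(500, 5))
    y = (X[:, 0] + rng.normal(scale=0.5, size=500) > 0).astype(int)

    oof = np.zeros(len(y))
    for train_idx, valid_idx in KFold(n_splits=5, shuffle=True, random_state=1001).split(X):
        clf = LogisticRegression().fit(X[train_idx], y[train_idx])
        # each row is predicted exactly once, by a model that never saw it during fitting
        oof[valid_idx] = clf.predict_proba(X[valid_idx])[:, 1]
    return roc_auc_score(y, oof)  # the analogue of the "Full AUC score" printed below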
724 | 725 | def modeling(all_data): 726 | all_data = all_data.rename(columns=lambda x: re.sub('[^A-Za-z0-9_]+', '', x)) 727 | 728 | train_df = all_data[all_data['TARGET'].notnull()] 729 | test_df = all_data[all_data['TARGET'].isnull()] 730 | 731 | folds = KFold(n_splits=10, shuffle=True, random_state=1001) 732 | 733 | oof_preds = np.zeros(train_df.shape[0]) 734 | sub_preds = np.zeros(test_df.shape[0]) 735 | feature_importance_df = pd.DataFrame() 736 | 737 | feats = [f for f in train_df.columns if f not in ['TARGET', 'SK_ID_CURR']] 738 | 739 | for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])): 740 | train_x, train_y = train_df[feats].iloc[train_idx], train_df['TARGET'].iloc[train_idx] 741 | 742 | valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx] 743 | 744 | clf = LGBMClassifier( 745 | n_jobs=-1, 746 | n_estimators=10000, 747 | learning_rate=0.02, 748 | num_leaves=34, 749 | colsample_bytree=0.9497036, 750 | subsample=0.8715623, 751 | max_depth=8, 752 | reg_alpha=0.041545473, 753 | reg_lambda=0.0735294, 754 | min_split_gain=0.0222415, 755 | min_child_weight=39.3259775, 756 | silent=-1, 757 | verbose=-1, ) 758 | 759 | clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], 760 | eval_metric='auc', verbose=200, early_stopping_rounds=200) 761 | 762 | # y_pred_valid 763 | oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1] 764 | sub_preds += clf.predict_proba(test_df[feats], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits 765 | 766 | fold_importance_df = pd.DataFrame() 767 | fold_importance_df["feature"] = feats 768 | fold_importance_df["importance"] = clf.feature_importances_ 769 | fold_importance_df["fold"] = n_fold + 1 770 | feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0) 771 | 772 | print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx]))) 773 | 774 | print('Full AUC score %.6f' % roc_auc_score(train_df['TARGET'], oof_preds))  # y_pred_valid 775 | 776 | test_df['TARGET'] = sub_preds 777 | test_df[['SK_ID_CURR', 'TARGET']].to_csv("outputs/predictions/atilla_muhammet.csv", index=False) 778 | 779 | display_importances(feature_importance_df) 780 | 781 | return feature_importance_df 782 | 783 | 784 | def main(): 785 | with timer("Preprocessing Time"): 786 | all_data = pre_processing_and_combine() 787 | 788 | with timer("Modeling"): 789 | feat_importance = modeling(all_data) 790 | 791 | 792 | if __name__ == "__main__": 793 | with timer("Full model run"): 794 | main() 795 | -------------------------------------------------------------------------------- /models/dsmlbc2/merve_betul.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "pycharm": { 7 | "name": "#%% md\n" 8 | } 9 | }, 10 | "source": [ 11 | "Libraries" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "metadata": { 18 | "pycharm": { 19 | "is_executing": false, 20 | "name": "#%% \n" 21 | } 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "import numpy as np\n", 26 | "import pandas as pd\n", 27 | "import gc\n", 28 | "import time\n", 29 | "from contextlib import contextmanager\n", 30 | "from lightgbm import LGBMClassifier\n", 31 | "from sklearn.metrics import roc_auc_score\n", 32 | "from sklearn.model_selection import KFold, StratifiedKFold\n", 33 | "from sklearn.preprocessing
import LabelEncoder\n", 34 | "import matplotlib.pyplot as plt\n", 35 | "import seaborn as sns\n", 36 | "import warnings\n", 37 | "warnings.simplefilter(action='ignore', category=FutureWarning)" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": { 43 | "pycharm": { 44 | "name": "#%% md\n" 45 | } 46 | }, 47 | "source": [ 48 | "Time function for tracking run times of functions" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 2, 54 | "metadata": { 55 | "pycharm": { 56 | "is_executing": false, 57 | "name": "#%% \n" 58 | } 59 | }, 60 | "outputs": [], 61 | "source": [ 62 | "@contextmanager\n", 63 | "def timer(title):\n", 64 | " t0 = time.time()\n", 65 | " yield\n", 66 | " print(\"{} - done in {:.0f}s\".format(title, time.time() - t0))\n" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": { 72 | "pycharm": { 73 | "name": "#%% md\n" 74 | } 75 | }, 76 | "source": [ 77 | "One-hot encoding function for categorical variables with get_dummies" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 3, 83 | "metadata": { 84 | "pycharm": { 85 | "is_executing": false, 86 | "name": "#%%\n" 87 | } 88 | }, 89 | "outputs": [], 90 | "source": [ 91 | "def one_hot_encoder(df, nan_as_category = True):\n", 92 | " original_columns = list(df.columns) # col names as string in a list \n", 93 | " categorical_columns = [col for col in df.columns if df[col].dtype == 'object'] #categorical col names\n", 94 | " df = pd.get_dummies(df, columns = categorical_columns, dummy_na = nan_as_category) #creating dummies\n", 95 | " new_columns = [c for c in df.columns if c not in original_columns] #new col names\n", 96 | " return df, new_columns\n", 97 | "\n", 98 | "def label_encoder(df):\n", 99 | " # Create a label encoder object\n", 100 | " le = LabelEncoder()\n", 101 | " le_count = 0\n", 102 | "\n", 103 | " # Iterate through the columns\n", 104 | " for col in df:\n", 105 | " if df[col].dtype == 'object':\n", 106 | " # If 2 or fewer unique categories\n", 107 | " if len(list(df[col].unique())) <= 2:\n", 108 | " # Train on the training data\n", 109 | " le.fit(df[col])\n", 110 | " # Transform both training and testing data\n", 111 | " df[col] = le.transform(df[col])\n", 112 | "\n", 113 | " # Keep track of how many columns were label encoded\n", 114 | " le_count += 1\n", 115 | "\n", 116 | " print('%d columns were label encoded.' 
% le_count)\n", 117 | " \n", 118 | " return df" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": { 124 | "pycharm": { 125 | "name": "#%% md\n" 126 | } 127 | }, 128 | "source": [ 129 | "Preprocess application_train and application_test" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 4, 135 | "metadata": { 136 | "jupyter": { 137 | "outputs_hidden": false 138 | }, 139 | "pycharm": { 140 | "is_executing": false, 141 | "name": "#%%\n" 142 | } 143 | }, 144 | "outputs": [], 145 | "source": [ 146 | "\n", 147 | "# Preprocess application_train.csv and application_test.csv\n", 148 | "def application_train_test(num_rows = None, nan_as_category = False):\n", 149 | " df = pd.read_csv(\"data/application_train.csv\", nrows = num_rows)\n", 150 | " test_df = pd.read_csv(\"data//application_test.csv\", nrows = num_rows)\n", 151 | "\n", 152 | " df = df.append(test_df).reset_index()\n", 153 | " del df[\"index\"]\n", 154 | " \n", 155 | " df = df[df['CODE_GENDER'] != 'XNA']\n", 156 | " df['DAYS_EMPLOYED'].replace(365243, np.nan, inplace= True)\n", 157 | "\n", 158 | " df['APP_NEW_AGE'] = df['DAYS_BIRTH'] / (- 365.25)\n", 159 | " \n", 160 | " APP_NEW_AGE_CAT = pd.Series([\"Young\", \"Adult 1\",\"Adult 2\",\"Adult 3\", \"Adult 4\"], dtype = \"object\")\n", 161 | " df[\"APP_NEW_AGE_CAT\"] = APP_NEW_AGE_CAT\n", 162 | " df.loc[(df[\"APP_NEW_AGE\"] > 20.0) & (df[\"APP_NEW_AGE\"] <= 30.0), \"APP_NEW_AGE_CAT\"] = APP_NEW_AGE_CAT[0]\n", 163 | " df.loc[(df[\"APP_NEW_AGE\"] > 30.0) & (df[\"APP_NEW_AGE\"] <= 40.0), \"APP_NEW_AGE_CAT\"] = APP_NEW_AGE_CAT[1]\n", 164 | " df.loc[(df[\"APP_NEW_AGE\"] > 40.0) & (df[\"APP_NEW_AGE\"] <= 50.0), \"APP_NEW_AGE_CAT\"] = APP_NEW_AGE_CAT[2]\n", 165 | " df.loc[(df[\"APP_NEW_AGE\"] > 50.0) & (df[\"APP_NEW_AGE\"] <= 60.0), \"APP_NEW_AGE_CAT\"] = APP_NEW_AGE_CAT[3]\n", 166 | " df.loc[df[\"APP_NEW_AGE\"] > 60 ,\"APP_NEW_AGE_CAT\"] = APP_NEW_AGE_CAT[4]\n", 167 | " \n", 168 | " df[\"APP_NEW_AGE_DAYS_EMP\"] = df[\"DAYS_EMPLOYED\"] / (- 365.25)\n", 169 | " df[\"APP_NEW_AGE_WORK_PERCENT\"] = (df[\"APP_NEW_AGE_DAYS_EMP\"] / df['APP_NEW_AGE']) * 100\n", 170 | " df['APP_NEW_CREDIT_INCOME_PERCENT'] = df['AMT_CREDIT'] / df['AMT_INCOME_TOTAL']\n", 171 | " df['APP_NEW_ANNUITY_INCOME_PERCENT'] = df['AMT_ANNUITY'] / df['AMT_INCOME_TOTAL']\n", 172 | " df['APP_NEW_DAYS_EMPLOYED_PERCENT'] = df['DAYS_EMPLOYED'] / df['DAYS_BIRTH']\n", 173 | " df['APP_NEW_INCOME_PER_PERSON'] = df['AMT_INCOME_TOTAL'] / df['CNT_FAM_MEMBERS']\n", 174 | " df['APP_NEW_INCOME_CREDIT_PERC'] = df['AMT_INCOME_TOTAL'] / df['AMT_CREDIT']\n", 175 | " df['APP_NEW_PAYMENT_RATE'] = df['AMT_ANNUITY'] / df['AMT_CREDIT']\n", 176 | " df['APP_NEW_AMT_PAY_YEAR'] = df['AMT_CREDIT'] / df['AMT_ANNUITY'] \n", 177 | " df['APP_NEW_AGE_PAYOFF'] = df['APP_NEW_AGE'] + df['APP_NEW_AMT_PAY_YEAR']\n", 178 | " df['APP_NEW_AMT_DIFF_CREDIT_GOODS'] = df['AMT_CREDIT'] - df['AMT_GOODS_PRICE'] \n", 179 | " df['APP_NEW_AMT_CREDIT_GOODS_PERC'] = ((df['AMT_GOODS_PRICE'] / df['AMT_CREDIT']) * 100)\n", 180 | " df['APP_NEW_CNT_ADULT'] = df['CNT_FAM_MEMBERS'] - df['CNT_CHILDREN']\n", 181 | " df['APP_NEW_EXT_SOURCES_MEAN'] = df[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].mean(axis=1)\n", 182 | "\n", 183 | " df = label_encoder(df)\n", 184 | " \n", 185 | " df, cat_cols = one_hot_encoder(df)\n", 186 | "\n", 187 | " del test_df\n", 188 | " gc.collect()\n", 189 | " return df" 190 | ] 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "metadata": { 195 | "pycharm": { 196 | "name": "#%% md\n" 197 | } 198 | }, 199 | 
"source": [ 200 | "#Preprocess bureau.csv and bureau_balance.csv" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": 5, 206 | "metadata": { 207 | "pycharm": { 208 | "is_executing": false, 209 | "name": "#%%\n" 210 | } 211 | }, 212 | "outputs": [], 213 | "source": [ 214 | "# Preprocess bureau.csv and bureau_balance.csv\n", 215 | "def bureau_and_balance(num_rows = None, nan_as_category = True):\n", 216 | " bureau = pd.read_csv('data/bureau.csv', nrows = num_rows)\n", 217 | " bureau_balance = pd.read_csv('data/bureau_balance.csv', nrows = num_rows)\n", 218 | " \n", 219 | " \n", 220 | " # Bureau balance: Perform aggregations and merge with bureau.csv\n", 221 | " def _status_to_int(status):\n", 222 | " if status in ['X', 'C']:\n", 223 | " return 0\n", 224 | " if pd.isnull(status):\n", 225 | " return np.nan\n", 226 | " return int(status)\n", 227 | "\n", 228 | " bureau_balance['NEW_BUREAU_BALANCE_DPD_LEVEL'] = bureau_balance['STATUS'].apply(_status_to_int)\n", 229 | " bureau_balance['NEW_BUREAU_BALANCE_STATUS_UNKNOW'] = (bureau_balance['STATUS'] == 'X').astype(int) \n", 230 | "\n", 231 | " bureau_balance[\"MONTHS_BALANCE\"] = (-1*bureau_balance[\"MONTHS_BALANCE\"])+1\n", 232 | "\n", 233 | " bb_aggregations = {'MONTHS_BALANCE': [\"max\"],\n", 234 | " 'NEW_BUREAU_BALANCE_DPD_LEVEL':['sum', 'mean', 'max', 'std', 'skew'],\n", 235 | " 'NEW_BUREAU_BALANCE_STATUS_UNKNOW':['sum', 'mean']}\n", 236 | "\n", 237 | " bb_agg = bureau_balance.groupby('SK_ID_BUREAU').agg(bb_aggregations)\n", 238 | "\n", 239 | " bb_agg.columns = pd.Index([e[0] + \"_\" + e[1].upper() for e in bb_agg.columns.tolist()])\n", 240 | " bureau = bureau.join(bb_agg, how='left', on='SK_ID_BUREAU')\n", 241 | " bureau.drop(['SK_ID_BUREAU'], axis=1, inplace= True)\n", 242 | " del bureau_balance, bb_agg\n", 243 | " gc.collect()\n", 244 | " \n", 245 | " # Bureau new features\n", 246 | " bureau.drop([\"AMT_CREDIT_SUM_LIMIT\",\"AMT_CREDIT_SUM_OVERDUE\",\"CREDIT_DAY_OVERDUE\",\"AMT_CREDIT_SUM_OVERDUE\"],axis=1,inplace=True)\n", 247 | " bureau['BUREAU_CREDIT_TYPE_CONSUMER'] = (bureau['CREDIT_TYPE'] == 'Consumer credit').astype(int)\n", 248 | " bureau['BUREAU_CREDIT_TYPE_CAR'] = (bureau['CREDIT_TYPE'] == 'Car loan').astype(int)\n", 249 | " bureau['BUREAU_CREDIT_TYPE_MORTGAGE'] = (bureau['CREDIT_TYPE'] == 'Mortgage').astype(int)\n", 250 | " bureau['BUREAU_CREDIT_TYPE_CREDIT_CARD'] = (bureau['CREDIT_TYPE'] == 'Credit card').astype(int)\n", 251 | " bureau['BUREAU_CREDIT_TYPE_OTHER'] = (~(bureau['CREDIT_TYPE'].isin(['Consumer credit',\n", 252 | " 'Car loan', 'Mortgage', 'Credit card']))).astype(int)\n", 253 | " bureau['BUREAU_UNUSUAL_CURRENCY'] = (~(bureau['CREDIT_CURRENCY'] == 'currency 1')).astype(int)\n", 254 | " bureau['NEW_PAYMENT_RATE_SUM'] = bureau['AMT_ANNUITY'] / bureau['AMT_CREDIT_SUM']\n", 255 | " bureau['NEW_PAYMENT_RATE_SUM_DEBT'] = bureau['AMT_ANNUITY'] / bureau['AMT_CREDIT_SUM_DEBT']\n", 256 | " bureau['NEW_PAYMENT_RATE_AMT_CREDIT_MAX_OVERDUE'] = bureau['AMT_ANNUITY'] / bureau['AMT_CREDIT_MAX_OVERDUE']\n", 257 | " \n", 258 | " bureau.drop([\"CREDIT_TYPE\",\"CREDIT_CURRENCY\"],axis=1,inplace=True)\n", 259 | " # Bureau and bureau_balance numeric features\n", 260 | " num_aggregations = {\n", 261 | " \"DAYS_CREDIT\": ['min', 'max', 'mean', 'var', 'sum'],\n", 262 | " 'DAYS_CREDIT_ENDDATE': ['min', 'max', 'mean'],\n", 263 | " 'DAYS_CREDIT_UPDATE': ['mean'],\n", 264 | " 'AMT_CREDIT_SUM': ['max', 'mean', 'sum'],\n", 265 | " 'AMT_CREDIT_SUM_DEBT': ['max', 'mean', 'sum'],\n", 266 | " 'AMT_CREDIT_MAX_OVERDUE': ['mean'],\n", 
267 | " 'DAYS_ENDDATE_FACT': ['mean', 'sum'],\n", 268 | " 'AMT_ANNUITY': ['max', 'mean'],\n", 269 | " 'CNT_CREDIT_PROLONG': ['sum','std'],\n", 270 | " 'MONTHS_BALANCE_MAX': ['max'],\n", 271 | " \"NEW_BUREAU_BALANCE_DPD_LEVEL_SUM\" :['max',\"sum\"],\n", 272 | " \"NEW_BUREAU_BALANCE_DPD_LEVEL_MEAN\" :['max',\"sum\",\"mean\"],\n", 273 | " \"NEW_BUREAU_BALANCE_DPD_LEVEL_MAX\" :['max',\"sum\"],\n", 274 | " \"NEW_BUREAU_BALANCE_DPD_LEVEL_STD\" :['max',\"sum\",\"std\"],\n", 275 | " \"NEW_BUREAU_BALANCE_DPD_LEVEL_SKEW\" :['max',\"sum\",\"skew\"],\n", 276 | " \"NEW_BUREAU_BALANCE_STATUS_UNKNOW_SUM\" :['max',\"sum\"],\n", 277 | " \"NEW_BUREAU_BALANCE_STATUS_UNKNOW_MEAN\" :['max',\"sum\",\"mean\"],\n", 278 | " 'BUREAU_CREDIT_TYPE_CONSUMER': ['mean', 'sum'],\n", 279 | " 'BUREAU_CREDIT_TYPE_CAR': ['mean', 'sum'],\n", 280 | " 'BUREAU_CREDIT_TYPE_MORTGAGE': ['mean', 'sum'],\n", 281 | " 'BUREAU_CREDIT_TYPE_CREDIT_CARD': ['mean', 'sum'],\n", 282 | " 'BUREAU_CREDIT_TYPE_OTHER': ['mean', 'sum'],\n", 283 | " 'BUREAU_UNUSUAL_CURRENCY': ['mean', 'sum'],\n", 284 | " 'NEW_PAYMENT_RATE_SUM':['max',\"mean\",\"sum\"],\n", 285 | " 'NEW_PAYMENT_RATE_SUM_DEBT':['max',\"mean\",\"sum\"],\n", 286 | " 'NEW_PAYMENT_RATE_AMT_CREDIT_MAX_OVERDUE':['max',\"mean\",\"sum\"]\n", 287 | " }\n", 288 | " # Bureau and bureau_balance categorical features\n", 289 | " bureau, bureau_cat = one_hot_encoder(bureau, nan_as_category)\n", 290 | " cat_aggregations = {}\n", 291 | " for cat in bureau_cat: cat_aggregations[cat] = ['mean']\n", 292 | " \n", 293 | " \n", 294 | " bureau_agg = bureau.groupby('SK_ID_CURR').agg({**num_aggregations, **cat_aggregations})\n", 295 | " bureau_agg.columns = pd.Index(['BURO_' + e[0] + \"_\" + e[1].upper() for e in bureau_agg.columns.tolist()])\n", 296 | " # Bureau: Active credits - using only numerical aggregations\n", 297 | " active = bureau[bureau['CREDIT_ACTIVE_Active'] == 1]\n", 298 | " active_agg = active.groupby('SK_ID_CURR').agg(num_aggregations)\n", 299 | " active_agg.columns = pd.Index(['ACTIVE_' + e[0] + \"_\" + e[1].upper() for e in active_agg.columns.tolist()])\n", 300 | " bureau_agg = bureau_agg.join(active_agg, how='left', on='SK_ID_CURR')\n", 301 | " del active, active_agg\n", 302 | " gc.collect()\n", 303 | " # Bureau: Closed credits - using only numerical aggregations\n", 304 | " closed = bureau[bureau['CREDIT_ACTIVE_Closed'] == 1]\n", 305 | " closed_agg = closed.groupby('SK_ID_CURR').agg(num_aggregations)\n", 306 | " closed_agg.columns = pd.Index(['CLOSED_' + e[0] + \"_\" + e[1].upper() for e in closed_agg.columns.tolist()])\n", 307 | " bureau_agg = bureau_agg.join(closed_agg, how='left', on='SK_ID_CURR')\n", 308 | " del closed, closed_agg, bureau\n", 309 | " gc.collect()\n", 310 | " return bureau_agg" 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": 6, 316 | "metadata": { 317 | "pycharm": { 318 | "is_executing": false, 319 | "name": "#%%\n" 320 | } 321 | }, 322 | "outputs": [], 323 | "source": [ 324 | "\n", 325 | "# Preprocess previous_applications.csv\n", 326 | "def previous_applications(num_rows = None, nan_as_category = True):\n", 327 | " \n", 328 | " df = pd.read_csv(\"data/previous_application.csv\", nrows = num_rows)\n", 329 | " \n", 330 | " df.replace(365243,np.nan,inplace = True)\n", 331 | " df.replace(\"XNA\",np.nan,inplace = True)\n", 332 | "\n", 333 | " df['NEW_RETURN_DAY'] = df['DAYS_DECISION'] + df['CNT_PAYMENT'] * 30\n", 334 | "\n", 335 | " df['NEW_DAYS_TERMINATION_diff'] = df['DAYS_TERMINATION'] - df['NEW_RETURN_DAY']\n", 336 | "\n", 337 | " 
df['NEW_AMT_DOWN_PAYMENT_rate'] = df['AMT_DOWN_PAYMENT'] / (df['AMT_CREDIT'] + 0.01)\n", 338 | "\n", 339 | " df['NEW_AMT_SPEND_TO_PRODUCT'] = df['AMT_GOODS_PRICE'] / df['AMT_CREDIT']\n", 340 | " \n", 341 | " df['NEW_DAYS_DUE']=df['DAYS_FIRST_DUE'] - df['DAYS_LAST_DUE_1ST_VERSION'] \n", 342 | " \n", 343 | " df['NEW_APP_CREDIT_PERC'] = df['AMT_APPLICATION'] / df['AMT_CREDIT']\n", 344 | " \n", 345 | " df[\"NAME_PAYMENT_TYPE\"].replace([\"Non-cash from your account\",\"Cashless from the account of the employer\"],np.nan,inplace=True)\n", 346 | "\n", 347 | " a = [\"Channel of corporate sales\",\"Car dealer\"]\n", 348 | " df[\"CHANNEL_TYPE\"].replace(a,\"Others_Type\",inplace=True)\n", 349 | "\n", 350 | " b = ['Family', 'Spouse, partner', 'Children', 'Other_B', 'Other_A', 'Group of people'] \n", 351 | " df[\"NAME_TYPE_SUITE\"] = df[\"NAME_TYPE_SUITE\"].replace(b, 'not_alone')\n", 352 | "\n", 353 | " df[\"WEEKDAY_APPR_PROCESS_START\"] = df[\"WEEKDAY_APPR_PROCESS_START\"].replace(['MONDAY','TUESDAY', 'WEDNESDAY','THURSDAY','FRIDAY'], 'WEEK_DAY') \n", 354 | " df[\"WEEKDAY_APPR_PROCESS_START\"] = df[\"WEEKDAY_APPR_PROCESS_START\"].replace(['SATURDAY', 'SUNDAY'], 'WEEKEND')\n", 355 | "\n", 356 | " a = ['Auto technology', 'Jewelry', 'MLM partners', 'Tourism'] \n", 357 | " df[\"NAME_SELLER_INDUSTRY\"] = df[\"NAME_SELLER_INDUSTRY\"].replace(a, 'Other_Ind')\n", 358 | "\n", 359 | " a = ['Auto Accessories', 'Jewelry', 'Homewares', 'Medical Supplies', 'Vehicles', 'Sport and Leisure','Gardening', 'Other', 'Office Appliances', 'Tourism', 'Medicine', 'Direct Sales', 'Fitness', 'Additional Service','Education', 'Weapon', 'Insurance', 'House Construction', 'Animals'] \n", 360 | " df[\"NAME_GOODS_CATEGORY\"] = df[\"NAME_GOODS_CATEGORY\"].replace(a, 'Other_Cat')\n", 361 | "\n", 362 | " a = ['Buying a used car','Building a house or an annex','Everyday expenses','Medicine','Payments on other loans','Education','Journey', 'Purchase of electronic equipment','Buying a new car','Wedding / gift / holiday','Buying a home','Car repairs','Furniture','Buying a holiday home / land', 'Business development','Gasification / water supply','Buying a garage','Hobby','Money for a third person','Refusal to name the goal','Urgent needs','Other']\n", 363 | " df['NAME_CASH_LOAN_PURPOSE']= df['NAME_CASH_LOAN_PURPOSE'].replace(a,'Others')\n", 364 | "\n", 365 | " df[\"NAME_PORTFOLIO\"].replace(\"cars\",np.nan,inplace=True)\n", 366 | " \n", 367 | " a = [8,9,10,11,12,13,14,15,16,17]\n", 368 | " df[\"HOUR_APPR_PROCESS_START\"] = df[\"HOUR_APPR_PROCESS_START\"].replace(a, 'Working_Hours')\n", 369 | "\n", 370 | " b = [18,19,20,21,22,23,0,1,2,3,4,5,6,7]\n", 371 | " df[\"HOUR_APPR_PROCESS_START\"] = df[\"HOUR_APPR_PROCESS_START\"].replace(b, 'Off_Hours')\n", 372 | " \n", 373 | " drops = [\"RATE_INTEREST_PRIMARY\",\"RATE_INTEREST_PRIVILEGED\",\"FLAG_LAST_APPL_PER_CONTRACT\",\"NFLAG_LAST_APPL_IN_DAY\",\"NAME_PRODUCT_TYPE\",\"SELLERPLACE_AREA\"]\n", 374 | " df.drop(drops,inplace=True,axis=1)\n", 375 | " \n", 376 | " df[\"NFLAG_INSURED_ON_APPROVAL\"] = df[\"NFLAG_INSURED_ON_APPROVAL\"].astype(\"object\")\n", 377 | " cat_features = list(df.select_dtypes(['object']).columns)\n", 378 | " df = pd.get_dummies(df, columns= cat_features, dummy_na= True,drop_first=True)\n", 379 | " \n", 380 | " agg1 = {'SK_ID_CURR': ['size'],\n", 381 | " 'AMT_ANNUITY': ['max', 'min', 'mean','std', 'sum'], \n", 382 | " 'AMT_APPLICATION':['max', 'min', 'mean','std', 'sum'],\n", 383 | " 'AMT_CREDIT':['max', 'min', 'mean','std', 'sum'],\n", 384 | " 'AMT_DOWN_PAYMENT': ['max', 
'min', 'mean','std', 'sum'],\n", 385 | " 'AMT_GOODS_PRICE': ['max', 'min', 'mean','std', 'sum'],\n", 386 | " 'RATE_DOWN_PAYMENT': ['max', 'min', 'mean','std'],\n", 387 | " 'DAYS_DECISION': ['max', 'min', 'mean', 'sum'],\n", 388 | " 'CNT_PAYMENT': ['max', 'min', 'mean','std', 'sum'],\n", 389 | " 'DAYS_FIRST_DRAWING': ['max', 'min', 'mean', 'sum'],\n", 390 | " 'DAYS_FIRST_DUE': ['max', 'min', 'mean', 'sum'],\n", 391 | " 'DAYS_LAST_DUE_1ST_VERSION': ['max', 'min', 'mean', 'sum'],\n", 392 | " 'DAYS_LAST_DUE': ['max', 'min', 'mean', 'sum'],\n", 393 | " 'DAYS_TERMINATION': ['max', 'min', 'mean','std', 'sum'],\n", 394 | " 'NEW_RETURN_DAY': ['max', 'min', 'mean','std', 'sum'],\n", 395 | " 'NEW_DAYS_TERMINATION_diff': ['max', 'min', 'mean','std', 'sum'],\n", 396 | " 'NEW_AMT_DOWN_PAYMENT_rate': ['max', 'min', 'mean','std'],\n", 397 | " 'NEW_AMT_SPEND_TO_PRODUCT': ['max', 'min', 'mean','std', 'sum'],\n", 398 | " 'NEW_APP_CREDIT_PERC': ['max', 'min', 'mean'],\n", 399 | " 'NAME_CONTRACT_TYPE_Consumer loans': ['max', 'min','sum'],\n", 400 | " 'NAME_CONTRACT_TYPE_Revolving loans': ['max', 'min','sum'],\n", 401 | " 'NAME_CONTRACT_TYPE_nan': ['max', 'min','sum'],\n", 402 | " 'WEEKDAY_APPR_PROCESS_START_WEEK_DAY': ['max', 'min', 'sum'],\n", 403 | " 'WEEKDAY_APPR_PROCESS_START_nan': ['max', 'min', 'sum'],\n", 404 | " 'HOUR_APPR_PROCESS_START_Working_Hours': ['max', 'min', 'sum'],\n", 405 | " 'HOUR_APPR_PROCESS_START_nan': ['max', 'min', 'sum'],\n", 406 | " 'NAME_CASH_LOAN_PURPOSE_Repairs': ['max', 'min', 'sum'],\n", 407 | " 'NAME_CASH_LOAN_PURPOSE_XAP': ['max', 'min', 'sum'],\n", 408 | " 'NAME_CASH_LOAN_PURPOSE_nan': ['max', 'min', 'sum'],\n", 409 | " 'NAME_CONTRACT_STATUS_Canceled': ['max', 'min', 'sum'],\n", 410 | " 'NAME_CONTRACT_STATUS_Refused': ['max', 'min', 'sum'],\n", 411 | " 'NAME_CONTRACT_STATUS_Unused offer': ['max', 'min', 'sum'],\n", 412 | " 'NAME_CONTRACT_STATUS_nan': ['max', 'min', 'sum'],\n", 413 | " 'NAME_PAYMENT_TYPE_nan': ['max', 'min', 'sum'],\n", 414 | " 'CODE_REJECT_REASON_HC': ['max', 'min','sum'],\n", 415 | " 'CODE_REJECT_REASON_LIMIT': ['max', 'min','sum'],\n", 416 | " 'CODE_REJECT_REASON_SCO': ['max', 'min','sum'],\n", 417 | " 'CODE_REJECT_REASON_SCOFR': ['max', 'min', 'sum'],\n", 418 | " #'CODE_REJECT_REASON_SYSTEM': ['max', 'min', 'sum'],\n", 419 | " 'CODE_REJECT_REASON_VERIF': ['max', 'min', 'sum'],\n", 420 | " 'CODE_REJECT_REASON_XAP': ['max', 'min', 'sum'],\n", 421 | " 'CODE_REJECT_REASON_nan': ['max', 'min','sum'],\n", 422 | " 'NAME_TYPE_SUITE_not_alone': ['max', 'min','sum'],\n", 423 | " 'NAME_TYPE_SUITE_nan': ['max', 'min', 'sum'],\n", 424 | " 'NAME_CLIENT_TYPE_Refreshed': ['max', 'min','sum'],\n", 425 | " 'NAME_CLIENT_TYPE_Repeater': ['max', 'min', 'sum'],\n", 426 | " 'NAME_CLIENT_TYPE_nan': ['max', 'min','sum'],\n", 427 | " 'NAME_GOODS_CATEGORY_Clothing and Accessories': ['max', 'min', 'sum'],\n", 428 | " 'NAME_GOODS_CATEGORY_Computers': ['max', 'min','sum'],\n", 429 | " 'NAME_GOODS_CATEGORY_Construction Materials': ['max', 'min', 'sum'],\n", 430 | " 'NAME_GOODS_CATEGORY_Consumer Electronics': ['max', 'min', 'sum'],\n", 431 | " 'NAME_GOODS_CATEGORY_Furniture': ['max', 'min', 'sum'],\n", 432 | " 'NAME_GOODS_CATEGORY_Mobile': ['max', 'min', 'sum'],\n", 433 | " 'NAME_GOODS_CATEGORY_Other_Cat': ['max', 'min', 'sum'],\n", 434 | " 'NAME_GOODS_CATEGORY_Photo / Cinema Equipment': ['max', 'min', 'sum'],\n", 435 | " 'NAME_GOODS_CATEGORY_nan': ['max', 'min', 'sum'],\n", 436 | " 'NAME_PORTFOLIO_Cars': ['max', 'min', 'sum'],\n", 437 | " 'NAME_PORTFOLIO_Cash': ['max', 'min', 
'sum'],\n", 438 | " 'NAME_PORTFOLIO_POS': ['max', 'min','sum'],\n", 439 | " 'NAME_PORTFOLIO_nan': ['max', 'min', 'sum'],\n", 440 | " 'CHANNEL_TYPE_Contact center': ['max', 'min', 'sum'],\n", 441 | " 'CHANNEL_TYPE_Country-wide': ['max', 'min', 'sum'],\n", 442 | " 'CHANNEL_TYPE_Credit and cash offices': ['max', 'min', 'sum'],\n", 443 | " 'CHANNEL_TYPE_Others_Type': ['max', 'min', 'sum'],\n", 444 | " 'CHANNEL_TYPE_Regional / Local': ['max', 'min','sum'],\n", 445 | " 'CHANNEL_TYPE_Stone': ['max', 'min','sum'],\n", 446 | " 'CHANNEL_TYPE_nan': ['max', 'min', 'sum'],\n", 447 | " 'NAME_SELLER_INDUSTRY_Connectivity': ['max', 'min','sum'],\n", 448 | " 'NAME_SELLER_INDUSTRY_Construction': ['max', 'min', 'sum'],\n", 449 | " 'NAME_SELLER_INDUSTRY_Consumer electronics': ['max', 'min', 'sum'],\n", 450 | " 'NAME_SELLER_INDUSTRY_Furniture': ['max', 'min', 'sum'],\n", 451 | " 'NAME_SELLER_INDUSTRY_Industry': ['max', 'min', 'sum'],\n", 452 | " 'NAME_SELLER_INDUSTRY_Other_Ind': ['max', 'min','sum'],\n", 453 | " 'NAME_SELLER_INDUSTRY_nan': ['max', 'min','sum'],\n", 454 | " 'NAME_YIELD_GROUP_low_action': ['max', 'min', 'sum'],\n", 455 | " 'NAME_YIELD_GROUP_low_normal': ['max', 'min', 'sum'],\n", 456 | " 'NAME_YIELD_GROUP_middle': ['max', 'min','sum'],\n", 457 | " 'NAME_YIELD_GROUP_nan': ['max', 'min','sum'],\n", 458 | " 'PRODUCT_COMBINATION_Card X-Sell': ['max', 'min', 'sum'],\n", 459 | " 'PRODUCT_COMBINATION_Cash': ['max', 'min', 'sum'],\n", 460 | " 'PRODUCT_COMBINATION_Cash Street: high': ['max', 'min', 'sum'],\n", 461 | " 'PRODUCT_COMBINATION_Cash Street: low': ['max', 'min','sum'],\n", 462 | " 'PRODUCT_COMBINATION_Cash Street: middle': ['max', 'min','sum'],\n", 463 | " 'PRODUCT_COMBINATION_Cash X-Sell: high': ['max', 'min','sum'],\n", 464 | " 'PRODUCT_COMBINATION_Cash X-Sell: low': ['max', 'min','sum'],\n", 465 | " 'PRODUCT_COMBINATION_Cash X-Sell: middle': ['max', 'min','sum'],\n", 466 | " 'PRODUCT_COMBINATION_POS household with interest': ['max', 'min','sum'],\n", 467 | " 'PRODUCT_COMBINATION_POS household without interest': ['max', 'min','sum'],\n", 468 | " 'PRODUCT_COMBINATION_POS industry with interest': ['max', 'min','sum'],\n", 469 | " 'PRODUCT_COMBINATION_POS industry without interest': ['max', 'min','sum'],\n", 470 | " 'PRODUCT_COMBINATION_POS mobile with interest': ['max', 'min','sum'],\n", 471 | " 'PRODUCT_COMBINATION_POS mobile without interest': ['max', 'min','sum'],\n", 472 | " 'PRODUCT_COMBINATION_POS other with interest': ['max', 'min','sum'],\n", 473 | " 'PRODUCT_COMBINATION_POS others without interest': ['max', 'min','sum'],\n", 474 | " 'PRODUCT_COMBINATION_nan': ['max', 'min','sum'],\n", 475 | " 'NFLAG_INSURED_ON_APPROVAL_1.0': ['max', 'min','sum'],\n", 476 | " 'NFLAG_INSURED_ON_APPROVAL_nan': ['max', 'min','sum']}\n", 477 | " df = df.groupby(['SK_ID_CURR']).agg(agg1)\n", 478 | " \n", 479 | " df.columns = pd.Index(['PREV_' + e[0] + \"_\" + e[1].upper() for e in df.columns.tolist()])\n", 480 | "\n", 481 | " return df\n", 482 | "\n", 483 | "# pytest" 484 | ] 485 | }, 486 | { 487 | "cell_type": "code", 488 | "execution_count": 9, 489 | "metadata": { 490 | "pycharm": { 491 | "is_executing": false, 492 | "name": "#%%\n" 493 | } 494 | }, 495 | "outputs": [], 496 | "source": [ 497 | "# Preprocess POS_CASH_balance.csv\n", 498 | "def pos_cash(num_rows = None, nan_as_category = True):\n", 499 | " df=pd.read_csv('data/POS_CASH_balance.csv',nrows = num_rows)\n", 500 | " \n", 501 | " df['NEW_ADJOURNMENT']=df['SK_DPD']-df['SK_DPD_DEF']\n", 502 | " \n", 503 | " \n", 504 | " b = [\"Demand\",\"Returned to 
the store\",\"Approved\",\"Amortized debt\",\"Canceled\",\"XNA\"]\n", 505 | " df[\"NAME_CONTRACT_STATUS\"].replace(b, 'Others',inplace=True)\n", 506 | " \n", 507 | " \n", 508 | " cat_features = list(df.select_dtypes(['object']).columns)\n", 509 | " df = pd.get_dummies(df, columns= cat_features, dummy_na= True)\n", 510 | " \n", 511 | " \n", 512 | " agg={\n", 513 | " 'MONTHS_BALANCE': ['max',\"min\"],\n", 514 | " 'SK_DPD': ['max', 'mean',\"std\"],\n", 515 | " 'SK_DPD_DEF': ['max', 'mean',\"std\"],\n", 516 | " 'CNT_INSTALMENT':['min','mean','max'],\n", 517 | " 'CNT_INSTALMENT_FUTURE':['mean','min','max'],\n", 518 | " 'SK_ID_CURR':['max','size'],\n", 519 | " 'NEW_ADJOURNMENT':['max','mean',\"std\"],\n", 520 | " 'NAME_CONTRACT_STATUS_Active':['sum'],\n", 521 | " 'NAME_CONTRACT_STATUS_Completed':['sum'],\n", 522 | " 'NAME_CONTRACT_STATUS_Signed':['sum'],\n", 523 | " 'NAME_CONTRACT_STATUS_Others':['sum']\n", 524 | " \n", 525 | " }\n", 526 | " \n", 527 | " \n", 528 | " pos_agg = df.groupby(['SK_ID_PREV']).agg(agg)\n", 529 | " \n", 530 | " \n", 531 | " pos_agg.columns = pd.Index([e[0] + \"_\" + e[1].upper() for e in pos_agg.columns.tolist()])\n", 532 | " \n", 533 | " pos_agg[\"NEW_PAID_MONTH\"] = pos_agg[\"CNT_INSTALMENT_MAX\"] - pos_agg[\"CNT_INSTALMENT_FUTURE_MIN\"]\n", 534 | " \n", 535 | " agg2={\n", 536 | " \"MONTHS_BALANCE_MAX\":[\"min\",\"max\",\"mean\"],\n", 537 | " \"MONTHS_BALANCE_MIN\":[\"min\",\"max\",\"mean\"],\n", 538 | " \"SK_DPD_MAX\":[\"max\",\"mean\",\"min\"],\n", 539 | " \"SK_DPD_MEAN\":[\"max\",\"mean\",\"min\"],\n", 540 | " \"SK_DPD_STD\":[\"max\",\"mean\",\"min\",\"std\"],\n", 541 | " \"SK_DPD_DEF_MAX\":[\"max\",\"mean\",\"min\"],\n", 542 | " \"SK_DPD_DEF_MEAN\":[\"max\",\"mean\",\"min\"],\n", 543 | " \"SK_DPD_DEF_STD\":[\"max\",\"mean\",\"min\"],\n", 544 | " \"CNT_INSTALMENT_MIN\":[\"max\",\"mean\",\"min\"],\n", 545 | " \"CNT_INSTALMENT_MEAN\":[\"max\",\"mean\",\"min\"],\n", 546 | " \"CNT_INSTALMENT_MAX\":[\"max\",\"mean\",\"min\"],\n", 547 | " \"CNT_INSTALMENT_FUTURE_MEAN\":[\"max\",\"mean\",\"min\"],\n", 548 | " \"CNT_INSTALMENT_FUTURE_MIN\":[\"max\",\"mean\",\"min\"],\n", 549 | " \"CNT_INSTALMENT_FUTURE_MAX\":[\"max\",\"mean\",\"min\"],\n", 550 | " \"SK_ID_CURR_MAX\":[\"max\",\"min\"],\n", 551 | " \"SK_ID_CURR_SIZE\":[\"max\",\"min\"],\n", 552 | " \"NEW_ADJOURNMENT_MAX\":[\"max\",\"mean\",\"min\"],\n", 553 | " \"NEW_ADJOURNMENT_MEAN\":[\"max\",\"mean\",\"min\"],\n", 554 | " \"NEW_ADJOURNMENT_STD\":[\"max\",\"mean\",\"min\"],\n", 555 | " \"NAME_CONTRACT_STATUS_Active_SUM\":[\"max\",\"min\",\"sum\"],\n", 556 | " 'NAME_CONTRACT_STATUS_Signed_SUM':[\"max\",\"min\",\"sum\"],\n", 557 | " 'NAME_CONTRACT_STATUS_Completed_SUM':[\"max\",\"min\",\"sum\"],\n", 558 | " 'NAME_CONTRACT_STATUS_Others_SUM':[\"max\",\"min\",\"sum\"]\n", 559 | " \n", 560 | " }\n", 561 | " \n", 562 | " pos_agg2 = pos_agg.groupby([\"SK_ID_CURR_MAX\"]).agg(agg2)\n", 563 | " pos_agg2.index.names = ['SK_ID_CURR']\n", 564 | " \n", 565 | " pos_agg2.columns = pd.Index([\"POS\" + \"_\" + e[0] + \"_\" + e[1].upper() for e in pos_agg2.columns.tolist()])\n", 566 | " \n", 567 | " return pos_agg2" 568 | ] 569 | }, 570 | { 571 | "cell_type": "code", 572 | "execution_count": 10, 573 | "metadata": { 574 | "pycharm": { 575 | "is_executing": false, 576 | "name": "#%%\n" 577 | } 578 | }, 579 | "outputs": [], 580 | "source": [ 581 | "# Preprocess installments_payments.csv\n", 582 | "def installments_payments(num_rows=None, nan_as_category = True):\n", 583 | " pd.options.mode.chained_assignment = None\n", 584 | " df = 
pd.read_csv(\"data/installments_payments.csv\", nrows = num_rows)\n", 585 | " df[\"NEW_DELAY\"] = df[\"DAYS_INSTALMENT\"] - df[\"DAYS_ENTRY_PAYMENT\"] \n", 586 | " \n", 587 | " df['NEW_FLAG_DELAY'] = df['NEW_DELAY'].apply(lambda x : 1 if x < 0 else 0)\n", 588 | " df['NEW_RATIO_DELAY'] = df[['SK_ID_PREV','NEW_FLAG_DELAY']].groupby('SK_ID_PREV')['NEW_FLAG_DELAY'].transform(lambda x : x.sum() / x.count())\n", 589 | " \n", 590 | " df[\"NEW_PAYMENT_DIFF\"] = df[\"AMT_INSTALMENT\"] - df[\"AMT_PAYMENT\"]\n", 591 | " \n", 592 | " \n", 593 | " df[\"NUM_INSTALMENT_VERSION\"] = df[\"NUM_INSTALMENT_VERSION\"].astype(\"object\")\n", 594 | " df[(df[\"NUM_INSTALMENT_VERSION\"] != 1) & (df[\"NUM_INSTALMENT_VERSION\"] != 0) & (df[\"NUM_INSTALMENT_VERSION\"] != 2) & (df[\"NUM_INSTALMENT_VERSION\"] != 3)]['NUM_INSTALMENT_VERSION'] = 4\n", 595 | " \n", 596 | " cat_features = list(df.select_dtypes(['object']).columns)\n", 597 | " df = pd.get_dummies(df, columns= cat_features,drop_first=True)\n", 598 | " \n", 599 | " \n", 600 | " agg1 = {'SK_ID_CURR': ['count','max'],\n", 601 | " 'NEW_DELAY': ['max', 'min', 'mean','std', 'sum'],\n", 602 | " 'NUM_INSTALMENT_NUMBER':['min','max'], \n", 603 | " 'DAYS_INSTALMENT':['max','min','std'], \n", 604 | " 'NEW_PAYMENT_DIFF': ['max', 'mean', 'std', 'min','sum'],\n", 605 | " 'AMT_INSTALMENT': ['max', 'mean', 'sum', 'min', 'std'],\n", 606 | " 'AMT_PAYMENT': ['min', 'max', 'mean', 'sum', 'std'],\n", 607 | " 'DAYS_ENTRY_PAYMENT': ['max', 'min', 'std'],\n", 608 | " \"NUM_INSTALMENT_VERSION_1.0\":[\"sum\"],\n", 609 | " \"NUM_INSTALMENT_VERSION_2.0\":[\"sum\"],\n", 610 | " \"NUM_INSTALMENT_VERSION_3.0\":[\"sum\"],\n", 611 | " \"NUM_INSTALMENT_VERSION_4.0\":[\"sum\"]\n", 612 | " }\n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " Installments_agg = df.groupby(['SK_ID_PREV']).agg(agg1)\n", 617 | " \n", 618 | " Installments_agg.columns = pd.Index([e[0] + \"_\" + e[1].upper() for e in Installments_agg.columns.tolist()])\n", 619 | " \n", 620 | " Installments_agg['NEW_DAYS_INSTALMENT_NUMBER']=Installments_agg['DAYS_INSTALMENT_MAX']-Installments_agg['DAYS_INSTALMENT_MIN'] \n", 621 | " \n", 622 | " Installments_agg['NEW_AMT_INSTALMENT_DIFF']=Installments_agg['AMT_INSTALMENT_MAX']-Installments_agg['AMT_INSTALMENT_MIN']\n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " agg2= {'SK_ID_CURR_COUNT':['min', 'max'],\n", 627 | " 'SK_ID_CURR_MAX':['min', 'max'],\n", 628 | " 'NEW_DELAY_MAX':['min', 'max', 'mean'],\n", 629 | " 'NEW_DELAY_MIN':['min', 'max', 'mean'],\n", 630 | " 'NEW_DELAY_MEAN':['min', 'max', 'mean'],\n", 631 | " 'NEW_DELAY_STD':['min', 'max', 'mean'],\n", 632 | " 'NEW_DELAY_SUM':['min', 'max', 'mean', 'sum', 'std'],\n", 633 | " 'NUM_INSTALMENT_NUMBER_MIN':['min','max','mean'], \n", 634 | " 'NUM_INSTALMENT_NUMBER_MAX':['min','max','mean','sum'],\n", 635 | " 'NEW_DAYS_INSTALMENT_NUMBER':['min','max','std'], \n", 636 | " 'DAYS_INSTALMENT_STD':['min','max','std'], \n", 637 | " 'DAYS_INSTALMENT_MIN':['std','min','max'],\n", 638 | " 'DAYS_INSTALMENT_MAX':['std','min','max'],\n", 639 | " 'NEW_PAYMENT_DIFF_MAX':['min', 'max', 'mean',\"std\"],\n", 640 | " 'NEW_PAYMENT_DIFF_MEAN':['min', 'max', 'mean',\"std\"],\n", 641 | " 'NEW_PAYMENT_DIFF_SUM':['min', 'max', 'mean',\"std\"],\n", 642 | " 'NEW_PAYMENT_DIFF_STD':['min', 'max', 'mean',\"std\"],\n", 643 | " 'NEW_PAYMENT_DIFF_MIN':['min', 'max', 'mean',\"std\"],\n", 644 | " 'AMT_INSTALMENT_MAX':['min', 'max', 'mean',\"sum\"],\n", 645 | " 'AMT_INSTALMENT_MEAN':['min', 'max', 'mean',\"sum\"],\n", 646 | " 'AMT_INSTALMENT_SUM':['min', 'max', 
'mean',\"sum\"],\n", 647 | " 'AMT_INSTALMENT_STD':['min', 'max', 'mean',\"sum\"],\n", 648 | " 'AMT_INSTALMENT_MIN':['min', 'max', 'mean',\"sum\"],\n", 649 | " 'NEW_AMT_INSTALMENT_DIFF':['min','max','mean',\"sum\"],\n", 650 | " 'AMT_PAYMENT_MIN':['min', 'max', 'mean',\"std\",\"sum\"],\n", 651 | " 'AMT_PAYMENT_MAX':['min', 'max', 'mean',\"std\",\"sum\"],\n", 652 | " 'AMT_PAYMENT_MEAN':['min', 'max', 'mean',\"std\",\"sum\"],\n", 653 | " 'AMT_PAYMENT_STD':['min', 'max', 'mean',\"std\",\"sum\"],\n", 654 | " 'AMT_PAYMENT_SUM':['min', 'max', 'mean',\"std\",\"sum\"],\n", 655 | " 'DAYS_ENTRY_PAYMENT_MIN':['min', 'max', 'mean'],\n", 656 | " 'DAYS_ENTRY_PAYMENT_STD':['min', 'max', 'mean'],\n", 657 | " 'DAYS_ENTRY_PAYMENT_MAX':['min', 'max', 'mean'],\n", 658 | " 'NUM_INSTALMENT_VERSION_1.0_SUM':['sum'],\n", 659 | " 'NUM_INSTALMENT_VERSION_2.0_SUM':['sum'],\n", 660 | " 'NUM_INSTALMENT_VERSION_3.0_SUM':['sum'],\n", 661 | " 'NUM_INSTALMENT_VERSION_4.0_SUM':['sum']\n", 662 | " }\n", 663 | " \n", 664 | " Installments_agg2=Installments_agg.groupby('SK_ID_CURR_MAX').agg(agg2)\n", 665 | " Installments_agg2.index.names = ['SK_ID_CURR']\n", 666 | " \n", 667 | " \n", 668 | " Installments_agg2.columns = pd.Index(\"INSTAL_\" + e[0] + \"_\" + e[1].upper() for e in Installments_agg2.columns.tolist())\n", 669 | " return Installments_agg2\n" 670 | ] 671 | }, 672 | { 673 | "cell_type": "code", 674 | "execution_count": 11, 675 | "metadata": { 676 | "pycharm": { 677 | "is_executing": false, 678 | "name": "#%%\n" 679 | } 680 | }, 681 | "outputs": [], 682 | "source": [ 683 | "# Preprocess credit_card_balance.csv\n", 684 | "def credit_card_balance(num_rows = None, nan_as_category = True):\n", 685 | " cc = pd.read_csv('data/credit_card_balance.csv', nrows = num_rows)\n", 686 | " cc, cat_cols = one_hot_encoder(cc, nan_as_category= True)\n", 687 | " # General aggregations\n", 688 | " cc.drop(['SK_ID_PREV'], axis= 1, inplace = True)\n", 689 | " cc['number_of_instalments'] = cc.groupby(\n", 690 | " by=['SK_ID_CURR'])['CNT_INSTALMENT_MATURE_CUM'].agg('max').reset_index()[\n", 691 | " 'CNT_INSTALMENT_MATURE_CUM']\n", 692 | " cc['AMT_DRAWINGS_ATM_CURRENT'][cc['AMT_DRAWINGS_ATM_CURRENT'] < 0] = np.nan\n", 693 | " cc['AMT_DRAWINGS_CURRENT'][cc['AMT_DRAWINGS_CURRENT'] < 0] = np.nan\n", 694 | " cc_agg = cc.groupby('SK_ID_CURR').agg(['min', 'max', 'mean', 'sum', 'var'])\n", 695 | " cc_agg.columns = pd.Index(['CC_' + e[0] + \"_\" + e[1].upper() for e in cc_agg.columns.tolist()])\n", 696 | " # Count credit card lines\n", 697 | " cc_agg['CC_COUNT'] = cc.groupby('SK_ID_CURR').size()\n", 698 | " del cc\n", 699 | " gc.collect()\n", 700 | " return cc_agg\n" 701 | ] 702 | }, 703 | { 704 | "cell_type": "code", 705 | "execution_count": 12, 706 | "metadata": { 707 | "pycharm": { 708 | "is_executing": false, 709 | "name": "#%%\n" 710 | } 711 | }, 712 | "outputs": [], 713 | "source": [ 714 | "# LightGBM GBDT with KFold or Stratified KFold\n", 715 | "# Parameters from Tilii kernel: https://www.kaggle.com/tilii7/olivier-lightgbm-parameters-by-bayesian-opt/code\n", 716 | "def kfold_lightgbm(df, num_folds, stratified = False, debug= False):\n", 717 | " import re\n", 718 | " df = df.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))\n", 719 | " # Divide in training/validation and test data\n", 720 | " train_df = df[df['TARGET'].notnull()]\n", 721 | " test_df = df[df['TARGET'].isnull()]\n", 722 | " print(\"Starting LightGBM. 
703 | {
704 | "cell_type": "code",
705 | "execution_count": 12,
706 | "metadata": {
707 | "pycharm": {
708 | "is_executing": false,
709 | "name": "#%%\n"
710 | }
711 | },
712 | "outputs": [],
713 | "source": [
714 | "# LightGBM GBDT with KFold or Stratified KFold\n",
715 | "# Parameters from Tilii kernel: https://www.kaggle.com/tilii7/olivier-lightgbm-parameters-by-bayesian-opt/code\n",
716 | "def kfold_lightgbm(df, num_folds, stratified = False, debug= False):\n",
717 | "    import re\n",
718 | "    df = df.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))\n",
719 | "    # Divide in training/validation and test data\n",
720 | "    train_df = df[df['TARGET'].notnull()]\n",
721 | "    test_df = df[df['TARGET'].isnull()]\n",
722 | "    print(\"Starting LightGBM. Train shape: {}, test shape: {}\".format(train_df.shape, test_df.shape))\n",
723 | "    del df\n",
724 | "    gc.collect()\n",
725 | "    # Cross validation model\n",
726 | "    if stratified:\n",
727 | "        folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=1001)\n",
728 | "    else:\n",
729 | "        folds = KFold(n_splits= num_folds, shuffle=True, random_state=1001)\n",
730 | "    # Create arrays and dataframes to store results\n",
731 | "    oof_preds = np.zeros(train_df.shape[0])\n",
732 | "    sub_preds = np.zeros(test_df.shape[0])\n",
733 | "    feature_importance_df = pd.DataFrame()\n",
734 | "    feats = [f for f in train_df.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']]\n",
735 | "    \n",
736 | "    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])):\n",
737 | "        train_x, train_y = train_df[feats].iloc[train_idx], train_df['TARGET'].iloc[train_idx]\n",
738 | "        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx]\n",
739 | "\n",
740 | "        # LightGBM parameters found by Bayesian optimization\n",
741 | "        clf = LGBMClassifier(\n",
742 | "            n_jobs = -1,  # was 'njobs', a typo LightGBM would silently ignore\n",
743 | "            n_estimators=10000,\n",
744 | "            learning_rate=0.02,\n",
745 | "            num_leaves=34,\n",
746 | "            colsample_bytree=0.9497036,\n",
747 | "            subsample=0.8715623,\n",
748 | "            max_depth=8,\n",
749 | "            reg_alpha=0.041545473,\n",
750 | "            reg_lambda=0.0735294,\n",
751 | "            min_split_gain=0.0222415,\n",
752 | "            min_child_weight=39.3259775,\n",
753 | "            silent=-1,\n",
754 | "            verbose=-1, )\n",
755 | "\n",
756 | "        clf.fit(train_x, train_y, eval_set = [(train_x, train_y), (valid_x, valid_y)], \n",
757 | "            eval_metric = 'auc', verbose = 300, early_stopping_rounds = 200)\n",
758 | "\n",
759 | "        oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]\n",
760 | "        sub_preds += clf.predict_proba(test_df[feats], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits\n",
761 | "\n",
762 | "        fold_importance_df = pd.DataFrame()\n",
763 | "        fold_importance_df[\"feature\"] = feats\n",
764 | "        fold_importance_df[\"importance\"] = clf.feature_importances_\n",
765 | "        fold_importance_df[\"fold\"] = n_fold + 1\n",
766 | "        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)\n",
767 | "        print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))\n",
768 | "        del clf, train_x, train_y, valid_x, valid_y\n",
769 | "        gc.collect()\n",
770 | "\n",
771 | "    print('Full AUC score %.6f' % roc_auc_score(train_df['TARGET'], oof_preds))\n",
772 | "    # Write submission file and plot feature importance\n",
773 | "    if not debug:\n",
774 | "        test_df['TARGET'] = sub_preds\n",
775 | "        test_df[['SK_ID_CURR', 'TARGET']].to_csv(submission_file_name, index= False)\n",
776 | "    display_importances(feature_importance_df)\n",
777 | "    return feature_importance_df\n"
778 | ]
779 | },
780 | {
781 | "cell_type": "code",
782 | "execution_count": 13,
783 | "metadata": {
784 | "pycharm": {
785 | "is_executing": false,
786 | "name": "#%%\n"
787 | }
788 | },
789 | "outputs": [],
790 | "source": [
791 | "\n",
792 | "# Display/plot feature importance\n",
793 | "def display_importances(feature_importance_df_):\n",
794 | "    cols = feature_importance_df_[[\"feature\", \"importance\"]].groupby(\"feature\").mean().sort_values(by=\"importance\", ascending=False)[:40].index\n",
795 | "    best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]\n",
796 | "    plt.figure(figsize=(8, 10))\n",
797 | "    
sns.barplot(x=\"importance\", y=\"feature\", data=best_features.sort_values(by=\"importance\", ascending=False))\n", 798 | " plt.title('LightGBM Features (avg over folds)')\n", 799 | " plt.tight_layout()\n", 800 | " plt.savefig('lgbm_importances01.png')" 801 | ] 802 | }, 803 | { 804 | "cell_type": "code", 805 | "execution_count": null, 806 | "metadata": { 807 | "pycharm": { 808 | "is_executing": false, 809 | "name": "#%%\n" 810 | } 811 | }, 812 | "outputs": [ 813 | { 814 | "name": "stdout", 815 | "output_type": "stream", 816 | "text": [ 817 | "4 columns were label encoded.\n", 818 | "Bureau df shape: (305811, 191)\n", 819 | "Process bureau and bureau_balance - done in 523s\n", 820 | "Previous applications df shape: (338856, 316)\n", 821 | "Process previous_applications - done in 79s\n", 822 | "Pos-cash balance df shape: (337252, 68)\n", 823 | "Process POS-CASH balance - done in 48s\n", 824 | "Installments payments df shape: (339587, 122)\n", 825 | "Process installments payments - done in 585s\n", 826 | "Credit card balance df shape: (103558, 146)\n", 827 | "Process credit card balance - done in 35s\n", 828 | "Starting LightGBM. Train shape: (307507, 1117), test shape: (48744, 1117)\n", 829 | "Training until validation scores don't improve for 200 rounds\n", 830 | "[300]\ttraining's auc: 0.811262\ttraining's binary_logloss: 0.228348\tvalid_1's auc: 0.785853\tvalid_1's binary_logloss: 0.24296\n", 831 | "[600]\ttraining's auc: 0.837004\ttraining's binary_logloss: 0.217781\tvalid_1's auc: 0.793038\tvalid_1's binary_logloss: 0.240394\n", 832 | "[900]\ttraining's auc: 0.854723\ttraining's binary_logloss: 0.210303\tvalid_1's auc: 0.795312\tvalid_1's binary_logloss: 0.239707\n", 833 | "[1200]\ttraining's auc: 0.86938\ttraining's binary_logloss: 0.203926\tvalid_1's auc: 0.796109\tvalid_1's binary_logloss: 0.239448\n", 834 | "[1500]\ttraining's auc: 0.881774\ttraining's binary_logloss: 0.19816\tvalid_1's auc: 0.796389\tvalid_1's binary_logloss: 0.239438\n", 835 | "Early stopping, best iteration is:\n", 836 | "[1360]\ttraining's auc: 0.876153\ttraining's binary_logloss: 0.200834\tvalid_1's auc: 0.796395\tvalid_1's binary_logloss: 0.23939\n", 837 | "Fold 1 AUC : 0.796395\n", 838 | "Training until validation scores don't improve for 200 rounds\n", 839 | "[300]\ttraining's auc: 0.811257\ttraining's binary_logloss: 0.229009\tvalid_1's auc: 0.785097\tvalid_1's binary_logloss: 0.238063\n", 840 | "[600]\ttraining's auc: 0.836687\ttraining's binary_logloss: 0.218527\tvalid_1's auc: 0.791366\tvalid_1's binary_logloss: 0.235552\n", 841 | "[900]\ttraining's auc: 0.854237\ttraining's binary_logloss: 0.211137\tvalid_1's auc: 0.793311\tvalid_1's binary_logloss: 0.234739\n", 842 | "[1200]\ttraining's auc: 0.868395\ttraining's binary_logloss: 0.204903\tvalid_1's auc: 0.793761\tvalid_1's binary_logloss: 0.234514\n", 843 | "[1500]\ttraining's auc: 0.881034\ttraining's binary_logloss: 0.199079\tvalid_1's auc: 0.794216\tvalid_1's binary_logloss: 0.234371\n", 844 | "[1800]\ttraining's auc: 0.892325\ttraining's binary_logloss: 0.193587\tvalid_1's auc: 0.79437\tvalid_1's binary_logloss: 0.234313\n", 845 | "Early stopping, best iteration is:\n", 846 | "[1817]\ttraining's auc: 0.892948\ttraining's binary_logloss: 0.19327\tvalid_1's auc: 0.79441\tvalid_1's binary_logloss: 0.234299\n", 847 | "Fold 2 AUC : 0.794410\n", 848 | "Training until validation scores don't improve for 200 rounds\n", 849 | "[300]\ttraining's auc: 0.812067\ttraining's binary_logloss: 0.228295\tvalid_1's auc: 0.774185\tvalid_1's binary_logloss: 
0.243945\n", 850 | "[600]\ttraining's auc: 0.838057\ttraining's binary_logloss: 0.217674\tvalid_1's auc: 0.781359\tvalid_1's binary_logloss: 0.241458\n", 851 | "[900]\ttraining's auc: 0.855459\ttraining's binary_logloss: 0.21034\tvalid_1's auc: 0.783278\tvalid_1's binary_logloss: 0.240845\n", 852 | "[1200]\ttraining's auc: 0.869922\ttraining's binary_logloss: 0.203992\tvalid_1's auc: 0.7842\tvalid_1's binary_logloss: 0.240608\n", 853 | "[1500]\ttraining's auc: 0.882433\ttraining's binary_logloss: 0.198129\tvalid_1's auc: 0.784853\tvalid_1's binary_logloss: 0.24043\n", 854 | "[1800]\ttraining's auc: 0.893632\ttraining's binary_logloss: 0.192698\tvalid_1's auc: 0.785133\tvalid_1's binary_logloss: 0.240421\n", 855 | "Early stopping, best iteration is:\n", 856 | "[1636]\ttraining's auc: 0.88773\ttraining's binary_logloss: 0.195601\tvalid_1's auc: 0.785005\tvalid_1's binary_logloss: 0.240401\n", 857 | "Fold 3 AUC : 0.785005\n", 858 | "Training until validation scores don't improve for 200 rounds\n", 859 | "[300]\ttraining's auc: 0.81111\ttraining's binary_logloss: 0.228961\tvalid_1's auc: 0.785304\tvalid_1's binary_logloss: 0.238185\n", 860 | "[600]\ttraining's auc: 0.83684\ttraining's binary_logloss: 0.218454\tvalid_1's auc: 0.792566\tvalid_1's binary_logloss: 0.235349\n", 861 | "[900]\ttraining's auc: 0.854772\ttraining's binary_logloss: 0.210942\tvalid_1's auc: 0.794335\tvalid_1's binary_logloss: 0.234542\n", 862 | "[1200]\ttraining's auc: 0.869597\ttraining's binary_logloss: 0.204469\tvalid_1's auc: 0.794757\tvalid_1's binary_logloss: 0.234228\n", 863 | "[1500]\ttraining's auc: 0.881997\ttraining's binary_logloss: 0.19879\tvalid_1's auc: 0.795144\tvalid_1's binary_logloss: 0.23402\n", 864 | "[1800]\ttraining's auc: 0.893607\ttraining's binary_logloss: 0.193246\tvalid_1's auc: 0.794941\tvalid_1's binary_logloss: 0.234077\n", 865 | "Early stopping, best iteration is:\n", 866 | "[1704]\ttraining's auc: 0.89002\ttraining's binary_logloss: 0.194985\tvalid_1's auc: 0.795307\tvalid_1's binary_logloss: 0.233958\n" 867 | ] 868 | } 869 | ], 870 | "source": [ 871 | "def main(debug = False):\n", 872 | " num_rows = 10000 if debug else None\n", 873 | " df = application_train_test(num_rows)\n", 874 | " with timer(\"Process bureau and bureau_balance\"):\n", 875 | " bureau = bureau_and_balance(num_rows)\n", 876 | " print(\"Bureau df shape:\", bureau.shape)\n", 877 | " df = df.join(bureau, how='left', on='SK_ID_CURR')\n", 878 | " del bureau\n", 879 | " gc.collect()\n", 880 | " with timer(\"Process previous_applications\"):\n", 881 | " prev = previous_applications(num_rows)\n", 882 | " print(\"Previous applications df shape:\", prev.shape)\n", 883 | " df = df.join(prev, how='left', on='SK_ID_CURR')\n", 884 | " del prev\n", 885 | " gc.collect()\n", 886 | " with timer(\"Process POS-CASH balance\"):\n", 887 | " pos = pos_cash(num_rows)\n", 888 | " print(\"Pos-cash balance df shape:\", pos.shape)\n", 889 | " df = df.join(pos, how='left', on='SK_ID_CURR')\n", 890 | " del pos\n", 891 | " gc.collect()\n", 892 | " with timer(\"Process installments payments\"):\n", 893 | " ins = installments_payments(num_rows)\n", 894 | " print(\"Installments payments df shape:\", ins.shape)\n", 895 | " df = df.join(ins, how='left', on='SK_ID_CURR')\n", 896 | " del ins\n", 897 | " gc.collect()\n", 898 | " with timer(\"Process credit card balance\"):\n", 899 | " cc = credit_card_balance(num_rows)\n", 900 | " print(\"Credit card balance df shape:\", cc.shape)\n", 901 | " df = df.join(cc, how='left', on='SK_ID_CURR')\n", 902 | " del 
cc\n", 903 | " gc.collect()\n", 904 | " with timer(\"Run LightGBM with kfold\"):\n", 905 | " feat_importance = kfold_lightgbm(df, num_folds= 10, stratified= False, debug= debug)\n", 906 | "\n", 907 | "if __name__ == \"__main__\":\n", 908 | " submission_file_name = \"outputs/predictions/merve_betul.csv\"\n", 909 | " with timer(\"Full model run\"):\n", 910 | " main(debug=True)" 911 | ] 912 | }, 913 | { 914 | "cell_type": "code", 915 | "execution_count": null, 916 | "metadata": { 917 | "pycharm": { 918 | "is_executing": false, 919 | "name": "#%% Notes\n" 920 | } 921 | }, 922 | "outputs": [], 923 | "source": [ 924 | "# !pip install lightgbm=='2.1.2'\n", 925 | "# lightgbm.__version__\n", 926 | "\n", 927 | "# Full AUC score 0.793601 : pos,installments ve posrevious degistirildiginde\n" 928 | ] 929 | }, 930 | { 931 | "cell_type": "code", 932 | "execution_count": null, 933 | "metadata": {}, 934 | "outputs": [], 935 | "source": [ 936 | "# 3309s" 937 | ] 938 | }, 939 | { 940 | "cell_type": "code", 941 | "execution_count": null, 942 | "metadata": {}, 943 | "outputs": [], 944 | "source": [] 945 | }, 946 | { 947 | "cell_type": "code", 948 | "execution_count": null, 949 | "metadata": {}, 950 | "outputs": [], 951 | "source": [] 952 | } 953 | ], 954 | "metadata": { 955 | "kernelspec": { 956 | "display_name": "Python 3", 957 | "language": "python", 958 | "name": "python3" 959 | }, 960 | "language_info": { 961 | "codemirror_mode": { 962 | "name": "ipython", 963 | "version": 3 964 | }, 965 | "file_extension": ".py", 966 | "mimetype": "text/x-python", 967 | "name": "python", 968 | "nbconvert_exporter": "python", 969 | "pygments_lexer": "ipython3", 970 | "version": "3.7.6" 971 | } 972 | }, 973 | "nbformat": 4, 974 | "nbformat_minor": 4 975 | } -------------------------------------------------------------------------------- /models/dsmlbc1/homeCreditRiskFinal.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Libraries" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 20, 13 | "metadata": {}, 14 | "outputs": [ 15 | { 16 | "ename": "ImportError", 17 | "evalue": "Something is wrong with the numpy installation. While importing we detected an older version of numpy in ['/Users/mvahit/anaconda3/lib/python3.7/site-packages/numpy']. 
One method of fixing this is to repeatedly uninstall numpy until none is found, then reinstall this version.", 18 | "output_type": "error", 19 | "traceback": [ 20 | "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m", 21 | "\u001B[0;31mImportError\u001B[0m Traceback (most recent call last)", 22 | "\u001B[0;32m\u001B[0m in \u001B[0;36m\u001B[0;34m\u001B[0m\n\u001B[0;32m----> 1\u001B[0;31m \u001B[0;32mimport\u001B[0m \u001B[0mnumpy\u001B[0m \u001B[0;32mas\u001B[0m \u001B[0mnp\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 2\u001B[0m \u001B[0;32mimport\u001B[0m \u001B[0mpandas\u001B[0m \u001B[0;32mas\u001B[0m \u001B[0mpd\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 3\u001B[0m \u001B[0;32mimport\u001B[0m \u001B[0mlightgbm\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 4\u001B[0m \u001B[0;32mimport\u001B[0m \u001B[0mgc\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 5\u001B[0m \u001B[0;32mimport\u001B[0m \u001B[0mtime\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n", 23 | "\u001B[0;32m~/anaconda3/lib/python3.7/site-packages/numpy/__init__.py\u001B[0m in \u001B[0;36m\u001B[0;34m\u001B[0m\n\u001B[1;32m 140\u001B[0m \u001B[0;32mfrom\u001B[0m \u001B[0;34m.\u001B[0m \u001B[0;32mimport\u001B[0m \u001B[0m_distributor_init\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 141\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m--> 142\u001B[0;31m \u001B[0;32mfrom\u001B[0m \u001B[0;34m.\u001B[0m \u001B[0;32mimport\u001B[0m \u001B[0mcore\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 143\u001B[0m \u001B[0;32mfrom\u001B[0m \u001B[0;34m.\u001B[0m\u001B[0mcore\u001B[0m \u001B[0;32mimport\u001B[0m \u001B[0;34m*\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 144\u001B[0m \u001B[0;32mfrom\u001B[0m \u001B[0;34m.\u001B[0m \u001B[0;32mimport\u001B[0m \u001B[0mcompat\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n", 24 | "\u001B[0;32m~/anaconda3/lib/python3.7/site-packages/numpy/core/__init__.py\u001B[0m in \u001B[0;36m\u001B[0;34m\u001B[0m\n\u001B[1;32m 72\u001B[0m \u001B[0;34m\"numpy in {}. One method of fixing this is to repeatedly uninstall \"\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 73\u001B[0m \"numpy until none is found, then reinstall this version.\")\n\u001B[0;32m---> 74\u001B[0;31m \u001B[0;32mraise\u001B[0m \u001B[0mImportError\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mmsg\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mformat\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mpath\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 75\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 76\u001B[0m \u001B[0;32mfrom\u001B[0m \u001B[0;34m.\u001B[0m \u001B[0;32mimport\u001B[0m \u001B[0mnumerictypes\u001B[0m \u001B[0;32mas\u001B[0m \u001B[0mnt\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n", 25 | "\u001B[0;31mImportError\u001B[0m: Something is wrong with the numpy installation. While importing we detected an older version of numpy in ['/Users/mvahit/anaconda3/lib/python3.7/site-packages/numpy']. One method of fixing this is to repeatedly uninstall numpy until none is found, then reinstall this version." 
26 | ] 27 | } 28 | ], 29 | "source": [ 30 | "import numpy as np\n", 31 | "import pandas as pd\n", 32 | "import lightgbm\n", 33 | "import gc\n", 34 | "import time\n", 35 | "from contextlib import contextmanager\n", 36 | "from lightgbm import LGBMClassifier\n", 37 | "from sklearn.metrics import roc_auc_score\n", 38 | "from sklearn.model_selection import KFold, StratifiedKFold\n", 39 | "import matplotlib.pyplot as plt\n", 40 | "import seaborn as sns\n", 41 | "from sklearn.preprocessing import LabelEncoder\n", 42 | "from sklearn.model_selection import GridSearchCV\n", 43 | "\n", 44 | "import warnings\n", 45 | "warnings.filterwarnings(\"ignore\", category=DeprecationWarning) \n", 46 | "warnings.filterwarnings(\"ignore\", category=FutureWarning) \n", 47 | "warnings.filterwarnings(\"ignore\", category=UserWarning) " 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": {}, 53 | "source": [ 54 | "# Helper Functions" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 50, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "@contextmanager\n", 64 | "def timer(title):\n", 65 | " t0 = time.time()\n", 66 | " yield\n", 67 | " print(\"{} - done in {:.0f}s\".format(title, time.time() - t0))" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 51, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "# Display/plot feature importance\n", 77 | "def display_importances(feature_importance_df_):\n", 78 | " cols = feature_importance_df_[[\"feature\", \"importance\"]].groupby(\"feature\").mean().sort_values(by=\"importance\", ascending=False)[:100].index\n", 79 | " best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]\n", 80 | " plt.figure(figsize=(15, 20))\n", 81 | " sns.barplot(x=\"importance\", y=\"feature\", data=best_features.sort_values(by=\"importance\", ascending=False))\n", 82 | " plt.title('LightGBM Features (avg over folds)')\n", 83 | " plt.tight_layout()\n", 84 | " plt.savefig('lgbm_importances01.png')" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 52, 90 | "metadata": {}, 91 | "outputs": [ 92 | { 93 | "name": "stdout", 94 | "output_type": "stream", 95 | "text": [ 96 | " \n", 97 | " \n", 98 | " \n", 99 | " cience I_ cience I_ \n", 100 | " a_Science I_Love_ a_Science I_Love_ \n", 101 | " ta_Science I_Love_Data_Science I_Love_Dat \n", 102 | " ta_Science I_Love_Data_Science I_Love_Data_ \n", 103 | " ta_Science I_Love_Data_Science I_Love_Data_Sc \n", 104 | " a_Science I_Love_Data_Science I_Love_Data_Sci \n", 105 | " _Science I_Love_Data_Science I_Love_Data_Scie \n", 106 | " Science I_Love_Data_Science I_Love_Data_Scien \n", 107 | " cience I_Love_Data_Science I_Love_Data_Scienc \n", 108 | " ience I_Love_Data_Science I_Love_Data_Science \n", 109 | " nce I_Love_Data_Science I_Love_Data_Science \n", 110 | " e I_Love_Data_Science I_Love_Data_Science \n", 111 | " I_Love_Data_Science I_Love_Data_Science \n", 112 | " Love_Data_Science I_Love_Data_Science \n", 113 | " ve_Data_Science I_Love_Data_Science \n", 114 | " _Data_Science I_Love_Data_Science \n", 115 | " ta_Science I_Love_Data_Scienc \n", 116 | " Science I_Love_Data_Scien \n", 117 | " ence I_Love_Data_Scie \n", 118 | " I_Love_Data_Sc \n", 119 | " ove_Data_ \n", 120 | " Dat \n", 121 | " t \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n" 126 | ] 127 | } 128 | ], 129 | "source": [ 130 | "print('\\n'.join([''.join([(' I_Love_Data_Science_'[(x-y) % len('I_Love_Data_Science_')] if 
((x*0.05)**2+(y*0.1)**2-1)**3-(x*0.05)**2*(y*0.1)**3 <= 0 else ' ') for x in range(-30, 30)]) for y in range(15, -15, -1)]))"
131 | ]
132 | },
133 | {
134 | "cell_type": "markdown",
135 | "metadata": {},
136 | "source": [
137 | "# application_train"
138 | ]
139 | },
140 | {
141 | "cell_type": "code",
142 | "execution_count": 53,
143 | "metadata": {},
144 | "outputs": [],
145 | "source": [
146 | "def application_train():\n",
147 | "\n",
148 | "    df = pd.read_csv('data/application_train.csv')\n",
149 | "    test_df = pd.read_csv('data/application_test.csv')\n",
150 | "\n",
151 | "    df = df.append(test_df).reset_index()\n",
152 | "    df = df[df['CODE_GENDER'] != 'XNA']\n",
153 | "\n",
154 | "    lbe = LabelEncoder()\n",
155 | "\n",
156 | "    for col in ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY']:\n",
157 | "        df[col] = lbe.fit_transform(df[col])\n",
158 | "\n",
159 | "    df = pd.get_dummies(df, dummy_na = True)\n",
160 | "\n",
161 | "    df['DAYS_EMPLOYED'].replace(365243, np.nan, inplace = True)\n",
162 | "    df['NEW_DAYS_EMPLOYED_PERC'] = df['DAYS_EMPLOYED'] / df['DAYS_BIRTH']\n",
163 | "    df['NEW_INCOME_CREDIT_PERC'] = df['AMT_INCOME_TOTAL'] / df['AMT_CREDIT']\n",
164 | "    df['NEW_INCOME_PER_PERSON'] = df['AMT_INCOME_TOTAL'] / df['CNT_FAM_MEMBERS']\n",
165 | "    df['NEW_ANNUITY_INCOME_PERC'] = df['AMT_ANNUITY'] / df['AMT_INCOME_TOTAL']\n",
166 | "    df['NEW_PAYMENT_RATE'] = df['AMT_ANNUITY'] / df['AMT_CREDIT']\n",
167 | "\n",
168 | "    df.drop(\"index\", axis = 1, inplace = True)\n",
169 | "\n",
170 | "    df.columns = pd.Index([\"APP_\" + col for col in df.columns.tolist()])\n",
171 | "\n",
172 | "    df.rename(columns={\"APP_SK_ID_CURR\":\"SK_ID_CURR\"}, inplace = True)\n",
173 | "\n",
174 | "    df.rename(columns={\"APP_TARGET\":\"TARGET\"}, inplace = True)\n",
175 | "    \n",
176 | "    return df"
177 | ]
178 | },
179 | {
180 | "cell_type": "markdown",
181 | "metadata": {},
182 | "source": [
183 | "# bureau & bureau_balance"
184 | ]
185 | },
186 | {
187 | "cell_type": "code",
188 | "execution_count": 54,
189 | "metadata": {},
190 | "outputs": [],
191 | "source": [
192 | "def bureau_bb():\n",
193 | "\n",
194 | "    # read the bureau_balance table\n",
195 | "\n",
196 | "    bb = pd.read_csv('data/bureau_balance.csv')\n",
197 | "    bb = pd.get_dummies(bb, dummy_na = True)\n",
198 | "\n",
199 | "    agg_list = {\"MONTHS_BALANCE\":\"count\",\n",
200 | "                \"STATUS_0\":[\"sum\",\"mean\"],\n",
201 | "                \"STATUS_1\":[\"sum\"],\n",
202 | "                \"STATUS_2\":[\"sum\"],\n",
203 | "                \"STATUS_3\":[\"sum\"],\n",
204 | "                \"STATUS_4\":[\"sum\"],\n",
205 | "                \"STATUS_5\":[\"sum\"],\n",
206 | "                \"STATUS_C\":[\"sum\",\"mean\"],\n",
207 | "                \"STATUS_X\":[\"sum\",\"mean\"] }\n",
208 | "\n",
209 | "    bb_agg = bb.groupby(\"SK_ID_BUREAU\").agg(agg_list)\n",
210 | "\n",
211 | "    # rename the variables \n",
212 | "    bb_agg.columns = pd.Index([col[0] + \"_\" + col[1].upper() for col in bb_agg.columns.tolist()])\n",
213 | "\n",
214 | "    # create a new variable from the STATUS_x sums\n",
215 | "    bb_agg['NEW_STATUS_SCORE'] = bb_agg['STATUS_1_SUM'] + bb_agg['STATUS_2_SUM']**2 + bb_agg['STATUS_3_SUM']**3 + bb_agg['STATUS_4_SUM']**4 + bb_agg['STATUS_5_SUM']**5\n",
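"    # NOTE (editor's sketch): the line above originally used '^', which is Python's\n",
"    # bitwise XOR, not exponentiation; '**' is assumed to be the intended weighting\n",
"    # of the overdue-status counts. For example: 3 ^ 2 == 1 (XOR of the bit\n",
"    # patterns), while 3 ** 2 == 9.\n",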
216 | "\n",
217 | "    bb_agg.drop(['STATUS_1_SUM','STATUS_2_SUM','STATUS_3_SUM','STATUS_4_SUM','STATUS_5_SUM'], axis=1,inplace=True)\n",
218 | "\n",
219 | "    bureau = pd.read_csv('data/bureau.csv')\n",
220 | "    bureau_and_bb = bureau.join(bb_agg, how='left', on='SK_ID_BUREAU')\n",
221 | "\n",
222 | "    # JOINT TABLE OF BUREAU AND BUREAU_BALANCE\n",
223 | "\n",
224 | "    # reduce the number of classes of the CREDIT_TYPE variable to 3 \n",
225 | "    bureau_and_bb['CREDIT_TYPE'] = bureau_and_bb['CREDIT_TYPE'].replace(['Car loan',\n",
226 | "    'Mortgage',\n",
227 | "    'Microloan',\n",
228 | "    'Loan for business development', \n",
229 | "    'Another type of loan',\n",
230 | "    'Unknown type of loan', \n",
231 | "    'Loan for working capital replenishment',\n",
232 | "    \"Loan for purchase of shares (margin lending)\", \n",
233 | "    'Cash loan (non-earmarked)', \n",
234 | "    'Real estate loan',\n",
235 | "    \"Loan for the purchase of equipment\", \n",
236 | "    \"Interbank credit\", \n",
237 | "    \"Mobile operator loan\"], 'Rare')\n",
238 | "\n",
239 | "\n",
240 | "    # reduce the number of classes of CREDIT_ACTIVE to 2 (would it be more appropriate to merge 'Sold' into 'Closed'???)\n",
241 | "    bureau_and_bb['CREDIT_ACTIVE'] = bureau_and_bb['CREDIT_ACTIVE'].replace(['Bad debt','Sold'], 'Active')\n",
242 | "\n",
243 | "    # apply one-hot encoding to the categorical variables in the bureau_bb table\n",
244 | "    bureau_and_bb = pd.get_dummies(bureau_and_bb, columns = [\"CREDIT_TYPE\",\"CREDIT_ACTIVE\"])\n",
245 | "\n",
246 | "    # 99% of CREDIT_CURRENCY is currency 1, so we dropped it because we expect no discriminative power \n",
247 | "    bureau_and_bb.drop([\"SK_ID_BUREAU\",\"CREDIT_CURRENCY\"], inplace = True, axis = 1)\n",
248 | "\n",
249 | "\n",
250 | "    #NEW FEATURES\n",
251 | "\n",
252 | "    # new variable: the average term of the credits taken, in months\n",
253 | "    bureau_and_bb[\"NEW_MONTHS_CREDIT\"]= round((bureau_and_bb.DAYS_CREDIT_ENDDATE - bureau_and_bb.DAYS_CREDIT)/30)\n",
254 | "\n",
255 | "    agg_list = {\n",
256 | "         \"SK_ID_CURR\":[\"count\"],\n",
257 | "         \"DAYS_CREDIT\":[\"min\",\"max\"],\n",
258 | "         \"CREDIT_DAY_OVERDUE\":[\"sum\",\"mean\",\"max\"], \n",
259 | "         \"DAYS_CREDIT_ENDDATE\":[\"max\",\"min\"],\n",
260 | "         \"DAYS_ENDDATE_FACT\":[\"max\",\"min\"],\n",
261 | "         \"AMT_CREDIT_MAX_OVERDUE\":[\"mean\",\"max\",\"min\"],\n",
262 | "         \"CNT_CREDIT_PROLONG\":[\"sum\",\"mean\",\"max\",\"min\"],\n",
263 | "         \"AMT_CREDIT_SUM\":[\"mean\",\"max\",\"min\"], \n",
264 | "         \"AMT_CREDIT_SUM_DEBT\":[\"sum\",\"mean\",\"max\"],\n",
265 | "         \"AMT_CREDIT_SUM_LIMIT\":[\"sum\",\"mean\",\"max\"],\n",
266 | "         'AMT_CREDIT_SUM_OVERDUE':[\"sum\",\"mean\",\"max\"], \n",
267 | "         'DAYS_CREDIT_UPDATE':[\"max\",\"min\"],\n",
268 | "         'AMT_ANNUITY':[\"sum\",\"mean\"],\n",
269 | "         'MONTHS_BALANCE_COUNT':[\"sum\"], \n",
270 | "         'STATUS_0_SUM':[\"sum\"], \n",
271 | "         'STATUS_0_MEAN':[\"mean\"], \n",
272 | "         'STATUS_C_SUM':[\"sum\"], \n",
273 | "         'STATUS_C_MEAN':[\"mean\"],\n",
274 | "         'CREDIT_ACTIVE_Active':[\"sum\",\"mean\"], \n",
275 | "         'CREDIT_ACTIVE_Closed':[\"sum\",\"mean\"], \n",
276 | "         'CREDIT_TYPE_Rare':[\"sum\",\"mean\"], \n",
277 | "         'CREDIT_TYPE_Consumer credit':[\"sum\",\"mean\"], \n",
278 | "         'CREDIT_TYPE_Credit card':[\"sum\",\"mean\"],\n",
279 | "         \"NEW_MONTHS_CREDIT\":[\"count\",\"sum\",\"mean\",\"max\",\"min\"]}\n",
280 | "\n",
281 | "\n",
282 | "    # apply the aggregations to the bureau_bb table \n",
283 | "    bureau_and_bb_agg = bureau_and_bb.groupby(\"SK_ID_CURR\").agg(agg_list).reset_index()\n",
284 | "\n",
285 | "\n",
286 | "    # rename the variables \n",
287 | "    bureau_and_bb_agg.columns = pd.Index([\"BB_\" + col[0] + \"_\" + col[1].upper() for col in bureau_and_bb_agg.columns.tolist()])\n",
288 | "\n",
289 | "    # new variable: difference between the largest and smallest credit the person has taken\n",
290 | "    bureau_and_bb_agg[\"BB_NEW_AMT_CREDIT_SUM_RANGE\"] = bureau_and_bb_agg[\"BB_AMT_CREDIT_SUM_MAX\"] - bureau_and_bb_agg[\"BB_AMT_CREDIT_SUM_MIN\"]\n",
bureau_and_bb_agg[\"BB_AMT_CREDIT_SUM_MAX\"] - bureau_and_bb_agg[\"BB_AMT_CREDIT_SUM_MIN\"]\n", 291 | "\n", 292 | " # ortalama kac ayda bir kredi cektigini ifade eden yeni degisken\n", 293 | " bureau_and_bb_agg[\"BB_NEW_DAYS_CREDIT_RANGE\"]= round((bureau_and_bb_agg[\"BB_DAYS_CREDIT_MAX\"] - bureau_and_bb_agg[\"BB_DAYS_CREDIT_MIN\"])/(30 * bureau_and_bb_agg[\"BB_SK_ID_CURR_COUNT\"]))\n", 294 | "\n", 295 | "\n", 296 | " # Bureau: Active credits - using only numerical aggregations\n", 297 | " agg_list = {\n", 298 | " 'DAYS_CREDIT': ['min', 'max', 'mean', 'var'],\n", 299 | " 'DAYS_CREDIT_ENDDATE': ['min', 'max', 'mean'],\n", 300 | " 'DAYS_CREDIT_UPDATE': ['mean'],\n", 301 | " 'CREDIT_DAY_OVERDUE': ['max', 'mean'],\n", 302 | " 'AMT_CREDIT_MAX_OVERDUE': ['mean'],\n", 303 | " 'AMT_CREDIT_SUM': ['max', 'mean', 'sum'],\n", 304 | " 'AMT_CREDIT_SUM_DEBT': ['max', 'mean', 'sum'],\n", 305 | " 'AMT_CREDIT_SUM_OVERDUE': ['mean'],\n", 306 | " 'AMT_CREDIT_SUM_LIMIT': ['mean', 'sum'],\n", 307 | " 'AMT_ANNUITY': ['max', 'mean'],\n", 308 | " 'CNT_CREDIT_PROLONG': ['sum']\n", 309 | " }\n", 310 | "\n", 311 | "\n", 312 | " active = bureau_and_bb[bureau_and_bb['CREDIT_ACTIVE_Active'] == 1]\n", 313 | " active_agg = active.groupby('SK_ID_CURR').agg(agg_list)\n", 314 | " active_agg.columns = pd.Index(['BB_NEW_ACTIVE_' + e[0] + \"_\" + e[1].upper() for e in active_agg.columns.tolist()])\n", 315 | " bureau_and_bb_agg.rename(columns = {'BB_SK_ID_CURR_': 'SK_ID_CURR'}, inplace = True)\n", 316 | " bureau_and_bb_agg = bureau_and_bb_agg.join(active_agg, how='left', on='SK_ID_CURR')\n", 317 | "\n", 318 | " # Bureau: Closed credits - using only numerical aggregations\n", 319 | " closed = bureau_and_bb[bureau_and_bb['CREDIT_ACTIVE_Closed'] == 1]\n", 320 | " closed_agg = closed.groupby('SK_ID_CURR').agg(agg_list)\n", 321 | " closed_agg.columns = pd.Index(['BB_NEW_CLOSED_' + e[0] + \"_\" + e[1].upper() for e in closed_agg.columns.tolist()])\n", 322 | " bureau_and_bb_agg = bureau_and_bb_agg.join(closed_agg, how='left', on='SK_ID_CURR')\n", 323 | " \n", 324 | " return bureau_and_bb_agg" 325 | ] 326 | }, 327 | { 328 | "cell_type": "markdown", 329 | "metadata": {}, 330 | "source": [ 331 | "# installments_payments" 332 | ] 333 | }, 334 | { 335 | "cell_type": "code", 336 | "execution_count": 55, 337 | "metadata": {}, 338 | "outputs": [], 339 | "source": [ 340 | "def installments_payments():\n", 341 | "\n", 342 | " #Read the installments_payments.csv\n", 343 | " ins = pd.read_csv('data/installments_payments.csv')\n", 344 | "\n", 345 | " ins['NEW_DAYS_PAID_EARLIER'] = ins['DAYS_INSTALMENT']-ins['DAYS_ENTRY_PAYMENT']\n", 346 | "\n", 347 | " # Her bir taksit ödemesinin gec olup olmama durumu 1: gec ödedi 0: erken ödemeyi temsil eder\n", 348 | " ins['NEW_NUM_PAID_LATER'] = ins['NEW_DAYS_PAID_EARLIER'].map(lambda x: 1 if x<0 else 0)\n", 349 | "\n", 350 | " # Agrregation ve degisken tekillestirme\n", 351 | " agg_list = {'NUM_INSTALMENT_VERSION':['nunique'],\n", 352 | " 'NUM_INSTALMENT_NUMBER':'max',\n", 353 | " 'DAYS_INSTALMENT':['min','max'],\n", 354 | " 'DAYS_ENTRY_PAYMENT':['min','max'],\n", 355 | " 'AMT_INSTALMENT':['min','max','sum','mean'],\n", 356 | " 'AMT_PAYMENT':['min','max','sum','mean'],\n", 357 | " 'NEW_DAYS_PAID_EARLIER':'mean',\n", 358 | " 'NEW_NUM_PAID_LATER':'sum'}\n", 359 | "\n", 360 | "\n", 361 | " ins_agg = ins.groupby('SK_ID_PREV').agg(agg_list)\n", 362 | "\n", 363 | "\n", 364 | " # Multi index problemi cözümü\n", 365 | " ins_agg.columns = pd.Index([\"INS_\" + e[0] + '_' + e[1].upper() for e in 
366 | "\n",
367 | "    # drop variables \n",
368 | "    ins_agg.drop(['INS_DAYS_INSTALMENT_MIN',\n",
369 | "                  'INS_DAYS_INSTALMENT_MAX',\n",
370 | "                  'INS_DAYS_ENTRY_PAYMENT_MIN',\n",
371 | "                  'INS_DAYS_ENTRY_PAYMENT_MAX'],axis=1,inplace=True)\n",
372 | "\n",
373 | "    # credit payment percentage and total remaining debt\n",
374 | "    ins_agg['INS_NEW_PAYMENT_PERC'] = ins_agg['INS_AMT_PAYMENT_SUM'] / ins_agg['INS_AMT_INSTALMENT_SUM']\n",
375 | "    ins_agg['INS_NEW_PAYMENT_DIFF'] = ins_agg['INS_AMT_INSTALMENT_SUM'] - ins_agg['INS_AMT_PAYMENT_SUM']\n",
376 | "    \n",
377 | "    agg_list_previous_application = {}\n",
378 | "    \n",
379 | "    for col in ins_agg.columns:\n",
380 | "        agg_list_previous_application[col] = ['mean',\"min\",\"max\",\"sum\"]\n",
381 | "    \n",
382 | "    ins_agg.reset_index(inplace = True) \n",
383 | "    \n",
384 | "    return agg_list_previous_application, ins_agg"
385 | ]
386 | },
387 | {
388 | "cell_type": "markdown",
389 | "metadata": {},
390 | "source": [
391 | "# pos_cash_balance"
392 | ]
393 | },
394 | {
395 | "cell_type": "code",
396 | "execution_count": null,
397 | "metadata": {},
398 | "outputs": [],
399 | "source": []
400 | },
401 | {
402 | "cell_type": "code",
403 | "execution_count": 56,
404 | "metadata": {},
405 | "outputs": [],
406 | "source": [
407 | "def pos_cash_balance(agg_list_previous_application):\n",
408 | "\n",
409 | "    pos = pd.read_csv('data/POS_CASH_balance.csv')\n",
410 | "    # convert the categorical variable into dummy variables\n",
411 | "    pos = pd.get_dummies(pos, columns=['NAME_CONTRACT_STATUS'], dummy_na = True)\n",
412 | "    # aggregation - collapse to one row per SK_ID_PREV\n",
413 | "    agg_list = {'MONTHS_BALANCE':['min','max'],\n",
414 | "                'CNT_INSTALMENT':['min','max'],\n",
415 | "                'CNT_INSTALMENT_FUTURE':['min','max'],\n",
416 | "                'SK_DPD':['max','mean'],\n",
417 | "                'SK_DPD_DEF':['max','mean'],\n",
418 | "                'NAME_CONTRACT_STATUS_Active':'sum',\n",
419 | "                'NAME_CONTRACT_STATUS_Amortized debt':'sum',\n",
420 | "                'NAME_CONTRACT_STATUS_Approved':'sum',\n",
421 | "                'NAME_CONTRACT_STATUS_Canceled':'sum',\n",
422 | "                'NAME_CONTRACT_STATUS_Completed':'sum',\n",
423 | "                'NAME_CONTRACT_STATUS_Demand':'sum',\n",
424 | "                'NAME_CONTRACT_STATUS_Returned to the store':'sum',\n",
425 | "                'NAME_CONTRACT_STATUS_Signed':'sum',\n",
426 | "                'NAME_CONTRACT_STATUS_XNA':'sum',\n",
427 | "                'NAME_CONTRACT_STATUS_nan':'sum'\n",
428 | "               }\n",
429 | "\n",
430 | "    pos_agg = pos.groupby('SK_ID_PREV').agg(agg_list)\n",
431 | "\n",
432 | "    # flatten the multi-level index into a single level\n",
433 | "    pos_agg.columns= pd.Index([\"POS_\" + e[0] + '_' + e[1].upper() for e in pos_agg.columns.tolist()])\n",
434 | "\n",
435 | "    # SK_DPD: in how many credits it equals 0 (we will take SK_DPD MAX; it captures the 0 case) \n",
436 | "    # SK_DPD_DEF (SK_DPD_DEF_MAX captures the zero case)\n",
437 | "    # the case where NAME_CONTRACT_STATUS_Completed_SUM==0 although CNT_INSTALMENT_FUTURE_MIN==0 \n",
438 | "\n",
439 | "    pos_agg['POS_NEW_IS_CREDIT_NOT_COMPLETED_ON_TIME']= (pos_agg['POS_CNT_INSTALMENT_FUTURE_MIN']==0) & (pos_agg['POS_NAME_CONTRACT_STATUS_Completed_SUM']==0)\n",
440 | "\n",
441 | "\n",
442 | "    # 1: credit was not closed on time, 0: credit was closed on time\n",
443 | "\n",
444 | "    pos_agg['POS_NEW_IS_CREDIT_NOT_COMPLETED_ON_TIME']=pos_agg['POS_NEW_IS_CREDIT_NOT_COMPLETED_ON_TIME'].astype(int)\n",
445 | "\n",
446 | "    pos_agg.drop(['POS_NAME_CONTRACT_STATUS_Approved_SUM',\n",
447 | "                  'POS_NAME_CONTRACT_STATUS_Amortized debt_SUM',\n",
448 | "                  'POS_NAME_CONTRACT_STATUS_Canceled_SUM',\n",
449 | "                  'POS_NAME_CONTRACT_STATUS_Returned to the store_SUM',\n",
450 | "                  'POS_NAME_CONTRACT_STATUS_Signed_SUM',\n",
451 | "                  'POS_NAME_CONTRACT_STATUS_XNA_SUM',\n",
452 | "                  'POS_NAME_CONTRACT_STATUS_nan_SUM'],axis=1,inplace=True)\n",
453 | "\n",
454 | "    for col in pos_agg.columns:\n",
455 | "        agg_list_previous_application[col] = ['mean',\"min\",\"max\",\"sum\"]\n",
456 | "\n",
457 | "    pos_agg.reset_index(inplace = True) \n",
458 | "    \n",
459 | "    return agg_list_previous_application, pos_agg"
460 | ]
461 | },
462 | {
463 | "cell_type": "markdown",
464 | "metadata": {},
465 | "source": [
466 | "# credit_card_balance"
467 | ]
468 | },
469 | {
470 | "cell_type": "code",
471 | "execution_count": 57,
472 | "metadata": {},
473 | "outputs": [],
474 | "source": [
475 | "def credit_card_balance():\n",
476 | "\n",
477 | "    CCB = pd.read_csv('data/credit_card_balance.csv')\n",
478 | "\n",
479 | "    CCB = pd.get_dummies(CCB, columns= ['NAME_CONTRACT_STATUS'] ) # now everything is numeric \n",
480 | "\n",
481 | "    dropthis = ['NAME_CONTRACT_STATUS_Approved', 'NAME_CONTRACT_STATUS_Demand',\n",
482 | "                'NAME_CONTRACT_STATUS_Refused', 'NAME_CONTRACT_STATUS_Sent proposal',\n",
483 | "                'NAME_CONTRACT_STATUS_Signed' ]\n",
484 | "\n",
485 | "    CCB = CCB.drop(dropthis, axis=1)\n",
486 | "\n",
487 | "    grp = CCB.groupby(by = ['SK_ID_CURR'])['SK_ID_PREV'].nunique().reset_index().rename(index = str, columns = {'SK_ID_PREV': 'NUMBER_OF_LOANS_PER_CUSTOMER'})\n",
488 | "    CCB = CCB.merge(grp, on = ['SK_ID_CURR'], how = 'left')\n",
489 | "\n",
490 | "    grp = CCB.groupby(by = ['SK_ID_CURR', 'SK_ID_PREV'])['CNT_INSTALMENT_MATURE_CUM'].max().reset_index().rename(index = str, columns = {'CNT_INSTALMENT_MATURE_CUM': 'NUMBER_OF_INSTALMENTS'})\n",
491 | "    grp1 = grp.groupby(by = ['SK_ID_CURR'])['NUMBER_OF_INSTALMENTS'].sum().reset_index().rename(index = str, columns = {'NUMBER_OF_INSTALMENTS': 'TOTAL_INSTALMENTS_OF_ALL_LOANS'})\n",
492 | "    CCB = CCB.merge(grp1, on = ['SK_ID_CURR'], how = 'left')\n",
493 | "\n",
494 | "    CCB['INSTALLMENTS_PER_LOAN'] = (CCB['TOTAL_INSTALMENTS_OF_ALL_LOANS']/CCB['NUMBER_OF_LOANS_PER_CUSTOMER']).astype('uint32')\n",
495 | "\n",
496 | "\n",
497 | "    # this function counts how many times payments were past due\n",
498 | "    # Function to calculate number of times Days Past Due occurred\n",
499 | "    def geciken_gun_hesapla(DPD):\n",
500 | "\n",
501 | "        # DPD is expected to be a series: the SK_DPD values of the past credits in each prev_app\n",
502 | "        # DPD is a series of values of SK_DPD for each of the groupby combination\n",
503 | "        # We convert it to a list to get the number of SK_DPD values NOT EQUALS ZERO\n",
504 | "        x = DPD.tolist()\n",
505 | "        c = 0\n",
506 | "        for i,j in enumerate(x):\n",
507 | "            if j != 0:\n",
508 | "                c += 1 \n",
509 | "        return c \n",
510 | "\n",
511 | "    grp = CCB.groupby(by = ['SK_ID_CURR', 'SK_ID_PREV']).apply(lambda x: geciken_gun_hesapla(x.SK_DPD)).reset_index().rename(index = str, columns = {0: 'NUMBER_OF_DPD'})\n",
512 | "    grp1 = grp.groupby(by = ['SK_ID_CURR'])['NUMBER_OF_DPD'].mean().reset_index().rename(index = str, columns = {'NUMBER_OF_DPD' : 'DPD_COUNT'})\n",
513 | "\n",
514 | "    CCB = CCB.merge(grp1, on = ['SK_ID_CURR'], how = 'left')\n",
515 | "\n",
516 | "\n",
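"    # NOTE (editor's sketch): geciken_gun_hesapla() above simply counts the non-zero\n",
"    # SK_DPD values per (SK_ID_CURR, SK_ID_PREV) group; an equivalent vectorized form\n",
"    # (plain pandas, same result) would be:\n",
"    #   grp = (CCB.groupby(['SK_ID_CURR', 'SK_ID_PREV'])['SK_DPD']\n",
"    #             .apply(lambda s: int((s != 0).sum()))\n",
"    #             .reset_index(name='NUMBER_OF_DPD'))\n",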
517 | "    def f(min_pay, total_pay):\n",
518 | "\n",
519 | "        M = min_pay.tolist()\n",
520 | "        T = total_pay.tolist()\n",
521 | "        P = len(M) # P: number of instalments\n",
522 | "        c = 0 \n",
523 | "        # Find the count of transactions when Payment made is less than Minimum Payment \n",
524 | "        for i in range(len(M)):\n",
525 | "            if T[i] < M[i]:\n",
526 | "                c += 1  \n",
527 | "        return (100*c)/P\n",
528 | "\n",
529 | "    grp = CCB.groupby(by = ['SK_ID_CURR']).apply(lambda x: f(x.AMT_INST_MIN_REGULARITY, x.AMT_PAYMENT_CURRENT)).reset_index().rename(index = str, columns = { 0 : 'PERCENTAGE_MIN_MISSED_PAYMENTS'})\n",
530 | "    CCB = CCB.merge(grp, on = ['SK_ID_CURR'], how = 'left')\n",
531 | "\n",
532 | "    grp = CCB.groupby(by = ['SK_ID_CURR'])['AMT_DRAWINGS_ATM_CURRENT'].sum().reset_index().rename(index = str, columns = {'AMT_DRAWINGS_ATM_CURRENT' : 'DRAWINGS_ATM'})\n",
533 | "    CCB = CCB.merge(grp, on = ['SK_ID_CURR'], how = 'left')\n",
534 | "\n",
535 | "\n",
536 | "    grp = CCB.groupby(by = ['SK_ID_CURR'])['AMT_DRAWINGS_CURRENT'].sum().reset_index().rename(index = str, columns = {'AMT_DRAWINGS_CURRENT' : 'DRAWINGS_TOTAL'})\n",
537 | "    CCB = CCB.merge(grp, on = ['SK_ID_CURR'], how = 'left')\n",
538 | "\n",
539 | "\n",
540 | "    CCB['CASH_CARD_RATIO1'] = (CCB['DRAWINGS_ATM']/CCB['DRAWINGS_TOTAL'])*100 # cash drawn at the ATM / total drawn\n",
541 | "    del CCB['DRAWINGS_ATM']\n",
542 | "    del CCB['DRAWINGS_TOTAL']\n",
543 | "\n",
544 | "    grp = CCB.groupby(by = ['SK_ID_CURR'])['CASH_CARD_RATIO1'].mean().reset_index().rename(index = str, columns ={ 'CASH_CARD_RATIO1' : 'CASH_CARD_RATIO'})\n",
545 | "    CCB = CCB.merge(grp, on = ['SK_ID_CURR'], how = 'left')\n",
546 | "\n",
547 | "\n",
548 | "    grp = CCB.groupby(by = ['SK_ID_CURR'])['AMT_DRAWINGS_CURRENT'].sum().reset_index().rename(index = str, columns = {'AMT_DRAWINGS_CURRENT' : 'TOTAL_DRAWINGS'})\n",
549 | "    CCB = CCB.merge(grp, on = ['SK_ID_CURR'], how = 'left')\n",
550 | "\n",
551 | "\n",
552 | "    grp = CCB.groupby(by = ['SK_ID_CURR'])['CNT_DRAWINGS_CURRENT'].sum().reset_index().rename(index = str, columns = {'CNT_DRAWINGS_CURRENT' : 'NUMBER_OF_DRAWINGS'})\n",
553 | "    CCB = CCB.merge(grp, on = ['SK_ID_CURR'], how = 'left')\n",
554 | "\n",
555 | "\n",
556 | "    CCB['DRAWINGS_RATIO1'] = (CCB['TOTAL_DRAWINGS']/CCB['NUMBER_OF_DRAWINGS'])*100 # not a percentage; just scaled up by 100\n",
557 | "    del CCB['TOTAL_DRAWINGS']\n",
558 | "    del CCB['NUMBER_OF_DRAWINGS']\n",
559 | "\n",
560 | "\n",
561 | "    grp = CCB.groupby(by = ['SK_ID_CURR'])['DRAWINGS_RATIO1'].mean().reset_index().rename(index = str, columns ={ 'DRAWINGS_RATIO1' : 'DRAWINGS_RATIO'})\n",
562 | "    CCB = CCB.merge(grp, on = ['SK_ID_CURR'], how = 'left')\n",
563 | "\n",
564 | "    del CCB['DRAWINGS_RATIO1']\n",
565 | "\n",
566 | "    CCB['CC_COUNT'] = CCB.groupby('SK_ID_CURR')['SK_ID_CURR'].transform('size')  # transform keeps row alignment; the original .size() assignment misaligned on the index\n",
567 | "\n",
568 | "    CCB_agg = CCB.groupby('SK_ID_CURR').agg({\n",
569 | "        'MONTHS_BALANCE':[\"sum\",\"mean\"], \n",
570 | "        'AMT_BALANCE':[\"sum\",\"mean\",\"min\",\"max\"],\n",
571 | "        'AMT_CREDIT_LIMIT_ACTUAL':[\"sum\",\"mean\"], \n",
572 | "\n",
573 | "        'AMT_DRAWINGS_ATM_CURRENT':[\"sum\",\"mean\",\"min\",\"max\"],\n",
574 | "        'AMT_DRAWINGS_CURRENT':[\"sum\",\"mean\",\"min\",\"max\"], \n",
575 | "        'AMT_DRAWINGS_OTHER_CURRENT':[\"sum\",\"mean\",\"min\",\"max\"],\n",
576 | "        'AMT_DRAWINGS_POS_CURRENT':[\"sum\",\"mean\",\"min\",\"max\"], \n",
577 | "        'AMT_INST_MIN_REGULARITY':[\"sum\",\"mean\",\"min\",\"max\"],\n",
578 | "        'AMT_PAYMENT_CURRENT':[\"sum\",\"mean\",\"min\",\"max\"], \n",
579 | "        'AMT_PAYMENT_TOTAL_CURRENT':[\"sum\",\"mean\",\"min\",\"max\"],\n",
580 | "        'AMT_RECEIVABLE_PRINCIPAL':[\"sum\",\"mean\",\"min\",\"max\"], \n",
581 | "        'AMT_RECIVABLE':[\"sum\",\"mean\",\"min\",\"max\"], \n",
582 | "        'AMT_TOTAL_RECEIVABLE':[\"sum\",\"mean\",\"min\",\"max\"],\n",
583 | "\n",
584 | "        'CNT_DRAWINGS_ATM_CURRENT':[\"sum\",\"mean\"], \n",
585 | "        'CNT_DRAWINGS_CURRENT':[\"sum\",\"mean\",\"max\"],\n",
'CNT_DRAWINGS_CURRENT':[\"sum\",\"mean\",\"max\"],\n", 586 | " 'CNT_DRAWINGS_OTHER_CURRENT':[\"mean\",\"max\"], \n", 587 | " 'CNT_DRAWINGS_POS_CURRENT':[\"sum\",\"mean\",\"max\"],\n", 588 | " 'CNT_INSTALMENT_MATURE_CUM':[\"sum\",\"mean\",\"max\",\"min\"], \n", 589 | " 'SK_DPD':[\"sum\",\"mean\",\"max\"], \n", 590 | " 'SK_DPD_DEF':[\"sum\",\"mean\",\"max\"],\n", 591 | "\n", 592 | " 'NAME_CONTRACT_STATUS_Active':[\"sum\",\"mean\",\"min\",\"max\"], \n", 593 | " 'INSTALLMENTS_PER_LOAN':[\"sum\",\"mean\",\"min\",\"max\"],\n", 594 | "\n", 595 | " 'NUMBER_OF_LOANS_PER_CUSTOMER':[\"mean\"], \n", 596 | " 'DPD_COUNT':[\"mean\"],\n", 597 | " 'PERCENTAGE_MIN_MISSED_PAYMENTS':[\"mean\"], \n", 598 | " 'CASH_CARD_RATIO':[\"mean\"], \n", 599 | " 'DRAWINGS_RATIO':[\"mean\"]})\n", 600 | "\n", 601 | "\n", 602 | " CCB_agg.columns = pd.Index(['CCB_' + e[0] + \"_\" + e[1].upper() for e in CCB_agg.columns.tolist()])\n", 603 | "\n", 604 | " CCB_agg.reset_index(inplace = True)\n", 605 | " \n", 606 | " return CCB_agg" 607 | ] 608 | }, 609 | { 610 | "cell_type": "code", 611 | "execution_count": null, 612 | "metadata": {}, 613 | "outputs": [], 614 | "source": [] 615 | }, 616 | { 617 | "cell_type": "markdown", 618 | "metadata": {}, 619 | "source": [ 620 | "# previous_application" 621 | ] 622 | }, 623 | { 624 | "cell_type": "code", 625 | "execution_count": 58, 626 | "metadata": {}, 627 | "outputs": [], 628 | "source": [ 629 | "def previous_application(agg_list_previous_application):\n", 630 | "\n", 631 | "\n", 632 | " df_prev = pd.read_csv('data/previous_application.csv')\n", 633 | "\n", 634 | " # \"WEEKDAY_APPR_PROCESS_START\" değişkeninin WEEK_DAY ve WEEKEND olarak iki kategoriye ayrılması\n", 635 | "\n", 636 | " df_prev[\"WEEKDAY_APPR_PROCESS_START\"] = df_prev[\"WEEKDAY_APPR_PROCESS_START\"].replace(['MONDAY','TUESDAY', 'WEDNESDAY','THURSDAY','FRIDAY'], 'WEEK_DAY')\n", 637 | " df_prev[\"WEEKDAY_APPR_PROCESS_START\"] = df_prev[\"WEEKDAY_APPR_PROCESS_START\"].replace(['SATURDAY', 'SUNDAY'], 'WEEKEND')\n", 638 | "\n", 639 | " # \"HOUR_APPR_PROCESS_START\" değişkeninin working_hours ve off_hours olarak iki kategoriye ayrılması\n", 640 | " a = [8,9,10,11,12,13,14,15,16,17]\n", 641 | " df_prev[\"HOUR_APPR_PROCESS_START\"] = df_prev[\"HOUR_APPR_PROCESS_START\"].replace(a, 'working_hours')\n", 642 | "\n", 643 | " b = [18,19,20,21,22,23,0,1,2,3,4,5,6,7]\n", 644 | " df_prev[\"HOUR_APPR_PROCESS_START\"] = df_prev[\"HOUR_APPR_PROCESS_START\"].replace(b, 'off_hours')\n", 645 | "\n", 646 | "\n", 647 | " # DAYS_DECISION değeri 1 yıldan küçük olanlara 1, büyük olanlara 0 değeri verildi.\n", 648 | " df_prev[\"DAYS_DECISION\"] = [1 if abs(i/(12*30)) <=1 else 0 for i in df_prev.DAYS_DECISION]\n", 649 | "\n", 650 | " # \"NAME_TYPE_SUITE\" değişkeninin alone ve not_alone olarak iki kategoriye ayrılması\n", 651 | "\n", 652 | " df_prev[\"NAME_TYPE_SUITE\"] = df_prev[\"NAME_TYPE_SUITE\"].replace('Unaccompanied', 'alone')\n", 653 | "\n", 654 | " b = ['Family', 'Spouse, partner', 'Children', 'Other_B', 'Other_A', 'Group of people']\n", 655 | " df_prev[\"NAME_TYPE_SUITE\"] = df_prev[\"NAME_TYPE_SUITE\"].replace(b, 'not_alone')\n", 656 | "\n", 657 | "\n", 658 | "\n", 659 | " # \"NAME_GOODS_CATEGORY\" değişkenindeki bu değerler others olarak kategorize edilecek\n", 660 | " a = ['Auto Accessories', 'Jewelry', 'Homewares', 'Medical Supplies', 'Vehicles', 'Sport and Leisure', \n", 661 | " 'Gardening', 'Other', 'Office Appliances', 'Tourism', 'Medicine', 'Direct Sales', 'Fitness', 'Additional Service', \n", 662 | " 'Education', 'Weapon', 
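"    # NOTE (editor's sketch): the two replace() calls above enumerate every hour; an\n",
"    # equivalent boolean form (same result for integer hours 0-23, used instead of,\n",
"    # not in addition to, the calls above) would be:\n",
"    #   is_working = df_prev['HOUR_APPR_PROCESS_START'].isin(range(8, 18))\n",
"    #   df_prev['HOUR_APPR_PROCESS_START'] = np.where(is_working, 'working_hours', 'off_hours')\n",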
645 | "\n",
646 | "\n",
647 | "    # DAYS_DECISION: set to 1 where the decision is less than 1 year old, 0 otherwise.\n",
648 | "    df_prev[\"DAYS_DECISION\"] = [1 if abs(i/(12*30)) <=1 else 0 for i in df_prev.DAYS_DECISION]\n",
649 | "\n",
650 | "    # split the \"NAME_TYPE_SUITE\" variable into two categories: alone and not_alone\n",
651 | "\n",
652 | "    df_prev[\"NAME_TYPE_SUITE\"] = df_prev[\"NAME_TYPE_SUITE\"].replace('Unaccompanied', 'alone')\n",
653 | "\n",
654 | "    b = ['Family', 'Spouse, partner', 'Children', 'Other_B', 'Other_A', 'Group of people']\n",
655 | "    df_prev[\"NAME_TYPE_SUITE\"] = df_prev[\"NAME_TYPE_SUITE\"].replace(b, 'not_alone')\n",
656 | "\n",
657 | "\n",
658 | "\n",
659 | "    # these values of the \"NAME_GOODS_CATEGORY\" variable will be grouped as others\n",
660 | "    a = ['Auto Accessories', 'Jewelry', 'Homewares', 'Medical Supplies', 'Vehicles', 'Sport and Leisure', \n",
661 | "         'Gardening', 'Other', 'Office Appliances', 'Tourism', 'Medicine', 'Direct Sales', 'Fitness', 'Additional Service', \n",
662 | "         'Education', 'Weapon', 'Insurance', 'House Construction', 'Animals'] \n",
663 | "    df_prev[\"NAME_GOODS_CATEGORY\"] = df_prev[\"NAME_GOODS_CATEGORY\"].replace(a, 'others')\n",
664 | "\n",
665 | "    # these values of the \"NAME_SELLER_INDUSTRY\" variable will be grouped as others\n",
666 | "    a = ['Auto technology', 'Jewelry', 'MLM partners', 'Tourism'] \n",
667 | "    df_prev[\"NAME_SELLER_INDUSTRY\"] = df_prev[\"NAME_SELLER_INDUSTRY\"].replace(a, 'others')\n",
668 | "    # derives the ratio of the requested credit to the granted credit\n",
669 | "    df_prev[\"LOAN_RATE\"] = df_prev.AMT_APPLICATION/df_prev.AMT_CREDIT\n",
670 | "\n",
671 | "    # NEW VARIABLES\n",
672 | "\n",
673 | "    # derives the ratio of the requested credit to the granted credit (duplicate of LOAN_RATE above)\n",
674 | "    df_prev[\"NEW_LOAN_RATE\"] = df_prev.AMT_APPLICATION/df_prev.AMT_CREDIT\n",
675 | "\n",
676 | "    # derive the churn_prev variable, showing whether the payment day was delayed.\n",
677 | "    # 1 = delayed, 0 = not delayed, NaN = missing value\n",
678 | "    k = df_prev.DAYS_LAST_DUE_1ST_VERSION - df_prev.DAYS_LAST_DUE\n",
679 | "    df_prev[\"NEW_CHURN_PREV\"] = [1 if i >= 0 else (0 if i < 0 else \"NaN\") for i in k]\n",
680 | "\n",
681 | "\n",
682 | "    # the NEW_INSURANCE variable was defined to be used in place of NFLAG_INSURED_ON_APPROVAL.\n",
683 | "    df_prev['sigorta_miktari'] = df_prev['AMT_CREDIT'] - df_prev['AMT_GOODS_PRICE']\n",
684 | "    df_prev[\"NEW_INSURANCE\"] = df_prev['sigorta_miktari'].apply(lambda x: 1 if x > 0 else (0 if x <= 0 else np.nan))\n",
685 | "    df_prev.loc[(df_prev['AMT_CREDIT'] == 0) | (df_prev['AMT_GOODS_PRICE'] == 0), 'NEW_INSURANCE'] = np.nan  # .loc after the column exists; the original chained assignment ran before it was created and had no effect\n",
686 | "    df_prev.drop('sigorta_miktari', axis=1, inplace=True)\n",
687 | "\n",
688 | "    # builds the INTEREST_RATE variable (kept commented out).\n",
689 | "    #df_prev['INTEREST_RATE'] = (df_prev.AMT_ANNUITY*df_prev.CNT_PAYMENT/df_prev.AMT_CREDIT)**(12/df_prev.CNT_PAYMENT)-1\n",
690 | "    #df_prev[df_prev['INTEREST_RATE']==-1]=np.nan\n",
691 | "\n",
692 | "\n",
693 | "    drop_list = ['AMT_DOWN_PAYMENT', 'SELLERPLACE_AREA', 'CNT_PAYMENT', 'PRODUCT_COMBINATION', 'DAYS_FIRST_DRAWING', 'DAYS_FIRST_DUE',\n",
694 | "                 'DAYS_LAST_DUE_1ST_VERSION', 'DAYS_LAST_DUE','DAYS_TERMINATION','NFLAG_INSURED_ON_APPROVAL']\n",
695 | "    df_prev.drop(drop_list, axis = 1, inplace = True)\n",
696 | "\n",
697 | "    # keep the names of the categorical variables in the previous table.\n",
698 | "    category_columns=[]\n",
699 | "    for i in df_prev.columns:\n",
700 | "        if df_prev[i].dtypes == \"O\":\n",
701 | "            category_columns.append(i)\n",
702 | "\n",
703 | "    df_prev = pd.get_dummies(df_prev, columns = category_columns )\n",
704 | "\n",
705 | "    prev_agg_list = {\"SK_ID_CURR\":[\"count\"], \n",
706 | "                     \"AMT_ANNUITY\":[\"max\"],\n",
707 | "                     \"AMT_APPLICATION\":[\"min\",\"mean\",\"max\"],\n",
708 | "                     \"AMT_CREDIT\":[\"max\"], \n",
709 | "                     \"AMT_GOODS_PRICE\":[\"sum\", \"mean\"],\n",
710 | "                     \"NFLAG_LAST_APPL_IN_DAY\":[\"sum\",\"mean\"], \n",
711 | "                     \"RATE_DOWN_PAYMENT\":[\"sum\", \"mean\"],\n",
712 | "                     \"RATE_INTEREST_PRIMARY\":[\"sum\", \"mean\"],\n",
713 | "                     \"RATE_INTEREST_PRIVILEGED\":[\"sum\", \"mean\"],\n",
714 | "                     \"DAYS_DECISION\":[\"sum\"],\n",
715 | "                     \"NEW_LOAN_RATE\":[\"sum\", \"mean\", \"min\", \"max\"],\n",
716 | "                     \"NEW_INSURANCE\":[\"sum\", \"mean\"],\n",
717 | "                     #\"INTEREST_RATE\":[\"sum\", \"mean\", \"min\", \"max\"],\n",
718 | "                     \"NAME_CONTRACT_TYPE_Cash loans\":[\"sum\", \"mean\"],\n",
719 | "                     \"NAME_CONTRACT_TYPE_Consumer loans\":[\"sum\", \"mean\"],\n",
720 | "                     \"NAME_CONTRACT_TYPE_Revolving loans\":[\"sum\", \"mean\"],\n",
721 | "                     
\"NAME_CONTRACT_TYPE_XNA\":[\"sum\", \"mean\"],\n", 722 | " \"WEEKDAY_APPR_PROCESS_START_WEEKEND\":[\"sum\", \"mean\"],\n", 723 | " \"WEEKDAY_APPR_PROCESS_START_WEEK_DAY\":[\"sum\", \"mean\"],\n", 724 | " \"HOUR_APPR_PROCESS_START_off_hours\":[\"sum\", \"mean\"],\n", 725 | " \"HOUR_APPR_PROCESS_START_working_hours\":[\"sum\", \"mean\"],\n", 726 | " \"FLAG_LAST_APPL_PER_CONTRACT_N\":[\"sum\", \"mean\"],\n", 727 | " \"FLAG_LAST_APPL_PER_CONTRACT_Y\":[\"sum\", \"mean\"],\n", 728 | " \"NAME_CASH_LOAN_PURPOSE_Building a house or an annex\":[\"sum\", \"mean\"],\n", 729 | " \"NAME_CASH_LOAN_PURPOSE_Business development\":[\"sum\", \"mean\"],\n", 730 | " \"NAME_CASH_LOAN_PURPOSE_Buying a garage\":[\"sum\", \"mean\"],\n", 731 | " \"NAME_CASH_LOAN_PURPOSE_Buying a holiday home / land\":[\"sum\", \"mean\"],\n", 732 | " \"NAME_CASH_LOAN_PURPOSE_Buying a home\":[\"sum\", \"mean\"],\n", 733 | " \"NAME_CASH_LOAN_PURPOSE_Buying a new car\":[\"sum\", \"mean\"],\n", 734 | " \"NAME_CASH_LOAN_PURPOSE_Buying a used car\":[\"sum\", \"mean\"],\n", 735 | " \"NAME_CASH_LOAN_PURPOSE_Car repairs\":[\"sum\", \"mean\"],\n", 736 | " \"NAME_CASH_LOAN_PURPOSE_Education\":[\"sum\", \"mean\"],\n", 737 | " \"NAME_CASH_LOAN_PURPOSE_Everyday expenses\":[\"sum\", \"mean\"],\n", 738 | " \"NAME_CASH_LOAN_PURPOSE_Furniture\":[\"sum\", \"mean\"],\n", 739 | " \"NAME_CASH_LOAN_PURPOSE_Gasification / water supply\":[\"sum\", \"mean\"],\n", 740 | " \"NAME_CASH_LOAN_PURPOSE_Hobby\":[\"sum\", \"mean\"],\n", 741 | " \"NAME_CASH_LOAN_PURPOSE_Journey\":[\"sum\", \"mean\"],\n", 742 | " \"NAME_CASH_LOAN_PURPOSE_Medicine\":[\"sum\", \"mean\"],\n", 743 | " \"NAME_CASH_LOAN_PURPOSE_Money for a third person\":[\"sum\", \"mean\"],\n", 744 | " \"NAME_CASH_LOAN_PURPOSE_Other\":[\"sum\", \"mean\"],\n", 745 | " \"NAME_CASH_LOAN_PURPOSE_Payments on other loans\":[\"sum\", \"mean\"],\n", 746 | " \"NAME_CASH_LOAN_PURPOSE_Purchase of electronic equipment\":[\"sum\", \"mean\"],\n", 747 | " \"NAME_CASH_LOAN_PURPOSE_Refusal to name the goal\":[\"sum\", \"mean\"],\n", 748 | " \"NAME_CASH_LOAN_PURPOSE_Repairs\":[\"sum\", \"mean\"],\n", 749 | " \"NAME_CASH_LOAN_PURPOSE_Urgent needs\":[\"sum\", \"mean\"],\n", 750 | " \"NAME_CASH_LOAN_PURPOSE_Wedding / gift / holiday\":[\"sum\", \"mean\"],\n", 751 | " \"NAME_CASH_LOAN_PURPOSE_XAP\":[\"sum\", \"mean\"],\n", 752 | " \"NAME_CASH_LOAN_PURPOSE_XNA\":[\"sum\", \"mean\"],\n", 753 | " \"NAME_CONTRACT_STATUS_Approved\":[\"sum\", \"mean\"],\n", 754 | " \"NAME_CONTRACT_STATUS_Canceled\":[\"sum\", \"mean\"],\n", 755 | " \"NAME_CONTRACT_STATUS_Refused\":[\"sum\", \"mean\"],\n", 756 | " \"NAME_CONTRACT_STATUS_Unused offer\":[\"sum\", \"mean\"],\n", 757 | " \"NAME_PAYMENT_TYPE_Cash through the bank\":[\"sum\", \"mean\"],\n", 758 | " \"NAME_PAYMENT_TYPE_Cashless from the account of the employer\":[\"sum\", \"mean\"],\n", 759 | " \"NAME_PAYMENT_TYPE_Non-cash from your account\":[\"sum\", \"mean\"],\n", 760 | " \"NAME_PAYMENT_TYPE_XNA\":[\"sum\", \"mean\"],\n", 761 | " \"CODE_REJECT_REASON_CLIENT\":[\"sum\", \"mean\"],\n", 762 | " \"CODE_REJECT_REASON_HC\":[\"sum\", \"mean\"],\n", 763 | " \"CODE_REJECT_REASON_LIMIT\":[\"sum\", \"mean\"],\n", 764 | " \"CODE_REJECT_REASON_SCO\":[\"sum\", \"mean\"],\n", 765 | " \"CODE_REJECT_REASON_SCOFR\":[\"sum\", \"mean\"],\n", 766 | " \"CODE_REJECT_REASON_SYSTEM\":[\"sum\", \"mean\"],\n", 767 | " \"CODE_REJECT_REASON_VERIF\":[\"sum\", \"mean\"],\n", 768 | " \"CODE_REJECT_REASON_XAP\":[\"sum\", \"mean\"],\n", 769 | " \"CODE_REJECT_REASON_XNA\":[\"sum\", \"mean\"],\n", 770 | " 
\"NAME_TYPE_SUITE_alone\":[\"sum\", \"mean\"],\n", 771 | " \"NAME_TYPE_SUITE_not_alone\":[\"sum\", \"mean\"],\n", 772 | " \"NAME_CLIENT_TYPE_New\":[\"sum\", \"mean\"],\n", 773 | " \"NAME_CLIENT_TYPE_Refreshed\":[\"sum\", \"mean\"],\n", 774 | " \"NAME_CLIENT_TYPE_Repeater\":[\"sum\", \"mean\"],\n", 775 | " \"NAME_CLIENT_TYPE_XNA\":[\"sum\", \"mean\"],\n", 776 | " \"NAME_GOODS_CATEGORY_Audio/Video\":[\"sum\", \"mean\"],\n", 777 | " \"NAME_GOODS_CATEGORY_Clothing and Accessories\":[\"sum\", \"mean\"],\n", 778 | " \"NAME_GOODS_CATEGORY_Computers\":[\"sum\", \"mean\"],\n", 779 | " \"NAME_GOODS_CATEGORY_Construction Materials\":[\"sum\", \"mean\"],\n", 780 | " \"NAME_GOODS_CATEGORY_Consumer Electronics\":[\"sum\", \"mean\"],\n", 781 | " \"NAME_GOODS_CATEGORY_Furniture\":[\"sum\", \"mean\"],\n", 782 | " \"NAME_GOODS_CATEGORY_Mobile\":[\"sum\", \"mean\"],\n", 783 | " \"NAME_GOODS_CATEGORY_Photo / Cinema Equipment\":[\"sum\", \"mean\"],\n", 784 | " \"NAME_GOODS_CATEGORY_XNA\":[\"sum\", \"mean\"],\n", 785 | " \"NAME_GOODS_CATEGORY_others\":[\"sum\", \"mean\"],\n", 786 | " \"NAME_PORTFOLIO_Cards\":[\"sum\", \"mean\"],\n", 787 | " \"NAME_PORTFOLIO_Cars\":[\"sum\", \"mean\"],\n", 788 | " \"NAME_PORTFOLIO_Cash\":[\"sum\", \"mean\"],\n", 789 | " \"NAME_PORTFOLIO_POS\":[\"sum\", \"mean\"],\n", 790 | " \"NAME_PORTFOLIO_XNA\":[\"sum\", \"mean\"],\n", 791 | " \"NAME_PRODUCT_TYPE_XNA\":[\"sum\", \"mean\"],\n", 792 | " \"NAME_PRODUCT_TYPE_walk-in\":[\"sum\", \"mean\"],\n", 793 | " \"NAME_PRODUCT_TYPE_x-sell\":[\"sum\", \"mean\"],\n", 794 | " \"CHANNEL_TYPE_AP+ (Cash loan)\":[\"sum\", \"mean\"],\n", 795 | " \"CHANNEL_TYPE_Car dealer\":[\"sum\", \"mean\"],\n", 796 | " \"CHANNEL_TYPE_Channel of corporate sales\":[\"sum\", \"mean\"],\n", 797 | " \"CHANNEL_TYPE_Contact center\":[\"sum\", \"mean\"],\n", 798 | " \"CHANNEL_TYPE_Country-wide\":[\"sum\", \"mean\"],\n", 799 | " \"CHANNEL_TYPE_Credit and cash offices\":[\"sum\", \"mean\"],\n", 800 | " \"CHANNEL_TYPE_Regional / Local\":[\"sum\", \"mean\"],\n", 801 | " \"CHANNEL_TYPE_Stone\":[\"sum\", \"mean\"],\n", 802 | " \"NAME_SELLER_INDUSTRY_Clothing\":[\"sum\", \"mean\"],\n", 803 | " \"NAME_SELLER_INDUSTRY_Connectivity\":[\"sum\", \"mean\"],\n", 804 | " \"NAME_SELLER_INDUSTRY_Construction\":[\"sum\", \"mean\"],\n", 805 | " \"NAME_SELLER_INDUSTRY_Consumer electronics\":[\"sum\", \"mean\"],\n", 806 | " \"NAME_SELLER_INDUSTRY_Furniture\":[\"sum\", \"mean\"],\n", 807 | " \"NAME_SELLER_INDUSTRY_Industry\":[\"sum\", \"mean\"],\n", 808 | " \"NAME_SELLER_INDUSTRY_XNA\":[\"sum\", \"mean\"],\n", 809 | " \"NAME_SELLER_INDUSTRY_others\":[\"sum\", \"mean\"],\n", 810 | " \"NAME_YIELD_GROUP_XNA\":[\"sum\", \"mean\"],\n", 811 | " \"NAME_YIELD_GROUP_high\":[\"sum\", \"mean\"],\n", 812 | " \"NAME_YIELD_GROUP_low_action\":[\"sum\", \"mean\"],\n", 813 | " \"NAME_YIELD_GROUP_low_normal\":[\"sum\", \"mean\"],\n", 814 | " \"NAME_YIELD_GROUP_middle\":[\"sum\", \"mean\"],\n", 815 | " \"NEW_CHURN_PREV_0\":[\"sum\", \"mean\"],\n", 816 | " \"NEW_CHURN_PREV_1\":[\"sum\", \"mean\"],\n", 817 | " \"NEW_CHURN_PREV_NaN\":[\"sum\", \"mean\"]}\n", 818 | "\n", 819 | " prev_agg_list.update(agg_list_previous_application)\n", 820 | " \n", 821 | " \n", 822 | " return prev_agg_list, df_prev" 823 | ] 824 | }, 825 | { 826 | "cell_type": "code", 827 | "execution_count": null, 828 | "metadata": {}, 829 | "outputs": [], 830 | "source": [] 831 | }, 832 | { 833 | "cell_type": "markdown", 834 | "metadata": {}, 835 | "source": [ 836 | "# Combine" 837 | ] 838 | }, 839 | { 840 | "cell_type": "code", 841 | 
"execution_count": 59, 842 | "metadata": {}, 843 | "outputs": [], 844 | "source": [ 845 | "def pre_processing_and_combine():\n", 846 | "\n", 847 | " \n", 848 | " with timer(\"Process application train\"):\n", 849 | " df = application_train()\n", 850 | " print(\"application train & test shape:\", df.shape)\n", 851 | " \n", 852 | " \n", 853 | " with timer(\"Bureau and Bureau Balance\"):\n", 854 | " bureau_and_bb_agg = bureau_bb()\n", 855 | " print(\"Bureau and Bureau Balance:\", bureau_and_bb_agg.shape)\n", 856 | " \n", 857 | " with timer(\"Installment Payments\"):\n", 858 | " agg_list_previous_application, ins_agg = installments_payments()\n", 859 | " print(\"Installment Payments:\", ins_agg.shape) \n", 860 | " \n", 861 | " with timer(\"Pos Cash Balance\"):\n", 862 | " agg_list_previous_application, pos_agg = pos_cash_balance(agg_list_previous_application)\n", 863 | " print(\"Pos Cash Balance:\", pos_agg.shape) \n", 864 | " \n", 865 | " \n", 866 | " with timer(\"Credit Card Balance\"):\n", 867 | " CCB_agg = credit_card_balance()\n", 868 | " print(\"Credit Card Balance:\", CCB_agg.shape) \n", 869 | " \n", 870 | " with timer(\"previous_application\"):\n", 871 | " prev_agg_list, df_prev = previous_application(agg_list_previous_application)\n", 872 | " print(\"previous_application:\", df_prev.shape) \n", 873 | " \n", 874 | " \n", 875 | " with timer(\"All tables are combining\"):\n", 876 | " df_prev_ins = df_prev.merge(ins_agg, how = 'left', on = 'SK_ID_PREV')\n", 877 | " df_prev_ins_pos = df_prev_ins.merge(pos_agg, how = 'left', on = 'SK_ID_PREV')\n", 878 | " df_prev_ins_pos_agg = df_prev_ins_pos.groupby(\"SK_ID_CURR\").agg(prev_agg_list).reset_index()\n", 879 | " df_prev_ins_pos_agg.columns = pd.Index([\"PREV_\" + col[0] + \"_\" + col[1].upper() for col in df_prev_ins_pos_agg.columns.tolist()])\n", 880 | " df_prev_ins_pos_agg.rename(columns={\"PREV_SK_ID_CURR_\":\"SK_ID_CURR\"}, inplace = True)\n", 881 | " #prev_son ile ana tablo\n", 882 | " df_prev_others = df.merge(df_prev_ins_pos_agg, how = 'left',on = 'SK_ID_CURR')\n", 883 | " \n", 884 | " #credit_card_balance\n", 885 | " df_prev_ins_pos_ccb = df_prev_others.merge(CCB_agg, how = 'left',on = 'SK_ID_CURR')\n", 886 | " \n", 887 | " #bureau_balance\n", 888 | " all_data = df_prev_ins_pos_ccb.merge(bureau_and_bb_agg, how = 'left',on = 'SK_ID_CURR')\n", 889 | " \n", 890 | " print(\"all_data process:\", all_data.shape) \n", 891 | "\n", 892 | " \n", 893 | " \n", 894 | " return all_data\n", 895 | " " 896 | ] 897 | }, 898 | { 899 | "cell_type": "code", 900 | "execution_count": null, 901 | "metadata": {}, 902 | "outputs": [], 903 | "source": [] 904 | }, 905 | { 906 | "cell_type": "markdown", 907 | "metadata": {}, 908 | "source": [ 909 | "# Model Tuning" 910 | ] 911 | }, 912 | { 913 | "cell_type": "code", 914 | "execution_count": 60, 915 | "metadata": {}, 916 | "outputs": [], 917 | "source": [ 918 | "#lgbm = LGBMClassifier()\n", 919 | "\n", 920 | "#lgbm_params = {\"learning_rate\": [0.001, 0.01, 0.1],\n", 921 | "# \"n_estimators\": [200, 500, 100],\n", 922 | "# \"max_depth\":[1,2,35,8]}" 923 | ] 924 | }, 925 | { 926 | "cell_type": "code", 927 | "execution_count": 61, 928 | "metadata": {}, 929 | "outputs": [], 930 | "source": [ 931 | "#train = all_data[all_data['TARGET'].notnull()]\n", 932 | "#y_train = train[\"TARGET\"]\n", 933 | "#X_train = train.drop(\"TARGET\", axis = 1)\n", 934 | "\n", 935 | "#lgbm_cv_model = GridSearchCV(lgbm,lgbm_params, cv = 10, n_jobs = -1, verbose = 4).fit(X_train, y_train)\n", 936 | "#lgbm_cv_model.best_params_" 937 | ] 938 | 
},
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Machine Learning"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [],
   "source": [
    "def modeling(all_data):\n",
    "    \"\"\"10-fold LightGBM: out-of-fold AUC on train, averaged fold predictions on test.\"\"\"\n",
    "\n",
    "    train_df = all_data[all_data['TARGET'].notnull()]\n",
    "    test_df = all_data[all_data['TARGET'].isnull()].copy()  # .copy() so the TARGET assignment below is safe\n",
    "\n",
    "    folds = KFold(n_splits=10, shuffle=True, random_state=1001)\n",
    "\n",
    "    oof_preds = np.zeros(train_df.shape[0])\n",
    "    sub_preds = np.zeros(test_df.shape[0])\n",
    "    feature_importance_df = pd.DataFrame()\n",
    "\n",
    "    feats = [f for f in train_df.columns if f not in ['TARGET', 'SK_ID_CURR']]\n",
    "\n",
    "    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])):\n",
    "\n",
    "        train_x, train_y = train_df[feats].iloc[train_idx], train_df['TARGET'].iloc[train_idx]\n",
    "        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx]\n",
    "\n",
    "        clf = LGBMClassifier(\n",
    "            n_jobs=-1,\n",
    "            n_estimators=10000,\n",
    "            learning_rate=0.02,\n",
    "            num_leaves=34,\n",
    "            colsample_bytree=0.9497036,\n",
    "            subsample=0.8715623,\n",
    "            max_depth=8,\n",
    "            reg_alpha=0.041545473,\n",
    "            reg_lambda=0.0735294,\n",
    "            min_split_gain=0.0222415,\n",
    "            min_child_weight=39.3259775,\n",
    "            silent=-1,\n",
    "            verbose=-1)\n",
    "\n",
    "        clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)],\n",
    "                eval_metric='auc', verbose=200, early_stopping_rounds=200)\n",
    "\n",
    "        # out-of-fold predictions for this fold's validation rows\n",
    "        oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]\n",
    "        # test predictions averaged over the fold models\n",
    "        sub_preds += clf.predict_proba(test_df[feats], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits\n",
    "\n",
    "        fold_importance_df = pd.DataFrame()\n",
    "        fold_importance_df[\"feature\"] = feats\n",
    "        fold_importance_df[\"importance\"] = clf.feature_importances_\n",
    "        fold_importance_df[\"fold\"] = n_fold + 1\n",
    "        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)\n",
    "\n",
    "        print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))\n",
    "\n",
    "    print('Full AUC score %.6f' % roc_auc_score(train_df['TARGET'], oof_preds))\n",
    "\n",
    "    test_df['TARGET'] = sub_preds\n",
    "    test_df[['SK_ID_CURR', 'TARGET']].to_csv(\"dsmlbc1_submission.csv\", index=False)\n",
    "\n",
    "    display_importances(feature_importance_df)\n",
    "\n",
    "    return feature_importance_df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# main"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [],
   "source": [
    "def main():\n",
    "\n",
    "    with timer(\"Preprocessing Time\"):\n",
    "        all_data = pre_processing_and_combine()\n",
    "\n",
    "    with timer(\"Modeling\"):\n",
    "        feat_importance = modeling(all_data)\n",
    "\n",
    "    return feat_importance"
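   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To make the cross-validation bookkeeping in `modeling()` easier to follow, the hypothetical sketch below reproduces the same out-of-fold (OOF) pattern on synthetic data: every training row is predicted exactly once, by the fold model that never saw it, and test predictions are averaged over all fold models. The names (`X_tr`, `X_te`, `oof`, ...) are illustrative stand-ins, not part of the pipeline."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hypothetical toy illustration of the OOF scheme used in modeling().\n",
    "import numpy as np\n",
    "from lightgbm import LGBMClassifier\n",
    "from sklearn.model_selection import KFold\n",
    "from sklearn.metrics import roc_auc_score\n",
    "\n",
    "rng = np.random.RandomState(0)\n",
    "X_tr, X_te = rng.randn(300, 5), rng.randn(100, 5)\n",
    "y_tr = (X_tr[:, 0] > 0).astype(int)\n",
    "\n",
    "folds = KFold(n_splits=5, shuffle=True, random_state=1001)\n",
    "oof = np.zeros(len(X_tr))\n",
    "test_pred = np.zeros(len(X_te))\n",
    "\n",
    "for tr_idx, va_idx in folds.split(X_tr):\n",
    "    clf = LGBMClassifier(n_estimators=50).fit(X_tr[tr_idx], y_tr[tr_idx])\n",
    "    # each training row is scored only by the model that did not see it\n",
    "    oof[va_idx] = clf.predict_proba(X_tr[va_idx])[:, 1]\n",
    "    # test predictions: mean over the fold models\n",
    "    test_pred += clf.predict_proba(X_te)[:, 1] / folds.n_splits\n",
    "\n",
    "print('toy OOF AUC:', round(roc_auc_score(y_tr, oof), 4))"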
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "if __name__ == \"__main__\":\n",
    "    with timer(\"Full model run\"):\n",
    "        main()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# dsmlbc1_ws: full run time 2115 s\n",
    "# dsmlbc1_submission score: 0.79441"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
--------------------------------------------------------------------------------