├── .gitignore
├── .DS_Store
├── models
│   ├── .DS_Store
│   ├── reference
│   │   └── lightgbm_final_model.pkl
│   ├── dsmlbc2
│   │   ├── ATILLA_MUHAMMET.py
│   │   └── merve_betul.ipynb
│   └── dsmlbc1
│       └── homeCreditRiskFinal.ipynb
├── outputs
│   ├── features
│   │   ├── fold_auc_best_df.pkl
│   │   ├── lgbm_importances.png
│   │   ├── feature_importance_df.pkl
│   │   ├── features.py
│   │   └── features.ipynb
│   └── hyperparameters
│       ├── lightgbm_model.pkl
│       └── hyperparameters.pkl
├── scripts
│   ├── __pycache__
│   │   ├── train.cpython-37.pyc
│   │   ├── pre_processing.cpython-37.pyc
│   │   └── helper_functions.cpython-37.pyc
│   ├── predict.py
│   ├── model_tuning.py
│   ├── helper_functions.py
│   ├── feature_selection.py
│   ├── train.py
│   └── pre_processing.py
├── .idea
│   ├── vcs.xml
│   ├── other.xml
│   ├── .gitignore
│   ├── misc.xml
│   ├── inspectionProfiles
│   │   ├── profiles_settings.xml
│   │   └── Project_Default.xml
│   ├── modules.xml
│   ├── home_credit.iml
│   └── datalore.xml
├── README.md
├── Makefile
├── main.py
└── requirements.txt
/.gitignore:
--------------------------------------------------------------------------------
1 | data
2 |
--------------------------------------------------------------------------------
/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mvahit/home_credit/HEAD/.DS_Store
--------------------------------------------------------------------------------
/models/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mvahit/home_credit/HEAD/models/.DS_Store
--------------------------------------------------------------------------------
/outputs/features/fold_auc_best_df.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mvahit/home_credit/HEAD/outputs/features/fold_auc_best_df.pkl
--------------------------------------------------------------------------------
/outputs/features/lgbm_importances.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mvahit/home_credit/HEAD/outputs/features/lgbm_importances.png
--------------------------------------------------------------------------------
/models/reference/lightgbm_final_model.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mvahit/home_credit/HEAD/models/reference/lightgbm_final_model.pkl
--------------------------------------------------------------------------------
/outputs/features/feature_importance_df.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mvahit/home_credit/HEAD/outputs/features/feature_importance_df.pkl
--------------------------------------------------------------------------------
/outputs/hyperparameters/lightgbm_model.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mvahit/home_credit/HEAD/outputs/hyperparameters/lightgbm_model.pkl
--------------------------------------------------------------------------------
/scripts/__pycache__/train.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mvahit/home_credit/HEAD/scripts/__pycache__/train.cpython-37.pyc
--------------------------------------------------------------------------------
/outputs/hyperparameters/hyperparameters.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mvahit/home_credit/HEAD/outputs/hyperparameters/hyperparameters.pkl
--------------------------------------------------------------------------------
/scripts/__pycache__/pre_processing.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mvahit/home_credit/HEAD/scripts/__pycache__/pre_processing.cpython-37.pyc
--------------------------------------------------------------------------------
/scripts/__pycache__/helper_functions.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mvahit/home_credit/HEAD/scripts/__pycache__/helper_functions.cpython-37.pyc
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/other.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 | # Datasource local storage ignored files
5 | /dataSources/
6 | /dataSources.local.xml
7 | # Editor-based HTTP Client requests
8 | /httpRequests/
9 |
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/home_credit.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # production_ready_home_credit_default_risk_model
2 |
3 |
4 | # Kaggle setup:
5 |
6 | 1. pip install kaggle
7 | 2. Go to https://www.kaggle.com//account and click "Create API".
8 | 3. A json file will be downloaded. Create a folder named ".kaggle" under your home directory and place the file inside it.
9 | 4. To block access by other users: "chmod 600 /Users/mvahit/.kaggle/kaggle.json"
10 | 5. Go!
11 |
12 |
13 | # Ignoring a folder (GitHub Desktop)
14 |
15 | 1. Open the desktop application
16 | 2. Open the relevant project
17 | 3. Go to the Repository section in the top menu
18 | 4. Repo settings
19 | 5. Ignored files
20 |
21 |
22 |
--------------------------------------------------------------------------------
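The same Kaggle setup, driven from Python instead of the CLI: a minimal sketch using the kaggle package's Python API (the competition slug home-credit-default-risk comes from the Makefile's submit target; downloading into data/ is an assumption based on the .gitignore entry).

```python
# Sketch: fetch the competition data with the kaggle Python API.
# Assumes ~/.kaggle/kaggle.json is already in place (steps above).
from kaggle.api.kaggle_api_extended import KaggleApi

api = KaggleApi()
api.authenticate()  # reads ~/.kaggle/kaggle.json
# Downloads a zip of all competition files into data/ (unzip it afterwards)
api.competition_download_files("home-credit-default-risk", path="data")
```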
/outputs/features/features.py:
--------------------------------------------------------------------------------
1 | # TODO: collect the feature names properly and dump them to a txt file
2 |
3 |
4 | import pandas as pd
5 | pd.set_option('display.max_columns', None)
6 | df = pd.read_pickle("/Users/mvahit/Documents/GitHub/home_credit/outputs/features/feature_importance_df.pkl")
7 | df.head()
8 | df = df.groupby("feature")["importance"].agg(["mean"]).sort_values(by="mean", ascending=False)
9 | df.head()
10 |
11 | df[df["mean"] > 0]
12 | df.shape
13 |
14 | df2 = pd.read_pickle("/Users/mvahit/Documents/GitHub/home_credit/outputs/features/fold_auc_best_df.pkl")
15 |
16 |
17 | # FINAL DF
18 |
19 | import pandas as pd
20 | pd.set_option('display.max_columns', None)
21 | df = pd.read_pickle("/Users/mvahit/Documents/GitHub/home_credit/data/final_train_df.pkl")
22 | df.head()
23 | df.shape
24 |
25 | [col for col in df.columns if col.startswith("APP")]
26 | a = df[[col for col in df.columns if col.startswith("APP")]].head()
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | # GITHUB
2 |
3 | commit:
4 | git commit -am "commit from make file"
5 |
6 | push:
7 | git push origin master
8 |
9 | pull:
10 | git pull origin master
11 |
12 | fetch:
13 | git fetch origin master
14 |
15 | reset:
16 | rm -f .git/index
17 | git reset
18 |
19 | req:
20 | pip freeze > requirements.txt
21 |
22 | compush: req commit push
23 |
24 |
25 |
26 | # CONSOLE RUN
27 | run_no_debug:
28 | python main.py --no-debug
29 |
30 | run:
31 | python main.py
32 |
33 |
34 | # MODEL TUNING
35 |
36 | tuning:
37 | python scripts/model_tuning.py
38 |
39 |
40 | # predict the train-set values with predict.py and print the AUC score
41 | predict:
42 | python scripts/predict.py
43 |
44 | # predict the test-set values with predict.py
45 | predict_test:
46 | python scripts/predict.py --test
47 |
48 | # submit the predictions produced by predict.py to kaggle
49 | kaggle_submit_predict:
50 | kaggle competitions submit -c home-credit-default-risk -f outputs/predictions/sub_from_prediction_py.csv -m "Message"
51 |
52 | muhat:
53 | python models/dsmlbc2/muhat.py
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/Project_Default.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/datalore.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/scripts/predict.py:
--------------------------------------------------------------------------------
1 | """modele train ya test bağımsız değişken değerlerini sor"""
2 |
3 | import os
4 | import pickle
5 | import pandas as pd
6 | from sklearn.metrics import roc_auc_score
7 | import argparse
8 |
9 | parser = argparse.ArgumentParser()
10 | parser.add_argument('--train', dest='prediction_type', action='store_true')
11 | parser.add_argument('--test', dest='prediction_type', action='store_false')
12 | parser.set_defaults(prediction_type=True)
13 | args = parser.parse_args()
14 |
15 | final_train = pd.read_pickle("data/final_train_df.pkl")
16 | final_test = pd.read_pickle("data/final_test_df.pkl")
17 |
18 | feats = [f for f in final_test.columns if f not in ['TARGET', 'SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_PREV', 'index',
19 | "APP_index", "BURO_index", "PREV_index", "INSTAL_index",
20 | "CC_index", "POS_index"]]
21 |
22 | if args.prediction_type:
23 | y_train = final_train["TARGET"]
24 | x_train = final_train[feats]
25 |
26 | cur_dir = os.getcwd()
27 | os.chdir('models/reference/')
28 | model = pickle.load(open('lightgbm_final_model.pkl', 'rb'))
29 | os.chdir(cur_dir)
30 |
31 | y_pred = model.predict_proba(x_train)[:, 1]
32 | print("TRAIN AUC SCORE:", roc_auc_score(y_train, y_pred))
33 | else:
34 | x_test = final_test[feats]
35 | cur_dir = os.getcwd()
36 | os.chdir('models/reference/')
37 | model = pickle.load(open('lightgbm_final_model.pkl', 'rb'))
38 | os.chdir(cur_dir)
39 | y_pred = model.predict_proba(x_test)[:, 1]
40 | ids = final_test['SK_ID_CURR']
41 | submission = pd.DataFrame({'SK_ID_CURR': ids, 'TARGET': y_pred})
42 | os.chdir('outputs/predictions/')
43 | submission.to_csv("sub_from_prediction_py.csv", index=False)
44 | print("Submission file has been created in:", "/Users/mvahit/Documents/GitHub/home_credit/predictions/")
45 |
46 | # to run:
47 | # python scripts/predict.py --train
48 |
--------------------------------------------------------------------------------
/scripts/model_tuning.py:
--------------------------------------------------------------------------------
1 | """Model tuning scripti calistiginda hyperparameters klasörüne iki sonuc uretecek:
2 |
3 | hyperparameters.pkl
4 | lightgbm_model.pkl
5 |
6 | """
7 |
8 | # TODO: feed the selected feature names into the model and tune with those features
9 |
10 | import os
11 | import pickle
12 | from lightgbm import LGBMClassifier
13 | import pandas as pd
14 | from sklearn.model_selection import GridSearchCV
15 |
16 | lgbm = LGBMClassifier()
17 |
18 | lgbm_params = {"learning_rate": [0.01, 0.1],
19 | "n_estimators": [200, 100]}
20 |
21 | df = pd.read_pickle("data/final_train_df.pkl")
22 |
23 |
24 | y_train = df["TARGET"]
25 |
26 | X_train = df.drop("TARGET", axis=1)
27 |
28 | lgbm_cv_model = GridSearchCV(lgbm,
29 | lgbm_params,
30 | cv=5,
31 | n_jobs=-1,
32 | verbose=2).fit(X_train, y_train)
33 |
34 | # dir(lgbm_cv_model)  # console-only inspection; not needed by the script
35 | params = lgbm_cv_model.best_params_
36 |
37 | # saving hyperparameters and model
38 | cur_dir = os.getcwd()
39 | os.chdir('outputs/hyperparameters/')
40 | pickle.dump(params, open("hyperparameters.pkl", 'wb')) # hyperparameters
41 | pickle.dump(lgbm_cv_model, open("lightgbm_model.pkl", 'wb')) # model
42 | os.chdir(cur_dir)
43 |
44 | print("Best hyperparameters", params)
45 |
46 |
47 | # loading and prediction with model
48 |
49 | # del lgbm_cv_model
50 | cur_dir = os.getcwd()
51 | os.chdir('/Users/mvahit/Documents/GitHub/home_credit/outputs/hyperparameters/')
52 | model = pickle.load(open('lightgbm_model.pkl', 'rb'))
53 | os.chdir(cur_dir)
54 | model.predict(X_train.head())
55 |
56 | # loading hyperparameters
57 | del model
58 | del params
59 | cur_dir = os.getcwd()
60 | os.chdir('/Users/mvahit/Documents/GitHub/home_credit/outputs/hyperparameters/')
61 | params = pickle.load(open('hyperparameters.pkl', 'rb'))
62 | final_lgbm = LGBMClassifier(**params).fit(X_train, y_train)
63 | final_lgbm.get_params()
64 | final_lgbm.predict(X_train.head())
65 |
66 |
--------------------------------------------------------------------------------
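Since the GridSearchCV object is pickled whole and refit defaults to True, the loaded object already carries the refit best model; a small sketch of pulling it out (paths as used in the script above):

```python
import pickle

# Sketch: load the pickled search object and grab the refit best model.
with open("outputs/hyperparameters/lightgbm_model.pkl", "rb") as f:
    cv_model = pickle.load(f)

best_model = cv_model.best_estimator_  # fitted LGBMClassifier
print(cv_model.best_params_)           # same dict saved to hyperparameters.pkl
```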
/scripts/helper_functions.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | # One-hot encoding for categorical columns with get_dummies
3 | def one_hot_encoder(df, nan_as_category=True):
4 | import pandas as pd
5 | original_columns = list(df.columns)
6 | categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
7 | df = pd.get_dummies(df, columns=categorical_columns, dummy_na=nan_as_category)
8 | new_columns = [c for c in df.columns if c not in original_columns]
9 | return df, new_columns
10 |
11 |
12 | # command line access for debugging
13 | def get_namespace():
14 | import argparse
15 | parser = argparse.ArgumentParser()
16 | parser.add_argument('--debug', dest='debug', action='store_true')
17 | parser.add_argument('--no-debug', dest='debug', action='store_false')
18 | parser.set_defaults(debug=True)
19 | return parser.parse_args()
20 |
21 |
22 | # i love data science
23 | def i_love_ds():
24 | print('\n'.join([''.join([(' I_Love_Data_Science_'[(x - y) % len('I_Love_Data_Science_')]
25 | if ((x * 0.05) ** 2 + (y * 0.1) ** 2 - 1) ** 3 - (x * 0.05) ** 2 * (
26 | y * 0.1) ** 3 <= 0 else ' ')
27 | for x in range(-30, 30)]) for y in range(15, -15, -1)]))
28 |
29 |
30 | # Display/plot feature importance
31 | def display_importances(feature_importance_df_):
32 | import seaborn as sns
33 | import matplotlib.pyplot as plt
34 | cols = (feature_importance_df_[["feature", "importance"]]
35 | .groupby("feature")
36 | .mean()
37 | .sort_values(by="importance", ascending=False)[:100].index)
38 | best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]
39 | plt.figure(figsize=(10, 20))
40 | sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False))
41 | plt.title('LightGBM Features (avg over folds)')
42 | plt.tight_layout()
43 | plt.savefig('outputs/features/lgbm_importances.png')
44 |
45 |
46 | # missing values
47 | #
48 | # def missing_values(df):
49 | #
50 | # cols_with_na = [col for col in df.columns if df[col].isnull().sum() > 0]
51 | # for col in cols_with_na:
52 | # print(col, np.round(df[col].isnull().mean(), 3), "% missing values")
53 |
54 |
55 |
56 | # # saving models
57 | # def saving_models():
58 | # import os
59 | # cur_dir = os.getcwd()
60 | # os.chdir('/models/reference/')
61 | # model_name = "lightgbm_fold_" + str(n_fold + 1) + "." + "pkl"
62 | # pickle.dump(model, open(model_name, 'wb')) # model
63 | # os.chdir(cur_dir)
64 |
65 |
--------------------------------------------------------------------------------
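A toy usage sketch for one_hot_encoder (illustrative data only): with nan_as_category=True, get_dummies adds a _nan column per categorical column, so missingness survives the encoding.

```python
import numpy as np
import pandas as pd

from scripts.helper_functions import one_hot_encoder

# Toy frame: one categorical column containing a missing value.
toy = pd.DataFrame({"CODE_GENDER": ["M", "F", np.nan],
                    "AMT_CREDIT": [1e5, 2e5, 3e5]})
encoded, new_cols = one_hot_encoder(toy, nan_as_category=True)
print(new_cols)  # ['CODE_GENDER_F', 'CODE_GENDER_M', 'CODE_GENDER_nan']
```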
/main.py:
--------------------------------------------------------------------------------
1 | # HOME CREDIT DEFAULT RISK RUNNER FUNCTION
2 |
3 | # Linux
4 | # Context Manager
5 | # Decorator
6 | # requirements.txt
7 | # virtual env
8 | # Makefile
9 | # git github
10 | # CLIs
11 |
12 |
13 | import gc
14 | import time
15 | from contextlib import contextmanager
16 | import warnings
17 |
18 |
19 | from scripts.helper_functions import get_namespace, i_love_ds
20 |
21 | from scripts.pre_processing import application_train_test, bureau_and_balance, previous_applications, pos_cash, \
22 | installments_payments, credit_card_balance
23 |
24 | from scripts.train import kfold_lightgbm
25 |
26 | warnings.simplefilter(action='ignore', category=FutureWarning)
27 |
28 |
29 | @contextmanager
30 | def timer(title):
31 | t0 = time.time()
32 | yield
33 | print("{} - done in {:.0f}s".format(title, time.time() - t0))
34 |
35 |
36 | def main(debug=False):
37 | num_rows = 10000 if debug else None
38 |
39 | with timer("Pre-Processing"):
40 |
41 | i_love_ds()
42 |
43 | # application_train_test
44 | df = application_train_test(num_rows)
45 | # bureau & bureau_balance
46 | bureau = bureau_and_balance(num_rows)
47 | df = df.join(bureau, how='left', on='SK_ID_CURR')
48 | del bureau
49 | # previous_applications
50 | prev = previous_applications(num_rows)
51 | df = df.join(prev, how='left', on='SK_ID_CURR')
52 | del prev
53 | # pos_cash
54 | pos = pos_cash(num_rows)
55 | df = df.join(pos, how='left', on='SK_ID_CURR')
56 | del pos
57 | # installments_payments
58 | ins = installments_payments(num_rows)
59 | df = df.join(ins, how='left', on='SK_ID_CURR')
60 | del ins
61 | # credit_card_balance
62 | cc = credit_card_balance(num_rows)
63 | df = df.join(cc, how='left', on='SK_ID_CURR')
64 | del cc
65 |
66 | # saving final dataframes
67 | train_df = df[df['TARGET'].notnull()]
68 | test_df = df[df['TARGET'].isnull()]
69 | train_df.to_pickle("data/final_train_df.pkl")
70 | test_df.to_pickle("data/final_test_df.pkl")
71 |
72 | del train_df, test_df
73 | gc.collect()
74 |
75 | with timer("Run LightGBM"):
76 | feat_importance = kfold_lightgbm(df, debug=debug)
77 |
78 |
79 | if __name__ == "__main__":
80 | namespace = get_namespace()
81 | with timer("Full model run"):
82 | main(debug=namespace.debug)
83 |
84 | # kaggle model run: 7879s
85 | # server: 8290s
86 | # mac: 5073s
87 | # google 8: 3189s
88 | # workstation: 1987s
89 | # submission public score: 0.79186
90 |
91 | # 0.79557 mehmet_okan_kasim
92 |
93 |
94 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | appnope==0.1.0
2 | attrs @ file:///tmp/build/80754af9/attrs_1598374659300/work
3 | backcall==0.2.0
4 | bleach==3.1.5
5 | certifi==2020.6.20
6 | chardet==3.0.4
7 | colorama==0.4.3
8 | cycler==0.10.0
9 | decorator==4.4.2
10 | defusedxml==0.6.0
11 | entrypoints==0.3
12 | feature-engine==0.6.0
13 | idna==2.10
14 | importlib-metadata @ file:///opt/concourse/worker/volumes/live/84197498-cbc0-4436-7ce0-03c4490b7a28/volume/importlib-metadata_1593446431408/work
15 | iniconfig==1.0.1
16 | ipykernel @ file:///opt/concourse/worker/volumes/live/73e8766c-12c3-4f76-62a6-3dea9a7da5b7/volume/ipykernel_1596206701501/work/dist/ipykernel-5.3.4-py3-none-any.whl
17 | ipython @ file:///opt/concourse/worker/volumes/live/bb221eaa-cc1a-4ab2-40f7-74a2020a44b1/volume/ipython_1599056234390/work
18 | ipython-genutils==0.2.0
19 | ipywidgets==7.5.1
20 | jedi @ file:///opt/concourse/worker/volumes/live/152cd167-7b79-4fbd-5c97-d7b338805c2b/volume/jedi_1598371617305/work
21 | Jinja2==2.11.2
22 | joblib @ file:///tmp/build/80754af9/joblib_1594236160679/work
23 | jsonschema==3.2.0
24 | jupyter==1.0.0
25 | jupyter-client @ file:///tmp/build/80754af9/jupyter_client_1594826976318/work
26 | jupyter-console @ file:///tmp/build/80754af9/jupyter_console_1598884538475/work
27 | jupyter-core==4.6.3
28 | kaggle==1.5.8
29 | keyring==21.4.0
30 | kiwisolver==1.2.0
31 | lightgbm==2.3.0
32 | MarkupSafe==1.1.1
33 | matplotlib==3.3.1
34 | missingno==0.4.2
35 | mistune==0.8.4
36 | mkl-fft==1.1.0
37 | mkl-random==1.1.1
38 | mkl-service==2.3.0
39 | mlxtend==0.17.3
40 | more-itertools==8.5.0
41 | nbconvert==5.6.1
42 | nbformat==5.0.7
43 | notebook==6.0.3
44 | numpy==1.19.2
45 | packaging==20.4
46 | pandas==1.1.2
47 | pandocfilters==1.4.2
48 | parso==0.7.0
49 | patsy==0.5.1
50 | pexpect==4.8.0
51 | pickleshare==0.7.5
52 | Pillow==7.2.0
53 | pkginfo==1.5.0.1
54 | pluggy==0.13.1
55 | prometheus-client==0.8.0
56 | prompt-toolkit @ file:///tmp/build/80754af9/prompt-toolkit_1598885458782/work
57 | ptyprocess==0.6.0
58 | py==1.9.0
59 | Pygments==2.6.1
60 | PyMySQL==0.10.1
61 | pyparsing==2.4.7
62 | pyrsistent==0.16.0
63 | pytest==6.0.2
64 | python-dateutil==2.8.1
65 | python-slugify==4.0.1
66 | pytz==2020.1
67 | pyzmq==19.0.1
68 | qtconsole @ file:///tmp/build/80754af9/qtconsole_1598374667791/work
69 | QtPy==1.9.0
70 | readme-renderer==26.0
71 | requests==2.24.0
72 | requests-toolbelt==0.9.1
73 | rfc3986==1.4.0
74 | scikit-learn @ file:///opt/concourse/worker/volumes/live/2dacdc11-21e7-44f5-57b4-6b8eb6ceb626/volume/scikit-learn_1598376924598/work
75 | scipy @ file:///opt/concourse/worker/volumes/live/9698578f-91da-4d5f-6fce-b26b1f42eb5a/volume/scipy_1597686637948/work
76 | seaborn==0.11.0
77 | Send2Trash==1.5.0
78 | six==1.15.0
79 | slugify==0.0.1
80 | statsmodels==0.12.0
81 | terminado==0.8.3
82 | testpath==0.4.4
83 | text-unidecode==1.3
84 | threadpoolctl @ file:///tmp/tmp9twdgx9k/threadpoolctl-2.1.0-py3-none-any.whl
85 | toml==0.10.1
86 | tornado==6.0.4
87 | tqdm==4.49.0
88 | traitlets==4.3.3
89 | twine==3.2.0
90 | urllib3==1.24.3
91 | wcwidth @ file:///tmp/build/80754af9/wcwidth_1593447189090/work
92 | webencodings==0.5.1
93 | widgetsnbextension==3.5.1
94 | xlrd==1.2.0
95 | zipp==3.1.0
96 |
--------------------------------------------------------------------------------
/outputs/features/features.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true,
8 | "pycharm": {
9 | "name": "#%% feature importance\n"
10 | }
11 | },
12 | "outputs": [],
13 | "source": [
14 | "import pandas as pd\n",
15 | "pd.set_option('display.max_columns', None)\n",
16 | "df = pd.read_pickle(\"/Users/mvahit/Documents/GitHub/home_credit/outputs/features/feature_importance_df.pkl\")\n",
17 | "df = df.groupby(\"feature\")[\"importance\"].agg({\"mean\"}).sort_values(by=\"mean\", ascending=False)"
18 | ]
19 | },
20 | {
21 | "cell_type": "code",
22 | "execution_count": 10,
23 | "outputs": [],
24 | "source": [
25 | "df = pd.read_pickle(\"/Users/mvahit/Documents/GitHub/home_credit/outputs/features/fold_auc_best_df.pkl\")"
26 | ],
27 | "metadata": {
28 | "collapsed": false,
29 | "pycharm": {
30 | "name": "#%% best iteration\n"
31 | }
32 | }
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": 11,
37 | "outputs": [
38 | {
39 | "data": {
40 | "text/plain": " FOLD AUC BEST_ITER\n0 1.0 0.764298 393.0\n1 2.0 0.818620 115.0\n2 3.0 0.754580 283.0\n3 4.0 0.780804 283.0\n4 5.0 0.797317 197.0",
41 | "text/html": "\n\n
\n \n \n | \n FOLD | \n AUC | \n BEST_ITER | \n
\n \n \n \n | 0 | \n 1.0 | \n 0.764298 | \n 393.0 | \n
\n \n | 1 | \n 2.0 | \n 0.818620 | \n 115.0 | \n
\n \n | 2 | \n 3.0 | \n 0.754580 | \n 283.0 | \n
\n \n | 3 | \n 4.0 | \n 0.780804 | \n 283.0 | \n
\n \n | 4 | \n 5.0 | \n 0.797317 | \n 197.0 | \n
\n \n
\n
"
42 | },
43 | "execution_count": 11,
44 | "metadata": {},
45 | "output_type": "execute_result"
46 | }
47 | ],
48 | "source": [
49 | "df.head()\n"
50 | ],
51 | "metadata": {
52 | "collapsed": false,
53 | "pycharm": {
54 | "name": "#%%\n"
55 | }
56 | }
57 | },
58 | {
59 | "cell_type": "code",
60 | "execution_count": null,
61 | "outputs": [],
62 | "source": [],
63 | "metadata": {
64 | "collapsed": false,
65 | "pycharm": {
66 | "name": "#%%\n"
67 | }
68 | }
69 | }
70 | ],
71 | "metadata": {
72 | "kernelspec": {
73 | "display_name": "Python 3",
74 | "language": "python",
75 | "name": "python3"
76 | },
77 | "language_info": {
78 | "codemirror_mode": {
79 | "name": "ipython",
80 | "version": 2
81 | },
82 | "file_extension": ".py",
83 | "mimetype": "text/x-python",
84 | "name": "python",
85 | "nbconvert_exporter": "python",
86 | "pygments_lexer": "ipython2",
87 | "version": "2.7.6"
88 | }
89 | },
90 | "nbformat": 4,
91 | "nbformat_minor": 0
92 | }
--------------------------------------------------------------------------------
/scripts/feature_selection.py:
--------------------------------------------------------------------------------
1 | # MISSING VALUES, OUTLIERS, FEATURE SCALING:
2 | # THESE ARE IMPORTANT FOR LINEAR MODELS, SVM, ANN, KNN AND OTHER DISTANCE-BASED METHODS.
3 |
4 | # FOR TREE-BASED METHODS THEIR IMPORTANCE IS VERY LOW.
5 |
6 | # Filter Methods (statistical methods: correlation, chi-squared)
7 | # Wrapper Methods (backward selection, forward selection, stepwise)
8 | # Embedded (Tree Based Methods, Ridge, Lasso)
9 |
10 | # Tree Based Methods
11 | # correlation, chi-squared
12 |
13 | # TODO TREE BASED SELECTION
14 |
15 | # TODO: keep the names of all variables, the numeric variables, the categorical variables (binary or multi-class)
16 | # and the newly derived variables in separate lists.
17 |
18 | all_cols = []  # the target should not be in here
19 | num_cols = [col for col in df.columns if df[col].dtypes != 'O']  # assumes df is already loaded
20 | cat_cols = []
21 | new_cols = []
22 | target = []
23 |
24 | # TODO: build random forests, lightgbm, xgboost and catboost models.
25 | # Run a moderate hyperparameter optimization on these models. Fit the final models.
26 | # Ask each of these models for feature importances. Collect all the importances in one df.
27 | # The columns of this df should look like this:
28 |
29 | # model_name feature_name feature_importance
30 |
31 | # TODO: analyze the resulting df. Take the mean of the importances with groupby and sort the variable importances.
32 | # Find the most important variables. Drop the variables whose importance is below zero.
33 | # Store the names of the variables you finally decide on as follows:
34 |
35 | features_based_trees = []
36 |
37 | # TODO: Important note. Following the steps above, the categorical variables need to be inspected
38 | # with a particular focus on catboost's results.
39 | # Using the cat_cols list kept at the start of the study,
40 | # examine, for the categorical variables only, what level of importance each tree assigned,
41 | # find the variables that the other algorithms rate unimportant but catboost rates important,
42 | # and save them as follows:
43 |
44 | features_catboost_cat = []
45 |
46 | # TODO: find the variables that are NOT in the features_based_trees list but ARE in the catboost_cat_imp list
47 | # and add these variables to the features_based_trees list.
48 |
49 |
50 | # TODO STATISTICAL SELECTION
51 |
52 | # TODO: look at the pairwise correlations between the independent variables; among variables
53 | # correlated above 75 percent with each other, pick one variable at random
54 | # and save the variable names as below;
55 | # save the names of the eliminated variables as below too:
56 |
57 | features_based_correlation = []
58 | features_dropped_based_correlation = []
59 |
60 |
61 | # TODO: focus on the features that are in the features_based_trees list and at the same time in the
62 | # features_dropped_based_correlation list; inspect them and delete the variables you see fit from the
63 | # features_based_trees list, or move them from the drop list to the tree list
64 |
65 | # TODO: apply a chi-squared test between the categorical variables in the dataset and the dependent variable,
66 | # and based on the test result store the variables that have a dependency with the target as follows:
67 |
68 | cat_cols_chi = []
69 |
70 | # TODO: compare the variables coming from the step above with the features_based_trees list. Analyze the situation.
71 | # Consider adding the variables that are in the cat_cols_chi list but not in the features_based_trees list,
72 | # or consider removing the variables that are in features_based_trees but not in cat_cols_chi.
73 | # What I mean by "consider" is left to your own judgment.
74 |
75 |
76 | # TODO: finally, save the selected features under the naming below (a sketch of the statistical steps follows this file):
77 |
78 |
79 | features_selected = []
80 |
81 | # TODO: run hyperparameter optimization for lightgbm by tuning with the selected features.
82 | # TODO: build the final model with the new hyperparameters.
--------------------------------------------------------------------------------
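A hedged sketch of the two statistical steps the TODOs above describe, assuming a loaded dataframe df with a binary TARGET; the 0.75 threshold comes from the TODOs, and the helper names are made up for illustration.

```python
import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency


def drop_correlated(df, num_cols, threshold=0.75):
    """Among pairs of numeric features correlated above the threshold, keep one and drop the other."""
    corr = df[num_cols].corr().abs()
    # Upper triangle only, so each pair is considered once
    upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
    dropped = [col for col in upper.columns if (upper[col] > threshold).any()]
    kept = [col for col in num_cols if col not in dropped]
    return kept, dropped


def chi2_dependent_cats(df, cat_cols, target="TARGET", alpha=0.05):
    """Categorical columns whose chi-squared test rejects independence from the target."""
    dependent = []
    for col in cat_cols:
        table = pd.crosstab(df[col], df[target])
        _, p_value, _, _ = chi2_contingency(table)
        if p_value < alpha:
            dependent.append(col)
    return dependent
```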
/scripts/train.py:
--------------------------------------------------------------------------------
1 | # LightGBM GBDT with KFold
2 | # Parameters from Tilii kernel: https://www.kaggle.com/tilii7/olivier-lightgbm-parameters-by-bayesian-opt/code
3 | import gc
4 | import os
5 | import pickle
6 | import pandas as pd
7 |
8 | from lightgbm import LGBMClassifier
9 | import numpy as np
10 | from sklearn.metrics import roc_auc_score
11 | from sklearn.model_selection import KFold
12 |
13 | from scripts.helper_functions import display_importances
14 |
15 |
16 | def kfold_lightgbm(df, debug=False):
17 | # Divide in training/validation and test data
18 |
19 | train_df = df[df['TARGET'].notnull()]
20 | test_df = df[df['TARGET'].isnull()]
21 | print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))
22 |
23 |
24 | del df
25 | gc.collect()
26 |
27 | folds = KFold(n_splits=10, shuffle=True, random_state=1001)
28 |
29 | # Create arrays and dataframes to store results
30 |
31 | oof_preds = np.zeros(train_df.shape[0]) # predicted valid_y
32 | sub_preds = np.zeros(test_df.shape[0]) # submission preds
33 | feature_importance_df = pd.DataFrame() # feature importance
34 |
35 | fold_auc_best_df = pd.DataFrame(columns=["FOLD", "AUC", "BEST_ITER"]) # holding best iter to save model
36 |
37 | feats = [f for f in train_df.columns if f not in ['TARGET', 'SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_PREV', 'index',
38 | "APP_index", "BURO_index", "PREV_index", "INSTAL_index",
39 | "CC_index", "POS_index"]]
40 |
41 | # X and y were passed to folds.split together and it was asked to split this data: 10 train-validation index pairs were produced.
42 | # enumerate makes it possible to capture each generated index pair together with its fold number.
43 |
44 | for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])):
45 | train_x, train_y = train_df[feats].iloc[train_idx], train_df['TARGET'].iloc[train_idx]
46 | valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx]
47 |
48 | # LightGBM parameters found by Bayesian optimization
49 | clf = LGBMClassifier(
50 | n_jobs=-1,
51 | n_estimators=10000,
52 | learning_rate=0.02,
53 | num_leaves=34,
54 | colsample_bytree=0.9497036,
55 | subsample=0.8715623,
56 | max_depth=8,
57 | reg_alpha=0.041545473,
58 | reg_lambda=0.0735294,
59 | min_split_gain=0.0222415,
60 | min_child_weight=39.3259775,
61 | silent=-1,
62 | verbose=-1, )
63 |
64 | clf.fit(train_x,
65 | train_y,
66 | eval_set=[(train_x, train_y),
67 | (valid_x, valid_y)],
68 | eval_metric='auc',
69 | verbose=200,
70 | early_stopping_rounds=200)
71 |
72 | # predicted valid_y
73 | oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]
74 |
75 | # submission preds: predicts the test set for each fold and averages over all the folds.
76 | sub_preds += clf.predict_proba(test_df[feats], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits
77 |
78 | # fold, auc and best iteration
79 | print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))
80 |
81 |
82 |
83 | # best auc & iteration
84 | fold_auc_best_df = fold_auc_best_df.append({'FOLD': int(n_fold + 1),
85 | 'AUC': roc_auc_score(valid_y, oof_preds[valid_idx]),
86 | "BEST_ITER": clf.best_iteration_}, ignore_index=True)
87 |
88 |
89 |
90 | fold_importance_df = pd.DataFrame()
91 | fold_importance_df["feature"] = feats
92 | fold_importance_df["importance"] = clf.feature_importances_
93 | fold_importance_df["fold"] = n_fold + 1
94 | feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
95 |
96 | del clf, train_x, train_y, valid_x, valid_y
97 | gc.collect()
98 |
99 | # OUTPUTS
100 | print(fold_auc_best_df)
101 | print(feature_importance_df)
102 |
103 | # save the feature importances as a df
104 | feature_importance_df.to_pickle("outputs/features/feature_importance_df.pkl")
105 | fold_auc_best_df.to_pickle("outputs/features/fold_auc_best_df.pkl")
106 |
107 | # Final Model
108 | best_iter_1 = int(fold_auc_best_df.sort_values(by="AUC", ascending=False)[:1]["BEST_ITER"].values)
109 |
110 | # sort by AUC, take the mean of the best-iteration counts of the top 3 folds, no digits after the decimal point.
111 | # best_iter_3 = round(fold_auc_best_df.sort_values(by="AUC", ascending=False)[:3]["BEST_ITER"].mean(), 0)
112 |
113 | y_train = train_df["TARGET"]
114 | x_train = train_df[feats]
115 |
116 | final_model = LGBMClassifier(
117 | n_jobs=-1,
118 | n_estimators=best_iter_1,
119 | learning_rate=0.02,
120 | num_leaves=34,
121 | colsample_bytree=0.9497036,
122 | subsample=0.8715623,
123 | max_depth=8,
124 | reg_alpha=0.041545473,
125 | reg_lambda=0.0735294,
126 | min_split_gain=0.0222415,
127 | min_child_weight=39.3259775,
128 | silent=-1,
129 | verbose=-1).fit(x_train, y_train)
130 |
131 | cur_dir = os.getcwd()
132 | os.chdir('models/reference/')
133 | pickle.dump(final_model, open("lightgbm_final_model.pkl", 'wb')) # model
134 | os.chdir(cur_dir)
135 |
136 | # the valid_y values predicted in each fold are in fact predictions of the train set's y values, each part living in a different fold.
137 | print('Full Train (Validation) AUC score %.6f' % roc_auc_score(train_df['TARGET'], oof_preds))
138 |
139 | # Write submission file and plot feature importance
140 | if not debug:
141 | test_df['TARGET'] = sub_preds
142 | test_df[['SK_ID_CURR', 'TARGET']].to_csv("predictions/reference_submission.csv", index=False)
143 |
144 | display_importances(feature_importance_df)
145 | del x_train, y_train
146 |
147 | return feature_importance_df
148 |
149 |
--------------------------------------------------------------------------------
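The out-of-fold pattern the comments above describe, reduced to a minimal self-contained sketch (synthetic data and logistic regression stand in for the real features and LightGBM):

```python
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold

X, y = make_classification(n_samples=1000, random_state=0)
oof = np.zeros(len(y))  # each row is predicted by the fold that did NOT train on it

for train_idx, valid_idx in KFold(n_splits=5, shuffle=True, random_state=0).split(X):
    clf = LogisticRegression(max_iter=1000).fit(X[train_idx], y[train_idx])
    oof[valid_idx] = clf.predict_proba(X[valid_idx])[:, 1]

print("OOF AUC: %.6f" % roc_auc_score(y, oof))  # honest estimate of train-set performance
```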
/scripts/pre_processing.py:
--------------------------------------------------------------------------------
1 | """bu scriptte ön işleme fonksiyonları yer almaktadır."""
2 | import gc
3 |
4 | import pandas as pd
5 | import numpy as np
6 | from scripts.helper_functions import one_hot_encoder
7 |
8 |
9 | # Preprocess application_train.csv and application_test.csv
10 |
11 | def application_train_test(num_rows=None, nan_as_category=False):
12 | # Read data and merge
13 | df = pd.read_csv('data/application_train.csv', nrows=num_rows)
14 | test_df = pd.read_csv('data/application_test.csv', nrows=num_rows)
15 | print("Train samples: {}, test samples: {}".format(len(df), len(test_df)))
16 | df = df.append(test_df).reset_index()
17 |
18 | # Optional: Remove 4 applications with XNA CODE_GENDER (train set)
19 | df = df[df['CODE_GENDER'] != 'XNA']
20 |
21 | # Categorical features with Binary encode (0 or 1; two categories)
22 | for bin_feature in ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY']:
23 | df[bin_feature], uniques = pd.factorize(df[bin_feature])
24 |
25 | # Categorical features with One-Hot encode
26 | df, cat_cols = one_hot_encoder(df, nan_as_category)
27 |
28 | # NaN values for DAYS_EMPLOYED: 365243 -> nan
29 | df['DAYS_EMPLOYED'].replace(365243, np.nan, inplace=True)
30 |
31 | # Some simple new features (percentages)
32 | df['NEW_DAYS_EMPLOYED_PERC'] = df['DAYS_EMPLOYED'] / df['DAYS_BIRTH']
33 | df['NEW_INCOME_CREDIT_PERC'] = df['AMT_INCOME_TOTAL'] / df['AMT_CREDIT']
34 | df['NEW_INCOME_PER_PERSON'] = df['AMT_INCOME_TOTAL'] / df['CNT_FAM_MEMBERS']
35 | df['NEW_ANNUITY_INCOME_PERC'] = df['AMT_ANNUITY'] / df['AMT_INCOME_TOTAL']
36 | df['NEW_PAYMENT_RATE'] = df['AMT_ANNUITY'] / df['AMT_CREDIT']
37 |
38 | df.columns = pd.Index(["APP_" + col for col in df.columns.tolist()])
39 | df.rename(columns={"APP_SK_ID_CURR": "SK_ID_CURR"}, inplace=True)
40 | df.rename(columns={"APP_TARGET": "TARGET"}, inplace=True)
41 |
42 | del test_df
43 | gc.collect()
44 | return df
45 |
46 |
47 | # Preprocess bureau.csv and bureau_balance.csv
48 | def bureau_and_balance(num_rows=None, nan_as_category=True):
49 | # Preprocessing
50 | bureau = pd.read_csv('data/bureau.csv', nrows=num_rows)
51 | bb = pd.read_csv('data/bureau_balance.csv', nrows=num_rows)
52 | bb, bb_cat = one_hot_encoder(bb, nan_as_category)
53 | bureau, bureau_cat = one_hot_encoder(bureau, nan_as_category)
54 |
55 | # Bureau balance: Perform aggregations and merge with bureau.csv
56 | bb_aggregations = {'MONTHS_BALANCE': ['min', 'max', 'size']}
57 |
58 | for col in bb_cat:
59 | bb_aggregations[col] = ['mean']
60 |
61 | bb_agg = bb.groupby('SK_ID_BUREAU').agg(bb_aggregations)
62 | bb_agg.columns = pd.Index([e[0] + "_" + e[1].upper() for e in bb_agg.columns.tolist()])
63 | bureau = bureau.join(bb_agg, how='left', on='SK_ID_BUREAU')
64 | bureau.drop(['SK_ID_BUREAU'], axis=1, inplace=True)
65 |
66 | del bb, bb_agg
67 | gc.collect()
68 |
69 | # Bureau and bureau_balance numeric features
70 | num_aggregations = {
71 | 'DAYS_CREDIT': ['min', 'max', 'mean', 'var'],
72 | 'DAYS_CREDIT_ENDDATE': ['min', 'max', 'mean'],
73 | 'DAYS_CREDIT_UPDATE': ['mean'],
74 | 'CREDIT_DAY_OVERDUE': ['max', 'mean'],
75 | 'AMT_CREDIT_MAX_OVERDUE': ['mean'],
76 | 'AMT_CREDIT_SUM': ['max', 'mean', 'sum'],
77 | 'AMT_CREDIT_SUM_DEBT': ['max', 'mean', 'sum'],
78 | 'AMT_CREDIT_SUM_OVERDUE': ['mean'],
79 | 'AMT_CREDIT_SUM_LIMIT': ['mean', 'sum'],
80 | 'AMT_ANNUITY': ['max', 'mean'],
81 | 'CNT_CREDIT_PROLONG': ['sum'],
82 | 'MONTHS_BALANCE_MIN': ['min'],
83 | 'MONTHS_BALANCE_MAX': ['max'],
84 | 'MONTHS_BALANCE_SIZE': ['mean', 'sum']
85 | }
86 |
87 | # Bureau and bureau_balance categorical features
88 | cat_aggregations = {}
89 | for cat in bureau_cat:
90 | cat_aggregations[cat] = ['mean']
91 |
92 | for cat in bb_cat:
93 | cat_aggregations[cat + "_MEAN"] = ['mean']
94 |
95 | bureau_agg = bureau.groupby('SK_ID_CURR').agg({**num_aggregations, **cat_aggregations})
96 | bureau_agg.columns = pd.Index(['BURO_' + e[0] + "_" + e[1].upper() for e in bureau_agg.columns.tolist()])
97 |
98 | # Bureau: Active credits - using only numerical aggregations
99 | active = bureau[bureau['CREDIT_ACTIVE_Active'] == 1]
100 | active_agg = active.groupby('SK_ID_CURR').agg(num_aggregations)
101 | active_agg.columns = pd.Index(['BURO_NEW_ACTIVE_' + e[0] + "_" + e[1].upper() for e in active_agg.columns.tolist()])
102 | bureau_agg = bureau_agg.join(active_agg, how='left', on='SK_ID_CURR')
103 | del active, active_agg
104 | gc.collect()
105 |
106 | # Bureau: Closed credits - using only numerical aggregations
107 | closed = bureau[bureau['CREDIT_ACTIVE_Closed'] == 1]
108 | closed_agg = closed.groupby('SK_ID_CURR').agg(num_aggregations)
109 | closed_agg.columns = pd.Index(['BURO_NEW_CLOSED_' + e[0] + "_" + e[1].upper() for e in closed_agg.columns.tolist()])
110 | bureau_agg = bureau_agg.join(closed_agg, how='left', on='SK_ID_CURR')
111 | del closed, closed_agg, bureau
112 | gc.collect()
113 | return bureau_agg
114 |
115 |
116 | # Preprocess previous_applications.csv
117 | def previous_applications(num_rows=None, nan_as_category=True):
118 | prev = pd.read_csv('data/previous_application.csv', nrows=num_rows)
119 | prev, cat_cols = one_hot_encoder(prev, nan_as_category)
120 |
121 | # Days 365.243 values -> nan
122 | prev['DAYS_FIRST_DRAWING'].replace(365243, np.nan, inplace=True)
123 | prev['DAYS_FIRST_DUE'].replace(365243, np.nan, inplace=True)
124 | prev['DAYS_LAST_DUE_1ST_VERSION'].replace(365243, np.nan, inplace=True)
125 | prev['DAYS_LAST_DUE'].replace(365243, np.nan, inplace=True)
126 | prev['DAYS_TERMINATION'].replace(365243, np.nan, inplace=True)
127 |
128 | # Add feature: value ask / value received percentage
129 | prev['NEW_APP_CREDIT_PERC'] = prev['AMT_APPLICATION'] / prev['AMT_CREDIT']
130 |
131 | # Previous applications numeric features
132 | num_aggregations = {
133 | 'AMT_ANNUITY': ['min', 'max', 'mean'],
134 | 'AMT_APPLICATION': ['min', 'max', 'mean'],
135 | 'AMT_CREDIT': ['min', 'max', 'mean'],
136 | 'NEW_APP_CREDIT_PERC': ['min', 'max', 'mean', 'var'],
137 | 'AMT_DOWN_PAYMENT': ['min', 'max', 'mean'],
138 | 'AMT_GOODS_PRICE': ['min', 'max', 'mean'],
139 | 'HOUR_APPR_PROCESS_START': ['min', 'max', 'mean'],
140 | 'RATE_DOWN_PAYMENT': ['min', 'max', 'mean'],
141 | 'DAYS_DECISION': ['min', 'max', 'mean'],
142 | 'CNT_PAYMENT': ['mean', 'sum'],
143 | }
144 |
145 | # Previous applications categorical features
146 | cat_aggregations = {}
147 | for cat in cat_cols:
148 | cat_aggregations[cat] = ['mean']
149 |
150 | prev_agg = prev.groupby('SK_ID_CURR').agg({**num_aggregations, **cat_aggregations})
151 | prev_agg.columns = pd.Index(['PREV_' + e[0] + "_" + e[1].upper() for e in prev_agg.columns.tolist()])
152 |
153 | # Previous Applications: Approved Applications - only numerical features
154 | approved = prev[prev['NAME_CONTRACT_STATUS_Approved'] == 1]
155 | approved_agg = approved.groupby('SK_ID_CURR').agg(num_aggregations)
156 | approved_agg.columns = pd.Index(
157 | ['PREV_NEW_APPROVED_' + e[0] + "_" + e[1].upper() for e in approved_agg.columns.tolist()])
158 | prev_agg = prev_agg.join(approved_agg, how='left', on='SK_ID_CURR')
159 |
160 | # Previous Applications: Refused Applications - only numerical features
161 | refused = prev[prev['NAME_CONTRACT_STATUS_Refused'] == 1]
162 | refused_agg = refused.groupby('SK_ID_CURR').agg(num_aggregations)
163 | refused_agg.columns = pd.Index(
164 | ['PREV_NEW_REFUSED_' + e[0] + "_" + e[1].upper() for e in refused_agg.columns.tolist()])
165 | prev_agg = prev_agg.join(refused_agg, how='left', on='SK_ID_CURR')
166 | del refused, refused_agg, approved, approved_agg, prev
167 | gc.collect()
168 | return prev_agg
169 |
170 |
171 | # Preprocess POS_CASH_balance.csv
172 | def pos_cash(num_rows=None, nan_as_category=True):
173 | pos = pd.read_csv('data/POS_CASH_balance.csv', nrows=num_rows)
174 | pos, cat_cols = one_hot_encoder(pos, nan_as_category)
175 |
176 | # Features
177 | aggregations = {
178 | 'MONTHS_BALANCE': ['max', 'mean', 'size'],
179 | 'SK_DPD': ['max', 'mean'],
180 | 'SK_DPD_DEF': ['max', 'mean']
181 | }
182 |
183 | for cat in cat_cols:
184 | aggregations[cat] = ['mean']
185 |
186 | pos_agg = pos.groupby('SK_ID_CURR').agg(aggregations)
187 | pos_agg.columns = pd.Index(['POS_' + e[0] + "_" + e[1].upper() for e in pos_agg.columns.tolist()])
188 |
189 | # Count pos cash accounts
190 | pos_agg['POS_NEW_COUNT'] = pos.groupby('SK_ID_CURR').size()
191 | del pos
192 | gc.collect()
193 | return pos_agg
194 |
195 |
196 | # Preprocess installments_payments.csv
197 | def installments_payments(num_rows=None, nan_as_category=True):
198 | ins = pd.read_csv('data/installments_payments.csv', nrows=num_rows)
199 | ins, cat_cols = one_hot_encoder(ins, nan_as_category)
200 |
201 | # Percentage and difference paid in each installment (amount paid and installment value)
202 | ins['NEW_PAYMENT_PERC'] = ins['AMT_PAYMENT'] / ins['AMT_INSTALMENT']
203 | ins['NEW_PAYMENT_DIFF'] = ins['AMT_INSTALMENT'] - ins['AMT_PAYMENT']
204 | # Days past due and days before due (no negative values)
205 | ins['NEW_DPD'] = ins['DAYS_ENTRY_PAYMENT'] - ins['DAYS_INSTALMENT']
206 | ins['NEW_DPD'] = ins['NEW_DPD'].apply(lambda x: x if x > 0 else 0)
207 |
208 | ins['NEW_DBD'] = ins['DAYS_INSTALMENT'] - ins['DAYS_ENTRY_PAYMENT']
209 | ins['NEW_DBD'] = ins['NEW_DBD'].apply(lambda x: x if x > 0 else 0)
210 |
211 | # Features: Perform aggregations
212 | aggregations = {
213 | 'NUM_INSTALMENT_VERSION': ['nunique'],
214 | 'NEW_DPD': ['max', 'mean', 'sum'],
215 | 'NEW_DBD': ['max', 'mean', 'sum'],
216 | 'NEW_PAYMENT_PERC': ['max', 'mean', 'sum', 'var'],
217 | 'NEW_PAYMENT_DIFF': ['max', 'mean', 'sum', 'var'],
218 | 'AMT_INSTALMENT': ['max', 'mean', 'sum'],
219 | 'AMT_PAYMENT': ['min', 'max', 'mean', 'sum'],
220 | 'DAYS_ENTRY_PAYMENT': ['max', 'mean', 'sum']
221 | }
222 |
223 | for cat in cat_cols:
224 | aggregations[cat] = ['mean']
225 |
226 | ins_agg = ins.groupby('SK_ID_CURR').agg(aggregations)
227 | ins_agg.columns = pd.Index(['INSTAL_' + e[0] + "_" + e[1].upper() for e in ins_agg.columns.tolist()])
228 |
229 | # Count installments accounts
230 | ins_agg['INSTAL_NEW_COUNT'] = ins.groupby('SK_ID_CURR').size()
231 | del ins
232 | gc.collect()
233 | return ins_agg
234 |
235 |
236 | # Preprocess credit_card_balance.csv
237 | def credit_card_balance(num_rows=None, nan_as_category=True):
238 | cc = pd.read_csv('data/credit_card_balance.csv', nrows=num_rows)
239 | cc, cat_cols = one_hot_encoder(cc, nan_as_category)
240 |
241 | # General aggregations
242 | cc.drop(['SK_ID_PREV'], axis=1, inplace=True)
243 | cc_agg = cc.groupby('SK_ID_CURR').agg(['min', 'max', 'mean', 'sum', 'var'])
244 | cc_agg.columns = pd.Index(['CC_' + e[0] + "_" + e[1].upper() for e in cc_agg.columns.tolist()])
245 |
246 | # Count credit card lines
247 | cc_agg['CC_NEW_COUNT'] = cc.groupby('SK_ID_CURR').size()
248 | del cc
249 | gc.collect()
250 | return cc_agg
251 |
--------------------------------------------------------------------------------
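The groupby/agg column-flattening idiom used by every function in this file, shown on toy data (illustrative values only):

```python
import pandas as pd

toy = pd.DataFrame({"SK_ID_CURR": [1, 1, 2],
                    "AMT_CREDIT": [100.0, 300.0, 200.0]})
agg = toy.groupby("SK_ID_CURR").agg({"AMT_CREDIT": ["max", "mean"]})

# Flatten the resulting MultiIndex columns the same way the functions above do
agg.columns = pd.Index(["BURO_" + e[0] + "_" + e[1].upper() for e in agg.columns.tolist()])
print(agg.columns.tolist())  # ['BURO_AMT_CREDIT_MAX', 'BURO_AMT_CREDIT_MEAN']
```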
/models/dsmlbc2/ATILLA_MUHAMMET.py:
--------------------------------------------------------------------------------
1 | import gc
2 | import re
3 | import time
4 | import warnings
5 | from contextlib import contextmanager
6 |
7 | import matplotlib.pyplot as plt
8 | import numpy as np
9 | import pandas as pd
10 | import pymysql
11 | import seaborn as sns
12 | from feature_engine import categorical_encoders as ce
13 | from lightgbm import LGBMClassifier
14 | from sklearn.metrics import roc_auc_score
15 | from sklearn.model_selection import KFold
16 | from sklearn.preprocessing import LabelEncoder
17 |
18 | warnings.filterwarnings("ignore", category=DeprecationWarning)
19 | warnings.filterwarnings("ignore", category=FutureWarning)
20 | warnings.filterwarnings("ignore", category=UserWarning)
21 | warnings.simplefilter(action='ignore')
22 |
23 |
24 | @contextmanager
25 | def timer(title):
26 | t0 = time.time()
27 | yield
28 | print("{} - done in {:.0f}s".format(title, time.time() - t0))
29 |
30 |
31 | # Display plot feature importance
32 | def display_importances(feature_importance_df_):
33 | cols = feature_importance_df_[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance",
34 | ascending=False)[
35 | :100].index
36 | best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]
37 | print(best_features)
38 | plt.figure(figsize=(15, 20))
39 | sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False))
40 | plt.title('LightGBM Features (avg over folds)')
41 | plt.tight_layout()
42 | plt.savefig('lgbm_importances01.png')
43 |
44 |
45 | def load_dataset(file_path, index=0):
46 | df = pd.read_csv(file_path, index_col=index)
47 | return df
48 |
49 |
50 | def get_categoric_columns(df):
51 | cols = df.select_dtypes(include=['object', 'category']).columns
52 | return cols
53 |
54 |
55 | def apply_label_encoding(l_df, columns):
56 | lbe = LabelEncoder()
57 | for col in columns:
58 | l_df[col] = lbe.fit_transform(l_df[col])
59 | return l_df
60 |
61 |
62 | def apply_one_hot_encoding(l_df):
63 | original_columns = list(l_df) # col names as string in a list
64 | categorical_columns = get_categoric_columns(l_df) # categorical col names
65 | l_df = pd.get_dummies(l_df, columns=categorical_columns, drop_first=True) # creating dummies
66 | new_columns = [c for c in l_df.columns if c not in original_columns] # new col names
67 | return l_df, new_columns
68 |
69 |
70 | def rare_encoding(data, variables, rare_threshold=0.05, n_rare_categories=4):
71 | encoder = ce.RareLabelCategoricalEncoder(tol=rare_threshold, n_categories=n_rare_categories, variables=variables,
72 | replace_with='Rare')
73 | # fit the encoder
74 | encoder.fit(data)
75 | # transform the data
76 | data = encoder.transform(data)
77 | return data
78 |
79 |
80 | # One-hot encoding for categorical columns with get_dummies
81 | def one_hot_encoder(df, nan_as_category=True):
82 | original_columns = list(df.columns)
83 | categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
84 | df = pd.get_dummies(df, columns=categorical_columns, dummy_na=nan_as_category)
85 | new_columns = [c for c in df.columns if c not in original_columns]
86 | return df, new_columns
87 |
88 |
89 |
90 |
91 |
92 | def reduce_mem_usage(df):
93 | """ iterate through all the columns of a dataframe and modify the data type
94 | to reduce memory usage.
95 | """
96 | start_mem = df.memory_usage().sum() / 1024 ** 2
97 | print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
98 |
99 | for col in df.columns:
100 | col_type = df[col].dtype
101 |
102 | if col_type != object:
103 | c_min = df[col].min()
104 | c_max = df[col].max()
105 | if str(col_type)[:3] == 'int':
106 | if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
107 | df[col] = df[col].astype(np.int8)
108 | elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
109 | df[col] = df[col].astype(np.int16)
110 | elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
111 | df[col] = df[col].astype(np.int32)
112 | elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
113 | df[col] = df[col].astype(np.int64)
114 | else:
115 | if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
116 | df[col] = df[col].astype(np.float16)
117 | elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
118 | df[col] = df[col].astype(np.float32)
119 | else:
120 | df[col] = df[col].astype(np.float64)
121 | end_mem = df.memory_usage().sum() / 1024 ** 2
122 | print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
123 | print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
124 | # code taken from https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
125 | return df
126 |
127 |
128 | def feature_early_shutdown(row):
129 | early_shutdown = 0
130 | if row.CREDIT_ACTIVE == "Active" and row.DAYS_CREDIT_ENDDATE < 0:
131 | early_shutdown = 1
132 | return early_shutdown
133 |
134 |
135 | def buro_add_feature(df_breau):
136 | df_bureau_new = pd.DataFrame()
137 | # number of credit applications
138 | df_bureau_new["BURO_CREDIT_APPLICATION_COUNT"] = df_breau.groupby("SK_ID_CURR").count()["SK_ID_BUREAU"]
139 |
140 | # number of active credits
141 | df_bureau_new["BURO_ACTIVE_CREDIT_APPLICATION_COUNT"] = \
142 | df_breau[df_breau["CREDIT_ACTIVE"] == "Active"].groupby("SK_ID_CURR").count()["CREDIT_ACTIVE"]
143 | df_bureau_new["BURO_ACTIVE_CREDIT_APPLICATION_COUNT"].fillna(0, inplace=True)
144 |
145 | # number of closed (passive) credits
146 | df_bureau_new["BURO_CLOSED_CREDIT_APPLICATION_COUNT"] = \
147 | df_breau[df_breau["CREDIT_ACTIVE"] == "Closed"].groupby("SK_ID_CURR").count()["CREDIT_ACTIVE"]
148 | df_bureau_new["BURO_CLOSED_CREDIT_APPLICATION_COUNT"].fillna(0, inplace=True)
149 |
150 | # early credit closure
151 | df_bureau_new["BURO_EARLY_SHUTDOWN_NEW"] = df_breau.apply(lambda x: feature_early_shutdown(x), axis=1)
152 |
153 | # number of delayed payments
154 | df_bureau_new["BURO_NUMBER_OF_DELAYED_PAYMENTS"] = \
155 | df_breau[df_breau["AMT_CREDIT_MAX_OVERDUE"] != 0].groupby("SK_ID_CURR")["AMT_CREDIT_MAX_OVERDUE"].count()
156 | df_bureau_new["BURO_NUMBER_OF_DELAYED_PAYMENTS"].fillna(0, inplace=True)
157 |
158 | # max time passed since the last closed application
159 | df_bureau_new["BURO_MAX_TIME_PASSED_CREDIT_APPLICATION"] = \
160 | df_breau[df_breau["CREDIT_ACTIVE"] == "Closed"].groupby("SK_ID_CURR")["DAYS_ENDDATE_FACT"].max()
161 | df_bureau_new["BURO_MAX_TIME_PASSED_CREDIT_APPLICATION"].fillna(0, inplace=True)
162 |
163 | # max delayed payment amount
164 | df_bureau_new["BURO_MAX_DELAYED_PAYMENTS"] = df_breau.groupby("SK_ID_CURR")["AMT_CREDIT_MAX_OVERDUE"].max()
165 | df_bureau_new["BURO_MAX_DELAYED_PAYMENTS"].fillna(0, inplace=True)
166 |
167 | # top list of customers with delayed payments - highest 100
168 | # rows with a delay: (80302, 12)
169 | df_bureau_new["BURO_DELAYED_PAYMENTS_TOP_100_NEW"] = \
170 | df_bureau_new.sort_values("BURO_MAX_DELAYED_PAYMENTS", ascending=False)["BURO_MAX_DELAYED_PAYMENTS"].rank()
171 | df_bureau_new["BURO_DELAYED_PAYMENTS_TOP_100_NEW"].fillna(0, inplace=True)
172 |
173 | # has a credit extension been made?
174 | df_bureau_new["BURO_IS_CREDIT_EXTENSION_NEW"] = df_breau.groupby("SK_ID_CURR")["CNT_CREDIT_PROLONG"].count().apply(
175 | lambda x: 1 if x > 0 else 0)
176 |
177 | # max credit extension made
178 | df_bureau_new["BURO_CREDIT_EXTENSION_MAX"] = df_breau.groupby("SK_ID_CURR")["CNT_CREDIT_PROLONG"].max()
179 | df_bureau_new["BURO_CREDIT_EXTENSION_MAX"].fillna(0, inplace=True)
180 |
181 | # unsuccessful credit payment - detect credits closed while still carrying debt
182 | df_bureau_new["BURO_IS_UNSUCCESSFUL_CREDIT_PAYMENT_NEW"] = \
183 | df_breau[(df_breau["CREDIT_ACTIVE"] == "Closed") & (df_breau["AMT_CREDIT_SUM_DEBT"] > 0)].groupby(
184 | "SK_ID_CURR").all()["AMT_CREDIT_SUM_DEBT"].apply(lambda x: 1 if x == True else 0)
185 | df_bureau_new["BURO_IS_UNSUCCESSFUL_CREDIT_PAYMENT_NEW"].fillna(0, inplace=True)
186 |
187 | return df_bureau_new
188 |
189 |
190 | def load_data_with_application_train(num_rows=None):
191 | df_app_train = application_train()
192 | print("application_train df shape:", df_app_train.shape)
193 | bureau, bureau_add_features = bureau_and_balance()
194 | print("Bureau df shape:", bureau.shape)
195 | bureau = bureau.fillna(0)
196 | return df_app_train, bureau, bureau_add_features
197 |
198 |
199 | def load_data_only_bureau_and_bureau_balance(num_rows=None):
200 | bureau, bureau_add_features = bureau_and_balance()
201 | print("Bureau df shape:", bureau.shape)
202 | bureau = bureau.fillna(0)
203 | return bureau, bureau_add_features
204 |
205 |
206 | def app_train_bureau_merge(num_rows=None):
207 | df_app_train, bureau, bureau_add_features = load_data_with_application_train(num_rows)
208 | # df_merge = pd.merge(df_app_train, bureau, on=['SK_ID_CURR'],how='inner')
209 | df_merge = bureau
210 | # print("app_train, bureau merge shape:", df_merge.shape)
211 | print("bureau merge shape:", df_merge.shape)
212 | df_final = pd.merge(df_merge, bureau_add_features, on=['SK_ID_CURR'], how='inner')
213 | print("Bureau add features df shape:", bureau_add_features.shape)
214 | del df_app_train, bureau, bureau_add_features, df_merge
215 | gc.collect()
216 | return df_final
217 |
218 |
219 | def bureau_and_bureau_balance_features(num_rows=None):
220 | bureau, bureau_add_features = load_data_only_bureau_and_bureau_balance(num_rows)
221 | df_final = pd.merge(bureau, bureau_add_features, on=['SK_ID_CURR'], how='inner')
222 | print("Bureau add features df shape:", bureau_add_features.shape)
223 | del bureau, bureau_add_features
224 | gc.collect()
225 | return df_final
226 |
227 |
228 | def application_train():
229 | conn = pymysql.connect(host='35.228.28.142', port=int(63306), user='group2', passwd='123654', db='home_credit')
230 | df_app_train = pd.read_sql_query("SELECT * FROM application_train", conn)
231 | df_app_train = df_app_train[["TARGET", "SK_ID_CURR"]]
232 | # df_app_train = df_app_train.dropna()
233 | df_app_train.reset_index(drop=True, inplace=True)
234 | gc.collect()
235 | return df_app_train
236 |
237 |
238 | # Preprocess bureau.csv and bureau_balance.csv
239 | def bureau_and_balance(nan_as_category=True):
240 | conn = pymysql.connect(host='35.228.28.142', port=int(63306), user='group2', passwd='123654', db='home_credit')
241 | bureau = pd.read_sql_query("SELECT * FROM bureau", conn)
242 | bb = pd.read_sql_query("SELECT * FROM bureau_balance", conn)
243 | bureau["AMT_CREDIT_SUM_DEBT"] = bureau["AMT_CREDIT_SUM_DEBT"].fillna(0)
244 | bureau.fillna(0, inplace=True)
245 | bb.fillna(0, inplace=True)
246 | # bureau = bureau.dropna()
247 | bureau.reset_index(drop=True, inplace=True)
248 | # bb = bb.dropna()
249 | bb.reset_index(drop=True, inplace=True)
250 |
251 | # add_features
252 | bureau_add_features = buro_add_feature(df_breau=bureau)
253 |
254 | # sum agg b_balance
255 | # creating a new variable from the status sums
256 | bb_dummy = pd.get_dummies(bb, dummy_na=True)
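256 |     # dummy_na=True above also creates a STATUS_nan indicator, so missing statuses are counted as well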
257 | agg_list = {"MONTHS_BALANCE": "count",
258 | "STATUS_0": ["sum"],
259 | "STATUS_1": ["sum"],
260 | "STATUS_2": ["sum"],
261 | "STATUS_3": ["sum"],
262 | "STATUS_4": ["sum"],
263 | "STATUS_5": ["sum"],
264 | "STATUS_C": ["sum"],
265 | "STATUS_X": ["sum"]}
266 | bb_sum_agg = bb_dummy.groupby("SK_ID_BUREAU").agg(agg_list)
267 |     # Rename the aggregated columns
268 | bb_sum_agg.columns = pd.Index(["BURO_" + col[0] + "_" + col[1].upper() for col in bb_sum_agg.columns.tolist()])
269 |     # Weighted delinquency score: counts of worse statuses are raised to increasing powers so they dominate
270 |     bb_sum_agg['BURO_NEW_STATUS_SCORE'] = bb_sum_agg['BURO_STATUS_1_SUM'] + bb_sum_agg['BURO_STATUS_2_SUM'] ** 2 + \
271 |                                           bb_sum_agg['BURO_STATUS_3_SUM'] ** 3 + bb_sum_agg['BURO_STATUS_4_SUM'] ** 4 + \
272 |                                           bb_sum_agg['BURO_STATUS_5_SUM'] ** 5
273 | bb_sum_agg.drop(
274 | ['BURO_STATUS_1_SUM', 'BURO_STATUS_2_SUM', 'BURO_STATUS_3_SUM', 'BURO_STATUS_4_SUM', 'BURO_STATUS_5_SUM'],
275 | axis=1, inplace=True)
276 |
277 |     # Collapse CREDIT_TYPE down to 3 classes
278 | bureau['CREDIT_TYPE'] = bureau['CREDIT_TYPE'].replace(['Car loan',
279 | 'Mortgage',
280 | 'Microloan',
281 | 'Loan for business development',
282 | 'Another type of loan',
283 | 'Unknown type of loan',
284 | 'Loan for working capital replenishment',
285 | "Loan for purchase of shares (margin lending)",
286 | 'Cash loan (non-earmarked)',
287 | 'Real estate loan',
288 | "Loan for the purchase of equipment",
289 | "Interbank credit",
290 | "Mobile operator loan"], 'Rare')
291 |
292 |     # Collapse CREDIT_ACTIVE down to 2 classes (would folding 'Sold' into 'Closed' be more appropriate?)
293 | bureau['CREDIT_ACTIVE'] = bureau['CREDIT_ACTIVE'].replace(['Bad debt', 'Sold'], 'Active')
294 |
295 | # one hot encoding start
296 | bb, bb_cat = one_hot_encoder(bb, nan_as_category)
297 | bureau, bureau_cat = one_hot_encoder(bureau, nan_as_category)
298 | # one hot encoding end
299 |
300 | # Bureau balance: Perform aggregations and merge with bureau.csv
301 | bb_aggregations = {'MONTHS_BALANCE': ['min', 'max', 'size'],
302 | "STATUS_0": ["mean"],
303 | "STATUS_C": ["mean"],
304 | "STATUS_X": ["mean"]}
305 | for col in bb_cat:
306 | bb_aggregations[col] = ['mean']
307 |
308 | bb_agg = bb.groupby('SK_ID_BUREAU').agg(bb_aggregations)
309 | bb_agg.columns = pd.Index([e[0] + "_" + e[1].upper() for e in bb_agg.columns.tolist()])
310 |
311 |     # Attach the bureau_balance sum features
312 | bb_agg["BURO_MONTHS_BALANCE_COUNT"] = bb_sum_agg["BURO_MONTHS_BALANCE_COUNT"]
313 | bb_agg["BURO_STATUS_0_SUM"] = bb_sum_agg["BURO_STATUS_0_SUM"]
314 | bb_agg["BURO_STATUS_C_SUM"] = bb_sum_agg["BURO_STATUS_C_SUM"]
315 | bb_agg["BURO_STATUS_X_SUM"] = bb_sum_agg["BURO_STATUS_X_SUM"]
316 | bb_agg["BURO_NEW_STATUS_SCORE"] = bb_sum_agg["BURO_NEW_STATUS_SCORE"]
317 |
318 | bureau = bureau.join(bb_agg, how='left', on='SK_ID_BUREAU')
319 |
320 | bureau["BURO_MONTHS_BALANCE_COUNT"].fillna(0, inplace=True)
321 | bureau["BURO_STATUS_0_SUM"].fillna(0, inplace=True)
322 | bureau["BURO_STATUS_C_SUM"].fillna(0, inplace=True)
323 | bureau["BURO_STATUS_X_SUM"].fillna(0, inplace=True)
324 | bureau["BURO_NEW_STATUS_SCORE"].fillna(0, inplace=True)
325 |
326 |     ## Final additional features
327 |     # New feature: approximate term of the credit in months
328 | bureau["BURO_NEW_MONTHS_CREDIT"] = round((bureau.DAYS_CREDIT_ENDDATE - bureau.DAYS_CREDIT) / 30)
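328 |     # (both DAYS_* columns are day offsets relative to the application date, so their difference is the term in days)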
329 |
330 | bureau.drop(columns='SK_ID_BUREAU', inplace=True)
331 |
332 | del bb, bb_agg
333 | gc.collect()
334 |
335 | # Bureau and bureau_balance numeric features
336 | num_aggregations = {
337 | 'DAYS_CREDIT': ['min', 'max', 'mean', 'var'],
338 | 'CREDIT_DAY_OVERDUE': ['max', 'mean'],
339 | 'DAYS_CREDIT_ENDDATE': ['min', 'max', 'mean'],
340 | 'AMT_CREDIT_MAX_OVERDUE': ['mean'],
341 | 'CNT_CREDIT_PROLONG': ['sum'],
342 | 'AMT_CREDIT_SUM': ['max', 'mean', 'sum', 'std'],
343 | 'AMT_CREDIT_SUM_DEBT': ['max', 'mean', 'sum', 'std', 'median'],
344 | 'AMT_CREDIT_SUM_OVERDUE': ['mean'],
345 | 'AMT_CREDIT_SUM_LIMIT': ['mean', 'sum'],
346 | 'DAYS_CREDIT_UPDATE': ['min', 'max', 'mean'],
347 | 'AMT_ANNUITY': ['max', 'mean'],
348 | 'MONTHS_BALANCE_MIN': ['min'],
349 | 'MONTHS_BALANCE_MAX': ['max'],
350 | 'MONTHS_BALANCE_SIZE': ['mean', 'sum']
351 | }
352 | # Bureau and bureau_balance categorical features
353 | cat_aggregations = {}
354 | for cat in bureau_cat: cat_aggregations[cat] = ['mean']
355 | for cat in bb_cat: cat_aggregations[cat + "_MEAN"] = ['mean']
356 |
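356 |     # merging the two dicts lets one groupby pass compute the numeric and categorical aggregations together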
357 | bureau_agg = bureau.groupby('SK_ID_CURR').agg({**num_aggregations, **cat_aggregations})
358 | bureau_agg.columns = pd.Index(['BURO_' + e[0] + "_" + e[1].upper() for e in bureau_agg.columns.tolist()])
359 | # Bureau: Active credits - using only numerical aggregations
360 | active = bureau[bureau['CREDIT_ACTIVE_Active'] == 1]
361 | active_agg = active.groupby('SK_ID_CURR').agg(num_aggregations)
362 | active_agg.columns = pd.Index(['BURO_ACT_' + e[0] + "_" + e[1].upper() for e in active_agg.columns.tolist()])
363 | bureau_agg = bureau_agg.join(active_agg, how='left', on='SK_ID_CURR')
364 | del active, active_agg
365 | gc.collect()
366 | # Bureau: Closed credits - using only numerical aggregations
367 | closed = bureau[bureau['CREDIT_ACTIVE_Closed'] == 1]
368 | closed_agg = closed.groupby('SK_ID_CURR').agg(num_aggregations)
369 | closed_agg.columns = pd.Index(['BURO_CLS_' + e[0] + "_" + e[1].upper() for e in closed_agg.columns.tolist()])
370 | bureau_agg = bureau_agg.join(closed_agg, how='left', on='SK_ID_CURR')
371 | del closed, closed_agg, bureau
372 | gc.collect()
373 | return bureau_agg, bureau_add_features
374 |
375 |
376 | def application_train_g():
377 | conn = pymysql.connect(host='35.228.28.142', port=int(63306), user='group2', passwd='123654', db='home_credit')
378 | df = pd.read_sql_query("SELECT * FROM application_train", conn)
379 | test_df = pd.read_sql_query("SELECT * FROM application_test", conn)
380 |
381 | df = reduce_mem_usage(df)
382 | test_df = reduce_mem_usage(test_df)
383 |
384 |     df = pd.concat([df, test_df]).reset_index()  # DataFrame.append is deprecated in recent pandas
385 |
386 | pd.set_option('display.max_columns', 500)
387 | pd.set_option('display.max_rows', 500)
388 |
389 | le = LabelEncoder()
390 |
391 | df["NAME_EDUCATION_TYPE"] = le.fit_transform(df["NAME_EDUCATION_TYPE"])
392 | df.loc[(df["NAME_EDUCATION_TYPE"] == 1), "NAME_EDUCATION_TYPE"] = 0
393 |
394 | df.loc[(df["CNT_FAM_MEMBERS"] > 3), "CNT_FAM_MEMBERS"] = 4
395 |
396 | df = df[df['CODE_GENDER'] != 'XNA']
397 |
398 | lbe = LabelEncoder()
399 |
400 | for col in ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY']:
401 | df[col] = lbe.fit_transform(df[col])
402 |
403 | # df = pd.get_dummies(df, dummy_na = True)
404 |
405 | nom_list = [
406 | 'EMERGENCYSTATE_MODE',
407 | 'FONDKAPREMONT_MODE',
408 | 'HOUSETYPE_MODE',
409 | 'NAME_CONTRACT_TYPE',
410 | 'NAME_FAMILY_STATUS',
411 | 'NAME_HOUSING_TYPE',
412 | 'NAME_INCOME_TYPE',
413 | 'NAME_TYPE_SUITE',
414 | 'OCCUPATION_TYPE',
415 | 'ORGANIZATION_TYPE',
416 | 'WALLSMATERIAL_MODE',
417 | 'WEEKDAY_APPR_PROCESS_START']
418 |
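418 |     # rare_encoding (a project helper defined elsewhere) presumably pools infrequent categories before one-hot encoding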
419 | df = rare_encoding(df, nom_list)
420 | df = pd.get_dummies(df, columns=nom_list, drop_first=True)
421 |
422 | # new_features
423 | # 1
424 | df["APP_NEW_GOODSPRICE/CREDIT"] = df["AMT_GOODS_PRICE"] / df["AMT_CREDIT"]
425 | # 2
426 | df["APP_NEW_ANNUITY/CREDIT"] = (df["AMT_ANNUITY"] / df["AMT_CREDIT"])
427 | # 3
428 | df["APP_NEW_INCOME/ANNUITY"] = df["AMT_INCOME_TOTAL"] / df["AMT_ANNUITY"]
429 | # 4
430 |     # flag applicants whose last phone change happened on the application day (DAYS_LAST_PHONE_CHANGE == 0)
431 |     df["APP_NEW_DAYS_LAST_PHONE_CHANGE"] = (df["DAYS_LAST_PHONE_CHANGE"] == 0).astype(int)
433 | # 5
434 |     df["DAYS_BIRTH"] = df["DAYS_BIRTH"] / 365  # age in years (negative, since DAYS_BIRTH counts backwards)
435 |     # flag applicants aged 41 or younger; DAYS_BIRTH is negative here, so compare on the negated value
436 |     df["APP_NEW_DAYS_BIRTH"] = (-df["DAYS_BIRTH"] <= 41).astype(int)
438 | # 6
439 | df["APP_NEW_CREDIT/INCOME"] = df["AMT_CREDIT"] / df["AMT_INCOME_TOTAL"]
440 | # 7
441 | df["APP_NEW_WORK/NOTWORK"] = df["DAYS_EMPLOYED"]
442 | df.loc[(df["APP_NEW_WORK/NOTWORK"] == 0), "APP_NEW_WORK/NOTWORK"] = 0 # ÇALIŞMAYANLAR
443 | df.loc[(df["APP_NEW_WORK/NOTWORK"] != 0), "APP_NEW_WORK/NOTWORK"] = 1 # ÇALIŞANLAR
444 | # 8
445 | df["APP_NEW_INCOME/CREDIT"] = df["AMT_INCOME_TOTAL"] / df["AMT_CREDIT"]
446 | # 9
447 |     # Recency of credit bureau enquiries (0 = none, 1 = recent: hour/day/week/month, 2 = older: quarter/year)
448 | df["APP_NEW_REQ"] = df["AMT_REQ_CREDIT_BUREAU_WEEK"]
449 |     # recent and mid-term enquiries
450 | df.loc[(df["AMT_REQ_CREDIT_BUREAU_HOUR"] > 0), "APP_NEW_REQ"] = 1
451 | df.loc[(df["AMT_REQ_CREDIT_BUREAU_DAY"] > 0), "APP_NEW_REQ"] = 1
452 |
453 | df.loc[(df["AMT_REQ_CREDIT_BUREAU_HOUR"] == 0) & (df["AMT_REQ_CREDIT_BUREAU_DAY"] == 0) & (
454 | df["AMT_REQ_CREDIT_BUREAU_WEEK"] > 0), "APP_NEW_REQ"] = 1
455 | df.loc[(df["AMT_REQ_CREDIT_BUREAU_HOUR"] == 0) & (df["AMT_REQ_CREDIT_BUREAU_DAY"] == 0) & (
456 | df["AMT_REQ_CREDIT_BUREAU_MON"] > 0), "APP_NEW_REQ"] = 1
457 |     # long-past enquiries
458 | df.loc[(df["AMT_REQ_CREDIT_BUREAU_HOUR"] == 0) & (df["AMT_REQ_CREDIT_BUREAU_DAY"] == 0) &
459 | (df["AMT_REQ_CREDIT_BUREAU_WEEK"] == 0) & (df["AMT_REQ_CREDIT_BUREAU_MON"] == 0) &
460 | (df["AMT_REQ_CREDIT_BUREAU_QRT"] > 0), "APP_NEW_REQ"] = 2
461 |
462 | df.loc[(df["AMT_REQ_CREDIT_BUREAU_HOUR"] == 0) & (df["AMT_REQ_CREDIT_BUREAU_DAY"] == 0) &
463 | (df["AMT_REQ_CREDIT_BUREAU_WEEK"] == 0) & (df["AMT_REQ_CREDIT_BUREAU_MON"] == 0) &
464 | (df["AMT_REQ_CREDIT_BUREAU_YEAR"] > 0), "APP_NEW_REQ"] = 2
465 |     # applicants with no enquiries at all
466 | df.loc[(pd.isna(df["APP_NEW_REQ"])), "APP_NEW_REQ"] = 0
467 |
468 |     # new features carried over from the previous group
469 | df['DAYS_EMPLOYED'].replace(365243, np.nan, inplace=True)
470 | df['NEW_DAYS_EMPLOYED_PERC'] = df['DAYS_EMPLOYED'] / df['DAYS_BIRTH']
471 | df['NEW_INCOME_PER_PERSON'] = df['AMT_INCOME_TOTAL'] / df['CNT_FAM_MEMBERS']
472 |
473 | df['NEW_EXT_RESOURCE_3_CREDIT_TO_GOODS_RATIO'] = df['EXT_SOURCE_3'] / (df['AMT_CREDIT'] / df['AMT_GOODS_PRICE'])
474 | df['NEW_EXT_RESOURCE_2_CREDIT_TO_GOODS_RATIO'] = df['EXT_SOURCE_2'] / (df['AMT_CREDIT'] / df['AMT_GOODS_PRICE'])
475 | df['NEW_EXT_RESOURCE_1_CREDIT_TO_GOODS_RATIO'] = df['EXT_SOURCE_1'] / (df['AMT_CREDIT'] / df['AMT_GOODS_PRICE'])
476 |
477 | df.drop("index", axis=1, inplace=True)
478 |
479 | df.columns = pd.Index(["APP_" + col for col in df.columns.tolist()])
480 |
481 | df.rename(columns={"APP_SK_ID_CURR": "SK_ID_CURR"}, inplace=True)
482 |
483 | df.rename(columns={"APP_TARGET": "TARGET"}, inplace=True)
484 |
485 | return df
486 |
487 |
488 | def previous_application():
489 | conn = pymysql.connect(host='35.228.28.142', port=int(63306), user='group2', passwd='123654', db='home_credit')
490 | df_prev = pd.read_sql_query("SELECT * FROM previous_application", conn)
491 | df_prev = reduce_mem_usage(df_prev)
492 | pd.set_option('display.max_columns', 500)
493 | pd.set_option('display.max_rows', 500)
494 |     df_prev = df_prev.sample(1000)  # NOTE: only a 1000-row sample of previous_application is processed here
495 |
496 |     # Features that have outliers
497 | feat_outlier = ["AMT_ANNUITY", "AMT_APPLICATION", "AMT_CREDIT", "AMT_DOWN_PAYMENT", "AMT_GOODS_PRICE",
498 | "SELLERPLACE_AREA"]
499 |
500 |     # Cap outliers: values above the upper fence (IQR computed on the 1% / 99% quantiles) are clipped to it
501 |     for var in feat_outlier:
502 |         Q1 = df_prev[var].quantile(0.01)
503 |         Q3 = df_prev[var].quantile(0.99)
504 |         IQR = Q3 - Q1
505 |         lower = Q1 - 1.5 * IQR  # the lower fence is computed but not applied
506 |         upper = Q3 + 1.5 * IQR
507 | 
508 |         df_prev.loc[df_prev[var] > upper, var] = upper
509 |
510 |     # The placeholder value 365243 is replaced with NaN in the following features
511 | feature_replace = ['DAYS_FIRST_DRAWING', 'DAYS_FIRST_DUE', 'DAYS_LAST_DUE_1ST_VERSION', 'DAYS_LAST_DUE',
512 | 'DAYS_TERMINATION']
513 |
514 | for var in feature_replace:
515 | df_prev[var].replace(365243, np.nan, inplace=True)
516 |
517 | # One hot encoding
518 | categorical_columns = [col for col in df_prev.columns if df_prev[col].dtype == 'object']
519 | df_prev = pd.get_dummies(df_prev, columns=categorical_columns, dummy_na=True)
520 |
521 | # Creating new features
522 |
523 | df_prev['APP_CREDIT_PERC'] = df_prev['AMT_APPLICATION'] / df_prev['AMT_CREDIT']
524 | df_prev['NEW_CREDIT_TO_ANNUITY_RATIO'] = df_prev['AMT_CREDIT'] / df_prev['AMT_ANNUITY']
525 | df_prev['NEW_DOWN_PAYMENT_TO_CREDIT'] = df_prev['AMT_DOWN_PAYMENT'] / df_prev['AMT_CREDIT']
526 | df_prev['NEW_TOTAL_PAYMENT'] = df_prev['AMT_ANNUITY'] * df_prev['CNT_PAYMENT']
527 | df_prev['NEW_TOTAL_PAYMENT_TO_AMT_CREDIT'] = df_prev['NEW_TOTAL_PAYMENT'] / df_prev['AMT_CREDIT']
528 |     # Simplified interest rate of the previous application
529 |
530 | df_prev['SIMPLE_INTERESTS'] = (df_prev['NEW_TOTAL_PAYMENT'] / df_prev['AMT_CREDIT'] - 1) / df_prev['CNT_PAYMENT']
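530 |     # NEW_TOTAL_PAYMENT / AMT_CREDIT - 1 is the total interest as a fraction of the principal; dividing by CNT_PAYMENT spreads it evenly per instalment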
531 |
532 | # Previous applications numeric features
533 | num_aggregations = {}
534 | num_cols = df_prev.select_dtypes(exclude=['object'])
535 | num_cols.drop(['SK_ID_PREV', 'SK_ID_CURR'], axis=1, inplace=True)
536 |
537 | for num in num_cols:
538 | num_aggregations[num] = ['min', 'max', 'mean', 'var', 'sum']
539 |
540 |     # Previous applications categoric features (all object columns were already one-hot encoded above, so this is a safeguard)
541 | cat_aggregations = {}
542 | for i in df_prev.columns:
543 | if df_prev[i].dtypes == "O":
544 | cat_aggregations[i] = ['mean']
545 |
546 | prev_agg = df_prev.groupby('SK_ID_CURR').agg({**num_aggregations, **cat_aggregations})
547 | prev_agg.columns = pd.Index(['PREV_' + e[0] + "_" + e[1].upper() for e in prev_agg.columns.tolist()])
548 |
549 |     # Drop aggregated features with near-zero variance (std < 0.1), which carry little signal
550 | features_with_small_variance = prev_agg.columns[(prev_agg.std(axis=0) < .1).values]
551 | prev_agg.drop(features_with_small_variance, axis=1, inplace=True)
552 | prev_agg.reset_index(inplace=True)
553 |
554 | return prev_agg
555 |
556 |
557 | def credit_card_balance():
558 | conn = pymysql.connect(host='35.228.28.142', port=int(63306), user='group2', passwd='123654', db='home_credit')
559 | ccb = pd.read_sql_query("SELECT * FROM credit_card_balance", conn)
560 |
561 | ccb = reduce_mem_usage(ccb)
562 |     ccb = ccb.sample(1000)  # NOTE: only a 1000-row sample of credit_card_balance is processed here
563 |
564 | ccb = ccb.groupby('SK_ID_CURR').agg(['mean'])
565 |     ccb.columns = pd.Index(
566 |         ['CC_' + col[0] + "_" + col[1].upper() for col in ccb.columns.tolist()])
568 |
569 |     # new feature1: ratio of the balance (loan) to the credit card limit; +1 on both sides avoids division by zero
570 | ccb["CC_NEW_LOAN_TO_CREDIT_LIMIT_RATE"] = (ccb["CC_AMT_BALANCE_MEAN"] + 1) / (
571 | ccb["CC_AMT_CREDIT_LIMIT_ACTUAL_MEAN"] + 1)
572 |
573 |     # new feature2: share of the total receivable the customer actually paid
574 |     # (CC_AMT_PAYMENT_TOTAL_CURRENT_MEAN / CC_AMT_TOTAL_RECEIVABLE_MEAN, as a percentage)
575 | ccb["CC_NEW_PAID_AMOUNT_RATE"] = (ccb["CC_AMT_PAYMENT_TOTAL_CURRENT_MEAN"] + 1) / (
576 | ccb["CC_AMT_TOTAL_RECEIVABLE_MEAN"] + 1) * 100
577 |
578 |     # new feature3: average amount withdrawn from ATMs per drawing
579 | ccb["CC_NEW_AMT_PER_ATM_DRAWING_MEAN"] = (ccb["CC_AMT_DRAWINGS_ATM_CURRENT_MEAN"] + 1) / (
580 | ccb["CC_CNT_DRAWINGS_ATM_CURRENT_MEAN"] + 1)
581 |
582 |     # new feature4: average amount withdrawn at POS terminals per drawing
583 | ccb["CC_NEW_AMT_PER_POS_DRAWING_MEAN"] = (ccb["CC_AMT_DRAWINGS_POS_CURRENT_MEAN"] + 1) / (
584 | ccb["CC_CNT_DRAWINGS_POS_CURRENT_MEAN"] + 1)
585 |
586 | ccb = pd.concat([ccb.loc[:, "CC_NEW_LOAN_TO_CREDIT_LIMIT_RATE"],
587 | ccb.loc[:, "CC_NEW_PAID_AMOUNT_RATE"],
588 | ccb.loc[:, "CC_AMT_CREDIT_LIMIT_ACTUAL_MEAN"],
589 | ccb.loc[:, "CC_AMT_PAYMENT_CURRENT_MEAN"],
590 | ccb.loc[:, "CC_MONTHS_BALANCE_MEAN"],
591 | ccb.loc[:, "CC_CNT_INSTALMENT_MATURE_CUM_MEAN"],
592 | ccb.loc[:, "CC_AMT_INST_MIN_REGULARITY_MEAN"],
593 | ccb.loc[:, "CC_AMT_DRAWINGS_ATM_CURRENT_MEAN"],
594 | ccb.loc[:, "CC_AMT_DRAWINGS_POS_CURRENT_MEAN"],
595 | ccb.loc[:, "CC_CNT_DRAWINGS_ATM_CURRENT_MEAN"],
596 | ccb.loc[:, "CC_CNT_DRAWINGS_POS_CURRENT_MEAN"],
597 | ccb.loc[:, "CC_NEW_AMT_PER_ATM_DRAWING_MEAN"],
598 | ccb.loc[:, "CC_NEW_AMT_PER_POS_DRAWING_MEAN"]], axis=1)
599 |
600 | return ccb
601 |
602 |
603 | def prepare_instalment_payment():
604 | conn = pymysql.connect(host='35.228.28.142', port=int(63306), user='group2', passwd='123654', db='home_credit')
605 | df_installments_payments = pd.read_sql_query("SELECT * FROM installments_payments", conn)
606 |
607 | df_installments_payments = reduce_mem_usage(df_installments_payments)
608 |
609 |     # Percentage of the current instalment that was actually paid
610 | df_installments_payments[['AMT_PAYMENT']] = df_installments_payments[['AMT_PAYMENT']].fillna(value=0)
611 | df_installments_payments['NEW_INSTALMENT_PAYMENT_RATE'] = df_installments_payments['AMT_PAYMENT'] / \
612 | df_installments_payments['AMT_INSTALMENT'] * 100
613 |
614 |     # Days the instalment was paid before its due date; a temporary feature used only to derive "NEW_INSTALMENT_PAYMENT_STATUS".
615 | df_installments_payments['NEW_DAY_BEFORE_END_DATE'] = df_installments_payments['DAYS_INSTALMENT'] - \
616 | df_installments_payments['DAYS_ENTRY_PAYMENT']
617 |
618 | df_installments_payments["NEW_INSTALMENT_PAYMENT_STATUS"] = "No Payment"
619 | df_installments_payments.loc[
620 | df_installments_payments['NEW_DAY_BEFORE_END_DATE'] == 0, "NEW_INSTALMENT_PAYMENT_STATUS"] = "In Time"
621 | df_installments_payments.loc[
622 | df_installments_payments['NEW_DAY_BEFORE_END_DATE'] > 0, "NEW_INSTALMENT_PAYMENT_STATUS"] = "Early"
623 | df_installments_payments.loc[
624 | df_installments_payments['NEW_DAY_BEFORE_END_DATE'] < 0, "NEW_INSTALMENT_PAYMENT_STATUS"] = "Late"
625 |
626 | df_installments_payments["NEW_INS_IS_LATE"] = "No"
627 | df_installments_payments.loc[df_installments_payments['NEW_DAY_BEFORE_END_DATE'] < 0, "NEW_INS_IS_LATE"] = "Yes"
628 |     # Label encoding is applied since the variable has only two classes.
629 | df_installments_payments = apply_label_encoding(df_installments_payments, ["NEW_INS_IS_LATE"])
630 |
631 | df_installments_payments.drop(columns=['NEW_DAY_BEFORE_END_DATE'], inplace=True)
632 |
633 | df_installments_payments, ip_cat = apply_one_hot_encoding(df_installments_payments)
634 |
635 | ip_aggregations = {
636 | 'NUM_INSTALMENT_VERSION': ['max'],
637 | 'NUM_INSTALMENT_NUMBER': ['max'],
638 | 'AMT_INSTALMENT': ['sum'],
639 | 'AMT_PAYMENT': ['sum'],
640 | 'NEW_INSTALMENT_PAYMENT_RATE': ['min', 'max', 'mean'],
641 | 'NEW_INS_IS_LATE': ['mean', 'sum']
642 | }
643 |
644 | for col in ip_cat:
645 | ip_aggregations[col] = ['mean']
646 |
647 | df_ip_agg = df_installments_payments.groupby(['SK_ID_CURR']).agg(ip_aggregations)
648 |
649 | df_ip_agg.columns = pd.Index([e[0] + "_" + e[1].upper() for e in df_ip_agg.columns.tolist()])
650 |
651 | return df_ip_agg
652 |
653 |
654 | def prepare_pos_cash_balance():
655 | conn = pymysql.connect(host='35.228.28.142', port=int(63306), user='group2', passwd='123654', db='home_credit')
656 | df_pos_cash_balance = pd.read_sql_query("SELECT * FROM POS_CASH_balance", conn)
657 |
658 | df_pos_cash_balance, pcb_cat = apply_one_hot_encoding(df_pos_cash_balance)
659 |
660 | pcb_aggregations = {
661 | 'SK_ID_PREV': ['min', 'max', 'mean', 'count'],
662 | 'MONTHS_BALANCE': ['min', 'max'],
663 | 'CNT_INSTALMENT': ['min', 'max', 'mean'],
664 | 'CNT_INSTALMENT_FUTURE': ['min', 'max', 'mean'],
665 | 'SK_DPD': ['max', 'mean'],
666 | 'SK_DPD_DEF': ['max', 'mean']
667 | }
668 |
669 | for col in pcb_cat:
670 | pcb_aggregations[col] = ['mean']
671 |
672 | df_pcb_agg = df_pos_cash_balance.groupby(['SK_ID_CURR']).agg(pcb_aggregations)
673 | df_pcb_agg.columns = pd.Index([e[0] + "_" + e[1].upper() for e in df_pcb_agg.columns.tolist()])
674 |
675 | return df_pcb_agg
676 |
677 |
678 | def installment_payment_main():
679 |     df_ip_agg = prepare_instalment_payment()  # final instalments-payments table
680 |
681 |     df_pcb_agg = prepare_pos_cash_balance()  # final POS cash balance table
682 |
683 |     df_pos_ins = df_ip_agg.join(df_pcb_agg, how='inner',
684 |                                 on=['SK_ID_CURR'])  # instalments payments joined with POS cash balance
685 |
686 | return df_ip_agg, df_pcb_agg, df_pos_ins
687 |
688 |
689 | def pre_processing_and_combine():
690 | with timer("Process application train"):
691 | df = application_train_g()
692 | print("application train & test shape:", df.shape)
693 |
694 | with timer("Bureau and Bureau Balance"):
695 | df_final = bureau_and_bureau_balance_features()
696 | print("Bureau and Bureau Balance:", df_final.shape)
697 |
698 | with timer("Installment Payments"):
699 | df_ip_agg, df_pcb_agg, df_pos_ins = installment_payment_main()
700 | print("Installment Payments", df_ip_agg.shape)
701 |
702 | with timer("Pos Cash Balance"):
703 | print("Pos Cash Balance:", df_pcb_agg.shape)
704 |
705 | with timer("Credit Card Balance"):
706 | ccb = credit_card_balance()
707 | print("Credit Card Balance:", ccb.shape)
708 |
709 | with timer("previous_application"):
710 | prev_agg = previous_application()
711 | print("previous_application:", prev_agg.shape)
712 |
713 |     with timer("Combining all tables"):
714 | df = df.merge(df_final, how="left", on="SK_ID_CURR")
715 | df1 = df.merge(df_ip_agg, how='left', on='SK_ID_CURR')
716 | df2 = df1.merge(df_pcb_agg, how='left', on='SK_ID_CURR')
717 | df3 = df2.merge(ccb, how='left', on='SK_ID_CURR')
718 | all_df = df3.merge(prev_agg, how='left', on='SK_ID_CURR')
719 |
720 | print("all_df process:", all_df.shape)
721 |
722 | return all_df
723 |
724 |
725 | def modeling(all_data):
726 | all_data = all_data.rename(columns=lambda x: re.sub('[^A-Za-z0-9_]+', '', x))
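726 |     # (LightGBM rejects special JSON characters in feature names, hence the regex clean-up above)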
727 |
728 | train_df = all_data[all_data['TARGET'].notnull()]
729 | test_df = all_data[all_data['TARGET'].isnull()]
730 |
731 | folds = KFold(n_splits=10, shuffle=True, random_state=1001)
732 |
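732 |     # out-of-fold scheme: oof_preds collects validation-fold predictions, so every training row is scored by a model that never saw it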
733 | oof_preds = np.zeros(train_df.shape[0])
734 | sub_preds = np.zeros(test_df.shape[0])
735 | feature_importance_df = pd.DataFrame()
736 |
737 | feats = [f for f in train_df.columns if f not in ['TARGET', 'SK_ID_CURR']]
738 |
739 | for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])):
740 | train_x, train_y = train_df[feats].iloc[train_idx], train_df['TARGET'].iloc[train_idx]
741 |
742 | valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx]
743 |
744 | clf = LGBMClassifier(
745 | n_jobs=-1,
746 | n_estimators=10000,
747 | learning_rate=0.02,
748 | num_leaves=34,
749 | colsample_bytree=0.9497036,
750 | subsample=0.8715623,
751 | max_depth=8,
752 | reg_alpha=0.041545473,
753 | reg_lambda=0.0735294,
754 | min_split_gain=0.0222415,
755 | min_child_weight=39.3259775,
756 | silent=-1,
757 | verbose=-1, )
758 |
759 | clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)],
760 | eval_metric='auc', verbose=200, early_stopping_rounds=200)
761 |
762 | # y_pred_valid
763 | oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]
764 | sub_preds += clf.predict_proba(test_df[feats], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits
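764 |         # test-set probabilities are averaged over the folds, each contributing 1 / n_splits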
765 |
766 | fold_importance_df = pd.DataFrame()
767 | fold_importance_df["feature"] = feats
768 | fold_importance_df["importance"] = clf.feature_importances_
769 | fold_importance_df["fold"] = n_fold + 1
770 | feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
771 |
772 | print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))
773 |
774 | print('Full AUC score %.6f' % roc_auc_score(train_df['TARGET'], oof_preds)) # y_pred_valid
775 |
776 |     test_df['TARGET'] = sub_preds
777 |     test_df[['SK_ID_CURR', 'TARGET']].to_csv("outputs/predictions/atilla_muhammet.csv", index=False)
778 |
779 | display_importances(feature_importance_df)
780 |
781 | return feature_importance_df
782 |
783 |
784 | def main():
785 | with timer("Preprocessing Time"):
786 | all_data = pre_processing_and_combine()
787 |
788 | with timer("Modeling"):
789 | feat_importance = modeling(all_data)
790 |
791 |
792 | if __name__ == "__main__":
793 | with timer("Full model run"):
794 | main()
795 |
--------------------------------------------------------------------------------
/models/dsmlbc2/merve_betul.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "pycharm": {
7 | "name": "#%% md\n"
8 | }
9 | },
10 | "source": [
11 | "Libraries"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": 1,
17 | "metadata": {
18 | "pycharm": {
19 | "is_executing": false,
20 | "name": "#%% \n"
21 | }
22 | },
23 | "outputs": [],
24 | "source": [
25 | "import numpy as np\n",
26 | "import pandas as pd\n",
27 | "import gc\n",
28 | "import time\n",
29 | "from contextlib import contextmanager\n",
30 | "from lightgbm import LGBMClassifier\n",
31 | "from sklearn.metrics import roc_auc_score\n",
32 | "from sklearn.model_selection import KFold, StratifiedKFold\n",
33 | "from sklearn.preprocessing import LabelEncoder\n",
34 | "import matplotlib.pyplot as plt\n",
35 | "import seaborn as sns\n",
36 | "import warnings\n",
37 | "warnings.simplefilter(action='ignore', category=FutureWarning)"
38 | ]
39 | },
40 | {
41 | "cell_type": "markdown",
42 | "metadata": {
43 | "pycharm": {
44 | "name": "#%% md\n"
45 | }
46 | },
47 | "source": [
48 | "Time function for tracking run times of functions"
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": 2,
54 | "metadata": {
55 | "pycharm": {
56 | "is_executing": false,
57 | "name": "#%% \n"
58 | }
59 | },
60 | "outputs": [],
61 | "source": [
62 | "@contextmanager\n",
63 | "def timer(title):\n",
64 | " t0 = time.time()\n",
65 | " yield\n",
66 | " print(\"{} - done in {:.0f}s\".format(title, time.time() - t0))\n"
67 | ]
68 | },
69 | {
70 | "cell_type": "markdown",
71 | "metadata": {
72 | "pycharm": {
73 | "name": "#%% md\n"
74 | }
75 | },
76 | "source": [
77 | "One-hot encoding function for categorical variables with get_dummies"
78 | ]
79 | },
80 | {
81 | "cell_type": "code",
82 | "execution_count": 3,
83 | "metadata": {
84 | "pycharm": {
85 | "is_executing": false,
86 | "name": "#%%\n"
87 | }
88 | },
89 | "outputs": [],
90 | "source": [
91 | "def one_hot_encoder(df, nan_as_category = True):\n",
92 | " original_columns = list(df.columns) # col names as string in a list \n",
93 | " categorical_columns = [col for col in df.columns if df[col].dtype == 'object'] #categorical col names\n",
94 | " df = pd.get_dummies(df, columns = categorical_columns, dummy_na = nan_as_category) #creating dummies\n",
95 | " new_columns = [c for c in df.columns if c not in original_columns] #new col names\n",
96 | " return df, new_columns\n",
97 | "\n",
98 | "def label_encoder(df):\n",
99 | " # Create a label encoder object\n",
100 | " le = LabelEncoder()\n",
101 | " le_count = 0\n",
102 | "\n",
103 | " # Iterate through the columns\n",
104 | " for col in df:\n",
105 | " if df[col].dtype == 'object':\n",
106 | " # If 2 or fewer unique categories\n",
107 | " if len(list(df[col].unique())) <= 2:\n",
108 | " # Train on the training data\n",
109 | " le.fit(df[col])\n",
110 | " # Transform both training and testing data\n",
111 | " df[col] = le.transform(df[col])\n",
112 | "\n",
113 | " # Keep track of how many columns were label encoded\n",
114 | " le_count += 1\n",
115 | "\n",
116 | " print('%d columns were label encoded.' % le_count)\n",
117 | " \n",
118 | " return df"
119 | ]
120 | },
121 | {
122 | "cell_type": "markdown",
123 | "metadata": {
124 | "pycharm": {
125 | "name": "#%% md\n"
126 | }
127 | },
128 | "source": [
129 | "Preprocess application_train and application_test"
130 | ]
131 | },
132 | {
133 | "cell_type": "code",
134 | "execution_count": 4,
135 | "metadata": {
136 | "jupyter": {
137 | "outputs_hidden": false
138 | },
139 | "pycharm": {
140 | "is_executing": false,
141 | "name": "#%%\n"
142 | }
143 | },
144 | "outputs": [],
145 | "source": [
146 | "\n",
147 | "# Preprocess application_train.csv and application_test.csv\n",
148 | "def application_train_test(num_rows = None, nan_as_category = False):\n",
149 | " df = pd.read_csv(\"data/application_train.csv\", nrows = num_rows)\n",
150 | "    test_df = pd.read_csv(\"data/application_test.csv\", nrows = num_rows)\n",
151 | "\n",
152 | "    df = pd.concat([df, test_df]).reset_index()\n",
153 | " del df[\"index\"]\n",
154 | " \n",
155 | " df = df[df['CODE_GENDER'] != 'XNA']\n",
156 | " df['DAYS_EMPLOYED'].replace(365243, np.nan, inplace= True)\n",
157 | "\n",
158 | " df['APP_NEW_AGE'] = df['DAYS_BIRTH'] / (- 365.25)\n",
159 | " \n",
160 | " APP_NEW_AGE_CAT = pd.Series([\"Young\", \"Adult 1\",\"Adult 2\",\"Adult 3\", \"Adult 4\"], dtype = \"object\")\n",
161 | "    df[\"APP_NEW_AGE_CAT\"] = np.nan  # start empty; the age bands below fill it in\n",
162 | " df.loc[(df[\"APP_NEW_AGE\"] > 20.0) & (df[\"APP_NEW_AGE\"] <= 30.0), \"APP_NEW_AGE_CAT\"] = APP_NEW_AGE_CAT[0]\n",
163 | " df.loc[(df[\"APP_NEW_AGE\"] > 30.0) & (df[\"APP_NEW_AGE\"] <= 40.0), \"APP_NEW_AGE_CAT\"] = APP_NEW_AGE_CAT[1]\n",
164 | " df.loc[(df[\"APP_NEW_AGE\"] > 40.0) & (df[\"APP_NEW_AGE\"] <= 50.0), \"APP_NEW_AGE_CAT\"] = APP_NEW_AGE_CAT[2]\n",
165 | " df.loc[(df[\"APP_NEW_AGE\"] > 50.0) & (df[\"APP_NEW_AGE\"] <= 60.0), \"APP_NEW_AGE_CAT\"] = APP_NEW_AGE_CAT[3]\n",
166 | " df.loc[df[\"APP_NEW_AGE\"] > 60 ,\"APP_NEW_AGE_CAT\"] = APP_NEW_AGE_CAT[4]\n",
167 | " \n",
168 | " df[\"APP_NEW_AGE_DAYS_EMP\"] = df[\"DAYS_EMPLOYED\"] / (- 365.25)\n",
169 | " df[\"APP_NEW_AGE_WORK_PERCENT\"] = (df[\"APP_NEW_AGE_DAYS_EMP\"] / df['APP_NEW_AGE']) * 100\n",
170 | " df['APP_NEW_CREDIT_INCOME_PERCENT'] = df['AMT_CREDIT'] / df['AMT_INCOME_TOTAL']\n",
171 | " df['APP_NEW_ANNUITY_INCOME_PERCENT'] = df['AMT_ANNUITY'] / df['AMT_INCOME_TOTAL']\n",
172 | " df['APP_NEW_DAYS_EMPLOYED_PERCENT'] = df['DAYS_EMPLOYED'] / df['DAYS_BIRTH']\n",
173 | " df['APP_NEW_INCOME_PER_PERSON'] = df['AMT_INCOME_TOTAL'] / df['CNT_FAM_MEMBERS']\n",
174 | " df['APP_NEW_INCOME_CREDIT_PERC'] = df['AMT_INCOME_TOTAL'] / df['AMT_CREDIT']\n",
175 | " df['APP_NEW_PAYMENT_RATE'] = df['AMT_ANNUITY'] / df['AMT_CREDIT']\n",
176 | " df['APP_NEW_AMT_PAY_YEAR'] = df['AMT_CREDIT'] / df['AMT_ANNUITY'] \n",
177 | " df['APP_NEW_AGE_PAYOFF'] = df['APP_NEW_AGE'] + df['APP_NEW_AMT_PAY_YEAR']\n",
178 | " df['APP_NEW_AMT_DIFF_CREDIT_GOODS'] = df['AMT_CREDIT'] - df['AMT_GOODS_PRICE'] \n",
179 | " df['APP_NEW_AMT_CREDIT_GOODS_PERC'] = ((df['AMT_GOODS_PRICE'] / df['AMT_CREDIT']) * 100)\n",
180 | " df['APP_NEW_CNT_ADULT'] = df['CNT_FAM_MEMBERS'] - df['CNT_CHILDREN']\n",
181 | " df['APP_NEW_EXT_SOURCES_MEAN'] = df[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].mean(axis=1)\n",
182 | "\n",
183 | " df = label_encoder(df)\n",
184 | " \n",
185 | " df, cat_cols = one_hot_encoder(df)\n",
186 | "\n",
187 | " del test_df\n",
188 | " gc.collect()\n",
189 | " return df"
190 | ]
191 | },
192 | {
193 | "cell_type": "markdown",
194 | "metadata": {
195 | "pycharm": {
196 | "name": "#%% md\n"
197 | }
198 | },
199 | "source": [
200 | "Preprocess bureau.csv and bureau_balance.csv"
201 | ]
202 | },
203 | {
204 | "cell_type": "code",
205 | "execution_count": 5,
206 | "metadata": {
207 | "pycharm": {
208 | "is_executing": false,
209 | "name": "#%%\n"
210 | }
211 | },
212 | "outputs": [],
213 | "source": [
214 | "# Preprocess bureau.csv and bureau_balance.csv\n",
215 | "def bureau_and_balance(num_rows = None, nan_as_category = True):\n",
216 | " bureau = pd.read_csv('data/bureau.csv', nrows = num_rows)\n",
217 | " bureau_balance = pd.read_csv('data/bureau_balance.csv', nrows = num_rows)\n",
218 | " \n",
219 | " \n",
220 | " # Bureau balance: Perform aggregations and merge with bureau.csv\n",
221 | " def _status_to_int(status):\n",
222 | " if status in ['X', 'C']:\n",
223 | " return 0\n",
224 | " if pd.isnull(status):\n",
225 | " return np.nan\n",
226 | " return int(status)\n",
227 | "\n",
228 | " bureau_balance['NEW_BUREAU_BALANCE_DPD_LEVEL'] = bureau_balance['STATUS'].apply(_status_to_int)\n",
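228 | "    # STATUS digits 1-5 are increasing days-past-due buckets; C (closed) and X (unknown) are mapped to 0\n",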
229 | " bureau_balance['NEW_BUREAU_BALANCE_STATUS_UNKNOW'] = (bureau_balance['STATUS'] == 'X').astype(int) \n",
230 | "\n",
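230 | "    # MONTHS_BALANCE is a non-positive month offset from the application date; flip it so larger values mean older history\n",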
231 | " bureau_balance[\"MONTHS_BALANCE\"] = (-1*bureau_balance[\"MONTHS_BALANCE\"])+1\n",
232 | "\n",
233 | " bb_aggregations = {'MONTHS_BALANCE': [\"max\"],\n",
234 | " 'NEW_BUREAU_BALANCE_DPD_LEVEL':['sum', 'mean', 'max', 'std', 'skew'],\n",
235 | " 'NEW_BUREAU_BALANCE_STATUS_UNKNOW':['sum', 'mean']}\n",
236 | "\n",
237 | " bb_agg = bureau_balance.groupby('SK_ID_BUREAU').agg(bb_aggregations)\n",
238 | "\n",
239 | " bb_agg.columns = pd.Index([e[0] + \"_\" + e[1].upper() for e in bb_agg.columns.tolist()])\n",
240 | " bureau = bureau.join(bb_agg, how='left', on='SK_ID_BUREAU')\n",
241 | " bureau.drop(['SK_ID_BUREAU'], axis=1, inplace= True)\n",
242 | " del bureau_balance, bb_agg\n",
243 | " gc.collect()\n",
244 | " \n",
245 | " # Bureau new features\n",
246 | "    bureau.drop([\"AMT_CREDIT_SUM_LIMIT\",\"AMT_CREDIT_SUM_OVERDUE\",\"CREDIT_DAY_OVERDUE\"],axis=1,inplace=True)\n",
247 | " bureau['BUREAU_CREDIT_TYPE_CONSUMER'] = (bureau['CREDIT_TYPE'] == 'Consumer credit').astype(int)\n",
248 | " bureau['BUREAU_CREDIT_TYPE_CAR'] = (bureau['CREDIT_TYPE'] == 'Car loan').astype(int)\n",
249 | " bureau['BUREAU_CREDIT_TYPE_MORTGAGE'] = (bureau['CREDIT_TYPE'] == 'Mortgage').astype(int)\n",
250 | " bureau['BUREAU_CREDIT_TYPE_CREDIT_CARD'] = (bureau['CREDIT_TYPE'] == 'Credit card').astype(int)\n",
251 | " bureau['BUREAU_CREDIT_TYPE_OTHER'] = (~(bureau['CREDIT_TYPE'].isin(['Consumer credit',\n",
252 | " 'Car loan', 'Mortgage', 'Credit card']))).astype(int)\n",
253 | " bureau['BUREAU_UNUSUAL_CURRENCY'] = (~(bureau['CREDIT_CURRENCY'] == 'currency 1')).astype(int)\n",
254 | " bureau['NEW_PAYMENT_RATE_SUM'] = bureau['AMT_ANNUITY'] / bureau['AMT_CREDIT_SUM']\n",
255 | " bureau['NEW_PAYMENT_RATE_SUM_DEBT'] = bureau['AMT_ANNUITY'] / bureau['AMT_CREDIT_SUM_DEBT']\n",
256 | " bureau['NEW_PAYMENT_RATE_AMT_CREDIT_MAX_OVERDUE'] = bureau['AMT_ANNUITY'] / bureau['AMT_CREDIT_MAX_OVERDUE']\n",
257 | " \n",
258 | " bureau.drop([\"CREDIT_TYPE\",\"CREDIT_CURRENCY\"],axis=1,inplace=True)\n",
259 | " # Bureau and bureau_balance numeric features\n",
260 | " num_aggregations = {\n",
261 | " \"DAYS_CREDIT\": ['min', 'max', 'mean', 'var', 'sum'],\n",
262 | " 'DAYS_CREDIT_ENDDATE': ['min', 'max', 'mean'],\n",
263 | " 'DAYS_CREDIT_UPDATE': ['mean'],\n",
264 | " 'AMT_CREDIT_SUM': ['max', 'mean', 'sum'],\n",
265 | " 'AMT_CREDIT_SUM_DEBT': ['max', 'mean', 'sum'],\n",
266 | " 'AMT_CREDIT_MAX_OVERDUE': ['mean'],\n",
267 | " 'DAYS_ENDDATE_FACT': ['mean', 'sum'],\n",
268 | " 'AMT_ANNUITY': ['max', 'mean'],\n",
269 | " 'CNT_CREDIT_PROLONG': ['sum','std'],\n",
270 | " 'MONTHS_BALANCE_MAX': ['max'],\n",
271 | " \"NEW_BUREAU_BALANCE_DPD_LEVEL_SUM\" :['max',\"sum\"],\n",
272 | " \"NEW_BUREAU_BALANCE_DPD_LEVEL_MEAN\" :['max',\"sum\",\"mean\"],\n",
273 | " \"NEW_BUREAU_BALANCE_DPD_LEVEL_MAX\" :['max',\"sum\"],\n",
274 | " \"NEW_BUREAU_BALANCE_DPD_LEVEL_STD\" :['max',\"sum\",\"std\"],\n",
275 | " \"NEW_BUREAU_BALANCE_DPD_LEVEL_SKEW\" :['max',\"sum\",\"skew\"],\n",
276 | " \"NEW_BUREAU_BALANCE_STATUS_UNKNOW_SUM\" :['max',\"sum\"],\n",
277 | " \"NEW_BUREAU_BALANCE_STATUS_UNKNOW_MEAN\" :['max',\"sum\",\"mean\"],\n",
278 | " 'BUREAU_CREDIT_TYPE_CONSUMER': ['mean', 'sum'],\n",
279 | " 'BUREAU_CREDIT_TYPE_CAR': ['mean', 'sum'],\n",
280 | " 'BUREAU_CREDIT_TYPE_MORTGAGE': ['mean', 'sum'],\n",
281 | " 'BUREAU_CREDIT_TYPE_CREDIT_CARD': ['mean', 'sum'],\n",
282 | " 'BUREAU_CREDIT_TYPE_OTHER': ['mean', 'sum'],\n",
283 | " 'BUREAU_UNUSUAL_CURRENCY': ['mean', 'sum'],\n",
284 | " 'NEW_PAYMENT_RATE_SUM':['max',\"mean\",\"sum\"],\n",
285 | " 'NEW_PAYMENT_RATE_SUM_DEBT':['max',\"mean\",\"sum\"],\n",
286 | " 'NEW_PAYMENT_RATE_AMT_CREDIT_MAX_OVERDUE':['max',\"mean\",\"sum\"]\n",
287 | " }\n",
288 | " # Bureau and bureau_balance categorical features\n",
289 | " bureau, bureau_cat = one_hot_encoder(bureau, nan_as_category)\n",
290 | " cat_aggregations = {}\n",
291 | " for cat in bureau_cat: cat_aggregations[cat] = ['mean']\n",
292 | " \n",
293 | " \n",
294 | " bureau_agg = bureau.groupby('SK_ID_CURR').agg({**num_aggregations, **cat_aggregations})\n",
295 | " bureau_agg.columns = pd.Index(['BURO_' + e[0] + \"_\" + e[1].upper() for e in bureau_agg.columns.tolist()])\n",
296 | " # Bureau: Active credits - using only numerical aggregations\n",
297 | " active = bureau[bureau['CREDIT_ACTIVE_Active'] == 1]\n",
298 | " active_agg = active.groupby('SK_ID_CURR').agg(num_aggregations)\n",
299 | " active_agg.columns = pd.Index(['ACTIVE_' + e[0] + \"_\" + e[1].upper() for e in active_agg.columns.tolist()])\n",
300 | " bureau_agg = bureau_agg.join(active_agg, how='left', on='SK_ID_CURR')\n",
301 | " del active, active_agg\n",
302 | " gc.collect()\n",
303 | " # Bureau: Closed credits - using only numerical aggregations\n",
304 | " closed = bureau[bureau['CREDIT_ACTIVE_Closed'] == 1]\n",
305 | " closed_agg = closed.groupby('SK_ID_CURR').agg(num_aggregations)\n",
306 | " closed_agg.columns = pd.Index(['CLOSED_' + e[0] + \"_\" + e[1].upper() for e in closed_agg.columns.tolist()])\n",
307 | " bureau_agg = bureau_agg.join(closed_agg, how='left', on='SK_ID_CURR')\n",
308 | " del closed, closed_agg, bureau\n",
309 | " gc.collect()\n",
310 | " return bureau_agg"
311 | ]
312 | },
313 | {
314 | "cell_type": "code",
315 | "execution_count": 6,
316 | "metadata": {
317 | "pycharm": {
318 | "is_executing": false,
319 | "name": "#%%\n"
320 | }
321 | },
322 | "outputs": [],
323 | "source": [
324 | "\n",
325 | "# Preprocess previous_applications.csv\n",
326 | "def previous_applications(num_rows = None, nan_as_category = True):\n",
327 | " \n",
328 | " df = pd.read_csv(\"data/previous_application.csv\", nrows = num_rows)\n",
329 | " \n",
330 | " df.replace(365243,np.nan,inplace = True)\n",
331 | " df.replace(\"XNA\",np.nan,inplace = True)\n",
332 | "\n",
333 | " df['NEW_RETURN_DAY'] = df['DAYS_DECISION'] + df['CNT_PAYMENT'] * 30\n",
334 | "\n",
335 | " df['NEW_DAYS_TERMINATION_diff'] = df['DAYS_TERMINATION'] - df['NEW_RETURN_DAY']\n",
336 | "\n",
337 | " df['NEW_AMT_DOWN_PAYMENT_rate'] = df['AMT_DOWN_PAYMENT'] / (df['AMT_CREDIT'] + 0.01)\n",
338 | "\n",
339 | " df['NEW_AMT_SPEND_TO_PRODUCT'] = df['AMT_GOODS_PRICE'] / df['AMT_CREDIT']\n",
340 | " \n",
341 | " df['NEW_DAYS_DUE']=df['DAYS_FIRST_DUE'] - df['DAYS_LAST_DUE_1ST_VERSION'] \n",
342 | " \n",
343 | " df['NEW_APP_CREDIT_PERC'] = df['AMT_APPLICATION'] / df['AMT_CREDIT']\n",
344 | " \n",
345 | " df[\"NAME_PAYMENT_TYPE\"].replace([\"Non-cash from your account\",\"Cashless from the account of the employer\"],np.nan,inplace=True)\n",
346 | "\n",
347 | " a = [\"Channel of corporate sales\",\"Car dealer\"]\n",
348 | " df[\"CHANNEL_TYPE\"].replace(a,\"Others_Type\",inplace=True)\n",
349 | "\n",
350 | " b = ['Family', 'Spouse, partner', 'Children', 'Other_B', 'Other_A', 'Group of people'] \n",
351 | " df[\"NAME_TYPE_SUITE\"] = df[\"NAME_TYPE_SUITE\"].replace(b, 'not_alone')\n",
352 | "\n",
353 | " df[\"WEEKDAY_APPR_PROCESS_START\"] = df[\"WEEKDAY_APPR_PROCESS_START\"].replace(['MONDAY','TUESDAY', 'WEDNESDAY','THURSDAY','FRIDAY'], 'WEEK_DAY') \n",
354 | " df[\"WEEKDAY_APPR_PROCESS_START\"] = df[\"WEEKDAY_APPR_PROCESS_START\"].replace(['SATURDAY', 'SUNDAY'], 'WEEKEND')\n",
355 | "\n",
356 | " a = ['Auto technology', 'Jewelry', 'MLM partners', 'Tourism'] \n",
357 | " df[\"NAME_SELLER_INDUSTRY\"] = df[\"NAME_SELLER_INDUSTRY\"].replace(a, 'Other_Ind')\n",
358 | "\n",
359 | " a = ['Auto Accessories', 'Jewelry', 'Homewares', 'Medical Supplies', 'Vehicles', 'Sport and Leisure','Gardening', 'Other', 'Office Appliances', 'Tourism', 'Medicine', 'Direct Sales', 'Fitness', 'Additional Service','Education', 'Weapon', 'Insurance', 'House Construction', 'Animals'] \n",
360 | " df[\"NAME_GOODS_CATEGORY\"] = df[\"NAME_GOODS_CATEGORY\"].replace(a, 'Other_Cat')\n",
361 | "\n",
362 | " a = ['Buying a used car','Building a house or an annex','Everyday expenses','Medicine','Payments on other loans','Education','Journey', 'Purchase of electronic equipment','Buying a new car','Wedding / gift / holiday','Buying a home','Car repairs','Furniture','Buying a holiday home / land', 'Business development','Gasification / water supply','Buying a garage','Hobby','Money for a third person','Refusal to name the goal','Urgent needs','Other']\n",
363 | " df['NAME_CASH_LOAN_PURPOSE']= df['NAME_CASH_LOAN_PURPOSE'].replace(a,'Others')\n",
364 | "\n",
365 | " df[\"NAME_PORTFOLIO\"].replace(\"cars\",np.nan,inplace=True)\n",
366 | " \n",
367 | " a = [8,9,10,11,12,13,14,15,16,17]\n",
368 | " df[\"HOUR_APPR_PROCESS_START\"] = df[\"HOUR_APPR_PROCESS_START\"].replace(a, 'Working_Hours')\n",
369 | "\n",
370 | " b = [18,19,20,21,22,23,0,1,2,3,4,5,6,7]\n",
371 | " df[\"HOUR_APPR_PROCESS_START\"] = df[\"HOUR_APPR_PROCESS_START\"].replace(b, 'Off_Hours')\n",
372 | " \n",
373 | " drops = [\"RATE_INTEREST_PRIMARY\",\"RATE_INTEREST_PRIVILEGED\",\"FLAG_LAST_APPL_PER_CONTRACT\",\"NFLAG_LAST_APPL_IN_DAY\",\"NAME_PRODUCT_TYPE\",\"SELLERPLACE_AREA\"]\n",
374 | " df.drop(drops,inplace=True,axis=1)\n",
375 | " \n",
376 | " df[\"NFLAG_INSURED_ON_APPROVAL\"] = df[\"NFLAG_INSURED_ON_APPROVAL\"].astype(\"object\")\n",
377 | " cat_features = list(df.select_dtypes(['object']).columns)\n",
378 | " df = pd.get_dummies(df, columns= cat_features, dummy_na= True,drop_first=True)\n",
379 | " \n",
380 | " agg1 = {'SK_ID_CURR': ['size'],\n",
381 | " 'AMT_ANNUITY': ['max', 'min', 'mean','std', 'sum'], \n",
382 | " 'AMT_APPLICATION':['max', 'min', 'mean','std', 'sum'],\n",
383 | " 'AMT_CREDIT':['max', 'min', 'mean','std', 'sum'],\n",
384 | " 'AMT_DOWN_PAYMENT': ['max', 'min', 'mean','std', 'sum'],\n",
385 | " 'AMT_GOODS_PRICE': ['max', 'min', 'mean','std', 'sum'],\n",
386 | " 'RATE_DOWN_PAYMENT': ['max', 'min', 'mean','std'],\n",
387 | " 'DAYS_DECISION': ['max', 'min', 'mean', 'sum'],\n",
388 | " 'CNT_PAYMENT': ['max', 'min', 'mean','std', 'sum'],\n",
389 | " 'DAYS_FIRST_DRAWING': ['max', 'min', 'mean', 'sum'],\n",
390 | " 'DAYS_FIRST_DUE': ['max', 'min', 'mean', 'sum'],\n",
391 | " 'DAYS_LAST_DUE_1ST_VERSION': ['max', 'min', 'mean', 'sum'],\n",
392 | " 'DAYS_LAST_DUE': ['max', 'min', 'mean', 'sum'],\n",
393 | " 'DAYS_TERMINATION': ['max', 'min', 'mean','std', 'sum'],\n",
394 | " 'NEW_RETURN_DAY': ['max', 'min', 'mean','std', 'sum'],\n",
395 | " 'NEW_DAYS_TERMINATION_diff': ['max', 'min', 'mean','std', 'sum'],\n",
396 | " 'NEW_AMT_DOWN_PAYMENT_rate': ['max', 'min', 'mean','std'],\n",
397 | " 'NEW_AMT_SPEND_TO_PRODUCT': ['max', 'min', 'mean','std', 'sum'],\n",
398 | " 'NEW_APP_CREDIT_PERC': ['max', 'min', 'mean'],\n",
399 | " 'NAME_CONTRACT_TYPE_Consumer loans': ['max', 'min','sum'],\n",
400 | " 'NAME_CONTRACT_TYPE_Revolving loans': ['max', 'min','sum'],\n",
401 | " 'NAME_CONTRACT_TYPE_nan': ['max', 'min','sum'],\n",
402 | " 'WEEKDAY_APPR_PROCESS_START_WEEK_DAY': ['max', 'min', 'sum'],\n",
403 | " 'WEEKDAY_APPR_PROCESS_START_nan': ['max', 'min', 'sum'],\n",
404 | " 'HOUR_APPR_PROCESS_START_Working_Hours': ['max', 'min', 'sum'],\n",
405 | " 'HOUR_APPR_PROCESS_START_nan': ['max', 'min', 'sum'],\n",
406 | " 'NAME_CASH_LOAN_PURPOSE_Repairs': ['max', 'min', 'sum'],\n",
407 | " 'NAME_CASH_LOAN_PURPOSE_XAP': ['max', 'min', 'sum'],\n",
408 | " 'NAME_CASH_LOAN_PURPOSE_nan': ['max', 'min', 'sum'],\n",
409 | " 'NAME_CONTRACT_STATUS_Canceled': ['max', 'min', 'sum'],\n",
410 | " 'NAME_CONTRACT_STATUS_Refused': ['max', 'min', 'sum'],\n",
411 | " 'NAME_CONTRACT_STATUS_Unused offer': ['max', 'min', 'sum'],\n",
412 | " 'NAME_CONTRACT_STATUS_nan': ['max', 'min', 'sum'],\n",
413 | " 'NAME_PAYMENT_TYPE_nan': ['max', 'min', 'sum'],\n",
414 | " 'CODE_REJECT_REASON_HC': ['max', 'min','sum'],\n",
415 | " 'CODE_REJECT_REASON_LIMIT': ['max', 'min','sum'],\n",
416 | " 'CODE_REJECT_REASON_SCO': ['max', 'min','sum'],\n",
417 | " 'CODE_REJECT_REASON_SCOFR': ['max', 'min', 'sum'],\n",
418 | " #'CODE_REJECT_REASON_SYSTEM': ['max', 'min', 'sum'],\n",
419 | " 'CODE_REJECT_REASON_VERIF': ['max', 'min', 'sum'],\n",
420 | " 'CODE_REJECT_REASON_XAP': ['max', 'min', 'sum'],\n",
421 | " 'CODE_REJECT_REASON_nan': ['max', 'min','sum'],\n",
422 | " 'NAME_TYPE_SUITE_not_alone': ['max', 'min','sum'],\n",
423 | " 'NAME_TYPE_SUITE_nan': ['max', 'min', 'sum'],\n",
424 | " 'NAME_CLIENT_TYPE_Refreshed': ['max', 'min','sum'],\n",
425 | " 'NAME_CLIENT_TYPE_Repeater': ['max', 'min', 'sum'],\n",
426 | " 'NAME_CLIENT_TYPE_nan': ['max', 'min','sum'],\n",
427 | " 'NAME_GOODS_CATEGORY_Clothing and Accessories': ['max', 'min', 'sum'],\n",
428 | " 'NAME_GOODS_CATEGORY_Computers': ['max', 'min','sum'],\n",
429 | " 'NAME_GOODS_CATEGORY_Construction Materials': ['max', 'min', 'sum'],\n",
430 | " 'NAME_GOODS_CATEGORY_Consumer Electronics': ['max', 'min', 'sum'],\n",
431 | " 'NAME_GOODS_CATEGORY_Furniture': ['max', 'min', 'sum'],\n",
432 | " 'NAME_GOODS_CATEGORY_Mobile': ['max', 'min', 'sum'],\n",
433 | " 'NAME_GOODS_CATEGORY_Other_Cat': ['max', 'min', 'sum'],\n",
434 | " 'NAME_GOODS_CATEGORY_Photo / Cinema Equipment': ['max', 'min', 'sum'],\n",
435 | " 'NAME_GOODS_CATEGORY_nan': ['max', 'min', 'sum'],\n",
436 | " 'NAME_PORTFOLIO_Cars': ['max', 'min', 'sum'],\n",
437 | " 'NAME_PORTFOLIO_Cash': ['max', 'min', 'sum'],\n",
438 | " 'NAME_PORTFOLIO_POS': ['max', 'min','sum'],\n",
439 | " 'NAME_PORTFOLIO_nan': ['max', 'min', 'sum'],\n",
440 | " 'CHANNEL_TYPE_Contact center': ['max', 'min', 'sum'],\n",
441 | " 'CHANNEL_TYPE_Country-wide': ['max', 'min', 'sum'],\n",
442 | " 'CHANNEL_TYPE_Credit and cash offices': ['max', 'min', 'sum'],\n",
443 | " 'CHANNEL_TYPE_Others_Type': ['max', 'min', 'sum'],\n",
444 | " 'CHANNEL_TYPE_Regional / Local': ['max', 'min','sum'],\n",
445 | " 'CHANNEL_TYPE_Stone': ['max', 'min','sum'],\n",
446 | " 'CHANNEL_TYPE_nan': ['max', 'min', 'sum'],\n",
447 | " 'NAME_SELLER_INDUSTRY_Connectivity': ['max', 'min','sum'],\n",
448 | " 'NAME_SELLER_INDUSTRY_Construction': ['max', 'min', 'sum'],\n",
449 | " 'NAME_SELLER_INDUSTRY_Consumer electronics': ['max', 'min', 'sum'],\n",
450 | " 'NAME_SELLER_INDUSTRY_Furniture': ['max', 'min', 'sum'],\n",
451 | " 'NAME_SELLER_INDUSTRY_Industry': ['max', 'min', 'sum'],\n",
452 | " 'NAME_SELLER_INDUSTRY_Other_Ind': ['max', 'min','sum'],\n",
453 | " 'NAME_SELLER_INDUSTRY_nan': ['max', 'min','sum'],\n",
454 | " 'NAME_YIELD_GROUP_low_action': ['max', 'min', 'sum'],\n",
455 | " 'NAME_YIELD_GROUP_low_normal': ['max', 'min', 'sum'],\n",
456 | " 'NAME_YIELD_GROUP_middle': ['max', 'min','sum'],\n",
457 | " 'NAME_YIELD_GROUP_nan': ['max', 'min','sum'],\n",
458 | " 'PRODUCT_COMBINATION_Card X-Sell': ['max', 'min', 'sum'],\n",
459 | " 'PRODUCT_COMBINATION_Cash': ['max', 'min', 'sum'],\n",
460 | " 'PRODUCT_COMBINATION_Cash Street: high': ['max', 'min', 'sum'],\n",
461 | " 'PRODUCT_COMBINATION_Cash Street: low': ['max', 'min','sum'],\n",
462 | " 'PRODUCT_COMBINATION_Cash Street: middle': ['max', 'min','sum'],\n",
463 | " 'PRODUCT_COMBINATION_Cash X-Sell: high': ['max', 'min','sum'],\n",
464 | " 'PRODUCT_COMBINATION_Cash X-Sell: low': ['max', 'min','sum'],\n",
465 | " 'PRODUCT_COMBINATION_Cash X-Sell: middle': ['max', 'min','sum'],\n",
466 | " 'PRODUCT_COMBINATION_POS household with interest': ['max', 'min','sum'],\n",
467 | " 'PRODUCT_COMBINATION_POS household without interest': ['max', 'min','sum'],\n",
468 | " 'PRODUCT_COMBINATION_POS industry with interest': ['max', 'min','sum'],\n",
469 | " 'PRODUCT_COMBINATION_POS industry without interest': ['max', 'min','sum'],\n",
470 | " 'PRODUCT_COMBINATION_POS mobile with interest': ['max', 'min','sum'],\n",
471 | " 'PRODUCT_COMBINATION_POS mobile without interest': ['max', 'min','sum'],\n",
472 | " 'PRODUCT_COMBINATION_POS other with interest': ['max', 'min','sum'],\n",
473 | " 'PRODUCT_COMBINATION_POS others without interest': ['max', 'min','sum'],\n",
474 | " 'PRODUCT_COMBINATION_nan': ['max', 'min','sum'],\n",
475 | " 'NFLAG_INSURED_ON_APPROVAL_1.0': ['max', 'min','sum'],\n",
476 | " 'NFLAG_INSURED_ON_APPROVAL_nan': ['max', 'min','sum']}\n",
477 | " df = df.groupby(['SK_ID_CURR']).agg(agg1)\n",
478 | " \n",
479 | " df.columns = pd.Index(['PREV_' + e[0] + \"_\" + e[1].upper() for e in df.columns.tolist()])\n",
480 | "\n",
481 | " return df\n",
482 | "\n",
483 | "# pytest"
484 | ]
485 | },
486 | {
487 | "cell_type": "code",
488 | "execution_count": 9,
489 | "metadata": {
490 | "pycharm": {
491 | "is_executing": false,
492 | "name": "#%%\n"
493 | }
494 | },
495 | "outputs": [],
496 | "source": [
497 | "# Preprocess POS_CASH_balance.csv\n",
498 | "def pos_cash(num_rows = None, nan_as_category = True):\n",
499 | " df=pd.read_csv('data/POS_CASH_balance.csv',nrows = num_rows)\n",
500 | " \n",
501 | " df['NEW_ADJOURNMENT']=df['SK_DPD']-df['SK_DPD_DEF']\n",
502 | " \n",
503 | " \n",
504 | " b = [\"Demand\",\"Returned to the store\",\"Approved\",\"Amortized debt\",\"Canceled\",\"XNA\"]\n",
505 | " df[\"NAME_CONTRACT_STATUS\"].replace(b, 'Others',inplace=True)\n",
506 | " \n",
507 | " \n",
508 | " cat_features = list(df.select_dtypes(['object']).columns)\n",
509 | " df = pd.get_dummies(df, columns= cat_features, dummy_na= True)\n",
510 | " \n",
511 | " \n",
512 | " agg={\n",
513 | " 'MONTHS_BALANCE': ['max',\"min\"],\n",
514 | " 'SK_DPD': ['max', 'mean',\"std\"],\n",
515 | " 'SK_DPD_DEF': ['max', 'mean',\"std\"],\n",
516 | " 'CNT_INSTALMENT':['min','mean','max'],\n",
517 | " 'CNT_INSTALMENT_FUTURE':['mean','min','max'],\n",
518 | " 'SK_ID_CURR':['max','size'],\n",
519 | " 'NEW_ADJOURNMENT':['max','mean',\"std\"],\n",
520 | " 'NAME_CONTRACT_STATUS_Active':['sum'],\n",
521 | " 'NAME_CONTRACT_STATUS_Completed':['sum'],\n",
522 | " 'NAME_CONTRACT_STATUS_Signed':['sum'],\n",
523 | " 'NAME_CONTRACT_STATUS_Others':['sum']\n",
524 | " \n",
525 | " }\n",
526 | " \n",
527 | " \n",
528 | " pos_agg = df.groupby(['SK_ID_PREV']).agg(agg)\n",
529 | " \n",
530 | " \n",
531 | " pos_agg.columns = pd.Index([e[0] + \"_\" + e[1].upper() for e in pos_agg.columns.tolist()])\n",
532 | " \n",
533 | " pos_agg[\"NEW_PAID_MONTH\"] = pos_agg[\"CNT_INSTALMENT_MAX\"] - pos_agg[\"CNT_INSTALMENT_FUTURE_MIN\"]\n",
534 | " \n",
535 | " agg2={\n",
536 | " \"MONTHS_BALANCE_MAX\":[\"min\",\"max\",\"mean\"],\n",
537 | " \"MONTHS_BALANCE_MIN\":[\"min\",\"max\",\"mean\"],\n",
538 | " \"SK_DPD_MAX\":[\"max\",\"mean\",\"min\"],\n",
539 | " \"SK_DPD_MEAN\":[\"max\",\"mean\",\"min\"],\n",
540 | " \"SK_DPD_STD\":[\"max\",\"mean\",\"min\",\"std\"],\n",
541 | " \"SK_DPD_DEF_MAX\":[\"max\",\"mean\",\"min\"],\n",
542 | " \"SK_DPD_DEF_MEAN\":[\"max\",\"mean\",\"min\"],\n",
543 | " \"SK_DPD_DEF_STD\":[\"max\",\"mean\",\"min\"],\n",
544 | " \"CNT_INSTALMENT_MIN\":[\"max\",\"mean\",\"min\"],\n",
545 | " \"CNT_INSTALMENT_MEAN\":[\"max\",\"mean\",\"min\"],\n",
546 | " \"CNT_INSTALMENT_MAX\":[\"max\",\"mean\",\"min\"],\n",
547 | " \"CNT_INSTALMENT_FUTURE_MEAN\":[\"max\",\"mean\",\"min\"],\n",
548 | " \"CNT_INSTALMENT_FUTURE_MIN\":[\"max\",\"mean\",\"min\"],\n",
549 | " \"CNT_INSTALMENT_FUTURE_MAX\":[\"max\",\"mean\",\"min\"],\n",
550 | " \"SK_ID_CURR_MAX\":[\"max\",\"min\"],\n",
551 | " \"SK_ID_CURR_SIZE\":[\"max\",\"min\"],\n",
552 | " \"NEW_ADJOURNMENT_MAX\":[\"max\",\"mean\",\"min\"],\n",
553 | " \"NEW_ADJOURNMENT_MEAN\":[\"max\",\"mean\",\"min\"],\n",
554 | " \"NEW_ADJOURNMENT_STD\":[\"max\",\"mean\",\"min\"],\n",
555 | " \"NAME_CONTRACT_STATUS_Active_SUM\":[\"max\",\"min\",\"sum\"],\n",
556 | " 'NAME_CONTRACT_STATUS_Signed_SUM':[\"max\",\"min\",\"sum\"],\n",
557 | " 'NAME_CONTRACT_STATUS_Completed_SUM':[\"max\",\"min\",\"sum\"],\n",
558 | " 'NAME_CONTRACT_STATUS_Others_SUM':[\"max\",\"min\",\"sum\"]\n",
559 | " \n",
560 | " }\n",
561 | " \n",
562 | " pos_agg2 = pos_agg.groupby([\"SK_ID_CURR_MAX\"]).agg(agg2)\n",
563 | " pos_agg2.index.names = ['SK_ID_CURR']\n",
564 | " \n",
565 | " pos_agg2.columns = pd.Index([\"POS\" + \"_\" + e[0] + \"_\" + e[1].upper() for e in pos_agg2.columns.tolist()])\n",
566 | " \n",
567 | " return pos_agg2"
568 | ]
569 | },
570 | {
571 | "cell_type": "code",
572 | "execution_count": 10,
573 | "metadata": {
574 | "pycharm": {
575 | "is_executing": false,
576 | "name": "#%%\n"
577 | }
578 | },
579 | "outputs": [],
580 | "source": [
581 | "# Preprocess installments_payments.csv\n",
582 | "def installments_payments(num_rows=None, nan_as_category = True):\n",
583 | " pd.options.mode.chained_assignment = None\n",
584 | " df = pd.read_csv(\"data/installments_payments.csv\", nrows = num_rows)\n",
585 | " df[\"NEW_DELAY\"] = df[\"DAYS_INSTALMENT\"] - df[\"DAYS_ENTRY_PAYMENT\"] \n",
586 | " \n",
587 | " df['NEW_FLAG_DELAY'] = df['NEW_DELAY'].apply(lambda x : 1 if x < 0 else 0)\n",
588 | " df['NEW_RATIO_DELAY'] = df[['SK_ID_PREV','NEW_FLAG_DELAY']].groupby('SK_ID_PREV')['NEW_FLAG_DELAY'].transform(lambda x : x.sum() / x.count())\n",
589 | " \n",
590 | " df[\"NEW_PAYMENT_DIFF\"] = df[\"AMT_INSTALMENT\"] - df[\"AMT_PAYMENT\"]\n",
591 | " \n",
592 | " \n",
593 | " df[\"NUM_INSTALMENT_VERSION\"] = df[\"NUM_INSTALMENT_VERSION\"].astype(\"object\")\n",
594 | "    df.loc[~df[\"NUM_INSTALMENT_VERSION\"].isin([0, 1, 2, 3]), \"NUM_INSTALMENT_VERSION\"] = 4.0  # group all other versions under 4.0\n",
595 | " \n",
596 | " cat_features = list(df.select_dtypes(['object']).columns)\n",
597 | " df = pd.get_dummies(df, columns= cat_features,drop_first=True)\n",
598 | " \n",
599 | " \n",
600 | " agg1 = {'SK_ID_CURR': ['count','max'],\n",
601 | " 'NEW_DELAY': ['max', 'min', 'mean','std', 'sum'],\n",
602 | " 'NUM_INSTALMENT_NUMBER':['min','max'], \n",
603 | " 'DAYS_INSTALMENT':['max','min','std'], \n",
604 | " 'NEW_PAYMENT_DIFF': ['max', 'mean', 'std', 'min','sum'],\n",
605 | " 'AMT_INSTALMENT': ['max', 'mean', 'sum', 'min', 'std'],\n",
606 | " 'AMT_PAYMENT': ['min', 'max', 'mean', 'sum', 'std'],\n",
607 | " 'DAYS_ENTRY_PAYMENT': ['max', 'min', 'std'],\n",
608 | " \"NUM_INSTALMENT_VERSION_1.0\":[\"sum\"],\n",
609 | " \"NUM_INSTALMENT_VERSION_2.0\":[\"sum\"],\n",
610 | " \"NUM_INSTALMENT_VERSION_3.0\":[\"sum\"],\n",
611 | " \"NUM_INSTALMENT_VERSION_4.0\":[\"sum\"]\n",
612 | " }\n",
613 | " \n",
614 | " \n",
615 | " \n",
616 | " Installments_agg = df.groupby(['SK_ID_PREV']).agg(agg1)\n",
617 | " \n",
618 | " Installments_agg.columns = pd.Index([e[0] + \"_\" + e[1].upper() for e in Installments_agg.columns.tolist()])\n",
619 | " \n",
620 | " Installments_agg['NEW_DAYS_INSTALMENT_NUMBER']=Installments_agg['DAYS_INSTALMENT_MAX']-Installments_agg['DAYS_INSTALMENT_MIN'] \n",
621 | " \n",
622 | " Installments_agg['NEW_AMT_INSTALMENT_DIFF']=Installments_agg['AMT_INSTALMENT_MAX']-Installments_agg['AMT_INSTALMENT_MIN']\n",
623 | " \n",
624 | " \n",
625 | " \n",
626 | " agg2= {'SK_ID_CURR_COUNT':['min', 'max'],\n",
627 | " 'SK_ID_CURR_MAX':['min', 'max'],\n",
628 | " 'NEW_DELAY_MAX':['min', 'max', 'mean'],\n",
629 | " 'NEW_DELAY_MIN':['min', 'max', 'mean'],\n",
630 | " 'NEW_DELAY_MEAN':['min', 'max', 'mean'],\n",
631 | " 'NEW_DELAY_STD':['min', 'max', 'mean'],\n",
632 | " 'NEW_DELAY_SUM':['min', 'max', 'mean', 'sum', 'std'],\n",
633 | " 'NUM_INSTALMENT_NUMBER_MIN':['min','max','mean'], \n",
634 | " 'NUM_INSTALMENT_NUMBER_MAX':['min','max','mean','sum'],\n",
635 | " 'NEW_DAYS_INSTALMENT_NUMBER':['min','max','std'], \n",
636 | " 'DAYS_INSTALMENT_STD':['min','max','std'], \n",
637 | " 'DAYS_INSTALMENT_MIN':['std','min','max'],\n",
638 | " 'DAYS_INSTALMENT_MAX':['std','min','max'],\n",
639 | " 'NEW_PAYMENT_DIFF_MAX':['min', 'max', 'mean',\"std\"],\n",
640 | " 'NEW_PAYMENT_DIFF_MEAN':['min', 'max', 'mean',\"std\"],\n",
641 | " 'NEW_PAYMENT_DIFF_SUM':['min', 'max', 'mean',\"std\"],\n",
642 | " 'NEW_PAYMENT_DIFF_STD':['min', 'max', 'mean',\"std\"],\n",
643 | " 'NEW_PAYMENT_DIFF_MIN':['min', 'max', 'mean',\"std\"],\n",
644 | " 'AMT_INSTALMENT_MAX':['min', 'max', 'mean',\"sum\"],\n",
645 | " 'AMT_INSTALMENT_MEAN':['min', 'max', 'mean',\"sum\"],\n",
646 | " 'AMT_INSTALMENT_SUM':['min', 'max', 'mean',\"sum\"],\n",
647 | " 'AMT_INSTALMENT_STD':['min', 'max', 'mean',\"sum\"],\n",
648 | " 'AMT_INSTALMENT_MIN':['min', 'max', 'mean',\"sum\"],\n",
649 | " 'NEW_AMT_INSTALMENT_DIFF':['min','max','mean',\"sum\"],\n",
650 | " 'AMT_PAYMENT_MIN':['min', 'max', 'mean',\"std\",\"sum\"],\n",
651 | " 'AMT_PAYMENT_MAX':['min', 'max', 'mean',\"std\",\"sum\"],\n",
652 | " 'AMT_PAYMENT_MEAN':['min', 'max', 'mean',\"std\",\"sum\"],\n",
653 | " 'AMT_PAYMENT_STD':['min', 'max', 'mean',\"std\",\"sum\"],\n",
654 | " 'AMT_PAYMENT_SUM':['min', 'max', 'mean',\"std\",\"sum\"],\n",
655 | " 'DAYS_ENTRY_PAYMENT_MIN':['min', 'max', 'mean'],\n",
656 | " 'DAYS_ENTRY_PAYMENT_STD':['min', 'max', 'mean'],\n",
657 | " 'DAYS_ENTRY_PAYMENT_MAX':['min', 'max', 'mean'],\n",
658 | " 'NUM_INSTALMENT_VERSION_1.0_SUM':['sum'],\n",
659 | " 'NUM_INSTALMENT_VERSION_2.0_SUM':['sum'],\n",
660 | " 'NUM_INSTALMENT_VERSION_3.0_SUM':['sum'],\n",
661 | " 'NUM_INSTALMENT_VERSION_4.0_SUM':['sum']\n",
662 | " }\n",
663 | " \n",
664 | " Installments_agg2=Installments_agg.groupby('SK_ID_CURR_MAX').agg(agg2)\n",
665 | " Installments_agg2.index.names = ['SK_ID_CURR']\n",
666 | " \n",
667 | " \n",
668 | "    Installments_agg2.columns = pd.Index([\"INSTAL_\" + e[0] + \"_\" + e[1].upper() for e in Installments_agg2.columns.tolist()])\n",
669 | " return Installments_agg2\n"
670 | ]
671 | },
672 | {
673 | "cell_type": "code",
674 | "execution_count": 11,
675 | "metadata": {
676 | "pycharm": {
677 | "is_executing": false,
678 | "name": "#%%\n"
679 | }
680 | },
681 | "outputs": [],
682 | "source": [
683 | "# Preprocess credit_card_balance.csv\n",
684 | "def credit_card_balance(num_rows = None, nan_as_category = True):\n",
685 | " cc = pd.read_csv('data/credit_card_balance.csv', nrows = num_rows)\n",
686 | " cc, cat_cols = one_hot_encoder(cc, nan_as_category= True)\n",
687 | " # General aggregations\n",
688 | " cc.drop(['SK_ID_PREV'], axis= 1, inplace = True)\n",
689 | " cc['number_of_instalments'] = cc.groupby(\n",
690 | " by=['SK_ID_CURR'])['CNT_INSTALMENT_MATURE_CUM'].agg('max').reset_index()[\n",
691 | " 'CNT_INSTALMENT_MATURE_CUM']\n",
692 | " cc['AMT_DRAWINGS_ATM_CURRENT'][cc['AMT_DRAWINGS_ATM_CURRENT'] < 0] = np.nan\n",
693 | " cc['AMT_DRAWINGS_CURRENT'][cc['AMT_DRAWINGS_CURRENT'] < 0] = np.nan\n",
694 | " cc_agg = cc.groupby('SK_ID_CURR').agg(['min', 'max', 'mean', 'sum', 'var'])\n",
695 | " cc_agg.columns = pd.Index(['CC_' + e[0] + \"_\" + e[1].upper() for e in cc_agg.columns.tolist()])\n",
696 | " # Count credit card lines\n",
697 | " cc_agg['CC_COUNT'] = cc.groupby('SK_ID_CURR').size()\n",
698 | " del cc\n",
699 | " gc.collect()\n",
700 | " return cc_agg\n"
701 | ]
702 | },
703 | {
704 | "cell_type": "code",
705 | "execution_count": 12,
706 | "metadata": {
707 | "pycharm": {
708 | "is_executing": false,
709 | "name": "#%%\n"
710 | }
711 | },
712 | "outputs": [],
713 | "source": [
714 | "# LightGBM GBDT with KFold or Stratified KFold\n",
715 | "# Parameters from Tilii kernel: https://www.kaggle.com/tilii7/olivier-lightgbm-parameters-by-bayesian-opt/code\n",
716 | "def kfold_lightgbm(df, num_folds, stratified = False, debug= False):\n",
717 | " import re\n",
718 | " df = df.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))\n",
719 | " # Divide in training/validation and test data\n",
720 | " train_df = df[df['TARGET'].notnull()]\n",
721 | " test_df = df[df['TARGET'].isnull()]\n",
722 | " print(\"Starting LightGBM. Train shape: {}, test shape: {}\".format(train_df.shape, test_df.shape))\n",
723 | " del df\n",
724 | " gc.collect()\n",
725 | " # Cross validation model\n",
726 | " if stratified:\n",
727 | " folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=1001)\n",
728 | " else:\n",
729 | " folds = KFold(n_splits= num_folds, shuffle=True, random_state=1001)\n",
730 | " # Create arrays and dataframes to store results\n",
731 | " oof_preds = np.zeros(train_df.shape[0])\n",
732 | " sub_preds = np.zeros(test_df.shape[0])\n",
733 | " feature_importance_df = pd.DataFrame()\n",
734 | " feats = [f for f in train_df.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']]\n",
735 | " \n",
736 | " for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])):\n",
737 | " train_x, train_y = train_df[feats].iloc[train_idx], train_df['TARGET'].iloc[train_idx]\n",
738 | " valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx]\n",
739 | "\n",
740 | " # LightGBM parameters found by Bayesian optimization\n",
741 | " clf = LGBMClassifier(\n",
742 | " njobs = -1,\n",
743 | " n_estimators=10000,\n",
744 | " learning_rate=0.02,\n",
745 | " num_leaves=34,\n",
746 | " colsample_bytree=0.9497036,\n",
747 | " subsample=0.8715623,\n",
748 | " max_depth=8,\n",
749 | " reg_alpha=0.041545473,\n",
750 | " reg_lambda=0.0735294,\n",
751 | " min_split_gain=0.0222415,\n",
752 | " min_child_weight=39.3259775,\n",
753 | " silent=-1,\n",
754 | " verbose=-1, )\n",
755 | "\n",
756 | " clf.fit(train_x, train_y, eval_set = [(train_x, train_y), (valid_x, valid_y)], \n",
757 | " eval_metric = 'auc', verbose = 300, early_stopping_rounds = 200)\n",
758 | "\n",
759 | " oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]\n",
760 | " sub_preds += clf.predict_proba(test_df[feats], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits\n",
761 | "\n",
762 | " fold_importance_df = pd.DataFrame()\n",
763 | " fold_importance_df[\"feature\"] = feats\n",
764 | " fold_importance_df[\"importance\"] = clf.feature_importances_\n",
765 | " fold_importance_df[\"fold\"] = n_fold + 1\n",
766 | " feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)\n",
767 | " print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))\n",
768 | " del clf, train_x, train_y, valid_x, valid_y\n",
769 | " gc.collect()\n",
770 | "\n",
771 | " print('Full AUC score %.6f' % roc_auc_score(train_df['TARGET'], oof_preds))\n",
772 | " # Write submission file and plot feature importance\n",
773 | " if not debug:\n",
774 | " test_df['TARGET'] = sub_preds\n",
775 | " test_df[['SK_ID_CURR', 'TARGET']].to_csv(submission_file_name, index= False)\n",
776 | " display_importances(feature_importance_df)\n",
777 | " return feature_importance_df\n"
778 | ]
779 | },
780 | {
781 | "cell_type": "code",
782 | "execution_count": 13,
783 | "metadata": {
784 | "pycharm": {
785 | "is_executing": false,
786 | "name": "#%%\n"
787 | }
788 | },
789 | "outputs": [],
790 | "source": [
791 | "\n",
792 | "# Display/plot feature importance\n",
793 | "def display_importances(feature_importance_df_):\n",
794 | " cols = feature_importance_df_[[\"feature\", \"importance\"]].groupby(\"feature\").mean().sort_values(by=\"importance\", ascending=False)[:40].index\n",
795 | " best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]\n",
796 | " plt.figure(figsize=(8, 10))\n",
797 | " sns.barplot(x=\"importance\", y=\"feature\", data=best_features.sort_values(by=\"importance\", ascending=False))\n",
798 | " plt.title('LightGBM Features (avg over folds)')\n",
799 | " plt.tight_layout()\n",
800 | " plt.savefig('lgbm_importances01.png')"
801 | ]
802 | },
803 | {
804 | "cell_type": "code",
805 | "execution_count": null,
806 | "metadata": {
807 | "pycharm": {
808 | "is_executing": false,
809 | "name": "#%%\n"
810 | }
811 | },
812 | "outputs": [
813 | {
814 | "name": "stdout",
815 | "output_type": "stream",
816 | "text": [
817 | "4 columns were label encoded.\n",
818 | "Bureau df shape: (305811, 191)\n",
819 | "Process bureau and bureau_balance - done in 523s\n",
820 | "Previous applications df shape: (338856, 316)\n",
821 | "Process previous_applications - done in 79s\n",
822 | "Pos-cash balance df shape: (337252, 68)\n",
823 | "Process POS-CASH balance - done in 48s\n",
824 | "Installments payments df shape: (339587, 122)\n",
825 | "Process installments payments - done in 585s\n",
826 | "Credit card balance df shape: (103558, 146)\n",
827 | "Process credit card balance - done in 35s\n",
828 | "Starting LightGBM. Train shape: (307507, 1117), test shape: (48744, 1117)\n",
829 | "Training until validation scores don't improve for 200 rounds\n",
830 | "[300]\ttraining's auc: 0.811262\ttraining's binary_logloss: 0.228348\tvalid_1's auc: 0.785853\tvalid_1's binary_logloss: 0.24296\n",
831 | "[600]\ttraining's auc: 0.837004\ttraining's binary_logloss: 0.217781\tvalid_1's auc: 0.793038\tvalid_1's binary_logloss: 0.240394\n",
832 | "[900]\ttraining's auc: 0.854723\ttraining's binary_logloss: 0.210303\tvalid_1's auc: 0.795312\tvalid_1's binary_logloss: 0.239707\n",
833 | "[1200]\ttraining's auc: 0.86938\ttraining's binary_logloss: 0.203926\tvalid_1's auc: 0.796109\tvalid_1's binary_logloss: 0.239448\n",
834 | "[1500]\ttraining's auc: 0.881774\ttraining's binary_logloss: 0.19816\tvalid_1's auc: 0.796389\tvalid_1's binary_logloss: 0.239438\n",
835 | "Early stopping, best iteration is:\n",
836 | "[1360]\ttraining's auc: 0.876153\ttraining's binary_logloss: 0.200834\tvalid_1's auc: 0.796395\tvalid_1's binary_logloss: 0.23939\n",
837 | "Fold 1 AUC : 0.796395\n",
838 | "Training until validation scores don't improve for 200 rounds\n",
839 | "[300]\ttraining's auc: 0.811257\ttraining's binary_logloss: 0.229009\tvalid_1's auc: 0.785097\tvalid_1's binary_logloss: 0.238063\n",
840 | "[600]\ttraining's auc: 0.836687\ttraining's binary_logloss: 0.218527\tvalid_1's auc: 0.791366\tvalid_1's binary_logloss: 0.235552\n",
841 | "[900]\ttraining's auc: 0.854237\ttraining's binary_logloss: 0.211137\tvalid_1's auc: 0.793311\tvalid_1's binary_logloss: 0.234739\n",
842 | "[1200]\ttraining's auc: 0.868395\ttraining's binary_logloss: 0.204903\tvalid_1's auc: 0.793761\tvalid_1's binary_logloss: 0.234514\n",
843 | "[1500]\ttraining's auc: 0.881034\ttraining's binary_logloss: 0.199079\tvalid_1's auc: 0.794216\tvalid_1's binary_logloss: 0.234371\n",
844 | "[1800]\ttraining's auc: 0.892325\ttraining's binary_logloss: 0.193587\tvalid_1's auc: 0.79437\tvalid_1's binary_logloss: 0.234313\n",
845 | "Early stopping, best iteration is:\n",
846 | "[1817]\ttraining's auc: 0.892948\ttraining's binary_logloss: 0.19327\tvalid_1's auc: 0.79441\tvalid_1's binary_logloss: 0.234299\n",
847 | "Fold 2 AUC : 0.794410\n",
848 | "Training until validation scores don't improve for 200 rounds\n",
849 | "[300]\ttraining's auc: 0.812067\ttraining's binary_logloss: 0.228295\tvalid_1's auc: 0.774185\tvalid_1's binary_logloss: 0.243945\n",
850 | "[600]\ttraining's auc: 0.838057\ttraining's binary_logloss: 0.217674\tvalid_1's auc: 0.781359\tvalid_1's binary_logloss: 0.241458\n",
851 | "[900]\ttraining's auc: 0.855459\ttraining's binary_logloss: 0.21034\tvalid_1's auc: 0.783278\tvalid_1's binary_logloss: 0.240845\n",
852 | "[1200]\ttraining's auc: 0.869922\ttraining's binary_logloss: 0.203992\tvalid_1's auc: 0.7842\tvalid_1's binary_logloss: 0.240608\n",
853 | "[1500]\ttraining's auc: 0.882433\ttraining's binary_logloss: 0.198129\tvalid_1's auc: 0.784853\tvalid_1's binary_logloss: 0.24043\n",
854 | "[1800]\ttraining's auc: 0.893632\ttraining's binary_logloss: 0.192698\tvalid_1's auc: 0.785133\tvalid_1's binary_logloss: 0.240421\n",
855 | "Early stopping, best iteration is:\n",
856 | "[1636]\ttraining's auc: 0.88773\ttraining's binary_logloss: 0.195601\tvalid_1's auc: 0.785005\tvalid_1's binary_logloss: 0.240401\n",
857 | "Fold 3 AUC : 0.785005\n",
858 | "Training until validation scores don't improve for 200 rounds\n",
859 | "[300]\ttraining's auc: 0.81111\ttraining's binary_logloss: 0.228961\tvalid_1's auc: 0.785304\tvalid_1's binary_logloss: 0.238185\n",
860 | "[600]\ttraining's auc: 0.83684\ttraining's binary_logloss: 0.218454\tvalid_1's auc: 0.792566\tvalid_1's binary_logloss: 0.235349\n",
861 | "[900]\ttraining's auc: 0.854772\ttraining's binary_logloss: 0.210942\tvalid_1's auc: 0.794335\tvalid_1's binary_logloss: 0.234542\n",
862 | "[1200]\ttraining's auc: 0.869597\ttraining's binary_logloss: 0.204469\tvalid_1's auc: 0.794757\tvalid_1's binary_logloss: 0.234228\n",
863 | "[1500]\ttraining's auc: 0.881997\ttraining's binary_logloss: 0.19879\tvalid_1's auc: 0.795144\tvalid_1's binary_logloss: 0.23402\n",
864 | "[1800]\ttraining's auc: 0.893607\ttraining's binary_logloss: 0.193246\tvalid_1's auc: 0.794941\tvalid_1's binary_logloss: 0.234077\n",
865 | "Early stopping, best iteration is:\n",
866 | "[1704]\ttraining's auc: 0.89002\ttraining's binary_logloss: 0.194985\tvalid_1's auc: 0.795307\tvalid_1's binary_logloss: 0.233958\n"
867 | ]
868 | }
869 | ],
870 | "source": [
871 | "def main(debug = False):\n",
872 | " num_rows = 10000 if debug else None\n",
873 | " df = application_train_test(num_rows)\n",
874 | " with timer(\"Process bureau and bureau_balance\"):\n",
875 | " bureau = bureau_and_balance(num_rows)\n",
876 | " print(\"Bureau df shape:\", bureau.shape)\n",
877 | " df = df.join(bureau, how='left', on='SK_ID_CURR')\n",
878 | " del bureau\n",
879 | " gc.collect()\n",
880 | " with timer(\"Process previous_applications\"):\n",
881 | " prev = previous_applications(num_rows)\n",
882 | " print(\"Previous applications df shape:\", prev.shape)\n",
883 | " df = df.join(prev, how='left', on='SK_ID_CURR')\n",
884 | " del prev\n",
885 | " gc.collect()\n",
886 | " with timer(\"Process POS-CASH balance\"):\n",
887 | " pos = pos_cash(num_rows)\n",
888 | " print(\"Pos-cash balance df shape:\", pos.shape)\n",
889 | " df = df.join(pos, how='left', on='SK_ID_CURR')\n",
890 | " del pos\n",
891 | " gc.collect()\n",
892 | " with timer(\"Process installments payments\"):\n",
893 | " ins = installments_payments(num_rows)\n",
894 | " print(\"Installments payments df shape:\", ins.shape)\n",
895 | " df = df.join(ins, how='left', on='SK_ID_CURR')\n",
896 | " del ins\n",
897 | " gc.collect()\n",
898 | " with timer(\"Process credit card balance\"):\n",
899 | " cc = credit_card_balance(num_rows)\n",
900 | " print(\"Credit card balance df shape:\", cc.shape)\n",
901 | " df = df.join(cc, how='left', on='SK_ID_CURR')\n",
902 | " del cc\n",
903 | " gc.collect()\n",
904 | " with timer(\"Run LightGBM with kfold\"):\n",
905 | " feat_importance = kfold_lightgbm(df, num_folds= 10, stratified= False, debug= debug)\n",
906 | "\n",
907 | "if __name__ == \"__main__\":\n",
908 | " submission_file_name = \"outputs/predictions/merve_betul.csv\"\n",
909 | " with timer(\"Full model run\"):\n",
910 | " main(debug=True)"
911 | ]
912 | },
913 | {
914 | "cell_type": "code",
915 | "execution_count": null,
916 | "metadata": {
917 | "pycharm": {
918 | "is_executing": false,
919 | "name": "#%% Notes\n"
920 | }
921 | },
922 | "outputs": [],
923 | "source": [
924 | "# !pip install lightgbm=='2.1.2'\n",
925 | "# lightgbm.__version__\n",
926 | "\n",
927 | "# Full AUC score 0.793601 : pos,installments ve posrevious degistirildiginde\n"
928 | ]
929 | },
930 | {
931 | "cell_type": "code",
932 | "execution_count": null,
933 | "metadata": {},
934 | "outputs": [],
935 | "source": [
936 | "# 3309s"
937 | ]
938 | },
939 | {
940 | "cell_type": "code",
941 | "execution_count": null,
942 | "metadata": {},
943 | "outputs": [],
944 | "source": []
945 | },
946 | {
947 | "cell_type": "code",
948 | "execution_count": null,
949 | "metadata": {},
950 | "outputs": [],
951 | "source": []
952 | }
953 | ],
954 | "metadata": {
955 | "kernelspec": {
956 | "display_name": "Python 3",
957 | "language": "python",
958 | "name": "python3"
959 | },
960 | "language_info": {
961 | "codemirror_mode": {
962 | "name": "ipython",
963 | "version": 3
964 | },
965 | "file_extension": ".py",
966 | "mimetype": "text/x-python",
967 | "name": "python",
968 | "nbconvert_exporter": "python",
969 | "pygments_lexer": "ipython3",
970 | "version": "3.7.6"
971 | }
972 | },
973 | "nbformat": 4,
974 | "nbformat_minor": 4
975 | }
--------------------------------------------------------------------------------
/models/dsmlbc1/homeCreditRiskFinal.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Libraries"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 20,
13 | "metadata": {},
14 | "outputs": [
15 | {
16 | "ename": "ImportError",
17 | "evalue": "Something is wrong with the numpy installation. While importing we detected an older version of numpy in ['/Users/mvahit/anaconda3/lib/python3.7/site-packages/numpy']. One method of fixing this is to repeatedly uninstall numpy until none is found, then reinstall this version.",
18 | "output_type": "error",
19 | "traceback": [
20 | "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m",
21 | "\u001B[0;31mImportError\u001B[0m Traceback (most recent call last)",
22 | "\u001B[0;32m\u001B[0m in \u001B[0;36m\u001B[0;34m\u001B[0m\n\u001B[0;32m----> 1\u001B[0;31m \u001B[0;32mimport\u001B[0m \u001B[0mnumpy\u001B[0m \u001B[0;32mas\u001B[0m \u001B[0mnp\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 2\u001B[0m \u001B[0;32mimport\u001B[0m \u001B[0mpandas\u001B[0m \u001B[0;32mas\u001B[0m \u001B[0mpd\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 3\u001B[0m \u001B[0;32mimport\u001B[0m \u001B[0mlightgbm\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 4\u001B[0m \u001B[0;32mimport\u001B[0m \u001B[0mgc\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 5\u001B[0m \u001B[0;32mimport\u001B[0m \u001B[0mtime\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n",
23 | "\u001B[0;32m~/anaconda3/lib/python3.7/site-packages/numpy/__init__.py\u001B[0m in \u001B[0;36m\u001B[0;34m\u001B[0m\n\u001B[1;32m 140\u001B[0m \u001B[0;32mfrom\u001B[0m \u001B[0;34m.\u001B[0m \u001B[0;32mimport\u001B[0m \u001B[0m_distributor_init\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 141\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m--> 142\u001B[0;31m \u001B[0;32mfrom\u001B[0m \u001B[0;34m.\u001B[0m \u001B[0;32mimport\u001B[0m \u001B[0mcore\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 143\u001B[0m \u001B[0;32mfrom\u001B[0m \u001B[0;34m.\u001B[0m\u001B[0mcore\u001B[0m \u001B[0;32mimport\u001B[0m \u001B[0;34m*\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 144\u001B[0m \u001B[0;32mfrom\u001B[0m \u001B[0;34m.\u001B[0m \u001B[0;32mimport\u001B[0m \u001B[0mcompat\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n",
24 | "\u001B[0;32m~/anaconda3/lib/python3.7/site-packages/numpy/core/__init__.py\u001B[0m in \u001B[0;36m\u001B[0;34m\u001B[0m\n\u001B[1;32m 72\u001B[0m \u001B[0;34m\"numpy in {}. One method of fixing this is to repeatedly uninstall \"\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 73\u001B[0m \"numpy until none is found, then reinstall this version.\")\n\u001B[0;32m---> 74\u001B[0;31m \u001B[0;32mraise\u001B[0m \u001B[0mImportError\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mmsg\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mformat\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mpath\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 75\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 76\u001B[0m \u001B[0;32mfrom\u001B[0m \u001B[0;34m.\u001B[0m \u001B[0;32mimport\u001B[0m \u001B[0mnumerictypes\u001B[0m \u001B[0;32mas\u001B[0m \u001B[0mnt\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n",
25 | "\u001B[0;31mImportError\u001B[0m: Something is wrong with the numpy installation. While importing we detected an older version of numpy in ['/Users/mvahit/anaconda3/lib/python3.7/site-packages/numpy']. One method of fixing this is to repeatedly uninstall numpy until none is found, then reinstall this version."
26 | ]
27 | }
28 | ],
29 | "source": [
30 | "import numpy as np\n",
31 | "import pandas as pd\n",
32 | "import lightgbm\n",
33 | "import gc\n",
34 | "import time\n",
35 | "from contextlib import contextmanager\n",
36 | "from lightgbm import LGBMClassifier\n",
37 | "from sklearn.metrics import roc_auc_score\n",
38 | "from sklearn.model_selection import KFold, StratifiedKFold\n",
39 | "import matplotlib.pyplot as plt\n",
40 | "import seaborn as sns\n",
41 | "from sklearn.preprocessing import LabelEncoder\n",
42 | "from sklearn.model_selection import GridSearchCV\n",
43 | "\n",
44 | "import warnings\n",
45 | "warnings.filterwarnings(\"ignore\", category=DeprecationWarning) \n",
46 | "warnings.filterwarnings(\"ignore\", category=FutureWarning) \n",
47 | "warnings.filterwarnings(\"ignore\", category=UserWarning) "
48 | ]
49 | },
50 | {
51 | "cell_type": "markdown",
52 | "metadata": {},
53 | "source": [
54 | "# Helper Functions"
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": 50,
60 | "metadata": {},
61 | "outputs": [],
62 | "source": [
63 | "@contextmanager\n",
64 | "def timer(title):\n",
65 | " t0 = time.time()\n",
66 | " yield\n",
67 | " print(\"{} - done in {:.0f}s\".format(title, time.time() - t0))"
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": 51,
73 | "metadata": {},
74 | "outputs": [],
75 | "source": [
76 | "# Display/plot feature importance\n",
77 | "def display_importances(feature_importance_df_):\n",
78 | " cols = feature_importance_df_[[\"feature\", \"importance\"]].groupby(\"feature\").mean().sort_values(by=\"importance\", ascending=False)[:100].index\n",
79 | " best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]\n",
80 | " plt.figure(figsize=(15, 20))\n",
81 | " sns.barplot(x=\"importance\", y=\"feature\", data=best_features.sort_values(by=\"importance\", ascending=False))\n",
82 | " plt.title('LightGBM Features (avg over folds)')\n",
83 | " plt.tight_layout()\n",
84 | " plt.savefig('lgbm_importances01.png')"
85 | ]
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": 52,
90 | "metadata": {},
91 | "outputs": [
92 | {
93 | "name": "stdout",
94 | "output_type": "stream",
95 | "text": [
96 | " \n",
97 | " \n",
98 | " \n",
99 | " cience I_ cience I_ \n",
100 | " a_Science I_Love_ a_Science I_Love_ \n",
101 | " ta_Science I_Love_Data_Science I_Love_Dat \n",
102 | " ta_Science I_Love_Data_Science I_Love_Data_ \n",
103 | " ta_Science I_Love_Data_Science I_Love_Data_Sc \n",
104 | " a_Science I_Love_Data_Science I_Love_Data_Sci \n",
105 | " _Science I_Love_Data_Science I_Love_Data_Scie \n",
106 | " Science I_Love_Data_Science I_Love_Data_Scien \n",
107 | " cience I_Love_Data_Science I_Love_Data_Scienc \n",
108 | " ience I_Love_Data_Science I_Love_Data_Science \n",
109 | " nce I_Love_Data_Science I_Love_Data_Science \n",
110 | " e I_Love_Data_Science I_Love_Data_Science \n",
111 | " I_Love_Data_Science I_Love_Data_Science \n",
112 | " Love_Data_Science I_Love_Data_Science \n",
113 | " ve_Data_Science I_Love_Data_Science \n",
114 | " _Data_Science I_Love_Data_Science \n",
115 | " ta_Science I_Love_Data_Scienc \n",
116 | " Science I_Love_Data_Scien \n",
117 | " ence I_Love_Data_Scie \n",
118 | " I_Love_Data_Sc \n",
119 | " ove_Data_ \n",
120 | " Dat \n",
121 | " t \n",
122 | " \n",
123 | " \n",
124 | " \n",
125 | " \n"
126 | ]
127 | }
128 | ],
129 | "source": [
130 | "print('\\n'.join([''.join([(' I_Love_Data_Science_'[(x-y) % len('I_Love_Data_Science_')] if ((x*0.05)**2+(y*0.1)**2-1)**3-(x*0.05)**2*(y*0.1)**3 <= 0 else ' ') for x in range(-30, 30)]) for y in range(15, -15, -1)]))"
131 | ]
132 | },
133 | {
134 | "cell_type": "markdown",
135 | "metadata": {},
136 | "source": [
137 | "# application_train"
138 | ]
139 | },
140 | {
141 | "cell_type": "code",
142 | "execution_count": 53,
143 | "metadata": {},
144 | "outputs": [],
145 | "source": [
146 | "def application_train():\n",
147 | "\n",
148 | " df = pd.read_csv('data/application_train.csv')\n",
149 | " test_df = pd.read_csv('data/application_test.csv')\n",
150 | "\n",
151 | " df = df.append(test_df).reset_index()\n",
152 | " df = df[df['CODE_GENDER'] != 'XNA']\n",
153 | "\n",
154 | " lbe = LabelEncoder()\n",
155 | "\n",
156 | " for col in ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY']:\n",
157 | " df[col] = lbe.fit_transform(df[col])\n",
158 | "\n",
159 | " df = pd.get_dummies(df, dummy_na = True)\n",
160 | "\n",
161 | " df['DAYS_EMPLOYED'].replace(365243, np.nan, inplace = True)\n",
162 | " df['NEW_DAYS_EMPLOYED_PERC'] = df['DAYS_EMPLOYED'] / df['DAYS_BIRTH']\n",
163 | " df['NEW_INCOME_CREDIT_PERC'] = df['AMT_INCOME_TOTAL'] / df['AMT_CREDIT']\n",
164 | " df['NEW_INCOME_PER_PERSON'] = df['AMT_INCOME_TOTAL'] / df['CNT_FAM_MEMBERS']\n",
165 | " df['NEW_ANNUITY_INCOME_PERC'] = df['AMT_ANNUITY'] / df['AMT_INCOME_TOTAL']\n",
166 | " df['NEW_PAYMENT_RATE'] = df['AMT_ANNUITY'] / df['AMT_CREDIT']\n",
167 | "\n",
168 | " df.drop(\"index\", axis = 1, inplace = True)\n",
169 | "\n",
170 | " df.columns = pd.Index([\"APP_\" + col for col in df.columns.tolist()])\n",
171 | "\n",
172 | " df.rename(columns={\"APP_SK_ID_CURR\":\"SK_ID_CURR\"}, inplace = True)\n",
173 | "\n",
174 | " df.rename(columns={\"APP_TARGET\":\"TARGET\"}, inplace = True)\n",
175 | " \n",
176 | " return df"
177 | ]
178 | },
179 | {
180 | "cell_type": "markdown",
181 | "metadata": {},
182 | "source": [
183 | "# bureau & bureau_balance"
184 | ]
185 | },
186 | {
187 | "cell_type": "code",
188 | "execution_count": 54,
189 | "metadata": {},
190 | "outputs": [],
191 | "source": [
192 | "def bureau_bb():\n",
193 | "\n",
194 | " #bureau_balance tablosunun okutulması\n",
195 | "\n",
196 | " bb = pd.read_csv('data/bureau_balance.csv')\n",
197 | " bb = pd.get_dummies(bb, dummy_na = True)\n",
198 | "\n",
199 | " agg_list = {\"MONTHS_BALANCE\":\"count\",\n",
200 | " \"STATUS_0\":[\"sum\",\"mean\"],\n",
201 | " \"STATUS_1\":[\"sum\"],\n",
202 | " \"STATUS_2\":[\"sum\"],\n",
203 | " \"STATUS_3\":[\"sum\"],\n",
204 | " \"STATUS_4\":[\"sum\"],\n",
205 | " \"STATUS_5\":[\"sum\"],\n",
206 | " \"STATUS_C\":[\"sum\",\"mean\"],\n",
207 | " \"STATUS_X\":[\"sum\",\"mean\"] }\n",
208 | "\n",
209 | " bb_agg = bb.groupby(\"SK_ID_BUREAU\").agg(agg_list)\n",
210 | "\n",
211 | " # Degisken isimlerinin yeniden adlandirilmasi \n",
212 | " bb_agg.columns = pd.Index([col[0] + \"_\" + col[1].upper() for col in bb_agg.columns.tolist()])\n",
213 | "\n",
214 | " # Status_sum ile ilgili yeni bir degisken olusturma\n",
215 | " bb_agg['NEW_STATUS_SCORE'] = bb_agg['STATUS_1_SUM'] + bb_agg['STATUS_2_SUM']^2 + bb_agg['STATUS_3_SUM']^3 + bb_agg['STATUS_4_SUM']^4 + bb_agg['STATUS_5_SUM']^5\n",
216 | "\n",
217 | " bb_agg.drop(['STATUS_1_SUM','STATUS_2_SUM','STATUS_3_SUM','STATUS_4_SUM','STATUS_5_SUM'], axis=1,inplace=True)\n",
218 | "\n",
219 | " bureau = pd.read_csv('data/bureau.csv')\n",
220 | " bureau_and_bb = bureau.join(bb_agg, how='left', on='SK_ID_BUREAU')\n",
221 | "\n",
222 | " #BUREAU BALANCE VE BUREAU ORTAK TABLO\n",
223 | "\n",
224 | " #CREDIT_TYPE degiskeninin sinif sayisini 3'e düsürmek \n",
225 | " bureau_and_bb['CREDIT_TYPE'] = bureau_and_bb['CREDIT_TYPE'].replace(['Car loan',\n",
226 | " 'Mortgage',\n",
227 | " 'Microloan',\n",
228 | " 'Loan for business development', \n",
229 | " 'Another type of loan',\n",
230 | " 'Unknown type of loan', \n",
231 | " 'Loan for working capital replenishment',\n",
232 | " \"Loan for purchase of shares (margin lending)\", \n",
233 | " 'Cash loan (non-earmarked)', \n",
234 | " 'Real estate loan',\n",
235 | " \"Loan for the purchase of equipment\", \n",
236 | " \"Interbank credit\", \n",
237 | " \"Mobile operator loan\"], 'Rare')\n",
238 | "\n",
239 | "\n",
240 | " #CREDIT_ACTIVE degiskeninin sinif sayisini 2'ye düsürmek (Sold' u Closed a dahil etmek daha mi uygun olur ???)\n",
241 | " bureau_and_bb['CREDIT_ACTIVE'] = bureau_and_bb['CREDIT_ACTIVE'].replace(['Bad debt','Sold'], 'Active')\n",
242 | "\n",
243 | " # bureau_bb tablosundaki kategorik degiskenlere One Hot Encoding uygulanmasi\n",
244 | " bureau_and_bb = pd.get_dummies(bureau_and_bb, columns = [\"CREDIT_TYPE\",\"CREDIT_ACTIVE\"])\n",
245 | "\n",
246 | " # CREDIT_CURRENCY degiskeninin %99u currency1, bu sebeple ayirt ediciligi olmayacagini dusundugumuz icin sildik \n",
247 | " bureau_and_bb.drop([\"SK_ID_BUREAU\",\"CREDIT_CURRENCY\"], inplace = True, axis = 1)\n",
248 | "\n",
249 | "\n",
250 | " #NEW FEATURES\n",
251 | "\n",
252 | " #ortalama kac aylık kredi aldıgını gösteren yeni degisken\n",
253 | " bureau_and_bb[\"NEW_MONTHS_CREDIT\"]= round((bureau_and_bb.DAYS_CREDIT_ENDDATE - bureau_and_bb.DAYS_CREDIT)/30)\n",
254 | "\n",
255 | " agg_list = {\n",
256 | " \"SK_ID_CURR\":[\"count\"],\n",
257 | " \"DAYS_CREDIT\":[\"min\",\"max\"],\n",
258 | " \"CREDIT_DAY_OVERDUE\":[\"sum\",\"mean\",\"max\"], \n",
259 | " \"DAYS_CREDIT_ENDDATE\":[\"max\",\"min\"],\n",
260 | " \"DAYS_ENDDATE_FACT\":[\"max\",\"min\"],\n",
261 | " \"AMT_CREDIT_MAX_OVERDUE\":[\"mean\",\"max\",\"min\"],\n",
262 | " \"CNT_CREDIT_PROLONG\":[\"sum\",\"mean\",\"max\",\"min\"],\n",
263 | " \"AMT_CREDIT_SUM\":[\"mean\",\"max\",\"min\"], \n",
264 | " \"AMT_CREDIT_SUM_DEBT\":[\"sum\",\"mean\",\"max\"],\n",
265 | " \"AMT_CREDIT_SUM_LIMIT\":[\"sum\",\"mean\",\"max\"],\n",
266 | " 'AMT_CREDIT_SUM_OVERDUE':[\"sum\",\"mean\",\"max\"], \n",
267 | " 'DAYS_CREDIT_UPDATE':[\"max\",\"min\"],\n",
268 | " 'AMT_ANNUITY':[\"sum\",\"mean\"],\n",
269 | " 'MONTHS_BALANCE_COUNT':[\"sum\"], \n",
270 | " 'STATUS_0_SUM':[\"sum\"], \n",
271 | " 'STATUS_0_MEAN':[\"mean\"], \n",
272 | " 'STATUS_C_SUM':[\"sum\"], \n",
273 | " 'STATUS_C_MEAN':[\"mean\"],\n",
274 | " 'CREDIT_ACTIVE_Active':[\"sum\",\"mean\"], \n",
275 | " 'CREDIT_ACTIVE_Closed':[\"sum\",\"mean\"], \n",
276 | " 'CREDIT_TYPE_Rare':[\"sum\",\"mean\"], \n",
277 | " 'CREDIT_TYPE_Consumer credit':[\"sum\",\"mean\"], \n",
278 | " 'CREDIT_TYPE_Credit card':[\"sum\",\"mean\"],\n",
279 | " \"NEW_MONTHS_CREDIT\":[\"count\",\"sum\",\"mean\",\"max\",\"min\"]}\n",
280 | "\n",
281 | "\n",
282 | " # bureau_bb_agg tablosuna aggreagation islemlerinin uygulanamasi \n",
283 | " bureau_and_bb_agg = bureau_and_bb.groupby(\"SK_ID_CURR\").agg(agg_list).reset_index()\n",
284 | "\n",
285 | "\n",
286 | " # Degisken isimlerinin yeniden adlandirilmasi \n",
287 | " bureau_and_bb_agg.columns = pd.Index([\"BB_\" + col[0] + \"_\" + col[1].upper() for col in bureau_and_bb_agg.columns.tolist()])\n",
288 | "\n",
289 | " # kisinin aldıgı en yuksek ve en dusuk kredinin farkını gösteren yeni degisken\n",
290 | " bureau_and_bb_agg[\"BB_NEW_AMT_CREDIT_SUM_RANGE\"] = bureau_and_bb_agg[\"BB_AMT_CREDIT_SUM_MAX\"] - bureau_and_bb_agg[\"BB_AMT_CREDIT_SUM_MIN\"]\n",
291 | "\n",
292 | " # ortalama kac ayda bir kredi cektigini ifade eden yeni degisken\n",
293 | " bureau_and_bb_agg[\"BB_NEW_DAYS_CREDIT_RANGE\"]= round((bureau_and_bb_agg[\"BB_DAYS_CREDIT_MAX\"] - bureau_and_bb_agg[\"BB_DAYS_CREDIT_MIN\"])/(30 * bureau_and_bb_agg[\"BB_SK_ID_CURR_COUNT\"]))\n",
294 | "\n",
295 | "\n",
296 | " # Bureau: Active credits - using only numerical aggregations\n",
297 | " agg_list = {\n",
298 | " 'DAYS_CREDIT': ['min', 'max', 'mean', 'var'],\n",
299 | " 'DAYS_CREDIT_ENDDATE': ['min', 'max', 'mean'],\n",
300 | " 'DAYS_CREDIT_UPDATE': ['mean'],\n",
301 | " 'CREDIT_DAY_OVERDUE': ['max', 'mean'],\n",
302 | " 'AMT_CREDIT_MAX_OVERDUE': ['mean'],\n",
303 | " 'AMT_CREDIT_SUM': ['max', 'mean', 'sum'],\n",
304 | " 'AMT_CREDIT_SUM_DEBT': ['max', 'mean', 'sum'],\n",
305 | " 'AMT_CREDIT_SUM_OVERDUE': ['mean'],\n",
306 | " 'AMT_CREDIT_SUM_LIMIT': ['mean', 'sum'],\n",
307 | " 'AMT_ANNUITY': ['max', 'mean'],\n",
308 | " 'CNT_CREDIT_PROLONG': ['sum']\n",
309 | " }\n",
310 | "\n",
311 | "\n",
312 | " active = bureau_and_bb[bureau_and_bb['CREDIT_ACTIVE_Active'] == 1]\n",
313 | " active_agg = active.groupby('SK_ID_CURR').agg(agg_list)\n",
314 | " active_agg.columns = pd.Index(['BB_NEW_ACTIVE_' + e[0] + \"_\" + e[1].upper() for e in active_agg.columns.tolist()])\n",
315 | " bureau_and_bb_agg.rename(columns = {'BB_SK_ID_CURR_': 'SK_ID_CURR'}, inplace = True)\n",
316 | " bureau_and_bb_agg = bureau_and_bb_agg.join(active_agg, how='left', on='SK_ID_CURR')\n",
317 | "\n",
318 | " # Bureau: Closed credits - using only numerical aggregations\n",
319 | " closed = bureau_and_bb[bureau_and_bb['CREDIT_ACTIVE_Closed'] == 1]\n",
320 | " closed_agg = closed.groupby('SK_ID_CURR').agg(agg_list)\n",
321 | " closed_agg.columns = pd.Index(['BB_NEW_CLOSED_' + e[0] + \"_\" + e[1].upper() for e in closed_agg.columns.tolist()])\n",
322 | " bureau_and_bb_agg = bureau_and_bb_agg.join(closed_agg, how='left', on='SK_ID_CURR')\n",
323 | " \n",
324 | " return bureau_and_bb_agg"
325 | ]
326 | },
327 | {
328 | "cell_type": "markdown",
329 | "metadata": {},
330 | "source": [
331 | "# installments_payments"
332 | ]
333 | },
334 | {
335 | "cell_type": "code",
336 | "execution_count": 55,
337 | "metadata": {},
338 | "outputs": [],
339 | "source": [
340 | "def installments_payments():\n",
341 | "\n",
342 | " #Read the installments_payments.csv\n",
343 | " ins = pd.read_csv('data/installments_payments.csv')\n",
344 | "\n",
345 | " ins['NEW_DAYS_PAID_EARLIER'] = ins['DAYS_INSTALMENT']-ins['DAYS_ENTRY_PAYMENT']\n",
346 | "\n",
347 | " # Her bir taksit ödemesinin gec olup olmama durumu 1: gec ödedi 0: erken ödemeyi temsil eder\n",
348 | " ins['NEW_NUM_PAID_LATER'] = ins['NEW_DAYS_PAID_EARLIER'].map(lambda x: 1 if x<0 else 0)\n",
349 | "\n",
350 | " # Agrregation ve degisken tekillestirme\n",
351 | " agg_list = {'NUM_INSTALMENT_VERSION':['nunique'],\n",
352 | " 'NUM_INSTALMENT_NUMBER':'max',\n",
353 | " 'DAYS_INSTALMENT':['min','max'],\n",
354 | " 'DAYS_ENTRY_PAYMENT':['min','max'],\n",
355 | " 'AMT_INSTALMENT':['min','max','sum','mean'],\n",
356 | " 'AMT_PAYMENT':['min','max','sum','mean'],\n",
357 | " 'NEW_DAYS_PAID_EARLIER':'mean',\n",
358 | " 'NEW_NUM_PAID_LATER':'sum'}\n",
359 | "\n",
360 | "\n",
361 | " ins_agg = ins.groupby('SK_ID_PREV').agg(agg_list)\n",
362 | "\n",
363 | "\n",
364 | " # Multi index problemi cözümü\n",
365 | " ins_agg.columns = pd.Index([\"INS_\" + e[0] + '_' + e[1].upper() for e in ins_agg.columns.tolist()])\n",
366 | "\n",
367 | " # drop variables \n",
368 | " ins_agg.drop(['INS_DAYS_INSTALMENT_MIN',\n",
369 | " 'INS_DAYS_INSTALMENT_MAX',\n",
370 | " 'INS_DAYS_ENTRY_PAYMENT_MIN',\n",
371 | " 'INS_DAYS_ENTRY_PAYMENT_MAX'],axis=1,inplace=True)\n",
372 | "\n",
373 | " # Kredi ödeme yüzdesi ve toplam kalan borc\n",
374 | " ins_agg['INS_NEW_PAYMENT_PERC'] = ins_agg['INS_AMT_PAYMENT_SUM'] / ins_agg['INS_AMT_INSTALMENT_SUM']\n",
375 | " ins_agg['INS_NEW_PAYMENT_DIFF'] = ins_agg['INS_AMT_INSTALMENT_SUM'] - ins_agg['INS_AMT_PAYMENT_SUM']\n",
376 | " \n",
377 | " agg_list_previous_application = {}\n",
378 | " \n",
379 | " for col in ins_agg.columns:\n",
380 | " agg_list_previous_application[col] = ['mean',\"min\",\"max\",\"sum\"]\n",
381 | " \n",
382 | " ins_agg.reset_index(inplace = True) \n",
383 | " \n",
384 | " return agg_list_previous_application, ins_agg"
385 | ]
386 | },
387 | {
388 | "cell_type": "markdown",
389 | "metadata": {},
390 | "source": [
391 | "# pos_cash_balance"
392 | ]
393 | },
394 | {
395 | "cell_type": "code",
396 | "execution_count": null,
397 | "metadata": {},
398 | "outputs": [],
399 | "source": []
400 | },
401 | {
402 | "cell_type": "code",
403 | "execution_count": 56,
404 | "metadata": {},
405 | "outputs": [],
406 | "source": [
407 | "def pos_cash_balance(agg_list_previous_application):\n",
408 | "\n",
409 | " pos = pd.read_csv('data/POS_CASH_balance.csv')\n",
410 | " # Kategorik Degiskenimizi Dummy Degiskenine Dönüstürme\n",
411 | " pos = pd.get_dummies(pos, columns=['NAME_CONTRACT_STATUS'], dummy_na = True)\n",
412 | " # Aggregation Islemi - Tekillestirme\n",
413 | " agg_list = {'MONTHS_BALANCE':['min','max'],\n",
414 | " 'CNT_INSTALMENT':['min','max'],\n",
415 | " 'CNT_INSTALMENT_FUTURE':['min','max'],\n",
416 | " 'SK_DPD':['max','mean'],\n",
417 | " 'SK_DPD_DEF':['max','mean'],\n",
418 | " 'NAME_CONTRACT_STATUS_Active':'sum',\n",
419 | " 'NAME_CONTRACT_STATUS_Amortized debt':'sum',\n",
420 | " 'NAME_CONTRACT_STATUS_Approved':'sum',\n",
421 | " 'NAME_CONTRACT_STATUS_Canceled':'sum',\n",
422 | " 'NAME_CONTRACT_STATUS_Completed':'sum',\n",
423 | " 'NAME_CONTRACT_STATUS_Demand':'sum',\n",
424 | " 'NAME_CONTRACT_STATUS_Returned to the store':'sum',\n",
425 | " 'NAME_CONTRACT_STATUS_Signed':'sum',\n",
426 | " 'NAME_CONTRACT_STATUS_XNA':'sum',\n",
427 | " 'NAME_CONTRACT_STATUS_nan':'sum'\n",
428 | " }\n",
429 | "\n",
430 | " pos_agg = pos.groupby('SK_ID_PREV').agg(agg_list)\n",
431 | "\n",
432 | " # Multilayer index'i tek boyutlu index'e dönüstürme\n",
433 | " pos_agg.columns= pd.Index([\"POS_\" + e[0] + '_' + e[1].upper() for e in pos_agg.columns.tolist()])\n",
434 | "\n",
435 | " # SK_DPD kac kredide 0 olma durumu (SK_DPD MAX alacagiz 0 durumunu veriyor) \n",
436 | " # SK_DPD_DEF (SK_DPD_DEF_MAX sifir olma durumunu veriyor)\n",
437 | " # CNT_INSTALMENT_FUTURE_MIN==0 oldugunda NAME_CONTRACT_STATUS_Completed_SUM==0 olma durumu \n",
438 | "\n",
439 | " pos_agg['POS_NEW_IS_CREDIT_NOT_COMPLETED_ON_TIME']= (pos_agg['POS_CNT_INSTALMENT_FUTURE_MIN']==0) & (pos_agg['POS_NAME_CONTRACT_STATUS_Completed_SUM']==0)\n",
440 | "\n",
441 | "\n",
442 | " # 1:kredi zamaninda kapanmamis 0:kredi zamaninda kapanmis\n",
443 | "\n",
444 | " pos_agg['POS_NEW_IS_CREDIT_NOT_COMPLETED_ON_TIME']=pos_agg['POS_NEW_IS_CREDIT_NOT_COMPLETED_ON_TIME'].astype(int)\n",
445 | "\n",
446 | " pos_agg.drop(['POS_NAME_CONTRACT_STATUS_Approved_SUM',\n",
447 | " 'POS_NAME_CONTRACT_STATUS_Amortized debt_SUM',\n",
448 | " 'POS_NAME_CONTRACT_STATUS_Canceled_SUM',\n",
449 | " 'POS_NAME_CONTRACT_STATUS_Returned to the store_SUM',\n",
450 | " 'POS_NAME_CONTRACT_STATUS_Signed_SUM',\n",
451 | " 'POS_NAME_CONTRACT_STATUS_XNA_SUM',\n",
452 | " 'POS_NAME_CONTRACT_STATUS_nan_SUM'],axis=1,inplace=True)\n",
453 | "\n",
454 | " for col in pos_agg.columns:\n",
455 | " agg_list_previous_application[col] = ['mean',\"min\",\"max\",\"sum\"]\n",
456 | "\n",
457 | " pos_agg.reset_index(inplace = True) \n",
458 | " \n",
459 | " return agg_list_previous_application, pos_agg"
460 | ]
461 | },
462 | {
463 | "cell_type": "markdown",
464 | "metadata": {},
465 | "source": [
466 | "# credit_card_balance"
467 | ]
468 | },
469 | {
470 | "cell_type": "code",
471 | "execution_count": 57,
472 | "metadata": {},
473 | "outputs": [],
474 | "source": [
475 | "def credit_card_balance():\n",
476 | "\n",
477 | " CCB = pd.read_csv('data/credit_card_balance.csv')\n",
478 | "\n",
479 | " CCB = pd.get_dummies(CCB, columns= ['NAME_CONTRACT_STATUS'] ) # artik tumu sayisal \n",
480 | "\n",
481 | " dropthis = ['NAME_CONTRACT_STATUS_Approved', 'NAME_CONTRACT_STATUS_Demand',\n",
482 | " 'NAME_CONTRACT_STATUS_Refused', 'NAME_CONTRACT_STATUS_Sent proposal',\n",
483 | " 'NAME_CONTRACT_STATUS_Signed' ]\n",
484 | "\n",
485 | " CCB = CCB.drop(dropthis, axis=1)\n",
486 | "\n",
487 | " grp = CCB.groupby(by = ['SK_ID_CURR'])['SK_ID_PREV'].nunique().reset_index().rename(index = str, columns = {'SK_ID_PREV': 'NUMBER_OF_LOANS_PER_CUSTOMER'})\n",
488 | " CCB = CCB.merge(grp, on = ['SK_ID_CURR'], how = 'left')\n",
489 | "\n",
490 | " grp = CCB.groupby(by = ['SK_ID_CURR', 'SK_ID_PREV'])['CNT_INSTALMENT_MATURE_CUM'].max().reset_index().rename(index = str, columns = {'CNT_INSTALMENT_MATURE_CUM': 'NUMBER_OF_INSTALMENTS'})\n",
491 | " grp1 = grp.groupby(by = ['SK_ID_CURR'])['NUMBER_OF_INSTALMENTS'].sum().reset_index().rename(index = str, columns = {'NUMBER_OF_INSTALMENTS': 'TOTAL_INSTALMENTS_OF_ALL_LOANS'})\n",
492 | " CCB = CCB.merge(grp1, on = ['SK_ID_CURR'], how = 'left')\n",
493 | "\n",
494 | " CCB['INSTALLMENTS_PER_LOAN'] = (CCB['TOTAL_INSTALMENTS_OF_ALL_LOANS']/CCB['NUMBER_OF_LOANS_PER_CUSTOMER']).astype('uint32')\n",
495 | "\n",
496 | "\n",
497 | " # Bu fonksiyon, kac defa odemelerin geciktigini hesaplar\n",
498 | " # Function to calculate number of times Days Past Due occurred\n",
499 | " def geciken_gun_hesapla(DPD):\n",
500 | "\n",
501 | " # DPD ile beklenen bir seri: SK_DPD degiskeninin her bir prev_app daki gecmis kredi icin olan degerleri\n",
502 | " # DPD is a series of values of SK_DPD for each of the groupby combination\n",
503 | " # We convert it to a list to get the number of SK_DPD values NOT EQUALS ZERO\n",
504 | " x = DPD.tolist()\n",
505 | " c = 0\n",
506 | " for i,j in enumerate(x):\n",
507 | " if j != 0:\n",
508 | " c += 1 \n",
509 | " return c \n",
510 | "\n",
511 | " grp = CCB.groupby(by = ['SK_ID_CURR', 'SK_ID_PREV']).apply(lambda x: geciken_gun_hesapla(x.SK_DPD)).reset_index().rename(index = str, columns = {0: 'NUMBER_OF_DPD'})\n",
512 | " grp1 = grp.groupby(by = ['SK_ID_CURR'])['NUMBER_OF_DPD'].mean().reset_index().rename(index = str, columns = {'NUMBER_OF_DPD' : 'DPD_COUNT'})\n",
513 | "\n",
514 | " CCB = CCB.merge(grp1, on = ['SK_ID_CURR'], how = 'left')\n",
515 | "\n",
516 | "\n",
517 | " def f(min_pay, total_pay):\n",
518 | "\n",
519 | " M = min_pay.tolist()\n",
520 | " T = total_pay.tolist()\n",
521 | " P = len(M) # P: taksit sayisi\n",
522 | " c = 0 \n",
523 | " # Find the count of transactions when Payment made is less than Minimum Payment \n",
524 | " for i in range(len(M)):\n",
525 | " if T[i] < M[i]:\n",
526 | " c += 1 \n",
527 | " return (100*c)/P\n",
528 | "\n",
529 | " grp = CCB.groupby(by = ['SK_ID_CURR']).apply(lambda x: f(x.AMT_INST_MIN_REGULARITY, x.AMT_PAYMENT_CURRENT)).reset_index().rename(index = str, columns = { 0 : 'PERCENTAGE_MIN_MISSED_PAYMENTS'})\n",
530 | " CCB = CCB.merge(grp, on = ['SK_ID_CURR'], how = 'left')\n",
531 | "\n",
532 | " grp = CCB.groupby(by = ['SK_ID_CURR'])['AMT_DRAWINGS_ATM_CURRENT'].sum().reset_index().rename(index = str, columns = {'AMT_DRAWINGS_ATM_CURRENT' : 'DRAWINGS_ATM'})\n",
533 | " CCB = CCB.merge(grp, on = ['SK_ID_CURR'], how = 'left')\n",
534 | "\n",
535 | "\n",
536 | " grp = CCB.groupby(by = ['SK_ID_CURR'])['AMT_DRAWINGS_CURRENT'].sum().reset_index().rename(index = str, columns = {'AMT_DRAWINGS_CURRENT' : 'DRAWINGS_TOTAL'})\n",
537 | " CCB = CCB.merge(grp, on = ['SK_ID_CURR'], how = 'left')\n",
538 | "\n",
539 | "\n",
540 | " CCB['CASH_CARD_RATIO1'] = (CCB['DRAWINGS_ATM']/CCB['DRAWINGS_TOTAL'])*100 # ATM den cektigi nakit / toplam cektigi\n",
541 | " del CCB['DRAWINGS_ATM']\n",
542 | " del CCB['DRAWINGS_TOTAL']\n",
543 | "\n",
544 | " grp = CCB.groupby(by = ['SK_ID_CURR'])['CASH_CARD_RATIO1'].mean().reset_index().rename(index = str, columns ={ 'CASH_CARD_RATIO1' : 'CASH_CARD_RATIO'})\n",
545 | " CCB = CCB.merge(grp, on = ['SK_ID_CURR'], how = 'left')\n",
546 | "\n",
547 | "\n",
548 | " grp = CCB.groupby(by = ['SK_ID_CURR'])['AMT_DRAWINGS_CURRENT'].sum().reset_index().rename(index = str, columns = {'AMT_DRAWINGS_CURRENT' : 'TOTAL_DRAWINGS'})\n",
549 | " CCB = CCB.merge(grp, on = ['SK_ID_CURR'], how = 'left')\n",
550 | "\n",
551 | "\n",
552 | " grp = CCB.groupby(by = ['SK_ID_CURR'])['CNT_DRAWINGS_CURRENT'].sum().reset_index().rename(index = str, columns = {'CNT_DRAWINGS_CURRENT' : 'NUMBER_OF_DRAWINGS'})\n",
553 | " CCB = CCB.merge(grp, on = ['SK_ID_CURR'], how = 'left')\n",
554 | "\n",
555 | "\n",
556 | " CCB['DRAWINGS_RATIO1'] = (CCB['TOTAL_DRAWINGS']/CCB['NUMBER_OF_DRAWINGS'])*100 # yuzdelik degil, genisleme yapmis\n",
557 | " del CCB['TOTAL_DRAWINGS']\n",
558 | " del CCB['NUMBER_OF_DRAWINGS']\n",
559 | "\n",
560 | "\n",
561 | " grp = CCB.groupby(by = ['SK_ID_CURR'])['DRAWINGS_RATIO1'].mean().reset_index().rename(index = str, columns ={ 'DRAWINGS_RATIO1' : 'DRAWINGS_RATIO'})\n",
562 | " CCB = CCB.merge(grp, on = ['SK_ID_CURR'], how = 'left')\n",
563 | "\n",
564 | " del CCB['DRAWINGS_RATIO1']\n",
565 | "\n",
566 | " CCB['CC_COUNT'] = CCB.groupby('SK_ID_CURR').size()\n",
567 | "\n",
568 | " CCB_agg = CCB.groupby('SK_ID_CURR').agg({\n",
569 | " 'MONTHS_BALANCE':[\"sum\",\"mean\"], \n",
570 | " 'AMT_BALANCE':[\"sum\",\"mean\",\"min\",\"max\"],\n",
571 | " 'AMT_CREDIT_LIMIT_ACTUAL':[\"sum\",\"mean\"], \n",
572 | "\n",
573 | " 'AMT_DRAWINGS_ATM_CURRENT':[\"sum\",\"mean\",\"min\",\"max\"],\n",
574 | " 'AMT_DRAWINGS_CURRENT':[\"sum\",\"mean\",\"min\",\"max\"], \n",
575 | " 'AMT_DRAWINGS_OTHER_CURRENT':[\"sum\",\"mean\",\"min\",\"max\"],\n",
576 | " 'AMT_DRAWINGS_POS_CURRENT':[\"sum\",\"mean\",\"min\",\"max\"], \n",
577 | " 'AMT_INST_MIN_REGULARITY':[\"sum\",\"mean\",\"min\",\"max\"],\n",
578 | " 'AMT_PAYMENT_CURRENT':[\"sum\",\"mean\",\"min\",\"max\"], \n",
579 | " 'AMT_PAYMENT_TOTAL_CURRENT':[\"sum\",\"mean\",\"min\",\"max\"],\n",
580 | " 'AMT_RECEIVABLE_PRINCIPAL':[\"sum\",\"mean\",\"min\",\"max\"], \n",
581 | " 'AMT_RECIVABLE':[\"sum\",\"mean\",\"min\",\"max\"], \n",
582 | " 'AMT_TOTAL_RECEIVABLE':[\"sum\",\"mean\",\"min\",\"max\"],\n",
583 | "\n",
584 | " 'CNT_DRAWINGS_ATM_CURRENT':[\"sum\",\"mean\"], \n",
585 | " 'CNT_DRAWINGS_CURRENT':[\"sum\",\"mean\",\"max\"],\n",
586 | " 'CNT_DRAWINGS_OTHER_CURRENT':[\"mean\",\"max\"], \n",
587 | " 'CNT_DRAWINGS_POS_CURRENT':[\"sum\",\"mean\",\"max\"],\n",
588 | " 'CNT_INSTALMENT_MATURE_CUM':[\"sum\",\"mean\",\"max\",\"min\"], \n",
589 | " 'SK_DPD':[\"sum\",\"mean\",\"max\"], \n",
590 | " 'SK_DPD_DEF':[\"sum\",\"mean\",\"max\"],\n",
591 | "\n",
592 | " 'NAME_CONTRACT_STATUS_Active':[\"sum\",\"mean\",\"min\",\"max\"], \n",
593 | " 'INSTALLMENTS_PER_LOAN':[\"sum\",\"mean\",\"min\",\"max\"],\n",
594 | "\n",
595 | " 'NUMBER_OF_LOANS_PER_CUSTOMER':[\"mean\"], \n",
596 | " 'DPD_COUNT':[\"mean\"],\n",
597 | " 'PERCENTAGE_MIN_MISSED_PAYMENTS':[\"mean\"], \n",
598 | " 'CASH_CARD_RATIO':[\"mean\"], \n",
599 | " 'DRAWINGS_RATIO':[\"mean\"]})\n",
600 | "\n",
601 | "\n",
602 | " CCB_agg.columns = pd.Index(['CCB_' + e[0] + \"_\" + e[1].upper() for e in CCB_agg.columns.tolist()])\n",
603 | "\n",
604 | " CCB_agg.reset_index(inplace = True)\n",
605 | " \n",
606 | " return CCB_agg"
607 | ]
608 | },
609 | {
610 | "cell_type": "code",
611 | "execution_count": null,
612 | "metadata": {},
613 | "outputs": [],
614 | "source": []
615 | },
616 | {
617 | "cell_type": "markdown",
618 | "metadata": {},
619 | "source": [
620 | "# previous_application"
621 | ]
622 | },
623 | {
624 | "cell_type": "code",
625 | "execution_count": 58,
626 | "metadata": {},
627 | "outputs": [],
628 | "source": [
629 | "def previous_application(agg_list_previous_application):\n",
630 | "\n",
631 | "\n",
632 | " df_prev = pd.read_csv('data/previous_application.csv')\n",
633 | "\n",
634 | " # \"WEEKDAY_APPR_PROCESS_START\" değişkeninin WEEK_DAY ve WEEKEND olarak iki kategoriye ayrılması\n",
635 | "\n",
636 | " df_prev[\"WEEKDAY_APPR_PROCESS_START\"] = df_prev[\"WEEKDAY_APPR_PROCESS_START\"].replace(['MONDAY','TUESDAY', 'WEDNESDAY','THURSDAY','FRIDAY'], 'WEEK_DAY')\n",
637 | " df_prev[\"WEEKDAY_APPR_PROCESS_START\"] = df_prev[\"WEEKDAY_APPR_PROCESS_START\"].replace(['SATURDAY', 'SUNDAY'], 'WEEKEND')\n",
638 | "\n",
639 | " # \"HOUR_APPR_PROCESS_START\" değişkeninin working_hours ve off_hours olarak iki kategoriye ayrılması\n",
640 | " a = [8,9,10,11,12,13,14,15,16,17]\n",
641 | " df_prev[\"HOUR_APPR_PROCESS_START\"] = df_prev[\"HOUR_APPR_PROCESS_START\"].replace(a, 'working_hours')\n",
642 | "\n",
643 | " b = [18,19,20,21,22,23,0,1,2,3,4,5,6,7]\n",
644 | " df_prev[\"HOUR_APPR_PROCESS_START\"] = df_prev[\"HOUR_APPR_PROCESS_START\"].replace(b, 'off_hours')\n",
645 | "\n",
646 | "\n",
647 | " # DAYS_DECISION değeri 1 yıldan küçük olanlara 1, büyük olanlara 0 değeri verildi.\n",
648 | " df_prev[\"DAYS_DECISION\"] = [1 if abs(i/(12*30)) <=1 else 0 for i in df_prev.DAYS_DECISION]\n",
649 | "\n",
650 | " # \"NAME_TYPE_SUITE\" değişkeninin alone ve not_alone olarak iki kategoriye ayrılması\n",
651 | "\n",
652 | " df_prev[\"NAME_TYPE_SUITE\"] = df_prev[\"NAME_TYPE_SUITE\"].replace('Unaccompanied', 'alone')\n",
653 | "\n",
654 | " b = ['Family', 'Spouse, partner', 'Children', 'Other_B', 'Other_A', 'Group of people']\n",
655 | " df_prev[\"NAME_TYPE_SUITE\"] = df_prev[\"NAME_TYPE_SUITE\"].replace(b, 'not_alone')\n",
656 | "\n",
657 | "\n",
658 | "\n",
659 | " # \"NAME_GOODS_CATEGORY\" değişkenindeki bu değerler others olarak kategorize edilecek\n",
660 | " a = ['Auto Accessories', 'Jewelry', 'Homewares', 'Medical Supplies', 'Vehicles', 'Sport and Leisure', \n",
661 | " 'Gardening', 'Other', 'Office Appliances', 'Tourism', 'Medicine', 'Direct Sales', 'Fitness', 'Additional Service', \n",
662 | " 'Education', 'Weapon', 'Insurance', 'House Construction', 'Animals'] \n",
663 | " df_prev[\"NAME_GOODS_CATEGORY\"] = df_prev[\"NAME_GOODS_CATEGORY\"].replace(a, 'others')\n",
664 | "\n",
665 | " # \"NAME_SELLER_INDUSTRY\" değişkenindeki bu değerler others olarak kategorize edilecek\n",
666 | " a = ['Auto technology', 'Jewelry', 'MLM partners', 'Tourism'] \n",
667 | " df_prev[\"NAME_SELLER_INDUSTRY\"] = df_prev[\"NAME_SELLER_INDUSTRY\"].replace(a, 'others')\n",
668 | " # İstenilen krecinin verilen krediye oranı içeren değişkeni türetir\n",
669 | " df_prev[\"LOAN_RATE\"] = df_prev.AMT_APPLICATION/df_prev.AMT_CREDIT\n",
670 | "\n",
671 | " #YENI DEGISKENLER\n",
672 | "\n",
673 | " # İstenilen krecinin verilen krediye oranı içeren değişkeni türetir\n",
674 | " df_prev[\"NEW_LOAN_RATE\"] = df_prev.AMT_APPLICATION/df_prev.AMT_CREDIT\n",
675 | "\n",
676 | " # Ödeme gününü geciktirmiş mi bunu gösteren churn_prev değişkeni türetilir.\n",
677 | " # 1= geciktirmiş, 0 = geciktirmemiş, NaN = boş değer\n",
678 | " k = df_prev.DAYS_LAST_DUE_1ST_VERSION - df_prev.DAYS_LAST_DUE\n",
679 | " df_prev[\"NEW_CHURN_PREV\"] = [1 if i >= 0 else (0 if i < 0 else \"NaN\") for i in k]\n",
680 | "\n",
681 | "\n",
682 | " # NFLAG_INSURED_ON_APPROVAL değişkeni yerine kullanılmak izere NEW_INSURANCE değişkeni tanımlandı.\n",
683 | " df_prev[(df_prev['AMT_CREDIT'] == 0) | (df_prev['AMT_GOODS_PRICE'] == 0)]['NEW_INSURANCE'] = np.nan\n",
684 | " df_prev['sigorta_miktari'] = df_prev['AMT_CREDIT'] - df_prev['AMT_GOODS_PRICE']\n",
685 | " df_prev[\"NEW_INSURANCE\"] = df_prev['sigorta_miktari'].apply(lambda x: 1 if x > 0 else (0 if x <= 0 else np.nan))\n",
686 | " df_prev.drop('sigorta_miktari', axis=1, inplace=True)\n",
687 | "\n",
688 | " # INTEREST_RATE değişkenini oluşturur.\n",
689 | " #df_prev['INTEREST_RATE'] = (df_prev.AMT_ANNUITY*df_prev.CNT_PAYMENT/df_prev.AMT_CREDIT)**(12/df_prev.CNT_PAYMENT)-1\n",
690 | " #df_prev[df_prev['INTEREST_RATE']==-1]=np.nan\n",
691 | "\n",
692 | "\n",
693 | " drop_list = ['AMT_DOWN_PAYMENT', 'SELLERPLACE_AREA', 'CNT_PAYMENT', 'PRODUCT_COMBINATION', 'DAYS_FIRST_DRAWING', 'DAYS_FIRST_DUE',\n",
694 | " 'DAYS_LAST_DUE_1ST_VERSION', 'DAYS_LAST_DUE','DAYS_TERMINATION','NFLAG_INSURED_ON_APPROVAL']\n",
695 | " df_prev.drop(drop_list, axis = 1, inplace = True)\n",
696 | "\n",
697 | " # Previous tablosundaki kategorik değişkenlerin isimlerini tutar.\n",
698 | " category_columns=[]\n",
699 | " for i in df_prev.columns:\n",
700 | " if df_prev[i].dtypes == \"O\":\n",
701 | " category_columns.append(i)\n",
702 | "\n",
703 | " df_prev = pd.get_dummies(df_prev, columns = category_columns )\n",
704 | "\n",
705 | " prev_agg_list = {\"SK_ID_CURR\":[\"count\"], \n",
706 | " \"AMT_ANNUITY\":[\"max\"],\n",
707 | " \"AMT_APPLICATION\":[\"min\",\"mean\",\"max\"],\n",
708 | " \"AMT_CREDIT\":[\"max\"], \n",
709 | " \"AMT_GOODS_PRICE\":[\"sum\", \"mean\"],\n",
710 | " \"NFLAG_LAST_APPL_IN_DAY\":[\"sum\",\"mean\"], \n",
711 | " \"RATE_DOWN_PAYMENT\":[\"sum\", \"mean\"],\n",
712 | " \"RATE_INTEREST_PRIMARY\":[\"sum\", \"mean\"],\n",
713 | " \"RATE_INTEREST_PRIVILEGED\":[\"sum\", \"mean\"],\n",
714 | " \"DAYS_DECISION\":[\"sum\"],\n",
715 | " \"NEW_LOAN_RATE\":[\"sum\", \"mean\", \"min\", \"max\"],\n",
716 | " \"NEW_INSURANCE\":[\"sum\", \"mean\"],\n",
717 | " #\"INTEREST_RATE\":[\"sum\", \"mean\", \"min\", \"max\"],\n",
718 | " \"NAME_CONTRACT_TYPE_Cash loans\":[\"sum\", \"mean\"],\n",
719 | " \"NAME_CONTRACT_TYPE_Consumer loans\":[\"sum\", \"mean\"],\n",
720 | " \"NAME_CONTRACT_TYPE_Revolving loans\":[\"sum\", \"mean\"],\n",
721 | " \"NAME_CONTRACT_TYPE_XNA\":[\"sum\", \"mean\"],\n",
722 | " \"WEEKDAY_APPR_PROCESS_START_WEEKEND\":[\"sum\", \"mean\"],\n",
723 | " \"WEEKDAY_APPR_PROCESS_START_WEEK_DAY\":[\"sum\", \"mean\"],\n",
724 | " \"HOUR_APPR_PROCESS_START_off_hours\":[\"sum\", \"mean\"],\n",
725 | " \"HOUR_APPR_PROCESS_START_working_hours\":[\"sum\", \"mean\"],\n",
726 | " \"FLAG_LAST_APPL_PER_CONTRACT_N\":[\"sum\", \"mean\"],\n",
727 | " \"FLAG_LAST_APPL_PER_CONTRACT_Y\":[\"sum\", \"mean\"],\n",
728 | " \"NAME_CASH_LOAN_PURPOSE_Building a house or an annex\":[\"sum\", \"mean\"],\n",
729 | " \"NAME_CASH_LOAN_PURPOSE_Business development\":[\"sum\", \"mean\"],\n",
730 | " \"NAME_CASH_LOAN_PURPOSE_Buying a garage\":[\"sum\", \"mean\"],\n",
731 | " \"NAME_CASH_LOAN_PURPOSE_Buying a holiday home / land\":[\"sum\", \"mean\"],\n",
732 | " \"NAME_CASH_LOAN_PURPOSE_Buying a home\":[\"sum\", \"mean\"],\n",
733 | " \"NAME_CASH_LOAN_PURPOSE_Buying a new car\":[\"sum\", \"mean\"],\n",
734 | " \"NAME_CASH_LOAN_PURPOSE_Buying a used car\":[\"sum\", \"mean\"],\n",
735 | " \"NAME_CASH_LOAN_PURPOSE_Car repairs\":[\"sum\", \"mean\"],\n",
736 | " \"NAME_CASH_LOAN_PURPOSE_Education\":[\"sum\", \"mean\"],\n",
737 | " \"NAME_CASH_LOAN_PURPOSE_Everyday expenses\":[\"sum\", \"mean\"],\n",
738 | " \"NAME_CASH_LOAN_PURPOSE_Furniture\":[\"sum\", \"mean\"],\n",
739 | " \"NAME_CASH_LOAN_PURPOSE_Gasification / water supply\":[\"sum\", \"mean\"],\n",
740 | " \"NAME_CASH_LOAN_PURPOSE_Hobby\":[\"sum\", \"mean\"],\n",
741 | " \"NAME_CASH_LOAN_PURPOSE_Journey\":[\"sum\", \"mean\"],\n",
742 | " \"NAME_CASH_LOAN_PURPOSE_Medicine\":[\"sum\", \"mean\"],\n",
743 | " \"NAME_CASH_LOAN_PURPOSE_Money for a third person\":[\"sum\", \"mean\"],\n",
744 | " \"NAME_CASH_LOAN_PURPOSE_Other\":[\"sum\", \"mean\"],\n",
745 | " \"NAME_CASH_LOAN_PURPOSE_Payments on other loans\":[\"sum\", \"mean\"],\n",
746 | " \"NAME_CASH_LOAN_PURPOSE_Purchase of electronic equipment\":[\"sum\", \"mean\"],\n",
747 | " \"NAME_CASH_LOAN_PURPOSE_Refusal to name the goal\":[\"sum\", \"mean\"],\n",
748 | " \"NAME_CASH_LOAN_PURPOSE_Repairs\":[\"sum\", \"mean\"],\n",
749 | " \"NAME_CASH_LOAN_PURPOSE_Urgent needs\":[\"sum\", \"mean\"],\n",
750 | " \"NAME_CASH_LOAN_PURPOSE_Wedding / gift / holiday\":[\"sum\", \"mean\"],\n",
751 | " \"NAME_CASH_LOAN_PURPOSE_XAP\":[\"sum\", \"mean\"],\n",
752 | " \"NAME_CASH_LOAN_PURPOSE_XNA\":[\"sum\", \"mean\"],\n",
753 | " \"NAME_CONTRACT_STATUS_Approved\":[\"sum\", \"mean\"],\n",
754 | " \"NAME_CONTRACT_STATUS_Canceled\":[\"sum\", \"mean\"],\n",
755 | " \"NAME_CONTRACT_STATUS_Refused\":[\"sum\", \"mean\"],\n",
756 | " \"NAME_CONTRACT_STATUS_Unused offer\":[\"sum\", \"mean\"],\n",
757 | " \"NAME_PAYMENT_TYPE_Cash through the bank\":[\"sum\", \"mean\"],\n",
758 | " \"NAME_PAYMENT_TYPE_Cashless from the account of the employer\":[\"sum\", \"mean\"],\n",
759 | " \"NAME_PAYMENT_TYPE_Non-cash from your account\":[\"sum\", \"mean\"],\n",
760 | " \"NAME_PAYMENT_TYPE_XNA\":[\"sum\", \"mean\"],\n",
761 | " \"CODE_REJECT_REASON_CLIENT\":[\"sum\", \"mean\"],\n",
762 | " \"CODE_REJECT_REASON_HC\":[\"sum\", \"mean\"],\n",
763 | " \"CODE_REJECT_REASON_LIMIT\":[\"sum\", \"mean\"],\n",
764 | " \"CODE_REJECT_REASON_SCO\":[\"sum\", \"mean\"],\n",
765 | " \"CODE_REJECT_REASON_SCOFR\":[\"sum\", \"mean\"],\n",
766 | " \"CODE_REJECT_REASON_SYSTEM\":[\"sum\", \"mean\"],\n",
767 | " \"CODE_REJECT_REASON_VERIF\":[\"sum\", \"mean\"],\n",
768 | " \"CODE_REJECT_REASON_XAP\":[\"sum\", \"mean\"],\n",
769 | " \"CODE_REJECT_REASON_XNA\":[\"sum\", \"mean\"],\n",
770 | " \"NAME_TYPE_SUITE_alone\":[\"sum\", \"mean\"],\n",
771 | " \"NAME_TYPE_SUITE_not_alone\":[\"sum\", \"mean\"],\n",
772 | " \"NAME_CLIENT_TYPE_New\":[\"sum\", \"mean\"],\n",
773 | " \"NAME_CLIENT_TYPE_Refreshed\":[\"sum\", \"mean\"],\n",
774 | " \"NAME_CLIENT_TYPE_Repeater\":[\"sum\", \"mean\"],\n",
775 | " \"NAME_CLIENT_TYPE_XNA\":[\"sum\", \"mean\"],\n",
776 | " \"NAME_GOODS_CATEGORY_Audio/Video\":[\"sum\", \"mean\"],\n",
777 | " \"NAME_GOODS_CATEGORY_Clothing and Accessories\":[\"sum\", \"mean\"],\n",
778 | " \"NAME_GOODS_CATEGORY_Computers\":[\"sum\", \"mean\"],\n",
779 | " \"NAME_GOODS_CATEGORY_Construction Materials\":[\"sum\", \"mean\"],\n",
780 | " \"NAME_GOODS_CATEGORY_Consumer Electronics\":[\"sum\", \"mean\"],\n",
781 | " \"NAME_GOODS_CATEGORY_Furniture\":[\"sum\", \"mean\"],\n",
782 | " \"NAME_GOODS_CATEGORY_Mobile\":[\"sum\", \"mean\"],\n",
783 | " \"NAME_GOODS_CATEGORY_Photo / Cinema Equipment\":[\"sum\", \"mean\"],\n",
784 | " \"NAME_GOODS_CATEGORY_XNA\":[\"sum\", \"mean\"],\n",
785 | " \"NAME_GOODS_CATEGORY_others\":[\"sum\", \"mean\"],\n",
786 | " \"NAME_PORTFOLIO_Cards\":[\"sum\", \"mean\"],\n",
787 | " \"NAME_PORTFOLIO_Cars\":[\"sum\", \"mean\"],\n",
788 | " \"NAME_PORTFOLIO_Cash\":[\"sum\", \"mean\"],\n",
789 | " \"NAME_PORTFOLIO_POS\":[\"sum\", \"mean\"],\n",
790 | " \"NAME_PORTFOLIO_XNA\":[\"sum\", \"mean\"],\n",
791 | " \"NAME_PRODUCT_TYPE_XNA\":[\"sum\", \"mean\"],\n",
792 | " \"NAME_PRODUCT_TYPE_walk-in\":[\"sum\", \"mean\"],\n",
793 | " \"NAME_PRODUCT_TYPE_x-sell\":[\"sum\", \"mean\"],\n",
794 | " \"CHANNEL_TYPE_AP+ (Cash loan)\":[\"sum\", \"mean\"],\n",
795 | " \"CHANNEL_TYPE_Car dealer\":[\"sum\", \"mean\"],\n",
796 | " \"CHANNEL_TYPE_Channel of corporate sales\":[\"sum\", \"mean\"],\n",
797 | " \"CHANNEL_TYPE_Contact center\":[\"sum\", \"mean\"],\n",
798 | " \"CHANNEL_TYPE_Country-wide\":[\"sum\", \"mean\"],\n",
799 | " \"CHANNEL_TYPE_Credit and cash offices\":[\"sum\", \"mean\"],\n",
800 | " \"CHANNEL_TYPE_Regional / Local\":[\"sum\", \"mean\"],\n",
801 | " \"CHANNEL_TYPE_Stone\":[\"sum\", \"mean\"],\n",
802 | " \"NAME_SELLER_INDUSTRY_Clothing\":[\"sum\", \"mean\"],\n",
803 | " \"NAME_SELLER_INDUSTRY_Connectivity\":[\"sum\", \"mean\"],\n",
804 | " \"NAME_SELLER_INDUSTRY_Construction\":[\"sum\", \"mean\"],\n",
805 | " \"NAME_SELLER_INDUSTRY_Consumer electronics\":[\"sum\", \"mean\"],\n",
806 | " \"NAME_SELLER_INDUSTRY_Furniture\":[\"sum\", \"mean\"],\n",
807 | " \"NAME_SELLER_INDUSTRY_Industry\":[\"sum\", \"mean\"],\n",
808 | " \"NAME_SELLER_INDUSTRY_XNA\":[\"sum\", \"mean\"],\n",
809 | " \"NAME_SELLER_INDUSTRY_others\":[\"sum\", \"mean\"],\n",
810 | " \"NAME_YIELD_GROUP_XNA\":[\"sum\", \"mean\"],\n",
811 | " \"NAME_YIELD_GROUP_high\":[\"sum\", \"mean\"],\n",
812 | " \"NAME_YIELD_GROUP_low_action\":[\"sum\", \"mean\"],\n",
813 | " \"NAME_YIELD_GROUP_low_normal\":[\"sum\", \"mean\"],\n",
814 | " \"NAME_YIELD_GROUP_middle\":[\"sum\", \"mean\"],\n",
815 | " \"NEW_CHURN_PREV_0\":[\"sum\", \"mean\"],\n",
816 | " \"NEW_CHURN_PREV_1\":[\"sum\", \"mean\"],\n",
817 | " \"NEW_CHURN_PREV_NaN\":[\"sum\", \"mean\"]}\n",
818 | "\n",
819 | " prev_agg_list.update(agg_list_previous_application)\n",
820 |     "\n",
822 | " return prev_agg_list, df_prev"
823 | ]
824 | },
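825 | {
826 | "cell_type": "markdown",
827 | "metadata": {},
828 | "source": [
829 | "For reference, the pattern used above - build an aggregation spec as a plain dict of `column -> [functions]`, extend it in place with `dict.update`, then apply it in a single `groupby().agg()` pass and flatten the resulting MultiIndex columns (as the combine step below does) - in miniature. The frame, columns and values here are hypothetical, for illustration only:"
830 | ]
831 | },
832 | {
833 | "cell_type": "code",
834 | "execution_count": null,
835 | "metadata": {},
836 | "outputs": [],
837 | "source": [
838 | "import pandas as pd\n",
839 | "\n",
840 | "# toy stand-in for df_prev (hypothetical values, illustration only)\n",
841 | "toy = pd.DataFrame({\"SK_ID_CURR\": [1, 1, 2],\n",
842 | "                    \"AMT_ANNUITY\": [100.0, 200.0, 50.0],\n",
843 | "                    \"NAME_PORTFOLIO_Cash\": [1, 0, 1]})\n",
844 | "\n",
845 | "# same pattern as above: numeric spec first, one-hot spec merged in via update\n",
846 | "spec = {\"AMT_ANNUITY\": [\"min\", \"max\", \"mean\"]}\n",
847 | "spec.update({\"NAME_PORTFOLIO_Cash\": [\"sum\", \"mean\"]})\n",
848 | "\n",
849 | "agg = toy.groupby(\"SK_ID_CURR\").agg(spec)  # -> MultiIndex columns\n",
850 | "agg.columns = pd.Index([\"PREV_\" + c[0] + \"_\" + c[1].upper() for c in agg.columns.tolist()])\n",
851 | "agg.reset_index(inplace=True)\n",
852 | "print(agg)"
853 | ]
854 | },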
825 | {
826 | "cell_type": "code",
827 | "execution_count": null,
828 | "metadata": {},
829 | "outputs": [],
830 | "source": []
831 | },
832 | {
833 | "cell_type": "markdown",
834 | "metadata": {},
835 | "source": [
836 | "# Combine"
837 | ]
838 | },
839 | {
840 | "cell_type": "code",
841 | "execution_count": 59,
842 | "metadata": {},
843 | "outputs": [],
844 | "source": [
845 | "def pre_processing_and_combine():\n",
846 | "\n",
847 | " \n",
848 | " with timer(\"Process application train\"):\n",
849 | " df = application_train()\n",
850 | " print(\"application train & test shape:\", df.shape)\n",
851 | " \n",
852 | " \n",
853 | " with timer(\"Bureau and Bureau Balance\"):\n",
854 | " bureau_and_bb_agg = bureau_bb()\n",
855 | " print(\"Bureau and Bureau Balance:\", bureau_and_bb_agg.shape)\n",
856 | " \n",
857 | " with timer(\"Installment Payments\"):\n",
858 | " agg_list_previous_application, ins_agg = installments_payments()\n",
859 | " print(\"Installment Payments:\", ins_agg.shape) \n",
860 | " \n",
861 | " with timer(\"Pos Cash Balance\"):\n",
862 | " agg_list_previous_application, pos_agg = pos_cash_balance(agg_list_previous_application)\n",
863 | " print(\"Pos Cash Balance:\", pos_agg.shape) \n",
864 | " \n",
865 | " \n",
866 | " with timer(\"Credit Card Balance\"):\n",
867 | " CCB_agg = credit_card_balance()\n",
868 | " print(\"Credit Card Balance:\", CCB_agg.shape) \n",
869 | " \n",
870 | " with timer(\"previous_application\"):\n",
871 | " prev_agg_list, df_prev = previous_application(agg_list_previous_application)\n",
872 | " print(\"previous_application:\", df_prev.shape) \n",
873 | " \n",
874 | " \n",
875 | " with timer(\"All tables are combining\"):\n",
876 | " df_prev_ins = df_prev.merge(ins_agg, how = 'left', on = 'SK_ID_PREV')\n",
877 | " df_prev_ins_pos = df_prev_ins.merge(pos_agg, how = 'left', on = 'SK_ID_PREV')\n",
878 | " df_prev_ins_pos_agg = df_prev_ins_pos.groupby(\"SK_ID_CURR\").agg(prev_agg_list).reset_index()\n",
879 | " df_prev_ins_pos_agg.columns = pd.Index([\"PREV_\" + col[0] + \"_\" + col[1].upper() for col in df_prev_ins_pos_agg.columns.tolist()])\n",
880 | " df_prev_ins_pos_agg.rename(columns={\"PREV_SK_ID_CURR_\":\"SK_ID_CURR\"}, inplace = True)\n",
881 | " #prev_son ile ana tablo\n",
882 | " df_prev_others = df.merge(df_prev_ins_pos_agg, how = 'left',on = 'SK_ID_CURR')\n",
883 | " \n",
884 | " #credit_card_balance\n",
885 | " df_prev_ins_pos_ccb = df_prev_others.merge(CCB_agg, how = 'left',on = 'SK_ID_CURR')\n",
886 | " \n",
887 | " #bureau_balance\n",
888 | " all_data = df_prev_ins_pos_ccb.merge(bureau_and_bb_agg, how = 'left',on = 'SK_ID_CURR')\n",
889 | " \n",
890 | " print(\"all_data process:\", all_data.shape) \n",
891 | "\n",
892 | " \n",
893 | " \n",
894 | " return all_data\n",
895 | " "
896 | ]
897 | },
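898 | {
899 | "cell_type": "markdown",
900 | "metadata": {},
901 | "source": [
902 | "`timer` is defined earlier in the notebook (not shown in this excerpt). For readers skimming this section, a minimal stand-in with the same usage pattern is sketched below; it is an assumption, not necessarily the exact original."
903 | ]
904 | },
905 | {
906 | "cell_type": "code",
907 | "execution_count": null,
908 | "metadata": {},
909 | "outputs": [],
910 | "source": [
911 | "from contextlib import contextmanager\n",
912 | "import time\n",
913 | "\n",
914 | "# minimal stand-in for the timer() helper defined earlier (sketch only)\n",
915 | "@contextmanager\n",
916 | "def timer(title):\n",
917 | "    # print the block label and its elapsed wall time when the block exits\n",
918 | "    t0 = time.time()\n",
919 | "    yield\n",
920 | "    print(\"{} - done in {:.0f}s\".format(title, time.time() - t0))"
921 | ]
922 | },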
898 | {
899 | "cell_type": "code",
900 | "execution_count": null,
901 | "metadata": {},
902 | "outputs": [],
903 | "source": []
904 | },
905 | {
906 | "cell_type": "markdown",
907 | "metadata": {},
908 | "source": [
909 | "# Model Tuning"
910 | ]
911 | },
912 | {
913 | "cell_type": "code",
914 | "execution_count": 60,
915 | "metadata": {},
916 | "outputs": [],
917 | "source": [
918 | "#lgbm = LGBMClassifier()\n",
919 | "\n",
920 | "#lgbm_params = {\"learning_rate\": [0.001, 0.01, 0.1],\n",
921 | "# \"n_estimators\": [200, 500, 100],\n",
922 | "# \"max_depth\":[1,2,35,8]}"
923 | ]
924 | },
925 | {
926 | "cell_type": "code",
927 | "execution_count": 61,
928 | "metadata": {},
929 | "outputs": [],
930 | "source": [
931 | "#train = all_data[all_data['TARGET'].notnull()]\n",
932 | "#y_train = train[\"TARGET\"]\n",
933 | "#X_train = train.drop(\"TARGET\", axis = 1)\n",
934 | "\n",
935 | "#lgbm_cv_model = GridSearchCV(lgbm,lgbm_params, cv = 10, n_jobs = -1, verbose = 4).fit(X_train, y_train)\n",
936 | "#lgbm_cv_model.best_params_"
937 | ]
938 | },
939 | {
940 | "cell_type": "code",
941 | "execution_count": null,
942 | "metadata": {},
943 | "outputs": [],
944 | "source": []
945 | },
946 | {
947 | "cell_type": "markdown",
948 | "metadata": {},
949 | "source": [
950 | "# Machine Learning"
951 | ]
952 | },
953 | {
954 | "cell_type": "code",
955 | "execution_count": 62,
956 | "metadata": {},
957 | "outputs": [],
958 | "source": [
959 | "def modeling(all_data):\n",
960 | "\n",
961 | " train_df = all_data[all_data['TARGET'].notnull()]\n",
962 | " test_df = all_data[all_data['TARGET'].isnull()]\n",
963 | "\n",
964 | " folds = KFold(n_splits = 10, shuffle = True, random_state = 1001)\n",
965 | "\n",
966 | " oof_preds = np.zeros(train_df.shape[0])\n",
967 | " sub_preds = np.zeros(test_df.shape[0])\n",
968 | " feature_importance_df = pd.DataFrame()\n",
969 | "\n",
970 | " feats = [f for f in train_df.columns if f not in ['TARGET','SK_ID_CURR']]\n",
971 | "\n",
972 | " for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])):\n",
973 | "\n",
974 | " train_x, train_y = train_df[feats].iloc[train_idx], train_df['TARGET'].iloc[train_idx]\n",
975 | "\n",
976 | " valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx]\n",
977 | "\n",
978 | " clf = LGBMClassifier(\n",
979 | " n_jobs = -1,\n",
980 | " n_estimators=10000,\n",
981 | " learning_rate=0.02,\n",
982 | " num_leaves=34,\n",
983 | " colsample_bytree=0.9497036,\n",
984 | " subsample=0.8715623,\n",
985 | " max_depth=8,\n",
986 | " reg_alpha=0.041545473,\n",
987 | " reg_lambda=0.0735294,\n",
988 | " min_split_gain=0.0222415,\n",
989 | " min_child_weight=39.3259775,\n",
990 | " silent=-1,\n",
991 | " verbose=-1, )\n",
992 | "\n",
993 | " clf.fit(train_x, train_y, eval_set = [(train_x, train_y), (valid_x, valid_y)], \n",
994 | " eval_metric = 'auc', verbose = 200, early_stopping_rounds = 200)\n",
995 | "\n",
996 | " #y_pred_valid\n",
997 | " oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]\n",
998 | " sub_preds += clf.predict_proba(test_df[feats], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits\n",
999 | "\n",
1000 | " fold_importance_df = pd.DataFrame()\n",
1001 | " fold_importance_df[\"feature\"] = feats\n",
1002 | " fold_importance_df[\"importance\"] = clf.feature_importances_\n",
1003 | " fold_importance_df[\"fold\"] = n_fold + 1\n",
1004 | " feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)\n",
1005 | "\n",
1006 | "\n",
1007 | " print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx]))) \n",
1008 | "\n",
1009 | "\n",
1010 | " print('Full AUC score %.6f' % roc_auc_score(train_df['TARGET'], oof_preds)) #y_pred_valid \n",
1011 | "\n",
1012 | " test_df['TARGET'] = sub_preds\n",
1013 | " test_df[['SK_ID_CURR', 'TARGET']].to_csv(\"dsmlbc1_submission.csv\", index= False)\n",
1014 | "\n",
1015 | " display_importances(feature_importance_df)\n",
1016 | " \n",
1017 | " return feature_importance_df\n",
1018 | "\n"
1019 | ]
1020 | },
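1021 | {
1022 | "cell_type": "markdown",
1023 | "metadata": {},
1024 | "source": [
1025 | "`display_importances`, called at the end of `modeling`, is defined earlier in the notebook (not shown in this excerpt). A minimal equivalent is sketched below, renamed `display_importances_sketch` so it cannot shadow the original; it only assumes the `feature`/`importance`/`fold` columns built in `modeling` above."
1026 | ]
1027 | },
1028 | {
1029 | "cell_type": "code",
1030 | "execution_count": null,
1031 | "metadata": {},
1032 | "outputs": [],
1033 | "source": [
1034 | "import matplotlib.pyplot as plt\n",
1035 | "\n",
1036 | "# hypothetical minimal stand-in for the display_importances helper (sketch only)\n",
1037 | "def display_importances_sketch(feature_importance_df, top_n=40):\n",
1038 | "    # average each feature's importance over the CV folds, keep the strongest\n",
1039 | "    mean_imp = (feature_importance_df.groupby(\"feature\")[\"importance\"]\n",
1040 | "                .mean().sort_values(ascending=False).head(top_n))\n",
1041 | "    mean_imp.sort_values().plot(kind=\"barh\", figsize=(8, 10))\n",
1042 | "    plt.title(\"LightGBM feature importance (mean over folds)\")\n",
1043 | "    plt.tight_layout()\n",
1044 | "    plt.show()"
1045 | ]
1046 | },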
1021 | {
1022 | "cell_type": "markdown",
1023 | "metadata": {},
1024 | "source": [
1025 | "# main"
1026 | ]
1027 | },
1028 | {
1029 | "cell_type": "code",
1030 | "execution_count": 63,
1031 | "metadata": {},
1032 | "outputs": [],
1033 | "source": [
1034 | "def main():\n",
1035 | " \n",
1036 | " with timer(\"Preprocessing Time\"):\n",
1037 | " all_data = pre_processing_and_combine()\n",
1038 | " \n",
1039 | " with timer(\"Modeling\"):\n",
1040 | " feat_importance = modeling(all_data)\n"
1041 | ]
1042 | },
1043 | {
1044 | "cell_type": "code",
1045 | "execution_count": null,
1046 | "metadata": {},
1047 | "outputs": [],
1048 | "source": [
1049 | "if __name__ == \"__main__\":\n",
1050 | " with timer(\"Full model run\"):\n",
1051 | " main()"
1052 | ]
1053 | },
1054 | {
1055 | "cell_type": "code",
1056 | "execution_count": null,
1057 | "metadata": {},
1058 | "outputs": [],
1059 | "source": []
1060 | },
1061 | {
1062 | "cell_type": "code",
1063 | "execution_count": null,
1064 | "metadata": {},
1065 | "outputs": [],
1066 | "source": [
1067 | "\n",
1068 | "# dsmlbc1_ws: 2115s\n",
1069 | "# dsmlbc1_submission: 0.79441"
1070 | ]
1071 | }
1072 | ],
1073 | "metadata": {
1074 | "kernelspec": {
1075 | "display_name": "Python 3",
1076 | "language": "python",
1077 | "name": "python3"
1078 | },
1079 | "language_info": {
1080 | "codemirror_mode": {
1081 | "name": "ipython",
1082 | "version": 3
1083 | },
1084 | "file_extension": ".py",
1085 | "mimetype": "text/x-python",
1086 | "name": "python",
1087 | "nbconvert_exporter": "python",
1088 | "pygments_lexer": "ipython3",
1089 | "version": "3.7.6"
1090 | }
1091 | },
1092 | "nbformat": 4,
1093 | "nbformat_minor": 4
1094 | }
--------------------------------------------------------------------------------