├── .idea
│   ├── .gitignore
│   ├── credit-risk-modeling-in-python.iml
│   ├── inspectionProfiles
│   │   └── profiles_settings.xml
│   ├── misc.xml
│   ├── modules.xml
│   └── vcs.xml
├── Credit Risk Modeling Analysis.ipynb
├── LICENSE
├── README.md
├── Section 1 Introduction
│   ├── 1. Importance of Credit Risk.txt
│   ├── 2. Expected Loss and it's components.txt
│   ├── 3. Capital requirement and BASEL II accord.txt
│   ├── 4. BASEL II Approaches.txt
│   ├── 5. Different Asset Classes and Financing them.txt
│   └── README.md
├── Section 2 Setting up the environment
│   ├── 1. Setting up the environment.txt
│   └── 2. Why Python and Jupyter.txt
├── Section 3 Dataset Description
│   ├── .ipynb_checkpoints
│   │   └── Credit Risk Modeling - Preparation - With Comments - 3-1-Copy1-checkpoint.ipynb
│   ├── Credit Risk Modeling - Preparation - 3-1.ipynb
│   ├── Credit Risk Modeling - Preparation - With Comments - 3-1-Copy1.ipynb
│   └── Foundation of PreProcessing.txt
├── Section 4 General Preprocessing
│   ├── .ipynb_checkpoints
│   │   └── Credit Risk Modeling - Preparation - 4-3-checkpoint.ipynb
│   ├── Credit Risk Modeling - Preparation - 4-1.ipynb
│   ├── Credit Risk Modeling - Preparation - 4-2.ipynb
│   ├── Credit Risk Modeling - Preparation - 4-3.ipynb
│   ├── Credit Risk Modeling - Preparation - With Comments - 4-1.ipynb
│   ├── Credit Risk Modeling - Preparation - With Comments - 4-2.ipynb
│   └── Credit Risk Modeling - Preparation - With Comments - 4-3.ipynb
└── Section 5 PD Model, Data Preparation
    ├── .ipynb_checkpoints
    │   ├── Credit Risk Modeling - Preparation - With Comments - 5-11-checkpoint.ipynb
    │   ├── Credit Risk Modeling - Preparation - With Comments - 5-2-checkpoint.ipynb
    │   └── Credit Risk Modeling - Preparation - With Comments - 5-6-checkpoint.ipynb
    ├── Credit Risk Modeling - Preparation - 5-10.ipynb
    ├── Credit Risk Modeling - Preparation - 5-11.ipynb
    ├── Credit Risk Modeling - Preparation - 5-12.ipynb
    ├── Credit Risk Modeling - Preparation - 5-2.ipynb
    ├── Credit Risk Modeling - Preparation - 5-5.ipynb
    ├── Credit Risk Modeling - Preparation - 5-6.ipynb
    ├── Credit Risk Modeling - Preparation - 5-7.ipynb
    ├── Credit Risk Modeling - Preparation - 5-8.ipynb
    ├── Credit Risk Modeling - Preparation - 5-9.ipynb
    ├── Credit Risk Modeling - Preparation - With Comments - 5-10.ipynb
    ├── Credit Risk Modeling - Preparation - With Comments - 5-11.ipynb
    ├── Credit Risk Modeling - Preparation - With Comments - 5-2.ipynb
    ├── Credit Risk Modeling - Preparation - With Comments - 5-5.ipynb
    ├── Credit Risk Modeling - Preparation - With Comments - 5-6.ipynb
    ├── Credit Risk Modeling - Preparation - With Comments - 5-7.ipynb
    ├── Credit Risk Modeling - Preparation - With Comments - 5-8.ipynb
    └── Credit Risk Modeling - Preparation - With Comments - 5-9.ipynb
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /workspace.xml
--------------------------------------------------------------------------------
/.idea/credit-risk-modeling-in-python.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2021 Al Ardosa
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Python Data Science Complete – Credit Risk Modeling in Python Data Science
2 |
3 | Hi! Welcome to Python Credit Risk Modeling, a tutorial that teaches you how banks use Python data science modeling to improve their performance and comply with regulatory requirements. This is the perfect tutorial for you if you are interested in a Python data science career.
4 |
5 | Hi, I'm Al Ardosa, the Fellow Actuary. I've been making tutorials since 2013, and I'm here to help you do the same. I majored in Computer Science and continue to study advanced methods. My purpose is to make sure you understand every concept in these tutorials. If you get stuck with anything, send me a message; I'm here to help.
6 |
7 | I've been working as a senior software developer and tech lead at Lazada and other tech companies for many years, and I'm now taking everything I've learned to teach programming skills and to help you discover the amazing career opportunities that come with being a developer.
8 |
9 | ### References
10 | [Python Data Science and Machine Learning - Credit Risk Modeling](https://www.alardosa.com/python-data-science-credit-risk-modeling/)
11 |
12 | [Credit Risk Modeling in Python](https://www.udemy.com/course/credit-risk-modeling-in-python/)
13 |
14 | ## Contact
15 | You can send me a note on Linkedin [@alardosa](https://www.linkedin.com/in/alardosa/)
16 |
17 | or visit my website at:
18 |
19 | [www.alardosa.com](https://www.alardosa.com)
20 |
--------------------------------------------------------------------------------
/Section 1 Introduction/1. Importance of Credit Risk.txt:
--------------------------------------------------------------------------------
1 |
2 | Important concepts mentioned in this section
3 |
4 | 1. **Credit** -
5 | Credit cards and home loans are two very good examples of credit given to a borrower by a lender.
6 | Money spent on a credit card is not ours; we need to pay it back, and if we fail to pay it on time, we repay it with interest.
7 | Home loans are another type of credit. Here there is collateral, i.e. the home itself, which can be used to recover money if the customer fails to pay back.
8 | Asset financing is another good example of credit. Organizations don't buy assets outright; instead they finance them and pay over time.
9 |
10 | 2. **Credit Risk** -
11 | Credit risk is the likelihood that the borrower won't repay their loan to the lender.
12 | Collection costs are the costs incurred in recovering money that was not repaid.
13 |
14 | 3. **Default Event** -
15 | The event of a borrower not being able to repay their debt is called default.
16 |
17 | 4. **Risk-Based Pricing** -
18 | Lenders need to assess the credit risk associated with every loan they give to a borrower.
19 | To ensure that the borrower pays back the amount they have borrowed, lenders could ask for
20 | 1. Collateral
21 | 2. A higher interest rate on the loan
22 |
23 | 5. **Main reasons for serious financial crises** -
24 | Lending to borrowers with a high probability of default
25 | Ex: the Global Financial Crisis and the fall of Lehman Brothers
26 |
27 |
--------------------------------------------------------------------------------
/Section 1 Introduction/2. Expected Loss and it's components.txt:
--------------------------------------------------------------------------------
1 | Expected Loss and its components
2 | EL - Expected Loss
3 | PD - Probability of Default
4 | LGD - Loss Given Default
5 | EAD - Exposure At Default
6 |
7 | Lenders know and expect the possibility of a borrower not paying back.
8 |
9 | Factors of expected loss
10 | 1. Borrower-specific factors
11 | 2. The economic environment
12 |
13 | How to estimate expected loss or expected credit loss?
14 | Definition: The amount a lender might lose by lending to a borrower
15 |
16 | EL = PD x LGD x EAD
17 |
18 | Probability of Default - Borrower's inability to repay their debt in full or on time
19 |
20 | Loss Given Default - The proportion of the total exposure that can't be recovered by the lender at default event
21 |
22 | Exposure At Default - Total value that a lender is exposed to when a borrower defaults
23 |
24 | Example
25 | Cost of House - $500,000
26 | Lender Funds 80% Loan to Value
27 | So loan amount - $400,000
28 |
29 | Borrower pays back $40,000
30 | Borrower Defaults here.
31 |
32 | So remaining amount to be recovered - $360,000
33 | Exposure at default - $360,000
34 |
35 | If there is empirical evidence that one in every 4 homeowners defaults,
36 | So,
37 | Probability of Default = 1 out of 4 = 1/4 = 25%
38 |
39 | Here the bank can sell the house for $342,000.
40 | Now
41 | Loss at default = $360,000 - $342,000 = $18,000
42 |
43 | Loss Given Default = $18,000 / $360,000 = 5%
44 |
45 | Expected Loss = PD x LGD x EAD
46 | = 25% x 5% x $360,000
47 | = $4,500
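
A minimal Python sketch of this calculation, using the figures from the example above:

pd_default = 0.25               # Probability of Default (1 in 4 homeowners)
ead = 360_000                   # Exposure At Default
lgd = 18_000 / 360_000          # Loss Given Default = 5%
expected_loss = pd_default * lgd * ead
print(expected_loss)            # 4500.0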
--------------------------------------------------------------------------------
/Section 1 Introduction/3. Capital requirement and BASEL II accord.txt:
--------------------------------------------------------------------------------
1 | When the banking system suffers, it impacts the overall functioning of the government and the stability of the economic system.
2 |
3 | People wouldn't deposit money in banks if it were not safe to do so.
4 | Consequently, there would be less liquidity.
5 |
6 | Regulators Rules:
7 | 1. Regulate bank operations and hence reduce risky behaviour
8 | 2. Guarantee to the public that the banking sector is in good health
9 |
10 | Loan Defaults
11 | Firms may default for several possible reasons
12 | 1. Poor corporate management
13 | 2. Bad product performance
14 | 3. A global economic downturn
15 |
16 | Capital Requirement or Capital Adequacy or Regulatory Capital:
17 | Banks are required to have sufficient money to absorb losses
18 |
19 | Risk-Weighted Assets - Every loan that a bank gives is an asset to the bank, and each loan carries risk
20 |
21 | So the Capital Adequacy Ratio (capital divided by risk-weighted assets) should be greater than a certain minimum percentage (8% under BASEL II)
22 |
23 | BASEL II Accord
24 | - How much capital banks need to have
25 | - How capital is defined
26 | - How Capital is compared against risk-weighted assets
27 |
28 | BASEL II Accord defines the Capital amount a bank needs
29 | because the greater the risk a bank is exposed to, the greater the capital it needs to hold
30 |
31 | BASEL II Accord has three pillars
32 | 1. Minimum Capital Requirement
33 | - Credit Risk
34 | - Operational Risk
35 | * Internal Ratings Based Approach (IRB)
36 | ** Foundational IRB Approach
37 | ** Advanced IRB Approach
38 | - Market Risk
39 |
40 | 2. Supervisory Review
41 | 3. Market Discipline
42 |
43 |
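A minimal Python sketch of the capital adequacy check described above (the 8% minimum is the BASEL II figure; the capital and risk-weighted asset amounts are made up):

capital = 10_000_000
risk_weighted_assets = 100_000_000
capital_adequacy_ratio = capital / risk_weighted_assets
print(capital_adequacy_ratio >= 0.08)   # True: meets the 8% minimum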
--------------------------------------------------------------------------------
/Section 1 Introduction/4. BASEL II Approaches.txt:
--------------------------------------------------------------------------------
1 | BASEL II Approaches
2 |
3 | EL = PD x EAD x LGD
4 |
5 | BASEL II Accord has three pillars
6 | 1. Minimum Capital Requirement
7 | - Credit Risk
8 | - Operational Risk
9 | * Internal Ratings Based Approach (IRB)
10 | ** Foundational IRB Approach
11 | ** Advanced IRB Approach
12 | - Market Risk
13 |
14 | 2. Supervisory Review
15 | 3. Market Discipline
16 |
17 | From this we have three options to model the credit risk
18 | 1. Standardised Approach
19 | 2. Foundational IRB Approach
20 | 3. Advanced IRB Approach
21 |
22 | Capital Requirement is calculated differently under these three approaches
23 | 1. Standardised Approach -- Fixed % of the total exposure
24 | FICO,
25 | Moody's, etc. provide credit risk ratings
26 | In India - CRISIL
27 | S&P does it for firms and countries, e.g. AAA, AA-, BBB, etc.
28 |
29 | Under this approach,
30 | there is a table that assigns a certain risk weight (% of the exposure) to each asset class, and capital is held against the risk-weighted amount
31 | Ex:
32 | AAA to AA- rated companies -- 20% risk weight
33 | A+ to A- -- 50%
34 |
35 | Retail, credit card and consumer loans -- 75% of the loan given
36 | Home loans -- 35%
37 |
38 | 2. Foundational IRB & Advanced IRB Approach
39 | When banks give out the loans they collect data and this data could be used for calculation
40 | This data can be used under IRB approach
41 |
42 | Banks would like to move from SA to F-IRB to A-IRB
43 | because holding capital against as much as 75% of the value of a loan is a lot
44 | If a bank builds a proper risk profile of an individual, it would need to hold less money
45 |
46 | More Precise estimation of Capital ==> More new business with SAME capital
47 |
48 | The IRB approaches allow banks to do their own credit rating
49 | - Hence the bank can allocate resources to cover losses more precisely
50 |
51 |
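A rough Python sketch of the Standardised Approach idea, using the illustrative risk weights above and the 8% minimum (the exposure amounts are made up):

# (exposure amount, risk weight) per asset class
exposures = {
    'corporate AAA to AA-': (5_000_000, 0.20),
    'corporate A+ to A-':   (3_000_000, 0.50),
    'retail / credit card': (2_000_000, 0.75),
    'home loans':           (4_000_000, 0.35),
}

rwa = sum(amount * weight for amount, weight in exposures.values())
minimum_capital = 0.08 * rwa        # 8% of risk-weighted assets
print(rwa, minimum_capital)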
--------------------------------------------------------------------------------
/Section 1 Introduction/5. Different Asset Classes and Financing them.txt:
--------------------------------------------------------------------------------
1 | Individuals
2 | Credit Cards -- 75%
3 | Consumer Loans -- 75%
4 | Mortgages -- 35%
5 | Firms
6 | SME
7 | Large Corporations
8 |
9 | Large corporations are too few for us to have enough statistical data to model
10 | SMEs provide enough data to build statistical models
11 | Retail loans are also plentiful enough to provide the data
12 |
13 | In this course we focus on the general case where
14 | enough data is available to implement a traditional statistical methodology and build a credit risk model
15 |
16 | PD - Logistic Regression
17 | LGD - Beta Regression
18 | EAD - Beta Regression
19 |
20 | For different classes of customers we may have different data available
21 | Individuals -
22 | - Any demographic or social information available
23 | - external credit risk agency data
24 | - No. of inquiries made for credit
25 | - Interest Rate
26 |
27 | Credit Card -
28 | Credit Limit
29 | Credit Limit Utilization can be used to build model
30 |
31 | Mortgage Loan -
32 | Loan To Value Ratio
33 |
34 | Corporate Loans -
35 | Firm's Size
36 | Years in business
37 | Line of Operation
38 | Target Market focus
39 | Financial Statements
40 | Return on Assets - Net income/Total Assets
41 | Return on Equity - Net Income/Shareholder's equity
42 | Current Ratio - Current Assets/Current Liabilities
43 | Debt Ratio - Total Liabilities/ Total Assets
44 |
45 | Much of this information is available
46 | - before the application
47 | - or is collected after the loan is granted, during a period of observation
48 |
49 | Both of these could be used to build a behaviour model.
50 |
51 | Two models could be made
52 | 1. Application Model
53 | If the loan is risky, it would have a higher interest rate
54 | 2. Behaviour Model
55 | Whether to grant an additional loan or not
56 | Ex: the bank may use the borrower's credit card usage details to build the model
57 |
58 | In this course we would be building statistical models
59 | PD - Logistic Regression (Binomial Logistic Regression)
60 | LGD and EAD - Beta Regression
61 |
62 | We would be using Python
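
As a small illustration of the corporate financial ratios listed above, a minimal Python sketch (the figures are made up):

net_income = 1_200_000
total_assets = 10_000_000
shareholders_equity = 4_000_000
current_assets = 3_000_000
current_liabilities = 2_000_000
total_liabilities = 6_000_000

return_on_assets = net_income / total_assets               # ROA
return_on_equity = net_income / shareholders_equity        # ROE
current_ratio = current_assets / current_liabilities
debt_ratio = total_liabilities / total_assets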
--------------------------------------------------------------------------------
/Section 1 Introduction/README.md:
--------------------------------------------------------------------------------
1 | # Section 1: Introduction
2 |
3 | ## Lecture 0
4 | Introduction to this course
5 |
6 | ## Lecture 1
7 | Credit risk and its importance
8 |
9 | ## Lecture 2
10 | Expected Loss (EL) and its components: PD, LGD, EAD
11 |
12 | ## Lecture 3
13 | Capital Adequacy, Regulations and Basel II Accord
14 |
15 | ## Lecture 4
16 | BASEL II Approaches: SA, F-IRB, A-IRB
17 |
18 | ## Lecture 5
19 | Different Asset Classes and Risk Modeling Approaches
20 |
--------------------------------------------------------------------------------
/Section 2 Setting up the environment/1. Setting up the environment.txt:
--------------------------------------------------------------------------------
1 | Machine Learning and Data Science
2 |
3 | Can be performed in many languages
4 |
5 | Python
6 | Anaconda
7 | Jupyter Notebook
8 |
9 | Install scikit-learn
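For example, with Anaconda already set up, scikit-learn can be installed from a terminal or Anaconda Prompt with pip install scikit-learn or conda install scikit-learn (either command is enough).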
--------------------------------------------------------------------------------
/Section 2 Setting up the environment/2. Why Python and Jupyter.txt:
--------------------------------------------------------------------------------
1 | Why Python and Jupyter?
2 |
3 | Open Source - Anyone can use and contribute to it.
4 | General Purpose - Suitable for all types of work not only Data Science
5 | High-Level - Easy syntax, close to logical human language
6 | Available for all operating systems: Windows, Mac, Linux
7 |
8 | Scikit-learn is open source
9 | Countless Packages delivered and maintained for Python
10 |
11 | Python - Programming Language
12 | Jupyter - Software that lets the user interact with the computer using Python in a web browser
13 | File Format for Jupyter - .ipynb
14 |
15 | Jupyter
16 | Its design is well suited for demonstrations of programming concepts and for training
17 | Instead of installing different interfaces for programming languages like R, Python, Julia or PHP, Jupyter gives us one interface
18 |
19 | Jupyter is not just a text editor; it is software that combines text, code and its output, and it also lets us plot graphs inline
20 |
21 | Anaconda contains both!!
--------------------------------------------------------------------------------
/Section 3 Dataset Description/Credit Risk Modeling - Preparation - 3-1.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Data Preparation"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "## Import Libraries"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": null,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "import numpy as np\n",
24 | "import pandas as pd"
25 | ]
26 | },
27 | {
28 | "cell_type": "markdown",
29 | "metadata": {},
30 | "source": [
31 | "## Import Data"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": null,
37 | "metadata": {},
38 | "outputs": [],
39 | "source": [
40 | "loan_data_backup = pd.read_csv('loan_data_2007_2014.csv')"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": null,
46 | "metadata": {},
47 | "outputs": [],
48 | "source": [
49 | "loan_data = loan_data_backup.copy()"
50 | ]
51 | },
52 | {
53 | "cell_type": "markdown",
54 | "metadata": {},
55 | "source": [
56 | "## Explore Data"
57 | ]
58 | },
59 | {
60 | "cell_type": "code",
61 | "execution_count": null,
62 | "metadata": {},
63 | "outputs": [],
64 | "source": [
65 | "loan_data"
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": null,
71 | "metadata": {},
72 | "outputs": [],
73 | "source": [
74 | "pd.options.display.max_columns = None"
75 | ]
76 | },
77 | {
78 | "cell_type": "code",
79 | "execution_count": null,
80 | "metadata": {},
81 | "outputs": [],
82 | "source": [
83 | "loan_data"
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": null,
89 | "metadata": {},
90 | "outputs": [],
91 | "source": [
92 | "loan_data.head()"
93 | ]
94 | },
95 | {
96 | "cell_type": "code",
97 | "execution_count": null,
98 | "metadata": {},
99 | "outputs": [],
100 | "source": [
101 | "loan_data.tail()"
102 | ]
103 | },
104 | {
105 | "cell_type": "code",
106 | "execution_count": null,
107 | "metadata": {},
108 | "outputs": [],
109 | "source": [
110 | "loan_data.columns.values"
111 | ]
112 | },
113 | {
114 | "cell_type": "code",
115 | "execution_count": null,
116 | "metadata": {
117 | "scrolled": true
118 | },
119 | "outputs": [],
120 | "source": [
121 | "loan_data.info()"
122 | ]
123 | }
124 | ],
125 | "metadata": {
126 | "kernelspec": {
127 | "display_name": "Python 3",
128 | "language": "python",
129 | "name": "python3"
130 | },
131 | "language_info": {
132 | "codemirror_mode": {
133 | "name": "ipython",
134 | "version": 3
135 | },
136 | "file_extension": ".py",
137 | "mimetype": "text/x-python",
138 | "name": "python",
139 | "nbconvert_exporter": "python",
140 | "pygments_lexer": "ipython3",
141 | "version": "3.7.3"
142 | }
143 | },
144 | "nbformat": 4,
145 | "nbformat_minor": 2
146 | }
147 |
--------------------------------------------------------------------------------
/Section 3 Dataset Description/Foundation of PreProcessing.txt:
--------------------------------------------------------------------------------
1 | Dependent Variables and Independent Variables
2 |
3 | Dependent Variables - outcomes of interest
4 |
5 | Independent Variables - Predictors/Features
6 |
7 | PD Model
8 | Dependent Variable for our model - loan status
9 |
10 | LGD Model
11 | How much loan was recovered after default?
12 | Dependent Variable - Recovery Column
13 |
14 | EAD Model
15 | total recovered principal column
16 |
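For instance, a hedged sketch of how the PD dependent variable could be derived from the loan status column, using the loan_data DataFrame loaded in the notebooks (the exact status values treated as "bad" are an assumption here; the course defines them precisely later):

import numpy as np
# 0 = bad (e.g. charged off / defaulted), 1 = good -- assumed status labels for illustration
loan_data['good_bad'] = np.where(loan_data['loan_status'].isin(['Charged Off', 'Default']), 0, 1)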
17 | Depending on the type of data we would apply different Preprocessing techniques
18 |
19 | 1. Discrete
20 | Categorical
21 | Finite Number of Values
22 | Some variables take discrete values but can be treated as continuous variables in our dataset. Ex: Number of credit inquiries in the last six months is one such example.
23 | If the discrete values are sufficiently many and ordered, we can treat them as continuous
24 |
25 | 2. Continuous
26 | Numerical
27 | Infinite Number of Values
28 |
29 | Distinctive Feature of the PD model
30 | All the independent variables have to be categorical
31 | Reason - It is much easier to present the model in a simplified form and turn it into a scorecard if we only have categorical variables instead of a mix
32 |
33 | Discrete Values into Categorical
34 | So we would transform the discrete variables into categorical variables or dummy variables
35 | Only when we reach a final version of the categories from discrete values, we would create dummies
36 |
37 |
38 | Continuous Variables can also be transformed into dummy variables
39 | Ex: Annual Income, No. of Credit Inquiries in last six months
40 | We will start by turning each of them into many categories of equally sized intervals
41 |
42 | Fine Classing
43 | Ex: Number of months since the loan has been granted <- column
44 | We slice it into, say, 50 equally sized intervals,
45 | like
46 | 1 to 3,
47 | 4 to 6,
48 | 7 to 9 months, and so on up to 50 intervals
49 |
50 | Now, for these categories, we examine how well each one discriminates compared with its adjacent categories.
51 | For example, if categories 1 and 2 behave similarly, we merge them, and so on.
52 | Coarse Classing
53 | When we merge the fine categories into fewer categories that are not necessarily equally sized,
54 | we are doing coarse classing
55 |
56 |
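A minimal pandas sketch of fine classing, using the loan_data DataFrame from the notebooks (the column name and the choice of 50 intervals are just illustrative):

import pandas as pd
# fine classing: cut a numeric variable into 50 equally sized intervals
loan_data['mths_since_issue_d_factor'] = pd.cut(loan_data['mths_since_issue_d'], 50)
loan_data['mths_since_issue_d_factor'].value_counts()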
--------------------------------------------------------------------------------
/Section 4 General Preprocessing/.ipynb_checkpoints/Credit Risk Modeling - Preparation - 4-3-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Data Preparation"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "## Import Libraries"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": null,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "import numpy as np\n",
24 | "import pandas as pd"
25 | ]
26 | },
27 | {
28 | "cell_type": "markdown",
29 | "metadata": {},
30 | "source": [
31 | "## Import Data"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": null,
37 | "metadata": {},
38 | "outputs": [],
39 | "source": [
40 | "loan_data_backup = pd.read_csv('loan_data_2007_2014.csv')"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": null,
46 | "metadata": {},
47 | "outputs": [],
48 | "source": [
49 | "loan_data = loan_data_backup.copy()"
50 | ]
51 | },
52 | {
53 | "cell_type": "markdown",
54 | "metadata": {},
55 | "source": [
56 | "## Explore Data"
57 | ]
58 | },
59 | {
60 | "cell_type": "code",
61 | "execution_count": null,
62 | "metadata": {},
63 | "outputs": [],
64 | "source": [
65 | "loan_data"
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": null,
71 | "metadata": {},
72 | "outputs": [],
73 | "source": [
74 | "pd.options.display.max_columns = None"
75 | ]
76 | },
77 | {
78 | "cell_type": "code",
79 | "execution_count": null,
80 | "metadata": {},
81 | "outputs": [],
82 | "source": [
83 | "loan_data"
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": null,
89 | "metadata": {},
90 | "outputs": [],
91 | "source": [
92 | "loan_data.head()"
93 | ]
94 | },
95 | {
96 | "cell_type": "code",
97 | "execution_count": null,
98 | "metadata": {},
99 | "outputs": [],
100 | "source": [
101 | "loan_data.tail()"
102 | ]
103 | },
104 | {
105 | "cell_type": "code",
106 | "execution_count": null,
107 | "metadata": {},
108 | "outputs": [],
109 | "source": [
110 | "loan_data.columns.values"
111 | ]
112 | },
113 | {
114 | "cell_type": "code",
115 | "execution_count": null,
116 | "metadata": {
117 | "scrolled": true
118 | },
119 | "outputs": [],
120 | "source": [
121 | "loan_data.info()"
122 | ]
123 | },
124 | {
125 | "cell_type": "markdown",
126 | "metadata": {},
127 | "source": [
128 | "## General Preprocessing"
129 | ]
130 | },
131 | {
132 | "cell_type": "markdown",
133 | "metadata": {},
134 | "source": [
135 | "### Preprocessing few continuous variables"
136 | ]
137 | },
138 | {
139 | "cell_type": "code",
140 | "execution_count": null,
141 | "metadata": {},
142 | "outputs": [],
143 | "source": [
144 | "loan_data['emp_length'].unique()"
145 | ]
146 | },
147 | {
148 | "cell_type": "code",
149 | "execution_count": null,
150 | "metadata": {},
151 | "outputs": [],
152 | "source": [
153 | "loan_data['emp_length_int'] = loan_data['emp_length'].str.replace('\\+ years', '')\n",
154 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace('< 1 year', str(0))\n",
155 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace('n/a', str(0))\n",
156 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace(' years', '')\n",
157 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace(' year', '')"
158 | ]
159 | },
160 | {
161 | "cell_type": "code",
162 | "execution_count": null,
163 | "metadata": {},
164 | "outputs": [],
165 | "source": [
166 | "type(loan_data['emp_length_int'][0])"
167 | ]
168 | },
169 | {
170 | "cell_type": "code",
171 | "execution_count": null,
172 | "metadata": {},
173 | "outputs": [],
174 | "source": [
175 | "loan_data['emp_length_int'] = pd.to_numeric(loan_data['emp_length_int'])"
176 | ]
177 | },
178 | {
179 | "cell_type": "code",
180 | "execution_count": null,
181 | "metadata": {},
182 | "outputs": [],
183 | "source": [
184 | "type(loan_data['emp_length_int'][0])"
185 | ]
186 | },
187 | {
188 | "cell_type": "code",
189 | "execution_count": null,
190 | "metadata": {
191 | "scrolled": true
192 | },
193 | "outputs": [],
194 | "source": [
195 | "loan_data['earliest_cr_line']"
196 | ]
197 | },
198 | {
199 | "cell_type": "code",
200 | "execution_count": null,
201 | "metadata": {},
202 | "outputs": [],
203 | "source": [
204 | "loan_data['earliest_cr_line_date'] = pd.to_datetime(loan_data['earliest_cr_line'], format = '%b-%y')"
205 | ]
206 | },
207 | {
208 | "cell_type": "code",
209 | "execution_count": null,
210 | "metadata": {},
211 | "outputs": [],
212 | "source": [
213 | "type(loan_data['earliest_cr_line_date'][0])"
214 | ]
215 | },
216 | {
217 | "cell_type": "code",
218 | "execution_count": null,
219 | "metadata": {
220 | "scrolled": true
221 | },
222 | "outputs": [],
223 | "source": [
224 | "pd.to_datetime('2017-12-01') - loan_data['earliest_cr_line_date']"
225 | ]
226 | },
227 | {
228 | "cell_type": "code",
229 | "execution_count": null,
230 | "metadata": {},
231 | "outputs": [],
232 | "source": [
233 | "loan_data['mths_since_earliest_cr_line'] = round(pd.to_numeric((pd.to_datetime('2017-12-01') - loan_data['earliest_cr_line_date']) / np.timedelta64(1, 'M')))"
234 | ]
235 | },
236 | {
237 | "cell_type": "code",
238 | "execution_count": null,
239 | "metadata": {},
240 | "outputs": [],
241 | "source": [
242 | "loan_data['mths_since_earliest_cr_line'].describe()"
243 | ]
244 | },
245 | {
246 | "cell_type": "code",
247 | "execution_count": null,
248 | "metadata": {
249 | "scrolled": true
250 | },
251 | "outputs": [],
252 | "source": [
253 | "loan_data.loc[: , ['earliest_cr_line', 'earliest_cr_line_date', 'mths_since_earliest_cr_line']][loan_data['mths_since_earliest_cr_line'] < 0]"
254 | ]
255 | },
256 | {
257 | "cell_type": "code",
258 | "execution_count": null,
259 | "metadata": {},
260 | "outputs": [],
261 | "source": [
262 | "loan_data['mths_since_earliest_cr_line'][loan_data['mths_since_earliest_cr_line'] < 0] = loan_data['mths_since_earliest_cr_line'].max()"
263 | ]
264 | },
265 | {
266 | "cell_type": "code",
267 | "execution_count": null,
268 | "metadata": {},
269 | "outputs": [],
270 | "source": [
271 | "min(loan_data['mths_since_earliest_cr_line'])"
272 | ]
273 | },
274 | {
275 | "cell_type": "markdown",
276 | "metadata": {},
277 | "source": [
278 | "### Homework"
279 | ]
280 | },
281 | {
282 | "cell_type": "code",
283 | "execution_count": null,
284 | "metadata": {
285 | "scrolled": true
286 | },
287 | "outputs": [],
288 | "source": [
289 | "loan_data['term']"
290 | ]
291 | },
292 | {
293 | "cell_type": "code",
294 | "execution_count": null,
295 | "metadata": {},
296 | "outputs": [],
297 | "source": [
298 | "loan_data['term'].describe()"
299 | ]
300 | },
301 | {
302 | "cell_type": "code",
303 | "execution_count": null,
304 | "metadata": {},
305 | "outputs": [],
306 | "source": [
307 | "loan_data['term_int'] = loan_data['term'].str.replace(' months', '')"
308 | ]
309 | },
310 | {
311 | "cell_type": "code",
312 | "execution_count": null,
313 | "metadata": {
314 | "scrolled": true
315 | },
316 | "outputs": [],
317 | "source": [
318 | "loan_data['term_int']"
319 | ]
320 | },
321 | {
322 | "cell_type": "code",
323 | "execution_count": null,
324 | "metadata": {},
325 | "outputs": [],
326 | "source": [
327 | "type(loan_data['term_int'][25])"
328 | ]
329 | },
330 | {
331 | "cell_type": "code",
332 | "execution_count": null,
333 | "metadata": {
334 | "scrolled": true
335 | },
336 | "outputs": [],
337 | "source": [
338 | "loan_data['term_int'] = pd.to_numeric(loan_data['term'].str.replace(' months', ''))\n",
339 | "loan_data['term_int']"
340 | ]
341 | },
342 | {
343 | "cell_type": "code",
344 | "execution_count": null,
345 | "metadata": {},
346 | "outputs": [],
347 | "source": [
348 | "type(loan_data['term_int'][0])"
349 | ]
350 | },
351 | {
352 | "cell_type": "code",
353 | "execution_count": null,
354 | "metadata": {
355 | "scrolled": true
356 | },
357 | "outputs": [],
358 | "source": [
359 | "loan_data['issue_d']"
360 | ]
361 | },
362 | {
363 | "cell_type": "code",
364 | "execution_count": null,
365 | "metadata": {},
366 | "outputs": [],
367 | "source": [
368 | "loan_data['issue_d_date'] = pd.to_datetime(loan_data['issue_d'], format = '%b-%y')\n",
369 | "loan_data['mths_since_issue_d'] = round(pd.to_numeric((pd.to_datetime('2017-12-01') - loan_data['issue_d_date']) / np.timedelta64(1, 'M')))\n",
370 | "loan_data['mths_since_issue_d'].describe()"
371 | ]
372 | }
373 | ],
374 | "metadata": {
375 | "kernelspec": {
376 | "display_name": "Python 3",
377 | "language": "python",
378 | "name": "python3"
379 | },
380 | "language_info": {
381 | "codemirror_mode": {
382 | "name": "ipython",
383 | "version": 3
384 | },
385 | "file_extension": ".py",
386 | "mimetype": "text/x-python",
387 | "name": "python",
388 | "nbconvert_exporter": "python",
389 | "pygments_lexer": "ipython3",
390 | "version": "3.7.4"
391 | }
392 | },
393 | "nbformat": 4,
394 | "nbformat_minor": 2
395 | }
396 |
--------------------------------------------------------------------------------
/Section 4 General Preprocessing/Credit Risk Modeling - Preparation - 4-1.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Data Preparation"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "## Import Libraries"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": null,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "import numpy as np\n",
24 | "import pandas as pd"
25 | ]
26 | },
27 | {
28 | "cell_type": "markdown",
29 | "metadata": {},
30 | "source": [
31 | "## Import Data"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": null,
37 | "metadata": {},
38 | "outputs": [],
39 | "source": [
40 | "loan_data_backup = pd.read_csv('loan_data_2007_2014.csv')"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": null,
46 | "metadata": {},
47 | "outputs": [],
48 | "source": [
49 | "loan_data = loan_data_backup.copy()"
50 | ]
51 | },
52 | {
53 | "cell_type": "markdown",
54 | "metadata": {},
55 | "source": [
56 | "## Explore Data"
57 | ]
58 | },
59 | {
60 | "cell_type": "code",
61 | "execution_count": null,
62 | "metadata": {},
63 | "outputs": [],
64 | "source": [
65 | "loan_data"
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": null,
71 | "metadata": {},
72 | "outputs": [],
73 | "source": [
74 | "pd.options.display.max_columns = None"
75 | ]
76 | },
77 | {
78 | "cell_type": "code",
79 | "execution_count": null,
80 | "metadata": {},
81 | "outputs": [],
82 | "source": [
83 | "loan_data"
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": null,
89 | "metadata": {},
90 | "outputs": [],
91 | "source": [
92 | "loan_data.head()"
93 | ]
94 | },
95 | {
96 | "cell_type": "code",
97 | "execution_count": null,
98 | "metadata": {},
99 | "outputs": [],
100 | "source": [
101 | "loan_data.tail()"
102 | ]
103 | },
104 | {
105 | "cell_type": "code",
106 | "execution_count": null,
107 | "metadata": {},
108 | "outputs": [],
109 | "source": [
110 | "loan_data.columns.values"
111 | ]
112 | },
113 | {
114 | "cell_type": "code",
115 | "execution_count": null,
116 | "metadata": {
117 | "scrolled": true
118 | },
119 | "outputs": [],
120 | "source": [
121 | "loan_data.info()"
122 | ]
123 | }
124 | ],
125 | "metadata": {
126 | "kernelspec": {
127 | "display_name": "Python 3",
128 | "language": "python",
129 | "name": "python3"
130 | },
131 | "language_info": {
132 | "codemirror_mode": {
133 | "name": "ipython",
134 | "version": 3
135 | },
136 | "file_extension": ".py",
137 | "mimetype": "text/x-python",
138 | "name": "python",
139 | "nbconvert_exporter": "python",
140 | "pygments_lexer": "ipython3",
141 | "version": "3.7.3"
142 | }
143 | },
144 | "nbformat": 4,
145 | "nbformat_minor": 2
146 | }
147 |
--------------------------------------------------------------------------------
/Section 4 General Preprocessing/Credit Risk Modeling - Preparation - 4-2.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Data Preparation"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "## Import Libraries"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": null,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "import numpy as np\n",
24 | "import pandas as pd"
25 | ]
26 | },
27 | {
28 | "cell_type": "markdown",
29 | "metadata": {},
30 | "source": [
31 | "## Import Data"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": null,
37 | "metadata": {},
38 | "outputs": [],
39 | "source": [
40 | "loan_data_backup = pd.read_csv('loan_data_2007_2014.csv')"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": null,
46 | "metadata": {},
47 | "outputs": [],
48 | "source": [
49 | "loan_data = loan_data_backup.copy()"
50 | ]
51 | },
52 | {
53 | "cell_type": "markdown",
54 | "metadata": {},
55 | "source": [
56 | "## Explore Data"
57 | ]
58 | },
59 | {
60 | "cell_type": "code",
61 | "execution_count": null,
62 | "metadata": {},
63 | "outputs": [],
64 | "source": [
65 | "loan_data"
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": null,
71 | "metadata": {},
72 | "outputs": [],
73 | "source": [
74 | "pd.options.display.max_columns = None"
75 | ]
76 | },
77 | {
78 | "cell_type": "code",
79 | "execution_count": null,
80 | "metadata": {},
81 | "outputs": [],
82 | "source": [
83 | "loan_data"
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": null,
89 | "metadata": {},
90 | "outputs": [],
91 | "source": [
92 | "loan_data.head()"
93 | ]
94 | },
95 | {
96 | "cell_type": "code",
97 | "execution_count": null,
98 | "metadata": {},
99 | "outputs": [],
100 | "source": [
101 | "loan_data.tail()"
102 | ]
103 | },
104 | {
105 | "cell_type": "code",
106 | "execution_count": null,
107 | "metadata": {},
108 | "outputs": [],
109 | "source": [
110 | "loan_data.columns.values"
111 | ]
112 | },
113 | {
114 | "cell_type": "code",
115 | "execution_count": null,
116 | "metadata": {
117 | "scrolled": true
118 | },
119 | "outputs": [],
120 | "source": [
121 | "loan_data.info()"
122 | ]
123 | },
124 | {
125 | "cell_type": "markdown",
126 | "metadata": {},
127 | "source": [
128 | "## General Preprocessing"
129 | ]
130 | },
131 | {
132 | "cell_type": "markdown",
133 | "metadata": {},
134 | "source": [
135 | "### Preprocessing few continuous variables"
136 | ]
137 | },
138 | {
139 | "cell_type": "code",
140 | "execution_count": null,
141 | "metadata": {},
142 | "outputs": [],
143 | "source": [
144 | "loan_data['emp_length'].unique()"
145 | ]
146 | },
147 | {
148 | "cell_type": "code",
149 | "execution_count": null,
150 | "metadata": {},
151 | "outputs": [],
152 | "source": [
153 | "loan_data['emp_length_int'] = loan_data['emp_length'].str.replace('\\+ years', '')\n",
154 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace('< 1 year', str(0))\n",
155 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace('n/a', str(0))\n",
156 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace(' years', '')\n",
157 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace(' year', '')"
158 | ]
159 | },
160 | {
161 | "cell_type": "code",
162 | "execution_count": null,
163 | "metadata": {},
164 | "outputs": [],
165 | "source": [
166 | "type(loan_data['emp_length_int'][0])"
167 | ]
168 | },
169 | {
170 | "cell_type": "code",
171 | "execution_count": null,
172 | "metadata": {},
173 | "outputs": [],
174 | "source": [
175 | "loan_data['emp_length_int'] = pd.to_numeric(loan_data['emp_length_int'])"
176 | ]
177 | },
178 | {
179 | "cell_type": "code",
180 | "execution_count": null,
181 | "metadata": {},
182 | "outputs": [],
183 | "source": [
184 | "type(loan_data['emp_length_int'][0])"
185 | ]
186 | },
187 | {
188 | "cell_type": "code",
189 | "execution_count": null,
190 | "metadata": {
191 | "scrolled": true
192 | },
193 | "outputs": [],
194 | "source": [
195 | "loan_data['earliest_cr_line']"
196 | ]
197 | },
198 | {
199 | "cell_type": "code",
200 | "execution_count": null,
201 | "metadata": {},
202 | "outputs": [],
203 | "source": [
204 | "loan_data['earliest_cr_line_date'] = pd.to_datetime(loan_data['earliest_cr_line'], format = '%b-%y')"
205 | ]
206 | },
207 | {
208 | "cell_type": "code",
209 | "execution_count": null,
210 | "metadata": {},
211 | "outputs": [],
212 | "source": [
213 | "type(loan_data['earliest_cr_line_date'][0])"
214 | ]
215 | },
216 | {
217 | "cell_type": "code",
218 | "execution_count": null,
219 | "metadata": {
220 | "scrolled": true
221 | },
222 | "outputs": [],
223 | "source": [
224 | "pd.to_datetime('2017-12-01') - loan_data['earliest_cr_line_date']"
225 | ]
226 | },
227 | {
228 | "cell_type": "code",
229 | "execution_count": null,
230 | "metadata": {},
231 | "outputs": [],
232 | "source": [
233 | "loan_data['mths_since_earliest_cr_line'] = round(pd.to_numeric((pd.to_datetime('2017-12-01') - loan_data['earliest_cr_line_date']) / np.timedelta64(1, 'M')))"
234 | ]
235 | },
236 | {
237 | "cell_type": "code",
238 | "execution_count": null,
239 | "metadata": {},
240 | "outputs": [],
241 | "source": [
242 | "loan_data['mths_since_earliest_cr_line'].describe()"
243 | ]
244 | },
245 | {
246 | "cell_type": "code",
247 | "execution_count": null,
248 | "metadata": {
249 | "scrolled": true
250 | },
251 | "outputs": [],
252 | "source": [
253 | "loan_data.loc[: , ['earliest_cr_line', 'earliest_cr_line_date', 'mths_since_earliest_cr_line']][loan_data['mths_since_earliest_cr_line'] < 0]"
254 | ]
255 | },
256 | {
257 | "cell_type": "code",
258 | "execution_count": null,
259 | "metadata": {},
260 | "outputs": [],
261 | "source": [
262 | "loan_data['mths_since_earliest_cr_line'][loan_data['mths_since_earliest_cr_line'] < 0] = loan_data['mths_since_earliest_cr_line'].max()"
263 | ]
264 | },
265 | {
266 | "cell_type": "code",
267 | "execution_count": null,
268 | "metadata": {},
269 | "outputs": [],
270 | "source": [
271 | "min(loan_data['mths_since_earliest_cr_line'])"
272 | ]
273 | }
274 | ],
275 | "metadata": {
276 | "kernelspec": {
277 | "display_name": "Python 3",
278 | "language": "python",
279 | "name": "python3"
280 | },
281 | "language_info": {
282 | "codemirror_mode": {
283 | "name": "ipython",
284 | "version": 3
285 | },
286 | "file_extension": ".py",
287 | "mimetype": "text/x-python",
288 | "name": "python",
289 | "nbconvert_exporter": "python",
290 | "pygments_lexer": "ipython3",
291 | "version": "3.7.3"
292 | }
293 | },
294 | "nbformat": 4,
295 | "nbformat_minor": 2
296 | }
297 |
--------------------------------------------------------------------------------
/Section 4 General Preprocessing/Credit Risk Modeling - Preparation - 4-3.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Data Preparation"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "## Import Libraries"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": null,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "import numpy as np\n",
24 | "import pandas as pd"
25 | ]
26 | },
27 | {
28 | "cell_type": "markdown",
29 | "metadata": {},
30 | "source": [
31 | "## Import Data"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": null,
37 | "metadata": {},
38 | "outputs": [],
39 | "source": [
40 | "loan_data_backup = pd.read_csv('loan_data_2007_2014.csv')"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": null,
46 | "metadata": {},
47 | "outputs": [],
48 | "source": [
49 | "loan_data = loan_data_backup.copy()"
50 | ]
51 | },
52 | {
53 | "cell_type": "markdown",
54 | "metadata": {},
55 | "source": [
56 | "## Explore Data"
57 | ]
58 | },
59 | {
60 | "cell_type": "code",
61 | "execution_count": null,
62 | "metadata": {},
63 | "outputs": [],
64 | "source": [
65 | "loan_data"
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": null,
71 | "metadata": {},
72 | "outputs": [],
73 | "source": [
74 | "pd.options.display.max_columns = None"
75 | ]
76 | },
77 | {
78 | "cell_type": "code",
79 | "execution_count": null,
80 | "metadata": {},
81 | "outputs": [],
82 | "source": [
83 | "loan_data"
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": null,
89 | "metadata": {},
90 | "outputs": [],
91 | "source": [
92 | "loan_data.head()"
93 | ]
94 | },
95 | {
96 | "cell_type": "code",
97 | "execution_count": null,
98 | "metadata": {},
99 | "outputs": [],
100 | "source": [
101 | "loan_data.tail()"
102 | ]
103 | },
104 | {
105 | "cell_type": "code",
106 | "execution_count": null,
107 | "metadata": {},
108 | "outputs": [],
109 | "source": [
110 | "loan_data.columns.values"
111 | ]
112 | },
113 | {
114 | "cell_type": "code",
115 | "execution_count": null,
116 | "metadata": {
117 | "scrolled": true
118 | },
119 | "outputs": [],
120 | "source": [
121 | "loan_data.info()"
122 | ]
123 | },
124 | {
125 | "cell_type": "markdown",
126 | "metadata": {},
127 | "source": [
128 | "## General Preprocessing"
129 | ]
130 | },
131 | {
132 | "cell_type": "markdown",
133 | "metadata": {},
134 | "source": [
135 | "### Preprocessing few continuous variables"
136 | ]
137 | },
138 | {
139 | "cell_type": "code",
140 | "execution_count": null,
141 | "metadata": {},
142 | "outputs": [],
143 | "source": [
144 | "loan_data['emp_length'].unique()"
145 | ]
146 | },
147 | {
148 | "cell_type": "code",
149 | "execution_count": null,
150 | "metadata": {},
151 | "outputs": [],
152 | "source": [
153 | "loan_data['emp_length_int'] = loan_data['emp_length'].str.replace('\\+ years', '')\n",
154 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace('< 1 year', str(0))\n",
155 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace('n/a', str(0))\n",
156 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace(' years', '')\n",
157 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace(' year', '')"
158 | ]
159 | },
160 | {
161 | "cell_type": "code",
162 | "execution_count": null,
163 | "metadata": {},
164 | "outputs": [],
165 | "source": [
166 | "type(loan_data['emp_length_int'][0])"
167 | ]
168 | },
169 | {
170 | "cell_type": "code",
171 | "execution_count": null,
172 | "metadata": {},
173 | "outputs": [],
174 | "source": [
175 | "loan_data['emp_length_int'] = pd.to_numeric(loan_data['emp_length_int'])"
176 | ]
177 | },
178 | {
179 | "cell_type": "code",
180 | "execution_count": null,
181 | "metadata": {},
182 | "outputs": [],
183 | "source": [
184 | "type(loan_data['emp_length_int'][0])"
185 | ]
186 | },
187 | {
188 | "cell_type": "code",
189 | "execution_count": null,
190 | "metadata": {
191 | "scrolled": true
192 | },
193 | "outputs": [],
194 | "source": [
195 | "loan_data['earliest_cr_line']"
196 | ]
197 | },
198 | {
199 | "cell_type": "code",
200 | "execution_count": null,
201 | "metadata": {},
202 | "outputs": [],
203 | "source": [
204 | "loan_data['earliest_cr_line_date'] = pd.to_datetime(loan_data['earliest_cr_line'], format = '%b-%y')"
205 | ]
206 | },
207 | {
208 | "cell_type": "code",
209 | "execution_count": null,
210 | "metadata": {},
211 | "outputs": [],
212 | "source": [
213 | "type(loan_data['earliest_cr_line_date'][0])"
214 | ]
215 | },
216 | {
217 | "cell_type": "code",
218 | "execution_count": null,
219 | "metadata": {
220 | "scrolled": true
221 | },
222 | "outputs": [],
223 | "source": [
224 | "pd.to_datetime('2017-12-01') - loan_data['earliest_cr_line_date']"
225 | ]
226 | },
227 | {
228 | "cell_type": "code",
229 | "execution_count": null,
230 | "metadata": {},
231 | "outputs": [],
232 | "source": [
233 | "loan_data['mths_since_earliest_cr_line'] = round(pd.to_numeric((pd.to_datetime('2017-12-01') - loan_data['earliest_cr_line_date']) / np.timedelta64(1, 'M')))"
234 | ]
235 | },
236 | {
237 | "cell_type": "code",
238 | "execution_count": null,
239 | "metadata": {},
240 | "outputs": [],
241 | "source": [
242 | "loan_data['mths_since_earliest_cr_line'].describe()"
243 | ]
244 | },
245 | {
246 | "cell_type": "code",
247 | "execution_count": null,
248 | "metadata": {
249 | "scrolled": true
250 | },
251 | "outputs": [],
252 | "source": [
253 | "loan_data.loc[: , ['earliest_cr_line', 'earliest_cr_line_date', 'mths_since_earliest_cr_line']][loan_data['mths_since_earliest_cr_line'] < 0]"
254 | ]
255 | },
256 | {
257 | "cell_type": "code",
258 | "execution_count": null,
259 | "metadata": {},
260 | "outputs": [],
261 | "source": [
262 | "loan_data['mths_since_earliest_cr_line'][loan_data['mths_since_earliest_cr_line'] < 0] = loan_data['mths_since_earliest_cr_line'].max()"
263 | ]
264 | },
265 | {
266 | "cell_type": "code",
267 | "execution_count": null,
268 | "metadata": {},
269 | "outputs": [],
270 | "source": [
271 | "min(loan_data['mths_since_earliest_cr_line'])"
272 | ]
273 | },
274 | {
275 | "cell_type": "markdown",
276 | "metadata": {},
277 | "source": [
278 | "### Homework"
279 | ]
280 | },
281 | {
282 | "cell_type": "code",
283 | "execution_count": null,
284 | "metadata": {
285 | "scrolled": true
286 | },
287 | "outputs": [],
288 | "source": [
289 | "loan_data['term']"
290 | ]
291 | },
292 | {
293 | "cell_type": "code",
294 | "execution_count": null,
295 | "metadata": {},
296 | "outputs": [],
297 | "source": [
298 | "loan_data['term'].describe()"
299 | ]
300 | },
301 | {
302 | "cell_type": "code",
303 | "execution_count": null,
304 | "metadata": {},
305 | "outputs": [],
306 | "source": [
307 | "loan_data['term_int'] = loan_data['term'].str.replace(' months', '')"
308 | ]
309 | },
310 | {
311 | "cell_type": "code",
312 | "execution_count": null,
313 | "metadata": {
314 | "scrolled": true
315 | },
316 | "outputs": [],
317 | "source": [
318 | "loan_data['term_int']"
319 | ]
320 | },
321 | {
322 | "cell_type": "code",
323 | "execution_count": null,
324 | "metadata": {},
325 | "outputs": [],
326 | "source": [
327 | "type(loan_data['term_int'][25])"
328 | ]
329 | },
330 | {
331 | "cell_type": "code",
332 | "execution_count": null,
333 | "metadata": {
334 | "scrolled": true
335 | },
336 | "outputs": [],
337 | "source": [
338 | "loan_data['term_int'] = pd.to_numeric(loan_data['term'].str.replace(' months', ''))\n",
339 | "loan_data['term_int']"
340 | ]
341 | },
342 | {
343 | "cell_type": "code",
344 | "execution_count": null,
345 | "metadata": {},
346 | "outputs": [],
347 | "source": [
348 | "type(loan_data['term_int'][0])"
349 | ]
350 | },
351 | {
352 | "cell_type": "code",
353 | "execution_count": null,
354 | "metadata": {
355 | "scrolled": true
356 | },
357 | "outputs": [],
358 | "source": [
359 | "loan_data['issue_d']"
360 | ]
361 | },
362 | {
363 | "cell_type": "code",
364 | "execution_count": null,
365 | "metadata": {},
366 | "outputs": [],
367 | "source": [
368 | "loan_data['issue_d_date'] = pd.to_datetime(loan_data['issue_d'], format = '%b-%y')\n",
369 | "loan_data['mths_since_issue_d'] = round(pd.to_numeric((pd.to_datetime('2017-12-01') - loan_data['issue_d_date']) / np.timedelta64(1, 'M')))\n",
370 | "loan_data['mths_since_issue_d'].describe()"
371 | ]
372 | }
373 | ],
374 | "metadata": {
375 | "kernelspec": {
376 | "display_name": "Python 3",
377 | "language": "python",
378 | "name": "python3"
379 | },
380 | "language_info": {
381 | "codemirror_mode": {
382 | "name": "ipython",
383 | "version": 3
384 | },
385 | "file_extension": ".py",
386 | "mimetype": "text/x-python",
387 | "name": "python",
388 | "nbconvert_exporter": "python",
389 | "pygments_lexer": "ipython3",
390 | "version": "3.7.4"
391 | }
392 | },
393 | "nbformat": 4,
394 | "nbformat_minor": 2
395 | }
396 |
--------------------------------------------------------------------------------
/Section 4 General Preprocessing/Credit Risk Modeling - Preparation - With Comments - 4-1.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Data Preparation"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "## Import Libraries"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": null,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "import numpy as np\n",
24 | "import pandas as pd"
25 | ]
26 | },
27 | {
28 | "cell_type": "markdown",
29 | "metadata": {},
30 | "source": [
31 | "## Import Data\n",
32 | "The dataset contains all available data for more than 800,000 consumer loans issued from 2007 to 2015 by Lending Club: a large US peer-to-peer lending company. There are several different versions of this dataset. We have used a version available on kaggle.com. You can find it here: https://www.kaggle.com/wendykan/lending-club-loan-data/version/1\n",
33 | "We divided the data into two periods because we assume that some data are available at the moment when we need to build Expected Loss models, and some data comes from applications after. Later, we investigate whether the applications we have after we built the Probability of Default (PD) model have similar characteristics with the applications we used to build the PD model."
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": null,
39 | "metadata": {
40 | "scrolled": true
41 | },
42 | "outputs": [],
43 | "source": [
44 | "loan_data_backup = pd.read_csv('loan_data_2007_2014.csv')"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": null,
50 | "metadata": {},
51 | "outputs": [],
52 | "source": [
53 | "loan_data = loan_data_backup.copy()"
54 | ]
55 | },
56 | {
57 | "cell_type": "markdown",
58 | "metadata": {},
59 | "source": [
60 | "## Explore Data"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": null,
66 | "metadata": {},
67 | "outputs": [],
68 | "source": [
69 | "loan_data"
70 | ]
71 | },
72 | {
73 | "cell_type": "code",
74 | "execution_count": null,
75 | "metadata": {},
76 | "outputs": [],
77 | "source": [
78 | "pd.options.display.max_columns = None\n",
79 | "#pd.options.display.max_rows = None\n",
80 | "# Sets the pandas dataframe options to display all columns/ rows."
81 | ]
82 | },
83 | {
84 | "cell_type": "code",
85 | "execution_count": null,
86 | "metadata": {},
87 | "outputs": [],
88 | "source": [
89 | "loan_data"
90 | ]
91 | },
92 | {
93 | "cell_type": "code",
94 | "execution_count": null,
95 | "metadata": {
96 | "scrolled": true
97 | },
98 | "outputs": [],
99 | "source": [
100 | "loan_data.head()"
101 | ]
102 | },
103 | {
104 | "cell_type": "code",
105 | "execution_count": null,
106 | "metadata": {
107 | "scrolled": true
108 | },
109 | "outputs": [],
110 | "source": [
111 | "loan_data.tail()"
112 | ]
113 | },
114 | {
115 | "cell_type": "code",
116 | "execution_count": null,
117 | "metadata": {},
118 | "outputs": [],
119 | "source": [
120 | "loan_data.columns.values\n",
121 | "# Displays all column names."
122 | ]
123 | },
124 | {
125 | "cell_type": "code",
126 | "execution_count": null,
127 | "metadata": {},
128 | "outputs": [],
129 | "source": [
130 | "loan_data.info()\n",
131 | "# Displays column names, complete (non-missing) cases per column, and datatype per column."
132 | ]
133 | }
134 | ],
135 | "metadata": {
136 | "kernelspec": {
137 | "display_name": "Python 3",
138 | "language": "python",
139 | "name": "python3"
140 | },
141 | "language_info": {
142 | "codemirror_mode": {
143 | "name": "ipython",
144 | "version": 3
145 | },
146 | "file_extension": ".py",
147 | "mimetype": "text/x-python",
148 | "name": "python",
149 | "nbconvert_exporter": "python",
150 | "pygments_lexer": "ipython3",
151 | "version": "3.7.3"
152 | }
153 | },
154 | "nbformat": 4,
155 | "nbformat_minor": 2
156 | }
157 |
--------------------------------------------------------------------------------
/Section 4 General Preprocessing/Credit Risk Modeling - Preparation - With Comments - 4-2.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Data Preparation"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "## Import Libraries"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": null,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "import numpy as np\n",
24 | "import pandas as pd"
25 | ]
26 | },
27 | {
28 | "cell_type": "markdown",
29 | "metadata": {},
30 | "source": [
31 | "## Import Data\n",
32 | "The dataset contains all available data for more than 800,000 consumer loans issued from 2007 to 2015 by Lending Club: a large US peer-to-peer lending company. There are several different versions of this dataset. We have used a version available on kaggle.com. You can find it here: https://www.kaggle.com/wendykan/lending-club-loan-data/version/1\n",
33 | "We divided the data into two periods because we assume that some data are available at the moment when we need to build Expected Loss models, and some data comes from applications after. Later, we investigate whether the applications we have after we built the Probability of Default (PD) model have similar characteristics with the applications we used to build the PD model."
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": null,
39 | "metadata": {
40 | "scrolled": true
41 | },
42 | "outputs": [],
43 | "source": [
44 | "loan_data_backup = pd.read_csv('loan_data_2007_2014.csv')"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": null,
50 | "metadata": {},
51 | "outputs": [],
52 | "source": [
53 | "loan_data = loan_data_backup.copy()"
54 | ]
55 | },
56 | {
57 | "cell_type": "markdown",
58 | "metadata": {},
59 | "source": [
60 | "## Explore Data"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": null,
66 | "metadata": {},
67 | "outputs": [],
68 | "source": [
69 | "loan_data"
70 | ]
71 | },
72 | {
73 | "cell_type": "code",
74 | "execution_count": null,
75 | "metadata": {},
76 | "outputs": [],
77 | "source": [
78 | "pd.options.display.max_columns = None\n",
79 | "#pd.options.display.max_rows = None\n",
80 | "# Sets the pandas dataframe options to display all columns/ rows."
81 | ]
82 | },
83 | {
84 | "cell_type": "code",
85 | "execution_count": null,
86 | "metadata": {},
87 | "outputs": [],
88 | "source": [
89 | "loan_data"
90 | ]
91 | },
92 | {
93 | "cell_type": "code",
94 | "execution_count": null,
95 | "metadata": {
96 | "scrolled": true
97 | },
98 | "outputs": [],
99 | "source": [
100 | "loan_data.head()"
101 | ]
102 | },
103 | {
104 | "cell_type": "code",
105 | "execution_count": null,
106 | "metadata": {
107 | "scrolled": true
108 | },
109 | "outputs": [],
110 | "source": [
111 | "loan_data.tail()"
112 | ]
113 | },
114 | {
115 | "cell_type": "code",
116 | "execution_count": null,
117 | "metadata": {},
118 | "outputs": [],
119 | "source": [
120 | "loan_data.columns.values\n",
121 | "# Displays all column names."
122 | ]
123 | },
124 | {
125 | "cell_type": "code",
126 | "execution_count": null,
127 | "metadata": {},
128 | "outputs": [],
129 | "source": [
130 | "loan_data.info()\n",
131 | "# Displays column names, complete (non-missing) cases per column, and datatype per column."
132 | ]
133 | },
134 | {
135 | "cell_type": "markdown",
136 | "metadata": {},
137 | "source": [
138 | "## General Preprocessing"
139 | ]
140 | },
141 | {
142 | "cell_type": "markdown",
143 | "metadata": {},
144 | "source": [
145 | "### Preprocessing few continuous variables"
146 | ]
147 | },
148 | {
149 | "cell_type": "code",
150 | "execution_count": null,
151 | "metadata": {},
152 | "outputs": [],
153 | "source": [
154 | "loan_data['emp_length'].unique()\n",
155 | "# Displays unique values of a column."
156 | ]
157 | },
158 | {
159 | "cell_type": "code",
160 | "execution_count": null,
161 | "metadata": {},
162 | "outputs": [],
163 | "source": [
164 | "loan_data['emp_length_int'] = loan_data['emp_length'].str.replace('\\+ years', '')\n",
165 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace('< 1 year', str(0))\n",
166 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace('n/a', str(0))\n",
167 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace(' years', '')\n",
168 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace(' year', '')\n",
169 | "# We store the preprocessed ‘employment length’ variable in a new variable called ‘employment length int’,\n",
170 | "# We assign the new ‘employment length int’ to be equal to the ‘employment length’ variable with the string ‘+ years’\n",
171 | "# replaced with nothing. Next, we replace the whole string ‘less than 1 year’ with the string ‘0’.\n",
172 | "# Then, we replace the ‘n/a’ string with the string ‘0’. Then, we replace the string ‘space years’ with nothing.\n",
173 | "# Finally, we replace the string ‘space year’ with nothing."
174 | ]
175 | },
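{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative check (not part of the original notebook): after the replacements above,\n",
"# e.g. '10+ years' becomes '10', '< 1 year' and 'n/a' become '0', and '3 years' becomes '3'.\n",
"loan_data['emp_length_int'].unique()"
]
},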
176 | {
177 | "cell_type": "code",
178 | "execution_count": null,
179 | "metadata": {},
180 | "outputs": [],
181 | "source": [
182 | "type(loan_data['emp_length_int'][0])\n",
183 | "# Checks the datatype of a single element of a column."
184 | ]
185 | },
186 | {
187 | "cell_type": "code",
188 | "execution_count": null,
189 | "metadata": {},
190 | "outputs": [],
191 | "source": [
192 | "loan_data['emp_length_int'] = pd.to_numeric(loan_data['emp_length_int'])\n",
193 | "# Transforms the values to numeric."
194 | ]
195 | },
196 | {
197 | "cell_type": "code",
198 | "execution_count": null,
199 | "metadata": {},
200 | "outputs": [],
201 | "source": [
202 | "type(loan_data['emp_length_int'][0])\n",
203 | "# Checks the datatype of a single element of a column."
204 | ]
205 | },
206 | {
207 | "cell_type": "code",
208 | "execution_count": null,
209 | "metadata": {},
210 | "outputs": [],
211 | "source": [
212 | "loan_data['earliest_cr_line']\n",
213 | "# Displays a column."
214 | ]
215 | },
216 | {
217 | "cell_type": "code",
218 | "execution_count": null,
219 | "metadata": {},
220 | "outputs": [],
221 | "source": [
222 | "loan_data['earliest_cr_line_date'] = pd.to_datetime(loan_data['earliest_cr_line'], format = '%b-%y')\n",
223 | "# Extracts the date and the time from a string variable that is in a given format."
224 | ]
225 | },
226 | {
227 | "cell_type": "code",
228 | "execution_count": null,
229 | "metadata": {
230 | "scrolled": false
231 | },
232 | "outputs": [],
233 | "source": [
234 | "type(loan_data['earliest_cr_line_date'][0])\n",
235 | "# Checks the datatype of a single element of a column."
236 | ]
237 | },
238 | {
239 | "cell_type": "code",
240 | "execution_count": null,
241 | "metadata": {},
242 | "outputs": [],
243 | "source": [
244 | "pd.to_datetime('2017-12-01') - loan_data['earliest_cr_line_date']\n",
245 | "# Calculates the difference between two dates and times."
246 | ]
247 | },
248 | {
249 | "cell_type": "code",
250 | "execution_count": null,
251 | "metadata": {},
252 | "outputs": [],
253 | "source": [
254 | "# Assume we are now in December 2017\n",
255 | "loan_data['mths_since_earliest_cr_line'] = round(pd.to_numeric((pd.to_datetime('2017-12-01') - loan_data['earliest_cr_line_date']) / np.timedelta64(1, 'M')))\n",
256 | "# We calculate the difference between two dates in months, turn it to numeric datatype and round it.\n",
257 | "# We save the result in a new variable."
258 | ]
259 | },
260 | {
261 | "cell_type": "code",
262 | "execution_count": null,
263 | "metadata": {},
264 | "outputs": [],
265 | "source": [
266 | "loan_data['mths_since_earliest_cr_line'].describe()\n",
267 | "# Shows some descriptive statisics for the values of a column.\n",
268 | "# Dates from 1969 and before are not being converted well, i.e., they have become 2069 and similar,\n",
269 | "# and negative differences are being calculated."
270 | ]
271 | },
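{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative sketch (not part of the original notebook): with the '%b-%y' format, two-digit years\n",
"# 69-99 are parsed as 1969-1999, while 00-68 are parsed as 2000-2068.\n",
"# For example, 'Jan-62' is parsed as 2062-01-01, which is why some month differences above are negative.\n",
"pd.to_datetime('Jan-62', format = '%b-%y')"
]
},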
272 | {
273 | "cell_type": "code",
274 | "execution_count": null,
275 | "metadata": {},
276 | "outputs": [],
277 | "source": [
278 | "loan_data.loc[: , ['earliest_cr_line', 'earliest_cr_line_date', 'mths_since_earliest_cr_line']][loan_data['mths_since_earliest_cr_line'] < 0]\n",
279 | "# We take three columns from the dataframe. Then, we display them only for the rows where a variable has negative value.\n",
280 | "# There are 2303 strange negative values."
281 | ]
282 | },
283 | {
284 | "cell_type": "code",
285 | "execution_count": null,
286 | "metadata": {},
287 | "outputs": [],
288 | "source": [
289 | "loan_data['mths_since_earliest_cr_line'][loan_data['mths_since_earliest_cr_line'] < 0] = loan_data['mths_since_earliest_cr_line'].max()\n",
290 | "# We set the rows that had negative differences to the maximum value."
291 | ]
292 | },
293 | {
294 | "cell_type": "code",
295 | "execution_count": null,
296 | "metadata": {},
297 | "outputs": [],
298 | "source": [
299 | "min(loan_data['mths_since_earliest_cr_line'])\n",
300 | "# Calculates and shows the minimum value of a column."
301 | ]
302 | }
303 | ],
304 | "metadata": {
305 | "kernelspec": {
306 | "display_name": "Python 3",
307 | "language": "python",
308 | "name": "python3"
309 | },
310 | "language_info": {
311 | "codemirror_mode": {
312 | "name": "ipython",
313 | "version": 3
314 | },
315 | "file_extension": ".py",
316 | "mimetype": "text/x-python",
317 | "name": "python",
318 | "nbconvert_exporter": "python",
319 | "pygments_lexer": "ipython3",
320 | "version": "3.7.3"
321 | }
322 | },
323 | "nbformat": 4,
324 | "nbformat_minor": 2
325 | }
326 |
--------------------------------------------------------------------------------
/Section 4 General Preprocessing/Credit Risk Modeling - Preparation - With Comments - 4-3.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Data Preparation"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "## Import Libraries"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": null,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "import numpy as np\n",
24 | "import pandas as pd"
25 | ]
26 | },
27 | {
28 | "cell_type": "markdown",
29 | "metadata": {},
30 | "source": [
31 | "## Import Data\n",
32 | "The dataset contains all available data for more than 800,000 consumer loans issued from 2007 to 2015 by Lending Club: a large US peer-to-peer lending company. There are several different versions of this dataset. We have used a version available on kaggle.com. You can find it here: https://www.kaggle.com/wendykan/lending-club-loan-data/version/1\n",
33 | "We divided the data into two periods because we assume that some data are available at the moment when we need to build Expected Loss models, and some data comes from applications after. Later, we investigate whether the applications we have after we built the Probability of Default (PD) model have similar characteristics with the applications we used to build the PD model."
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": null,
39 | "metadata": {
40 | "scrolled": true
41 | },
42 | "outputs": [],
43 | "source": [
44 | "loan_data_backup = pd.read_csv('loan_data_2007_2014.csv')"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": null,
50 | "metadata": {},
51 | "outputs": [],
52 | "source": [
53 | "loan_data = loan_data_backup.copy()"
54 | ]
55 | },
56 | {
57 | "cell_type": "markdown",
58 | "metadata": {},
59 | "source": [
60 | "## Explore Data"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": null,
66 | "metadata": {},
67 | "outputs": [],
68 | "source": [
69 | "loan_data"
70 | ]
71 | },
72 | {
73 | "cell_type": "code",
74 | "execution_count": null,
75 | "metadata": {},
76 | "outputs": [],
77 | "source": [
78 | "pd.options.display.max_columns = None\n",
79 | "#pd.options.display.max_rows = None\n",
80 | "# Sets the pandas dataframe options to display all columns/ rows."
81 | ]
82 | },
83 | {
84 | "cell_type": "code",
85 | "execution_count": null,
86 | "metadata": {},
87 | "outputs": [],
88 | "source": [
89 | "loan_data"
90 | ]
91 | },
92 | {
93 | "cell_type": "code",
94 | "execution_count": null,
95 | "metadata": {
96 | "scrolled": true
97 | },
98 | "outputs": [],
99 | "source": [
100 | "loan_data.head()"
101 | ]
102 | },
103 | {
104 | "cell_type": "code",
105 | "execution_count": null,
106 | "metadata": {
107 | "scrolled": true
108 | },
109 | "outputs": [],
110 | "source": [
111 | "loan_data.tail()"
112 | ]
113 | },
114 | {
115 | "cell_type": "code",
116 | "execution_count": null,
117 | "metadata": {},
118 | "outputs": [],
119 | "source": [
120 | "loan_data.columns.values\n",
121 | "# Displays all column names."
122 | ]
123 | },
124 | {
125 | "cell_type": "code",
126 | "execution_count": null,
127 | "metadata": {},
128 | "outputs": [],
129 | "source": [
130 | "loan_data.info()\n",
131 | "# Displays column names, complete (non-missing) cases per column, and datatype per column."
132 | ]
133 | },
134 | {
135 | "cell_type": "markdown",
136 | "metadata": {},
137 | "source": [
138 | "## General Preprocessing"
139 | ]
140 | },
141 | {
142 | "cell_type": "markdown",
143 | "metadata": {},
144 | "source": [
145 | "### Preprocessing few continuous variables"
146 | ]
147 | },
148 | {
149 | "cell_type": "code",
150 | "execution_count": null,
151 | "metadata": {},
152 | "outputs": [],
153 | "source": [
154 | "loan_data['emp_length'].unique()\n",
155 | "# Displays unique values of a column."
156 | ]
157 | },
158 | {
159 | "cell_type": "code",
160 | "execution_count": null,
161 | "metadata": {},
162 | "outputs": [],
163 | "source": [
164 | "loan_data['emp_length_int'] = loan_data['emp_length'].str.replace('\\+ years', '')\n",
165 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace('< 1 year', str(0))\n",
166 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace('n/a', str(0))\n",
167 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace(' years', '')\n",
168 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace(' year', '')\n",
169 | "# We store the preprocessed ‘employment length’ variable in a new variable called ‘employment length int’,\n",
170 | "# We assign the new ‘employment length int’ to be equal to the ‘employment length’ variable with the string ‘+ years’\n",
171 | "# replaced with nothing. Next, we replace the whole string ‘less than 1 year’ with the string ‘0’.\n",
172 | "# Then, we replace the ‘n/a’ string with the string ‘0’. Then, we replace the string ‘space years’ with nothing.\n",
173 | "# Finally, we replace the string ‘space year’ with nothing."
174 | ]
175 | },
176 | {
177 | "cell_type": "code",
178 | "execution_count": null,
179 | "metadata": {},
180 | "outputs": [],
181 | "source": [
182 | "type(loan_data['emp_length_int'][0])\n",
183 | "# Checks the datatype of a single element of a column."
184 | ]
185 | },
186 | {
187 | "cell_type": "code",
188 | "execution_count": null,
189 | "metadata": {},
190 | "outputs": [],
191 | "source": [
192 | "loan_data['emp_length_int'] = pd.to_numeric(loan_data['emp_length_int'])\n",
193 | "# Transforms the values to numeric."
194 | ]
195 | },
196 | {
197 | "cell_type": "code",
198 | "execution_count": null,
199 | "metadata": {},
200 | "outputs": [],
201 | "source": [
202 | "type(loan_data['emp_length_int'][0])\n",
203 | "# Checks the datatype of a single element of a column."
204 | ]
205 | },
206 | {
207 | "cell_type": "code",
208 | "execution_count": null,
209 | "metadata": {},
210 | "outputs": [],
211 | "source": [
212 | "loan_data['earliest_cr_line']\n",
213 | "# Displays a column."
214 | ]
215 | },
216 | {
217 | "cell_type": "code",
218 | "execution_count": null,
219 | "metadata": {},
220 | "outputs": [],
221 | "source": [
222 | "loan_data['earliest_cr_line_date'] = pd.to_datetime(loan_data['earliest_cr_line'], format = '%b-%y')\n",
223 | "# Extracts the date and the time from a string variable that is in a given format."
224 | ]
225 | },
226 | {
227 | "cell_type": "code",
228 | "execution_count": null,
229 | "metadata": {
230 | "scrolled": false
231 | },
232 | "outputs": [],
233 | "source": [
234 | "type(loan_data['earliest_cr_line_date'][0])\n",
235 | "# Checks the datatype of a single element of a column."
236 | ]
237 | },
238 | {
239 | "cell_type": "code",
240 | "execution_count": null,
241 | "metadata": {},
242 | "outputs": [],
243 | "source": [
244 | "pd.to_datetime('2017-12-01') - loan_data['earliest_cr_line_date']\n",
245 | "# Calculates the difference between two dates and times."
246 | ]
247 | },
248 | {
249 | "cell_type": "code",
250 | "execution_count": null,
251 | "metadata": {},
252 | "outputs": [],
253 | "source": [
254 | "# Assume we are now in December 2017\n",
255 | "loan_data['mths_since_earliest_cr_line'] = round(pd.to_numeric((pd.to_datetime('2017-12-01') - loan_data['earliest_cr_line_date']) / np.timedelta64(1, 'M')))\n",
256 | "# We calculate the difference between two dates in months, turn it to numeric datatype and round it.\n",
257 | "# We save the result in a new variable."
258 | ]
259 | },
260 | {
261 | "cell_type": "code",
262 | "execution_count": null,
263 | "metadata": {},
264 | "outputs": [],
265 | "source": [
266 | "loan_data['mths_since_earliest_cr_line'].describe()\n",
267 | "# Shows some descriptive statisics for the values of a column.\n",
268 | "# Dates from 1969 and before are not being converted well, i.e., they have become 2069 and similar,\n",
269 | "# and negative differences are being calculated."
270 | ]
271 | },
272 | {
273 | "cell_type": "code",
274 | "execution_count": null,
275 | "metadata": {},
276 | "outputs": [],
277 | "source": [
278 | "loan_data.loc[: , ['earliest_cr_line', 'earliest_cr_line_date', 'mths_since_earliest_cr_line']][loan_data['mths_since_earliest_cr_line'] < 0]\n",
279 | "# We take three columns from the dataframe. Then, we display them only for the rows where a variable has negative value.\n",
280 | "# There are 2303 strange negative values."
281 | ]
282 | },
283 | {
284 | "cell_type": "code",
285 | "execution_count": null,
286 | "metadata": {},
287 | "outputs": [],
288 | "source": [
289 | "loan_data['mths_since_earliest_cr_line'][loan_data['mths_since_earliest_cr_line'] < 0] = loan_data['mths_since_earliest_cr_line'].max()\n",
290 | "# We set the rows that had negative differences to the maximum value."
291 | ]
292 | },
293 | {
294 | "cell_type": "code",
295 | "execution_count": null,
296 | "metadata": {},
297 | "outputs": [],
298 | "source": [
299 | "min(loan_data['mths_since_earliest_cr_line'])\n",
300 | "# Calculates and shows the minimum value of a column."
301 | ]
302 | },
303 | {
304 | "cell_type": "markdown",
305 | "metadata": {},
306 | "source": [
307 | "### Homework"
308 | ]
309 | },
310 | {
311 | "cell_type": "code",
312 | "execution_count": null,
313 | "metadata": {},
314 | "outputs": [],
315 | "source": [
316 | "loan_data['term']"
317 | ]
318 | },
319 | {
320 | "cell_type": "code",
321 | "execution_count": null,
322 | "metadata": {},
323 | "outputs": [],
324 | "source": [
325 | "loan_data['term'].describe()\n",
326 | "# Shows some descriptive statisics for the values of a column."
327 | ]
328 | },
329 | {
330 | "cell_type": "code",
331 | "execution_count": null,
332 | "metadata": {},
333 | "outputs": [],
334 | "source": [
335 | "loan_data['term_int'] = loan_data['term'].str.replace(' months', '')\n",
336 | "# We replace a string with another string, in this case, with an empty strng (i.e. with nothing)."
337 | ]
338 | },
339 | {
340 | "cell_type": "code",
341 | "execution_count": null,
342 | "metadata": {},
343 | "outputs": [],
344 | "source": [
345 | "loan_data['term_int']"
346 | ]
347 | },
348 | {
349 | "cell_type": "code",
350 | "execution_count": null,
351 | "metadata": {},
352 | "outputs": [],
353 | "source": [
354 | "type(loan_data['term_int'][25])\n",
355 | "# Checks the datatype of a single element of a column."
356 | ]
357 | },
358 | {
359 | "cell_type": "code",
360 | "execution_count": null,
361 | "metadata": {},
362 | "outputs": [],
363 | "source": [
364 | "loan_data['term_int'] = pd.to_numeric(loan_data['term'].str.replace(' months', ''))\n",
365 | "# We remplace a string from a variable with another string, in this case, with an empty strng (i.e. with nothing).\n",
366 | "# We turn the result to numeric datatype and save it in another variable.\n",
367 | "loan_data['term_int']"
368 | ]
369 | },
370 | {
371 | "cell_type": "code",
372 | "execution_count": null,
373 | "metadata": {},
374 | "outputs": [],
375 | "source": [
376 | "type(loan_data['term_int'][0])\n",
377 | "# Checks the datatype of a single element of a column."
378 | ]
379 | },
380 | {
381 | "cell_type": "code",
382 | "execution_count": null,
383 | "metadata": {},
384 | "outputs": [],
385 | "source": [
386 | "loan_data['issue_d']"
387 | ]
388 | },
389 | {
390 | "cell_type": "code",
391 | "execution_count": null,
392 | "metadata": {},
393 | "outputs": [],
394 | "source": [
395 | "# Assume we are now in December 2017\n",
396 | "loan_data['issue_d_date'] = pd.to_datetime(loan_data['issue_d'], format = '%b-%y')\n",
397 | "# Extracts the date and the time from a string variable that is in a given format.\n",
398 | "loan_data['mths_since_issue_d'] = round(pd.to_numeric((pd.to_datetime('2017-12-01') - loan_data['issue_d_date']) / np.timedelta64(1, 'M')))\n",
399 | "# We calculate the difference between two dates in months, turn it to numeric datatype and round it.\n",
400 | "# We save the result in a new variable.\n",
401 | "loan_data['mths_since_issue_d'].describe()\n",
402 | "# Shows some descriptive statisics for the values of a column."
403 | ]
404 | }
405 | ],
406 | "metadata": {
407 | "kernelspec": {
408 | "display_name": "Python 3",
409 | "language": "python",
410 | "name": "python3"
411 | },
412 | "language_info": {
413 | "codemirror_mode": {
414 | "name": "ipython",
415 | "version": 3
416 | },
417 | "file_extension": ".py",
418 | "mimetype": "text/x-python",
419 | "name": "python",
420 | "nbconvert_exporter": "python",
421 | "pygments_lexer": "ipython3",
422 | "version": "3.7.3"
423 | }
424 | },
425 | "nbformat": 4,
426 | "nbformat_minor": 2
427 | }
428 |
--------------------------------------------------------------------------------
/Section 5 PD Model, Data Preparation/.ipynb_checkpoints/Credit Risk Modeling - Preparation - With Comments - 5-2-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Data Preparation"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "## Import Libraries"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": null,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "import numpy as np\n",
24 | "import pandas as pd"
25 | ]
26 | },
27 | {
28 | "cell_type": "markdown",
29 | "metadata": {},
30 | "source": [
31 | "## Import Data\n",
32 | "The dataset contains all available data for more than 800,000 consumer loans issued from 2007 to 2015 by Lending Club: a large US peer-to-peer lending company. There are several different versions of this dataset. We have used a version available on kaggle.com. You can find it here: https://www.kaggle.com/wendykan/lending-club-loan-data/version/1\n",
33 | "We divided the data into two periods because we assume that some data are available at the moment when we need to build Expected Loss models, and some data comes from applications after. Later, we investigate whether the applications we have after we built the Probability of Default (PD) model have similar characteristics with the applications we used to build the PD model."
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": null,
39 | "metadata": {
40 | "scrolled": true
41 | },
42 | "outputs": [],
43 | "source": [
44 | "loan_data_backup = pd.read_csv('loan_data_2007_2014.csv')"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": null,
50 | "metadata": {},
51 | "outputs": [],
52 | "source": [
53 | "loan_data = loan_data_backup.copy()"
54 | ]
55 | },
56 | {
57 | "cell_type": "markdown",
58 | "metadata": {},
59 | "source": [
60 | "## Explore Data"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": null,
66 | "metadata": {},
67 | "outputs": [],
68 | "source": [
69 | "loan_data"
70 | ]
71 | },
72 | {
73 | "cell_type": "code",
74 | "execution_count": null,
75 | "metadata": {},
76 | "outputs": [],
77 | "source": [
78 | "pd.options.display.max_columns = None\n",
79 | "#pd.options.display.max_rows = None\n",
80 | "# Sets the pandas dataframe options to display all columns/ rows."
81 | ]
82 | },
83 | {
84 | "cell_type": "code",
85 | "execution_count": null,
86 | "metadata": {},
87 | "outputs": [],
88 | "source": [
89 | "loan_data"
90 | ]
91 | },
92 | {
93 | "cell_type": "code",
94 | "execution_count": null,
95 | "metadata": {
96 | "scrolled": true
97 | },
98 | "outputs": [],
99 | "source": [
100 | "loan_data.head()"
101 | ]
102 | },
103 | {
104 | "cell_type": "code",
105 | "execution_count": null,
106 | "metadata": {
107 | "scrolled": true
108 | },
109 | "outputs": [],
110 | "source": [
111 | "loan_data.tail()"
112 | ]
113 | },
114 | {
115 | "cell_type": "code",
116 | "execution_count": null,
117 | "metadata": {},
118 | "outputs": [],
119 | "source": [
120 | "loan_data.columns.values\n",
121 | "# Displays all column names."
122 | ]
123 | },
124 | {
125 | "cell_type": "code",
126 | "execution_count": null,
127 | "metadata": {},
128 | "outputs": [],
129 | "source": [
130 | "loan_data.info()\n",
131 | "# Displays column names, complete (non-missing) cases per column, and datatype per column."
132 | ]
133 | },
134 | {
135 | "cell_type": "markdown",
136 | "metadata": {},
137 | "source": [
138 | "## General Preprocessing"
139 | ]
140 | },
141 | {
142 | "cell_type": "markdown",
143 | "metadata": {},
144 | "source": [
145 | "### Preprocessing few continuous variables"
146 | ]
147 | },
148 | {
149 | "cell_type": "code",
150 | "execution_count": null,
151 | "metadata": {},
152 | "outputs": [],
153 | "source": [
154 | "loan_data['emp_length'].unique()\n",
155 | "# Displays unique values of a column."
156 | ]
157 | },
158 | {
159 | "cell_type": "code",
160 | "execution_count": null,
161 | "metadata": {},
162 | "outputs": [],
163 | "source": [
164 | "loan_data['emp_length_int'] = loan_data['emp_length'].str.replace('\\+ years', '')\n",
165 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace('< 1 year', str(0))\n",
166 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace('n/a', str(0))\n",
167 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace(' years', '')\n",
168 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace(' year', '')\n",
169 | "# We store the preprocessed ‘employment length’ variable in a new variable called ‘employment length int’,\n",
170 | "# We assign the new ‘employment length int’ to be equal to the ‘employment length’ variable with the string ‘+ years’\n",
171 | "# replaced with nothing. Next, we replace the whole string ‘less than 1 year’ with the string ‘0’.\n",
172 | "# Then, we replace the ‘n/a’ string with the string ‘0’. Then, we replace the string ‘space years’ with nothing.\n",
173 | "# Finally, we replace the string ‘space year’ with nothing."
174 | ]
175 | },
176 | {
177 | "cell_type": "code",
178 | "execution_count": null,
179 | "metadata": {},
180 | "outputs": [],
181 | "source": [
182 | "type(loan_data['emp_length_int'][0])\n",
183 | "# Checks the datatype of a single element of a column."
184 | ]
185 | },
186 | {
187 | "cell_type": "code",
188 | "execution_count": null,
189 | "metadata": {},
190 | "outputs": [],
191 | "source": [
192 | "loan_data['emp_length_int'] = pd.to_numeric(loan_data['emp_length_int'])\n",
193 | "# Transforms the values to numeric."
194 | ]
195 | },
196 | {
197 | "cell_type": "code",
198 | "execution_count": null,
199 | "metadata": {},
200 | "outputs": [],
201 | "source": [
202 | "type(loan_data['emp_length_int'][0])\n",
203 | "# Checks the datatype of a single element of a column."
204 | ]
205 | },
206 | {
207 | "cell_type": "code",
208 | "execution_count": null,
209 | "metadata": {},
210 | "outputs": [],
211 | "source": [
212 | "loan_data['earliest_cr_line']\n",
213 | "# Displays a column."
214 | ]
215 | },
216 | {
217 | "cell_type": "code",
218 | "execution_count": null,
219 | "metadata": {},
220 | "outputs": [],
221 | "source": [
222 | "loan_data['earliest_cr_line_date'] = pd.to_datetime(loan_data['earliest_cr_line'], format = '%b-%y')\n",
223 | "# Extracts the date and the time from a string variable that is in a given format."
224 | ]
225 | },
226 | {
227 | "cell_type": "code",
228 | "execution_count": null,
229 | "metadata": {
230 | "scrolled": false
231 | },
232 | "outputs": [],
233 | "source": [
234 | "type(loan_data['earliest_cr_line_date'][0])\n",
235 | "# Checks the datatype of a single element of a column."
236 | ]
237 | },
238 | {
239 | "cell_type": "code",
240 | "execution_count": null,
241 | "metadata": {},
242 | "outputs": [],
243 | "source": [
244 | "pd.to_datetime('2017-12-01') - loan_data['earliest_cr_line_date']\n",
245 | "# Calculates the difference between two dates and times."
246 | ]
247 | },
248 | {
249 | "cell_type": "code",
250 | "execution_count": null,
251 | "metadata": {},
252 | "outputs": [],
253 | "source": [
254 | "# Assume we are now in December 2017\n",
255 | "loan_data['mths_since_earliest_cr_line'] = round(pd.to_numeric((pd.to_datetime('2017-12-01') - loan_data['earliest_cr_line_date']) / np.timedelta64(1, 'M')))\n",
256 | "# We calculate the difference between two dates in months, turn it to numeric datatype and round it.\n",
257 | "# We save the result in a new variable."
258 | ]
259 | },
260 | {
261 | "cell_type": "code",
262 | "execution_count": null,
263 | "metadata": {},
264 | "outputs": [],
265 | "source": [
266 | "loan_data['mths_since_earliest_cr_line'].describe()\n",
267 | "# Shows some descriptive statisics for the values of a column.\n",
268 | "# Dates from 1969 and before are not being converted well, i.e., they have become 2069 and similar,\n",
269 | "# and negative differences are being calculated."
270 | ]
271 | },
272 | {
273 | "cell_type": "code",
274 | "execution_count": null,
275 | "metadata": {},
276 | "outputs": [],
277 | "source": [
278 | "loan_data.loc[: , ['earliest_cr_line', 'earliest_cr_line_date', 'mths_since_earliest_cr_line']][loan_data['mths_since_earliest_cr_line'] < 0]\n",
279 | "# We take three columns from the dataframe. Then, we display them only for the rows where a variable has negative value.\n",
280 | "# There are 2303 strange negative values."
281 | ]
282 | },
283 | {
284 | "cell_type": "code",
285 | "execution_count": null,
286 | "metadata": {},
287 | "outputs": [],
288 | "source": [
289 | "loan_data['mths_since_earliest_cr_line'][loan_data['mths_since_earliest_cr_line'] < 0] = loan_data['mths_since_earliest_cr_line'].max()\n",
290 | "# We set the rows that had negative differences to the maximum value."
291 | ]
292 | },
293 | {
294 | "cell_type": "code",
295 | "execution_count": null,
296 | "metadata": {},
297 | "outputs": [],
298 | "source": [
299 | "min(loan_data['mths_since_earliest_cr_line'])\n",
300 | "# Calculates and shows the minimum value of a column."
301 | ]
302 | },
303 | {
304 | "cell_type": "markdown",
305 | "metadata": {},
306 | "source": [
307 | "### Homework"
308 | ]
309 | },
310 | {
311 | "cell_type": "code",
312 | "execution_count": null,
313 | "metadata": {},
314 | "outputs": [],
315 | "source": [
316 | "loan_data['term']"
317 | ]
318 | },
319 | {
320 | "cell_type": "code",
321 | "execution_count": null,
322 | "metadata": {},
323 | "outputs": [],
324 | "source": [
325 | "loan_data['term'].describe()\n",
326 | "# Shows some descriptive statisics for the values of a column."
327 | ]
328 | },
329 | {
330 | "cell_type": "code",
331 | "execution_count": null,
332 | "metadata": {},
333 | "outputs": [],
334 | "source": [
335 | "loan_data['term_int'] = loan_data['term'].str.replace(' months', '')\n",
336 | "# We replace a string with another string, in this case, with an empty strng (i.e. with nothing)."
337 | ]
338 | },
339 | {
340 | "cell_type": "code",
341 | "execution_count": null,
342 | "metadata": {},
343 | "outputs": [],
344 | "source": [
345 | "loan_data['term_int']"
346 | ]
347 | },
348 | {
349 | "cell_type": "code",
350 | "execution_count": null,
351 | "metadata": {},
352 | "outputs": [],
353 | "source": [
354 | "type(loan_data['term_int'][25])\n",
355 | "# Checks the datatype of a single element of a column."
356 | ]
357 | },
358 | {
359 | "cell_type": "code",
360 | "execution_count": null,
361 | "metadata": {},
362 | "outputs": [],
363 | "source": [
364 | "loan_data['term_int'] = pd.to_numeric(loan_data['term'].str.replace(' months', ''))\n",
365 | "# We remplace a string from a variable with another string, in this case, with an empty strng (i.e. with nothing).\n",
366 | "# We turn the result to numeric datatype and save it in another variable.\n",
367 | "loan_data['term_int']"
368 | ]
369 | },
370 | {
371 | "cell_type": "code",
372 | "execution_count": null,
373 | "metadata": {},
374 | "outputs": [],
375 | "source": [
376 | "type(loan_data['term_int'][0])\n",
377 | "# Checks the datatype of a single element of a column."
378 | ]
379 | },
380 | {
381 | "cell_type": "code",
382 | "execution_count": null,
383 | "metadata": {},
384 | "outputs": [],
385 | "source": [
386 | "loan_data['issue_d']"
387 | ]
388 | },
389 | {
390 | "cell_type": "code",
391 | "execution_count": null,
392 | "metadata": {},
393 | "outputs": [],
394 | "source": [
395 | "# Assume we are now in December 2017\n",
396 | "loan_data['issue_d_date'] = pd.to_datetime(loan_data['issue_d'], format = '%b-%y')\n",
397 | "# Extracts the date and the time from a string variable that is in a given format.\n",
398 | "loan_data['mths_since_issue_d'] = round(pd.to_numeric((pd.to_datetime('2017-12-01') - loan_data['issue_d_date']) / np.timedelta64(1, 'M')))\n",
399 | "# We calculate the difference between two dates in months, turn it to numeric datatype and round it.\n",
400 | "# We save the result in a new variable.\n",
401 | "loan_data['mths_since_issue_d'].describe()\n",
402 | "# Shows some descriptive statisics for the values of a column."
403 | ]
404 | },
405 | {
406 | "cell_type": "markdown",
407 | "metadata": {},
408 | "source": [
409 | "### Preprocessing few discrete variables"
410 | ]
411 | },
412 | {
413 | "cell_type": "code",
414 | "execution_count": null,
415 | "metadata": {},
416 | "outputs": [],
417 | "source": [
418 | "loan_data.info()\n",
419 | "# Displays column names, complete (non-missing) cases per column, and datatype per column."
420 | ]
421 | },
422 | {
423 | "cell_type": "markdown",
424 | "metadata": {},
425 | "source": [
426 | "We are going to preprocess the following discrete variables: grade, sub_grade, home_ownership, verification_status, loan_status, purpose, addr_state, initial_list_status. Most likely, we are not going to use sub_grade, as it overlaps with grade."
427 | ]
428 | },
429 | {
430 | "cell_type": "code",
431 | "execution_count": null,
432 | "metadata": {},
433 | "outputs": [],
434 | "source": [
435 | "pd.get_dummies(loan_data['grade'])\n",
436 | "# Create dummy variables from a variable."
437 | ]
438 | },
439 | {
440 | "cell_type": "code",
441 | "execution_count": null,
442 | "metadata": {},
443 | "outputs": [],
444 | "source": [
445 | "pd.get_dummies(loan_data['grade'], prefix = 'grade', prefix_sep = ':')\n",
446 | "# Create dummy variables from a variable."
447 | ]
448 | },
449 | {
450 | "cell_type": "code",
451 | "execution_count": null,
452 | "metadata": {
453 | "scrolled": true
454 | },
455 | "outputs": [],
456 | "source": [
457 | "loan_data_dummies = [pd.get_dummies(loan_data['grade'], prefix = 'grade', prefix_sep = ':'),\n",
458 | " pd.get_dummies(loan_data['sub_grade'], prefix = 'sub_grade', prefix_sep = ':'),\n",
459 | " pd.get_dummies(loan_data['home_ownership'], prefix = 'home_ownership', prefix_sep = ':'),\n",
460 | " pd.get_dummies(loan_data['verification_status'], prefix = 'verification_status', prefix_sep = ':'),\n",
461 | " pd.get_dummies(loan_data['loan_status'], prefix = 'loan_status', prefix_sep = ':'),\n",
462 | " pd.get_dummies(loan_data['purpose'], prefix = 'purpose', prefix_sep = ':'),\n",
463 | " pd.get_dummies(loan_data['addr_state'], prefix = 'addr_state', prefix_sep = ':'),\n",
464 | " pd.get_dummies(loan_data['initial_list_status'], prefix = 'initial_list_status', prefix_sep = ':')]\n",
465 | "# We create dummy variables from all 8 original independent variables, and save them into a list.\n",
466 | "# Note that we are using a particular naming convention for all variables: original variable name, colon, category name."
467 | ]
468 | },
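{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative check (not part of the original notebook): the dummy columns follow the\n",
"# 'original variable name:category name' convention, e.g. 'grade:A', 'grade:B', and so on.\n",
"loan_data_dummies[0].columns.values"
]
},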
469 | {
470 | "cell_type": "code",
471 | "execution_count": null,
472 | "metadata": {},
473 | "outputs": [],
474 | "source": [
475 | "loan_data_dummies = pd.concat(loan_data_dummies, axis = 1)\n",
476 | "# We concatenate the dummy variables and this turns them into a dataframe."
477 | ]
478 | },
479 | {
480 | "cell_type": "code",
481 | "execution_count": null,
482 | "metadata": {},
483 | "outputs": [],
484 | "source": [
485 | "type(loan_data_dummies)\n",
486 | "# Returns the type of the variable."
487 | ]
488 | },
489 | {
490 | "cell_type": "code",
491 | "execution_count": null,
492 | "metadata": {},
493 | "outputs": [],
494 | "source": [
495 | "loan_data = pd.concat([loan_data, loan_data_dummies], axis = 1)\n",
496 | "# Concatenates two dataframes.\n",
497 | "# Here we concatenate the dataframe with original data with the dataframe with dummy variables, along the columns. "
498 | ]
499 | },
500 | {
501 | "cell_type": "code",
502 | "execution_count": null,
503 | "metadata": {},
504 | "outputs": [],
505 | "source": [
506 | "loan_data.columns.values\n",
507 | "# Displays all column names."
508 | ]
509 | },
510 | {
511 | "cell_type": "markdown",
512 | "metadata": {},
513 | "source": [
514 | "### Check for missing values and clean"
515 | ]
516 | },
517 | {
518 | "cell_type": "code",
519 | "execution_count": null,
520 | "metadata": {
521 | "scrolled": false
522 | },
523 | "outputs": [],
524 | "source": [
525 | "loan_data.isnull()\n",
526 | "# It returns 'False' if a value is not missing and 'True' if a value is missing, for each value in a dataframe."
527 | ]
528 | },
529 | {
530 | "cell_type": "code",
531 | "execution_count": null,
532 | "metadata": {},
533 | "outputs": [],
534 | "source": [
535 | "pd.options.display.max_rows = None\n",
536 | "# Sets the pandas dataframe options to display all columns/ rows.\n",
537 | "loan_data.isnull().sum()"
538 | ]
539 | },
540 | {
541 | "cell_type": "code",
542 | "execution_count": null,
543 | "metadata": {},
544 | "outputs": [],
545 | "source": [
546 | "pd.options.display.max_rows = 100\n",
547 | "# Sets the pandas dataframe options to display 100 columns/ rows."
548 | ]
549 | },
550 | {
551 | "cell_type": "code",
552 | "execution_count": null,
553 | "metadata": {},
554 | "outputs": [],
555 | "source": [
556 | "# 'Total revolving high credit/ credit limit', so it makes sense that the missing values are equal to funded_amnt.\n",
557 | "loan_data['total_rev_hi_lim'].fillna(loan_data['funded_amnt'], inplace=True)\n",
558 | "# We fill the missing values with the values of another variable."
559 | ]
560 | },
561 | {
562 | "cell_type": "code",
563 | "execution_count": null,
564 | "metadata": {},
565 | "outputs": [],
566 | "source": [
567 | "loan_data['total_rev_hi_lim'].isnull().sum()"
568 | ]
569 | },
570 | {
571 | "cell_type": "markdown",
572 | "metadata": {},
573 | "source": [
574 | "### Homework"
575 | ]
576 | },
577 | {
578 | "cell_type": "code",
579 | "execution_count": null,
580 | "metadata": {},
581 | "outputs": [],
582 | "source": [
583 | "loan_data['annual_inc'].fillna(loan_data['annual_inc'].mean(), inplace=True)\n",
584 | "# We fill the missing values with the mean value of the non-missing values."
585 | ]
586 | },
587 | {
588 | "cell_type": "code",
589 | "execution_count": null,
590 | "metadata": {},
591 | "outputs": [],
592 | "source": [
593 | "loan_data['mths_since_earliest_cr_line'].fillna(0, inplace=True)\n",
594 | "loan_data['acc_now_delinq'].fillna(0, inplace=True)\n",
595 | "loan_data['total_acc'].fillna(0, inplace=True)\n",
596 | "loan_data['pub_rec'].fillna(0, inplace=True)\n",
597 | "loan_data['open_acc'].fillna(0, inplace=True)\n",
598 | "loan_data['inq_last_6mths'].fillna(0, inplace=True)\n",
599 | "loan_data['delinq_2yrs'].fillna(0, inplace=True)\n",
600 | "loan_data['emp_length_int'].fillna(0, inplace=True)\n",
601 | "# We fill the missing values with zeroes."
602 | ]
603 | },
604 | {
605 | "cell_type": "markdown",
606 | "metadata": {},
607 | "source": [
608 | "# PD model"
609 | ]
610 | },
611 | {
612 | "cell_type": "markdown",
613 | "metadata": {},
614 | "source": [
615 | "## Data preparation"
616 | ]
617 | },
618 | {
619 | "cell_type": "markdown",
620 | "metadata": {},
621 | "source": [
622 | "### Dependent Variable. Good/ Bad (Default) Definition. Default and Non-default Accounts."
623 | ]
624 | },
625 | {
626 | "cell_type": "code",
627 | "execution_count": null,
628 | "metadata": {},
629 | "outputs": [],
630 | "source": [
631 | "loan_data['loan_status'].unique()\n",
632 | "# Displays unique values of a column."
633 | ]
634 | },
635 | {
636 | "cell_type": "code",
637 | "execution_count": null,
638 | "metadata": {
639 | "scrolled": true
640 | },
641 | "outputs": [],
642 | "source": [
643 | "loan_data['loan_status'].value_counts()\n",
644 | "# Calculates the number of observations for each unique value of a variable."
645 | ]
646 | },
647 | {
648 | "cell_type": "code",
649 | "execution_count": null,
650 | "metadata": {},
651 | "outputs": [],
652 | "source": [
653 | "loan_data['loan_status'].value_counts() / loan_data['loan_status'].count()\n",
654 | "# We divide the number of observations for each unique value of a variable by the total number of observations.\n",
655 | "# Thus, we get the proportion of observations for each unique value of a variable."
656 | ]
657 | },
658 | {
659 | "cell_type": "code",
660 | "execution_count": null,
661 | "metadata": {
662 | "scrolled": true
663 | },
664 | "outputs": [],
665 | "source": [
666 | "# Good/ Bad Definition\n",
667 | "loan_data['good_bad'] = np.where(loan_data['loan_status'].isin(['Charged Off', 'Default',\n",
668 | " 'Does not meet the credit policy. Status:Charged Off',\n",
669 | " 'Late (31-120 days)']), 0, 1)\n",
670 | "# We create a new variable that has the value of '0' if a condition is met, and the value of '1' if it is not met."
671 | ]
672 | },
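{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative check (not part of the original notebook): the proportion of good (1) and bad (0) loans\n",
"# implied by the definition above. 'normalize = True' returns proportions instead of counts.\n",
"loan_data['good_bad'].value_counts(normalize = True)"
]
},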
673 | {
674 | "cell_type": "code",
675 | "execution_count": null,
676 | "metadata": {},
677 | "outputs": [],
678 | "source": [
679 | "loan_data['good_bad']"
680 | ]
681 | }
682 | ],
683 | "metadata": {
684 | "kernelspec": {
685 | "display_name": "Python 3",
686 | "language": "python",
687 | "name": "python3"
688 | },
689 | "language_info": {
690 | "codemirror_mode": {
691 | "name": "ipython",
692 | "version": 3
693 | },
694 | "file_extension": ".py",
695 | "mimetype": "text/x-python",
696 | "name": "python",
697 | "nbconvert_exporter": "python",
698 | "pygments_lexer": "ipython3",
699 | "version": "3.7.4"
700 | }
701 | },
702 | "nbformat": 4,
703 | "nbformat_minor": 2
704 | }
705 |
--------------------------------------------------------------------------------
/Section 5 PD Model, Data Preparation/Credit Risk Modeling - Preparation - 5-2.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Data Preparation"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "## Import Libraries"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": null,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "import numpy as np\n",
24 | "import pandas as pd"
25 | ]
26 | },
27 | {
28 | "cell_type": "markdown",
29 | "metadata": {},
30 | "source": [
31 | "## Import Data"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": null,
37 | "metadata": {},
38 | "outputs": [],
39 | "source": [
40 | "loan_data_backup = pd.read_csv('loan_data_2007_2014.csv')"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": null,
46 | "metadata": {},
47 | "outputs": [],
48 | "source": [
49 | "loan_data = loan_data_backup.copy()"
50 | ]
51 | },
52 | {
53 | "cell_type": "markdown",
54 | "metadata": {},
55 | "source": [
56 | "## Explore Data"
57 | ]
58 | },
59 | {
60 | "cell_type": "code",
61 | "execution_count": null,
62 | "metadata": {},
63 | "outputs": [],
64 | "source": [
65 | "loan_data"
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": null,
71 | "metadata": {},
72 | "outputs": [],
73 | "source": [
74 | "pd.options.display.max_columns = None"
75 | ]
76 | },
77 | {
78 | "cell_type": "code",
79 | "execution_count": null,
80 | "metadata": {},
81 | "outputs": [],
82 | "source": [
83 | "loan_data"
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": null,
89 | "metadata": {},
90 | "outputs": [],
91 | "source": [
92 | "loan_data.head()"
93 | ]
94 | },
95 | {
96 | "cell_type": "code",
97 | "execution_count": null,
98 | "metadata": {},
99 | "outputs": [],
100 | "source": [
101 | "loan_data.tail()"
102 | ]
103 | },
104 | {
105 | "cell_type": "code",
106 | "execution_count": null,
107 | "metadata": {},
108 | "outputs": [],
109 | "source": [
110 | "loan_data.columns.values"
111 | ]
112 | },
113 | {
114 | "cell_type": "code",
115 | "execution_count": null,
116 | "metadata": {
117 | "scrolled": true
118 | },
119 | "outputs": [],
120 | "source": [
121 | "loan_data.info()"
122 | ]
123 | },
124 | {
125 | "cell_type": "markdown",
126 | "metadata": {},
127 | "source": [
128 | "## General Preprocessing"
129 | ]
130 | },
131 | {
132 | "cell_type": "markdown",
133 | "metadata": {},
134 | "source": [
135 | "### Preprocessing few continuous variables"
136 | ]
137 | },
138 | {
139 | "cell_type": "code",
140 | "execution_count": null,
141 | "metadata": {},
142 | "outputs": [],
143 | "source": [
144 | "loan_data['emp_length'].unique()"
145 | ]
146 | },
147 | {
148 | "cell_type": "code",
149 | "execution_count": null,
150 | "metadata": {},
151 | "outputs": [],
152 | "source": [
153 | "loan_data['emp_length_int'] = loan_data['emp_length'].str.replace('\\+ years', '')\n",
154 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace('< 1 year', str(0))\n",
155 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace('n/a', str(0))\n",
156 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace(' years', '')\n",
157 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace(' year', '')"
158 | ]
159 | },
160 | {
161 | "cell_type": "code",
162 | "execution_count": null,
163 | "metadata": {},
164 | "outputs": [],
165 | "source": [
166 | "type(loan_data['emp_length_int'][0])"
167 | ]
168 | },
169 | {
170 | "cell_type": "code",
171 | "execution_count": null,
172 | "metadata": {},
173 | "outputs": [],
174 | "source": [
175 | "loan_data['emp_length_int'] = pd.to_numeric(loan_data['emp_length_int'])"
176 | ]
177 | },
178 | {
179 | "cell_type": "code",
180 | "execution_count": null,
181 | "metadata": {},
182 | "outputs": [],
183 | "source": [
184 | "type(loan_data['emp_length_int'][0])"
185 | ]
186 | },
187 | {
188 | "cell_type": "code",
189 | "execution_count": null,
190 | "metadata": {
191 | "scrolled": true
192 | },
193 | "outputs": [],
194 | "source": [
195 | "loan_data['earliest_cr_line']"
196 | ]
197 | },
198 | {
199 | "cell_type": "code",
200 | "execution_count": null,
201 | "metadata": {},
202 | "outputs": [],
203 | "source": [
204 | "loan_data['earliest_cr_line_date'] = pd.to_datetime(loan_data['earliest_cr_line'], format = '%b-%y')"
205 | ]
206 | },
207 | {
208 | "cell_type": "code",
209 | "execution_count": null,
210 | "metadata": {},
211 | "outputs": [],
212 | "source": [
213 | "type(loan_data['earliest_cr_line_date'][0])"
214 | ]
215 | },
216 | {
217 | "cell_type": "code",
218 | "execution_count": null,
219 | "metadata": {
220 | "scrolled": true
221 | },
222 | "outputs": [],
223 | "source": [
224 | "pd.to_datetime('2017-12-01') - loan_data['earliest_cr_line_date']"
225 | ]
226 | },
227 | {
228 | "cell_type": "code",
229 | "execution_count": null,
230 | "metadata": {},
231 | "outputs": [],
232 | "source": [
233 | "loan_data['mths_since_earliest_cr_line'] = round(pd.to_numeric((pd.to_datetime('2017-12-01') - loan_data['earliest_cr_line_date']) / np.timedelta64(1, 'M')))"
234 | ]
235 | },
236 | {
237 | "cell_type": "code",
238 | "execution_count": null,
239 | "metadata": {},
240 | "outputs": [],
241 | "source": [
242 | "loan_data['mths_since_earliest_cr_line'].describe()"
243 | ]
244 | },
245 | {
246 | "cell_type": "code",
247 | "execution_count": null,
248 | "metadata": {
249 | "scrolled": true
250 | },
251 | "outputs": [],
252 | "source": [
253 | "loan_data.loc[: , ['earliest_cr_line', 'earliest_cr_line_date', 'mths_since_earliest_cr_line']][loan_data['mths_since_earliest_cr_line'] < 0]"
254 | ]
255 | },
256 | {
257 | "cell_type": "code",
258 | "execution_count": null,
259 | "metadata": {},
260 | "outputs": [],
261 | "source": [
262 | "loan_data['mths_since_earliest_cr_line'][loan_data['mths_since_earliest_cr_line'] < 0] = loan_data['mths_since_earliest_cr_line'].max()"
263 | ]
264 | },
265 | {
266 | "cell_type": "code",
267 | "execution_count": null,
268 | "metadata": {},
269 | "outputs": [],
270 | "source": [
271 | "min(loan_data['mths_since_earliest_cr_line'])"
272 | ]
273 | },
274 | {
275 | "cell_type": "markdown",
276 | "metadata": {},
277 | "source": [
278 | "### Homework"
279 | ]
280 | },
281 | {
282 | "cell_type": "code",
283 | "execution_count": null,
284 | "metadata": {
285 | "scrolled": true
286 | },
287 | "outputs": [],
288 | "source": [
289 | "loan_data['term']"
290 | ]
291 | },
292 | {
293 | "cell_type": "code",
294 | "execution_count": null,
295 | "metadata": {},
296 | "outputs": [],
297 | "source": [
298 | "loan_data['term'].describe()"
299 | ]
300 | },
301 | {
302 | "cell_type": "code",
303 | "execution_count": null,
304 | "metadata": {},
305 | "outputs": [],
306 | "source": [
307 | "loan_data['term_int'] = loan_data['term'].str.replace(' months', '')"
308 | ]
309 | },
310 | {
311 | "cell_type": "code",
312 | "execution_count": null,
313 | "metadata": {
314 | "scrolled": true
315 | },
316 | "outputs": [],
317 | "source": [
318 | "loan_data['term_int']"
319 | ]
320 | },
321 | {
322 | "cell_type": "code",
323 | "execution_count": null,
324 | "metadata": {},
325 | "outputs": [],
326 | "source": [
327 | "type(loan_data['term_int'][25])"
328 | ]
329 | },
330 | {
331 | "cell_type": "code",
332 | "execution_count": null,
333 | "metadata": {
334 | "scrolled": true
335 | },
336 | "outputs": [],
337 | "source": [
338 | "loan_data['term_int'] = pd.to_numeric(loan_data['term'].str.replace(' months', ''))\n",
339 | "loan_data['term_int']"
340 | ]
341 | },
342 | {
343 | "cell_type": "code",
344 | "execution_count": null,
345 | "metadata": {},
346 | "outputs": [],
347 | "source": [
348 | "type(loan_data['term_int'][0])"
349 | ]
350 | },
351 | {
352 | "cell_type": "code",
353 | "execution_count": null,
354 | "metadata": {
355 | "scrolled": true
356 | },
357 | "outputs": [],
358 | "source": [
359 | "loan_data['issue_d']"
360 | ]
361 | },
362 | {
363 | "cell_type": "code",
364 | "execution_count": null,
365 | "metadata": {},
366 | "outputs": [],
367 | "source": [
368 | "loan_data['issue_d_date'] = pd.to_datetime(loan_data['issue_d'], format = '%b-%y')\n",
369 | "loan_data['mths_since_issue_d'] = round(pd.to_numeric((pd.to_datetime('2017-12-01') - loan_data['issue_d_date']) / np.timedelta64(1, 'M')))\n",
370 | "loan_data['mths_since_issue_d'].describe()"
371 | ]
372 | },
373 | {
374 | "cell_type": "markdown",
375 | "metadata": {},
376 | "source": [
377 | "### Preprocessing few discrete variables"
378 | ]
379 | },
380 | {
381 | "cell_type": "code",
382 | "execution_count": null,
383 | "metadata": {},
384 | "outputs": [],
385 | "source": [
386 | "loan_data.info()"
387 | ]
388 | },
389 | {
390 | "cell_type": "code",
391 | "execution_count": null,
392 | "metadata": {},
393 | "outputs": [],
394 | "source": [
395 | "pd.get_dummies(loan_data['grade'])"
396 | ]
397 | },
398 | {
399 | "cell_type": "code",
400 | "execution_count": null,
401 | "metadata": {},
402 | "outputs": [],
403 | "source": [
404 | "pd.get_dummies(loan_data['grade'], prefix = 'grade', prefix_sep = ':')"
405 | ]
406 | },
407 | {
408 | "cell_type": "code",
409 | "execution_count": null,
410 | "metadata": {},
411 | "outputs": [],
412 | "source": [
413 | "loan_data_dummies = [pd.get_dummies(loan_data['grade'], prefix = 'grade', prefix_sep = ':'),\n",
414 | " pd.get_dummies(loan_data['sub_grade'], prefix = 'sub_grade', prefix_sep = ':'),\n",
415 | " pd.get_dummies(loan_data['home_ownership'], prefix = 'home_ownership', prefix_sep = ':'),\n",
416 | " pd.get_dummies(loan_data['verification_status'], prefix = 'verification_status', prefix_sep = ':'),\n",
417 | " pd.get_dummies(loan_data['loan_status'], prefix = 'loan_status', prefix_sep = ':'),\n",
418 | " pd.get_dummies(loan_data['purpose'], prefix = 'purpose', prefix_sep = ':'),\n",
419 | " pd.get_dummies(loan_data['addr_state'], prefix = 'addr_state', prefix_sep = ':'),\n",
420 | " pd.get_dummies(loan_data['initial_list_status'], prefix = 'initial_list_status', prefix_sep = ':')]"
421 | ]
422 | },
423 | {
424 | "cell_type": "code",
425 | "execution_count": null,
426 | "metadata": {},
427 | "outputs": [],
428 | "source": [
429 | "loan_data_dummies = pd.concat(loan_data_dummies, axis = 1)"
430 | ]
431 | },
432 | {
433 | "cell_type": "code",
434 | "execution_count": null,
435 | "metadata": {},
436 | "outputs": [],
437 | "source": [
438 | "type(loan_data_dummies)"
439 | ]
440 | },
441 | {
442 | "cell_type": "code",
443 | "execution_count": null,
444 | "metadata": {},
445 | "outputs": [],
446 | "source": [
447 | "loan_data = pd.concat([loan_data, loan_data_dummies], axis = 1)"
448 | ]
449 | },
450 | {
451 | "cell_type": "code",
452 | "execution_count": null,
453 | "metadata": {},
454 | "outputs": [],
455 | "source": [
456 | "loan_data.columns.values"
457 | ]
458 | },
459 | {
460 | "cell_type": "markdown",
461 | "metadata": {},
462 | "source": [
463 | "### Check for missing values and clean"
464 | ]
465 | },
466 | {
467 | "cell_type": "code",
468 | "execution_count": null,
469 | "metadata": {
470 | "scrolled": true
471 | },
472 | "outputs": [],
473 | "source": [
474 | "loan_data.isnull()"
475 | ]
476 | },
477 | {
478 | "cell_type": "code",
479 | "execution_count": null,
480 | "metadata": {},
481 | "outputs": [],
482 | "source": [
483 | "pd.options.display.max_rows = None\n",
484 | "loan_data.isnull().sum()"
485 | ]
486 | },
487 | {
488 | "cell_type": "code",
489 | "execution_count": null,
490 | "metadata": {},
491 | "outputs": [],
492 | "source": [
493 | "pd.options.display.max_rows = 100"
494 | ]
495 | },
496 | {
497 | "cell_type": "code",
498 | "execution_count": null,
499 | "metadata": {},
500 | "outputs": [],
501 | "source": [
502 | "loan_data['total_rev_hi_lim'].fillna(loan_data['funded_amnt'], inplace = True)"
503 | ]
504 | },
505 | {
506 | "cell_type": "code",
507 | "execution_count": null,
508 | "metadata": {},
509 | "outputs": [],
510 | "source": [
511 | "loan_data['total_rev_hi_lim'].isnull().sum()"
512 | ]
513 | },
514 | {
515 | "cell_type": "markdown",
516 | "metadata": {},
517 | "source": [
518 | "### Homework"
519 | ]
520 | },
521 | {
522 | "cell_type": "code",
523 | "execution_count": null,
524 | "metadata": {},
525 | "outputs": [],
526 | "source": [
527 | "loan_data['annual_inc'].fillna(loan_data['annual_inc'].mean(), inplace=True)"
528 | ]
529 | },
530 | {
531 | "cell_type": "code",
532 | "execution_count": null,
533 | "metadata": {},
534 | "outputs": [],
535 | "source": [
536 | "loan_data['mths_since_earliest_cr_line'].fillna(0, inplace=True)\n",
537 | "loan_data['acc_now_delinq'].fillna(0, inplace=True)\n",
538 | "loan_data['total_acc'].fillna(0, inplace=True)\n",
539 | "loan_data['pub_rec'].fillna(0, inplace=True)\n",
540 | "loan_data['open_acc'].fillna(0, inplace=True)\n",
541 | "loan_data['inq_last_6mths'].fillna(0, inplace=True)\n",
542 | "loan_data['delinq_2yrs'].fillna(0, inplace=True)\n",
543 | "loan_data['emp_length_int'].fillna(0, inplace=True)"
544 | ]
545 | },
546 | {
547 | "cell_type": "markdown",
548 | "metadata": {},
549 | "source": [
550 | "# PD model"
551 | ]
552 | },
553 | {
554 | "cell_type": "markdown",
555 | "metadata": {},
556 | "source": [
557 | "## Data preparation"
558 | ]
559 | },
560 | {
561 | "cell_type": "markdown",
562 | "metadata": {},
563 | "source": [
564 | "### Dependent Variable. Good/ Bad (Default) Definition. Default and Non-default Accounts."
565 | ]
566 | },
567 | {
568 | "cell_type": "code",
569 | "execution_count": null,
570 | "metadata": {},
571 | "outputs": [],
572 | "source": [
573 | "loan_data['loan_status'].unique()"
574 | ]
575 | },
576 | {
577 | "cell_type": "code",
578 | "execution_count": null,
579 | "metadata": {},
580 | "outputs": [],
581 | "source": [
582 | "loan_data['loan_status'].value_counts()"
583 | ]
584 | },
585 | {
586 | "cell_type": "code",
587 | "execution_count": null,
588 | "metadata": {},
589 | "outputs": [],
590 | "source": [
591 | "loan_data['loan_status'].value_counts() / loan_data['loan_status'].count()"
592 | ]
593 | },
594 | {
595 | "cell_type": "code",
596 | "execution_count": null,
597 | "metadata": {},
598 | "outputs": [],
599 | "source": [
600 | "loan_data['good_bad'] = np.where(loan_data['loan_status'].isin(['Charged Off', 'Default',\n",
601 | " 'Does not meet the credit policy. Status:Charged Off',\n",
602 | " 'Late (31-120 days)']), 0, 1)"
603 | ]
604 | },
605 | {
606 | "cell_type": "code",
607 | "execution_count": null,
608 | "metadata": {
609 | "scrolled": true
610 | },
611 | "outputs": [],
612 | "source": [
613 | "loan_data['good_bad']"
614 | ]
615 | }
616 | ],
617 | "metadata": {
618 | "kernelspec": {
619 | "display_name": "Python 3",
620 | "language": "python",
621 | "name": "python3"
622 | },
623 | "language_info": {
624 | "codemirror_mode": {
625 | "name": "ipython",
626 | "version": 3
627 | },
628 | "file_extension": ".py",
629 | "mimetype": "text/x-python",
630 | "name": "python",
631 | "nbconvert_exporter": "python",
632 | "pygments_lexer": "ipython3",
633 | "version": "3.7.3"
634 | }
635 | },
636 | "nbformat": 4,
637 | "nbformat_minor": 2
638 | }
639 |
--------------------------------------------------------------------------------
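A minimal sketch (editor's illustration on a toy frame, not part of the repository notebooks): it reproduces the good/bad target logic from the np.where(...isin(...)) cell in the notebook above; the toy values are assumptions, while the status list and the 0 = default / 1 = good coding mirror the original.

    import numpy as np
    import pandas as pd

    # Toy stand-in for loan_data; only the loan_status column matters here.
    toy = pd.DataFrame({'loan_status': ['Fully Paid', 'Charged Off', 'Current',
                                        'Default', 'Late (31-120 days)']})

    # Statuses treated as default, exactly as listed in the notebook.
    bad_statuses = ['Charged Off', 'Default',
                    'Does not meet the credit policy. Status:Charged Off',
                    'Late (31-120 days)']

    # 0 = default (bad), 1 = non-default (good).
    toy['good_bad'] = np.where(toy['loan_status'].isin(bad_statuses), 0, 1)
    print(toy)   # expected good_bad: 1, 0, 1, 0, 0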
/Section 5 PD Model, Data Preparation/Credit Risk Modeling - Preparation - 5-5.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Data Preparation"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "## Import Libraries"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": null,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "import numpy as np\n",
24 | "import pandas as pd"
25 | ]
26 | },
27 | {
28 | "cell_type": "markdown",
29 | "metadata": {},
30 | "source": [
31 | "## Import Data"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": null,
37 | "metadata": {},
38 | "outputs": [],
39 | "source": [
40 | "loan_data_backup = pd.read_csv('loan_data_2007_2014.csv')"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": null,
46 | "metadata": {},
47 | "outputs": [],
48 | "source": [
49 | "loan_data = loan_data_backup.copy()"
50 | ]
51 | },
52 | {
53 | "cell_type": "markdown",
54 | "metadata": {},
55 | "source": [
56 | "## Explore Data"
57 | ]
58 | },
59 | {
60 | "cell_type": "code",
61 | "execution_count": null,
62 | "metadata": {},
63 | "outputs": [],
64 | "source": [
65 | "loan_data"
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": null,
71 | "metadata": {},
72 | "outputs": [],
73 | "source": [
74 | "pd.options.display.max_columns = None"
75 | ]
76 | },
77 | {
78 | "cell_type": "code",
79 | "execution_count": null,
80 | "metadata": {},
81 | "outputs": [],
82 | "source": [
83 | "loan_data"
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": null,
89 | "metadata": {},
90 | "outputs": [],
91 | "source": [
92 | "loan_data.head()"
93 | ]
94 | },
95 | {
96 | "cell_type": "code",
97 | "execution_count": null,
98 | "metadata": {},
99 | "outputs": [],
100 | "source": [
101 | "loan_data.tail()"
102 | ]
103 | },
104 | {
105 | "cell_type": "code",
106 | "execution_count": null,
107 | "metadata": {},
108 | "outputs": [],
109 | "source": [
110 | "loan_data.columns.values"
111 | ]
112 | },
113 | {
114 | "cell_type": "code",
115 | "execution_count": null,
116 | "metadata": {
117 | "scrolled": true
118 | },
119 | "outputs": [],
120 | "source": [
121 | "loan_data.info()"
122 | ]
123 | },
124 | {
125 | "cell_type": "markdown",
126 | "metadata": {},
127 | "source": [
128 | "## General Preprocessing"
129 | ]
130 | },
131 | {
132 | "cell_type": "markdown",
133 | "metadata": {},
134 | "source": [
135 | "### Preprocessing few continuous variables"
136 | ]
137 | },
138 | {
139 | "cell_type": "code",
140 | "execution_count": null,
141 | "metadata": {},
142 | "outputs": [],
143 | "source": [
144 | "loan_data['emp_length'].unique()"
145 | ]
146 | },
147 | {
148 | "cell_type": "code",
149 | "execution_count": null,
150 | "metadata": {},
151 | "outputs": [],
152 | "source": [
153 | "loan_data['emp_length_int'] = loan_data['emp_length'].str.replace('\\+ years', '')\n",
154 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace('< 1 year', str(0))\n",
155 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace('n/a', str(0))\n",
156 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace(' years', '')\n",
157 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace(' year', '')"
158 | ]
159 | },
160 | {
161 | "cell_type": "code",
162 | "execution_count": null,
163 | "metadata": {},
164 | "outputs": [],
165 | "source": [
166 | "type(loan_data['emp_length_int'][0])"
167 | ]
168 | },
169 | {
170 | "cell_type": "code",
171 | "execution_count": null,
172 | "metadata": {},
173 | "outputs": [],
174 | "source": [
175 | "loan_data['emp_length_int'] = pd.to_numeric(loan_data['emp_length_int'])"
176 | ]
177 | },
178 | {
179 | "cell_type": "code",
180 | "execution_count": null,
181 | "metadata": {},
182 | "outputs": [],
183 | "source": [
184 | "type(loan_data['emp_length_int'][0])"
185 | ]
186 | },
187 | {
188 | "cell_type": "code",
189 | "execution_count": null,
190 | "metadata": {
191 | "scrolled": true
192 | },
193 | "outputs": [],
194 | "source": [
195 | "loan_data['earliest_cr_line']"
196 | ]
197 | },
198 | {
199 | "cell_type": "code",
200 | "execution_count": null,
201 | "metadata": {},
202 | "outputs": [],
203 | "source": [
204 | "loan_data['earliest_cr_line_date'] = pd.to_datetime(loan_data['earliest_cr_line'], format = '%b-%y')"
205 | ]
206 | },
207 | {
208 | "cell_type": "code",
209 | "execution_count": null,
210 | "metadata": {},
211 | "outputs": [],
212 | "source": [
213 | "type(loan_data['earliest_cr_line_date'][0])"
214 | ]
215 | },
216 | {
217 | "cell_type": "code",
218 | "execution_count": null,
219 | "metadata": {
220 | "scrolled": true
221 | },
222 | "outputs": [],
223 | "source": [
224 | "pd.to_datetime('2017-12-01') - loan_data['earliest_cr_line_date']"
225 | ]
226 | },
227 | {
228 | "cell_type": "code",
229 | "execution_count": null,
230 | "metadata": {},
231 | "outputs": [],
232 | "source": [
233 | "loan_data['mths_since_earliest_cr_line'] = round(pd.to_numeric((pd.to_datetime('2017-12-01') - loan_data['earliest_cr_line_date']) / np.timedelta64(1, 'M')))"
234 | ]
235 | },
236 | {
237 | "cell_type": "code",
238 | "execution_count": null,
239 | "metadata": {},
240 | "outputs": [],
241 | "source": [
242 | "loan_data['mths_since_earliest_cr_line'].describe()"
243 | ]
244 | },
245 | {
246 | "cell_type": "code",
247 | "execution_count": null,
248 | "metadata": {
249 | "scrolled": true
250 | },
251 | "outputs": [],
252 | "source": [
253 | "loan_data.loc[: , ['earliest_cr_line', 'earliest_cr_line_date', 'mths_since_earliest_cr_line']][loan_data['mths_since_earliest_cr_line'] < 0]"
254 | ]
255 | },
256 | {
257 | "cell_type": "code",
258 | "execution_count": null,
259 | "metadata": {},
260 | "outputs": [],
261 | "source": [
262 | "loan_data['mths_since_earliest_cr_line'][loan_data['mths_since_earliest_cr_line'] < 0] = loan_data['mths_since_earliest_cr_line'].max()"
263 | ]
264 | },
265 | {
266 | "cell_type": "code",
267 | "execution_count": null,
268 | "metadata": {},
269 | "outputs": [],
270 | "source": [
271 | "min(loan_data['mths_since_earliest_cr_line'])"
272 | ]
273 | },
274 | {
275 | "cell_type": "markdown",
276 | "metadata": {},
277 | "source": [
278 | "### Homework"
279 | ]
280 | },
281 | {
282 | "cell_type": "code",
283 | "execution_count": null,
284 | "metadata": {
285 | "scrolled": true
286 | },
287 | "outputs": [],
288 | "source": [
289 | "loan_data['term']"
290 | ]
291 | },
292 | {
293 | "cell_type": "code",
294 | "execution_count": null,
295 | "metadata": {},
296 | "outputs": [],
297 | "source": [
298 | "loan_data['term'].describe()"
299 | ]
300 | },
301 | {
302 | "cell_type": "code",
303 | "execution_count": null,
304 | "metadata": {},
305 | "outputs": [],
306 | "source": [
307 | "loan_data['term_int'] = loan_data['term'].str.replace(' months', '')"
308 | ]
309 | },
310 | {
311 | "cell_type": "code",
312 | "execution_count": null,
313 | "metadata": {
314 | "scrolled": true
315 | },
316 | "outputs": [],
317 | "source": [
318 | "loan_data['term_int']"
319 | ]
320 | },
321 | {
322 | "cell_type": "code",
323 | "execution_count": null,
324 | "metadata": {},
325 | "outputs": [],
326 | "source": [
327 | "type(loan_data['term_int'][25])"
328 | ]
329 | },
330 | {
331 | "cell_type": "code",
332 | "execution_count": null,
333 | "metadata": {
334 | "scrolled": true
335 | },
336 | "outputs": [],
337 | "source": [
338 | "loan_data['term_int'] = pd.to_numeric(loan_data['term'].str.replace(' months', ''))\n",
339 | "loan_data['term_int']"
340 | ]
341 | },
342 | {
343 | "cell_type": "code",
344 | "execution_count": null,
345 | "metadata": {},
346 | "outputs": [],
347 | "source": [
348 | "type(loan_data['term_int'][0])"
349 | ]
350 | },
351 | {
352 | "cell_type": "code",
353 | "execution_count": null,
354 | "metadata": {
355 | "scrolled": true
356 | },
357 | "outputs": [],
358 | "source": [
359 | "loan_data['issue_d']"
360 | ]
361 | },
362 | {
363 | "cell_type": "code",
364 | "execution_count": null,
365 | "metadata": {},
366 | "outputs": [],
367 | "source": [
368 | "loan_data['issue_d_date'] = pd.to_datetime(loan_data['issue_d'], format = '%b-%y')\n",
369 | "loan_data['mths_since_issue_d'] = round(pd.to_numeric((pd.to_datetime('2017-12-01') - loan_data['issue_d_date']) / np.timedelta64(1, 'M')))\n",
370 | "loan_data['mths_since_issue_d'].describe()"
371 | ]
372 | },
373 | {
374 | "cell_type": "markdown",
375 | "metadata": {},
376 | "source": [
377 | "### Preprocessing few discrete variables"
378 | ]
379 | },
380 | {
381 | "cell_type": "code",
382 | "execution_count": null,
383 | "metadata": {},
384 | "outputs": [],
385 | "source": [
386 | "loan_data.info()"
387 | ]
388 | },
389 | {
390 | "cell_type": "code",
391 | "execution_count": null,
392 | "metadata": {},
393 | "outputs": [],
394 | "source": [
395 | "pd.get_dummies(loan_data['grade'])"
396 | ]
397 | },
398 | {
399 | "cell_type": "code",
400 | "execution_count": null,
401 | "metadata": {},
402 | "outputs": [],
403 | "source": [
404 | "pd.get_dummies(loan_data['grade'], prefix = 'grade', prefix_sep = ':')"
405 | ]
406 | },
407 | {
408 | "cell_type": "code",
409 | "execution_count": null,
410 | "metadata": {},
411 | "outputs": [],
412 | "source": [
413 | "loan_data_dummies = [pd.get_dummies(loan_data['grade'], prefix = 'grade', prefix_sep = ':'),\n",
414 | " pd.get_dummies(loan_data['sub_grade'], prefix = 'sub_grade', prefix_sep = ':'),\n",
415 | " pd.get_dummies(loan_data['home_ownership'], prefix = 'home_ownership', prefix_sep = ':'),\n",
416 | " pd.get_dummies(loan_data['verification_status'], prefix = 'verification_status', prefix_sep = ':'),\n",
417 | " pd.get_dummies(loan_data['loan_status'], prefix = 'loan_status', prefix_sep = ':'),\n",
418 | " pd.get_dummies(loan_data['purpose'], prefix = 'purpose', prefix_sep = ':'),\n",
419 | " pd.get_dummies(loan_data['addr_state'], prefix = 'addr_state', prefix_sep = ':'),\n",
420 | " pd.get_dummies(loan_data['initial_list_status'], prefix = 'initial_list_status', prefix_sep = ':')]"
421 | ]
422 | },
423 | {
424 | "cell_type": "code",
425 | "execution_count": null,
426 | "metadata": {},
427 | "outputs": [],
428 | "source": [
429 | "loan_data_dummies = pd.concat(loan_data_dummies, axis = 1)"
430 | ]
431 | },
432 | {
433 | "cell_type": "code",
434 | "execution_count": null,
435 | "metadata": {},
436 | "outputs": [],
437 | "source": [
438 | "type(loan_data_dummies)"
439 | ]
440 | },
441 | {
442 | "cell_type": "code",
443 | "execution_count": null,
444 | "metadata": {},
445 | "outputs": [],
446 | "source": [
447 | "loan_data = pd.concat([loan_data, loan_data_dummies], axis = 1)"
448 | ]
449 | },
450 | {
451 | "cell_type": "code",
452 | "execution_count": null,
453 | "metadata": {},
454 | "outputs": [],
455 | "source": [
456 | "loan_data.columns.values"
457 | ]
458 | },
459 | {
460 | "cell_type": "markdown",
461 | "metadata": {},
462 | "source": [
463 | "### Check for missing values and clean"
464 | ]
465 | },
466 | {
467 | "cell_type": "code",
468 | "execution_count": null,
469 | "metadata": {
470 | "scrolled": true
471 | },
472 | "outputs": [],
473 | "source": [
474 | "loan_data.isnull()"
475 | ]
476 | },
477 | {
478 | "cell_type": "code",
479 | "execution_count": null,
480 | "metadata": {},
481 | "outputs": [],
482 | "source": [
483 | "pd.options.display.max_rows = None\n",
484 | "loan_data.isnull().sum()"
485 | ]
486 | },
487 | {
488 | "cell_type": "code",
489 | "execution_count": null,
490 | "metadata": {},
491 | "outputs": [],
492 | "source": [
493 | "pd.options.display.max_rows = 100"
494 | ]
495 | },
496 | {
497 | "cell_type": "code",
498 | "execution_count": null,
499 | "metadata": {},
500 | "outputs": [],
501 | "source": [
502 | "loan_data['total_rev_hi_lim'].fillna(loan_data['funded_amnt'], inplace = True)"
503 | ]
504 | },
505 | {
506 | "cell_type": "code",
507 | "execution_count": null,
508 | "metadata": {},
509 | "outputs": [],
510 | "source": [
511 | "loan_data['total_rev_hi_lim'].isnull().sum()"
512 | ]
513 | },
514 | {
515 | "cell_type": "markdown",
516 | "metadata": {},
517 | "source": [
518 | "### Homework"
519 | ]
520 | },
521 | {
522 | "cell_type": "code",
523 | "execution_count": null,
524 | "metadata": {},
525 | "outputs": [],
526 | "source": [
527 | "loan_data['annual_inc'].fillna(loan_data['annual_inc'].mean(), inplace=True)"
528 | ]
529 | },
530 | {
531 | "cell_type": "code",
532 | "execution_count": null,
533 | "metadata": {},
534 | "outputs": [],
535 | "source": [
536 | "loan_data['mths_since_earliest_cr_line'].fillna(0, inplace=True)\n",
537 | "loan_data['acc_now_delinq'].fillna(0, inplace=True)\n",
538 | "loan_data['total_acc'].fillna(0, inplace=True)\n",
539 | "loan_data['pub_rec'].fillna(0, inplace=True)\n",
540 | "loan_data['open_acc'].fillna(0, inplace=True)\n",
541 | "loan_data['inq_last_6mths'].fillna(0, inplace=True)\n",
542 | "loan_data['delinq_2yrs'].fillna(0, inplace=True)\n",
543 | "loan_data['emp_length_int'].fillna(0, inplace=True)"
544 | ]
545 | },
546 | {
547 | "cell_type": "markdown",
548 | "metadata": {},
549 | "source": [
550 | "# PD model"
551 | ]
552 | },
553 | {
554 | "cell_type": "markdown",
555 | "metadata": {},
556 | "source": [
557 | "## Data preparation"
558 | ]
559 | },
560 | {
561 | "cell_type": "markdown",
562 | "metadata": {},
563 | "source": [
564 | "### Dependent Variable. Good/ Bad (Default) Definition. Default and Non-default Accounts."
565 | ]
566 | },
567 | {
568 | "cell_type": "code",
569 | "execution_count": null,
570 | "metadata": {},
571 | "outputs": [],
572 | "source": [
573 | "loan_data['loan_status'].unique()"
574 | ]
575 | },
576 | {
577 | "cell_type": "code",
578 | "execution_count": null,
579 | "metadata": {},
580 | "outputs": [],
581 | "source": [
582 | "loan_data['loan_status'].value_counts()"
583 | ]
584 | },
585 | {
586 | "cell_type": "code",
587 | "execution_count": null,
588 | "metadata": {},
589 | "outputs": [],
590 | "source": [
591 | "loan_data['loan_status'].value_counts() / loan_data['loan_status'].count()"
592 | ]
593 | },
594 | {
595 | "cell_type": "code",
596 | "execution_count": null,
597 | "metadata": {},
598 | "outputs": [],
599 | "source": [
600 | "loan_data['good_bad'] = np.where(loan_data['loan_status'].isin(['Charged Off', 'Default',\n",
601 | " 'Does not meet the credit policy. Status:Charged Off',\n",
602 | " 'Late (31-120 days)']), 0, 1)"
603 | ]
604 | },
605 | {
606 | "cell_type": "code",
607 | "execution_count": null,
608 | "metadata": {
609 | "scrolled": true
610 | },
611 | "outputs": [],
612 | "source": [
613 | "loan_data['good_bad']"
614 | ]
615 | },
616 | {
617 | "cell_type": "markdown",
618 | "metadata": {},
619 | "source": [
620 | "### Splitting Data"
621 | ]
622 | },
623 | {
624 | "cell_type": "code",
625 | "execution_count": null,
626 | "metadata": {},
627 | "outputs": [],
628 | "source": [
629 | "from sklearn.model_selection import train_test_split"
630 | ]
631 | },
632 | {
633 | "cell_type": "code",
634 | "execution_count": null,
635 | "metadata": {},
636 | "outputs": [],
637 | "source": [
638 | "train_test_split(loan_data.drop('good_bad', axis = 1), loan_data['good_bad'])"
639 | ]
640 | },
641 | {
642 | "cell_type": "code",
643 | "execution_count": null,
644 | "metadata": {},
645 | "outputs": [],
646 | "source": [
647 | "loan_data_inputs_train, loan_data_inputs_test, loan_data_targets_train, loan_data_targets_test = train_test_split(loan_data.drop('good_bad', axis = 1), loan_data['good_bad'])"
648 | ]
649 | },
650 | {
651 | "cell_type": "code",
652 | "execution_count": null,
653 | "metadata": {},
654 | "outputs": [],
655 | "source": [
656 | "loan_data_inputs_train.shape"
657 | ]
658 | },
659 | {
660 | "cell_type": "code",
661 | "execution_count": null,
662 | "metadata": {},
663 | "outputs": [],
664 | "source": [
665 | "loan_data_targets_train.shape"
666 | ]
667 | },
668 | {
669 | "cell_type": "code",
670 | "execution_count": null,
671 | "metadata": {},
672 | "outputs": [],
673 | "source": [
674 | "loan_data_inputs_test.shape"
675 | ]
676 | },
677 | {
678 | "cell_type": "code",
679 | "execution_count": null,
680 | "metadata": {},
681 | "outputs": [],
682 | "source": [
683 | "loan_data_targets_test.shape"
684 | ]
685 | },
686 | {
687 | "cell_type": "code",
688 | "execution_count": null,
689 | "metadata": {},
690 | "outputs": [],
691 | "source": [
692 | "loan_data_inputs_train, loan_data_inputs_test, loan_data_targets_train, loan_data_targets_test = train_test_split(loan_data.drop('good_bad', axis = 1), loan_data['good_bad'], test_size = 0.2, random_state = 42)"
693 | ]
694 | },
695 | {
696 | "cell_type": "code",
697 | "execution_count": null,
698 | "metadata": {},
699 | "outputs": [],
700 | "source": [
701 | "loan_data_inputs_train.shape"
702 | ]
703 | },
704 | {
705 | "cell_type": "code",
706 | "execution_count": null,
707 | "metadata": {},
708 | "outputs": [],
709 | "source": [
710 | "loan_data_targets_train.shape"
711 | ]
712 | },
713 | {
714 | "cell_type": "code",
715 | "execution_count": null,
716 | "metadata": {},
717 | "outputs": [],
718 | "source": [
719 | "loan_data_inputs_test.shape"
720 | ]
721 | },
722 | {
723 | "cell_type": "code",
724 | "execution_count": null,
725 | "metadata": {},
726 | "outputs": [],
727 | "source": [
728 | "loan_data_targets_test.shape"
729 | ]
730 | }
731 | ],
732 | "metadata": {
733 | "kernelspec": {
734 | "display_name": "Python 3",
735 | "language": "python",
736 | "name": "python3"
737 | },
738 | "language_info": {
739 | "codemirror_mode": {
740 | "name": "ipython",
741 | "version": 3
742 | },
743 | "file_extension": ".py",
744 | "mimetype": "text/x-python",
745 | "name": "python",
746 | "nbconvert_exporter": "python",
747 | "pygments_lexer": "ipython3",
748 | "version": "3.7.3"
749 | }
750 | },
751 | "nbformat": 4,
752 | "nbformat_minor": 2
753 | }
754 |
--------------------------------------------------------------------------------
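The notebook above ends by splitting inputs and targets 80/20 with train_test_split(test_size = 0.2, random_state = 42). A short, self-contained sketch of that call on hypothetical data follows; the demo frame is an assumption, and the stratified variant at the end is an optional extra the course code does not use, shown only to illustrate keeping the good/bad ratio equal across the two sets.

    import numpy as np
    import pandas as pd
    from sklearn.model_selection import train_test_split

    # Hypothetical stand-in for loan_data: 1,000 rows, roughly 11% bad (good_bad = 0).
    rng = np.random.RandomState(42)
    demo = pd.DataFrame({'feature': rng.randn(1000),
                         'good_bad': (rng.rand(1000) > 0.11).astype(int)})

    # The split used in the notebook: 80% train, 20% test, fixed seed for reproducibility.
    X_train, X_test, y_train, y_test = train_test_split(
        demo.drop('good_bad', axis = 1), demo['good_bad'],
        test_size = 0.2, random_state = 42)

    # Optional variant (not in the notebook): stratify on the target so the default
    # rate is nearly identical in the train and test sets.
    X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
        demo.drop('good_bad', axis = 1), demo['good_bad'],
        test_size = 0.2, random_state = 42, stratify = demo['good_bad'])

    print(y_train.mean(), y_test.mean())      # close, but not forced to match
    print(y_train_s.mean(), y_test_s.mean())  # matched up to rounding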
/Section 5 PD Model, Data Preparation/Credit Risk Modeling - Preparation - 5-6.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Data Preparation"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "## Import Libraries"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": null,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "import numpy as np\n",
24 | "import pandas as pd"
25 | ]
26 | },
27 | {
28 | "cell_type": "markdown",
29 | "metadata": {},
30 | "source": [
31 | "## Import Data"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": null,
37 | "metadata": {},
38 | "outputs": [],
39 | "source": [
40 | "loan_data_backup = pd.read_csv('loan_data_2007_2014.csv')"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": null,
46 | "metadata": {},
47 | "outputs": [],
48 | "source": [
49 | "loan_data = loan_data_backup.copy()"
50 | ]
51 | },
52 | {
53 | "cell_type": "markdown",
54 | "metadata": {},
55 | "source": [
56 | "## Explore Data"
57 | ]
58 | },
59 | {
60 | "cell_type": "code",
61 | "execution_count": null,
62 | "metadata": {},
63 | "outputs": [],
64 | "source": [
65 | "loan_data"
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": null,
71 | "metadata": {},
72 | "outputs": [],
73 | "source": [
74 | "pd.options.display.max_columns = None"
75 | ]
76 | },
77 | {
78 | "cell_type": "code",
79 | "execution_count": null,
80 | "metadata": {},
81 | "outputs": [],
82 | "source": [
83 | "loan_data"
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": null,
89 | "metadata": {},
90 | "outputs": [],
91 | "source": [
92 | "loan_data.head()"
93 | ]
94 | },
95 | {
96 | "cell_type": "code",
97 | "execution_count": null,
98 | "metadata": {},
99 | "outputs": [],
100 | "source": [
101 | "loan_data.tail()"
102 | ]
103 | },
104 | {
105 | "cell_type": "code",
106 | "execution_count": null,
107 | "metadata": {},
108 | "outputs": [],
109 | "source": [
110 | "loan_data.columns.values"
111 | ]
112 | },
113 | {
114 | "cell_type": "code",
115 | "execution_count": null,
116 | "metadata": {
117 | "scrolled": true
118 | },
119 | "outputs": [],
120 | "source": [
121 | "loan_data.info()"
122 | ]
123 | },
124 | {
125 | "cell_type": "markdown",
126 | "metadata": {},
127 | "source": [
128 | "## General Preprocessing"
129 | ]
130 | },
131 | {
132 | "cell_type": "markdown",
133 | "metadata": {},
134 | "source": [
135 | "### Preprocessing few continuous variables"
136 | ]
137 | },
138 | {
139 | "cell_type": "code",
140 | "execution_count": null,
141 | "metadata": {},
142 | "outputs": [],
143 | "source": [
144 | "loan_data['emp_length'].unique()"
145 | ]
146 | },
147 | {
148 | "cell_type": "code",
149 | "execution_count": null,
150 | "metadata": {},
151 | "outputs": [],
152 | "source": [
153 | "loan_data['emp_length_int'] = loan_data['emp_length'].str.replace('\\+ years', '')\n",
154 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace('< 1 year', str(0))\n",
155 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace('n/a', str(0))\n",
156 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace(' years', '')\n",
157 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace(' year', '')"
158 | ]
159 | },
160 | {
161 | "cell_type": "code",
162 | "execution_count": null,
163 | "metadata": {},
164 | "outputs": [],
165 | "source": [
166 | "type(loan_data['emp_length_int'][0])"
167 | ]
168 | },
169 | {
170 | "cell_type": "code",
171 | "execution_count": null,
172 | "metadata": {},
173 | "outputs": [],
174 | "source": [
175 | "loan_data['emp_length_int'] = pd.to_numeric(loan_data['emp_length_int'])"
176 | ]
177 | },
178 | {
179 | "cell_type": "code",
180 | "execution_count": null,
181 | "metadata": {},
182 | "outputs": [],
183 | "source": [
184 | "type(loan_data['emp_length_int'][0])"
185 | ]
186 | },
187 | {
188 | "cell_type": "code",
189 | "execution_count": null,
190 | "metadata": {
191 | "scrolled": true
192 | },
193 | "outputs": [],
194 | "source": [
195 | "loan_data['earliest_cr_line']"
196 | ]
197 | },
198 | {
199 | "cell_type": "code",
200 | "execution_count": null,
201 | "metadata": {},
202 | "outputs": [],
203 | "source": [
204 | "loan_data['earliest_cr_line_date'] = pd.to_datetime(loan_data['earliest_cr_line'], format = '%b-%y')"
205 | ]
206 | },
207 | {
208 | "cell_type": "code",
209 | "execution_count": null,
210 | "metadata": {},
211 | "outputs": [],
212 | "source": [
213 | "type(loan_data['earliest_cr_line_date'][0])"
214 | ]
215 | },
216 | {
217 | "cell_type": "code",
218 | "execution_count": null,
219 | "metadata": {
220 | "scrolled": true
221 | },
222 | "outputs": [],
223 | "source": [
224 | "pd.to_datetime('2017-12-01') - loan_data['earliest_cr_line_date']"
225 | ]
226 | },
227 | {
228 | "cell_type": "code",
229 | "execution_count": null,
230 | "metadata": {},
231 | "outputs": [],
232 | "source": [
233 | "loan_data['mths_since_earliest_cr_line'] = round(pd.to_numeric((pd.to_datetime('2017-12-01') - loan_data['earliest_cr_line_date']) / np.timedelta64(1, 'M')))"
234 | ]
235 | },
236 | {
237 | "cell_type": "code",
238 | "execution_count": null,
239 | "metadata": {},
240 | "outputs": [],
241 | "source": [
242 | "loan_data['mths_since_earliest_cr_line'].describe()"
243 | ]
244 | },
245 | {
246 | "cell_type": "code",
247 | "execution_count": null,
248 | "metadata": {
249 | "scrolled": true
250 | },
251 | "outputs": [],
252 | "source": [
253 | "loan_data.loc[: , ['earliest_cr_line', 'earliest_cr_line_date', 'mths_since_earliest_cr_line']][loan_data['mths_since_earliest_cr_line'] < 0]"
254 | ]
255 | },
256 | {
257 | "cell_type": "code",
258 | "execution_count": null,
259 | "metadata": {},
260 | "outputs": [],
261 | "source": [
262 | "loan_data['mths_since_earliest_cr_line'][loan_data['mths_since_earliest_cr_line'] < 0] = loan_data['mths_since_earliest_cr_line'].max()"
263 | ]
264 | },
265 | {
266 | "cell_type": "code",
267 | "execution_count": null,
268 | "metadata": {},
269 | "outputs": [],
270 | "source": [
271 | "min(loan_data['mths_since_earliest_cr_line'])"
272 | ]
273 | },
274 | {
275 | "cell_type": "markdown",
276 | "metadata": {},
277 | "source": [
278 | "### Homework"
279 | ]
280 | },
281 | {
282 | "cell_type": "code",
283 | "execution_count": null,
284 | "metadata": {
285 | "scrolled": true
286 | },
287 | "outputs": [],
288 | "source": [
289 | "loan_data['term']"
290 | ]
291 | },
292 | {
293 | "cell_type": "code",
294 | "execution_count": null,
295 | "metadata": {},
296 | "outputs": [],
297 | "source": [
298 | "loan_data['term'].describe()"
299 | ]
300 | },
301 | {
302 | "cell_type": "code",
303 | "execution_count": null,
304 | "metadata": {},
305 | "outputs": [],
306 | "source": [
307 | "loan_data['term_int'] = loan_data['term'].str.replace(' months', '')"
308 | ]
309 | },
310 | {
311 | "cell_type": "code",
312 | "execution_count": null,
313 | "metadata": {
314 | "scrolled": true
315 | },
316 | "outputs": [],
317 | "source": [
318 | "loan_data['term_int']"
319 | ]
320 | },
321 | {
322 | "cell_type": "code",
323 | "execution_count": null,
324 | "metadata": {},
325 | "outputs": [],
326 | "source": [
327 | "type(loan_data['term_int'][25])"
328 | ]
329 | },
330 | {
331 | "cell_type": "code",
332 | "execution_count": null,
333 | "metadata": {
334 | "scrolled": true
335 | },
336 | "outputs": [],
337 | "source": [
338 | "loan_data['term_int'] = pd.to_numeric(loan_data['term'].str.replace(' months', ''))\n",
339 | "loan_data['term_int']"
340 | ]
341 | },
342 | {
343 | "cell_type": "code",
344 | "execution_count": null,
345 | "metadata": {},
346 | "outputs": [],
347 | "source": [
348 | "type(loan_data['term_int'][0])"
349 | ]
350 | },
351 | {
352 | "cell_type": "code",
353 | "execution_count": null,
354 | "metadata": {
355 | "scrolled": true
356 | },
357 | "outputs": [],
358 | "source": [
359 | "loan_data['issue_d']"
360 | ]
361 | },
362 | {
363 | "cell_type": "code",
364 | "execution_count": null,
365 | "metadata": {},
366 | "outputs": [],
367 | "source": [
368 | "loan_data['issue_d_date'] = pd.to_datetime(loan_data['issue_d'], format = '%b-%y')\n",
369 | "loan_data['mths_since_issue_d'] = round(pd.to_numeric((pd.to_datetime('2017-12-01') - loan_data['issue_d_date']) / np.timedelta64(1, 'M')))\n",
370 | "loan_data['mths_since_issue_d'].describe()"
371 | ]
372 | },
373 | {
374 | "cell_type": "markdown",
375 | "metadata": {},
376 | "source": [
377 | "### Preprocessing few discrete variables"
378 | ]
379 | },
380 | {
381 | "cell_type": "code",
382 | "execution_count": null,
383 | "metadata": {},
384 | "outputs": [],
385 | "source": [
386 | "loan_data.info()"
387 | ]
388 | },
389 | {
390 | "cell_type": "code",
391 | "execution_count": null,
392 | "metadata": {},
393 | "outputs": [],
394 | "source": [
395 | "pd.get_dummies(loan_data['grade'])"
396 | ]
397 | },
398 | {
399 | "cell_type": "code",
400 | "execution_count": null,
401 | "metadata": {},
402 | "outputs": [],
403 | "source": [
404 | "pd.get_dummies(loan_data['grade'], prefix = 'grade', prefix_sep = ':')"
405 | ]
406 | },
407 | {
408 | "cell_type": "code",
409 | "execution_count": null,
410 | "metadata": {},
411 | "outputs": [],
412 | "source": [
413 | "loan_data_dummies = [pd.get_dummies(loan_data['grade'], prefix = 'grade', prefix_sep = ':'),\n",
414 | " pd.get_dummies(loan_data['sub_grade'], prefix = 'sub_grade', prefix_sep = ':'),\n",
415 | " pd.get_dummies(loan_data['home_ownership'], prefix = 'home_ownership', prefix_sep = ':'),\n",
416 | " pd.get_dummies(loan_data['verification_status'], prefix = 'verification_status', prefix_sep = ':'),\n",
417 | " pd.get_dummies(loan_data['loan_status'], prefix = 'loan_status', prefix_sep = ':'),\n",
418 | " pd.get_dummies(loan_data['purpose'], prefix = 'purpose', prefix_sep = ':'),\n",
419 | " pd.get_dummies(loan_data['addr_state'], prefix = 'addr_state', prefix_sep = ':'),\n",
420 | " pd.get_dummies(loan_data['initial_list_status'], prefix = 'initial_list_status', prefix_sep = ':')]"
421 | ]
422 | },
423 | {
424 | "cell_type": "code",
425 | "execution_count": null,
426 | "metadata": {},
427 | "outputs": [],
428 | "source": [
429 | "loan_data_dummies = pd.concat(loan_data_dummies, axis = 1)"
430 | ]
431 | },
432 | {
433 | "cell_type": "code",
434 | "execution_count": null,
435 | "metadata": {},
436 | "outputs": [],
437 | "source": [
438 | "type(loan_data_dummies)"
439 | ]
440 | },
441 | {
442 | "cell_type": "code",
443 | "execution_count": null,
444 | "metadata": {},
445 | "outputs": [],
446 | "source": [
447 | "loan_data = pd.concat([loan_data, loan_data_dummies], axis = 1)"
448 | ]
449 | },
450 | {
451 | "cell_type": "code",
452 | "execution_count": null,
453 | "metadata": {},
454 | "outputs": [],
455 | "source": [
456 | "loan_data.columns.values"
457 | ]
458 | },
459 | {
460 | "cell_type": "markdown",
461 | "metadata": {},
462 | "source": [
463 | "### Check for missing values and clean"
464 | ]
465 | },
466 | {
467 | "cell_type": "code",
468 | "execution_count": null,
469 | "metadata": {
470 | "scrolled": true
471 | },
472 | "outputs": [],
473 | "source": [
474 | "loan_data.isnull()"
475 | ]
476 | },
477 | {
478 | "cell_type": "code",
479 | "execution_count": null,
480 | "metadata": {},
481 | "outputs": [],
482 | "source": [
483 | "pd.options.display.max_rows = None\n",
484 | "loan_data.isnull().sum()"
485 | ]
486 | },
487 | {
488 | "cell_type": "code",
489 | "execution_count": null,
490 | "metadata": {},
491 | "outputs": [],
492 | "source": [
493 | "pd.options.display.max_rows = 100"
494 | ]
495 | },
496 | {
497 | "cell_type": "code",
498 | "execution_count": null,
499 | "metadata": {},
500 | "outputs": [],
501 | "source": [
502 | "loan_data['total_rev_hi_lim'].fillna(loan_data['funded_amnt'], inplace = True)"
503 | ]
504 | },
505 | {
506 | "cell_type": "code",
507 | "execution_count": null,
508 | "metadata": {},
509 | "outputs": [],
510 | "source": [
511 | "loan_data['total_rev_hi_lim'].isnull().sum()"
512 | ]
513 | },
514 | {
515 | "cell_type": "markdown",
516 | "metadata": {},
517 | "source": [
518 | "### Homework"
519 | ]
520 | },
521 | {
522 | "cell_type": "code",
523 | "execution_count": null,
524 | "metadata": {},
525 | "outputs": [],
526 | "source": [
527 | "loan_data['annual_inc'].fillna(loan_data['annual_inc'].mean(), inplace=True)"
528 | ]
529 | },
530 | {
531 | "cell_type": "code",
532 | "execution_count": null,
533 | "metadata": {},
534 | "outputs": [],
535 | "source": [
536 | "loan_data['mths_since_earliest_cr_line'].fillna(0, inplace=True)\n",
537 | "loan_data['acc_now_delinq'].fillna(0, inplace=True)\n",
538 | "loan_data['total_acc'].fillna(0, inplace=True)\n",
539 | "loan_data['pub_rec'].fillna(0, inplace=True)\n",
540 | "loan_data['open_acc'].fillna(0, inplace=True)\n",
541 | "loan_data['inq_last_6mths'].fillna(0, inplace=True)\n",
542 | "loan_data['delinq_2yrs'].fillna(0, inplace=True)\n",
543 | "loan_data['emp_length_int'].fillna(0, inplace=True)"
544 | ]
545 | },
546 | {
547 | "cell_type": "markdown",
548 | "metadata": {},
549 | "source": [
550 | "# PD model"
551 | ]
552 | },
553 | {
554 | "cell_type": "markdown",
555 | "metadata": {},
556 | "source": [
557 | "## Data preparation"
558 | ]
559 | },
560 | {
561 | "cell_type": "markdown",
562 | "metadata": {},
563 | "source": [
564 | "### Dependent Variable. Good/ Bad (Default) Definition. Default and Non-default Accounts."
565 | ]
566 | },
567 | {
568 | "cell_type": "code",
569 | "execution_count": null,
570 | "metadata": {},
571 | "outputs": [],
572 | "source": [
573 | "loan_data['loan_status'].unique()"
574 | ]
575 | },
576 | {
577 | "cell_type": "code",
578 | "execution_count": null,
579 | "metadata": {},
580 | "outputs": [],
581 | "source": [
582 | "loan_data['loan_status'].value_counts()"
583 | ]
584 | },
585 | {
586 | "cell_type": "code",
587 | "execution_count": null,
588 | "metadata": {},
589 | "outputs": [],
590 | "source": [
591 | "loan_data['loan_status'].value_counts() / loan_data['loan_status'].count()"
592 | ]
593 | },
594 | {
595 | "cell_type": "code",
596 | "execution_count": null,
597 | "metadata": {},
598 | "outputs": [],
599 | "source": [
600 | "loan_data['good_bad'] = np.where(loan_data['loan_status'].isin(['Charged Off', 'Default',\n",
601 | " 'Does not meet the credit policy. Status:Charged Off',\n",
602 | " 'Late (31-120 days)']), 0, 1)"
603 | ]
604 | },
605 | {
606 | "cell_type": "code",
607 | "execution_count": null,
608 | "metadata": {
609 | "scrolled": true
610 | },
611 | "outputs": [],
612 | "source": [
613 | "loan_data['good_bad']"
614 | ]
615 | },
616 | {
617 | "cell_type": "markdown",
618 | "metadata": {},
619 | "source": [
620 | "### Splitting Data"
621 | ]
622 | },
623 | {
624 | "cell_type": "code",
625 | "execution_count": null,
626 | "metadata": {},
627 | "outputs": [],
628 | "source": [
629 | "from sklearn.model_selection import train_test_split"
630 | ]
631 | },
632 | {
633 | "cell_type": "code",
634 | "execution_count": null,
635 | "metadata": {},
636 | "outputs": [],
637 | "source": [
638 | "train_test_split(loan_data.drop('good_bad', axis = 1), loan_data['good_bad'])"
639 | ]
640 | },
641 | {
642 | "cell_type": "code",
643 | "execution_count": null,
644 | "metadata": {},
645 | "outputs": [],
646 | "source": [
647 | "loan_data_inputs_train, loan_data_inputs_test, loan_data_targets_train, loan_data_targets_test = train_test_split(loan_data.drop('good_bad', axis = 1), loan_data['good_bad'])"
648 | ]
649 | },
650 | {
651 | "cell_type": "code",
652 | "execution_count": null,
653 | "metadata": {},
654 | "outputs": [],
655 | "source": [
656 | "loan_data_inputs_train.shape"
657 | ]
658 | },
659 | {
660 | "cell_type": "code",
661 | "execution_count": null,
662 | "metadata": {},
663 | "outputs": [],
664 | "source": [
665 | "loan_data_targets_train.shape"
666 | ]
667 | },
668 | {
669 | "cell_type": "code",
670 | "execution_count": null,
671 | "metadata": {},
672 | "outputs": [],
673 | "source": [
674 | "loan_data_inputs_test.shape"
675 | ]
676 | },
677 | {
678 | "cell_type": "code",
679 | "execution_count": null,
680 | "metadata": {},
681 | "outputs": [],
682 | "source": [
683 | "loan_data_targets_test.shape"
684 | ]
685 | },
686 | {
687 | "cell_type": "code",
688 | "execution_count": null,
689 | "metadata": {},
690 | "outputs": [],
691 | "source": [
692 | "loan_data_inputs_train, loan_data_inputs_test, loan_data_targets_train, loan_data_targets_test = train_test_split(loan_data.drop('good_bad', axis = 1), loan_data['good_bad'], test_size = 0.2, random_state = 42)"
693 | ]
694 | },
695 | {
696 | "cell_type": "code",
697 | "execution_count": null,
698 | "metadata": {},
699 | "outputs": [],
700 | "source": [
701 | "loan_data_inputs_train.shape"
702 | ]
703 | },
704 | {
705 | "cell_type": "code",
706 | "execution_count": null,
707 | "metadata": {},
708 | "outputs": [],
709 | "source": [
710 | "loan_data_targets_train.shape"
711 | ]
712 | },
713 | {
714 | "cell_type": "code",
715 | "execution_count": null,
716 | "metadata": {},
717 | "outputs": [],
718 | "source": [
719 | "loan_data_inputs_test.shape"
720 | ]
721 | },
722 | {
723 | "cell_type": "code",
724 | "execution_count": null,
725 | "metadata": {},
726 | "outputs": [],
727 | "source": [
728 | "loan_data_targets_test.shape"
729 | ]
730 | },
731 | {
732 | "cell_type": "markdown",
733 | "metadata": {},
734 | "source": [
735 | "### Data Preparation: An Example"
736 | ]
737 | },
738 | {
739 | "cell_type": "code",
740 | "execution_count": null,
741 | "metadata": {},
742 | "outputs": [],
743 | "source": [
744 | "df_inputs_prepr = loan_data_inputs_train\n",
745 | "df_targets_prepr = loan_data_targets_train\n",
746 | "#df_inputs_prepr = loan_data_inputs_test\n",
747 | "#df_targets_prepr = loan_data_targets_test"
748 | ]
749 | },
750 | {
751 | "cell_type": "code",
752 | "execution_count": null,
753 | "metadata": {},
754 | "outputs": [],
755 | "source": [
756 | "df_inputs_prepr['grade'].unique()"
757 | ]
758 | },
759 | {
760 | "cell_type": "code",
761 | "execution_count": null,
762 | "metadata": {},
763 | "outputs": [],
764 | "source": [
765 | "df1 = pd.concat([df_inputs_prepr['grade'], df_targets_prepr], axis = 1)\n",
766 | "df1.head()"
767 | ]
768 | },
769 | {
770 | "cell_type": "code",
771 | "execution_count": null,
772 | "metadata": {},
773 | "outputs": [],
774 | "source": [
775 | "df1.groupby(df1.columns.values[0], as_index = False)[df1.columns.values[1]].count()"
776 | ]
777 | },
778 | {
779 | "cell_type": "code",
780 | "execution_count": null,
781 | "metadata": {},
782 | "outputs": [],
783 | "source": [
784 | "df1.groupby(df1.columns.values[0], as_index = False)[df1.columns.values[1]].mean()"
785 | ]
786 | },
787 | {
788 | "cell_type": "code",
789 | "execution_count": null,
790 | "metadata": {},
791 | "outputs": [],
792 | "source": [
793 | "df1 = pd.concat([df1.groupby(df1.columns.values[0], as_index = False)[df1.columns.values[1]].count(),\n",
794 | " df1.groupby(df1.columns.values[0], as_index = False)[df1.columns.values[1]].mean()], axis = 1)"
795 | ]
796 | },
797 | {
798 | "cell_type": "code",
799 | "execution_count": null,
800 | "metadata": {},
801 | "outputs": [],
802 | "source": [
803 | "df1"
804 | ]
805 | },
806 | {
807 | "cell_type": "code",
808 | "execution_count": null,
809 | "metadata": {},
810 | "outputs": [],
811 | "source": [
812 | "df1 = df1.iloc[: , [0, 1, 3]]\n",
813 | "df1"
814 | ]
815 | },
816 | {
817 | "cell_type": "code",
818 | "execution_count": null,
819 | "metadata": {},
820 | "outputs": [],
821 | "source": [
822 | "df1.columns = [df1.columns.values[0], 'n_obs', 'prop_good']\n",
823 | "df1"
824 | ]
825 | },
826 | {
827 | "cell_type": "code",
828 | "execution_count": null,
829 | "metadata": {},
830 | "outputs": [],
831 | "source": [
832 | "df1['prop_n_obs'] = df1['n_obs'] / df1['n_obs'].sum()"
833 | ]
834 | },
835 | {
836 | "cell_type": "code",
837 | "execution_count": null,
838 | "metadata": {},
839 | "outputs": [],
840 | "source": [
841 | "df1"
842 | ]
843 | },
844 | {
845 | "cell_type": "code",
846 | "execution_count": null,
847 | "metadata": {},
848 | "outputs": [],
849 | "source": [
850 | "df1['n_good'] = df1['prop_good'] * df1['n_obs']\n",
851 | "df1['n_bad'] = (1 - df1['prop_good']) * df1['n_obs']\n",
852 | "df1"
853 | ]
854 | },
855 | {
856 | "cell_type": "code",
857 | "execution_count": null,
858 | "metadata": {},
859 | "outputs": [],
860 | "source": [
861 | "df1['prop_n_good'] = df1['n_good'] / df1['n_good'].sum()\n",
862 | "df1['prop_n_bad'] = df1['n_bad'] / df1['n_bad'].sum()\n",
863 | "df1"
864 | ]
865 | },
866 | {
867 | "cell_type": "code",
868 | "execution_count": null,
869 | "metadata": {},
870 | "outputs": [],
871 | "source": [
872 | "df1['WoE'] = np.log(df1['prop_n_good'] / df1['prop_n_bad'])\n",
873 | "df1"
874 | ]
875 | },
876 | {
877 | "cell_type": "code",
878 | "execution_count": null,
879 | "metadata": {},
880 | "outputs": [],
881 | "source": [
882 | "df1 = df1.sort_values(['WoE'])\n",
883 | "df1 = df1.reset_index(drop = True)\n",
884 | "df1"
885 | ]
886 | },
887 | {
888 | "cell_type": "code",
889 | "execution_count": null,
890 | "metadata": {},
891 | "outputs": [],
892 | "source": [
893 | "df1['diff_prop_good'] = df1['prop_good'].diff().abs()\n",
894 | "df1['diff_WoE'] = df1['WoE'].diff().abs()\n",
895 | "df1"
896 | ]
897 | },
898 | {
899 | "cell_type": "code",
900 | "execution_count": null,
901 | "metadata": {},
902 | "outputs": [],
903 | "source": [
904 | "df1['IV'] = (df1['prop_n_good'] - df1['prop_n_bad']) * df1['WoE']\n",
905 | "df1['IV'] = df1['IV'].sum()\n",
906 | "df1"
907 | ]
908 | }
909 | ],
910 | "metadata": {
911 | "kernelspec": {
912 | "display_name": "Python 3",
913 | "language": "python",
914 | "name": "python3"
915 | },
916 | "language_info": {
917 | "codemirror_mode": {
918 | "name": "ipython",
919 | "version": 3
920 | },
921 | "file_extension": ".py",
922 | "mimetype": "text/x-python",
923 | "name": "python",
924 | "nbconvert_exporter": "python",
925 | "pygments_lexer": "ipython3",
926 | "version": "3.7.3"
927 | }
928 | },
929 | "nbformat": 4,
930 | "nbformat_minor": 2
931 | }
932 |
--------------------------------------------------------------------------------
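The final cells of the notebook above build the Weight of Evidence and Information Value table for grade step by step: WoE = ln(prop_n_good / prop_n_bad) per category and IV = sum((prop_n_good - prop_n_bad) * WoE) over all categories. A compact sketch of the same computation folded into one function follows; the function name woe_iv and the toy series are the editor's assumptions, while the formulas mirror the notebook.

    import numpy as np
    import pandas as pd

    def woe_iv(categories, target):
        # categories: a categorical Series (e.g. grade); target: 1 = good, 0 = bad.
        df = pd.DataFrame({'cat': categories, 'good': target})
        grp = (df.groupby('cat')['good'].agg(['count', 'mean'])
                 .rename(columns = {'count': 'n_obs', 'mean': 'prop_good'}))
        grp['n_good'] = grp['prop_good'] * grp['n_obs']
        grp['n_bad'] = (1 - grp['prop_good']) * grp['n_obs']
        grp['prop_n_good'] = grp['n_good'] / grp['n_good'].sum()
        grp['prop_n_bad'] = grp['n_bad'] / grp['n_bad'].sum()
        # Weight of Evidence per category; a category with zero bads (or goods) gives +/- inf.
        grp['WoE'] = np.log(grp['prop_n_good'] / grp['prop_n_bad'])
        # Information Value of the variable as a whole.
        iv = ((grp['prop_n_good'] - grp['prop_n_bad']) * grp['WoE']).sum()
        return grp.sort_values('WoE'), iv

    # Toy example with three grades, each containing at least one good and one bad.
    toy_grade = pd.Series(list('AAABBBBCCC'))
    toy_target = pd.Series([1, 1, 0, 1, 0, 1, 1, 0, 0, 1])
    table, iv = woe_iv(toy_grade, toy_target)
    print(table)
    print('IV =', iv)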
/Section 5 PD Model, Data Preparation/Credit Risk Modeling - Preparation - 5-7.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Data Preparation"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "## Import Libraries"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": null,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "import numpy as np\n",
24 | "import pandas as pd"
25 | ]
26 | },
27 | {
28 | "cell_type": "markdown",
29 | "metadata": {},
30 | "source": [
31 | "## Import Data"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": null,
37 | "metadata": {},
38 | "outputs": [],
39 | "source": [
40 | "loan_data_backup = pd.read_csv('loan_data_2007_2014.csv')"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": null,
46 | "metadata": {},
47 | "outputs": [],
48 | "source": [
49 | "loan_data = loan_data_backup.copy()"
50 | ]
51 | },
52 | {
53 | "cell_type": "markdown",
54 | "metadata": {},
55 | "source": [
56 | "## Explore Data"
57 | ]
58 | },
59 | {
60 | "cell_type": "code",
61 | "execution_count": null,
62 | "metadata": {},
63 | "outputs": [],
64 | "source": [
65 | "loan_data"
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": null,
71 | "metadata": {},
72 | "outputs": [],
73 | "source": [
74 | "pd.options.display.max_columns = None"
75 | ]
76 | },
77 | {
78 | "cell_type": "code",
79 | "execution_count": null,
80 | "metadata": {},
81 | "outputs": [],
82 | "source": [
83 | "loan_data"
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": null,
89 | "metadata": {},
90 | "outputs": [],
91 | "source": [
92 | "loan_data.head()"
93 | ]
94 | },
95 | {
96 | "cell_type": "code",
97 | "execution_count": null,
98 | "metadata": {},
99 | "outputs": [],
100 | "source": [
101 | "loan_data.tail()"
102 | ]
103 | },
104 | {
105 | "cell_type": "code",
106 | "execution_count": null,
107 | "metadata": {},
108 | "outputs": [],
109 | "source": [
110 | "loan_data.columns.values"
111 | ]
112 | },
113 | {
114 | "cell_type": "code",
115 | "execution_count": null,
116 | "metadata": {
117 | "scrolled": true
118 | },
119 | "outputs": [],
120 | "source": [
121 | "loan_data.info()"
122 | ]
123 | },
124 | {
125 | "cell_type": "markdown",
126 | "metadata": {},
127 | "source": [
128 | "## General Preprocessing"
129 | ]
130 | },
131 | {
132 | "cell_type": "markdown",
133 | "metadata": {},
134 | "source": [
135 | "### Preprocessing few continuous variables"
136 | ]
137 | },
138 | {
139 | "cell_type": "code",
140 | "execution_count": null,
141 | "metadata": {},
142 | "outputs": [],
143 | "source": [
144 | "loan_data['emp_length'].unique()"
145 | ]
146 | },
147 | {
148 | "cell_type": "code",
149 | "execution_count": null,
150 | "metadata": {},
151 | "outputs": [],
152 | "source": [
153 | "loan_data['emp_length_int'] = loan_data['emp_length'].str.replace('\\+ years', '')\n",
154 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace('< 1 year', str(0))\n",
155 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace('n/a', str(0))\n",
156 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace(' years', '')\n",
157 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace(' year', '')"
158 | ]
159 | },
160 | {
161 | "cell_type": "code",
162 | "execution_count": null,
163 | "metadata": {},
164 | "outputs": [],
165 | "source": [
166 | "type(loan_data['emp_length_int'][0])"
167 | ]
168 | },
169 | {
170 | "cell_type": "code",
171 | "execution_count": null,
172 | "metadata": {},
173 | "outputs": [],
174 | "source": [
175 | "loan_data['emp_length_int'] = pd.to_numeric(loan_data['emp_length_int'])"
176 | ]
177 | },
178 | {
179 | "cell_type": "code",
180 | "execution_count": null,
181 | "metadata": {},
182 | "outputs": [],
183 | "source": [
184 | "type(loan_data['emp_length_int'][0])"
185 | ]
186 | },
187 | {
188 | "cell_type": "code",
189 | "execution_count": null,
190 | "metadata": {
191 | "scrolled": true
192 | },
193 | "outputs": [],
194 | "source": [
195 | "loan_data['earliest_cr_line']"
196 | ]
197 | },
198 | {
199 | "cell_type": "code",
200 | "execution_count": null,
201 | "metadata": {},
202 | "outputs": [],
203 | "source": [
204 | "loan_data['earliest_cr_line_date'] = pd.to_datetime(loan_data['earliest_cr_line'], format = '%b-%y')"
205 | ]
206 | },
207 | {
208 | "cell_type": "code",
209 | "execution_count": null,
210 | "metadata": {},
211 | "outputs": [],
212 | "source": [
213 | "type(loan_data['earliest_cr_line_date'][0])"
214 | ]
215 | },
216 | {
217 | "cell_type": "code",
218 | "execution_count": null,
219 | "metadata": {
220 | "scrolled": true
221 | },
222 | "outputs": [],
223 | "source": [
224 | "pd.to_datetime('2017-12-01') - loan_data['earliest_cr_line_date']"
225 | ]
226 | },
227 | {
228 | "cell_type": "code",
229 | "execution_count": null,
230 | "metadata": {},
231 | "outputs": [],
232 | "source": [
233 | "loan_data['mths_since_earliest_cr_line'] = round(pd.to_numeric((pd.to_datetime('2017-12-01') - loan_data['earliest_cr_line_date']) / np.timedelta64(1, 'M')))"
234 | ]
235 | },
236 | {
237 | "cell_type": "code",
238 | "execution_count": null,
239 | "metadata": {},
240 | "outputs": [],
241 | "source": [
242 | "loan_data['mths_since_earliest_cr_line'].describe()"
243 | ]
244 | },
245 | {
246 | "cell_type": "code",
247 | "execution_count": null,
248 | "metadata": {
249 | "scrolled": true
250 | },
251 | "outputs": [],
252 | "source": [
253 | "loan_data.loc[: , ['earliest_cr_line', 'earliest_cr_line_date', 'mths_since_earliest_cr_line']][loan_data['mths_since_earliest_cr_line'] < 0]"
254 | ]
255 | },
256 | {
257 | "cell_type": "code",
258 | "execution_count": null,
259 | "metadata": {},
260 | "outputs": [],
261 | "source": [
262 | "loan_data['mths_since_earliest_cr_line'][loan_data['mths_since_earliest_cr_line'] < 0] = loan_data['mths_since_earliest_cr_line'].max()"
263 | ]
264 | },
265 | {
266 | "cell_type": "code",
267 | "execution_count": null,
268 | "metadata": {},
269 | "outputs": [],
270 | "source": [
271 | "min(loan_data['mths_since_earliest_cr_line'])"
272 | ]
273 | },
274 | {
275 | "cell_type": "markdown",
276 | "metadata": {},
277 | "source": [
278 | "### Homework"
279 | ]
280 | },
281 | {
282 | "cell_type": "code",
283 | "execution_count": null,
284 | "metadata": {
285 | "scrolled": true
286 | },
287 | "outputs": [],
288 | "source": [
289 | "loan_data['term']"
290 | ]
291 | },
292 | {
293 | "cell_type": "code",
294 | "execution_count": null,
295 | "metadata": {},
296 | "outputs": [],
297 | "source": [
298 | "loan_data['term'].describe()"
299 | ]
300 | },
301 | {
302 | "cell_type": "code",
303 | "execution_count": null,
304 | "metadata": {},
305 | "outputs": [],
306 | "source": [
307 | "loan_data['term_int'] = loan_data['term'].str.replace(' months', '')"
308 | ]
309 | },
310 | {
311 | "cell_type": "code",
312 | "execution_count": null,
313 | "metadata": {
314 | "scrolled": true
315 | },
316 | "outputs": [],
317 | "source": [
318 | "loan_data['term_int']"
319 | ]
320 | },
321 | {
322 | "cell_type": "code",
323 | "execution_count": null,
324 | "metadata": {},
325 | "outputs": [],
326 | "source": [
327 | "type(loan_data['term_int'][25])"
328 | ]
329 | },
330 | {
331 | "cell_type": "code",
332 | "execution_count": null,
333 | "metadata": {
334 | "scrolled": true
335 | },
336 | "outputs": [],
337 | "source": [
338 | "loan_data['term_int'] = pd.to_numeric(loan_data['term'].str.replace(' months', ''))\n",
339 | "loan_data['term_int']"
340 | ]
341 | },
342 | {
343 | "cell_type": "code",
344 | "execution_count": null,
345 | "metadata": {},
346 | "outputs": [],
347 | "source": [
348 | "type(loan_data['term_int'][0])"
349 | ]
350 | },
351 | {
352 | "cell_type": "code",
353 | "execution_count": null,
354 | "metadata": {
355 | "scrolled": true
356 | },
357 | "outputs": [],
358 | "source": [
359 | "loan_data['issue_d']"
360 | ]
361 | },
362 | {
363 | "cell_type": "code",
364 | "execution_count": null,
365 | "metadata": {},
366 | "outputs": [],
367 | "source": [
368 | "loan_data['issue_d_date'] = pd.to_datetime(loan_data['issue_d'], format = '%b-%y')\n",
369 | "loan_data['mths_since_issue_d'] = round(pd.to_numeric((pd.to_datetime('2017-12-01') - loan_data['issue_d_date']) / np.timedelta64(1, 'M')))\n",
370 | "loan_data['mths_since_issue_d'].describe()"
371 | ]
372 | },
373 | {
374 | "cell_type": "markdown",
375 | "metadata": {},
376 | "source": [
377 | "### Preprocessing few discrete variables"
378 | ]
379 | },
380 | {
381 | "cell_type": "code",
382 | "execution_count": null,
383 | "metadata": {},
384 | "outputs": [],
385 | "source": [
386 | "loan_data.info()"
387 | ]
388 | },
389 | {
390 | "cell_type": "code",
391 | "execution_count": null,
392 | "metadata": {},
393 | "outputs": [],
394 | "source": [
395 | "pd.get_dummies(loan_data['grade'])"
396 | ]
397 | },
398 | {
399 | "cell_type": "code",
400 | "execution_count": null,
401 | "metadata": {},
402 | "outputs": [],
403 | "source": [
404 | "pd.get_dummies(loan_data['grade'], prefix = 'grade', prefix_sep = ':')"
405 | ]
406 | },
407 | {
408 | "cell_type": "code",
409 | "execution_count": null,
410 | "metadata": {},
411 | "outputs": [],
412 | "source": [
413 | "loan_data_dummies = [pd.get_dummies(loan_data['grade'], prefix = 'grade', prefix_sep = ':'),\n",
414 | " pd.get_dummies(loan_data['sub_grade'], prefix = 'sub_grade', prefix_sep = ':'),\n",
415 | " pd.get_dummies(loan_data['home_ownership'], prefix = 'home_ownership', prefix_sep = ':'),\n",
416 | " pd.get_dummies(loan_data['verification_status'], prefix = 'verification_status', prefix_sep = ':'),\n",
417 | " pd.get_dummies(loan_data['loan_status'], prefix = 'loan_status', prefix_sep = ':'),\n",
418 | " pd.get_dummies(loan_data['purpose'], prefix = 'purpose', prefix_sep = ':'),\n",
419 | " pd.get_dummies(loan_data['addr_state'], prefix = 'addr_state', prefix_sep = ':'),\n",
420 | " pd.get_dummies(loan_data['initial_list_status'], prefix = 'initial_list_status', prefix_sep = ':')]"
421 | ]
422 | },
423 | {
424 | "cell_type": "code",
425 | "execution_count": null,
426 | "metadata": {},
427 | "outputs": [],
428 | "source": [
429 | "loan_data_dummies = pd.concat(loan_data_dummies, axis = 1)"
430 | ]
431 | },
432 | {
433 | "cell_type": "code",
434 | "execution_count": null,
435 | "metadata": {},
436 | "outputs": [],
437 | "source": [
438 | "type(loan_data_dummies)"
439 | ]
440 | },
441 | {
442 | "cell_type": "code",
443 | "execution_count": null,
444 | "metadata": {},
445 | "outputs": [],
446 | "source": [
447 | "loan_data = pd.concat([loan_data, loan_data_dummies], axis = 1)"
448 | ]
449 | },
450 | {
451 | "cell_type": "code",
452 | "execution_count": null,
453 | "metadata": {},
454 | "outputs": [],
455 | "source": [
456 | "loan_data.columns.values"
457 | ]
458 | },
459 | {
460 | "cell_type": "markdown",
461 | "metadata": {},
462 | "source": [
463 | "### Check for missing values and clean"
464 | ]
465 | },
466 | {
467 | "cell_type": "code",
468 | "execution_count": null,
469 | "metadata": {
470 | "scrolled": true
471 | },
472 | "outputs": [],
473 | "source": [
474 | "loan_data.isnull()"
475 | ]
476 | },
477 | {
478 | "cell_type": "code",
479 | "execution_count": null,
480 | "metadata": {},
481 | "outputs": [],
482 | "source": [
483 | "pd.options.display.max_rows = None\n",
484 | "loan_data.isnull().sum()"
485 | ]
486 | },
487 | {
488 | "cell_type": "code",
489 | "execution_count": null,
490 | "metadata": {},
491 | "outputs": [],
492 | "source": [
493 | "pd.options.display.max_rows = 100"
494 | ]
495 | },
496 | {
497 | "cell_type": "code",
498 | "execution_count": null,
499 | "metadata": {},
500 | "outputs": [],
501 | "source": [
502 | "loan_data['total_rev_hi_lim'].fillna(loan_data['funded_amnt'], inplace = True)"
503 | ]
504 | },
505 | {
506 | "cell_type": "code",
507 | "execution_count": null,
508 | "metadata": {},
509 | "outputs": [],
510 | "source": [
511 | "loan_data['total_rev_hi_lim'].isnull().sum()"
512 | ]
513 | },
514 | {
515 | "cell_type": "markdown",
516 | "metadata": {},
517 | "source": [
518 | "### Homework"
519 | ]
520 | },
521 | {
522 | "cell_type": "code",
523 | "execution_count": null,
524 | "metadata": {},
525 | "outputs": [],
526 | "source": [
527 | "loan_data['annual_inc'].fillna(loan_data['annual_inc'].mean(), inplace=True)"
528 | ]
529 | },
530 | {
531 | "cell_type": "code",
532 | "execution_count": null,
533 | "metadata": {},
534 | "outputs": [],
535 | "source": [
536 | "loan_data['mths_since_earliest_cr_line'].fillna(0, inplace=True)\n",
537 | "loan_data['acc_now_delinq'].fillna(0, inplace=True)\n",
538 | "loan_data['total_acc'].fillna(0, inplace=True)\n",
539 | "loan_data['pub_rec'].fillna(0, inplace=True)\n",
540 | "loan_data['open_acc'].fillna(0, inplace=True)\n",
541 | "loan_data['inq_last_6mths'].fillna(0, inplace=True)\n",
542 | "loan_data['delinq_2yrs'].fillna(0, inplace=True)\n",
543 | "loan_data['emp_length_int'].fillna(0, inplace=True)"
544 | ]
545 | },
546 | {
547 | "cell_type": "markdown",
548 | "metadata": {},
549 | "source": [
550 | "# PD model"
551 | ]
552 | },
553 | {
554 | "cell_type": "markdown",
555 | "metadata": {},
556 | "source": [
557 | "## Data preparation"
558 | ]
559 | },
560 | {
561 | "cell_type": "markdown",
562 | "metadata": {},
563 | "source": [
564 | "### Dependent Variable. Good/ Bad (Default) Definition. Default and Non-default Accounts."
565 | ]
566 | },
567 | {
568 | "cell_type": "code",
569 | "execution_count": null,
570 | "metadata": {},
571 | "outputs": [],
572 | "source": [
573 | "loan_data['loan_status'].unique()"
574 | ]
575 | },
576 | {
577 | "cell_type": "code",
578 | "execution_count": null,
579 | "metadata": {},
580 | "outputs": [],
581 | "source": [
582 | "loan_data['loan_status'].value_counts()"
583 | ]
584 | },
585 | {
586 | "cell_type": "code",
587 | "execution_count": null,
588 | "metadata": {},
589 | "outputs": [],
590 | "source": [
591 | "loan_data['loan_status'].value_counts() / loan_data['loan_status'].count()"
592 | ]
593 | },
594 | {
595 | "cell_type": "code",
596 | "execution_count": null,
597 | "metadata": {},
598 | "outputs": [],
599 | "source": [
600 | "loan_data['good_bad'] = np.where(loan_data['loan_status'].isin(['Charged Off', 'Default',\n",
601 | " 'Does not meet the credit policy. Status:Charged Off',\n",
602 | " 'Late (31-120 days)']), 0, 1)"
603 | ]
604 | },
605 | {
606 | "cell_type": "code",
607 | "execution_count": null,
608 | "metadata": {
609 | "scrolled": true
610 | },
611 | "outputs": [],
612 | "source": [
613 | "loan_data['good_bad']"
614 | ]
615 | },
616 | {
617 | "cell_type": "markdown",
618 | "metadata": {},
619 | "source": [
620 | "### Splitting Data"
621 | ]
622 | },
623 | {
624 | "cell_type": "code",
625 | "execution_count": null,
626 | "metadata": {},
627 | "outputs": [],
628 | "source": [
629 | "from sklearn.model_selection import train_test_split"
630 | ]
631 | },
632 | {
633 | "cell_type": "code",
634 | "execution_count": null,
635 | "metadata": {},
636 | "outputs": [],
637 | "source": [
638 | "train_test_split(loan_data.drop('good_bad', axis = 1), loan_data['good_bad'])"
639 | ]
640 | },
641 | {
642 | "cell_type": "code",
643 | "execution_count": null,
644 | "metadata": {},
645 | "outputs": [],
646 | "source": [
647 | "loan_data_inputs_train, loan_data_inputs_test, loan_data_targets_train, loan_data_targets_test = train_test_split(loan_data.drop('good_bad', axis = 1), loan_data['good_bad'])"
648 | ]
649 | },
650 | {
651 | "cell_type": "code",
652 | "execution_count": null,
653 | "metadata": {},
654 | "outputs": [],
655 | "source": [
656 | "loan_data_inputs_train.shape"
657 | ]
658 | },
659 | {
660 | "cell_type": "code",
661 | "execution_count": null,
662 | "metadata": {},
663 | "outputs": [],
664 | "source": [
665 | "loan_data_targets_train.shape"
666 | ]
667 | },
668 | {
669 | "cell_type": "code",
670 | "execution_count": null,
671 | "metadata": {},
672 | "outputs": [],
673 | "source": [
674 | "loan_data_inputs_test.shape"
675 | ]
676 | },
677 | {
678 | "cell_type": "code",
679 | "execution_count": null,
680 | "metadata": {},
681 | "outputs": [],
682 | "source": [
683 | "loan_data_targets_test.shape"
684 | ]
685 | },
686 | {
687 | "cell_type": "code",
688 | "execution_count": null,
689 | "metadata": {},
690 | "outputs": [],
691 | "source": [
692 | "loan_data_inputs_train, loan_data_inputs_test, loan_data_targets_train, loan_data_targets_test = train_test_split(loan_data.drop('good_bad', axis = 1), loan_data['good_bad'], test_size = 0.2, random_state = 42)"
693 | ]
694 | },
695 | {
696 | "cell_type": "code",
697 | "execution_count": null,
698 | "metadata": {},
699 | "outputs": [],
700 | "source": [
701 | "loan_data_inputs_train.shape"
702 | ]
703 | },
704 | {
705 | "cell_type": "code",
706 | "execution_count": null,
707 | "metadata": {},
708 | "outputs": [],
709 | "source": [
710 | "loan_data_targets_train.shape"
711 | ]
712 | },
713 | {
714 | "cell_type": "code",
715 | "execution_count": null,
716 | "metadata": {},
717 | "outputs": [],
718 | "source": [
719 | "loan_data_inputs_test.shape"
720 | ]
721 | },
722 | {
723 | "cell_type": "code",
724 | "execution_count": null,
725 | "metadata": {},
726 | "outputs": [],
727 | "source": [
728 | "loan_data_targets_test.shape"
729 | ]
730 | },
731 | {
732 | "cell_type": "markdown",
733 | "metadata": {},
734 | "source": [
735 | "### Data Preparation: An Example"
736 | ]
737 | },
738 | {
739 | "cell_type": "code",
740 | "execution_count": null,
741 | "metadata": {},
742 | "outputs": [],
743 | "source": [
744 | "df_inputs_prepr = loan_data_inputs_train\n",
745 | "df_targets_prepr = loan_data_targets_train\n",
746 | "#df_inputs_prepr = loan_data_inputs_test\n",
747 | "#df_targets_prepr = loan_data_targets_test"
748 | ]
749 | },
750 | {
751 | "cell_type": "code",
752 | "execution_count": null,
753 | "metadata": {},
754 | "outputs": [],
755 | "source": [
756 | "df_inputs_prepr['grade'].unique()"
757 | ]
758 | },
759 | {
760 | "cell_type": "code",
761 | "execution_count": null,
762 | "metadata": {},
763 | "outputs": [],
764 | "source": [
765 | "df1 = pd.concat([df_inputs_prepr['grade'], df_targets_prepr], axis = 1)\n",
766 | "df1.head()"
767 | ]
768 | },
769 | {
770 | "cell_type": "code",
771 | "execution_count": null,
772 | "metadata": {},
773 | "outputs": [],
774 | "source": [
775 | "df1.groupby(df1.columns.values[0], as_index = False)[df1.columns.values[1]].count()"
776 | ]
777 | },
778 | {
779 | "cell_type": "code",
780 | "execution_count": null,
781 | "metadata": {},
782 | "outputs": [],
783 | "source": [
784 | "df1.groupby(df1.columns.values[0], as_index = False)[df1.columns.values[1]].mean()"
785 | ]
786 | },
787 | {
788 | "cell_type": "code",
789 | "execution_count": null,
790 | "metadata": {},
791 | "outputs": [],
792 | "source": [
793 | "df1 = pd.concat([df1.groupby(df1.columns.values[0], as_index = False)[df1.columns.values[1]].count(),\n",
794 | " df1.groupby(df1.columns.values[0], as_index = False)[df1.columns.values[1]].mean()], axis = 1)"
795 | ]
796 | },
797 | {
798 | "cell_type": "code",
799 | "execution_count": null,
800 | "metadata": {},
801 | "outputs": [],
802 | "source": [
803 | "df1"
804 | ]
805 | },
806 | {
807 | "cell_type": "code",
808 | "execution_count": null,
809 | "metadata": {},
810 | "outputs": [],
811 | "source": [
812 | "df1 = df1.iloc[: , [0, 1, 3]]\n",
813 | "df1"
814 | ]
815 | },
816 | {
817 | "cell_type": "code",
818 | "execution_count": null,
819 | "metadata": {},
820 | "outputs": [],
821 | "source": [
822 | "df1.columns = [df1.columns.values[0], 'n_obs', 'prop_good']\n",
823 | "df1"
824 | ]
825 | },
826 | {
827 | "cell_type": "code",
828 | "execution_count": null,
829 | "metadata": {},
830 | "outputs": [],
831 | "source": [
832 | "df1['prop_n_obs'] = df1['n_obs'] / df1['n_obs'].sum()"
833 | ]
834 | },
835 | {
836 | "cell_type": "code",
837 | "execution_count": null,
838 | "metadata": {},
839 | "outputs": [],
840 | "source": [
841 | "df1"
842 | ]
843 | },
844 | {
845 | "cell_type": "code",
846 | "execution_count": null,
847 | "metadata": {},
848 | "outputs": [],
849 | "source": [
850 | "df1['n_good'] = df1['prop_good'] * df1['n_obs']\n",
851 | "df1['n_bad'] = (1 - df1['prop_good']) * df1['n_obs']\n",
852 | "df1"
853 | ]
854 | },
855 | {
856 | "cell_type": "code",
857 | "execution_count": null,
858 | "metadata": {},
859 | "outputs": [],
860 | "source": [
861 | "df1['prop_n_good'] = df1['n_good'] / df1['n_good'].sum()\n",
862 | "df1['prop_n_bad'] = df1['n_bad'] / df1['n_bad'].sum()\n",
863 | "df1"
864 | ]
865 | },
866 | {
867 | "cell_type": "code",
868 | "execution_count": null,
869 | "metadata": {},
870 | "outputs": [],
871 | "source": [
872 | "df1['WoE'] = np.log(df1['prop_n_good'] / df1['prop_n_bad'])\n",
873 | "df1"
874 | ]
875 | },
876 | {
877 | "cell_type": "code",
878 | "execution_count": null,
879 | "metadata": {},
880 | "outputs": [],
881 | "source": [
882 | "df1 = df1.sort_values(['WoE'])\n",
883 | "df1 = df1.reset_index(drop = True)\n",
884 | "df1"
885 | ]
886 | },
887 | {
888 | "cell_type": "code",
889 | "execution_count": null,
890 | "metadata": {},
891 | "outputs": [],
892 | "source": [
893 | "df1['diff_prop_good'] = df1['prop_good'].diff().abs()\n",
894 | "df1['diff_WoE'] = df1['WoE'].diff().abs()\n",
895 | "df1"
896 | ]
897 | },
898 | {
899 | "cell_type": "code",
900 | "execution_count": null,
901 | "metadata": {},
902 | "outputs": [],
903 | "source": [
904 | "df1['IV'] = (df1['prop_n_good'] - df1['prop_n_bad']) * df1['WoE']\n",
905 | "df1['IV'] = df1['IV'].sum()\n",
906 | "df1"
907 | ]
908 | },
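{
"cell_type": "markdown",
"metadata": {},
"source": [
"A reference note added alongside the example above (not part of the original course text): the two quantities just computed are the standard Weight of Evidence and Information Value. For each category $i$ of the discrete variable,\n",
"$$WoE_i = \\ln\\left(\\frac{\\text{prop\\_n\\_good}_i}{\\text{prop\\_n\\_bad}_i}\\right), \\qquad IV = \\sum_i \\left(\\text{prop\\_n\\_good}_i - \\text{prop\\_n\\_bad}_i\\right) \\times WoE_i$$\n",
"which correspond to the 'prop_n_good', 'prop_n_bad', 'WoE' and 'IV' columns of the dataframe above."
]
},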
909 | {
910 | "cell_type": "markdown",
911 | "metadata": {},
912 | "source": [
913 | "### Preprocessing Discrete Variables: Automating Calculaions"
914 | ]
915 | },
916 | {
917 | "cell_type": "code",
918 | "execution_count": null,
919 | "metadata": {},
920 | "outputs": [],
921 | "source": [
922 | "def woe_discrete(df, discrete_variable_name, good_bad_variable_df):\n",
923 | " df = pd.concat([df[discrete_variable_name], good_bad_variable_df], axis = 1)\n",
924 | " df = pd.concat([df.groupby(df.columns.values[0], as_index = False)[df.columns.values[1]].count(),\n",
925 | " df.groupby(df.columns.values[0], as_index = False)[df.columns.values[1]].mean()], axis = 1)\n",
926 | " df = df.iloc[:, [0, 1, 3]]\n",
927 | " df.columns = [df.columns.values[0], 'n_obs', 'prop_good']\n",
928 | " df['prop_n_obs'] = df['n_obs'] / df['n_obs'].sum()\n",
929 | " df['n_good'] = df['prop_good'] * df['n_obs']\n",
930 | " df['n_bad'] = (1 - df['prop_good']) * df['n_obs']\n",
931 | " df['prop_n_good'] = df['n_good'] / df['n_good'].sum()\n",
932 | " df['prop_n_bad'] = df['n_bad'] / df['n_bad'].sum()\n",
933 | " df['WoE'] = np.log(df['prop_n_good'] / df['prop_n_bad'])\n",
934 | " df = df.sort_values(['WoE'])\n",
935 | " df = df.reset_index(drop = True)\n",
936 | " df['diff_prop_good'] = df['prop_good'].diff().abs()\n",
937 | " df['diff_WoE'] = df['WoE'].diff().abs()\n",
938 | " df['IV'] = (df['prop_n_good'] - df['prop_n_bad']) * df['WoE']\n",
939 | " df['IV'] = df['IV'].sum()\n",
940 | " return df"
941 | ]
942 | },
943 | {
944 | "cell_type": "code",
945 | "execution_count": null,
946 | "metadata": {},
947 | "outputs": [],
948 | "source": [
949 | "df_temp = woe_discrete(df_inputs_prepr, 'grade', df_targets_prepr)\n",
950 | "df_temp"
951 | ]
952 | }
953 | ],
954 | "metadata": {
955 | "kernelspec": {
956 | "display_name": "Python 3",
957 | "language": "python",
958 | "name": "python3"
959 | },
960 | "language_info": {
961 | "codemirror_mode": {
962 | "name": "ipython",
963 | "version": 3
964 | },
965 | "file_extension": ".py",
966 | "mimetype": "text/x-python",
967 | "name": "python",
968 | "nbconvert_exporter": "python",
969 | "pygments_lexer": "ipython3",
970 | "version": "3.7.3"
971 | }
972 | },
973 | "nbformat": 4,
974 | "nbformat_minor": 2
975 | }
976 |
--------------------------------------------------------------------------------
/Section 5 PD Model, Data Preparation/Credit Risk Modeling - Preparation - With Comments - 5-2.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Data Preparation"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "## Import Libraries"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": null,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "import numpy as np\n",
24 | "import pandas as pd"
25 | ]
26 | },
27 | {
28 | "cell_type": "markdown",
29 | "metadata": {},
30 | "source": [
31 | "## Import Data\n",
32 | "The dataset contains all available data for more than 800,000 consumer loans issued from 2007 to 2015 by Lending Club: a large US peer-to-peer lending company. There are several different versions of this dataset. We have used a version available on kaggle.com. You can find it here: https://www.kaggle.com/wendykan/lending-club-loan-data/version/1\n",
33 | "We divided the data into two periods because we assume that some data are available at the moment when we need to build Expected Loss models, and some data comes from applications after. Later, we investigate whether the applications we have after we built the Probability of Default (PD) model have similar characteristics with the applications we used to build the PD model."
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": null,
39 | "metadata": {
40 | "scrolled": true
41 | },
42 | "outputs": [],
43 | "source": [
44 | "loan_data_backup = pd.read_csv('loan_data_2007_2014.csv')"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": null,
50 | "metadata": {},
51 | "outputs": [],
52 | "source": [
53 | "loan_data = loan_data_backup.copy()"
54 | ]
55 | },
56 | {
57 | "cell_type": "markdown",
58 | "metadata": {},
59 | "source": [
60 | "## Explore Data"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": null,
66 | "metadata": {},
67 | "outputs": [],
68 | "source": [
69 | "loan_data"
70 | ]
71 | },
72 | {
73 | "cell_type": "code",
74 | "execution_count": null,
75 | "metadata": {},
76 | "outputs": [],
77 | "source": [
78 | "pd.options.display.max_columns = None\n",
79 | "#pd.options.display.max_rows = None\n",
80 | "# Sets the pandas dataframe options to display all columns/ rows."
81 | ]
82 | },
83 | {
84 | "cell_type": "code",
85 | "execution_count": null,
86 | "metadata": {},
87 | "outputs": [],
88 | "source": [
89 | "loan_data"
90 | ]
91 | },
92 | {
93 | "cell_type": "code",
94 | "execution_count": null,
95 | "metadata": {
96 | "scrolled": true
97 | },
98 | "outputs": [],
99 | "source": [
100 | "loan_data.head()"
101 | ]
102 | },
103 | {
104 | "cell_type": "code",
105 | "execution_count": null,
106 | "metadata": {
107 | "scrolled": true
108 | },
109 | "outputs": [],
110 | "source": [
111 | "loan_data.tail()"
112 | ]
113 | },
114 | {
115 | "cell_type": "code",
116 | "execution_count": null,
117 | "metadata": {},
118 | "outputs": [],
119 | "source": [
120 | "loan_data.columns.values\n",
121 | "# Displays all column names."
122 | ]
123 | },
124 | {
125 | "cell_type": "code",
126 | "execution_count": null,
127 | "metadata": {},
128 | "outputs": [],
129 | "source": [
130 | "loan_data.info()\n",
131 | "# Displays column names, complete (non-missing) cases per column, and datatype per column."
132 | ]
133 | },
134 | {
135 | "cell_type": "markdown",
136 | "metadata": {},
137 | "source": [
138 | "## General Preprocessing"
139 | ]
140 | },
141 | {
142 | "cell_type": "markdown",
143 | "metadata": {},
144 | "source": [
145 | "### Preprocessing few continuous variables"
146 | ]
147 | },
148 | {
149 | "cell_type": "code",
150 | "execution_count": null,
151 | "metadata": {},
152 | "outputs": [],
153 | "source": [
154 | "loan_data['emp_length'].unique()\n",
155 | "# Displays unique values of a column."
156 | ]
157 | },
158 | {
159 | "cell_type": "code",
160 | "execution_count": null,
161 | "metadata": {},
162 | "outputs": [],
163 | "source": [
164 | "loan_data['emp_length_int'] = loan_data['emp_length'].str.replace('\\+ years', '')\n",
165 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace('< 1 year', str(0))\n",
166 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace('n/a', str(0))\n",
167 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace(' years', '')\n",
168 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace(' year', '')\n",
169 | "# We store the preprocessed ‘employment length’ variable in a new variable called ‘employment length int’,\n",
170 | "# We assign the new ‘employment length int’ to be equal to the ‘employment length’ variable with the string ‘+ years’\n",
171 | "# replaced with nothing. Next, we replace the whole string ‘less than 1 year’ with the string ‘0’.\n",
172 | "# Then, we replace the ‘n/a’ string with the string ‘0’. Then, we replace the string ‘space years’ with nothing.\n",
173 | "# Finally, we replace the string ‘space year’ with nothing."
174 | ]
175 | },
176 | {
177 | "cell_type": "code",
178 | "execution_count": null,
179 | "metadata": {},
180 | "outputs": [],
181 | "source": [
182 | "type(loan_data['emp_length_int'][0])\n",
183 | "# Checks the datatype of a single element of a column."
184 | ]
185 | },
186 | {
187 | "cell_type": "code",
188 | "execution_count": null,
189 | "metadata": {},
190 | "outputs": [],
191 | "source": [
192 | "loan_data['emp_length_int'] = pd.to_numeric(loan_data['emp_length_int'])\n",
193 | "# Transforms the values to numeric."
194 | ]
195 | },
196 | {
197 | "cell_type": "code",
198 | "execution_count": null,
199 | "metadata": {},
200 | "outputs": [],
201 | "source": [
202 | "type(loan_data['emp_length_int'][0])\n",
203 | "# Checks the datatype of a single element of a column."
204 | ]
205 | },
206 | {
207 | "cell_type": "code",
208 | "execution_count": null,
209 | "metadata": {},
210 | "outputs": [],
211 | "source": [
212 | "loan_data['earliest_cr_line']\n",
213 | "# Displays a column."
214 | ]
215 | },
216 | {
217 | "cell_type": "code",
218 | "execution_count": null,
219 | "metadata": {},
220 | "outputs": [],
221 | "source": [
222 | "loan_data['earliest_cr_line_date'] = pd.to_datetime(loan_data['earliest_cr_line'], format = '%b-%y')\n",
223 | "# Extracts the date and the time from a string variable that is in a given format."
224 | ]
225 | },
226 | {
227 | "cell_type": "code",
228 | "execution_count": null,
229 | "metadata": {
230 | "scrolled": false
231 | },
232 | "outputs": [],
233 | "source": [
234 | "type(loan_data['earliest_cr_line_date'][0])\n",
235 | "# Checks the datatype of a single element of a column."
236 | ]
237 | },
238 | {
239 | "cell_type": "code",
240 | "execution_count": null,
241 | "metadata": {},
242 | "outputs": [],
243 | "source": [
244 | "pd.to_datetime('2017-12-01') - loan_data['earliest_cr_line_date']\n",
245 | "# Calculates the difference between two dates and times."
246 | ]
247 | },
248 | {
249 | "cell_type": "code",
250 | "execution_count": null,
251 | "metadata": {},
252 | "outputs": [],
253 | "source": [
254 | "# Assume we are now in December 2017\n",
255 | "loan_data['mths_since_earliest_cr_line'] = round(pd.to_numeric((pd.to_datetime('2017-12-01') - loan_data['earliest_cr_line_date']) / np.timedelta64(1, 'M')))\n",
256 | "# We calculate the difference between two dates in months, turn it to numeric datatype and round it.\n",
257 | "# We save the result in a new variable."
258 | ]
259 | },
260 | {
261 | "cell_type": "code",
262 | "execution_count": null,
263 | "metadata": {},
264 | "outputs": [],
265 | "source": [
266 | "loan_data['mths_since_earliest_cr_line'].describe()\n",
267 | "# Shows some descriptive statisics for the values of a column.\n",
268 | "# Dates from 1969 and before are not being converted well, i.e., they have become 2069 and similar,\n",
269 | "# and negative differences are being calculated."
270 | ]
271 | },
272 | {
273 | "cell_type": "code",
274 | "execution_count": null,
275 | "metadata": {},
276 | "outputs": [],
277 | "source": [
278 | "loan_data.loc[: , ['earliest_cr_line', 'earliest_cr_line_date', 'mths_since_earliest_cr_line']][loan_data['mths_since_earliest_cr_line'] < 0]\n",
279 | "# We take three columns from the dataframe. Then, we display them only for the rows where a variable has negative value.\n",
280 | "# There are 2303 strange negative values."
281 | ]
282 | },
283 | {
284 | "cell_type": "code",
285 | "execution_count": null,
286 | "metadata": {},
287 | "outputs": [],
288 | "source": [
289 | "loan_data['mths_since_earliest_cr_line'][loan_data['mths_since_earliest_cr_line'] < 0] = loan_data['mths_since_earliest_cr_line'].max()\n",
290 | "# We set the rows that had negative differences to the maximum value."
291 | ]
292 | },
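{
"cell_type": "markdown",
"metadata": {},
"source": [
"An added side note (not part of the original lesson): instead of capping the negative differences at the maximum value, one could also correct the wrongly parsed dates themselves. The sketch below assumes the same reference date of December 2017 and simply shifts any parsed date that lies in the future back by 100 years before recomputing the month difference."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional alternative sketch, not the approach used in the course:\n",
"# dates parsed into the future can only come from the two-digit-year pivot, so we shift them back a century.\n",
"future_mask = loan_data['earliest_cr_line_date'] > pd.to_datetime('2017-12-01')\n",
"loan_data.loc[future_mask, 'earliest_cr_line_date'] = loan_data.loc[future_mask, 'earliest_cr_line_date'] - pd.DateOffset(years = 100)\n",
"loan_data['mths_since_earliest_cr_line'] = round(pd.to_numeric((pd.to_datetime('2017-12-01') - loan_data['earliest_cr_line_date']) / np.timedelta64(1, 'M')))\n",
"# Recomputing the difference now yields non-negative month counts for all accounts."
]
},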
293 | {
294 | "cell_type": "code",
295 | "execution_count": null,
296 | "metadata": {},
297 | "outputs": [],
298 | "source": [
299 | "min(loan_data['mths_since_earliest_cr_line'])\n",
300 | "# Calculates and shows the minimum value of a column."
301 | ]
302 | },
303 | {
304 | "cell_type": "markdown",
305 | "metadata": {},
306 | "source": [
307 | "### Homework"
308 | ]
309 | },
310 | {
311 | "cell_type": "code",
312 | "execution_count": null,
313 | "metadata": {},
314 | "outputs": [],
315 | "source": [
316 | "loan_data['term']"
317 | ]
318 | },
319 | {
320 | "cell_type": "code",
321 | "execution_count": null,
322 | "metadata": {},
323 | "outputs": [],
324 | "source": [
325 | "loan_data['term'].describe()\n",
326 | "# Shows some descriptive statisics for the values of a column."
327 | ]
328 | },
329 | {
330 | "cell_type": "code",
331 | "execution_count": null,
332 | "metadata": {},
333 | "outputs": [],
334 | "source": [
335 | "loan_data['term_int'] = loan_data['term'].str.replace(' months', '')\n",
336 | "# We replace a string with another string, in this case, with an empty strng (i.e. with nothing)."
337 | ]
338 | },
339 | {
340 | "cell_type": "code",
341 | "execution_count": null,
342 | "metadata": {},
343 | "outputs": [],
344 | "source": [
345 | "loan_data['term_int']"
346 | ]
347 | },
348 | {
349 | "cell_type": "code",
350 | "execution_count": null,
351 | "metadata": {},
352 | "outputs": [],
353 | "source": [
354 | "type(loan_data['term_int'][25])\n",
355 | "# Checks the datatype of a single element of a column."
356 | ]
357 | },
358 | {
359 | "cell_type": "code",
360 | "execution_count": null,
361 | "metadata": {},
362 | "outputs": [],
363 | "source": [
364 | "loan_data['term_int'] = pd.to_numeric(loan_data['term'].str.replace(' months', ''))\n",
365 | "# We remplace a string from a variable with another string, in this case, with an empty strng (i.e. with nothing).\n",
366 | "# We turn the result to numeric datatype and save it in another variable.\n",
367 | "loan_data['term_int']"
368 | ]
369 | },
370 | {
371 | "cell_type": "code",
372 | "execution_count": null,
373 | "metadata": {},
374 | "outputs": [],
375 | "source": [
376 | "type(loan_data['term_int'][0])\n",
377 | "# Checks the datatype of a single element of a column."
378 | ]
379 | },
380 | {
381 | "cell_type": "code",
382 | "execution_count": null,
383 | "metadata": {},
384 | "outputs": [],
385 | "source": [
386 | "loan_data['issue_d']"
387 | ]
388 | },
389 | {
390 | "cell_type": "code",
391 | "execution_count": null,
392 | "metadata": {},
393 | "outputs": [],
394 | "source": [
395 | "# Assume we are now in December 2017\n",
396 | "loan_data['issue_d_date'] = pd.to_datetime(loan_data['issue_d'], format = '%b-%y')\n",
397 | "# Extracts the date and the time from a string variable that is in a given format.\n",
398 | "loan_data['mths_since_issue_d'] = round(pd.to_numeric((pd.to_datetime('2017-12-01') - loan_data['issue_d_date']) / np.timedelta64(1, 'M')))\n",
399 | "# We calculate the difference between two dates in months, turn it to numeric datatype and round it.\n",
400 | "# We save the result in a new variable.\n",
401 | "loan_data['mths_since_issue_d'].describe()\n",
402 | "# Shows some descriptive statisics for the values of a column."
403 | ]
404 | },
405 | {
406 | "cell_type": "markdown",
407 | "metadata": {},
408 | "source": [
409 | "### Preprocessing few discrete variables"
410 | ]
411 | },
412 | {
413 | "cell_type": "code",
414 | "execution_count": null,
415 | "metadata": {},
416 | "outputs": [],
417 | "source": [
418 | "loan_data.info()\n",
419 | "# Displays column names, complete (non-missing) cases per column, and datatype per column."
420 | ]
421 | },
422 | {
423 | "cell_type": "markdown",
424 | "metadata": {},
425 | "source": [
426 | "We are going to preprocess the following discrete variables: grade, sub_grade, home_ownership, verification_status, loan_status, purpose, addr_state, initial_list_status. Most likely, we are not going to use sub_grade, as it overlaps with grade."
427 | ]
428 | },
429 | {
430 | "cell_type": "code",
431 | "execution_count": null,
432 | "metadata": {},
433 | "outputs": [],
434 | "source": [
435 | "pd.get_dummies(loan_data['grade'])\n",
436 | "# Create dummy variables from a variable."
437 | ]
438 | },
439 | {
440 | "cell_type": "code",
441 | "execution_count": null,
442 | "metadata": {},
443 | "outputs": [],
444 | "source": [
445 | "pd.get_dummies(loan_data['grade'], prefix = 'grade', prefix_sep = ':')\n",
446 | "# Create dummy variables from a variable."
447 | ]
448 | },
449 | {
450 | "cell_type": "code",
451 | "execution_count": null,
452 | "metadata": {
453 | "scrolled": true
454 | },
455 | "outputs": [],
456 | "source": [
457 | "loan_data_dummies = [pd.get_dummies(loan_data['grade'], prefix = 'grade', prefix_sep = ':'),\n",
458 | " pd.get_dummies(loan_data['sub_grade'], prefix = 'sub_grade', prefix_sep = ':'),\n",
459 | " pd.get_dummies(loan_data['home_ownership'], prefix = 'home_ownership', prefix_sep = ':'),\n",
460 | " pd.get_dummies(loan_data['verification_status'], prefix = 'verification_status', prefix_sep = ':'),\n",
461 | " pd.get_dummies(loan_data['loan_status'], prefix = 'loan_status', prefix_sep = ':'),\n",
462 | " pd.get_dummies(loan_data['purpose'], prefix = 'purpose', prefix_sep = ':'),\n",
463 | " pd.get_dummies(loan_data['addr_state'], prefix = 'addr_state', prefix_sep = ':'),\n",
464 | " pd.get_dummies(loan_data['initial_list_status'], prefix = 'initial_list_status', prefix_sep = ':')]\n",
465 | "# We create dummy variables from all 8 original independent variables, and save them into a list.\n",
466 | "# Note that we are using a particular naming convention for all variables: original variable name, colon, category name."
467 | ]
468 | },
469 | {
470 | "cell_type": "code",
471 | "execution_count": null,
472 | "metadata": {},
473 | "outputs": [],
474 | "source": [
475 | "loan_data_dummies = pd.concat(loan_data_dummies, axis = 1)\n",
476 | "# We concatenate the dummy variables and this turns them into a dataframe."
477 | ]
478 | },
479 | {
480 | "cell_type": "code",
481 | "execution_count": null,
482 | "metadata": {},
483 | "outputs": [],
484 | "source": [
485 | "type(loan_data_dummies)\n",
486 | "# Returns the type of the variable."
487 | ]
488 | },
489 | {
490 | "cell_type": "code",
491 | "execution_count": null,
492 | "metadata": {},
493 | "outputs": [],
494 | "source": [
495 | "loan_data = pd.concat([loan_data, loan_data_dummies], axis = 1)\n",
496 | "# Concatenates two dataframes.\n",
497 | "# Here we concatenate the dataframe with original data with the dataframe with dummy variables, along the columns. "
498 | ]
499 | },
500 | {
501 | "cell_type": "code",
502 | "execution_count": null,
503 | "metadata": {},
504 | "outputs": [],
505 | "source": [
506 | "loan_data.columns.values\n",
507 | "# Displays all column names."
508 | ]
509 | },
510 | {
511 | "cell_type": "markdown",
512 | "metadata": {},
513 | "source": [
514 | "### Check for missing values and clean"
515 | ]
516 | },
517 | {
518 | "cell_type": "code",
519 | "execution_count": null,
520 | "metadata": {
521 | "scrolled": false
522 | },
523 | "outputs": [],
524 | "source": [
525 | "loan_data.isnull()\n",
526 | "# It returns 'False' if a value is not missing and 'True' if a value is missing, for each value in a dataframe."
527 | ]
528 | },
529 | {
530 | "cell_type": "code",
531 | "execution_count": null,
532 | "metadata": {},
533 | "outputs": [],
534 | "source": [
535 | "pd.options.display.max_rows = None\n",
536 | "# Sets the pandas dataframe options to display all columns/ rows.\n",
537 | "loan_data.isnull().sum()"
538 | ]
539 | },
540 | {
541 | "cell_type": "code",
542 | "execution_count": null,
543 | "metadata": {},
544 | "outputs": [],
545 | "source": [
546 | "pd.options.display.max_rows = 100\n",
547 | "# Sets the pandas dataframe options to display 100 columns/ rows."
548 | ]
549 | },
550 | {
551 | "cell_type": "code",
552 | "execution_count": null,
553 | "metadata": {},
554 | "outputs": [],
555 | "source": [
556 | "# 'Total revolving high credit/ credit limit', so it makes sense that the missing values are equal to funded_amnt.\n",
557 | "loan_data['total_rev_hi_lim'].fillna(loan_data['funded_amnt'], inplace=True)\n",
558 | "# We fill the missing values with the values of another variable."
559 | ]
560 | },
561 | {
562 | "cell_type": "code",
563 | "execution_count": null,
564 | "metadata": {},
565 | "outputs": [],
566 | "source": [
567 | "loan_data['total_rev_hi_lim'].isnull().sum()"
568 | ]
569 | },
570 | {
571 | "cell_type": "markdown",
572 | "metadata": {},
573 | "source": [
574 | "### Homework"
575 | ]
576 | },
577 | {
578 | "cell_type": "code",
579 | "execution_count": null,
580 | "metadata": {},
581 | "outputs": [],
582 | "source": [
583 | "loan_data['annual_inc'].fillna(loan_data['annual_inc'].mean(), inplace=True)\n",
584 | "# We fill the missing values with the mean value of the non-missing values."
585 | ]
586 | },
587 | {
588 | "cell_type": "code",
589 | "execution_count": null,
590 | "metadata": {},
591 | "outputs": [],
592 | "source": [
593 | "loan_data['mths_since_earliest_cr_line'].fillna(0, inplace=True)\n",
594 | "loan_data['acc_now_delinq'].fillna(0, inplace=True)\n",
595 | "loan_data['total_acc'].fillna(0, inplace=True)\n",
596 | "loan_data['pub_rec'].fillna(0, inplace=True)\n",
597 | "loan_data['open_acc'].fillna(0, inplace=True)\n",
598 | "loan_data['inq_last_6mths'].fillna(0, inplace=True)\n",
599 | "loan_data['delinq_2yrs'].fillna(0, inplace=True)\n",
600 | "loan_data['emp_length_int'].fillna(0, inplace=True)\n",
601 | "# We fill the missing values with zeroes."
602 | ]
603 | },
604 | {
605 | "cell_type": "markdown",
606 | "metadata": {},
607 | "source": [
608 | "# PD model"
609 | ]
610 | },
611 | {
612 | "cell_type": "markdown",
613 | "metadata": {},
614 | "source": [
615 | "## Data preparation"
616 | ]
617 | },
618 | {
619 | "cell_type": "markdown",
620 | "metadata": {},
621 | "source": [
622 | "### Dependent Variable. Good/ Bad (Default) Definition. Default and Non-default Accounts."
623 | ]
624 | },
625 | {
626 | "cell_type": "code",
627 | "execution_count": null,
628 | "metadata": {},
629 | "outputs": [],
630 | "source": [
631 | "loan_data['loan_status'].unique()\n",
632 | "# Displays unique values of a column."
633 | ]
634 | },
635 | {
636 | "cell_type": "code",
637 | "execution_count": null,
638 | "metadata": {
639 | "scrolled": true
640 | },
641 | "outputs": [],
642 | "source": [
643 | "loan_data['loan_status'].value_counts()\n",
644 | "# Calculates the number of observations for each unique value of a variable."
645 | ]
646 | },
647 | {
648 | "cell_type": "code",
649 | "execution_count": null,
650 | "metadata": {},
651 | "outputs": [],
652 | "source": [
653 | "loan_data['loan_status'].value_counts() / loan_data['loan_status'].count()\n",
654 | "# We divide the number of observations for each unique value of a variable by the total number of observations.\n",
655 | "# Thus, we get the proportion of observations for each unique value of a variable."
656 | ]
657 | },
658 | {
659 | "cell_type": "code",
660 | "execution_count": null,
661 | "metadata": {
662 | "scrolled": true
663 | },
664 | "outputs": [],
665 | "source": [
666 | "# Good/ Bad Definition\n",
667 | "loan_data['good_bad'] = np.where(loan_data['loan_status'].isin(['Charged Off', 'Default',\n",
668 | " 'Does not meet the credit policy. Status:Charged Off',\n",
669 | " 'Late (31-120 days)']), 0, 1)\n",
670 | "# We create a new variable that has the value of '0' if a condition is met, and the value of '1' if it is not met."
671 | ]
672 | },
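{
"cell_type": "markdown",
"metadata": {},
"source": [
"A small added check (not in the original lesson): with value_counts(normalize = True) we can see the resulting class balance, i.e. roughly what share of accounts is labelled good (1) versus bad (0)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Added sanity check (not part of the original lesson):\n",
"# the share of good (1) and bad (0) borrowers, i.e. the observed non-default and default rates.\n",
"loan_data['good_bad'].value_counts(normalize = True)"
]
},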
673 | {
674 | "cell_type": "code",
675 | "execution_count": null,
676 | "metadata": {},
677 | "outputs": [],
678 | "source": [
679 | "loan_data['good_bad']"
680 | ]
681 | }
682 | ],
683 | "metadata": {
684 | "kernelspec": {
685 | "display_name": "Python 3",
686 | "language": "python",
687 | "name": "python3"
688 | },
689 | "language_info": {
690 | "codemirror_mode": {
691 | "name": "ipython",
692 | "version": 3
693 | },
694 | "file_extension": ".py",
695 | "mimetype": "text/x-python",
696 | "name": "python",
697 | "nbconvert_exporter": "python",
698 | "pygments_lexer": "ipython3",
699 | "version": "3.7.4"
700 | }
701 | },
702 | "nbformat": 4,
703 | "nbformat_minor": 2
704 | }
705 |
--------------------------------------------------------------------------------