├── .idea ├── .gitignore ├── credit-risk-modeling-in-python.iml ├── inspectionProfiles │ └── profiles_settings.xml ├── misc.xml ├── modules.xml └── vcs.xml ├── Credit Risk Modeling Analysis.ipynb ├── LICENSE ├── README.md ├── Section 1 Introduction ├── 1. Importance of Credit Risk.txt ├── 2. Expected Loss and it's components.txt ├── 3. Capital requirement and BASEL II accord.txt ├── 4. BASEL II Approaches.txt ├── 5. Different Asset Classes and Financing them.txt └── README.md ├── Section 2 Setting up the environment ├── 1. Setting up the environment.txt └── 2. Why Python and Jupyter.txt ├── Section 3 Dataset Description ├── .ipynb_checkpoints │ └── Credit Risk Modeling - Preparation - With Comments - 3-1-Copy1-checkpoint.ipynb ├── Credit Risk Modeling - Preparation - 3-1.ipynb ├── Credit Risk Modeling - Preparation - With Comments - 3-1-Copy1.ipynb └── Foundation of PreProcessing.txt ├── Section 4 General Preprocessing ├── .ipynb_checkpoints │ └── Credit Risk Modeling - Preparation - 4-3-checkpoint.ipynb ├── Credit Risk Modeling - Preparation - 4-1.ipynb ├── Credit Risk Modeling - Preparation - 4-2.ipynb ├── Credit Risk Modeling - Preparation - 4-3.ipynb ├── Credit Risk Modeling - Preparation - With Comments - 4-1.ipynb ├── Credit Risk Modeling - Preparation - With Comments - 4-2.ipynb └── Credit Risk Modeling - Preparation - With Comments - 4-3.ipynb └── Section 5 PD Model, Data Preparation ├── .ipynb_checkpoints ├── Credit Risk Modeling - Preparation - With Comments - 5-11-checkpoint.ipynb ├── Credit Risk Modeling - Preparation - With Comments - 5-2-checkpoint.ipynb └── Credit Risk Modeling - Preparation - With Comments - 5-6-checkpoint.ipynb ├── Credit Risk Modeling - Preparation - 5-10.ipynb ├── Credit Risk Modeling - Preparation - 5-11.ipynb ├── Credit Risk Modeling - Preparation - 5-12.ipynb ├── Credit Risk Modeling - Preparation - 5-2.ipynb ├── Credit Risk Modeling - Preparation - 5-5.ipynb ├── Credit Risk Modeling - Preparation - 5-6.ipynb ├── Credit Risk 
Modeling - Preparation - 5-7.ipynb ├── Credit Risk Modeling - Preparation - 5-8.ipynb ├── Credit Risk Modeling - Preparation - 5-9.ipynb ├── Credit Risk Modeling - Preparation - With Comments - 5-10.ipynb ├── Credit Risk Modeling - Preparation - With Comments - 5-11.ipynb ├── Credit Risk Modeling - Preparation - With Comments - 5-2.ipynb ├── Credit Risk Modeling - Preparation - With Comments - 5-5.ipynb ├── Credit Risk Modeling - Preparation - With Comments - 5-6.ipynb ├── Credit Risk Modeling - Preparation - With Comments - 5-7.ipynb ├── Credit Risk Modeling - Preparation - With Comments - 5-8.ipynb └── Credit Risk Modeling - Preparation - With Comments - 5-9.ipynb /.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /workspace.xml -------------------------------------------------------------------------------- /.idea/credit-risk-modeling-in-python.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 12 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- 
/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Al Ardosa 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Python Data Science Complete – Credit Risk Modeling in Python Data Science 2 | 3 | Hi! Welcome to Python Credit Risk Modeling. A tutorial that teaches you how banks use python data science modeling to improve their performance and comply with regulatory requirements. This is the perfect tutorial for you, if you are interested in a python data science career. 4 | 5 | Hi I'm Al Ardosa the Fellow Actuary. I've been making tutorials since 2013. I'm here to help you do the same. I've majored in Computer Science and do advanced studying methods. 
My purpose is to make sure you understand every concept in these tutorials. If you get stuck with anything, send me a message, I'm here to help. 6 | 7 | I've been working as a senior software developer and tech lead in Lazada and other tech companies for many years, and am now taking all that I've learned to teach programming skills and to help you discover the amazing career opportunities that come with being a developer. 8 | 9 | ### References 10 | [Python Data Science and Machine Learning - Credit Risk Modeling](https://www.alardosa.com/python-data-science-credit-risk-modeling/) 11 | 12 | [Credit Risk Modeling in Python](https://www.udemy.com/course/credit-risk-modeling-in-python/) 13 | 14 | ## Contact 15 | You can send me a note on Linkedin [@alardosa](https://www.linkedin.com/in/alardosa/) 16 | 17 | or visit my website at: 18 | 19 | [www.alardosa.com](https://www.alardosa.com) 20 | -------------------------------------------------------------------------------- /Section 1 Introduction/1. Importance of Credit Risk.txt: -------------------------------------------------------------------------------- 1 | 2 | Important concepts mentioned in this section 3 | 4 | 1. **Credit** - 5 | Credit Card and Home loans are two very good examples of credit given to a borrower by a lender 6 | Money in a credit card is not ours, we need to pay it back. If we fail to pay it, we need to repay with interest. 7 | Home loans are another type of credit given. For this we have a collateral i.e. home itself, this could be used to recover money if the customer fails to pay back. 8 | Asset financing is another good example of credit. Organizations don't buy the assets at one go instead they finance it and pay it over the time. 9 | 10 | 2. **Credit Risk** - 11 | Likelihood that the borrower wouldn't repay their loan to the lender is credit risk. 12 | Collection costs are the costs incurred in recovering back the money that was not collected. 13 | 14 | 3.
**Default Event** - 15 | The event of a borrower not being able to repay their debt is called default. 16 | 17 | 4. Risk Based Pricing 18 | Lenders need to assess the credit risk associated with every loan they are giving to the borrower. 19 | To ensure that the borrower pays back the amount he has taken, lenders could ask for 20 | 1. Collateral 21 | 2. Increase the interest on the loan 22 | 23 | 5. Main reasons for serious financial crisis 24 | Lending to borrowers with a high probability of default 25 | Ex: Global Financial Crisis and Fall of Lehman Brothers 26 | 27 | -------------------------------------------------------------------------------- /Section 1 Introduction/2. Expected Loss and it's components.txt: -------------------------------------------------------------------------------- 1 | Expected Loss and its components 2 | EL - Expected Loss 3 | PD - Probability of Default 4 | LGD - Loss Given Default 5 | EAD - Exposure At Default 6 | 7 | Lenders know and expect the possibility of borrower not paying back 8 | 9 | Factors of expected loss 10 | 1. Borrower-specific factors 11 | 2. The economic environment 12 | 13 | How to estimate expected loss or expected credit loss? 14 | Definition: The amount a lender might lose by lending to a borrower 15 | 16 | EL = PD x LGD x EAD 17 | 18 | Probability of Default - Borrower's inability to repay their debt in full or on time 19 | 20 | Loss Given Default - The proportion of the total exposure that can't be recovered by the lender at default event 21 | 22 | Exposure At Default - Total value that a lender is exposed to when a borrower defaults 23 | 24 | Example 25 | Cost of House - $500,000 26 | Lender Funds 80% Loan to Value 27 | So loan amount - $400,000 28 | 29 | Borrower pays back $40,000 30 | Borrower Defaults here.
31 | 32 | So remaining amount to be recovered - $360,000 33 | Exposure at default - $360,000 34 | 35 | If there is empirical evidence of one in every 4 homeowners defaulting 36 | So, 37 | Probability of Default = 1 out of 4 = 1/4 = 25% 38 | 39 | Here the bank can sell the house for $342,000 40 | Now 41 | Loss = $360,000 - $342,000 = $18,000 42 | 43 | Loss Given Default = $18,000/$360,000 = 5% 44 | 45 | Expected Loss = PD x LGD x EAD 46 | = 25% x 5% x $360,000 47 | = $4,500 -------------------------------------------------------------------------------- /Section 1 Introduction/3. Capital requirement and BASEL II accord.txt: -------------------------------------------------------------------------------- 1 | When the Banking System suffers, it impacts the overall functioning of the government and stability of Economic System 2 | 3 | People wouldn't deposit money in their banks if it is not safe 4 | Consequently there would be less liquidity 5 | 6 | Regulators Rules: 7 | 1. Regulate bank operations and hence reduce risky behaviour 8 | 2. Guarantee to the public that the banking sector is in good health 9 | 10 | Loan Defaults 11 | Firms may default because of two possible reasons 12 | 1. Poor corporate management 13 | 2. Bad product performance 14 | 3. Also, if there is a global economic downturn 15 | 16 | Capital Requirement or Capital Adequacy or Regulatory Capital: 17 | Banks are required to have sufficient money to absorb losses 18 | 19 | Risk Weighted Assets - Every loan that bank gives is an asset to bank.
This loan is associated with risk 20 | 21 | So, Capital Adequacy Ratio should be greater than a certain percentage 22 | 23 | BASEL II Accord 24 | - How much capital banks need to have 25 | - How capital is defined 26 | - How Capital is compared against risk-weighted assets 27 | 28 | BASEL II Accord defines the Capital amount a bank needs 29 | because The Greater the risk a bank is exposed to the greater the capital it needs to hold 30 | 31 | BASEL II Accord has three pillars 32 | 1. Minimum Capital Requirement 33 | - Credit Risk 34 | - Operational Risk 35 | * Internal Ratings Based Approach (IRB) 36 | ** Foundational IRB Approach 37 | ** Advanced IRB Approach 38 | - Market Risk 39 | 40 | 2. Supervisory Review 41 | 3. Market Discipline 42 | 43 | -------------------------------------------------------------------------------- /Section 1 Introduction/4. BASEL II Approaches.txt: -------------------------------------------------------------------------------- 1 | BASEL II Approaches 2 | 3 | EL = PD x EAD x LGD 4 | 5 | BASEL II Accord has three pillars 6 | 1. Minimum Capital Requirement 7 | - Credit Risk 8 | - Operational Risk 9 | * Internal Ratings Based Approach (IRB) 10 | ** Foundational IRB Approach 11 | ** Advanced IRB Approach 12 | - Market Risk 13 | 14 | 2. Supervisory Review 15 | 3. Market Discipline 16 | 17 | From this we have three options to model the credit risk 18 | 1. Standardised Approach 19 | 2. Foundational IRB Approach 20 | 3. Advanced IRB Approach 21 | 22 | Capital Requirement is calculated differently under these three approach 23 | 1. 
Standardised Approach -- Fixed % of the total exposure 24 | FICO 25 | Moody's etc do credit risk rating 26 | In India - CRISIL 27 | S&P do it for Firms and Countries ex: AAA, AAA-, BBB etc 28 | 29 | Under this approach, 30 | There is a table specifying the % of the total amount to be held 31 | Ex: 32 | AAA to AAA- rated companies -- 20% 33 | A+ to A- -- 50% should be held as capital 34 | 35 | Retail, Credit Card and Consumer loans - 75% of the loan given 36 | Home Loans - 35% 37 | 38 | 2. Foundational IRB & Advanced IRB Approach 39 | When banks give out the loans they collect data and this data could be used for calculation 40 | This data can be used under IRB approach 41 | 42 | Banks would like to move from SA to F-IRB to A-IRB 43 | Because holding as much as 75% of the value of the loan is too much 44 | If a bank gets a proper risk profile of an individual, they would need to hold less amount of money 45 | 46 | More Precise estimation of Capital ==> More new business with SAME capital 47 | 48 | IRB Approach allows banks to do their own credit rating 49 | - Hence bank can allocate more resources to cover losses 50 | 51 | -------------------------------------------------------------------------------- /Section 1 Introduction/5.
Different Asset Classes and Financing them.txt: -------------------------------------------------------------------------------- 1 | Individuals 2 | Credit Cards -- 75% 3 | Consumer Loans -- 75% 4 | Mortgages -- 35% 5 | Firms 6 | SME 7 | Large Corporations 8 | 9 | Large Corporations are too few to have statistical data to model 10 | SME contains enough data to build statistical models 11 | Retail loans are also plentiful enough to get the data 12 | 13 | In this course we would focus on a general case where 14 | enough data is available for implementing a traditional statistical methodology to build credit risk model 15 | 16 | PD - Logistic Regression 17 | LGD - Beta Regression 18 | EAD - Beta Regression 19 | 20 | For different classes of customers we may have different data available 21 | Individuals - 22 | - Any Demographics or Social information Available 23 | - external credit risk agency data 24 | - No. of inquiries made for credit 25 | - Interest Rate 26 | 27 | Credit Card - 28 | Credit Limit 29 | Credit Limit Utilization can be used to build model 30 | 31 | Mortgage Loan - 32 | Loan To Value Ratio 33 | 34 | Corporate Loans - 35 | Firm's Size 36 | Years in business 37 | Line of Operation 38 | Target Market focus 39 | Financial Statements 40 | Return on Assets - Net income/Total Assets 41 | Return on Equity - Net Income/Shareholder's equity 42 | Current Ratio - Current Assets/Current Liabilities 43 | Debt Ratio - Total Liabilities/ Total Assets 44 | 45 | Much of this information is available 46 | - before the application 47 | - collected after the loan is granted and under a period of observation 48 | 49 | Both of these could be used to build a behaviour model. 50 | 51 | Two models could be made 52 | 1. Application Model 53 | If the loan is risky it would have a higher interest rate 54 | 2.
Behaviour Model 55 | Whether to grant an additional loan or not; 56 | Ex: The bank may use the credit card details for building the model 57 | 58 | In this course we would be building statistical models 59 | PD - Logistic Regression (Binomial Logistic Regression) 60 | LGD and EAD - Beta Regression 61 | 62 | We would be using Python -------------------------------------------------------------------------------- /Section 1 Introduction/README.md: -------------------------------------------------------------------------------- 1 | # Section 1: Introduction 2 | 3 | ## Lecture 0 4 | Introduction to this course 5 | 6 | ## Lecture 1 7 | Credit risk and its importance 8 | 9 | ## Lecture 2 10 | Expected Loss (EL) and its components: PD, LGD, EAD 11 | 12 | ## Lecture 3 13 | Capital Adequacy, Regulations and Basel II Accord 14 | 15 | ## Lecture 4 16 | BASEL II Approaches: SA, F-IRB, A-IRB 17 | 18 | ## Lecture 5 19 | Different Asset Classes and Risk Modeling Approaches 20 | -------------------------------------------------------------------------------- /Section 2 Setting up the environment/1. Setting up the environment.txt: -------------------------------------------------------------------------------- 1 | Machine Learning and Data Science 2 | 3 | Can be performed in Many Languages 4 | 5 | Python 6 | Anaconda 7 | Jupyter Notebook 8 | 9 | Install scikit-learn -------------------------------------------------------------------------------- /Section 2 Setting up the environment/2. Why Python and Jupyter.txt: -------------------------------------------------------------------------------- 1 | Why Python and Jupyter? 2 | 3 | Open Source - Anyone can use and contribute to it.
4 | General Purpose - Suitable for all types of work not only Data Science 5 | High-Level - Easy Syntax close to logical human labguage 6 | Available for all operating systems, Windows, Max, Linux, 7 | 8 | Scikit-learn is open source 9 | Countless Packages delivered and maintained for Python 10 | 11 | Python - Programming Language 12 | Jupyter - Software that let's user interact with computer using Python on a web browser 13 | File Format for Jupyter - .ipynb 14 | 15 | Jupyter 16 | Design Well Suited for Demonstrations of Programming concepts and training 17 | Instead of installing different interfaces for all programming languages like R,Python, Julia or PHP, Jupyter allows us to have one interfaces 18 | 19 | Jupyter is not a text editor, it's a software which has text,code and it;s output which also helps us plot the graphs in itself 20 | 21 | Anaconda contains both!! -------------------------------------------------------------------------------- /Section 3 Dataset Description/Credit Risk Modeling - Preparation - 3-1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Data Preparation" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Import Libraries" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "import numpy as np\n", 24 | "import pandas as pd" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "## Import Data" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "loan_data_backup = pd.read_csv('loan_data_2007_2014.csv')" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 
| "loan_data = loan_data_backup.copy()" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "## Explore Data" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "loan_data" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "pd.options.display.max_columns = None" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "loan_data" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "loan_data.head()" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "loan_data.tail()" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "loan_data.columns.values" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "metadata": { 117 | "scrolled": true 118 | }, 119 | "outputs": [], 120 | "source": [ 121 | "loan_data.info()" 122 | ] 123 | } 124 | ], 125 | "metadata": { 126 | "kernelspec": { 127 | "display_name": "Python 3", 128 | "language": "python", 129 | "name": "python3" 130 | }, 131 | "language_info": { 132 | "codemirror_mode": { 133 | "name": "ipython", 134 | "version": 3 135 | }, 136 | "file_extension": ".py", 137 | "mimetype": "text/x-python", 138 | "name": "python", 139 | "nbconvert_exporter": "python", 140 | "pygments_lexer": "ipython3", 141 | "version": "3.7.3" 142 | } 143 | }, 144 | "nbformat": 4, 145 | "nbformat_minor": 2 146 | } 147 | -------------------------------------------------------------------------------- /Section 3 Dataset Description/Foundation of 
PreProcessing.txt: -------------------------------------------------------------------------------- 1 | Dependent Variables and Independent Variables 2 | 3 | Dependent Variables - outcomes of interest 4 | 5 | Independent Variables - Predictors/Features 6 | 7 | PD Model 8 | Dependent Variable for our model - loan status 9 | 10 | LGD Model 11 | How much loan was recovered after default? 12 | Dependent Variable - Recovery Column 13 | 14 | EAD Model 15 | total recovered principal column 16 | 17 | Depending on the type of data we would apply different Preprocessing techniques 18 | 19 | 1. Discrete 20 | Categorical 21 | Finite Number of Values 22 | Some variables take discrete values but can be treated as continuous variables in our dataset Ex: Number of Credit Inquiries in last six months is one such example 23 | If the discrete values are sufficient and are ordered we can treat them continuous 24 | 25 | 2. Continuous 26 | Numerical 27 | Infinite Number of Values 28 | 29 | Distinctive Feature of the PD model 30 | All the independent variables have to be categorical 31 | Reason - It is much easier to present the model in a simplified form and turn it into a scorecard if we only have categorical instead of mixed 32 | 33 | Discrete Values into Categorical 34 | So we would transform the discrete variables into categorical variables or dummy variables 35 | Only when we reach a final version of the categories from discrete values, we would create dummies 36 | 37 | 38 | Continuous Variables can also be transformed into dummy variables 39 | Ex: Annual Income, No. of Credit Inquiries in last six months 40 | We will start by turning each of them into many categories of equally sized intervals 41 | 42 | Fine Classing 43 | Ex: No. 
of Months since loan has been granted <- column 44 | We will slice it into 50 intervals 45 | like 46 | 1 to 3, 47 | 4 to 6 48 | 7 to 9 months upto 50 49 | 50 | Now on these categories we try to understand how do they discriminate with adjascent categories 51 | For example if 1 and 2 we merge them etc. 52 | Coarse Classing 53 | If we make categories which are not equally distributed 54 | We make them using coarse classing 55 | 56 | -------------------------------------------------------------------------------- /Section 4 General Preprocessing/.ipynb_checkpoints/Credit Risk Modeling - Preparation - 4-3-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Data Preparation" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Import Libraries" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "import numpy as np\n", 24 | "import pandas as pd" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "## Import Data" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "loan_data_backup = pd.read_csv('loan_data_2007_2014.csv')" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "loan_data = loan_data_backup.copy()" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "## Explore Data" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "loan_data" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": {}, 72 | 
"outputs": [], 73 | "source": [ 74 | "pd.options.display.max_columns = None" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "loan_data" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "loan_data.head()" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "loan_data.tail()" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "loan_data.columns.values" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "metadata": { 117 | "scrolled": true 118 | }, 119 | "outputs": [], 120 | "source": [ 121 | "loan_data.info()" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "## General Preprocessing" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": {}, 134 | "source": [ 135 | "### Preprocessing few continuous variables" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "loan_data['emp_length'].unique()" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "loan_data['emp_length_int'] = loan_data['emp_length'].str.replace('\\+ years', '')\n", 154 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace('< 1 year', str(0))\n", 155 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace('n/a', str(0))\n", 156 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace(' years', '')\n", 157 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace(' 
year', '')" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": null, 163 | "metadata": {}, 164 | "outputs": [], 165 | "source": [ 166 | "type(loan_data['emp_length_int'][0])" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "loan_data['emp_length_int'] = pd.to_numeric(loan_data['emp_length_int'])" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": null, 181 | "metadata": {}, 182 | "outputs": [], 183 | "source": [ 184 | "type(loan_data['emp_length_int'][0])" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": { 191 | "scrolled": true 192 | }, 193 | "outputs": [], 194 | "source": [ 195 | "loan_data['earliest_cr_line']" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "loan_data['earliest_cr_line_date'] = pd.to_datetime(loan_data['earliest_cr_line'], format = '%b-%y')" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "metadata": {}, 211 | "outputs": [], 212 | "source": [ 213 | "type(loan_data['earliest_cr_line_date'][0])" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": null, 219 | "metadata": { 220 | "scrolled": true 221 | }, 222 | "outputs": [], 223 | "source": [ 224 | "pd.to_datetime('2017-12-01') - loan_data['earliest_cr_line_date']" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": null, 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [ 233 | "loan_data['mths_since_earliest_cr_line'] = round(pd.to_numeric((pd.to_datetime('2017-12-01') - loan_data['earliest_cr_line_date']) / np.timedelta64(1, 'M')))" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | 
"loan_data['mths_since_earliest_cr_line'].describe()" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": null, 248 | "metadata": { 249 | "scrolled": true 250 | }, 251 | "outputs": [], 252 | "source": [ 253 | "loan_data.loc[: , ['earliest_cr_line', 'earliest_cr_line_date', 'mths_since_earliest_cr_line']][loan_data['mths_since_earliest_cr_line'] < 0]" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": null, 259 | "metadata": {}, 260 | "outputs": [], 261 | "source": [ 262 | "loan_data['mths_since_earliest_cr_line'][loan_data['mths_since_earliest_cr_line'] < 0] = loan_data['mths_since_earliest_cr_line'].max()" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": null, 268 | "metadata": {}, 269 | "outputs": [], 270 | "source": [ 271 | "min(loan_data['mths_since_earliest_cr_line'])" 272 | ] 273 | }, 274 | { 275 | "cell_type": "markdown", 276 | "metadata": {}, 277 | "source": [ 278 | "### Homework" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": null, 284 | "metadata": { 285 | "scrolled": true 286 | }, 287 | "outputs": [], 288 | "source": [ 289 | "loan_data['term']" 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": null, 295 | "metadata": {}, 296 | "outputs": [], 297 | "source": [ 298 | "loan_data['term'].describe()" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": null, 304 | "metadata": {}, 305 | "outputs": [], 306 | "source": [ 307 | "loan_data['term_int'] = loan_data['term'].str.replace(' months', '')" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": null, 313 | "metadata": { 314 | "scrolled": true 315 | }, 316 | "outputs": [], 317 | "source": [ 318 | "loan_data['term_int']" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": null, 324 | "metadata": {}, 325 | "outputs": [], 326 | "source": [ 327 | "type(loan_data['term_int'][25])" 328 | ] 329 | }, 330 | { 331 
| "cell_type": "code", 332 | "execution_count": null, 333 | "metadata": { 334 | "scrolled": true 335 | }, 336 | "outputs": [], 337 | "source": [ 338 | "loan_data['term_int'] = pd.to_numeric(loan_data['term'].str.replace(' months', ''))\n", 339 | "loan_data['term_int']" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": null, 345 | "metadata": {}, 346 | "outputs": [], 347 | "source": [ 348 | "type(loan_data['term_int'][0])" 349 | ] 350 | }, 351 | { 352 | "cell_type": "code", 353 | "execution_count": null, 354 | "metadata": { 355 | "scrolled": true 356 | }, 357 | "outputs": [], 358 | "source": [ 359 | "loan_data['issue_d']" 360 | ] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": null, 365 | "metadata": {}, 366 | "outputs": [], 367 | "source": [ 368 | "loan_data['issue_d_date'] = pd.to_datetime(loan_data['issue_d'], format = '%b-%y')\n", 369 | "loan_data['mths_since_issue_d'] = round(pd.to_numeric((pd.to_datetime('2017-12-01') - loan_data['issue_d_date']) / np.timedelta64(1, 'M')))\n", 370 | "loan_data['mths_since_issue_d'].describe()" 371 | ] 372 | } 373 | ], 374 | "metadata": { 375 | "kernelspec": { 376 | "display_name": "Python 3", 377 | "language": "python", 378 | "name": "python3" 379 | }, 380 | "language_info": { 381 | "codemirror_mode": { 382 | "name": "ipython", 383 | "version": 3 384 | }, 385 | "file_extension": ".py", 386 | "mimetype": "text/x-python", 387 | "name": "python", 388 | "nbconvert_exporter": "python", 389 | "pygments_lexer": "ipython3", 390 | "version": "3.7.4" 391 | } 392 | }, 393 | "nbformat": 4, 394 | "nbformat_minor": 2 395 | } 396 | -------------------------------------------------------------------------------- /Section 4 General Preprocessing/Credit Risk Modeling - Preparation - 4-1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Data Preparation" 8 | ] 9 | 
}, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Import Libraries" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "import numpy as np\n", 24 | "import pandas as pd" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "## Import Data" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "loan_data_backup = pd.read_csv('loan_data_2007_2014.csv')" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "loan_data = loan_data_backup.copy()" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "## Explore Data" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "loan_data" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "pd.options.display.max_columns = None" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "loan_data" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "loan_data.head()" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "loan_data.tail()" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "loan_data.columns.values" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "metadata": { 117 | 
"scrolled": true 118 | }, 119 | "outputs": [], 120 | "source": [ 121 | "loan_data.info()" 122 | ] 123 | } 124 | ], 125 | "metadata": { 126 | "kernelspec": { 127 | "display_name": "Python 3", 128 | "language": "python", 129 | "name": "python3" 130 | }, 131 | "language_info": { 132 | "codemirror_mode": { 133 | "name": "ipython", 134 | "version": 3 135 | }, 136 | "file_extension": ".py", 137 | "mimetype": "text/x-python", 138 | "name": "python", 139 | "nbconvert_exporter": "python", 140 | "pygments_lexer": "ipython3", 141 | "version": "3.7.3" 142 | } 143 | }, 144 | "nbformat": 4, 145 | "nbformat_minor": 2 146 | } 147 | -------------------------------------------------------------------------------- /Section 4 General Preprocessing/Credit Risk Modeling - Preparation - 4-2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Data Preparation" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Import Libraries" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "import numpy as np\n", 24 | "import pandas as pd" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "## Import Data" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "loan_data_backup = pd.read_csv('loan_data_2007_2014.csv')" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "loan_data = loan_data_backup.copy()" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "## Explore Data" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": {}, 63 | 
"outputs": [], 64 | "source": [ 65 | "loan_data" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "pd.options.display.max_columns = None" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "loan_data" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "loan_data.head()" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "loan_data.tail()" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "loan_data.columns.values" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "metadata": { 117 | "scrolled": true 118 | }, 119 | "outputs": [], 120 | "source": [ 121 | "loan_data.info()" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "## General Preprocessing" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": {}, 134 | "source": [ 135 | "### Preprocessing few continuous variables" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "loan_data['emp_length'].unique()" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "loan_data['emp_length_int'] = loan_data['emp_length'].str.replace('\\+ years', '')\n", 154 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace('< 1 year', str(0))\n", 155 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace('n/a', str(0))\n", 156 | 
"loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace(' years', '')\n", 157 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace(' year', '')" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": null, 163 | "metadata": {}, 164 | "outputs": [], 165 | "source": [ 166 | "type(loan_data['emp_length_int'][0])" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "loan_data['emp_length_int'] = pd.to_numeric(loan_data['emp_length_int'])" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": null, 181 | "metadata": {}, 182 | "outputs": [], 183 | "source": [ 184 | "type(loan_data['emp_length_int'][0])" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": { 191 | "scrolled": true 192 | }, 193 | "outputs": [], 194 | "source": [ 195 | "loan_data['earliest_cr_line']" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "loan_data['earliest_cr_line_date'] = pd.to_datetime(loan_data['earliest_cr_line'], format = '%b-%y')" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "metadata": {}, 211 | "outputs": [], 212 | "source": [ 213 | "type(loan_data['earliest_cr_line_date'][0])" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": null, 219 | "metadata": { 220 | "scrolled": true 221 | }, 222 | "outputs": [], 223 | "source": [ 224 | "pd.to_datetime('2017-12-01') - loan_data['earliest_cr_line_date']" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": null, 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [ 233 | "loan_data['mths_since_earliest_cr_line'] = round(pd.to_numeric((pd.to_datetime('2017-12-01') - loan_data['earliest_cr_line_date']) / np.timedelta64(1, 
'M')))" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | "loan_data['mths_since_earliest_cr_line'].describe()" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": null, 248 | "metadata": { 249 | "scrolled": true 250 | }, 251 | "outputs": [], 252 | "source": [ 253 | "loan_data.loc[: , ['earliest_cr_line', 'earliest_cr_line_date', 'mths_since_earliest_cr_line']][loan_data['mths_since_earliest_cr_line'] < 0]" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": null, 259 | "metadata": {}, 260 | "outputs": [], 261 | "source": [ 262 | "loan_data['mths_since_earliest_cr_line'][loan_data['mths_since_earliest_cr_line'] < 0] = loan_data['mths_since_earliest_cr_line'].max()" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": null, 268 | "metadata": {}, 269 | "outputs": [], 270 | "source": [ 271 | "min(loan_data['mths_since_earliest_cr_line'])" 272 | ] 273 | } 274 | ], 275 | "metadata": { 276 | "kernelspec": { 277 | "display_name": "Python 3", 278 | "language": "python", 279 | "name": "python3" 280 | }, 281 | "language_info": { 282 | "codemirror_mode": { 283 | "name": "ipython", 284 | "version": 3 285 | }, 286 | "file_extension": ".py", 287 | "mimetype": "text/x-python", 288 | "name": "python", 289 | "nbconvert_exporter": "python", 290 | "pygments_lexer": "ipython3", 291 | "version": "3.7.3" 292 | } 293 | }, 294 | "nbformat": 4, 295 | "nbformat_minor": 2 296 | } 297 | -------------------------------------------------------------------------------- /Section 4 General Preprocessing/Credit Risk Modeling - Preparation - 4-3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Data Preparation" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | 
"source": [ 14 | "## Import Libraries" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "import numpy as np\n", 24 | "import pandas as pd" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "## Import Data" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "loan_data_backup = pd.read_csv('loan_data_2007_2014.csv')" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "loan_data = loan_data_backup.copy()" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "## Explore Data" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "loan_data" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "pd.options.display.max_columns = None" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "loan_data" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "loan_data.head()" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "loan_data.tail()" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "loan_data.columns.values" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "metadata": { 117 | "scrolled": true 118 | }, 119 | "outputs": [], 120 | "source": [ 121 
| "loan_data.info()" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "## General Preprocessing" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": {}, 134 | "source": [ 135 | "### Preprocessing few continuous variables" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "loan_data['emp_length'].unique()" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "loan_data['emp_length_int'] = loan_data['emp_length'].str.replace('\\+ years', '')\n", 154 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace('< 1 year', str(0))\n", 155 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace('n/a', str(0))\n", 156 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace(' years', '')\n", 157 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace(' year', '')" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": null, 163 | "metadata": {}, 164 | "outputs": [], 165 | "source": [ 166 | "type(loan_data['emp_length_int'][0])" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "loan_data['emp_length_int'] = pd.to_numeric(loan_data['emp_length_int'])" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": null, 181 | "metadata": {}, 182 | "outputs": [], 183 | "source": [ 184 | "type(loan_data['emp_length_int'][0])" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": { 191 | "scrolled": true 192 | }, 193 | "outputs": [], 194 | "source": [ 195 | "loan_data['earliest_cr_line']" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": 
null, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "loan_data['earliest_cr_line_date'] = pd.to_datetime(loan_data['earliest_cr_line'], format = '%b-%y')" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "metadata": {}, 211 | "outputs": [], 212 | "source": [ 213 | "type(loan_data['earliest_cr_line_date'][0])" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": null, 219 | "metadata": { 220 | "scrolled": true 221 | }, 222 | "outputs": [], 223 | "source": [ 224 | "pd.to_datetime('2017-12-01') - loan_data['earliest_cr_line_date']" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": null, 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [ 233 | "loan_data['mths_since_earliest_cr_line'] = round(pd.to_numeric((pd.to_datetime('2017-12-01') - loan_data['earliest_cr_line_date']) / np.timedelta64(1, 'M')))" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | "loan_data['mths_since_earliest_cr_line'].describe()" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": null, 248 | "metadata": { 249 | "scrolled": true 250 | }, 251 | "outputs": [], 252 | "source": [ 253 | "loan_data.loc[: , ['earliest_cr_line', 'earliest_cr_line_date', 'mths_since_earliest_cr_line']][loan_data['mths_since_earliest_cr_line'] < 0]" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": null, 259 | "metadata": {}, 260 | "outputs": [], 261 | "source": [ 262 | "loan_data['mths_since_earliest_cr_line'][loan_data['mths_since_earliest_cr_line'] < 0] = loan_data['mths_since_earliest_cr_line'].max()" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": null, 268 | "metadata": {}, 269 | "outputs": [], 270 | "source": [ 271 | "min(loan_data['mths_since_earliest_cr_line'])" 272 | ] 273 | }, 274 | { 275 | "cell_type": "markdown", 
276 | "metadata": {}, 277 | "source": [ 278 | "### Homework" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": null, 284 | "metadata": { 285 | "scrolled": true 286 | }, 287 | "outputs": [], 288 | "source": [ 289 | "loan_data['term']" 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": null, 295 | "metadata": {}, 296 | "outputs": [], 297 | "source": [ 298 | "loan_data['term'].describe()" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": null, 304 | "metadata": {}, 305 | "outputs": [], 306 | "source": [ 307 | "loan_data['term_int'] = loan_data['term'].str.replace(' months', '')" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": null, 313 | "metadata": { 314 | "scrolled": true 315 | }, 316 | "outputs": [], 317 | "source": [ 318 | "loan_data['term_int']" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": null, 324 | "metadata": {}, 325 | "outputs": [], 326 | "source": [ 327 | "type(loan_data['term_int'][25])" 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": null, 333 | "metadata": { 334 | "scrolled": true 335 | }, 336 | "outputs": [], 337 | "source": [ 338 | "loan_data['term_int'] = pd.to_numeric(loan_data['term'].str.replace(' months', ''))\n", 339 | "loan_data['term_int']" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": null, 345 | "metadata": {}, 346 | "outputs": [], 347 | "source": [ 348 | "type(loan_data['term_int'][0])" 349 | ] 350 | }, 351 | { 352 | "cell_type": "code", 353 | "execution_count": null, 354 | "metadata": { 355 | "scrolled": true 356 | }, 357 | "outputs": [], 358 | "source": [ 359 | "loan_data['issue_d']" 360 | ] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": null, 365 | "metadata": {}, 366 | "outputs": [], 367 | "source": [ 368 | "loan_data['issue_d_date'] = pd.to_datetime(loan_data['issue_d'], format = '%b-%y')\n", 369 | 
"loan_data['mths_since_issue_d'] = round(pd.to_numeric((pd.to_datetime('2017-12-01') - loan_data['issue_d_date']) / np.timedelta64(1, 'M')))\n", 370 | "loan_data['mths_since_issue_d'].describe()" 371 | ] 372 | } 373 | ], 374 | "metadata": { 375 | "kernelspec": { 376 | "display_name": "Python 3", 377 | "language": "python", 378 | "name": "python3" 379 | }, 380 | "language_info": { 381 | "codemirror_mode": { 382 | "name": "ipython", 383 | "version": 3 384 | }, 385 | "file_extension": ".py", 386 | "mimetype": "text/x-python", 387 | "name": "python", 388 | "nbconvert_exporter": "python", 389 | "pygments_lexer": "ipython3", 390 | "version": "3.7.4" 391 | } 392 | }, 393 | "nbformat": 4, 394 | "nbformat_minor": 2 395 | } 396 | -------------------------------------------------------------------------------- /Section 4 General Preprocessing/Credit Risk Modeling - Preparation - With Comments - 4-1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Data Preparation" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Import Libraries" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "import numpy as np\n", 24 | "import pandas as pd" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "## Import Data\n", 32 | "The dataset contains all available data for more than 800,000 consumer loans issued from 2007 to 2015 by Lending Club: a large US peer-to-peer lending company. There are several different versions of this dataset. We have used a version available on kaggle.com. 
You can find it here: https://www.kaggle.com/wendykan/lending-club-loan-data/version/1\n", 33 | "We divided the data into two periods because we assume that some data are available at the moment when we need to build Expected Loss models, and some data comes from applications after. Later, we investigate whether the applications we have after we built the Probability of Default (PD) model have similar characteristics with the applications we used to build the PD model." 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": { 40 | "scrolled": true 41 | }, 42 | "outputs": [], 43 | "source": [ 44 | "loan_data_backup = pd.read_csv('loan_data_2007_2014.csv')" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "loan_data = loan_data_backup.copy()" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "## Explore Data" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "loan_data" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "pd.options.display.max_columns = None\n", 79 | "#pd.options.display.max_rows = None\n", 80 | "# Sets the pandas dataframe options to display all columns/ rows." 
81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "loan_data" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": { 96 | "scrolled": true 97 | }, 98 | "outputs": [], 99 | "source": [ 100 | "loan_data.head()" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": { 107 | "scrolled": true 108 | }, 109 | "outputs": [], 110 | "source": [ 111 | "loan_data.tail()" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "loan_data.columns.values\n", 121 | "# Displays all column names." 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "loan_data.info()\n", 131 | "# Displays column names, complete (non-missing) cases per column, and datatype per column." 
132 | ] 133 | } 134 | ], 135 | "metadata": { 136 | "kernelspec": { 137 | "display_name": "Python 3", 138 | "language": "python", 139 | "name": "python3" 140 | }, 141 | "language_info": { 142 | "codemirror_mode": { 143 | "name": "ipython", 144 | "version": 3 145 | }, 146 | "file_extension": ".py", 147 | "mimetype": "text/x-python", 148 | "name": "python", 149 | "nbconvert_exporter": "python", 150 | "pygments_lexer": "ipython3", 151 | "version": "3.7.3" 152 | } 153 | }, 154 | "nbformat": 4, 155 | "nbformat_minor": 2 156 | } 157 | -------------------------------------------------------------------------------- /Section 4 General Preprocessing/Credit Risk Modeling - Preparation - With Comments - 4-2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Data Preparation" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Import Libraries" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "import numpy as np\n", 24 | "import pandas as pd" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "## Import Data\n", 32 | "The dataset contains all available data for more than 800,000 consumer loans issued from 2007 to 2015 by Lending Club: a large US peer-to-peer lending company. There are several different versions of this dataset. We have used a version available on kaggle.com. You can find it here: https://www.kaggle.com/wendykan/lending-club-loan-data/version/1\n", 33 | "We divided the data into two periods because we assume that some data are available at the moment when we need to build Expected Loss models, and some data comes from applications after. 
Later, we investigate whether the applications we have after we built the Probability of Default (PD) model have similar characteristics with the applications we used to build the PD model." 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": { 40 | "scrolled": true 41 | }, 42 | "outputs": [], 43 | "source": [ 44 | "loan_data_backup = pd.read_csv('loan_data_2007_2014.csv')" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "loan_data = loan_data_backup.copy()" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "## Explore Data" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "loan_data" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "pd.options.display.max_columns = None\n", 79 | "#pd.options.display.max_rows = None\n", 80 | "# Sets the pandas dataframe options to display all columns/ rows." 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "loan_data" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": { 96 | "scrolled": true 97 | }, 98 | "outputs": [], 99 | "source": [ 100 | "loan_data.head()" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": { 107 | "scrolled": true 108 | }, 109 | "outputs": [], 110 | "source": [ 111 | "loan_data.tail()" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "loan_data.columns.values\n", 121 | "# Displays all column names." 
122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "loan_data.info()\n", 131 | "# Displays column names, complete (non-missing) cases per column, and datatype per column." 132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "metadata": {}, 137 | "source": [ 138 | "## General Preprocessing" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": {}, 144 | "source": [ 145 | "### Preprocessing few continuous variables" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "loan_data['emp_length'].unique()\n", 155 | "# Displays unique values of a column." 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "loan_data['emp_length_int'] = loan_data['emp_length'].str.replace('\\+ years', '')\n", 165 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace('< 1 year', str(0))\n", 166 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace('n/a', str(0))\n", 167 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace(' years', '')\n", 168 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace(' year', '')\n", 169 | "# We store the preprocessed ‘employment length’ variable in a new variable called ‘employment length int’,\n", 170 | "# We assign the new ‘employment length int’ to be equal to the ‘employment length’ variable with the string ‘+ years’\n", 171 | "# replaced with nothing. Next, we replace the whole string ‘less than 1 year’ with the string ‘0’.\n", 172 | "# Then, we replace the ‘n/a’ string with the string ‘0’. Then, we replace the string ‘space years’ with nothing.\n", 173 | "# Finally, we replace the string ‘space year’ with nothing." 
174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": null, 179 | "metadata": {}, 180 | "outputs": [], 181 | "source": [ 182 | "type(loan_data['emp_length_int'][0])\n", 183 | "# Checks the datatype of a single element of a column." 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "metadata": {}, 190 | "outputs": [], 191 | "source": [ 192 | "loan_data['emp_length_int'] = pd.to_numeric(loan_data['emp_length_int'])\n", 193 | "# Transforms the values to numeric." 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": null, 199 | "metadata": {}, 200 | "outputs": [], 201 | "source": [ 202 | "type(loan_data['emp_length_int'][0])\n", 203 | "# Checks the datatype of a single element of a column." 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": null, 209 | "metadata": {}, 210 | "outputs": [], 211 | "source": [ 212 | "loan_data['earliest_cr_line']\n", 213 | "# Displays a column." 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": null, 219 | "metadata": {}, 220 | "outputs": [], 221 | "source": [ 222 | "loan_data['earliest_cr_line_date'] = pd.to_datetime(loan_data['earliest_cr_line'], format = '%b-%y')\n", 223 | "# Extracts the date and the time from a string variable that is in a given format." 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": null, 229 | "metadata": { 230 | "scrolled": false 231 | }, 232 | "outputs": [], 233 | "source": [ 234 | "type(loan_data['earliest_cr_line_date'][0])\n", 235 | "# Checks the datatype of a single element of a column." 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": null, 241 | "metadata": {}, 242 | "outputs": [], 243 | "source": [ 244 | "pd.to_datetime('2017-12-01') - loan_data['earliest_cr_line_date']\n", 245 | "# Calculates the difference between two dates and times." 
246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": null, 251 | "metadata": {}, 252 | "outputs": [], 253 | "source": [ 254 | "# Assume we are now in December 2017\n", 255 | "loan_data['mths_since_earliest_cr_line'] = round(pd.to_numeric((pd.to_datetime('2017-12-01') - loan_data['earliest_cr_line_date']) / np.timedelta64(1, 'M')))\n", 256 | "# We calculate the difference between two dates in months, turn it to numeric datatype and round it.\n", 257 | "# We save the result in a new variable." 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": null, 263 | "metadata": {}, 264 | "outputs": [], 265 | "source": [ 266 | "loan_data['mths_since_earliest_cr_line'].describe()\n", 267 | "# Shows some descriptive statistics for the values of a column.\n", 268 | "# Dates from 1969 and before are not being converted well, i.e., they have become 2069 and similar,\n", 269 | "# and negative differences are being calculated." 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": null, 275 | "metadata": {}, 276 | "outputs": [], 277 | "source": [ 278 | "loan_data.loc[: , ['earliest_cr_line', 'earliest_cr_line_date', 'mths_since_earliest_cr_line']][loan_data['mths_since_earliest_cr_line'] < 0]\n", 279 | "# We take three columns from the dataframe. Then, we display them only for the rows where a variable has negative value.\n", 280 | "# There are 2303 strange negative values." 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": null, 286 | "metadata": {}, 287 | "outputs": [], 288 | "source": [ 289 | "loan_data['mths_since_earliest_cr_line'][loan_data['mths_since_earliest_cr_line'] < 0] = loan_data['mths_since_earliest_cr_line'].max()\n", 290 | "# We set the rows that had negative differences to the maximum value."
291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": null, 296 | "metadata": {}, 297 | "outputs": [], 298 | "source": [ 299 | "min(loan_data['mths_since_earliest_cr_line'])\n", 300 | "# Calculates and shows the minimum value of a column." 301 | ] 302 | } 303 | ], 304 | "metadata": { 305 | "kernelspec": { 306 | "display_name": "Python 3", 307 | "language": "python", 308 | "name": "python3" 309 | }, 310 | "language_info": { 311 | "codemirror_mode": { 312 | "name": "ipython", 313 | "version": 3 314 | }, 315 | "file_extension": ".py", 316 | "mimetype": "text/x-python", 317 | "name": "python", 318 | "nbconvert_exporter": "python", 319 | "pygments_lexer": "ipython3", 320 | "version": "3.7.3" 321 | } 322 | }, 323 | "nbformat": 4, 324 | "nbformat_minor": 2 325 | } 326 | -------------------------------------------------------------------------------- /Section 4 General Preprocessing/Credit Risk Modeling - Preparation - With Comments - 4-3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Data Preparation" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Import Libraries" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "import numpy as np\n", 24 | "import pandas as pd" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "## Import Data\n", 32 | "The dataset contains all available data for more than 800,000 consumer loans issued from 2007 to 2015 by Lending Club: a large US peer-to-peer lending company. There are several different versions of this dataset. We have used a version available on kaggle.com. 
You can find it here: https://www.kaggle.com/wendykan/lending-club-loan-data/version/1\n", 33 | "We divided the data into two periods because we assume that some data are available at the moment when we need to build Expected Loss models, and some data comes from applications after. Later, we investigate whether the applications we have after we built the Probability of Default (PD) model have similar characteristics with the applications we used to build the PD model." 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": { 40 | "scrolled": true 41 | }, 42 | "outputs": [], 43 | "source": [ 44 | "loan_data_backup = pd.read_csv('loan_data_2007_2014.csv')" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "loan_data = loan_data_backup.copy()" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "## Explore Data" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "loan_data" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "pd.options.display.max_columns = None\n", 79 | "#pd.options.display.max_rows = None\n", 80 | "# Sets the pandas dataframe options to display all columns/ rows." 
81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "loan_data" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": { 96 | "scrolled": true 97 | }, 98 | "outputs": [], 99 | "source": [ 100 | "loan_data.head()" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": { 107 | "scrolled": true 108 | }, 109 | "outputs": [], 110 | "source": [ 111 | "loan_data.tail()" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "loan_data.columns.values\n", 121 | "# Displays all column names." 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "loan_data.info()\n", 131 | "# Displays column names, complete (non-missing) cases per column, and datatype per column." 132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "metadata": {}, 137 | "source": [ 138 | "## General Preprocessing" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": {}, 144 | "source": [ 145 | "### Preprocessing few continuous variables" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "loan_data['emp_length'].unique()\n", 155 | "# Displays unique values of a column." 
156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "loan_data['emp_length_int'] = loan_data['emp_length'].str.replace('\\+ years', '')\n", 165 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace('< 1 year', str(0))\n", 166 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace('n/a', str(0))\n", 167 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace(' years', '')\n", 168 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace(' year', '')\n", 169 | "# We store the preprocessed ‘employment length’ variable in a new variable called ‘employment length int’,\n", 170 | "# We assign the new ‘employment length int’ to be equal to the ‘employment length’ variable with the string ‘+ years’\n", 171 | "# replaced with nothing. Next, we replace the whole string ‘less than 1 year’ with the string ‘0’.\n", 172 | "# Then, we replace the ‘n/a’ string with the string ‘0’. Then, we replace the string ‘space years’ with nothing.\n", 173 | "# Finally, we replace the string ‘space year’ with nothing." 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": null, 179 | "metadata": {}, 180 | "outputs": [], 181 | "source": [ 182 | "type(loan_data['emp_length_int'][0])\n", 183 | "# Checks the datatype of a single element of a column." 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "metadata": {}, 190 | "outputs": [], 191 | "source": [ 192 | "loan_data['emp_length_int'] = pd.to_numeric(loan_data['emp_length_int'])\n", 193 | "# Transforms the values to numeric." 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": null, 199 | "metadata": {}, 200 | "outputs": [], 201 | "source": [ 202 | "type(loan_data['emp_length_int'][0])\n", 203 | "# Checks the datatype of a single element of a column." 
204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": null, 209 | "metadata": {}, 210 | "outputs": [], 211 | "source": [ 212 | "loan_data['earliest_cr_line']\n", 213 | "# Displays a column." 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": null, 219 | "metadata": {}, 220 | "outputs": [], 221 | "source": [ 222 | "loan_data['earliest_cr_line_date'] = pd.to_datetime(loan_data['earliest_cr_line'], format = '%b-%y')\n", 223 | "# Extracts the date and the time from a string variable that is in a given format." 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": null, 229 | "metadata": { 230 | "scrolled": false 231 | }, 232 | "outputs": [], 233 | "source": [ 234 | "type(loan_data['earliest_cr_line_date'][0])\n", 235 | "# Checks the datatype of a single element of a column." 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": null, 241 | "metadata": {}, 242 | "outputs": [], 243 | "source": [ 244 | "pd.to_datetime('2017-12-01') - loan_data['earliest_cr_line_date']\n", 245 | "# Calculates the difference between two dates and times." 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": null, 251 | "metadata": {}, 252 | "outputs": [], 253 | "source": [ 254 | "# Assume we are now in December 2017\n", 255 | "loan_data['mths_since_earliest_cr_line'] = round(pd.to_numeric((pd.to_datetime('2017-12-01') - loan_data['earliest_cr_line_date']) / np.timedelta64(1, 'M')))\n", 256 | "# We calculate the difference between two dates in months, turn it to numeric datatype and round it.\n", 257 | "# We save the result in a new variable." 
258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": null, 263 | "metadata": {}, 264 | "outputs": [], 265 | "source": [ 266 | "loan_data['mths_since_earliest_cr_line'].describe()\n", 267 | "# Shows some descriptive statistics for the values of a column.\n", 268 | "# Dates from 1969 and before are not being converted well, i.e., they have become 2069 and similar,\n", 269 | "# and negative differences are being calculated." 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": null, 275 | "metadata": {}, 276 | "outputs": [], 277 | "source": [ 278 | "loan_data.loc[: , ['earliest_cr_line', 'earliest_cr_line_date', 'mths_since_earliest_cr_line']][loan_data['mths_since_earliest_cr_line'] < 0]\n", 279 | "# We take three columns from the dataframe. Then, we display them only for the rows where a variable has negative value.\n", 280 | "# There are 2303 strange negative values." 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": null, 286 | "metadata": {}, 287 | "outputs": [], 288 | "source": [ 289 | "loan_data['mths_since_earliest_cr_line'][loan_data['mths_since_earliest_cr_line'] < 0] = loan_data['mths_since_earliest_cr_line'].max()\n", 290 | "# We set the rows that had negative differences to the maximum value." 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": null, 296 | "metadata": {}, 297 | "outputs": [], 298 | "source": [ 299 | "min(loan_data['mths_since_earliest_cr_line'])\n", 300 | "# Calculates and shows the minimum value of a column." 
301 | ] 302 | }, 303 | { 304 | "cell_type": "markdown", 305 | "metadata": {}, 306 | "source": [ 307 | "### Homework" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": null, 313 | "metadata": {}, 314 | "outputs": [], 315 | "source": [ 316 | "loan_data['term']" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": null, 322 | "metadata": {}, 323 | "outputs": [], 324 | "source": [ 325 | "loan_data['term'].describe()\n", 326 | "# Shows some descriptive statistics for the values of a column." 327 | ] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": null, 332 | "metadata": {}, 333 | "outputs": [], 334 | "source": [ 335 | "loan_data['term_int'] = loan_data['term'].str.replace(' months', '')\n", 336 | "# We replace a string with another string, in this case, with an empty string (i.e. with nothing)." 337 | ] 338 | }, 339 | { 340 | "cell_type": "code", 341 | "execution_count": null, 342 | "metadata": {}, 343 | "outputs": [], 344 | "source": [ 345 | "loan_data['term_int']" 346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": null, 351 | "metadata": {}, 352 | "outputs": [], 353 | "source": [ 354 | "type(loan_data['term_int'][25])\n", 355 | "# Checks the datatype of a single element of a column." 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": null, 361 | "metadata": {}, 362 | "outputs": [], 363 | "source": [ 364 | "loan_data['term_int'] = pd.to_numeric(loan_data['term'].str.replace(' months', ''))\n", 365 | "# We replace a string from a variable with another string, in this case, with an empty string (i.e. 
with nothing).\n", 366 | "# We turn the result to numeric datatype and save it in another variable.\n", 367 | "loan_data['term_int']" 368 | ] 369 | }, 370 | { 371 | "cell_type": "code", 372 | "execution_count": null, 373 | "metadata": {}, 374 | "outputs": [], 375 | "source": [ 376 | "type(loan_data['term_int'][0])\n", 377 | "# Checks the datatype of a single element of a column." 378 | ] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "execution_count": null, 383 | "metadata": {}, 384 | "outputs": [], 385 | "source": [ 386 | "loan_data['issue_d']" 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": null, 392 | "metadata": {}, 393 | "outputs": [], 394 | "source": [ 395 | "# Assume we are now in December 2017\n", 396 | "loan_data['issue_d_date'] = pd.to_datetime(loan_data['issue_d'], format = '%b-%y')\n", 397 | "# Extracts the date and the time from a string variable that is in a given format.\n", 398 | "loan_data['mths_since_issue_d'] = round(pd.to_numeric((pd.to_datetime('2017-12-01') - loan_data['issue_d_date']) / np.timedelta64(1, 'M')))\n", 399 | "# We calculate the difference between two dates in months, turn it to numeric datatype and round it.\n", 400 | "# We save the result in a new variable.\n", 401 | "loan_data['mths_since_issue_d'].describe()\n", 402 | "# Shows some descriptive statistics for the values of a column." 
403 | ] 404 | } 405 | ], 406 | "metadata": { 407 | "kernelspec": { 408 | "display_name": "Python 3", 409 | "language": "python", 410 | "name": "python3" 411 | }, 412 | "language_info": { 413 | "codemirror_mode": { 414 | "name": "ipython", 415 | "version": 3 416 | }, 417 | "file_extension": ".py", 418 | "mimetype": "text/x-python", 419 | "name": "python", 420 | "nbconvert_exporter": "python", 421 | "pygments_lexer": "ipython3", 422 | "version": "3.7.3" 423 | } 424 | }, 425 | "nbformat": 4, 426 | "nbformat_minor": 2 427 | } 428 | -------------------------------------------------------------------------------- /Section 5 PD Model, Data Preparation/.ipynb_checkpoints/Credit Risk Modeling - Preparation - With Comments - 5-2-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Data Preparation" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Import Libraries" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "import numpy as np\n", 24 | "import pandas as pd" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "## Import Data\n", 32 | "The dataset contains all available data for more than 800,000 consumer loans issued from 2007 to 2015 by Lending Club: a large US peer-to-peer lending company. There are several different versions of this dataset. We have used a version available on kaggle.com. You can find it here: https://www.kaggle.com/wendykan/lending-club-loan-data/version/1\n", 33 | "We divided the data into two periods because we assume that some data are available at the moment when we need to build Expected Loss models, and some data comes from applications after. 
Later, we investigate whether the applications we have after we built the Probability of Default (PD) model have similar characteristics with the applications we used to build the PD model." 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": { 40 | "scrolled": true 41 | }, 42 | "outputs": [], 43 | "source": [ 44 | "loan_data_backup = pd.read_csv('loan_data_2007_2014.csv')" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "loan_data = loan_data_backup.copy()" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "## Explore Data" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "loan_data" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "pd.options.display.max_columns = None\n", 79 | "#pd.options.display.max_rows = None\n", 80 | "# Sets the pandas dataframe options to display all columns/ rows." 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "loan_data" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": { 96 | "scrolled": true 97 | }, 98 | "outputs": [], 99 | "source": [ 100 | "loan_data.head()" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": { 107 | "scrolled": true 108 | }, 109 | "outputs": [], 110 | "source": [ 111 | "loan_data.tail()" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "loan_data.columns.values\n", 121 | "# Displays all column names." 
122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "loan_data.info()\n", 131 | "# Displays column names, complete (non-missing) cases per column, and datatype per column." 132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "metadata": {}, 137 | "source": [ 138 | "## General Preprocessing" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": {}, 144 | "source": [ 145 | "### Preprocessing few continuous variables" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "loan_data['emp_length'].unique()\n", 155 | "# Displays unique values of a column." 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "loan_data['emp_length_int'] = loan_data['emp_length'].str.replace('\\+ years', '')\n", 165 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace('< 1 year', str(0))\n", 166 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace('n/a', str(0))\n", 167 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace(' years', '')\n", 168 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace(' year', '')\n", 169 | "# We store the preprocessed ‘employment length’ variable in a new variable called ‘employment length int’,\n", 170 | "# We assign the new ‘employment length int’ to be equal to the ‘employment length’ variable with the string ‘+ years’\n", 171 | "# replaced with nothing. Next, we replace the whole string ‘less than 1 year’ with the string ‘0’.\n", 172 | "# Then, we replace the ‘n/a’ string with the string ‘0’. Then, we replace the string ‘space years’ with nothing.\n", 173 | "# Finally, we replace the string ‘space year’ with nothing." 
174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": null, 179 | "metadata": {}, 180 | "outputs": [], 181 | "source": [ 182 | "type(loan_data['emp_length_int'][0])\n", 183 | "# Checks the datatype of a single element of a column." 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "metadata": {}, 190 | "outputs": [], 191 | "source": [ 192 | "loan_data['emp_length_int'] = pd.to_numeric(loan_data['emp_length_int'])\n", 193 | "# Transforms the values to numeric." 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": null, 199 | "metadata": {}, 200 | "outputs": [], 201 | "source": [ 202 | "type(loan_data['emp_length_int'][0])\n", 203 | "# Checks the datatype of a single element of a column." 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": null, 209 | "metadata": {}, 210 | "outputs": [], 211 | "source": [ 212 | "loan_data['earliest_cr_line']\n", 213 | "# Displays a column." 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": null, 219 | "metadata": {}, 220 | "outputs": [], 221 | "source": [ 222 | "loan_data['earliest_cr_line_date'] = pd.to_datetime(loan_data['earliest_cr_line'], format = '%b-%y')\n", 223 | "# Extracts the date and the time from a string variable that is in a given format." 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": null, 229 | "metadata": { 230 | "scrolled": false 231 | }, 232 | "outputs": [], 233 | "source": [ 234 | "type(loan_data['earliest_cr_line_date'][0])\n", 235 | "# Checks the datatype of a single element of a column." 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": null, 241 | "metadata": {}, 242 | "outputs": [], 243 | "source": [ 244 | "pd.to_datetime('2017-12-01') - loan_data['earliest_cr_line_date']\n", 245 | "# Calculates the difference between two dates and times." 
246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": null, 251 | "metadata": {}, 252 | "outputs": [], 253 | "source": [ 254 | "# Assume we are now in December 2017\n", 255 | "loan_data['mths_since_earliest_cr_line'] = round(pd.to_numeric((pd.to_datetime('2017-12-01') - loan_data['earliest_cr_line_date']) / np.timedelta64(1, 'M')))\n", 256 | "# We calculate the difference between two dates in months, turn it to numeric datatype and round it.\n", 257 | "# We save the result in a new variable." 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": null, 263 | "metadata": {}, 264 | "outputs": [], 265 | "source": [ 266 | "loan_data['mths_since_earliest_cr_line'].describe()\n", 267 | "# Shows some descriptive statistics for the values of a column.\n", 268 | "# Dates from 1969 and before are not being converted well, i.e., they have become 2069 and similar,\n", 269 | "# and negative differences are being calculated." 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": null, 275 | "metadata": {}, 276 | "outputs": [], 277 | "source": [ 278 | "loan_data.loc[: , ['earliest_cr_line', 'earliest_cr_line_date', 'mths_since_earliest_cr_line']][loan_data['mths_since_earliest_cr_line'] < 0]\n", 279 | "# We take three columns from the dataframe. Then, we display them only for the rows where a variable has negative value.\n", 280 | "# There are 2303 strange negative values." 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": null, 286 | "metadata": {}, 287 | "outputs": [], 288 | "source": [ 289 | "loan_data['mths_since_earliest_cr_line'][loan_data['mths_since_earliest_cr_line'] < 0] = loan_data['mths_since_earliest_cr_line'].max()\n", 290 | "# We set the rows that had negative differences to the maximum value." 
291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": null, 296 | "metadata": {}, 297 | "outputs": [], 298 | "source": [ 299 | "min(loan_data['mths_since_earliest_cr_line'])\n", 300 | "# Calculates and shows the minimum value of a column." 301 | ] 302 | }, 303 | { 304 | "cell_type": "markdown", 305 | "metadata": {}, 306 | "source": [ 307 | "### Homework" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": null, 313 | "metadata": {}, 314 | "outputs": [], 315 | "source": [ 316 | "loan_data['term']" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": null, 322 | "metadata": {}, 323 | "outputs": [], 324 | "source": [ 325 | "loan_data['term'].describe()\n", 326 | "# Shows some descriptive statistics for the values of a column." 327 | ] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": null, 332 | "metadata": {}, 333 | "outputs": [], 334 | "source": [ 335 | "loan_data['term_int'] = loan_data['term'].str.replace(' months', '')\n", 336 | "# We replace a string with another string, in this case, with an empty string (i.e. with nothing)." 337 | ] 338 | }, 339 | { 340 | "cell_type": "code", 341 | "execution_count": null, 342 | "metadata": {}, 343 | "outputs": [], 344 | "source": [ 345 | "loan_data['term_int']" 346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": null, 351 | "metadata": {}, 352 | "outputs": [], 353 | "source": [ 354 | "type(loan_data['term_int'][25])\n", 355 | "# Checks the datatype of a single element of a column." 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": null, 361 | "metadata": {}, 362 | "outputs": [], 363 | "source": [ 364 | "loan_data['term_int'] = pd.to_numeric(loan_data['term'].str.replace(' months', ''))\n", 365 | "# We replace a string from a variable with another string, in this case, with an empty string (i.e. 
with nothing).\n", 366 | "# We turn the result to numeric datatype and save it in another variable.\n", 367 | "loan_data['term_int']" 368 | ] 369 | }, 370 | { 371 | "cell_type": "code", 372 | "execution_count": null, 373 | "metadata": {}, 374 | "outputs": [], 375 | "source": [ 376 | "type(loan_data['term_int'][0])\n", 377 | "# Checks the datatype of a single element of a column." 378 | ] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "execution_count": null, 383 | "metadata": {}, 384 | "outputs": [], 385 | "source": [ 386 | "loan_data['issue_d']" 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": null, 392 | "metadata": {}, 393 | "outputs": [], 394 | "source": [ 395 | "# Assume we are now in December 2017\n", 396 | "loan_data['issue_d_date'] = pd.to_datetime(loan_data['issue_d'], format = '%b-%y')\n", 397 | "# Extracts the date and the time from a string variable that is in a given format.\n", 398 | "loan_data['mths_since_issue_d'] = round(pd.to_numeric((pd.to_datetime('2017-12-01') - loan_data['issue_d_date']) / np.timedelta64(1, 'M')))\n", 399 | "# We calculate the difference between two dates in months, turn it to numeric datatype and round it.\n", 400 | "# We save the result in a new variable.\n", 401 | "loan_data['mths_since_issue_d'].describe()\n", 402 | "# Shows some descriptive statistics for the values of a column." 403 | ] 404 | }, 405 | { 406 | "cell_type": "markdown", 407 | "metadata": {}, 408 | "source": [ 409 | "### Preprocessing few discrete variables" 410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "execution_count": null, 415 | "metadata": {}, 416 | "outputs": [], 417 | "source": [ 418 | "loan_data.info()\n", 419 | "# Displays column names, complete (non-missing) cases per column, and datatype per column." 
420 | ] 421 | }, 422 | { 423 | "cell_type": "markdown", 424 | "metadata": {}, 425 | "source": [ 426 | "We are going to preprocess the following discrete variables: grade, sub_grade, home_ownership, verification_status, loan_status, purpose, addr_state, initial_list_status. Most likely, we are not going to use sub_grade, as it overlaps with grade." 427 | ] 428 | }, 429 | { 430 | "cell_type": "code", 431 | "execution_count": null, 432 | "metadata": {}, 433 | "outputs": [], 434 | "source": [ 435 | "pd.get_dummies(loan_data['grade'])\n", 436 | "# Create dummy variables from a variable." 437 | ] 438 | }, 439 | { 440 | "cell_type": "code", 441 | "execution_count": null, 442 | "metadata": {}, 443 | "outputs": [], 444 | "source": [ 445 | "pd.get_dummies(loan_data['grade'], prefix = 'grade', prefix_sep = ':')\n", 446 | "# Create dummy variables from a variable." 447 | ] 448 | }, 449 | { 450 | "cell_type": "code", 451 | "execution_count": null, 452 | "metadata": { 453 | "scrolled": true 454 | }, 455 | "outputs": [], 456 | "source": [ 457 | "loan_data_dummies = [pd.get_dummies(loan_data['grade'], prefix = 'grade', prefix_sep = ':'),\n", 458 | " pd.get_dummies(loan_data['sub_grade'], prefix = 'sub_grade', prefix_sep = ':'),\n", 459 | " pd.get_dummies(loan_data['home_ownership'], prefix = 'home_ownership', prefix_sep = ':'),\n", 460 | " pd.get_dummies(loan_data['verification_status'], prefix = 'verification_status', prefix_sep = ':'),\n", 461 | " pd.get_dummies(loan_data['loan_status'], prefix = 'loan_status', prefix_sep = ':'),\n", 462 | " pd.get_dummies(loan_data['purpose'], prefix = 'purpose', prefix_sep = ':'),\n", 463 | " pd.get_dummies(loan_data['addr_state'], prefix = 'addr_state', prefix_sep = ':'),\n", 464 | " pd.get_dummies(loan_data['initial_list_status'], prefix = 'initial_list_status', prefix_sep = ':')]\n", 465 | "# We create dummy variables from all 8 original independent variables, and save them into a list.\n", 466 | "# Note that we are using a particular 
naming convention for all variables: original variable name, colon, category name." 467 | ] 468 | }, 469 | { 470 | "cell_type": "code", 471 | "execution_count": null, 472 | "metadata": {}, 473 | "outputs": [], 474 | "source": [ 475 | "loan_data_dummies = pd.concat(loan_data_dummies, axis = 1)\n", 476 | "# We concatenate the dummy variables and this turns them into a dataframe." 477 | ] 478 | }, 479 | { 480 | "cell_type": "code", 481 | "execution_count": null, 482 | "metadata": {}, 483 | "outputs": [], 484 | "source": [ 485 | "type(loan_data_dummies)\n", 486 | "# Returns the type of the variable." 487 | ] 488 | }, 489 | { 490 | "cell_type": "code", 491 | "execution_count": null, 492 | "metadata": {}, 493 | "outputs": [], 494 | "source": [ 495 | "loan_data = pd.concat([loan_data, loan_data_dummies], axis = 1)\n", 496 | "# Concatenates two dataframes.\n", 497 | "# Here we concatenate the dataframe with original data with the dataframe with dummy variables, along the columns. " 498 | ] 499 | }, 500 | { 501 | "cell_type": "code", 502 | "execution_count": null, 503 | "metadata": {}, 504 | "outputs": [], 505 | "source": [ 506 | "loan_data.columns.values\n", 507 | "# Displays all column names." 508 | ] 509 | }, 510 | { 511 | "cell_type": "markdown", 512 | "metadata": {}, 513 | "source": [ 514 | "### Check for missing values and clean" 515 | ] 516 | }, 517 | { 518 | "cell_type": "code", 519 | "execution_count": null, 520 | "metadata": { 521 | "scrolled": false 522 | }, 523 | "outputs": [], 524 | "source": [ 525 | "loan_data.isnull()\n", 526 | "# It returns 'False' if a value is not missing and 'True' if a value is missing, for each value in a dataframe." 
527 | ] 528 | }, 529 | { 530 | "cell_type": "code", 531 | "execution_count": null, 532 | "metadata": {}, 533 | "outputs": [], 534 | "source": [ 535 | "pd.options.display.max_rows = None\n", 536 | "# Sets the pandas dataframe options to display all columns/ rows.\n", 537 | "loan_data.isnull().sum()" 538 | ] 539 | }, 540 | { 541 | "cell_type": "code", 542 | "execution_count": null, 543 | "metadata": {}, 544 | "outputs": [], 545 | "source": [ 546 | "pd.options.display.max_rows = 100\n", 547 | "# Sets the pandas dataframe options to display 100 columns/ rows." 548 | ] 549 | }, 550 | { 551 | "cell_type": "code", 552 | "execution_count": null, 553 | "metadata": {}, 554 | "outputs": [], 555 | "source": [ 556 | "# 'Total revolving high credit/ credit limit', so it makes sense that the missing values are equal to funded_amnt.\n", 557 | "loan_data['total_rev_hi_lim'].fillna(loan_data['funded_amnt'], inplace=True)\n", 558 | "# We fill the missing values with the values of another variable." 559 | ] 560 | }, 561 | { 562 | "cell_type": "code", 563 | "execution_count": null, 564 | "metadata": {}, 565 | "outputs": [], 566 | "source": [ 567 | "loan_data['total_rev_hi_lim'].isnull().sum()" 568 | ] 569 | }, 570 | { 571 | "cell_type": "markdown", 572 | "metadata": {}, 573 | "source": [ 574 | "### Homework" 575 | ] 576 | }, 577 | { 578 | "cell_type": "code", 579 | "execution_count": null, 580 | "metadata": {}, 581 | "outputs": [], 582 | "source": [ 583 | "loan_data['annual_inc'].fillna(loan_data['annual_inc'].mean(), inplace=True)\n", 584 | "# We fill the missing values with the mean value of the non-missing values." 
585 | ] 586 | }, 587 | { 588 | "cell_type": "code", 589 | "execution_count": null, 590 | "metadata": {}, 591 | "outputs": [], 592 | "source": [ 593 | "loan_data['mths_since_earliest_cr_line'].fillna(0, inplace=True)\n", 594 | "loan_data['acc_now_delinq'].fillna(0, inplace=True)\n", 595 | "loan_data['total_acc'].fillna(0, inplace=True)\n", 596 | "loan_data['pub_rec'].fillna(0, inplace=True)\n", 597 | "loan_data['open_acc'].fillna(0, inplace=True)\n", 598 | "loan_data['inq_last_6mths'].fillna(0, inplace=True)\n", 599 | "loan_data['delinq_2yrs'].fillna(0, inplace=True)\n", 600 | "loan_data['emp_length_int'].fillna(0, inplace=True)\n", 601 | "# We fill the missing values with zeroes." 602 | ] 603 | }, 604 | { 605 | "cell_type": "markdown", 606 | "metadata": {}, 607 | "source": [ 608 | "# PD model" 609 | ] 610 | }, 611 | { 612 | "cell_type": "markdown", 613 | "metadata": {}, 614 | "source": [ 615 | "## Data preparation" 616 | ] 617 | }, 618 | { 619 | "cell_type": "markdown", 620 | "metadata": {}, 621 | "source": [ 622 | "### Dependent Variable. Good/ Bad (Default) Definition. Default and Non-default Accounts." 623 | ] 624 | }, 625 | { 626 | "cell_type": "code", 627 | "execution_count": null, 628 | "metadata": {}, 629 | "outputs": [], 630 | "source": [ 631 | "loan_data['loan_status'].unique()\n", 632 | "# Displays unique values of a column." 633 | ] 634 | }, 635 | { 636 | "cell_type": "code", 637 | "execution_count": null, 638 | "metadata": { 639 | "scrolled": true 640 | }, 641 | "outputs": [], 642 | "source": [ 643 | "loan_data['loan_status'].value_counts()\n", 644 | "# Calculates the number of observations for each unique value of a variable." 
645 | ] 646 | }, 647 | { 648 | "cell_type": "code", 649 | "execution_count": null, 650 | "metadata": {}, 651 | "outputs": [], 652 | "source": [ 653 | "loan_data['loan_status'].value_counts() / loan_data['loan_status'].count()\n", 654 | "# We divide the number of observations for each unique value of a variable by the total number of observations.\n", 655 | "# Thus, we get the proportion of observations for each unique value of a variable." 656 | ] 657 | }, 658 | { 659 | "cell_type": "code", 660 | "execution_count": null, 661 | "metadata": { 662 | "scrolled": true 663 | }, 664 | "outputs": [], 665 | "source": [ 666 | "# Good/ Bad Definition\n", 667 | "loan_data['good_bad'] = np.where(loan_data['loan_status'].isin(['Charged Off', 'Default',\n", 668 | " 'Does not meet the credit policy. Status:Charged Off',\n", 669 | " 'Late (31-120 days)']), 0, 1)\n", 670 | "# We create a new variable that has the value of '0' if a condition is met, and the value of '1' if it is not met." 671 | ] 672 | }, 673 | { 674 | "cell_type": "code", 675 | "execution_count": null, 676 | "metadata": {}, 677 | "outputs": [], 678 | "source": [ 679 | "loan_data['good_bad']" 680 | ] 681 | } 682 | ], 683 | "metadata": { 684 | "kernelspec": { 685 | "display_name": "Python 3", 686 | "language": "python", 687 | "name": "python3" 688 | }, 689 | "language_info": { 690 | "codemirror_mode": { 691 | "name": "ipython", 692 | "version": 3 693 | }, 694 | "file_extension": ".py", 695 | "mimetype": "text/x-python", 696 | "name": "python", 697 | "nbconvert_exporter": "python", 698 | "pygments_lexer": "ipython3", 699 | "version": "3.7.4" 700 | } 701 | }, 702 | "nbformat": 4, 703 | "nbformat_minor": 2 704 | } 705 | -------------------------------------------------------------------------------- /Section 5 PD Model, Data Preparation/Credit Risk Modeling - Preparation - 5-2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 
| "metadata": {}, 6 | "source": [ 7 | "# Data Preparation" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Import Libraries" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "import numpy as np\n", 24 | "import pandas as pd" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "## Import Data" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "loan_data_backup = pd.read_csv('loan_data_2007_2014.csv')" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "loan_data = loan_data_backup.copy()" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "## Explore Data" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "loan_data" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "pd.options.display.max_columns = None" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "loan_data" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "loan_data.head()" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "loan_data.tail()" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "loan_data.columns.values" 111 | ] 112 | }, 113 | { 114 | 
"cell_type": "code", 115 | "execution_count": null, 116 | "metadata": { 117 | "scrolled": true 118 | }, 119 | "outputs": [], 120 | "source": [ 121 | "loan_data.info()" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "## General Preprocessing" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": {}, 134 | "source": [ 135 | "### Preprocessing few continuous variables" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "loan_data['emp_length'].unique()" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "loan_data['emp_length_int'] = loan_data['emp_length'].str.replace('\\+ years', '')\n", 154 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace('< 1 year', str(0))\n", 155 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace('n/a', str(0))\n", 156 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace(' years', '')\n", 157 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace(' year', '')" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": null, 163 | "metadata": {}, 164 | "outputs": [], 165 | "source": [ 166 | "type(loan_data['emp_length_int'][0])" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "loan_data['emp_length_int'] = pd.to_numeric(loan_data['emp_length_int'])" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": null, 181 | "metadata": {}, 182 | "outputs": [], 183 | "source": [ 184 | "type(loan_data['emp_length_int'][0])" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": { 191 | "scrolled": true 192 | }, 193 | 
"outputs": [], 194 | "source": [ 195 | "loan_data['earliest_cr_line']" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "loan_data['earliest_cr_line_date'] = pd.to_datetime(loan_data['earliest_cr_line'], format = '%b-%y')" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "metadata": {}, 211 | "outputs": [], 212 | "source": [ 213 | "type(loan_data['earliest_cr_line_date'][0])" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": null, 219 | "metadata": { 220 | "scrolled": true 221 | }, 222 | "outputs": [], 223 | "source": [ 224 | "pd.to_datetime('2017-12-01') - loan_data['earliest_cr_line_date']" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": null, 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [ 233 | "loan_data['mths_since_earliest_cr_line'] = round(pd.to_numeric((pd.to_datetime('2017-12-01') - loan_data['earliest_cr_line_date']) / np.timedelta64(1, 'M')))" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | "loan_data['mths_since_earliest_cr_line'].describe()" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": null, 248 | "metadata": { 249 | "scrolled": true 250 | }, 251 | "outputs": [], 252 | "source": [ 253 | "loan_data.loc[: , ['earliest_cr_line', 'earliest_cr_line_date', 'mths_since_earliest_cr_line']][loan_data['mths_since_earliest_cr_line'] < 0]" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": null, 259 | "metadata": {}, 260 | "outputs": [], 261 | "source": [ 262 | "loan_data['mths_since_earliest_cr_line'][loan_data['mths_since_earliest_cr_line'] < 0] = loan_data['mths_since_earliest_cr_line'].max()" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": null, 268 | "metadata": {}, 
269 | "outputs": [], 270 | "source": [ 271 | "min(loan_data['mths_since_earliest_cr_line'])" 272 | ] 273 | }, 274 | { 275 | "cell_type": "markdown", 276 | "metadata": {}, 277 | "source": [ 278 | "### Homework" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": null, 284 | "metadata": { 285 | "scrolled": true 286 | }, 287 | "outputs": [], 288 | "source": [ 289 | "loan_data['term']" 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": null, 295 | "metadata": {}, 296 | "outputs": [], 297 | "source": [ 298 | "loan_data['term'].describe()" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": null, 304 | "metadata": {}, 305 | "outputs": [], 306 | "source": [ 307 | "loan_data['term_int'] = loan_data['term'].str.replace(' months', '')" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": null, 313 | "metadata": { 314 | "scrolled": true 315 | }, 316 | "outputs": [], 317 | "source": [ 318 | "loan_data['term_int']" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": null, 324 | "metadata": {}, 325 | "outputs": [], 326 | "source": [ 327 | "type(loan_data['term_int'][25])" 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": null, 333 | "metadata": { 334 | "scrolled": true 335 | }, 336 | "outputs": [], 337 | "source": [ 338 | "loan_data['term_int'] = pd.to_numeric(loan_data['term'].str.replace(' months', ''))\n", 339 | "loan_data['term_int']" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": null, 345 | "metadata": {}, 346 | "outputs": [], 347 | "source": [ 348 | "type(loan_data['term_int'][0])" 349 | ] 350 | }, 351 | { 352 | "cell_type": "code", 353 | "execution_count": null, 354 | "metadata": { 355 | "scrolled": true 356 | }, 357 | "outputs": [], 358 | "source": [ 359 | "loan_data['issue_d']" 360 | ] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": null, 365 | "metadata": {}, 366 | 
"outputs": [], 367 | "source": [ 368 | "loan_data['issue_d_date'] = pd.to_datetime(loan_data['issue_d'], format = '%b-%y')\n", 369 | "loan_data['mths_since_issue_d'] = round(pd.to_numeric((pd.to_datetime('2017-12-01') - loan_data['issue_d_date']) / np.timedelta64(1, 'M')))\n", 370 | "loan_data['mths_since_issue_d'].describe()" 371 | ] 372 | }, 373 | { 374 | "cell_type": "markdown", 375 | "metadata": {}, 376 | "source": [ 377 | "### Preprocessing few discrete variables" 378 | ] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "execution_count": null, 383 | "metadata": {}, 384 | "outputs": [], 385 | "source": [ 386 | "loan_data.info()" 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": null, 392 | "metadata": {}, 393 | "outputs": [], 394 | "source": [ 395 | "pd.get_dummies(loan_data['grade'])" 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": null, 401 | "metadata": {}, 402 | "outputs": [], 403 | "source": [ 404 | "pd.get_dummies(loan_data['grade'], prefix = 'grade', prefix_sep = ':')" 405 | ] 406 | }, 407 | { 408 | "cell_type": "code", 409 | "execution_count": null, 410 | "metadata": {}, 411 | "outputs": [], 412 | "source": [ 413 | "loan_data_dummies = [pd.get_dummies(loan_data['grade'], prefix = 'grade', prefix_sep = ':'),\n", 414 | " pd.get_dummies(loan_data['sub_grade'], prefix = 'sub_grade', prefix_sep = ':'),\n", 415 | " pd.get_dummies(loan_data['home_ownership'], prefix = 'home_ownership', prefix_sep = ':'),\n", 416 | " pd.get_dummies(loan_data['verification_status'], prefix = 'verification_status', prefix_sep = ':'),\n", 417 | " pd.get_dummies(loan_data['loan_status'], prefix = 'loan_status', prefix_sep = ':'),\n", 418 | " pd.get_dummies(loan_data['purpose'], prefix = 'purpose', prefix_sep = ':'),\n", 419 | " pd.get_dummies(loan_data['addr_state'], prefix = 'addr_state', prefix_sep = ':'),\n", 420 | " pd.get_dummies(loan_data['initial_list_status'], prefix = 'initial_list_status', prefix_sep = ':')]" 421 | 
] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "execution_count": null, 426 | "metadata": {}, 427 | "outputs": [], 428 | "source": [ 429 | "loan_data_dummies = pd.concat(loan_data_dummies, axis = 1)" 430 | ] 431 | }, 432 | { 433 | "cell_type": "code", 434 | "execution_count": null, 435 | "metadata": {}, 436 | "outputs": [], 437 | "source": [ 438 | "type(loan_data_dummies)" 439 | ] 440 | }, 441 | { 442 | "cell_type": "code", 443 | "execution_count": null, 444 | "metadata": {}, 445 | "outputs": [], 446 | "source": [ 447 | "loan_data = pd.concat([loan_data, loan_data_dummies], axis = 1)" 448 | ] 449 | }, 450 | { 451 | "cell_type": "code", 452 | "execution_count": null, 453 | "metadata": {}, 454 | "outputs": [], 455 | "source": [ 456 | "loan_data.columns.values" 457 | ] 458 | }, 459 | { 460 | "cell_type": "markdown", 461 | "metadata": {}, 462 | "source": [ 463 | "### Check for missing values and clean" 464 | ] 465 | }, 466 | { 467 | "cell_type": "code", 468 | "execution_count": null, 469 | "metadata": { 470 | "scrolled": true 471 | }, 472 | "outputs": [], 473 | "source": [ 474 | "loan_data.isnull()" 475 | ] 476 | }, 477 | { 478 | "cell_type": "code", 479 | "execution_count": null, 480 | "metadata": {}, 481 | "outputs": [], 482 | "source": [ 483 | "pd.options.display.max_rows = None\n", 484 | "loan_data.isnull().sum()" 485 | ] 486 | }, 487 | { 488 | "cell_type": "code", 489 | "execution_count": null, 490 | "metadata": {}, 491 | "outputs": [], 492 | "source": [ 493 | "pd.options.display.max_rows = 100" 494 | ] 495 | }, 496 | { 497 | "cell_type": "code", 498 | "execution_count": null, 499 | "metadata": {}, 500 | "outputs": [], 501 | "source": [ 502 | "loan_data['total_rev_hi_lim'].fillna(loan_data['funded_amnt'], inplace = True)" 503 | ] 504 | }, 505 | { 506 | "cell_type": "code", 507 | "execution_count": null, 508 | "metadata": {}, 509 | "outputs": [], 510 | "source": [ 511 | "loan_data['total_rev_hi_lim'].isnull().sum()" 512 | ] 513 | }, 514 | { 515 | "cell_type": 
"markdown", 516 | "metadata": {}, 517 | "source": [ 518 | "### Homework" 519 | ] 520 | }, 521 | { 522 | "cell_type": "code", 523 | "execution_count": null, 524 | "metadata": {}, 525 | "outputs": [], 526 | "source": [ 527 | "loan_data['annual_inc'].fillna(loan_data['annual_inc'].mean(), inplace=True)" 528 | ] 529 | }, 530 | { 531 | "cell_type": "code", 532 | "execution_count": null, 533 | "metadata": {}, 534 | "outputs": [], 535 | "source": [ 536 | "loan_data['mths_since_earliest_cr_line'].fillna(0, inplace=True)\n", 537 | "loan_data['acc_now_delinq'].fillna(0, inplace=True)\n", 538 | "loan_data['total_acc'].fillna(0, inplace=True)\n", 539 | "loan_data['pub_rec'].fillna(0, inplace=True)\n", 540 | "loan_data['open_acc'].fillna(0, inplace=True)\n", 541 | "loan_data['inq_last_6mths'].fillna(0, inplace=True)\n", 542 | "loan_data['delinq_2yrs'].fillna(0, inplace=True)\n", 543 | "loan_data['emp_length_int'].fillna(0, inplace=True)" 544 | ] 545 | }, 546 | { 547 | "cell_type": "markdown", 548 | "metadata": {}, 549 | "source": [ 550 | "# PD model" 551 | ] 552 | }, 553 | { 554 | "cell_type": "markdown", 555 | "metadata": {}, 556 | "source": [ 557 | "## Data preparation" 558 | ] 559 | }, 560 | { 561 | "cell_type": "markdown", 562 | "metadata": {}, 563 | "source": [ 564 | "### Dependent Variable. Good/ Bad (Default) Definition. Default and Non-default Accounts." 
565 | ] 566 | }, 567 | { 568 | "cell_type": "code", 569 | "execution_count": null, 570 | "metadata": {}, 571 | "outputs": [], 572 | "source": [ 573 | "loan_data['loan_status'].unique()" 574 | ] 575 | }, 576 | { 577 | "cell_type": "code", 578 | "execution_count": null, 579 | "metadata": {}, 580 | "outputs": [], 581 | "source": [ 582 | "loan_data['loan_status'].value_counts()" 583 | ] 584 | }, 585 | { 586 | "cell_type": "code", 587 | "execution_count": null, 588 | "metadata": {}, 589 | "outputs": [], 590 | "source": [ 591 | "loan_data['loan_status'].value_counts() / loan_data['loan_status'].count()" 592 | ] 593 | }, 594 | { 595 | "cell_type": "code", 596 | "execution_count": null, 597 | "metadata": {}, 598 | "outputs": [], 599 | "source": [ 600 | "loan_data['good_bad'] = np.where(loan_data['loan_status'].isin(['Charged Off', 'Default',\n", 601 | " 'Does not meet the credit policy. Status:Charged Off',\n", 602 | " 'Late (31-120 days)']), 0, 1)" 603 | ] 604 | }, 605 | { 606 | "cell_type": "code", 607 | "execution_count": null, 608 | "metadata": { 609 | "scrolled": true 610 | }, 611 | "outputs": [], 612 | "source": [ 613 | "loan_data['good_bad']" 614 | ] 615 | } 616 | ], 617 | "metadata": { 618 | "kernelspec": { 619 | "display_name": "Python 3", 620 | "language": "python", 621 | "name": "python3" 622 | }, 623 | "language_info": { 624 | "codemirror_mode": { 625 | "name": "ipython", 626 | "version": 3 627 | }, 628 | "file_extension": ".py", 629 | "mimetype": "text/x-python", 630 | "name": "python", 631 | "nbconvert_exporter": "python", 632 | "pygments_lexer": "ipython3", 633 | "version": "3.7.3" 634 | } 635 | }, 636 | "nbformat": 4, 637 | "nbformat_minor": 2 638 | } 639 | -------------------------------------------------------------------------------- /Section 5 PD Model, Data Preparation/Credit Risk Modeling - Preparation - 5-5.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 
| "metadata": {}, 6 | "source": [ 7 | "# Data Preparation" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Import Libraries" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "import numpy as np\n", 24 | "import pandas as pd" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "## Import Data" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "loan_data_backup = pd.read_csv('loan_data_2007_2014.csv')" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "loan_data = loan_data_backup.copy()" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "## Explore Data" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "loan_data" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "pd.options.display.max_columns = None" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "loan_data" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "loan_data.head()" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "loan_data.tail()" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "loan_data.columns.values" 111 | ] 112 | }, 113 | { 114 | 
"cell_type": "code", 115 | "execution_count": null, 116 | "metadata": { 117 | "scrolled": true 118 | }, 119 | "outputs": [], 120 | "source": [ 121 | "loan_data.info()" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "## General Preprocessing" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": {}, 134 | "source": [ 135 | "### Preprocessing few continuous variables" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "loan_data['emp_length'].unique()" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "loan_data['emp_length_int'] = loan_data['emp_length'].str.replace('\\+ years', '')\n", 154 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace('< 1 year', str(0))\n", 155 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace('n/a', str(0))\n", 156 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace(' years', '')\n", 157 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace(' year', '')" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": null, 163 | "metadata": {}, 164 | "outputs": [], 165 | "source": [ 166 | "type(loan_data['emp_length_int'][0])" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "loan_data['emp_length_int'] = pd.to_numeric(loan_data['emp_length_int'])" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": null, 181 | "metadata": {}, 182 | "outputs": [], 183 | "source": [ 184 | "type(loan_data['emp_length_int'][0])" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": { 191 | "scrolled": true 192 | }, 193 | 
"outputs": [], 194 | "source": [ 195 | "loan_data['earliest_cr_line']" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "loan_data['earliest_cr_line_date'] = pd.to_datetime(loan_data['earliest_cr_line'], format = '%b-%y')" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "metadata": {}, 211 | "outputs": [], 212 | "source": [ 213 | "type(loan_data['earliest_cr_line_date'][0])" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": null, 219 | "metadata": { 220 | "scrolled": true 221 | }, 222 | "outputs": [], 223 | "source": [ 224 | "pd.to_datetime('2017-12-01') - loan_data['earliest_cr_line_date']" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": null, 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [ 233 | "loan_data['mths_since_earliest_cr_line'] = round(pd.to_numeric((pd.to_datetime('2017-12-01') - loan_data['earliest_cr_line_date']) / np.timedelta64(1, 'M')))" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | "loan_data['mths_since_earliest_cr_line'].describe()" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": null, 248 | "metadata": { 249 | "scrolled": true 250 | }, 251 | "outputs": [], 252 | "source": [ 253 | "loan_data.loc[: , ['earliest_cr_line', 'earliest_cr_line_date', 'mths_since_earliest_cr_line']][loan_data['mths_since_earliest_cr_line'] < 0]" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": null, 259 | "metadata": {}, 260 | "outputs": [], 261 | "source": [ 262 | "loan_data['mths_since_earliest_cr_line'][loan_data['mths_since_earliest_cr_line'] < 0] = loan_data['mths_since_earliest_cr_line'].max()" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": null, 268 | "metadata": {}, 
269 | "outputs": [], 270 | "source": [ 271 | "min(loan_data['mths_since_earliest_cr_line'])" 272 | ] 273 | }, 274 | { 275 | "cell_type": "markdown", 276 | "metadata": {}, 277 | "source": [ 278 | "### Homework" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": null, 284 | "metadata": { 285 | "scrolled": true 286 | }, 287 | "outputs": [], 288 | "source": [ 289 | "loan_data['term']" 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": null, 295 | "metadata": {}, 296 | "outputs": [], 297 | "source": [ 298 | "loan_data['term'].describe()" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": null, 304 | "metadata": {}, 305 | "outputs": [], 306 | "source": [ 307 | "loan_data['term_int'] = loan_data['term'].str.replace(' months', '')" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": null, 313 | "metadata": { 314 | "scrolled": true 315 | }, 316 | "outputs": [], 317 | "source": [ 318 | "loan_data['term_int']" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": null, 324 | "metadata": {}, 325 | "outputs": [], 326 | "source": [ 327 | "type(loan_data['term_int'][25])" 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": null, 333 | "metadata": { 334 | "scrolled": true 335 | }, 336 | "outputs": [], 337 | "source": [ 338 | "loan_data['term_int'] = pd.to_numeric(loan_data['term'].str.replace(' months', ''))\n", 339 | "loan_data['term_int']" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": null, 345 | "metadata": {}, 346 | "outputs": [], 347 | "source": [ 348 | "type(loan_data['term_int'][0])" 349 | ] 350 | }, 351 | { 352 | "cell_type": "code", 353 | "execution_count": null, 354 | "metadata": { 355 | "scrolled": true 356 | }, 357 | "outputs": [], 358 | "source": [ 359 | "loan_data['issue_d']" 360 | ] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": null, 365 | "metadata": {}, 366 | 
"outputs": [], 367 | "source": [ 368 | "loan_data['issue_d_date'] = pd.to_datetime(loan_data['issue_d'], format = '%b-%y')\n", 369 | "loan_data['mths_since_issue_d'] = round(pd.to_numeric((pd.to_datetime('2017-12-01') - loan_data['issue_d_date']) / np.timedelta64(1, 'M')))\n", 370 | "loan_data['mths_since_issue_d'].describe()" 371 | ] 372 | }, 373 | { 374 | "cell_type": "markdown", 375 | "metadata": {}, 376 | "source": [ 377 | "### Preprocessing few discrete variables" 378 | ] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "execution_count": null, 383 | "metadata": {}, 384 | "outputs": [], 385 | "source": [ 386 | "loan_data.info()" 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": null, 392 | "metadata": {}, 393 | "outputs": [], 394 | "source": [ 395 | "pd.get_dummies(loan_data['grade'])" 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": null, 401 | "metadata": {}, 402 | "outputs": [], 403 | "source": [ 404 | "pd.get_dummies(loan_data['grade'], prefix = 'grade', prefix_sep = ':')" 405 | ] 406 | }, 407 | { 408 | "cell_type": "code", 409 | "execution_count": null, 410 | "metadata": {}, 411 | "outputs": [], 412 | "source": [ 413 | "loan_data_dummies = [pd.get_dummies(loan_data['grade'], prefix = 'grade', prefix_sep = ':'),\n", 414 | " pd.get_dummies(loan_data['sub_grade'], prefix = 'sub_grade', prefix_sep = ':'),\n", 415 | " pd.get_dummies(loan_data['home_ownership'], prefix = 'home_ownership', prefix_sep = ':'),\n", 416 | " pd.get_dummies(loan_data['verification_status'], prefix = 'verification_status', prefix_sep = ':'),\n", 417 | " pd.get_dummies(loan_data['loan_status'], prefix = 'loan_status', prefix_sep = ':'),\n", 418 | " pd.get_dummies(loan_data['purpose'], prefix = 'purpose', prefix_sep = ':'),\n", 419 | " pd.get_dummies(loan_data['addr_state'], prefix = 'addr_state', prefix_sep = ':'),\n", 420 | " pd.get_dummies(loan_data['initial_list_status'], prefix = 'initial_list_status', prefix_sep = ':')]" 421 | 
] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "execution_count": null, 426 | "metadata": {}, 427 | "outputs": [], 428 | "source": [ 429 | "loan_data_dummies = pd.concat(loan_data_dummies, axis = 1)" 430 | ] 431 | }, 432 | { 433 | "cell_type": "code", 434 | "execution_count": null, 435 | "metadata": {}, 436 | "outputs": [], 437 | "source": [ 438 | "type(loan_data_dummies)" 439 | ] 440 | }, 441 | { 442 | "cell_type": "code", 443 | "execution_count": null, 444 | "metadata": {}, 445 | "outputs": [], 446 | "source": [ 447 | "loan_data = pd.concat([loan_data, loan_data_dummies], axis = 1)" 448 | ] 449 | }, 450 | { 451 | "cell_type": "code", 452 | "execution_count": null, 453 | "metadata": {}, 454 | "outputs": [], 455 | "source": [ 456 | "loan_data.columns.values" 457 | ] 458 | }, 459 | { 460 | "cell_type": "markdown", 461 | "metadata": {}, 462 | "source": [ 463 | "### Check for missing values and clean" 464 | ] 465 | }, 466 | { 467 | "cell_type": "code", 468 | "execution_count": null, 469 | "metadata": { 470 | "scrolled": true 471 | }, 472 | "outputs": [], 473 | "source": [ 474 | "loan_data.isnull()" 475 | ] 476 | }, 477 | { 478 | "cell_type": "code", 479 | "execution_count": null, 480 | "metadata": {}, 481 | "outputs": [], 482 | "source": [ 483 | "pd.options.display.max_rows = None\n", 484 | "loan_data.isnull().sum()" 485 | ] 486 | }, 487 | { 488 | "cell_type": "code", 489 | "execution_count": null, 490 | "metadata": {}, 491 | "outputs": [], 492 | "source": [ 493 | "pd.options.display.max_rows = 100" 494 | ] 495 | }, 496 | { 497 | "cell_type": "code", 498 | "execution_count": null, 499 | "metadata": {}, 500 | "outputs": [], 501 | "source": [ 502 | "loan_data['total_rev_hi_lim'].fillna(loan_data['funded_amnt'], inplace = True)" 503 | ] 504 | }, 505 | { 506 | "cell_type": "code", 507 | "execution_count": null, 508 | "metadata": {}, 509 | "outputs": [], 510 | "source": [ 511 | "loan_data['total_rev_hi_lim'].isnull().sum()" 512 | ] 513 | }, 514 | { 515 | "cell_type": 
"markdown", 516 | "metadata": {}, 517 | "source": [ 518 | "### Homework" 519 | ] 520 | }, 521 | { 522 | "cell_type": "code", 523 | "execution_count": null, 524 | "metadata": {}, 525 | "outputs": [], 526 | "source": [ 527 | "loan_data['annual_inc'].fillna(loan_data['annual_inc'].mean(), inplace=True)" 528 | ] 529 | }, 530 | { 531 | "cell_type": "code", 532 | "execution_count": null, 533 | "metadata": {}, 534 | "outputs": [], 535 | "source": [ 536 | "loan_data['mths_since_earliest_cr_line'].fillna(0, inplace=True)\n", 537 | "loan_data['acc_now_delinq'].fillna(0, inplace=True)\n", 538 | "loan_data['total_acc'].fillna(0, inplace=True)\n", 539 | "loan_data['pub_rec'].fillna(0, inplace=True)\n", 540 | "loan_data['open_acc'].fillna(0, inplace=True)\n", 541 | "loan_data['inq_last_6mths'].fillna(0, inplace=True)\n", 542 | "loan_data['delinq_2yrs'].fillna(0, inplace=True)\n", 543 | "loan_data['emp_length_int'].fillna(0, inplace=True)" 544 | ] 545 | }, 546 | { 547 | "cell_type": "markdown", 548 | "metadata": {}, 549 | "source": [ 550 | "# PD model" 551 | ] 552 | }, 553 | { 554 | "cell_type": "markdown", 555 | "metadata": {}, 556 | "source": [ 557 | "## Data preparation" 558 | ] 559 | }, 560 | { 561 | "cell_type": "markdown", 562 | "metadata": {}, 563 | "source": [ 564 | "### Dependent Variable. Good/ Bad (Default) Definition. Default and Non-default Accounts." 
565 | ] 566 | }, 567 | { 568 | "cell_type": "code", 569 | "execution_count": null, 570 | "metadata": {}, 571 | "outputs": [], 572 | "source": [ 573 | "loan_data['loan_status'].unique()" 574 | ] 575 | }, 576 | { 577 | "cell_type": "code", 578 | "execution_count": null, 579 | "metadata": {}, 580 | "outputs": [], 581 | "source": [ 582 | "loan_data['loan_status'].value_counts()" 583 | ] 584 | }, 585 | { 586 | "cell_type": "code", 587 | "execution_count": null, 588 | "metadata": {}, 589 | "outputs": [], 590 | "source": [ 591 | "loan_data['loan_status'].value_counts() / loan_data['loan_status'].count()" 592 | ] 593 | }, 594 | { 595 | "cell_type": "code", 596 | "execution_count": null, 597 | "metadata": {}, 598 | "outputs": [], 599 | "source": [ 600 | "loan_data['good_bad'] = np.where(loan_data['loan_status'].isin(['Charged Off', 'Default',\n", 601 | " 'Does not meet the credit policy. Status:Charged Off',\n", 602 | " 'Late (31-120 days)']), 0, 1)" 603 | ] 604 | }, 605 | { 606 | "cell_type": "code", 607 | "execution_count": null, 608 | "metadata": { 609 | "scrolled": true 610 | }, 611 | "outputs": [], 612 | "source": [ 613 | "loan_data['good_bad']" 614 | ] 615 | }, 616 | { 617 | "cell_type": "markdown", 618 | "metadata": {}, 619 | "source": [ 620 | "### Splitting Data" 621 | ] 622 | }, 623 | { 624 | "cell_type": "code", 625 | "execution_count": null, 626 | "metadata": {}, 627 | "outputs": [], 628 | "source": [ 629 | "from sklearn.model_selection import train_test_split" 630 | ] 631 | }, 632 | { 633 | "cell_type": "code", 634 | "execution_count": null, 635 | "metadata": {}, 636 | "outputs": [], 637 | "source": [ 638 | "train_test_split(loan_data.drop('good_bad', axis = 1), loan_data['good_bad'])" 639 | ] 640 | }, 641 | { 642 | "cell_type": "code", 643 | "execution_count": null, 644 | "metadata": {}, 645 | "outputs": [], 646 | "source": [ 647 | "loan_data_inputs_train, loan_data_inputs_test, loan_data_targets_train, loan_data_targets_test = 
train_test_split(loan_data.drop('good_bad', axis = 1), loan_data['good_bad'])" 648 | ] 649 | }, 650 | { 651 | "cell_type": "code", 652 | "execution_count": null, 653 | "metadata": {}, 654 | "outputs": [], 655 | "source": [ 656 | "loan_data_inputs_train.shape" 657 | ] 658 | }, 659 | { 660 | "cell_type": "code", 661 | "execution_count": null, 662 | "metadata": {}, 663 | "outputs": [], 664 | "source": [ 665 | "loan_data_targets_train.shape" 666 | ] 667 | }, 668 | { 669 | "cell_type": "code", 670 | "execution_count": null, 671 | "metadata": {}, 672 | "outputs": [], 673 | "source": [ 674 | "loan_data_inputs_test.shape" 675 | ] 676 | }, 677 | { 678 | "cell_type": "code", 679 | "execution_count": null, 680 | "metadata": {}, 681 | "outputs": [], 682 | "source": [ 683 | "loan_data_targets_test.shape" 684 | ] 685 | }, 686 | { 687 | "cell_type": "code", 688 | "execution_count": null, 689 | "metadata": {}, 690 | "outputs": [], 691 | "source": [ 692 | "loan_data_inputs_train, loan_data_inputs_test, loan_data_targets_train, loan_data_targets_test = train_test_split(loan_data.drop('good_bad', axis = 1), loan_data['good_bad'], test_size = 0.2, random_state = 42)" 693 | ] 694 | }, 695 | { 696 | "cell_type": "code", 697 | "execution_count": null, 698 | "metadata": {}, 699 | "outputs": [], 700 | "source": [ 701 | "loan_data_inputs_train.shape" 702 | ] 703 | }, 704 | { 705 | "cell_type": "code", 706 | "execution_count": null, 707 | "metadata": {}, 708 | "outputs": [], 709 | "source": [ 710 | "loan_data_targets_train.shape" 711 | ] 712 | }, 713 | { 714 | "cell_type": "code", 715 | "execution_count": null, 716 | "metadata": {}, 717 | "outputs": [], 718 | "source": [ 719 | "loan_data_inputs_test.shape" 720 | ] 721 | }, 722 | { 723 | "cell_type": "code", 724 | "execution_count": null, 725 | "metadata": {}, 726 | "outputs": [], 727 | "source": [ 728 | "loan_data_targets_test.shape" 729 | ] 730 | } 731 | ], 732 | "metadata": { 733 | "kernelspec": { 734 | "display_name": "Python 3", 735 | 
"language": "python", 736 | "name": "python3" 737 | }, 738 | "language_info": { 739 | "codemirror_mode": { 740 | "name": "ipython", 741 | "version": 3 742 | }, 743 | "file_extension": ".py", 744 | "mimetype": "text/x-python", 745 | "name": "python", 746 | "nbconvert_exporter": "python", 747 | "pygments_lexer": "ipython3", 748 | "version": "3.7.3" 749 | } 750 | }, 751 | "nbformat": 4, 752 | "nbformat_minor": 2 753 | } 754 | -------------------------------------------------------------------------------- /Section 5 PD Model, Data Preparation/Credit Risk Modeling - Preparation - 5-6.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Data Preparation" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Import Libraries" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "import numpy as np\n", 24 | "import pandas as pd" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "## Import Data" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "loan_data_backup = pd.read_csv('loan_data_2007_2014.csv')" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "loan_data = loan_data_backup.copy()" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "## Explore Data" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "loan_data" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | 
"pd.options.display.max_columns = None" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "loan_data" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "loan_data.head()" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "loan_data.tail()" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "loan_data.columns.values" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "metadata": { 117 | "scrolled": true 118 | }, 119 | "outputs": [], 120 | "source": [ 121 | "loan_data.info()" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "## General Preprocessing" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": {}, 134 | "source": [ 135 | "### Preprocessing few continuous variables" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "loan_data['emp_length'].unique()" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "loan_data['emp_length_int'] = loan_data['emp_length'].str.replace('\\+ years', '')\n", 154 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace('< 1 year', str(0))\n", 155 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace('n/a', str(0))\n", 156 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace(' years', '')\n", 157 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace(' year', '')" 158 | ] 159 | }, 160 | { 
161 | "cell_type": "code", 162 | "execution_count": null, 163 | "metadata": {}, 164 | "outputs": [], 165 | "source": [ 166 | "type(loan_data['emp_length_int'][0])" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "loan_data['emp_length_int'] = pd.to_numeric(loan_data['emp_length_int'])" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": null, 181 | "metadata": {}, 182 | "outputs": [], 183 | "source": [ 184 | "type(loan_data['emp_length_int'][0])" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": { 191 | "scrolled": true 192 | }, 193 | "outputs": [], 194 | "source": [ 195 | "loan_data['earliest_cr_line']" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "loan_data['earliest_cr_line_date'] = pd.to_datetime(loan_data['earliest_cr_line'], format = '%b-%y')" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "metadata": {}, 211 | "outputs": [], 212 | "source": [ 213 | "type(loan_data['earliest_cr_line_date'][0])" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": null, 219 | "metadata": { 220 | "scrolled": true 221 | }, 222 | "outputs": [], 223 | "source": [ 224 | "pd.to_datetime('2017-12-01') - loan_data['earliest_cr_line_date']" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": null, 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [ 233 | "loan_data['mths_since_earliest_cr_line'] = round(pd.to_numeric((pd.to_datetime('2017-12-01') - loan_data['earliest_cr_line_date']) / np.timedelta64(1, 'M')))" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | 
"loan_data['mths_since_earliest_cr_line'].describe()" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": null, 248 | "metadata": { 249 | "scrolled": true 250 | }, 251 | "outputs": [], 252 | "source": [ 253 | "loan_data.loc[: , ['earliest_cr_line', 'earliest_cr_line_date', 'mths_since_earliest_cr_line']][loan_data['mths_since_earliest_cr_line'] < 0]" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": null, 259 | "metadata": {}, 260 | "outputs": [], 261 | "source": [ 262 | "loan_data['mths_since_earliest_cr_line'][loan_data['mths_since_earliest_cr_line'] < 0] = loan_data['mths_since_earliest_cr_line'].max()" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": null, 268 | "metadata": {}, 269 | "outputs": [], 270 | "source": [ 271 | "min(loan_data['mths_since_earliest_cr_line'])" 272 | ] 273 | }, 274 | { 275 | "cell_type": "markdown", 276 | "metadata": {}, 277 | "source": [ 278 | "### Homework" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": null, 284 | "metadata": { 285 | "scrolled": true 286 | }, 287 | "outputs": [], 288 | "source": [ 289 | "loan_data['term']" 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": null, 295 | "metadata": {}, 296 | "outputs": [], 297 | "source": [ 298 | "loan_data['term'].describe()" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": null, 304 | "metadata": {}, 305 | "outputs": [], 306 | "source": [ 307 | "loan_data['term_int'] = loan_data['term'].str.replace(' months', '')" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": null, 313 | "metadata": { 314 | "scrolled": true 315 | }, 316 | "outputs": [], 317 | "source": [ 318 | "loan_data['term_int']" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": null, 324 | "metadata": {}, 325 | "outputs": [], 326 | "source": [ 327 | "type(loan_data['term_int'][25])" 328 | ] 329 | }, 330 | { 331 
| "cell_type": "code", 332 | "execution_count": null, 333 | "metadata": { 334 | "scrolled": true 335 | }, 336 | "outputs": [], 337 | "source": [ 338 | "loan_data['term_int'] = pd.to_numeric(loan_data['term'].str.replace(' months', ''))\n", 339 | "loan_data['term_int']" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": null, 345 | "metadata": {}, 346 | "outputs": [], 347 | "source": [ 348 | "type(loan_data['term_int'][0])" 349 | ] 350 | }, 351 | { 352 | "cell_type": "code", 353 | "execution_count": null, 354 | "metadata": { 355 | "scrolled": true 356 | }, 357 | "outputs": [], 358 | "source": [ 359 | "loan_data['issue_d']" 360 | ] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": null, 365 | "metadata": {}, 366 | "outputs": [], 367 | "source": [ 368 | "loan_data['issue_d_date'] = pd.to_datetime(loan_data['issue_d'], format = '%b-%y')\n", 369 | "loan_data['mths_since_issue_d'] = round(pd.to_numeric((pd.to_datetime('2017-12-01') - loan_data['issue_d_date']) / np.timedelta64(1, 'M')))\n", 370 | "loan_data['mths_since_issue_d'].describe()" 371 | ] 372 | }, 373 | { 374 | "cell_type": "markdown", 375 | "metadata": {}, 376 | "source": [ 377 | "### Preprocessing few discrete variables" 378 | ] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "execution_count": null, 383 | "metadata": {}, 384 | "outputs": [], 385 | "source": [ 386 | "loan_data.info()" 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": null, 392 | "metadata": {}, 393 | "outputs": [], 394 | "source": [ 395 | "pd.get_dummies(loan_data['grade'])" 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": null, 401 | "metadata": {}, 402 | "outputs": [], 403 | "source": [ 404 | "pd.get_dummies(loan_data['grade'], prefix = 'grade', prefix_sep = ':')" 405 | ] 406 | }, 407 | { 408 | "cell_type": "code", 409 | "execution_count": null, 410 | "metadata": {}, 411 | "outputs": [], 412 | "source": [ 413 | "loan_data_dummies = 
[pd.get_dummies(loan_data['grade'], prefix = 'grade', prefix_sep = ':'),\n", 414 | " pd.get_dummies(loan_data['sub_grade'], prefix = 'sub_grade', prefix_sep = ':'),\n", 415 | " pd.get_dummies(loan_data['home_ownership'], prefix = 'home_ownership', prefix_sep = ':'),\n", 416 | " pd.get_dummies(loan_data['verification_status'], prefix = 'verification_status', prefix_sep = ':'),\n", 417 | " pd.get_dummies(loan_data['loan_status'], prefix = 'loan_status', prefix_sep = ':'),\n", 418 | " pd.get_dummies(loan_data['purpose'], prefix = 'purpose', prefix_sep = ':'),\n", 419 | " pd.get_dummies(loan_data['addr_state'], prefix = 'addr_state', prefix_sep = ':'),\n", 420 | " pd.get_dummies(loan_data['initial_list_status'], prefix = 'initial_list_status', prefix_sep = ':')]" 421 | ] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "execution_count": null, 426 | "metadata": {}, 427 | "outputs": [], 428 | "source": [ 429 | "loan_data_dummies = pd.concat(loan_data_dummies, axis = 1)" 430 | ] 431 | }, 432 | { 433 | "cell_type": "code", 434 | "execution_count": null, 435 | "metadata": {}, 436 | "outputs": [], 437 | "source": [ 438 | "type(loan_data_dummies)" 439 | ] 440 | }, 441 | { 442 | "cell_type": "code", 443 | "execution_count": null, 444 | "metadata": {}, 445 | "outputs": [], 446 | "source": [ 447 | "loan_data = pd.concat([loan_data, loan_data_dummies], axis = 1)" 448 | ] 449 | }, 450 | { 451 | "cell_type": "code", 452 | "execution_count": null, 453 | "metadata": {}, 454 | "outputs": [], 455 | "source": [ 456 | "loan_data.columns.values" 457 | ] 458 | }, 459 | { 460 | "cell_type": "markdown", 461 | "metadata": {}, 462 | "source": [ 463 | "### Check for missing values and clean" 464 | ] 465 | }, 466 | { 467 | "cell_type": "code", 468 | "execution_count": null, 469 | "metadata": { 470 | "scrolled": true 471 | }, 472 | "outputs": [], 473 | "source": [ 474 | "loan_data.isnull()" 475 | ] 476 | }, 477 | { 478 | "cell_type": "code", 479 | "execution_count": null, 480 | "metadata": {}, 
481 | "outputs": [], 482 | "source": [ 483 | "pd.options.display.max_rows = None\n", 484 | "loan_data.isnull().sum()" 485 | ] 486 | }, 487 | { 488 | "cell_type": "code", 489 | "execution_count": null, 490 | "metadata": {}, 491 | "outputs": [], 492 | "source": [ 493 | "pd.options.display.max_rows = 100" 494 | ] 495 | }, 496 | { 497 | "cell_type": "code", 498 | "execution_count": null, 499 | "metadata": {}, 500 | "outputs": [], 501 | "source": [ 502 | "loan_data['total_rev_hi_lim'].fillna(loan_data['funded_amnt'], inplace = True)" 503 | ] 504 | }, 505 | { 506 | "cell_type": "code", 507 | "execution_count": null, 508 | "metadata": {}, 509 | "outputs": [], 510 | "source": [ 511 | "loan_data['total_rev_hi_lim'].isnull().sum()" 512 | ] 513 | }, 514 | { 515 | "cell_type": "markdown", 516 | "metadata": {}, 517 | "source": [ 518 | "### Homework" 519 | ] 520 | }, 521 | { 522 | "cell_type": "code", 523 | "execution_count": null, 524 | "metadata": {}, 525 | "outputs": [], 526 | "source": [ 527 | "loan_data['annual_inc'].fillna(loan_data['annual_inc'].mean(), inplace=True)" 528 | ] 529 | }, 530 | { 531 | "cell_type": "code", 532 | "execution_count": null, 533 | "metadata": {}, 534 | "outputs": [], 535 | "source": [ 536 | "loan_data['mths_since_earliest_cr_line'].fillna(0, inplace=True)\n", 537 | "loan_data['acc_now_delinq'].fillna(0, inplace=True)\n", 538 | "loan_data['total_acc'].fillna(0, inplace=True)\n", 539 | "loan_data['pub_rec'].fillna(0, inplace=True)\n", 540 | "loan_data['open_acc'].fillna(0, inplace=True)\n", 541 | "loan_data['inq_last_6mths'].fillna(0, inplace=True)\n", 542 | "loan_data['delinq_2yrs'].fillna(0, inplace=True)\n", 543 | "loan_data['emp_length_int'].fillna(0, inplace=True)" 544 | ] 545 | }, 546 | { 547 | "cell_type": "markdown", 548 | "metadata": {}, 549 | "source": [ 550 | "# PD model" 551 | ] 552 | }, 553 | { 554 | "cell_type": "markdown", 555 | "metadata": {}, 556 | "source": [ 557 | "## Data preparation" 558 | ] 559 | }, 560 | { 561 | "cell_type": 
"markdown", 562 | "metadata": {}, 563 | "source": [ 564 | "### Dependent Variable. Good/ Bad (Default) Definition. Default and Non-default Accounts." 565 | ] 566 | }, 567 | { 568 | "cell_type": "code", 569 | "execution_count": null, 570 | "metadata": {}, 571 | "outputs": [], 572 | "source": [ 573 | "loan_data['loan_status'].unique()" 574 | ] 575 | }, 576 | { 577 | "cell_type": "code", 578 | "execution_count": null, 579 | "metadata": {}, 580 | "outputs": [], 581 | "source": [ 582 | "loan_data['loan_status'].value_counts()" 583 | ] 584 | }, 585 | { 586 | "cell_type": "code", 587 | "execution_count": null, 588 | "metadata": {}, 589 | "outputs": [], 590 | "source": [ 591 | "loan_data['loan_status'].value_counts() / loan_data['loan_status'].count()" 592 | ] 593 | }, 594 | { 595 | "cell_type": "code", 596 | "execution_count": null, 597 | "metadata": {}, 598 | "outputs": [], 599 | "source": [ 600 | "loan_data['good_bad'] = np.where(loan_data['loan_status'].isin(['Charged Off', 'Default',\n", 601 | " 'Does not meet the credit policy. 
Status:Charged Off',\n", 602 | " 'Late (31-120 days)']), 0, 1)" 603 | ] 604 | }, 605 | { 606 | "cell_type": "code", 607 | "execution_count": null, 608 | "metadata": { 609 | "scrolled": true 610 | }, 611 | "outputs": [], 612 | "source": [ 613 | "loan_data['good_bad']" 614 | ] 615 | }, 616 | { 617 | "cell_type": "markdown", 618 | "metadata": {}, 619 | "source": [ 620 | "### Splitting Data" 621 | ] 622 | }, 623 | { 624 | "cell_type": "code", 625 | "execution_count": null, 626 | "metadata": {}, 627 | "outputs": [], 628 | "source": [ 629 | "from sklearn.model_selection import train_test_split" 630 | ] 631 | }, 632 | { 633 | "cell_type": "code", 634 | "execution_count": null, 635 | "metadata": {}, 636 | "outputs": [], 637 | "source": [ 638 | "train_test_split(loan_data.drop('good_bad', axis = 1), loan_data['good_bad'])" 639 | ] 640 | }, 641 | { 642 | "cell_type": "code", 643 | "execution_count": null, 644 | "metadata": {}, 645 | "outputs": [], 646 | "source": [ 647 | "loan_data_inputs_train, loan_data_inputs_test, loan_data_targets_train, loan_data_targets_test = train_test_split(loan_data.drop('good_bad', axis = 1), loan_data['good_bad'])" 648 | ] 649 | }, 650 | { 651 | "cell_type": "code", 652 | "execution_count": null, 653 | "metadata": {}, 654 | "outputs": [], 655 | "source": [ 656 | "loan_data_inputs_train.shape" 657 | ] 658 | }, 659 | { 660 | "cell_type": "code", 661 | "execution_count": null, 662 | "metadata": {}, 663 | "outputs": [], 664 | "source": [ 665 | "loan_data_targets_train.shape" 666 | ] 667 | }, 668 | { 669 | "cell_type": "code", 670 | "execution_count": null, 671 | "metadata": {}, 672 | "outputs": [], 673 | "source": [ 674 | "loan_data_inputs_test.shape" 675 | ] 676 | }, 677 | { 678 | "cell_type": "code", 679 | "execution_count": null, 680 | "metadata": {}, 681 | "outputs": [], 682 | "source": [ 683 | "loan_data_targets_test.shape" 684 | ] 685 | }, 686 | { 687 | "cell_type": "code", 688 | "execution_count": null, 689 | "metadata": {}, 690 | "outputs": 
[], 691 | "source": [ 692 | "loan_data_inputs_train, loan_data_inputs_test, loan_data_targets_train, loan_data_targets_test = train_test_split(loan_data.drop('good_bad', axis = 1), loan_data['good_bad'], test_size = 0.2, random_state = 42)" 693 | ] 694 | }, 695 | { 696 | "cell_type": "code", 697 | "execution_count": null, 698 | "metadata": {}, 699 | "outputs": [], 700 | "source": [ 701 | "loan_data_inputs_train.shape" 702 | ] 703 | }, 704 | { 705 | "cell_type": "code", 706 | "execution_count": null, 707 | "metadata": {}, 708 | "outputs": [], 709 | "source": [ 710 | "loan_data_targets_train.shape" 711 | ] 712 | }, 713 | { 714 | "cell_type": "code", 715 | "execution_count": null, 716 | "metadata": {}, 717 | "outputs": [], 718 | "source": [ 719 | "loan_data_inputs_test.shape" 720 | ] 721 | }, 722 | { 723 | "cell_type": "code", 724 | "execution_count": null, 725 | "metadata": {}, 726 | "outputs": [], 727 | "source": [ 728 | "loan_data_targets_test.shape" 729 | ] 730 | }, 731 | { 732 | "cell_type": "markdown", 733 | "metadata": {}, 734 | "source": [ 735 | "### Data Preparation: An Example" 736 | ] 737 | }, 738 | { 739 | "cell_type": "code", 740 | "execution_count": null, 741 | "metadata": {}, 742 | "outputs": [], 743 | "source": [ 744 | "df_inputs_prepr = loan_data_inputs_train\n", 745 | "df_targets_prepr = loan_data_targets_train\n", 746 | "#df_inputs_prepr = loan_data_inputs_test\n", 747 | "#df_targets_prepr = loan_data_targets_test" 748 | ] 749 | }, 750 | { 751 | "cell_type": "code", 752 | "execution_count": null, 753 | "metadata": {}, 754 | "outputs": [], 755 | "source": [ 756 | "df_inputs_prepr['grade'].unique()" 757 | ] 758 | }, 759 | { 760 | "cell_type": "code", 761 | "execution_count": null, 762 | "metadata": {}, 763 | "outputs": [], 764 | "source": [ 765 | "df1 = pd.concat([df_inputs_prepr['grade'], df_targets_prepr], axis = 1)\n", 766 | "df1.head()" 767 | ] 768 | }, 769 | { 770 | "cell_type": "code", 771 | "execution_count": null, 772 | "metadata": {}, 773 | 
"outputs": [], 774 | "source": [ 775 | "df1.groupby(df1.columns.values[0], as_index = False)[df1.columns.values[1]].count()" 776 | ] 777 | }, 778 | { 779 | "cell_type": "code", 780 | "execution_count": null, 781 | "metadata": {}, 782 | "outputs": [], 783 | "source": [ 784 | "df1.groupby(df1.columns.values[0], as_index = False)[df1.columns.values[1]].mean()" 785 | ] 786 | }, 787 | { 788 | "cell_type": "code", 789 | "execution_count": null, 790 | "metadata": {}, 791 | "outputs": [], 792 | "source": [ 793 | "df1 = pd.concat([df1.groupby(df1.columns.values[0], as_index = False)[df1.columns.values[1]].count(),\n", 794 | " df1.groupby(df1.columns.values[0], as_index = False)[df1.columns.values[1]].mean()], axis = 1)" 795 | ] 796 | }, 797 | { 798 | "cell_type": "code", 799 | "execution_count": null, 800 | "metadata": {}, 801 | "outputs": [], 802 | "source": [ 803 | "df1" 804 | ] 805 | }, 806 | { 807 | "cell_type": "code", 808 | "execution_count": null, 809 | "metadata": {}, 810 | "outputs": [], 811 | "source": [ 812 | "df1 = df1.iloc[: , [0, 1, 3]]\n", 813 | "df1" 814 | ] 815 | }, 816 | { 817 | "cell_type": "code", 818 | "execution_count": null, 819 | "metadata": {}, 820 | "outputs": [], 821 | "source": [ 822 | "df1.columns = [df1.columns.values[0], 'n_obs', 'prop_good']\n", 823 | "df1" 824 | ] 825 | }, 826 | { 827 | "cell_type": "code", 828 | "execution_count": null, 829 | "metadata": {}, 830 | "outputs": [], 831 | "source": [ 832 | "df1['prop_n_obs'] = df1['n_obs'] / df1['n_obs'].sum()" 833 | ] 834 | }, 835 | { 836 | "cell_type": "code", 837 | "execution_count": null, 838 | "metadata": {}, 839 | "outputs": [], 840 | "source": [ 841 | "df1" 842 | ] 843 | }, 844 | { 845 | "cell_type": "code", 846 | "execution_count": null, 847 | "metadata": {}, 848 | "outputs": [], 849 | "source": [ 850 | "df1['n_good'] = df1['prop_good'] * df1['n_obs']\n", 851 | "df1['n_bad'] = (1 - df1['prop_good']) * df1['n_obs']\n", 852 | "df1" 853 | ] 854 | }, 855 | { 856 | "cell_type": "code", 857 | 
"execution_count": null, 858 | "metadata": {}, 859 | "outputs": [], 860 | "source": [ 861 | "df1['prop_n_good'] = df1['n_good'] / df1['n_good'].sum()\n", 862 | "df1['prop_n_bad'] = df1['n_bad'] / df1['n_bad'].sum()\n", 863 | "df1" 864 | ] 865 | }, 866 | { 867 | "cell_type": "code", 868 | "execution_count": null, 869 | "metadata": {}, 870 | "outputs": [], 871 | "source": [ 872 | "df1['WoE'] = np.log(df1['prop_n_good'] / df1['prop_n_bad'])\n", 873 | "df1" 874 | ] 875 | }, 876 | { 877 | "cell_type": "code", 878 | "execution_count": null, 879 | "metadata": {}, 880 | "outputs": [], 881 | "source": [ 882 | "df1 = df1.sort_values(['WoE'])\n", 883 | "df1 = df1.reset_index(drop = True)\n", 884 | "df1" 885 | ] 886 | }, 887 | { 888 | "cell_type": "code", 889 | "execution_count": null, 890 | "metadata": {}, 891 | "outputs": [], 892 | "source": [ 893 | "df1['diff_prop_good'] = df1['prop_good'].diff().abs()\n", 894 | "df1['diff_WoE'] = df1['WoE'].diff().abs()\n", 895 | "df1" 896 | ] 897 | }, 898 | { 899 | "cell_type": "code", 900 | "execution_count": null, 901 | "metadata": {}, 902 | "outputs": [], 903 | "source": [ 904 | "df1['IV'] = (df1['prop_n_good'] - df1['prop_n_bad']) * df1['WoE']\n", 905 | "df1['IV'] = df1['IV'].sum()\n", 906 | "df1" 907 | ] 908 | } 909 | ], 910 | "metadata": { 911 | "kernelspec": { 912 | "display_name": "Python 3", 913 | "language": "python", 914 | "name": "python3" 915 | }, 916 | "language_info": { 917 | "codemirror_mode": { 918 | "name": "ipython", 919 | "version": 3 920 | }, 921 | "file_extension": ".py", 922 | "mimetype": "text/x-python", 923 | "name": "python", 924 | "nbconvert_exporter": "python", 925 | "pygments_lexer": "ipython3", 926 | "version": "3.7.3" 927 | } 928 | }, 929 | "nbformat": 4, 930 | "nbformat_minor": 2 931 | } 932 | -------------------------------------------------------------------------------- /Section 5 PD Model, Data Preparation/Credit Risk Modeling - Preparation - 5-7.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Data Preparation" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Import Libraries" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "import numpy as np\n", 24 | "import pandas as pd" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "## Import Data" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "loan_data_backup = pd.read_csv('loan_data_2007_2014.csv')" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "loan_data = loan_data_backup.copy()" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "## Explore Data" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "loan_data" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "pd.options.display.max_columns = None" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "loan_data" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "loan_data.head()" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "loan_data.tail()" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 
null, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "loan_data.columns.values" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "metadata": { 117 | "scrolled": true 118 | }, 119 | "outputs": [], 120 | "source": [ 121 | "loan_data.info()" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "## General Preprocessing" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": {}, 134 | "source": [ 135 | "### Preprocessing few continuous variables" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "loan_data['emp_length'].unique()" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "loan_data['emp_length_int'] = loan_data['emp_length'].str.replace('\\+ years', '')\n", 154 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace('< 1 year', str(0))\n", 155 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace('n/a', str(0))\n", 156 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace(' years', '')\n", 157 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace(' year', '')" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": null, 163 | "metadata": {}, 164 | "outputs": [], 165 | "source": [ 166 | "type(loan_data['emp_length_int'][0])" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "loan_data['emp_length_int'] = pd.to_numeric(loan_data['emp_length_int'])" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": null, 181 | "metadata": {}, 182 | "outputs": [], 183 | "source": [ 184 | "type(loan_data['emp_length_int'][0])" 185 | ] 186 | 
}, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": { 191 | "scrolled": true 192 | }, 193 | "outputs": [], 194 | "source": [ 195 | "loan_data['earliest_cr_line']" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "loan_data['earliest_cr_line_date'] = pd.to_datetime(loan_data['earliest_cr_line'], format = '%b-%y')" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "metadata": {}, 211 | "outputs": [], 212 | "source": [ 213 | "type(loan_data['earliest_cr_line_date'][0])" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": null, 219 | "metadata": { 220 | "scrolled": true 221 | }, 222 | "outputs": [], 223 | "source": [ 224 | "pd.to_datetime('2017-12-01') - loan_data['earliest_cr_line_date']" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": null, 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [ 233 | "loan_data['mths_since_earliest_cr_line'] = round(pd.to_numeric((pd.to_datetime('2017-12-01') - loan_data['earliest_cr_line_date']) / np.timedelta64(1, 'M')))" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | "loan_data['mths_since_earliest_cr_line'].describe()" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": null, 248 | "metadata": { 249 | "scrolled": true 250 | }, 251 | "outputs": [], 252 | "source": [ 253 | "loan_data.loc[: , ['earliest_cr_line', 'earliest_cr_line_date', 'mths_since_earliest_cr_line']][loan_data['mths_since_earliest_cr_line'] < 0]" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": null, 259 | "metadata": {}, 260 | "outputs": [], 261 | "source": [ 262 | "loan_data['mths_since_earliest_cr_line'][loan_data['mths_since_earliest_cr_line'] < 0] = 
loan_data['mths_since_earliest_cr_line'].max()" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": null, 268 | "metadata": {}, 269 | "outputs": [], 270 | "source": [ 271 | "min(loan_data['mths_since_earliest_cr_line'])" 272 | ] 273 | }, 274 | { 275 | "cell_type": "markdown", 276 | "metadata": {}, 277 | "source": [ 278 | "### Homework" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": null, 284 | "metadata": { 285 | "scrolled": true 286 | }, 287 | "outputs": [], 288 | "source": [ 289 | "loan_data['term']" 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": null, 295 | "metadata": {}, 296 | "outputs": [], 297 | "source": [ 298 | "loan_data['term'].describe()" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": null, 304 | "metadata": {}, 305 | "outputs": [], 306 | "source": [ 307 | "loan_data['term_int'] = loan_data['term'].str.replace(' months', '')" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": null, 313 | "metadata": { 314 | "scrolled": true 315 | }, 316 | "outputs": [], 317 | "source": [ 318 | "loan_data['term_int']" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": null, 324 | "metadata": {}, 325 | "outputs": [], 326 | "source": [ 327 | "type(loan_data['term_int'][25])" 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": null, 333 | "metadata": { 334 | "scrolled": true 335 | }, 336 | "outputs": [], 337 | "source": [ 338 | "loan_data['term_int'] = pd.to_numeric(loan_data['term'].str.replace(' months', ''))\n", 339 | "loan_data['term_int']" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": null, 345 | "metadata": {}, 346 | "outputs": [], 347 | "source": [ 348 | "type(loan_data['term_int'][0])" 349 | ] 350 | }, 351 | { 352 | "cell_type": "code", 353 | "execution_count": null, 354 | "metadata": { 355 | "scrolled": true 356 | }, 357 | "outputs": [], 358 | 
"source": [ 359 | "loan_data['issue_d']" 360 | ] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": null, 365 | "metadata": {}, 366 | "outputs": [], 367 | "source": [ 368 | "loan_data['issue_d_date'] = pd.to_datetime(loan_data['issue_d'], format = '%b-%y')\n", 369 | "loan_data['mths_since_issue_d'] = round(pd.to_numeric((pd.to_datetime('2017-12-01') - loan_data['issue_d_date']) / np.timedelta64(1, 'M')))\n", 370 | "loan_data['mths_since_issue_d'].describe()" 371 | ] 372 | }, 373 | { 374 | "cell_type": "markdown", 375 | "metadata": {}, 376 | "source": [ 377 | "### Preprocessing few discrete variables" 378 | ] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "execution_count": null, 383 | "metadata": {}, 384 | "outputs": [], 385 | "source": [ 386 | "loan_data.info()" 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": null, 392 | "metadata": {}, 393 | "outputs": [], 394 | "source": [ 395 | "pd.get_dummies(loan_data['grade'])" 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": null, 401 | "metadata": {}, 402 | "outputs": [], 403 | "source": [ 404 | "pd.get_dummies(loan_data['grade'], prefix = 'grade', prefix_sep = ':')" 405 | ] 406 | }, 407 | { 408 | "cell_type": "code", 409 | "execution_count": null, 410 | "metadata": {}, 411 | "outputs": [], 412 | "source": [ 413 | "loan_data_dummies = [pd.get_dummies(loan_data['grade'], prefix = 'grade', prefix_sep = ':'),\n", 414 | " pd.get_dummies(loan_data['sub_grade'], prefix = 'sub_grade', prefix_sep = ':'),\n", 415 | " pd.get_dummies(loan_data['home_ownership'], prefix = 'home_ownership', prefix_sep = ':'),\n", 416 | " pd.get_dummies(loan_data['verification_status'], prefix = 'verification_status', prefix_sep = ':'),\n", 417 | " pd.get_dummies(loan_data['loan_status'], prefix = 'loan_status', prefix_sep = ':'),\n", 418 | " pd.get_dummies(loan_data['purpose'], prefix = 'purpose', prefix_sep = ':'),\n", 419 | " pd.get_dummies(loan_data['addr_state'], prefix = 
'addr_state', prefix_sep = ':'),\n", 420 | " pd.get_dummies(loan_data['initial_list_status'], prefix = 'initial_list_status', prefix_sep = ':')]" 421 | ] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "execution_count": null, 426 | "metadata": {}, 427 | "outputs": [], 428 | "source": [ 429 | "loan_data_dummies = pd.concat(loan_data_dummies, axis = 1)" 430 | ] 431 | }, 432 | { 433 | "cell_type": "code", 434 | "execution_count": null, 435 | "metadata": {}, 436 | "outputs": [], 437 | "source": [ 438 | "type(loan_data_dummies)" 439 | ] 440 | }, 441 | { 442 | "cell_type": "code", 443 | "execution_count": null, 444 | "metadata": {}, 445 | "outputs": [], 446 | "source": [ 447 | "loan_data = pd.concat([loan_data, loan_data_dummies], axis = 1)" 448 | ] 449 | }, 450 | { 451 | "cell_type": "code", 452 | "execution_count": null, 453 | "metadata": {}, 454 | "outputs": [], 455 | "source": [ 456 | "loan_data.columns.values" 457 | ] 458 | }, 459 | { 460 | "cell_type": "markdown", 461 | "metadata": {}, 462 | "source": [ 463 | "### Check for missing values and clean" 464 | ] 465 | }, 466 | { 467 | "cell_type": "code", 468 | "execution_count": null, 469 | "metadata": { 470 | "scrolled": true 471 | }, 472 | "outputs": [], 473 | "source": [ 474 | "loan_data.isnull()" 475 | ] 476 | }, 477 | { 478 | "cell_type": "code", 479 | "execution_count": null, 480 | "metadata": {}, 481 | "outputs": [], 482 | "source": [ 483 | "pd.options.display.max_rows = None\n", 484 | "loan_data.isnull().sum()" 485 | ] 486 | }, 487 | { 488 | "cell_type": "code", 489 | "execution_count": null, 490 | "metadata": {}, 491 | "outputs": [], 492 | "source": [ 493 | "pd.options.display.max_rows = 100" 494 | ] 495 | }, 496 | { 497 | "cell_type": "code", 498 | "execution_count": null, 499 | "metadata": {}, 500 | "outputs": [], 501 | "source": [ 502 | "loan_data['total_rev_hi_lim'].fillna(loan_data['funded_amnt'], inplace = True)" 503 | ] 504 | }, 505 | { 506 | "cell_type": "code", 507 | "execution_count": null, 508 | 
"metadata": {}, 509 | "outputs": [], 510 | "source": [ 511 | "loan_data['total_rev_hi_lim'].isnull().sum()" 512 | ] 513 | }, 514 | { 515 | "cell_type": "markdown", 516 | "metadata": {}, 517 | "source": [ 518 | "### Homework" 519 | ] 520 | }, 521 | { 522 | "cell_type": "code", 523 | "execution_count": null, 524 | "metadata": {}, 525 | "outputs": [], 526 | "source": [ 527 | "loan_data['annual_inc'].fillna(loan_data['annual_inc'].mean(), inplace=True)" 528 | ] 529 | }, 530 | { 531 | "cell_type": "code", 532 | "execution_count": null, 533 | "metadata": {}, 534 | "outputs": [], 535 | "source": [ 536 | "loan_data['mths_since_earliest_cr_line'].fillna(0, inplace=True)\n", 537 | "loan_data['acc_now_delinq'].fillna(0, inplace=True)\n", 538 | "loan_data['total_acc'].fillna(0, inplace=True)\n", 539 | "loan_data['pub_rec'].fillna(0, inplace=True)\n", 540 | "loan_data['open_acc'].fillna(0, inplace=True)\n", 541 | "loan_data['inq_last_6mths'].fillna(0, inplace=True)\n", 542 | "loan_data['delinq_2yrs'].fillna(0, inplace=True)\n", 543 | "loan_data['emp_length_int'].fillna(0, inplace=True)" 544 | ] 545 | }, 546 | { 547 | "cell_type": "markdown", 548 | "metadata": {}, 549 | "source": [ 550 | "# PD model" 551 | ] 552 | }, 553 | { 554 | "cell_type": "markdown", 555 | "metadata": {}, 556 | "source": [ 557 | "## Data preparation" 558 | ] 559 | }, 560 | { 561 | "cell_type": "markdown", 562 | "metadata": {}, 563 | "source": [ 564 | "### Dependent Variable. Good/ Bad (Default) Definition. Default and Non-default Accounts." 
565 | ] 566 | }, 567 | { 568 | "cell_type": "code", 569 | "execution_count": null, 570 | "metadata": {}, 571 | "outputs": [], 572 | "source": [ 573 | "loan_data['loan_status'].unique()" 574 | ] 575 | }, 576 | { 577 | "cell_type": "code", 578 | "execution_count": null, 579 | "metadata": {}, 580 | "outputs": [], 581 | "source": [ 582 | "loan_data['loan_status'].value_counts()" 583 | ] 584 | }, 585 | { 586 | "cell_type": "code", 587 | "execution_count": null, 588 | "metadata": {}, 589 | "outputs": [], 590 | "source": [ 591 | "loan_data['loan_status'].value_counts() / loan_data['loan_status'].count()" 592 | ] 593 | }, 594 | { 595 | "cell_type": "code", 596 | "execution_count": null, 597 | "metadata": {}, 598 | "outputs": [], 599 | "source": [ 600 | "loan_data['good_bad'] = np.where(loan_data['loan_status'].isin(['Charged Off', 'Default',\n", 601 | " 'Does not meet the credit policy. Status:Charged Off',\n", 602 | " 'Late (31-120 days)']), 0, 1)" 603 | ] 604 | }, 605 | { 606 | "cell_type": "code", 607 | "execution_count": null, 608 | "metadata": { 609 | "scrolled": true 610 | }, 611 | "outputs": [], 612 | "source": [ 613 | "loan_data['good_bad']" 614 | ] 615 | }, 616 | { 617 | "cell_type": "markdown", 618 | "metadata": {}, 619 | "source": [ 620 | "### Splitting Data" 621 | ] 622 | }, 623 | { 624 | "cell_type": "code", 625 | "execution_count": null, 626 | "metadata": {}, 627 | "outputs": [], 628 | "source": [ 629 | "from sklearn.model_selection import train_test_split" 630 | ] 631 | }, 632 | { 633 | "cell_type": "code", 634 | "execution_count": null, 635 | "metadata": {}, 636 | "outputs": [], 637 | "source": [ 638 | "train_test_split(loan_data.drop('good_bad', axis = 1), loan_data['good_bad'])" 639 | ] 640 | }, 641 | { 642 | "cell_type": "code", 643 | "execution_count": null, 644 | "metadata": {}, 645 | "outputs": [], 646 | "source": [ 647 | "loan_data_inputs_train, loan_data_inputs_test, loan_data_targets_train, loan_data_targets_test = 
train_test_split(loan_data.drop('good_bad', axis = 1), loan_data['good_bad'])" 648 | ] 649 | }, 650 | { 651 | "cell_type": "code", 652 | "execution_count": null, 653 | "metadata": {}, 654 | "outputs": [], 655 | "source": [ 656 | "loan_data_inputs_train.shape" 657 | ] 658 | }, 659 | { 660 | "cell_type": "code", 661 | "execution_count": null, 662 | "metadata": {}, 663 | "outputs": [], 664 | "source": [ 665 | "loan_data_targets_train.shape" 666 | ] 667 | }, 668 | { 669 | "cell_type": "code", 670 | "execution_count": null, 671 | "metadata": {}, 672 | "outputs": [], 673 | "source": [ 674 | "loan_data_inputs_test.shape" 675 | ] 676 | }, 677 | { 678 | "cell_type": "code", 679 | "execution_count": null, 680 | "metadata": {}, 681 | "outputs": [], 682 | "source": [ 683 | "loan_data_targets_test.shape" 684 | ] 685 | }, 686 | { 687 | "cell_type": "code", 688 | "execution_count": null, 689 | "metadata": {}, 690 | "outputs": [], 691 | "source": [ 692 | "loan_data_inputs_train, loan_data_inputs_test, loan_data_targets_train, loan_data_targets_test = train_test_split(loan_data.drop('good_bad', axis = 1), loan_data['good_bad'], test_size = 0.2, random_state = 42)" 693 | ] 694 | }, 695 | { 696 | "cell_type": "code", 697 | "execution_count": null, 698 | "metadata": {}, 699 | "outputs": [], 700 | "source": [ 701 | "loan_data_inputs_train.shape" 702 | ] 703 | }, 704 | { 705 | "cell_type": "code", 706 | "execution_count": null, 707 | "metadata": {}, 708 | "outputs": [], 709 | "source": [ 710 | "loan_data_targets_train.shape" 711 | ] 712 | }, 713 | { 714 | "cell_type": "code", 715 | "execution_count": null, 716 | "metadata": {}, 717 | "outputs": [], 718 | "source": [ 719 | "loan_data_inputs_test.shape" 720 | ] 721 | }, 722 | { 723 | "cell_type": "code", 724 | "execution_count": null, 725 | "metadata": {}, 726 | "outputs": [], 727 | "source": [ 728 | "loan_data_targets_test.shape" 729 | ] 730 | }, 731 | { 732 | "cell_type": "markdown", 733 | "metadata": {}, 734 | "source": [ 735 | "### 
Data Preparation: An Example" 736 | ] 737 | }, 738 | { 739 | "cell_type": "code", 740 | "execution_count": null, 741 | "metadata": {}, 742 | "outputs": [], 743 | "source": [ 744 | "df_inputs_prepr = loan_data_inputs_train\n", 745 | "df_targets_prepr = loan_data_targets_train\n", 746 | "#df_inputs_prepr = loan_data_inputs_test\n", 747 | "#df_targets_prepr = loan_data_targets_test" 748 | ] 749 | }, 750 | { 751 | "cell_type": "code", 752 | "execution_count": null, 753 | "metadata": {}, 754 | "outputs": [], 755 | "source": [ 756 | "df_inputs_prepr['grade'].unique()" 757 | ] 758 | }, 759 | { 760 | "cell_type": "code", 761 | "execution_count": null, 762 | "metadata": {}, 763 | "outputs": [], 764 | "source": [ 765 | "df1 = pd.concat([df_inputs_prepr['grade'], df_targets_prepr], axis = 1)\n", 766 | "df1.head()" 767 | ] 768 | }, 769 | { 770 | "cell_type": "code", 771 | "execution_count": null, 772 | "metadata": {}, 773 | "outputs": [], 774 | "source": [ 775 | "df1.groupby(df1.columns.values[0], as_index = False)[df1.columns.values[1]].count()" 776 | ] 777 | }, 778 | { 779 | "cell_type": "code", 780 | "execution_count": null, 781 | "metadata": {}, 782 | "outputs": [], 783 | "source": [ 784 | "df1.groupby(df1.columns.values[0], as_index = False)[df1.columns.values[1]].mean()" 785 | ] 786 | }, 787 | { 788 | "cell_type": "code", 789 | "execution_count": null, 790 | "metadata": {}, 791 | "outputs": [], 792 | "source": [ 793 | "df1 = pd.concat([df1.groupby(df1.columns.values[0], as_index = False)[df1.columns.values[1]].count(),\n", 794 | " df1.groupby(df1.columns.values[0], as_index = False)[df1.columns.values[1]].mean()], axis = 1)" 795 | ] 796 | }, 797 | { 798 | "cell_type": "code", 799 | "execution_count": null, 800 | "metadata": {}, 801 | "outputs": [], 802 | "source": [ 803 | "df1" 804 | ] 805 | }, 806 | { 807 | "cell_type": "code", 808 | "execution_count": null, 809 | "metadata": {}, 810 | "outputs": [], 811 | "source": [ 812 | "df1 = df1.iloc[: , [0, 1, 3]]\n", 813 | "df1" 
814 | ] 815 | }, 816 | { 817 | "cell_type": "code", 818 | "execution_count": null, 819 | "metadata": {}, 820 | "outputs": [], 821 | "source": [ 822 | "df1.columns = [df1.columns.values[0], 'n_obs', 'prop_good']\n", 823 | "df1" 824 | ] 825 | }, 826 | { 827 | "cell_type": "code", 828 | "execution_count": null, 829 | "metadata": {}, 830 | "outputs": [], 831 | "source": [ 832 | "df1['prop_n_obs'] = df1['n_obs'] / df1['n_obs'].sum()" 833 | ] 834 | }, 835 | { 836 | "cell_type": "code", 837 | "execution_count": null, 838 | "metadata": {}, 839 | "outputs": [], 840 | "source": [ 841 | "df1" 842 | ] 843 | }, 844 | { 845 | "cell_type": "code", 846 | "execution_count": null, 847 | "metadata": {}, 848 | "outputs": [], 849 | "source": [ 850 | "df1['n_good'] = df1['prop_good'] * df1['n_obs']\n", 851 | "df1['n_bad'] = (1 - df1['prop_good']) * df1['n_obs']\n", 852 | "df1" 853 | ] 854 | }, 855 | { 856 | "cell_type": "code", 857 | "execution_count": null, 858 | "metadata": {}, 859 | "outputs": [], 860 | "source": [ 861 | "df1['prop_n_good'] = df1['n_good'] / df1['n_good'].sum()\n", 862 | "df1['prop_n_bad'] = df1['n_bad'] / df1['n_bad'].sum()\n", 863 | "df1" 864 | ] 865 | }, 866 | { 867 | "cell_type": "code", 868 | "execution_count": null, 869 | "metadata": {}, 870 | "outputs": [], 871 | "source": [ 872 | "df1['WoE'] = np.log(df1['prop_n_good'] / df1['prop_n_bad'])\n", 873 | "df1" 874 | ] 875 | }, 876 | { 877 | "cell_type": "code", 878 | "execution_count": null, 879 | "metadata": {}, 880 | "outputs": [], 881 | "source": [ 882 | "df1 = df1.sort_values(['WoE'])\n", 883 | "df1 = df1.reset_index(drop = True)\n", 884 | "df1" 885 | ] 886 | }, 887 | { 888 | "cell_type": "code", 889 | "execution_count": null, 890 | "metadata": {}, 891 | "outputs": [], 892 | "source": [ 893 | "df1['diff_prop_good'] = df1['prop_good'].diff().abs()\n", 894 | "df1['diff_WoE'] = df1['WoE'].diff().abs()\n", 895 | "df1" 896 | ] 897 | }, 898 | { 899 | "cell_type": "code", 900 | "execution_count": null, 901 | 
"metadata": {}, 902 | "outputs": [], 903 | "source": [ 904 | "df1['IV'] = (df1['prop_n_good'] - df1['prop_n_bad']) * df1['WoE']\n", 905 | "df1['IV'] = df1['IV'].sum()\n", 906 | "df1" 907 | ] 908 | }, 909 | { 910 | "cell_type": "markdown", 911 | "metadata": {}, 912 | "source": [ 913 | "### Preprocessing Discrete Variables: Automating Calculaions" 914 | ] 915 | }, 916 | { 917 | "cell_type": "code", 918 | "execution_count": null, 919 | "metadata": {}, 920 | "outputs": [], 921 | "source": [ 922 | "def woe_discrete(df, discrete_variable_name, good_bad_variable_df):\n", 923 | " df = pd.concat([df[discrete_variable_name], good_bad_variable_df], axis = 1)\n", 924 | " df = pd.concat([df.groupby(df.columns.values[0], as_index = False)[df.columns.values[1]].count(),\n", 925 | " df.groupby(df.columns.values[0], as_index = False)[df.columns.values[1]].mean()], axis = 1)\n", 926 | " df = df.iloc[:, [0, 1, 3]]\n", 927 | " df.columns = [df.columns.values[0], 'n_obs', 'prop_good']\n", 928 | " df['prop_n_obs'] = df['n_obs'] / df['n_obs'].sum()\n", 929 | " df['n_good'] = df['prop_good'] * df['n_obs']\n", 930 | " df['n_bad'] = (1 - df['prop_good']) * df['n_obs']\n", 931 | " df['prop_n_good'] = df['n_good'] / df['n_good'].sum()\n", 932 | " df['prop_n_bad'] = df['n_bad'] / df['n_bad'].sum()\n", 933 | " df['WoE'] = np.log(df['prop_n_good'] / df['prop_n_bad'])\n", 934 | " df = df.sort_values(['WoE'])\n", 935 | " df = df.reset_index(drop = True)\n", 936 | " df['diff_prop_good'] = df['prop_good'].diff().abs()\n", 937 | " df['diff_WoE'] = df['WoE'].diff().abs()\n", 938 | " df['IV'] = (df['prop_n_good'] - df['prop_n_bad']) * df['WoE']\n", 939 | " df['IV'] = df['IV'].sum()\n", 940 | " return df" 941 | ] 942 | }, 943 | { 944 | "cell_type": "code", 945 | "execution_count": null, 946 | "metadata": {}, 947 | "outputs": [], 948 | "source": [ 949 | "df_temp = woe_discrete(df_inputs_prepr, 'grade', df_targets_prepr)\n", 950 | "df_temp" 951 | ] 952 | } 953 | ], 954 | "metadata": { 955 | "kernelspec": 
{ 956 | "display_name": "Python 3", 957 | "language": "python", 958 | "name": "python3" 959 | }, 960 | "language_info": { 961 | "codemirror_mode": { 962 | "name": "ipython", 963 | "version": 3 964 | }, 965 | "file_extension": ".py", 966 | "mimetype": "text/x-python", 967 | "name": "python", 968 | "nbconvert_exporter": "python", 969 | "pygments_lexer": "ipython3", 970 | "version": "3.7.3" 971 | } 972 | }, 973 | "nbformat": 4, 974 | "nbformat_minor": 2 975 | } 976 | -------------------------------------------------------------------------------- /Section 5 PD Model, Data Preparation/Credit Risk Modeling - Preparation - With Comments - 5-2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Data Preparation" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Import Libraries" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "import numpy as np\n", 24 | "import pandas as pd" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "## Import Data\n", 32 | "The dataset contains all available data for more than 800,000 consumer loans issued from 2007 to 2015 by Lending Club: a large US peer-to-peer lending company. There are several different versions of this dataset. We have used a version available on kaggle.com. You can find it here: https://www.kaggle.com/wendykan/lending-club-loan-data/version/1\n", 33 | "We divided the data into two periods because we assume that some data are available at the moment when we need to build Expected Loss models, and some data comes from applications after. 
Later, we investigate whether the applications we have after we built the Probability of Default (PD) model have similar characteristics with the applications we used to build the PD model." 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": { 40 | "scrolled": true 41 | }, 42 | "outputs": [], 43 | "source": [ 44 | "loan_data_backup = pd.read_csv('loan_data_2007_2014.csv')" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "loan_data = loan_data_backup.copy()" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "## Explore Data" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "loan_data" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "pd.options.display.max_columns = None\n", 79 | "#pd.options.display.max_rows = None\n", 80 | "# Sets the pandas dataframe options to display all columns/ rows." 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "loan_data" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": { 96 | "scrolled": true 97 | }, 98 | "outputs": [], 99 | "source": [ 100 | "loan_data.head()" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": { 107 | "scrolled": true 108 | }, 109 | "outputs": [], 110 | "source": [ 111 | "loan_data.tail()" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "loan_data.columns.values\n", 121 | "# Displays all column names." 
122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "loan_data.info()\n", 131 | "# Displays column names, complete (non-missing) cases per column, and datatype per column." 132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "metadata": {}, 137 | "source": [ 138 | "## General Preprocessing" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": {}, 144 | "source": [ 145 | "### Preprocessing few continuous variables" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "loan_data['emp_length'].unique()\n", 155 | "# Displays unique values of a column." 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "loan_data['emp_length_int'] = loan_data['emp_length'].str.replace('\\+ years', '')\n", 165 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace('< 1 year', str(0))\n", 166 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace('n/a', str(0))\n", 167 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace(' years', '')\n", 168 | "loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace(' year', '')\n", 169 | "# We store the preprocessed ‘employment length’ variable in a new variable called ‘employment length int’,\n", 170 | "# We assign the new ‘employment length int’ to be equal to the ‘employment length’ variable with the string ‘+ years’\n", 171 | "# replaced with nothing. Next, we replace the whole string ‘less than 1 year’ with the string ‘0’.\n", 172 | "# Then, we replace the ‘n/a’ string with the string ‘0’. Then, we replace the string ‘space years’ with nothing.\n", 173 | "# Finally, we replace the string ‘space year’ with nothing." 
174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": null, 179 | "metadata": {}, 180 | "outputs": [], 181 | "source": [ 182 | "type(loan_data['emp_length_int'][0])\n", 183 | "# Checks the datatype of a single element of a column." 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "metadata": {}, 190 | "outputs": [], 191 | "source": [ 192 | "loan_data['emp_length_int'] = pd.to_numeric(loan_data['emp_length_int'])\n", 193 | "# Transforms the values to numeric." 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": null, 199 | "metadata": {}, 200 | "outputs": [], 201 | "source": [ 202 | "type(loan_data['emp_length_int'][0])\n", 203 | "# Checks the datatype of a single element of a column." 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": null, 209 | "metadata": {}, 210 | "outputs": [], 211 | "source": [ 212 | "loan_data['earliest_cr_line']\n", 213 | "# Displays a column." 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": null, 219 | "metadata": {}, 220 | "outputs": [], 221 | "source": [ 222 | "loan_data['earliest_cr_line_date'] = pd.to_datetime(loan_data['earliest_cr_line'], format = '%b-%y')\n", 223 | "# Extracts the date and the time from a string variable that is in a given format." 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": null, 229 | "metadata": { 230 | "scrolled": false 231 | }, 232 | "outputs": [], 233 | "source": [ 234 | "type(loan_data['earliest_cr_line_date'][0])\n", 235 | "# Checks the datatype of a single element of a column." 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": null, 241 | "metadata": {}, 242 | "outputs": [], 243 | "source": [ 244 | "pd.to_datetime('2017-12-01') - loan_data['earliest_cr_line_date']\n", 245 | "# Calculates the difference between two dates and times." 
246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": null, 251 | "metadata": {}, 252 | "outputs": [], 253 | "source": [ 254 | "# Assume we are now in December 2017\n", 255 | "loan_data['mths_since_earliest_cr_line'] = round(pd.to_numeric((pd.to_datetime('2017-12-01') - loan_data['earliest_cr_line_date']) / np.timedelta64(1, 'M')))\n", 256 | "# We calculate the difference between two dates in months, turn it to numeric datatype and round it.\n", 257 | "# We save the result in a new variable." 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": null, 263 | "metadata": {}, 264 | "outputs": [], 265 | "source": [ 266 | "loan_data['mths_since_earliest_cr_line'].describe()\n", 267 | "# Shows some descriptive statistics for the values of a column.\n", 268 | "# Dates from 1969 and before are not being converted well, i.e., they have become 2069 and similar,\n", 269 | "# and negative differences are being calculated." 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": null, 275 | "metadata": {}, 276 | "outputs": [], 277 | "source": [ 278 | "loan_data.loc[: , ['earliest_cr_line', 'earliest_cr_line_date', 'mths_since_earliest_cr_line']][loan_data['mths_since_earliest_cr_line'] < 0]\n", 279 | "# We take three columns from the dataframe. Then, we display them only for the rows where a variable has negative value.\n", 280 | "# There are 2303 strange negative values." 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": null, 286 | "metadata": {}, 287 | "outputs": [], 288 | "source": [ 289 | "loan_data['mths_since_earliest_cr_line'][loan_data['mths_since_earliest_cr_line'] < 0] = loan_data['mths_since_earliest_cr_line'].max()\n", 290 | "# We set the rows that had negative differences to the maximum value." 
291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": null, 296 | "metadata": {}, 297 | "outputs": [], 298 | "source": [ 299 | "min(loan_data['mths_since_earliest_cr_line'])\n", 300 | "# Calculates and shows the minimum value of a column." 301 | ] 302 | }, 303 | { 304 | "cell_type": "markdown", 305 | "metadata": {}, 306 | "source": [ 307 | "### Homework" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": null, 313 | "metadata": {}, 314 | "outputs": [], 315 | "source": [ 316 | "loan_data['term']" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": null, 322 | "metadata": {}, 323 | "outputs": [], 324 | "source": [ 325 | "loan_data['term'].describe()\n", 326 | "# Shows some descriptive statistics for the values of a column." 327 | ] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": null, 332 | "metadata": {}, 333 | "outputs": [], 334 | "source": [ 335 | "loan_data['term_int'] = loan_data['term'].str.replace(' months', '')\n", 336 | "# We replace a string with another string, in this case, with an empty string (i.e. with nothing)." 337 | ] 338 | }, 339 | { 340 | "cell_type": "code", 341 | "execution_count": null, 342 | "metadata": {}, 343 | "outputs": [], 344 | "source": [ 345 | "loan_data['term_int']" 346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": null, 351 | "metadata": {}, 352 | "outputs": [], 353 | "source": [ 354 | "type(loan_data['term_int'][25])\n", 355 | "# Checks the datatype of a single element of a column." 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": null, 361 | "metadata": {}, 362 | "outputs": [], 363 | "source": [ 364 | "loan_data['term_int'] = pd.to_numeric(loan_data['term'].str.replace(' months', ''))\n", 365 | "# We replace a string from a variable with another string, in this case, with an empty string (i.e. 
with nothing).\n", 366 | "# We turn the result to numeric datatype and save it in another variable.\n", 367 | "loan_data['term_int']" 368 | ] 369 | }, 370 | { 371 | "cell_type": "code", 372 | "execution_count": null, 373 | "metadata": {}, 374 | "outputs": [], 375 | "source": [ 376 | "type(loan_data['term_int'][0])\n", 377 | "# Checks the datatype of a single element of a column." 378 | ] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "execution_count": null, 383 | "metadata": {}, 384 | "outputs": [], 385 | "source": [ 386 | "loan_data['issue_d']" 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": null, 392 | "metadata": {}, 393 | "outputs": [], 394 | "source": [ 395 | "# Assume we are now in December 2017\n", 396 | "loan_data['issue_d_date'] = pd.to_datetime(loan_data['issue_d'], format = '%b-%y')\n", 397 | "# Extracts the date and the time from a string variable that is in a given format.\n", 398 | "loan_data['mths_since_issue_d'] = round(pd.to_numeric((pd.to_datetime('2017-12-01') - loan_data['issue_d_date']) / np.timedelta64(1, 'M')))\n", 399 | "# We calculate the difference between two dates in months, turn it to numeric datatype and round it.\n", 400 | "# We save the result in a new variable.\n", 401 | "loan_data['mths_since_issue_d'].describe()\n", 402 | "# Shows some descriptive statistics for the values of a column." 403 | ] 404 | }, 405 | { 406 | "cell_type": "markdown", 407 | "metadata": {}, 408 | "source": [ 409 | "### Preprocessing few discrete variables" 410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "execution_count": null, 415 | "metadata": {}, 416 | "outputs": [], 417 | "source": [ 418 | "loan_data.info()\n", 419 | "# Displays column names, complete (non-missing) cases per column, and datatype per column." 
420 | ] 421 | }, 422 | { 423 | "cell_type": "markdown", 424 | "metadata": {}, 425 | "source": [ 426 | "We are going to preprocess the following discrete variables: grade, sub_grade, home_ownership, verification_status, loan_status, purpose, addr_state, initial_list_status. Most likely, we are not going to use sub_grade, as it overlaps with grade." 427 | ] 428 | }, 429 | { 430 | "cell_type": "code", 431 | "execution_count": null, 432 | "metadata": {}, 433 | "outputs": [], 434 | "source": [ 435 | "pd.get_dummies(loan_data['grade'])\n", 436 | "# Create dummy variables from a variable." 437 | ] 438 | }, 439 | { 440 | "cell_type": "code", 441 | "execution_count": null, 442 | "metadata": {}, 443 | "outputs": [], 444 | "source": [ 445 | "pd.get_dummies(loan_data['grade'], prefix = 'grade', prefix_sep = ':')\n", 446 | "# Create dummy variables from a variable." 447 | ] 448 | }, 449 | { 450 | "cell_type": "code", 451 | "execution_count": null, 452 | "metadata": { 453 | "scrolled": true 454 | }, 455 | "outputs": [], 456 | "source": [ 457 | "loan_data_dummies = [pd.get_dummies(loan_data['grade'], prefix = 'grade', prefix_sep = ':'),\n", 458 | " pd.get_dummies(loan_data['sub_grade'], prefix = 'sub_grade', prefix_sep = ':'),\n", 459 | " pd.get_dummies(loan_data['home_ownership'], prefix = 'home_ownership', prefix_sep = ':'),\n", 460 | " pd.get_dummies(loan_data['verification_status'], prefix = 'verification_status', prefix_sep = ':'),\n", 461 | " pd.get_dummies(loan_data['loan_status'], prefix = 'loan_status', prefix_sep = ':'),\n", 462 | " pd.get_dummies(loan_data['purpose'], prefix = 'purpose', prefix_sep = ':'),\n", 463 | " pd.get_dummies(loan_data['addr_state'], prefix = 'addr_state', prefix_sep = ':'),\n", 464 | " pd.get_dummies(loan_data['initial_list_status'], prefix = 'initial_list_status', prefix_sep = ':')]\n", 465 | "# We create dummy variables from all 8 original independent variables, and save them into a list.\n", 466 | "# Note that we are using a particular 
naming convention for all variables: original variable name, colon, category name." 467 | ] 468 | }, 469 | { 470 | "cell_type": "code", 471 | "execution_count": null, 472 | "metadata": {}, 473 | "outputs": [], 474 | "source": [ 475 | "loan_data_dummies = pd.concat(loan_data_dummies, axis = 1)\n", 476 | "# We concatenate the dummy variables and this turns them into a dataframe." 477 | ] 478 | }, 479 | { 480 | "cell_type": "code", 481 | "execution_count": null, 482 | "metadata": {}, 483 | "outputs": [], 484 | "source": [ 485 | "type(loan_data_dummies)\n", 486 | "# Returns the type of the variable." 487 | ] 488 | }, 489 | { 490 | "cell_type": "code", 491 | "execution_count": null, 492 | "metadata": {}, 493 | "outputs": [], 494 | "source": [ 495 | "loan_data = pd.concat([loan_data, loan_data_dummies], axis = 1)\n", 496 | "# Concatenates two dataframes.\n", 497 | "# Here we concatenate the dataframe with original data with the dataframe with dummy variables, along the columns. " 498 | ] 499 | }, 500 | { 501 | "cell_type": "code", 502 | "execution_count": null, 503 | "metadata": {}, 504 | "outputs": [], 505 | "source": [ 506 | "loan_data.columns.values\n", 507 | "# Displays all column names." 508 | ] 509 | }, 510 | { 511 | "cell_type": "markdown", 512 | "metadata": {}, 513 | "source": [ 514 | "### Check for missing values and clean" 515 | ] 516 | }, 517 | { 518 | "cell_type": "code", 519 | "execution_count": null, 520 | "metadata": { 521 | "scrolled": false 522 | }, 523 | "outputs": [], 524 | "source": [ 525 | "loan_data.isnull()\n", 526 | "# It returns 'False' if a value is not missing and 'True' if a value is missing, for each value in a dataframe." 
527 | ] 528 | }, 529 | { 530 | "cell_type": "code", 531 | "execution_count": null, 532 | "metadata": {}, 533 | "outputs": [], 534 | "source": [ 535 | "pd.options.display.max_rows = None\n", 536 | "# Sets the pandas dataframe options to display all columns/ rows.\n", 537 | "loan_data.isnull().sum()" 538 | ] 539 | }, 540 | { 541 | "cell_type": "code", 542 | "execution_count": null, 543 | "metadata": {}, 544 | "outputs": [], 545 | "source": [ 546 | "pd.options.display.max_rows = 100\n", 547 | "# Sets the pandas dataframe options to display 100 columns/ rows." 548 | ] 549 | }, 550 | { 551 | "cell_type": "code", 552 | "execution_count": null, 553 | "metadata": {}, 554 | "outputs": [], 555 | "source": [ 556 | "# 'Total revolving high credit/ credit limit', so it makes sense that the missing values are equal to funded_amnt.\n", 557 | "loan_data['total_rev_hi_lim'].fillna(loan_data['funded_amnt'], inplace=True)\n", 558 | "# We fill the missing values with the values of another variable." 559 | ] 560 | }, 561 | { 562 | "cell_type": "code", 563 | "execution_count": null, 564 | "metadata": {}, 565 | "outputs": [], 566 | "source": [ 567 | "loan_data['total_rev_hi_lim'].isnull().sum()" 568 | ] 569 | }, 570 | { 571 | "cell_type": "markdown", 572 | "metadata": {}, 573 | "source": [ 574 | "### Homework" 575 | ] 576 | }, 577 | { 578 | "cell_type": "code", 579 | "execution_count": null, 580 | "metadata": {}, 581 | "outputs": [], 582 | "source": [ 583 | "loan_data['annual_inc'].fillna(loan_data['annual_inc'].mean(), inplace=True)\n", 584 | "# We fill the missing values with the mean value of the non-missing values." 
585 | ] 586 | }, 587 | { 588 | "cell_type": "code", 589 | "execution_count": null, 590 | "metadata": {}, 591 | "outputs": [], 592 | "source": [ 593 | "loan_data['mths_since_earliest_cr_line'].fillna(0, inplace=True)\n", 594 | "loan_data['acc_now_delinq'].fillna(0, inplace=True)\n", 595 | "loan_data['total_acc'].fillna(0, inplace=True)\n", 596 | "loan_data['pub_rec'].fillna(0, inplace=True)\n", 597 | "loan_data['open_acc'].fillna(0, inplace=True)\n", 598 | "loan_data['inq_last_6mths'].fillna(0, inplace=True)\n", 599 | "loan_data['delinq_2yrs'].fillna(0, inplace=True)\n", 600 | "loan_data['emp_length_int'].fillna(0, inplace=True)\n", 601 | "# We fill the missing values with zeroes." 602 | ] 603 | }, 604 | { 605 | "cell_type": "markdown", 606 | "metadata": {}, 607 | "source": [ 608 | "# PD model" 609 | ] 610 | }, 611 | { 612 | "cell_type": "markdown", 613 | "metadata": {}, 614 | "source": [ 615 | "## Data preparation" 616 | ] 617 | }, 618 | { 619 | "cell_type": "markdown", 620 | "metadata": {}, 621 | "source": [ 622 | "### Dependent Variable. Good/ Bad (Default) Definition. Default and Non-default Accounts." 623 | ] 624 | }, 625 | { 626 | "cell_type": "code", 627 | "execution_count": null, 628 | "metadata": {}, 629 | "outputs": [], 630 | "source": [ 631 | "loan_data['loan_status'].unique()\n", 632 | "# Displays unique values of a column." 633 | ] 634 | }, 635 | { 636 | "cell_type": "code", 637 | "execution_count": null, 638 | "metadata": { 639 | "scrolled": true 640 | }, 641 | "outputs": [], 642 | "source": [ 643 | "loan_data['loan_status'].value_counts()\n", 644 | "# Calculates the number of observations for each unique value of a variable." 
645 | ] 646 | }, 647 | { 648 | "cell_type": "code", 649 | "execution_count": null, 650 | "metadata": {}, 651 | "outputs": [], 652 | "source": [ 653 | "loan_data['loan_status'].value_counts() / loan_data['loan_status'].count()\n", 654 | "# We divide the number of observations for each unique value of a variable by the total number of observations.\n", 655 | "# Thus, we get the proportion of observations for each unique value of a variable." 656 | ] 657 | }, 658 | { 659 | "cell_type": "code", 660 | "execution_count": null, 661 | "metadata": { 662 | "scrolled": true 663 | }, 664 | "outputs": [], 665 | "source": [ 666 | "# Good/ Bad Definition\n", 667 | "loan_data['good_bad'] = np.where(loan_data['loan_status'].isin(['Charged Off', 'Default',\n", 668 | " 'Does not meet the credit policy. Status:Charged Off',\n", 669 | " 'Late (31-120 days)']), 0, 1)\n", 670 | "# We create a new variable that has the value of '0' if a condition is met, and the value of '1' if it is not met." 671 | ] 672 | }, 673 | { 674 | "cell_type": "code", 675 | "execution_count": null, 676 | "metadata": {}, 677 | "outputs": [], 678 | "source": [ 679 | "loan_data['good_bad']" 680 | ] 681 | } 682 | ], 683 | "metadata": { 684 | "kernelspec": { 685 | "display_name": "Python 3", 686 | "language": "python", 687 | "name": "python3" 688 | }, 689 | "language_info": { 690 | "codemirror_mode": { 691 | "name": "ipython", 692 | "version": 3 693 | }, 694 | "file_extension": ".py", 695 | "mimetype": "text/x-python", 696 | "name": "python", 697 | "nbconvert_exporter": "python", 698 | "pygments_lexer": "ipython3", 699 | "version": "3.7.4" 700 | } 701 | }, 702 | "nbformat": 4, 703 | "nbformat_minor": 2 704 | } 705 | --------------------------------------------------------------------------------