├── .gitattributes ├── .gitignore ├── .project ├── Credit_Card.sas ├── Credit_Scoring1.pptx └── README.md /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | 7 | # Standard to msysgit 8 | *.doc diff=astextplain 9 | *.DOC diff=astextplain 10 | *.docx diff=astextplain 11 | *.DOCX diff=astextplain 12 | *.dot diff=astextplain 13 | *.DOT diff=astextplain 14 | *.pdf diff=astextplain 15 | *.PDF diff=astextplain 16 | *.rtf diff=astextplain 17 | *.RTF diff=astextplain 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Windows image file caches 2 | Thumbs.db 3 | ehthumbs.db 4 | 5 | # Folder config file 6 | Desktop.ini 7 | 8 | # Recycle Bin used on file shares 9 | $RECYCLE.BIN/ 10 | 11 | # Windows Installer files 12 | *.cab 13 | *.msi 14 | *.msm 15 | *.msp 16 | 17 | # Windows shortcuts 18 | *.lnk 19 | 20 | # ========================= 21 | # Operating System Files 22 | # ========================= 23 | 24 | # OSX 25 | # ========================= 26 | 27 | .DS_Store 28 | .AppleDouble 29 | .LSOverride 30 | 31 | # Thumbnails 32 | ._* 33 | 34 | # Files that might appear in the root of a volume 35 | .DocumentRevisions-V100 36 | .fseventsd 37 | .Spotlight-V100 38 | .TemporaryItems 39 | .Trashes 40 | .VolumeIcon.icns 41 | 42 | # Directories potentially created on remote AFP share 43 | .AppleDB 44 | .AppleDesktop 45 | Network Trash Folder 46 | Temporary Items 47 | .apdisk 48 | -------------------------------------------------------------------------------- /.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | Regression 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 
-------------------------------------------------------------------------------- /Credit_Card.sas: --------------------------------------------------------------------------------
/*Regression- Graded Assignment*/

/*
   Part 1 of the script: import the raw credit card data, explore it, clean it,
   derive 0/1 dummy variables, treat outliers and split the result into
   development and validation samples.

   Data flow (all in library REG):
     creditcard  -> creditcard1 -> creditcard2 -> creditcard3
                 -> prep -> prep2 -> prep3 -> development / validation
*/

/*Permanent Library*/
libname reg "D:\Users\ms\Graded Assignments\Topic 10-Case Study Regression-Graded";

/*Importing the dataset*/
proc import
    datafile = "Z:\Assignments\Graded Assignment\Topic 10 - Regression Models\Credit.csv"
    out = reg.creditcard dbms = csv replace;
run;


/*Mandatory data exploration: frequencies of 0's and 1's in the target*/
proc freq data = reg.creditcard;
    tables NPA_Status;
run;


/*Data Exploration*/
proc means data = reg.creditcard n nmiss mean stddev min max range;
run;
proc freq data = reg.creditcard;
    tables Numberofdependents Gender MonthlyIncome;
run;
proc contents data = reg.creditcard;
run;

/*Data Cleaning: delete the rows whose NumberOfDependents holds the text
  "Goo" or "Bad" instead of a count.
  NOTE(review): "Goo" looks like a value truncated by the import - confirm
  against the PROC CONTENTS output above.
  The former drop=MonthlyIncome1 option was removed: that variable does not
  exist yet at this point, so the option only produced a log warning.*/
data reg.creditcard1;
    set reg.creditcard;
    if numberofdependents = "Goo" or numberofdependents = "Bad" then delete;
run;


/*Data Exploration after removing the invalid rows*/
proc means data = reg.creditcard1 n nmiss mean min max;
run;

proc freq data = reg.creditcard1;
    tables Numberofdependents Gender MonthlyIncome Education Occupation Rented_OwnHouse;
run;

proc freq data = reg.creditcard1;
    tables NPA_Status;
run;


/*Data Cleaning and conversion of MonthlyIncome and NumberOfDependents.
  The numeric copies are created first and the income filter is applied to
  the numeric copy, so no character value is compared with a number.
  The ?? modifier suppresses the invalid-data notes for non-numeric text;
  such values become missing and are imputed in the next step.*/
data reg.creditcard2;
    set reg.creditcard1;
    MonthlyIncome1      = input(MonthlyIncome, ?? 12.);
    NumberOfDependents1 = input(NumberOfDependents, ?? 10.);
    if MonthlyIncome1 >= 1000000 then delete;
run;
proc means data = reg.creditcard2;
    var MonthlyIncome1 NumberOfDependents1;
run;

/*Substituting fixed values for missing MonthlyIncome1 and NumberOfDependents1.
  NOTE(review): 6608 and 1 are hard-coded; presumably taken from the PROC MEANS
  output above - re-check them whenever the input data changes.*/
data reg.creditcard3 (drop = MonthlyIncome NumberOfDependents);
    set reg.creditcard2;
    if MonthlyIncome1 = . then MonthlyIncome1 = 6608;
    if NumberOfDependents1 = . then NumberOfDependents1 = 1;
run;
proc freq data = reg.creditcard3;
    tables MonthlyIncome1 NumberOfDependents1;
run;


/*Data Preparation
  dummy variables for: Age, Region, Income, house ownership, Occupation,
  Education, Gender.
  Each dummy is written as a comparison, which SAS evaluates to 1 (true)
  or 0 (false).*/
data reg.prep;
    set reg.creditcard3;

    /*AGE
      The age = 0 substitution is done BEFORE the dummies are derived, so the
      dummies agree with the stored age (previously such rows kept age30 = 1
      while age was later changed to 52).
      NOTE(review): a missing age compares lower than any number in SAS and
      would therefore still set age30 = 1 - confirm age has no missing values.*/
    if age = 0 then age = 52;

    age30 = (age le 30);
    age45 = (30 lt age le 45);
    age60 = (45 lt age le 60);
    /*age75 now really marks the 61-75 band; it used to be overwritten by the
      "older than 76" test, which also left age 76 in no band at all.*/
    age75 = (60 lt age le 75);
    age76plus = (age gt 75);

    /*REGION*/
    region_c = (region = "Centr");
    region_n = (region = "North");
    region_e = (region = "East");
    region_s = (region = "South");
    region_w = (region = "West");

    /*INCOME
      MonthlyIncome1 is already numeric, so it is copied directly instead of
      being passed through INPUT() again.*/
    monthly_income = MonthlyIncome1;
    length income $25;
    if 0 le monthly_income le 10000 then income = "0-10000";
    else if 10000 lt monthly_income le 50000 then income = "10001-50000";
    else if 50000 lt monthly_income le 100000 then income = "50001-100000";
    else if 100000 lt monthly_income le 250000 then income = "100001-250000";
    else if 250000 lt monthly_income le 500000 then income = "250001-500000";
    /*upper bound corrected from 750001 to 750000*/
    else if 500000 lt monthly_income le 750000 then income = "500001-750000";
    else if 750000 lt monthly_income le 999999 then income = "750001-999999";

    /*NOTE(review): no dummy is created for the "10001-50000" band, so that
      band acts as the baseline for income_cat1-income_cat6 - confirm this
      is intended.*/
    income_cat1 = (income = "0-10000");
    income_cat2 = (income = "50001-100000");
    income_cat3 = (income = "100001-250000");
    income_cat4 = (income = "250001-500000");
    income_cat5 = (income = "500001-750000");
    income_cat6 = (income = "750001-999999");

    /*house ownership*/
    house_owner_1 = (rented_ownhouse = "Ownhouse");
    house_owner_2 = (rented_ownhouse = "Rented");

    /*occupation*/
    job1 = (occupation = "Non-offi");
    job2 = (occupation = "Officer1");
    job3 = (occupation = "Officer2");
    job4 = (occupation = "Officer3");
    job5 = (occupation = "Self_Emp");

    /*Education*/
    edu1 = (Education = "Matric");
    edu2 = (Education = "Graduate");
    edu3 = (Education = "Post-Grad");
    edu4 = (Education = "PhD");
    edu5 = (Education = "Professional");

    /*Gender*/
    gend1 = (Gender = "Male");
    gend2 = (Gender = "Female");
run;


/*Identifying Outliers*/
proc univariate data = reg.prep;
    var revolvingutilizationofunsecuredl debtratio numberoftimes90dayslate
        numberoftime60_89dayspastduenotw numberoftime30_59dayspastduenotw;
run;

/*Outliers in revolvingutilizationofunsecuredl: rows above 5 are deleted,
  values above 1 and up to 5 are replaced by 0.32.
  A value of exactly 5 used to slip through both rules; it is now replaced.
  NOTE(review): 0.32 is hard-coded; presumably the mean reported by the
  PROC MEANS below.*/
proc means data = reg.prep n nmiss min max mean;
    var revolvingutilizationofunsecuredl;
run;
data reg.prep2;
    set reg.prep;
    if revolvingutilizationofunsecuredl > 5 then delete;
    else if revolvingutilizationofunsecuredl > 1 then revolvingutilizationofunsecuredl = 0.32;
run;
proc means data = reg.prep2 n nmiss min max mean;
    var revolvingutilizationofunsecuredl;
run;


/*Check of age after cleaning: the age = 0 substitution is applied when
  reg.prep is built, so only the verification remains here.*/
proc means data = reg.prep2 n nmiss mean min;
    var age;
run;


/*Removing outliers from the number-of-times-past-due counters
  (30-59, 60-89, 90+ days) and capping debtratio.
  The former drop= list was removed: MonthlyIncome and NumberOfDependents were
  already dropped in reg.creditcard3 and Income_cat7 is never created.*/
data reg.prep3;
    set reg.prep2;
    if NumberOfTime30_59DaysPastDueNotW >= 96 then delete;
    if NumberOfTime60_89DaysPastDueNotW >= 96 then delete;
    if NumberOfTimes90DaysLate >= 96 then delete;
    if debtratio > 1 then debtratio = 0.5;
run;

proc means data = reg.prep3;
run;


proc univariate data = reg.prep3;
    var revolvingutilizationofunsecuredl debtratio numberoftimes90dayslate
        numberoftime60_89dayspastduenotw numberoftime30_59dayspastduenotw NumberOfDependents1;
run;


/*****End of Data Exploration and Data Preparation*********************/


/*****Start of Model Building******************************************/


/*Partitioning data into development and validation datasets.
  ranuni(100) uses a fixed seed, so the split is reproducible; rows with a
  draw below 0.60 go to development, the rest to validation.*/
data reg.development reg.validation;
    set reg.prep3;
    if ranuni(100) < 0.60 then output reg.development;
    else output reg.validation;
run;


/*counting 0's and 1's in development and validation dataset*/
proc freq data = reg.development;
    tables NPA_Status;
run;

proc freq data = reg.validation;
    tables NPA_Status;
run;


/*
   Part 2 of the script: logistic regression on NPA_Status.

   DESCENDING reverses the response ordering, so for a 0/1 target the
   probability of NPA_Status = 1 is modelled.

   The reduced predictor list is shared by every model after iteration 1, so
   it is kept in one macro variable instead of being repeated in each step.
*/
%let final_vars =
    age30 age45 age60
    region_c region_n region_e region_s
    income_cat1 house_owner_1 job1 job3 edu1 edu3 edu4
    RevolvingUtilizationOfUnsecuredL NumberOfTime30_59daysPastDueNotW
    NumberOfTime60_89daysPastDueNotW DebtRatio NumberOfTimes90DaysLate
    NumberOfOpenCreditLinesAndLoans NumberRealEstateLoansOrLines;


/*Running Logistic Regression : iteration 1 (all candidate variables).
  NOTE(review): every dummy of region, house ownership, occupation, education
  and gender is included, so one dummy per group is linearly dependent on the
  others - check the log/parameter table for parameters set to 0.*/
proc logistic data = reg.development descending;
    model NPA_Status = age30 age45 age60 age75
        region_c region_n region_e region_s region_w
        income_cat1 income_cat2 income_cat3 income_cat4 income_cat5 income_cat6
        house_owner_1 house_owner_2
        job1 job2 job3 job4 job5
        edu1 edu2 edu3 edu4 edu5
        gend1 gend2
        RevolvingUtilizationOfUnsecuredL NumberOfTime30_59daysPastDueNotW
        NumberOfTime60_89daysPastDueNotW DebtRatio NumberOfTimes90DaysLate
        NumberOfOpenCreditLinesAndLoans NumberRealEstateLoansOrLines;
    output out = reg.creditscore1 predicted = pred_prob;
run;

/*iteration 2 - model with only significant variables*/
proc logistic data = reg.development descending;
    model NPA_Status = &final_vars;
    output out = reg.creditscore2 predicted = pred_prob;
run;


/*Auto iteration - forward Selection*/
proc logistic data = reg.development descending;
    model NPA_Status = &final_vars / selection = forward;
    output out = reg.creditscore3 predicted = pred_prob;
run;

/*Auto iteration - backward Selection.
  This step used to repeat selection = forward and overwrite
  reg.creditscore3; it now performs backward elimination and writes its
  predictions to a separate dataset.*/
proc logistic data = reg.development descending;
    model NPA_Status = &final_vars / selection = backward;
    output out = reg.creditscore3_bwd predicted = pred_prob;
run;




/*Final Iteration with variables significant at 0.05 only*/
proc logistic data = reg.development descending;
    model NPA_Status = &final_vars;
    output out = reg.creditscore4 predicted = pred_prob;
run;

/*Sort by predicted probability (highest first) and export*/
proc sort data = reg.creditscore4 out = reg.creditscore4_sorted;
    by descending pred_prob;
run;
proc export data = reg.creditscore4_sorted
    outfile = "Y:\Graded Assignments\Topic 10-Case Study Regression-Graded\Solution\log_reg.csv"
    dbms = csv replace;
run;

/*Assign each observation to one of 10 groups of pred_prob (variable decile)*/
proc rank data = reg.creditscore4_sorted out = reg.decile groups = 10 ties = mean;
    var pred_prob;
    ranks decile;
run;

proc export data = reg.decile
    outfile = "Y:\Graded Assignments\Topic 10-Case Study Regression-Graded\Solution\lift_chart.csv"
    dbms = csv replace;
run;

/*Scoring a new dataset - Confusion Matrix.
  The model is fitted on development and applied to validation through the
  SCORE statement; LACKFIT requests the Hosmer-Lemeshow goodness-of-fit test.*/
proc logistic data = reg.development descending;
    model NPA_Status = &final_vars / lackfit;
    output out = reg.creditscore5 predicted = pred_prob;
    score data = reg.validation out = reg.reg_out;
run;

/*F_NPA_Status = observed level, I_NPA_Status = level assigned by the model*/
proc freq data = reg.reg_out;
    tables F_NPA_Status*I_NPA_Status / nopercent norow nocol nocum;
run;

/************End of Development Dataset Modelling********************/

/************Validation of Model*************************************/
/*The same specification is re-fitted on the validation sample (the
  development coefficients are not reused in this step).*/
proc logistic data = reg.validation descending;
    model NPA_Status = &final_vars;
    output out = reg.model_validation predicted = pred_prob;
run;

proc sort data = reg.model_validation out = reg.model_validation_sorted;
    by descending pred_prob;
run;
proc export data = reg.model_validation_sorted
    outfile = "Y:\Graded Assignments\Topic 10-Case Study Regression-Graded\Solution\validation_log_reg.csv"
    dbms = csv replace;
run;

proc rank data = reg.model_validation_sorted out = reg.model_validation_decile groups = 10 ties = mean;
    var pred_prob;
    ranks decile;
run;

proc export data = reg.model_validation_decile
    outfile = "Y:\Graded Assignments\Topic 10-Case Study Regression-Graded\Solution\validation_gain_chart.csv"
    dbms = csv replace;
run;


/*Scoring-Confusion Matrix: model fitted on validation, applied to
  development; scored rows are written to reg.reg_out1.*/
proc logistic data = reg.validation descending;
    model NPA_Status = &final_vars / lackfit;
    output out = reg.model_validation predicted = pred_prob;
    score data = reg.development out = reg.reg_out1;
run;
/*Tabulate the dataset produced by the step above (reg.reg_out1); the
  previous code re-read reg.reg_out and so repeated the earlier matrix.*/
proc freq data = reg.reg_out1;
    tables F_NPA_Status*I_NPA_Status / nopercent norow nocol nocum;
run;

-------------------------------------------------------------------------------- /Credit_Scoring1.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/myankshah/SAS_Credit_Scoring_Model_using_LogisticRegression/31caec2957d0d808ab2d6c3db5f261ac2d1ab75f/Credit_Scoring1.pptx -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SAS_Credit_Scoring_Model_using_LogisticRegression 2 | **************************************************************************** 3 | Author : Mayank Shah 4 | Author URL : https://in.linkedin.com/in/myankshah 5 | Description : This repository contains SAS code for Credit card Scoring Model to predict future default(bad) customers. The model is prepared using Logistic Regression for binary categorical variable(Good/Bad) using Base SAS. 6 | Version : 1.0 7 | License : by myankshah 8 | --------------------------------------------------------------------------------