├── Train.csv ├── README.md └── Feature Engineering.ipynb /Train.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HanXiaoyang/Feature_Engineering_and_XGBoost_Parameter_Tuning/HEAD/Train.csv -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ### [Data Hackathon 3.X](https://discuss.analyticsvidhya.com/t/data-hackathon-online-3-x-5th-6th-september-2015/2899)的特征工程和Xgboost模型调参例子。 2 | 为了便于阅读和交互式学习,这里用的ipython notebook格式,csv文件是此比赛的数据集。欢迎大家下载自己运行。
3 | 有问题可以联系[@寒小阳](http://blog.csdn.net/han_xiaoyang)
4 | 邮箱:hanxiaoyang.ml@gmail.com 5 | -------------------------------------------------------------------------------- /Feature Engineering.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 特征工程\n", 8 | "数据集来源于Data Hackathon 3.x,所有的特征处理也只做最基本的参考,可自行尝试更多的特征工程工作,参考github里Feature engineering和Kaggle Titanic的案例。" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "metadata": {}, 14 | "source": [ 15 | "### 加载需要的库:" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 1, 21 | "metadata": { 22 | "collapsed": true 23 | }, 24 | "outputs": [], 25 | "source": [ 26 | "import pandas as pd\n", 27 | "import numpy as np\n", 28 | "%matplotlib inline" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 2, 34 | "metadata": { 35 | "collapsed": false 36 | }, 37 | "outputs": [], 38 | "source": [ 39 | "#载入数据:\n", 40 | "train = pd.read_csv('Train.csv')\n", 41 | "test = pd.read_csv('Test.csv')" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 3, 47 | "metadata": { 48 | "collapsed": false 49 | }, 50 | "outputs": [ 51 | { 52 | "data": { 53 | "text/plain": [ 54 | "((87020, 26), (37717, 24))" 55 | ] 56 | }, 57 | "execution_count": 3, 58 | "metadata": {}, 59 | "output_type": "execute_result" 60 | } 61 | ], 62 | "source": [ 63 | "train.shape, test.shape" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": {}, 69 | "source": [ 70 | "### 看看数据的基本情况" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 4, 76 | "metadata": { 77 | "collapsed": false 78 | }, 79 | "outputs": [ 80 | { 81 | "data": { 82 | "text/plain": [ 83 | "ID object\n", 84 | "Gender object\n", 85 | "City object\n", 86 | "Monthly_Income int64\n", 87 | "DOB object\n", 88 | "Lead_Creation_Date object\n", 89 | "Loan_Amount_Applied float64\n", 90 | "Loan_Tenure_Applied float64\n", 91 | "Existing_EMI 
float64\n", 92 | "Employer_Name object\n", 93 | "Salary_Account object\n", 94 | "Mobile_Verified object\n", 95 | "Var5 int64\n", 96 | "Var1 object\n", 97 | "Loan_Amount_Submitted float64\n", 98 | "Loan_Tenure_Submitted float64\n", 99 | "Interest_Rate float64\n", 100 | "Processing_Fee float64\n", 101 | "EMI_Loan_Submitted float64\n", 102 | "Filled_Form object\n", 103 | "Device_Type object\n", 104 | "Var2 object\n", 105 | "Source object\n", 106 | "Var4 int64\n", 107 | "LoggedIn int64\n", 108 | "Disbursed int64\n", 109 | "dtype: object" 110 | ] 111 | }, 112 | "execution_count": 4, 113 | "metadata": {}, 114 | "output_type": "execute_result" 115 | } 116 | ], 117 | "source": [ 118 | "train.dtypes" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": {}, 124 | "source": [ 125 | "### 拿前5条出来看看" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 5, 131 | "metadata": { 132 | "collapsed": false 133 | }, 134 | "outputs": [ 135 | { 136 | "data": { 137 | "text/html": [ 138 | "
\n", 139 | "\n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | "
IDGenderCityMonthly_IncomeDOBLead_Creation_DateLoan_Amount_AppliedLoan_Tenure_AppliedExisting_EMIEmployer_Name...Interest_RateProcessing_FeeEMI_Loan_SubmittedFilled_FormDevice_TypeVar2SourceVar4LoggedInDisbursed
0ID000002C20FemaleDelhi2000023-May-7815-May-15300000.05.00.0CYBOSOL...NaNNaNNaNNWeb-browserGS122100
1ID000004E40MaleMumbai3500007-Oct-8504-May-15200000.02.00.0TATA CONSULTANCY SERVICES LTD (TCS)...13.25NaN6762.9NWeb-browserGS122300
2ID000007H20MalePanchkula2250010-Oct-8119-May-15600000.04.00.0ALCHEMIST HOSPITALS LTD...NaNNaNNaNNWeb-browserBS143100
3ID000008I30MaleSaharsa3500030-Nov-8709-May-151000000.05.00.0BIHAR GOVERNMENT...NaNNaNNaNNWeb-browserBS143300
4ID000009J40MaleBengaluru10000017-Feb-8420-May-15500000.02.025000.0GLOBAL EDGE SOFTWARE...NaNNaNNaNNWeb-browserBS134310
\n", 289 | "

5 rows × 26 columns

\n", 290 | "
" 291 | ], 292 | "text/plain": [ 293 | " ID Gender City Monthly_Income DOB \\\n", 294 | "0 ID000002C20 Female Delhi 20000 23-May-78 \n", 295 | "1 ID000004E40 Male Mumbai 35000 07-Oct-85 \n", 296 | "2 ID000007H20 Male Panchkula 22500 10-Oct-81 \n", 297 | "3 ID000008I30 Male Saharsa 35000 30-Nov-87 \n", 298 | "4 ID000009J40 Male Bengaluru 100000 17-Feb-84 \n", 299 | "\n", 300 | " Lead_Creation_Date Loan_Amount_Applied Loan_Tenure_Applied Existing_EMI \\\n", 301 | "0 15-May-15 300000.0 5.0 0.0 \n", 302 | "1 04-May-15 200000.0 2.0 0.0 \n", 303 | "2 19-May-15 600000.0 4.0 0.0 \n", 304 | "3 09-May-15 1000000.0 5.0 0.0 \n", 305 | "4 20-May-15 500000.0 2.0 25000.0 \n", 306 | "\n", 307 | " Employer_Name ... Interest_Rate Processing_Fee \\\n", 308 | "0 CYBOSOL ... NaN NaN \n", 309 | "1 TATA CONSULTANCY SERVICES LTD (TCS) ... 13.25 NaN \n", 310 | "2 ALCHEMIST HOSPITALS LTD ... NaN NaN \n", 311 | "3 BIHAR GOVERNMENT ... NaN NaN \n", 312 | "4 GLOBAL EDGE SOFTWARE ... NaN NaN \n", 313 | "\n", 314 | " EMI_Loan_Submitted Filled_Form Device_Type Var2 Source Var4 LoggedIn \\\n", 315 | "0 NaN N Web-browser G S122 1 0 \n", 316 | "1 6762.9 N Web-browser G S122 3 0 \n", 317 | "2 NaN N Web-browser B S143 1 0 \n", 318 | "3 NaN N Web-browser B S143 3 0 \n", 319 | "4 NaN N Web-browser B S134 3 1 \n", 320 | "\n", 321 | " Disbursed \n", 322 | "0 0 \n", 323 | "1 0 \n", 324 | "2 0 \n", 325 | "3 0 \n", 326 | "4 0 \n", 327 | "\n", 328 | "[5 rows x 26 columns]" 329 | ] 330 | }, 331 | "execution_count": 5, 332 | "metadata": {}, 333 | "output_type": "execute_result" 334 | } 335 | ], 336 | "source": [ 337 | "train.head(5)" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": 6, 343 | "metadata": { 344 | "collapsed": false 345 | }, 346 | "outputs": [ 347 | { 348 | "data": { 349 | "text/plain": [ 350 | "(124737, 27)" 351 | ] 352 | }, 353 | "execution_count": 6, 354 | "metadata": {}, 355 | "output_type": "execute_result" 356 | } 357 | ], 358 | "source": [ 359 | "#合成一个总的data\n", 
360 | "train['source']= 'train'\n", 361 | "test['source'] = 'test'\n", 362 | "data=pd.concat([train, test],ignore_index=True)\n", 363 | "data.shape" 364 | ] 365 | }, 366 | { 367 | "cell_type": "markdown", 368 | "metadata": {}, 369 | "source": [ 370 | "### 数据应用/建模一个很重要的工作是,你要看看异常点,比如说缺失值" 371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": 7, 376 | "metadata": { 377 | "collapsed": false, 378 | "scrolled": false 379 | }, 380 | "outputs": [ 381 | { 382 | "data": { 383 | "text/plain": [ 384 | "City 1401\n", 385 | "DOB 0\n", 386 | "Device_Type 0\n", 387 | "Disbursed 37717\n", 388 | "EMI_Loan_Submitted 84901\n", 389 | "Employer_Name 113\n", 390 | "Existing_EMI 111\n", 391 | "Filled_Form 0\n", 392 | "Gender 0\n", 393 | "ID 0\n", 394 | "Interest_Rate 84901\n", 395 | "Lead_Creation_Date 0\n", 396 | "Loan_Amount_Applied 111\n", 397 | "Loan_Amount_Submitted 49535\n", 398 | "Loan_Tenure_Applied 111\n", 399 | "Loan_Tenure_Submitted 49535\n", 400 | "LoggedIn 37717\n", 401 | "Mobile_Verified 0\n", 402 | "Monthly_Income 0\n", 403 | "Processing_Fee 85346\n", 404 | "Salary_Account 16801\n", 405 | "Source 0\n", 406 | "Var1 0\n", 407 | "Var2 0\n", 408 | "Var4 0\n", 409 | "Var5 0\n", 410 | "source 0\n", 411 | "dtype: int64" 412 | ] 413 | }, 414 | "execution_count": 7, 415 | "metadata": {}, 416 | "output_type": "execute_result" 417 | } 418 | ], 419 | "source": [ 420 | "data.apply(lambda x: sum(x.isnull()))" 421 | ] 422 | }, 423 | { 424 | "cell_type": "markdown", 425 | "metadata": {}, 426 | "source": [ 427 | "### 要对数据有更深的认识,比如说,咱们看看这些字段,分别有多少种取值(甚至你可以看看分布)" 428 | ] 429 | }, 430 | { 431 | "cell_type": "code", 432 | "execution_count": 8, 433 | "metadata": { 434 | "collapsed": false, 435 | "scrolled": false 436 | }, 437 | "outputs": [ 438 | { 439 | "name": "stdout", 440 | "output_type": "stream", 441 | "text": [ 442 | "\n", 443 | "Gender这一列数据的不同取值和出现的次数\n", 444 | "\n", 445 | "Male 71398\n", 446 | "Female 53339\n", 447 | "Name: Gender, dtype: int64\n", 448 | "\n", 
449 | "Salary_Account这一列数据的不同取值和出现的次数\n", 450 | "\n", 451 | "HDFC Bank 25180\n", 452 | "ICICI Bank 19547\n", 453 | "State Bank of India 17110\n", 454 | "Axis Bank 12590\n", 455 | "Citibank 3398\n", 456 | "Kotak Bank 2955\n", 457 | "IDBI Bank 2213\n", 458 | "Punjab National Bank 1747\n", 459 | "Bank of India 1713\n", 460 | "Bank of Baroda 1675\n", 461 | "Standard Chartered Bank 1434\n", 462 | "Canara Bank 1385\n", 463 | "Union Bank of India 1330\n", 464 | "Yes Bank 1120\n", 465 | "ING Vysya 996\n", 466 | "Corporation bank 948\n", 467 | "Indian Overseas Bank 901\n", 468 | "State Bank of Hyderabad 854\n", 469 | "Indian Bank 773\n", 470 | "Oriental Bank of Commerce 761\n", 471 | "IndusInd Bank 711\n", 472 | "Andhra Bank 706\n", 473 | "Central Bank of India 648\n", 474 | "Syndicate Bank 614\n", 475 | "Bank of Maharasthra 576\n", 476 | "HSBC 474\n", 477 | "State Bank of Bikaner & Jaipur 448\n", 478 | "Karur Vysya Bank 435\n", 479 | "State Bank of Mysore 385\n", 480 | "Federal Bank 377\n", 481 | "Vijaya Bank 354\n", 482 | "Allahabad Bank 345\n", 483 | "UCO Bank 344\n", 484 | "State Bank of Travancore 333\n", 485 | "Karnataka Bank 279\n", 486 | "United Bank of India 276\n", 487 | "Dena Bank 268\n", 488 | "Saraswat Bank 265\n", 489 | "State Bank of Patiala 263\n", 490 | "South Indian Bank 223\n", 491 | "Deutsche Bank 176\n", 492 | "Abhyuday Co-op Bank Ltd 161\n", 493 | "The Ratnakar Bank Ltd 113\n", 494 | "Tamil Nadu Mercantile Bank 103\n", 495 | "Punjab & Sind bank 84\n", 496 | "J&K Bank 78\n", 497 | "Lakshmi Vilas bank 69\n", 498 | "Dhanalakshmi Bank Ltd 66\n", 499 | "State Bank of Indore 32\n", 500 | "Catholic Syrian Bank 27\n", 501 | "India Bulls 21\n", 502 | "B N P Paribas 15\n", 503 | "Firstrand Bank Limited 11\n", 504 | "GIC Housing Finance Ltd 10\n", 505 | "Bank of Rajasthan 8\n", 506 | "Kerala Gramin Bank 4\n", 507 | "Industrial And Commercial Bank Of China Limited 3\n", 508 | "Ahmedabad Mercantile Cooperative Bank 1\n", 509 | "Name: Salary_Account, dtype: 
int64\n", 510 | "\n", 511 | "Mobile_Verified这一列数据的不同取值和出现的次数\n", 512 | "\n", 513 | "Y 80928\n", 514 | "N 43809\n", 515 | "Name: Mobile_Verified, dtype: int64\n", 516 | "\n", 517 | "Var1这一列数据的不同取值和出现的次数\n", 518 | "\n", 519 | "HBXX 84901\n", 520 | "HBXC 12952\n", 521 | "HBXB 6502\n", 522 | "HAXA 4214\n", 523 | "HBXA 3042\n", 524 | "HAXB 2879\n", 525 | "HBXD 2818\n", 526 | "HAXC 2171\n", 527 | "HBXH 1387\n", 528 | "HCXF 990\n", 529 | "HAYT 710\n", 530 | "HAVC 570\n", 531 | "HAXM 386\n", 532 | "HCXD 348\n", 533 | "HCYS 318\n", 534 | "HVYS 252\n", 535 | "HAZD 161\n", 536 | "HCXG 114\n", 537 | "HAXF 22\n", 538 | "Name: Var1, dtype: int64\n", 539 | "\n", 540 | "Filled_Form这一列数据的不同取值和出现的次数\n", 541 | "\n", 542 | "N 96740\n", 543 | "Y 27997\n", 544 | "Name: Filled_Form, dtype: int64\n", 545 | "\n", 546 | "Device_Type这一列数据的不同取值和出现的次数\n", 547 | "\n", 548 | "Web-browser 92105\n", 549 | "Mobile 32632\n", 550 | "Name: Device_Type, dtype: int64\n", 551 | "\n", 552 | "Var2这一列数据的不同取值和出现的次数\n", 553 | "\n", 554 | "B 53481\n", 555 | "G 47338\n", 556 | "C 20366\n", 557 | "E 1855\n", 558 | "D 918\n", 559 | "F 770\n", 560 | "A 9\n", 561 | "Name: Var2, dtype: int64\n", 562 | "\n", 563 | "Source这一列数据的不同取值和出现的次数\n", 564 | "\n", 565 | "S122 55249\n", 566 | "S133 42900\n", 567 | "S159 7999\n", 568 | "S143 6140\n", 569 | "S127 2804\n", 570 | "S137 2450\n", 571 | "S134 1900\n", 572 | "S161 1109\n", 573 | "S151 1018\n", 574 | "S157 929\n", 575 | "S153 705\n", 576 | "S144 447\n", 577 | "S156 432\n", 578 | "S158 294\n", 579 | "S123 112\n", 580 | "S141 83\n", 581 | "S162 60\n", 582 | "S124 43\n", 583 | "S150 19\n", 584 | "S160 11\n", 585 | "S136 5\n", 586 | "S138 5\n", 587 | "S155 5\n", 588 | "S139 4\n", 589 | "S129 4\n", 590 | "S135 2\n", 591 | "S131 1\n", 592 | "S130 1\n", 593 | "S132 1\n", 594 | "S125 1\n", 595 | "S140 1\n", 596 | "S142 1\n", 597 | "S126 1\n", 598 | "S154 1\n", 599 | "Name: Source, dtype: int64\n" 600 | ] 601 | } 602 | ], 603 | "source": [ 604 | "var = 
['Gender','Salary_Account','Mobile_Verified','Var1','Filled_Form','Device_Type','Var2','Source']\n", 605 | "for v in var:  # print() call form gives the same output on Python 2 and also runs on Python 3\n", 606 | " print('\\n%s这一列数据的不同取值和出现的次数\\n'%v)\n", 607 | " print(data[v].value_counts())" 608 | ] 609 | }, 610 | { 611 | "cell_type": "markdown", 612 | "metadata": {}, 613 | "source": [ 614 | "### 紧接着你就可以开始处理你的字段(特征)了\n", 615 | "我这里只做了一些简单的处理,你大可在我的基础上做更复杂的特征处理" 616 | ] 617 | }, 618 | { 619 | "cell_type": "markdown", 620 | "metadata": {}, 621 | "source": [ 622 | "### City字段处理" 623 | ] 624 | }, 625 | { 626 | "cell_type": "code", 627 | "execution_count": 9, 628 | "metadata": { 629 | "collapsed": false 630 | }, 631 | "outputs": [ 632 | { 633 | "data": { 634 | "text/plain": [ 635 | "724" 636 | ] 637 | }, 638 | "execution_count": 9, 639 | "metadata": {}, 640 | "output_type": "execute_result" 641 | } 642 | ], 643 | "source": [ 644 | "len(data['City'].unique())" 645 | ] 646 | }, 647 | { 648 | "cell_type": "markdown", 649 | "metadata": {}, 650 | "source": [ 651 | "#### 好像city的类型好多,粗暴一点,这个字段咱们不要了" 652 | ] 653 | }, 654 | { 655 | "cell_type": "code", 656 | "execution_count": 10, 657 | "metadata": { 658 | "collapsed": false, 659 | "scrolled": false 660 | }, 661 | "outputs": [], 662 | "source": [ 663 | "data.drop('City',axis=1,inplace=True)" 664 | ] 665 | }, 666 | { 667 | "cell_type": "markdown", 668 | "metadata": {}, 669 | "source": [ 670 | "### DOB字段处理\n", 671 | "DOB是出生的具体日期,咱们要具体日期作用没那么大,年龄段可能对我们有用,所以算一下年龄好了" 672 | ] 673 | }, 674 | { 675 | "cell_type": "code", 676 | "execution_count": 11, 677 | "metadata": { 678 | "collapsed": false 679 | }, 680 | "outputs": [ 681 | { 682 | "data": { 683 | "text/plain": [ 684 | "0 23-May-78\n", 685 | "1 07-Oct-85\n", 686 | "2 10-Oct-81\n", 687 | "3 30-Nov-87\n", 688 | "4 17-Feb-84\n", 689 | "Name: DOB, dtype: object" 690 | ] 691 | }, 692 | "execution_count": 11, 693 | "metadata": {}, 694 | "output_type": "execute_result" 695 | } 696 | ], 697 | "source": [ 698 | "data['DOB'].head()" 699 | ] 700 | }, 701 | { 702 | 
"cell_type": "code", 703 | "execution_count": 12, 704 | "metadata": { 705 | "collapsed": false 706 | }, 707 | "outputs": [ 708 | { 709 | "data": { 710 | "text/plain": [ 711 | "0 37\n", 712 | "1 30\n", 713 | "2 34\n", 714 | "3 28\n", 715 | "4 31\n", 716 | "Name: Age, dtype: int64" 717 | ] 718 | }, 719 | "execution_count": 12, 720 | "metadata": {}, 721 | "output_type": "execute_result" 722 | } 723 | ], 724 | "source": [ 725 | "# Create the Age column from the two-digit DOB year (reference year 2015, as in the sample lead dates); % 100 reads years 00-15 as 2000-2015 instead of producing ages 100-115\n", 726 | "data['Age'] = data['DOB'].apply(lambda x: (115 - int(x[-2:])) % 100)\n", 727 | "data['Age'].head()" 728 | ] 729 | }, 730 | { 731 | "cell_type": "code", 732 | "execution_count": 13, 733 | "metadata": { 734 | "collapsed": true 735 | }, 736 | "outputs": [], 737 | "source": [ 738 | "# Drop the original DOB column:\n", 739 | "data.drop('DOB',axis=1,inplace=True)" 740 | ] 741 | }, 742 | { 743 | "cell_type": "markdown", 744 | "metadata": {}, 745 | "source": [ 746 | "### EMI_Loan_Submitted字段处理" 747 | ] 748 | }, 749 | { 750 | "cell_type": "code", 751 | "execution_count": 14, 752 | "metadata": { 753 | "collapsed": false 754 | }, 755 | "outputs": [ 756 | { 757 | "data": { 758 | "text/plain": [ 759 | "" 760 | ] 761 | }, 762 | "execution_count": 14, 763 | "metadata": {}, 764 | "output_type": "execute_result" 765 | }, 766 | { 767 | "data": { 768 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAYcAAAEBCAYAAACT92m7AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAHaBJREFUeJzt3X+U1fV95/HniyGISZDBtMRdDI4GsJjVTIhif9h1oito\nu1XSXSk5G2EiJ3sqstG2mw3Y04xszjbR3a2TNKvZtMRBa0oIW38ksUiMXFNXAphI0EBh3HYQiHJO\nBGbjJqGC7/3jfu74Ze78ujPznXsv83qcw8n387mfz/d+ridz39/P5/35fq8iAjMzs6wJ1R6AmZnV\nHgcHMzMr4+BgZmZlHBzMzKyMg4OZmZVxcDAzszKDBgdJayUdlrSrV/1/kLRH0guSPpepXy2pM722\nIFM/T9IuSfsktWfqJ0lan/pslTQz89qy1H6vpKUj/7hmZjYUQ5k53A8szFZIagF+B7g4Ii4G/luq\nnwssBuYC1wH3SlLqdh+wPCLmAHMklc65HDgSEbOBduDudK5pwKeBy4DLgTZJU4f5Oc3MrAKDBoeI\neAY42qv6FuBzEXEitflJqr8BWB8RJyKiC+gE5ks6B5gSETtSuweARZk+69LxRuCqdLwQ2BwR3RFx\nDNgMXFvh5zMzs2EYbs5hDvAvJX1P0hZJH0z1M4ADmXaHUt0M4GCm/mCqO6VPRJwEuiWdPcC5zMws\nZxNH0G9aRPyqpMuArwMXjNKYNHgTMzPL03CDwwHgbwAiYoekk5LeRfHqfmam3bmp7hDwnj7qybz2\nY0kNwFkRcUTSIaClV58tfQ1Gkh8QZWY2DBHR5wX5UIODOPWK/hGKuYGnJc0BJkXEa5IeAx6S9GcU\nl4BmAdsjIiR1S5oP7ACWAl9I53oMWAZsA24Enkr1TwD/JSWhJwDXAKsG+IBD/ChmY+vOO+/kzjvv\nrPYwzMq8tV+o3KDBQdJXKV7Bv0vSy0Ab8BXgfkkvAMcpftkTEbslbQB2A28AK+Ktb+1bgQ5gMvB4\nRGxK9WuBByV1Aq8BS9K5jkr6DPAcEMCalJg2qytdXV3VHoJZxXQ6XHFLitPhc9jpqbW1lY6OjmoP\nw6yMpH6XlXyHtFnOWltbqz0Es4p55mBmNk555mBWRYVCodpDMKuYg4OZmZXxspKZ2TjlZSUzM6uI\ng4NZzpxzsHrk4GBmZmWcczAzG6ecczAzs4o4OJjlzDkHq0cODmZmVsY5BzOzcco5BzMzq4iDg1nO\n2tvbqz0Es4o5OJjlbOfOndUeglnFHBzMctbU1FTtIZhVbKi/IW1mFSgUCj1bWNesWdNT39LSQktL\nS3UGZVYB71Yyy5l/JtRq1Yh2K0laK+mwpF19vPZHkt6UdHambrWkTkl7JC3I1M+TtEvSPkntmfpJ\nktanPlslzcy8tiy13ytpaSUf2szMhm8oOYf7gYW9KyWdC1wD7M/UzQUWA3OB64B7JZWi0n3A8oiY\nA8yRVDrncuBIRMwG2oG707mmAZ8GLgMuB9okTa34E5pVmX9D2urRoMEhIp4Bjvbx0j3AJ3vV3QCs\nj4gTEdEFdALzJZ0DTImIHandA8CiTJ916XgjcFU6XghsjojuiDgGbAauHdKnMqshzjFYPRrWbiVJ\n1wMHIuKFXi/NAA5kyodS3QzgYKb+YKo7pU9EnAS60zJVf+cyqyt+tpLVo4p3K0k6E7iD4pJSHvpM\njpiZ2dgZzlbW9wJNwA9TPuFc4AeS5lO8up+ZaXtuqjsEvKePejKv/VhSA3BWRByRdAho6dVnS3+D\nam1t7dlP3tjYSHNzc890vnTl5rLL1SiX6mplPC6P33KhUOjZOTfY/TdD2soqqQn4RkRc3Mdr/wjM\ni4ijki4CHqKYQJ4BfBuYHREh6XvAJ4AdwLeAL0TEJkkrgH8RESskLQEWRcSSlJB+DphHcfnrOeCD\nKf/QewzeympmVqGRbmX9KvAsxR1GL0v6WK8mQVoKiojdwAZgN
/A4sCLzrX0rsBbYB3RGxKZUvxb4\nJUmdwO3AqnSuo8BnKAaFbcCavgKDWa0rXbmZ1RPfBGeWs5UrV/LFL36x2sMwK+NHdptV0Ysvvljt\nIZhVzMHBzMzK+MF7Zjlob2/nkUceAeDpp5/u2TmyaNEibr/99iqOzGxonHMwy1lzc7N/08FqknMO\nZlXU2NhY7SGYVczBwSxnixYtGryRWY1xcDDLWXNzc7WHYFYxBwczMyvjhLSZ2TjlhLSZmVXEwcEs\nZ362ktUjBwczMyvjnIOZ2TjlnIOZmVXEwcEsZ845WD1ycDAzszLOOZiZjVPOOZiZWUUcHMxy5pyD\n1aNBg4OktZIOS9qVqbtb0h5JOyX9L0lnZV5bLakzvb4gUz9P0i5J+yS1Z+onSVqf+myVNDPz2rLU\nfq+kpaPzkc3MbDBDmTncDyzsVbcZeF9ENAOdwGoASRcBi4G5wHXAvZJK61n3AcsjYg4wR1LpnMuB\nIxExG2gH7k7nmgZ8GrgMuBxokzR1WJ/SrIpKvwJnVk8GDQ4R8QxwtFfdkxHxZip+Dzg3HV8PrI+I\nExHRRTFwzJd0DjAlInakdg8ApYfc3wCsS8cbgavS8UJgc0R0R8QxigHp2go/n5mZDcNo5BxuBh5P\nxzOAA5nXDqW6GcDBTP3BVHdKn4g4CXRLOnuAc5nVFeccrB5NHElnSX8MvBERfz1K4wHoc1vVYFpb\nW2lqagKKP8vY3NzcM50v/XG67HI1yqXfj66V8bg8fsuFQoGOjg6Anu/L/gzpPgdJ5wHfiIhLMnWt\nwMeBqyLieKpbBURE3JXKm4A2YD+wJSLmpvolwJURcUupTURsk9QAvBIR01Obloj4/dTnS+kcX+tj\nfL7PwcysQqNxn4PIXNFLuhb4JHB9KTAkjwFL0g6k84FZwPaIeJXictH8lKBeCjya6bMsHd8IPJWO\nnwCukTQ1JaevSXVmZpazoWxl/SrwLMUdRi9L+hjw58A7gW9L+oGkewEiYjewAdhNMQ+xInNJfyuw\nFtgHdEbEplS/FvglSZ3A7cCqdK6jwGeA54BtwJqUmDarK6VpvVk98eMzzHJWKBR61n/NaslAy0oO\nDmZm45SfrWRmZhVxcDDLmXMOVo8cHMzMrIxzDmZm45RzDmZmVhEHB7OcOedg9cjBwczMyjjnYGY2\nTjnnYGZmFXFwMMuZcw5WjxwczMysjHMOZmbjlHMOZmZWEQcHs5w552D1yMHBzMzKOOdgZjZOOedg\nZmYVcXAwy5lzDlaPBg0OktZKOixpV6ZumqTNkvZKekLS1MxrqyV1StojaUGmfp6kXZL2SWrP1E+S\ntD712SppZua1Zan9XklLR+cjm5nZYIYyc7gfWNirbhXwZERcCDwFrAaQdBGwGJgLXAfcK6m0nnUf\nsDwi5gBzJJXOuRw4EhGzgXbg7nSuacCngcuAy4G2bBAyqxctLS3VHoJZxQYNDhHxDHC0V/UNwLp0\nvA5YlI6vB9ZHxImI6AI6gfmSzgGmRMSO1O6BTJ/suTYCV6XjhcDmiOiOiGPAZuDaCj6bmZkN03Bz\nDtMj4jBARLwKTE/1M4ADmXaHUt0M4GCm/mCqO6VPRJwEuiWdPcC5zOqKcw5WjyaO0nlGcx9pn9uq\nBtPa2kpTUxMAjY2NNDc390znS3+cLrtcjfLOnTtrajwuj99yoVCgo6MDoOf7sj9Dus9B0nnANyLi\nklTeA7RExOG0ZLQlIuZKWgVERNyV2m0C2oD9pTapfglwZUTcUmoTEdskNQCvRMT01KYlIn4/9flS\nOsfX+hif73OwmlUoFHr+UM1qyWjc5yBOvaJ/DGhNx8uARzP1S9IOpPOBWcD2tPTULWl+SlAv7dVn\nWTq+kWKCG+AJ4BpJU1Ny+ppUZ1ZXSlduZvVk0GUlSV8FWoB3SXqZ4kzgc8DXJd1McVawGCAidkva\nAOwG3gBWZC7pbwU6gMnA4
xGxKdWvBR6U1Am8BixJ5zoq6TPAcxSXrdakxLRZXenq6qr2EMwq5sdn\nmOWgUCj0zBjWrFlDW1sbUFz39RKT1YqBlpVGKyFtZhm9g8Cdd95ZtbGYDYcfn2FmZmUcHMxy1tjY\nWO0hmFXMwcEsZ83NzdUeglnFnJA2Mxun/HsOZmZWEQcHs5z5JjirRw4OZjkrPVvJrJ44OJjl7Ngx\n39hv9cfBwczMyvgOabMc9H58Rokfn2H1wltZzXLW3NzsvIPVJG9lNasi3yFt9cjLSmY5yC4rPf30\n0z0P3vOyktULLyuZ5ay1tbXnpxnNaomXlczMrCIODmY5a21trfYQzCrmZSUzs3HKy0pmVeRnK1k9\nGlFwkLRa0o8k7ZL0kKRJkqZJ2ixpr6QnJE3t1b5T0h5JCzL189I59klqz9RPkrQ+9dkqaeZIxmtm\nZkMz7OAg6Tzg48AHIuISittiPwKsAp6MiAuBp4DVqf1FwGJgLnAdcK+k0nTmPmB5RMwB5khamOqX\nA0ciYjbQDtw93PGaVYu3rlo9GsnM4f8C/wS8Q9JE4EzgEHADsC61WQcsSsfXA+sj4kREdAGdwHxJ\n5wBTImJHavdApk/2XBuBq0cwXrOq8LKS1aNhB4eIOAr8d+BlikGhOyKeBN4dEYdTm1eB6anLDOBA\n5hSHUt0M4GCm/mCqO6VPRJwEjkk6e7hjNqsG3+Ng9WjYd0hLugD4A+A8oBv4uqR/B/TeNjSa24j6\nzKpDcbtgU1MTUHxcQXNzc890vnTl5rLL1Si/+uqrFAqFmhmPy+O3XCgUei5WSt+X/Rn2VlZJi4Fr\nIuLjqXwT8KvAVUBLRBxOS0ZbImKupFVARMRdqf0moA3YX2qT6pcAV0bELaU2EbFNUgPwSkRM72Ms\n3spqNaXQ66msbW1tgB+fYbVloK2sI3m20l7gTyRNBo5TzAfsAF4HWoG7gGXAo6n9Y8BDku6huFw0\nC9geESGpW9L81H8p8IVMn2XANuBGiglus5rXOwiUnq1kVi+GHRwi4oeSHgC+D5wEnge+DEwBNki6\nmeKsYHFqv1vSBmA38AawInO5fyvQAUwGHo+ITal+LfCgpE7gNWDJcMdrVi1dXV3VHoJZxXyHtFnO\n2tvbuf3226s9DLMyAy0rOTiYmY1TfnyGmZlVxMHBLGelXUtm9cTBwczMyjjnYGY2TjnnYGZmFXFw\nMMuZcw5WjxwczHL2+c9/vtpDMKuYg4NZzp5//vlqD8GsYg4OZmZWxsHBLAcrV66kqamJpqYm9u/f\n33O8cuXKag/NbEi8ldUsZ2m7YLWHYVbGW1nNzKwiDg5mObj44ouZOHEiEycWn4pfOr744ourPDKz\noRnJj/2YWT9ef/113nzzzZ5y6fj111+v1pDMKuKcg1nOnHOwWuWcg5mZVcTBwczMyjjnYJaD9vZ2\nHnnkkZ5yS0sLAIsWLfJPhlpdGNHMQdJUSV+XtEfSjyRdLmmapM2S9kp6QtLUTPvVkjpT+wWZ+nmS\ndknaJ6k9Uz9J0vrUZ6ukmSMZr9lYeemll+jq6qKrqwug5/ill16q7sDMhmhECWlJHcDTEXG/pInA\nO4A7gNci4m5JnwKmRcQqSRcBDwGXAecCTwKzIyIkbQNWRsQOSY8Dn4+IJyTdAlwcESsk/R7w4YhY\n0sc4nJC2muWEtNWqXBLSks4CfjMi7geIiBMR0Q3cAKxLzdYBi9Lx9cD61K4L6ATmSzoHmBIRO1K7\nBzJ9sufaCFw93PGamdnQjSTncD7wE0n3A+8HngNuB94dEYcBIuJVSdNT+xnA1kz/Q6nuBHAwU38w\n1Zf6HEjnOinpmKSzI+LICMZtljvnHKzejSQ4TATmAbdGxHOS7gFWAb3nz6M5n+5z+gPQ2tpKU1MT\nAI2NjTQ3N/f8QZZ+bMVll8eqvHbtWjo7Oyl59tlnmTBhAidOnKC5ubnq43N5fJYLhQIdHR0
APd+X\n/Rl2zkHSu4GtEXFBKl9BMTi8F2iJiMNpyWhLRMyVtAqIiLgrtd8EtAH7S21S/RLgyoi4pdQmIrZJ\nagBeiYjpfYzFOQerKYVCoeePcs2aNbS1tQHFP9DSH61ZtQ2Ucxj2zCF9+R+QNCci9lHMB/wo/WsF\n7gKWAY+mLo8BD6UZxgxgFrA9JaS7Jc0HdgBLgS9k+iwDtgE3Ak8Nd7xmY2njxo1885vf7CmXrtZ+\n8pOfODhYXRjpfQ6foPiF/zbgH4CPAQ3ABkk3U5wVLAaIiN2SNgC7gTeAFZnL/VuBDmAy8HhEbEr1\na4EHJXUCrwFlO5XMatGsWbN6pu2l33Mo1ZvVgxEFh4j4IcWtqb39q37afxb4bB/13wfKHlcZEcdJ\nwcXMzMaO75A2y0HpJriS0rFvgrN64aeymuWgoaHhlEd2l0yYMIGTJ09WYURm5XJJSJtZ/77zne/0\nu1vJrB545mCWgw9/+MNs2bIFgO7ubqZOLT5i7EMf+hAPP/xwNYdm1sMzB7Mxdtttt/H+978fKM4c\nSndFe+Zg9cIzB7McXHHFFTz33HMAHD9+nDPOOAOASy+9lGeeeaaaQzPr4V+CMzOzijg4mJlZGS8r\nmeXgzDPP5Be/+EVZ/eTJk/n5z39ehRGZlRtoWcnBwSwHUr8PEPYP/1jNcM7BbIxNmzatonqzWuPg\nYJaDqVOnIqlnBlE6Lt3vYFbrvKxklgPnHKweeFnJbIxNnNj3/aX91ZvVGgcHsxw0NTXR0NBAQ0MD\nQM/xYD/NaFYrvKxklgMvK1k98FZWszHmraxWD5xzMBtjkydPrqjerNY4OJjlYNasWX3mHPwb0lYv\nRhwcJE2Q9ANJj6XyNEmbJe2V9ISkqZm2qyV1StojaUGmfp6kXZL2SWrP1E+StD712Spp5kjHazYW\nXnzxRU6ePNnzq2+l4xdffLHKIzMbmtGYOdwG7M6UVwFPRsSFwFPAagBJFwGLgbnAdcC9emth9j5g\neUTMAeZIWpjqlwNHImI20A7cPQrjNTOzQYwoOEg6F/gt4C8z1TcA69LxOmBROr4eWB8RJyKiC+gE\n5ks6B5gSETtSuwcyfbLn2ghcPZLxmpnZ0Ix05nAP8Ekgu/3i3RFxGCAiXgWmp/oZwIFMu0OpbgZw\nMFN/MNWd0iciTgLHJJ09wjGbmdkghn27pqTfBg5HxE5JLQM0Hc19e/3uD2xtbe25waixsZHm5uae\nn2Qs/dC7yy6PZbk/tTI+l8dfuVAo0NHRATDoDZnDvs9B0p8CHwVOAGcCU4CHgUuBlog4nJaMtkTE\nXEmrgIiIu1L/TUAbsL/UJtUvAa6MiFtKbSJim6QG4JWImN5rKL7PwWqO73OwepDLfQ4RcUdEzIyI\nC4AlwFMRcRPwDaA1NVsGPJqOHwOWpB1I5wOzgO1p6alb0vyUoF7aq8+ydHwjxQS3mZnlLI+ngH0O\n2CDpZoqzgsUAEbFb0gaKO5veAFZkLvdvBTqAycDjEbEp1a8FHpTUCbxGMQiZmVnO/PgMsxx4Wcnq\ngR+fYWZmFXFwMDOzMg4OZmZWxsHBzMzKODiYmVkZBwczMyvj4GBmZmUcHMzMrIyDg5mZlXFwMDOz\nMg4OZmZWxsHBzMzKODiYmVkZBwczMyvj4GBmZmUcHMzMrIyDg5mZlXFwMDOzMg4OZmZWZtjBQdK5\nkp6S9CNJL0j6RKqfJmmzpL2SnpA0NdNntaROSXskLcjUz5O0S9I+Se2Z+kmS1qc+WyXNHO54zUaD\npCH9G+k5zKptJDOHE8AfRsT7gF8DbpX0K8Aq4MmIuBB4ClgNIOkiYDEwF7gOuFdv/RXcByyPiDnA\nHEkLU/1y4EhEzAbagbtHMF6zEYuIiv8Np59ZtQ07OETEqxGxMx2/DuwBzgVuANalZuuARen4emB9\nRJyIiC6gE5gv6RxgSkTsSO0eyPTJnmsjcPVwx2tmZkM
3KjkHSU1AM/A94N0RcRiKAQSYnprNAA5k\nuh1KdTOAg5n6g6nulD4RcRI4Juns0RizmZn1b+JITyDpnRSv6m+LiNcl9Z4Tj+Ycud/F2NbWVpqa\nmgBobGykubmZlpYWAAqFAoDLLlelfM8991AoFGpmPC6P33KhUKCjowOg5/uyPxrJ+qakicA3gb+N\niM+nuj1AS0QcTktGWyJirqRVQETEXandJqAN2F9qk+qXAFdGxC2lNhGxTVID8EpETO9jHOF1WjOz\nykgiIvq86B7pstJXgN2lwJA8BrSm42XAo5n6JWkH0vnALGB7WnrqljQ/JaiX9uqzLB3fSDHBbWZm\nORv2zEHSbwDfBV6guHQUwB3AdmAD8B6Ks4LFEXEs9VlNcQfSGxSXoTan+g8CHcBk4PGIuC3VnwE8\nCHwAeA1YkpLZvcfimYPVrOySklktGWjmMKJlpVrh4GC1rLW1QEdHS7WHYVbGwcGsiiTw/z2tFuWZ\nczAzs9OQg4NZ7grVHoBZxRwczMysjHMOZjlzzsFq1UA5hxHfIW1Wz84+G44ezf998n7Q6rRpcORI\nvu9h44uXlWxcO3q0eFWf578tWwq5v8dYBDgbXxwczMysjHMONq6dLvmA0+Vz2NjyfQ5mZlYRJ6Rt\nXAs0wIPgR0cBaMn3LdJz8T11sNHjmYONayLnTHExI537e8iBwUaZcw42rp0ua/Wny+ewseWcg5mZ\nVcQ5Bxv38r5BbSyyDtOm5Xp6G4ccHGxcG4ulGC/5WD1yzsEsZw4OVqucczAzs4rURXCQdK2kv5e0\nT9Knqj0es8oUqj0As4rVfHCQNAH4IrAQeB/wEUm/Ut1RmVViZ7UHYFaxmg8OwHygMyL2R8QbwHrg\nhiqPyWzIrrzyWLWHYFaxetitNAM4kCkfpBgwzMachrnvVVpTUXtvsLBqq4eZg1nNiIiK/y1btqzi\nPmbVVg8zh0PAzEz53FR3iuFe0ZmNhXXr1lV7CGYVqfn7HCQ1AHuBq4FXgO3ARyJiT1UHZmZ2Gqv5\nmUNEnJS0EthMcRlsrQODmVm+an7mYGZmY88JaTMzK+PgYDVJ0klJP5D0fPrf/5TqC5K6erV9RNJP\n0/F5kl4Y4LxXSvpGroMvf88zJf2VpF2SXpD0XUlvH6TPFknzRvCeXy7dLCppdaZ+qqRbhnG+Nkl/\nONzxWP2p+ZyDjVv/LyL6+nIM4JikX4+IZyVNBc7h1N/IHGytdKzXUm8DXo2IjwJImg28kecbRsS/\nzxTvAD6bjqcBK4D78nx/q3+eOVitGmhv8nrgI+n4d4G/GfGbSVenGcoPJf2lpLel+j+RtC1d9X8p\n036LpM+l1/5e0m8McPp/Rmb7dUR0RsQbvWc5kv5I0qcz/ZammdMuSZemNm2SOtLs4x8l/a6k/5ra\nPJ529/XMPCR9FjgzfbYHKQaJ96byXantf5S0XdJOSW2Z8fyxpL2SvgtcOIL/vFaHHBysVpW+0ErL\nSjdmXnsK+M303K0lFIPFsEk6A7gfuDEi3g+8DSgtvfx5RFweEZcAb5f025muDRFxOfAHwJ0DvMVX\ngFWS/rekz0ialXltoFnMmRHxAeDWNL6SCyj+etANwF8B307j+wWQHR8RsRr4WUTMi4ibgFXAS6n8\nKUnXALMjYj7wAeBSSVekJa3FwCXpnJcNME47DXlZyWrVz/pZVgI4ATxDMTBMjoiXNbK7IC8E/iEi\n/k8qr6O49PIF4GpJnwTeTnFJ5kXgW6ldacbyfeC8/k4eET+UdD6wALgG2C7p1yh+mQ/kr1P/v5M0\nRdJZqf5vI+LNNOtQRGxO9S8ATUP5wBkLgGsk/YDibO0dwGzgLODhiDgOHJf0WIXntTrn4GD16mvA\nw0BpGWakeYSy4JJmFP8DmBcRP05LLpMzTY6n/z3JIH9LEfEz4BHgEUlvAr8FbAAaMs0m9+7WT/l4\nOmdIyuYu3uxnHAM
FTgGfjYi/OKVSum2APjYOeFnJatWAM4GI+DvgT3lrSSnbfrBZRO/X9wLnSbog\nlW+i+CMMkyl+Ib8m6Z3Avx3OeCX9uqTGdDwJuAjoAg4DvyxpWgpE/7pX199Lfa4AuiPip5W8b8Y/\nSSoFjZ8CUzKvPQHcLOkd6b3+uaRfBr4LLJJ0hqQpwO8M4X3sNOKZg9WqyZmljgA2RcQdZK6mI+LP\nMu0r2a10laSXM+e+EfgYsDEldHcA/zMljf8C+BFvPbqlv/cY6D3fC9yXVr4mAN+KiIcBJP3n9H4H\ngeyd/wH8Iv03mJjG15f+3jdb/2Vgl6TvR8RNkp6VtIvi8tSnJM0Ftqbx/RT4aEQ8L2kDsItiENve\n+w3s9OY7pM3MrIyXlczMrIyXley0JGkBcBdvLa+I4o6kf3M6vadZXrysZGZmZbysZGZmZRwczMys\njIODmZmVcXAwM7MyDg5mZlbm/wPLSNHVButiugAAAABJRU5ErkJggg==\n", 769 | "text/plain": [ 770 | "" 771 | ] 772 | }, 773 | "metadata": {}, 774 | "output_type": "display_data" 775 | } 776 | ], 777 | "source": [ 778 | "data.boxplot(column=['EMI_Loan_Submitted'],return_type='axes')" 779 | ] 780 | }, 781 | { 782 | "cell_type": "code", 783 | "execution_count": 15, 784 | "metadata": { 785 | "collapsed": false 786 | }, 787 | "outputs": [ 788 | { 789 | "data": { 790 | "text/html": [ 791 | "
\n", 792 | "\n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | " \n", 841 | " \n", 842 | " \n", 843 | " \n", 844 | " \n", 845 | " \n", 846 | " \n", 847 | " \n", 848 | " \n", 849 | " \n", 850 | " \n", 851 | " \n", 852 | "
EMI_Loan_SubmittedEMI_Loan_Submitted_Missing
0NaN1
16762.900
2NaN1
3NaN1
4NaN1
56978.920
6NaN1
7NaN1
830824.650
910883.380
\n", 853 | "
" 854 | ], 855 | "text/plain": [ 856 | " EMI_Loan_Submitted EMI_Loan_Submitted_Missing\n", 857 | "0 NaN 1\n", 858 | "1 6762.90 0\n", 859 | "2 NaN 1\n", 860 | "3 NaN 1\n", 861 | "4 NaN 1\n", 862 | "5 6978.92 0\n", 863 | "6 NaN 1\n", 864 | "7 NaN 1\n", 865 | "8 30824.65 0\n", 866 | "9 10883.38 0" 867 | ] 868 | }, 869 | "execution_count": 15, 870 | "metadata": {}, 871 | "output_type": "execute_result" 872 | } 873 | ], 874 | "source": [ 875 | "#好像缺失值比较多,干脆就开一个新的字段,表明是缺失值还是不是缺失值\n", 876 | "data['EMI_Loan_Submitted_Missing'] = data['EMI_Loan_Submitted'].apply(lambda x: 1 if pd.isnull(x) else 0)\n", 877 | "data[['EMI_Loan_Submitted','EMI_Loan_Submitted_Missing']].head(10)" 878 | ] 879 | }, 880 | { 881 | "cell_type": "code", 882 | "execution_count": 16, 883 | "metadata": { 884 | "collapsed": true 885 | }, 886 | "outputs": [], 887 | "source": [ 888 | "#原始那一列就可以不要了\n", 889 | "data.drop('EMI_Loan_Submitted',axis=1,inplace=True)" 890 | ] 891 | }, 892 | { 893 | "cell_type": "markdown", 894 | "metadata": {}, 895 | "source": [ 896 | "### Employer Name字段处理" 897 | ] 898 | }, 899 | { 900 | "cell_type": "markdown", 901 | "metadata": {}, 902 | "source": [ 903 | "#### 看看个数" 904 | ] 905 | }, 906 | { 907 | "cell_type": "code", 908 | "execution_count": 17, 909 | "metadata": { 910 | "collapsed": false 911 | }, 912 | "outputs": [ 913 | { 914 | "data": { 915 | "text/plain": [ 916 | "57193" 917 | ] 918 | }, 919 | "execution_count": 17, 920 | "metadata": {}, 921 | "output_type": "execute_result" 922 | } 923 | ], 924 | "source": [ 925 | "len(data['Employer_Name'].value_counts())" 926 | ] 927 | }, 928 | { 929 | "cell_type": "markdown", 930 | "metadata": {}, 931 | "source": [ 932 | "#### 不看也知道,每个人都有一个名字,太多了,懒癌晚期的同学直接drop掉了" 933 | ] 934 | }, 935 | { 936 | "cell_type": "code", 937 | "execution_count": 18, 938 | "metadata": { 939 | "collapsed": true 940 | }, 941 | "outputs": [], 942 | "source": [ 943 | "#丢掉\n", 944 | "data.drop('Employer_Name',axis=1,inplace=True)" 945 | ] 946 | }, 947 | { 948 | 
"cell_type": "markdown", 949 | "metadata": {}, 950 | "source": [ 951 | "### Existing_EMI字段" 952 | ] 953 | }, 954 | { 955 | "cell_type": "code", 956 | "execution_count": 19, 957 | "metadata": { 958 | "collapsed": false 959 | }, 960 | "outputs": [ 961 | { 962 | "data": { 963 | "text/plain": [ 964 | "" 965 | ] 966 | }, 967 | "execution_count": 19, 968 | "metadata": {}, 969 | "output_type": "execute_result" 970 | }, 971 | { 972 | "data": { 973 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXEAAAEHCAYAAABY/HZ4AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAEA9JREFUeJzt3H2MZXV9x/H3B1ax9WHHh0QqCNOqhEbFKSmENLa9SCuL\nNoq1FpbGdKg2m1ZMaf8QTGxgExtLbdqJ2JasXdnaBCH1KVgRaehejPUBWl3EujysWoWVWq2ydhHt\ndv32jzkze5mdhzu7s3vPuft+JRvu75zfnPlBdj6c+ZyHVBWSpG46btQLkCQdOkNckjrMEJekDjPE\nJanDDHFJ6jBDXJI67KiHeJKtSb6V5ItDzP2LJF9I8vkk9yX57tFYoyR1RY72feJJXgrsBd5XVWes\n4usuA6aq6o1HbHGS1DFH/Uy8qj4FfG9wW5KfSfLxJHcluSPJaYt86Ubg/UdlkZLUEetGvYDGFmBT\nVX0lydnA3wDnze1McgowCfzzaJYnSe008hBP8mTgF4B/SJJm8xMWTLsY+ED5jgBJepyRhzizlc73\nqurMZeZcDPz+UVqPJHXGip34MHeTJHlXkgeS7EgyNcT3TfOHqvof4GtJfmPgeGcMfD4dmKiqzw5x\nXEk6pgxzYfN64Pyldia5AHheVb0A2ARct9zBktwAfBo4Lck3klwK/BbwhuZ/Al8CXjXwJRcBNw6x\nTkk65gx1i2GSU4GPLnZLYJLrgO1VdVMz3gn0qupba71YSdLjrcUthicBDw6MdzfbJElHmI/dS1KH\nrcXdKbuB5w6MT262HSSJtwhK0iGoqiy2fdgQn7+bZBE3A28CbkpyDvDIcn24t3qrrXq9Hv1+f9TL\nkA5y4BGag60Y4s3dJD3gmUm+AVwFPBGoqtpSVbckeUWSXcCjwKVrsmrpKHvkkUdGvQRp1VYM8aq6\nZIg5l63NcqTROfHEE0e9BGnVvLApNa688spRL0FataP6Ktokvv5EklYpyZIXNj0Tlxpe1FQXGeKS\n1GHWKZLUctYpkjSmDHGpYSeuLjLEJanD7MQlqeXsxCVpTBniUsNOXF1kiEtSh9mJS1LL2YlL0pgy\nxKWGnbi6yBCXpA6zE5eklrMTl6QxZYhLDTtxdZEhLkkdZicuSS1nJy5JY8oQlxp24uoiQ1ySOsxO\nXJJazk5cksaUIS417MTVRYa4JHWYnbgktZyduCSNKUNcatiJq4sMcUnqMDtxSWo5O3FJGlOGuNSw\nE1cXGeKS1GFDhXiSDUnuTXJ/kisW2f/MJB9PsiPJPUmm13yl0hHW6/VGvQRp1Va8sJnkOOB+4Dzg\nm8BdwMVVde/AnKuAJ1XVW5M8C7gPeHZV/d+CY3lhU5JW6XAvbJ4NPFBVX6+qfcCNwKsXzPlP4KnN\n56cC/70wwKW2sxNXF60bYs5JwIMD44eYDfZB7wFuT/JN4CnARWuzPEnScoYJ8WG8Fbi7qs5N8jzg\nn5KcUVV7F06cnp5mcnISgImJCaampua7yLkzIceORzGe29aW9Tg+dsf9fp
9t27YBzOflUobpxM8B\nrq6qDc34SqCq6pqBObcAf1JV/9KMbweuqKp/XXAsO3FJWqXD7cTvAp6f5NQkTwQuBm5eMGcn8CvN\nN3s2cBrw1UNfsnT0zZ0JSV2yYp1SVfuTXAbcxmzob62qnUk2ze6uLcA7gOuT3A0EeEtVffdILlyS\n5LtTJKn1fHeKJI0pQ1xq2ImriwxxSeowO3FJajk7cUkaU4a41LATVxcZ4pLUYXbiktRyduKSNKYM\ncalhJ64uMsQlqcPsxCWp5ezEJWlMGeJSw05cXWSIS1KH2YlLUsvZiUvSmDLEpYaduLrIEJekDrMT\nl6SWsxOXpDFliEsNO3F1kSEuSR1mJy5JLWcnLkljyhCXGnbi6iJDXJI6zE5cklrOTlySxpQhLjXs\nxNVFhrgkdZiduCS1nJ24JI0pQ1xq2ImriwxxSeqwoTrxJBuAGWZDf2tVXbPInB7wl8ATgG9X1bmL\nzLETl6RVWq4TXzHEkxwH3A+cB3wTuAu4uKruHZizHvg08PKq2p3kWVX1nUWOZYhL0iod7oXNs4EH\nqurrVbUPuBF49YI5lwAfrKrdAIsFuNR2duLqomFC/CTgwYHxQ822QacBz0iyPcldSV6/VguUJC1t\n3Roe50zgZcCTgc8k+UxV7Vqj40tHXK/XG/USpFUbJsR3A6cMjE9utg16CPhOVf0Q+GGSTwIvAQ4K\n8enpaSYnJwGYmJhgampq/odn7tdZx44dOz6Wx/1+n23btgHM5+VShrmweTxwH7MXNh8G7gQ2VtXO\ngTmnA9cCG4ATgM8BF1XVlxccywubaq1+vz//AyW1yXIXNlc8E6+q/UkuA27jwC2GO5Nsmt1dW6rq\n3iSfAL4I7Ae2LAxwSdLa890pktRyvjtFksaUIS415i4sSV1iiEtSh9mJS1LL2YlL0pgyxKXGzMzM\nqJcgrZohLjV27Ngx6iVIq2aIS42VHm+W2mitXoAldVK/35+/tXDz5s3z23u9no/gqxO8O0VqTE9P\nz790SGoT706RpDFliEuN6enpUS9BWjXrFElqOesUaQi+O0VdZIhLUodZp0hSy1mnSNKYMsSlhp24\nusgQl6QOsxOXpJazE5ekMWWISw07cXWRIS5JHWYnLkktZycuSWPKEJcaduLqIkNckjrMTlySWs5O\nXJLGlCEuNezE1UWGuCR1mJ24JLWcnbgkjSlDXGrYiauLDHFJ6jA7cUlqucPuxJNsSHJvkvuTXLHM\nvLOS7Evy64e6WEnS8FYM8STHAe8GzgdeCGxMcvoS8/4U+MRaL1I6GuzE1UXDnImfDTxQVV+vqn3A\njcCrF5n3ZuADwH+t4fokScsYJsRPAh4cGD/UbJuX5DnAhVX1N8CivY3Udr1eb9RLkFZtre5OmQEG\nu3KDXJKOgnVDzNkNnDIwPrnZNujngRuTBHgWcEGSfVV188KDTU9PMzk5CcDExARTU1PzZ0BznaRj\nx6MYz8zM+PfRcSvG/X6fbdu2Aczn5VJWvMUwyfHAfcB5wMPAncDGqtq5xPzrgY9W1YcW2ecthmqt\nfr8//wMltclytxiueCZeVfuTXAbcxmz9srWqdibZNLu7tiz8ksNesTQCBri6yId9JKnlfAGWNIS5\nTlLqEkNckjrMOkWSWs46RZLGlCEuNezE1UWGuCR1mJ24JLWcnbgkjSlDXGrYiauLDHFJ6jA7cUlq\nOTtxSRpThrjUsBNXFxniktRhduKS1HJ24pI0pgxxqWEnri4yxCWpw+zEJanl7MQlaUwZ4lLDTlxd\nZIhLUofZiUtSy9mJS9KYMsSlhp24usgQl6QOsxOXpJazE5ekMWWISw07cXWRIS5JHWYnLkktZycu\nSWPKEJcaduLqIkNckjrMTlySWs5OXJLG1FAhnmRDknuT3J/kikX2X5Lk7ubPp5K8eO2XKh1ZduLq\nohVDPMlxwLuB84EXAhuTnL5g2leBX6
qqlwBvB96z1guVJB1sxU48yTnAVVV1QTO+EqiqumaJ+RPA\nPVX13EX22YlL0iodbid+EvDgwPihZttS3gh8fPjlSZIO1bq1PFiSc4FLgZcuNWd6eprJyUkAJiYm\nmJqaotfrAQc6SceORzGemZnx76PjVoz7/T7btm0DmM/LpQxbp1xdVRua8aJ1SpIzgA8CG6rqK0sc\nyzpFrdXv9+d/oKQ2Wa5OGSbEjwfuA84DHgbuBDZW1c6BOacAtwOvr6rPLnMsQ1ySVmm5EF+xTqmq\n/UkuA25jtkPfWlU7k2ya3V1bgD8GngH8dZIA+6rq7LX7V5AkLcYnNqWGdYrayic2JWlMeSYuSS3n\nmbgkjSlDXGrM3acrdYkhLkkdZicuSS1nJy5JY8oQlxp24uoiQ1ySOsxOXJJazk5cksaUIS417MTV\nRYa4JHWYnbgktZyduCSNKUNcatiJq4sMcUnqMDtxSWo5O3FJGlOGuNSwE1cXGeKS1GF24pLUcnbi\nkjSmDHGpMTMzM+olSKtmiEuNHTt2jHoJ0qoZ4lJjcnJy1EuQVm3dqBcgjVK/35+/tXDz5s3z23u9\nHr1ebzSLklbBu1OkxoYNG7j11ltHvQzpIMvdnWKIS43169ezZ8+eUS9DOoi3GEpDeOyxx0a9BGnV\n7MR1TBvsxPft28fVV18N2ImrO6xTdEx7zWtew/bt2wHYs2cP69evB+Dcc8/lwx/+8CiXJs2zE5eG\n0PygjHoZ0kGWC3HrFB3TZmZm+MhHPjI/nqtQLrzwQi6//PIRrUoaniGuY9odd9zxuCc15z4//elP\nN8TVCUPdnZJkQ5J7k9yf5Iol5rwryQNJdiSZWttlSkfGrl272Lt3L3v37gWY/7xr164Rr0wazooh\nnuQ44N3A+cALgY1JTl8w5wLgeVX1AmATcN0RWKu05tavX8+6detYt272l9K5z3MXOKW2W/HCZpJz\ngKuq6oJmfCVQVXXNwJzrgO1VdVMz3gn0qupbC47lhU21SrLotSIAL3KqNQ73YZ+TgAcHxg8125ab\ns3uROZKkNeYTm5LUYcPcnbIbOGVgfHKzbeGc564wB1j+11epTfy7qi4YJsTvAp6f5FTgYeBiYOOC\nOTcDbwJuajr0Rxb24cCSnY4k6dCsGOJVtT/JZcBtzNYvW6tqZ5JNs7trS1XdkuQVSXYBjwKXHtll\nS5LgKD92L0laW17YlKQOM8TVKkn2J/l8ki80/3zLCvP/McnTltn/B0meNOz8Q1jvLyd5ZMGaX9bs\n+3GS9w3MPT7Jt5Pc3Ix/O8m1a7UWHZt8d4ra5tGqOnPYyVX1aytMuRz4e+CHQ84/FJ+sqlctsv1R\n4EVJTqiqHwG/yuOfpwCwz9Rh8UxcbXPQHUxJnta8u+cFzfiGJG9oPn8tyTOS/GRzlv2FJF9M8rok\nbwaeA2xPcvuC+acm+XKSLUm+lOTWJCc0c85KcndzVv1nSe5Z7ZoH3AK8svm8EXj/qv5rSCswxNU2\nP7GgmnhdVX2f2VtY/y7JRcBEVW1t5s+dyW4AdlfVz1XVGcCtVXUts88r9KrqvAXzAZ4PXFtVLwL2\nAK9ttr8X+N3mN4L9rHy2/IsL1vzTA9/rRmbfN3QCcAbwudX/J5GWZp2itvnBYnVKVd2e5DeBvwJe\nPLBr7iz4HuDPk7wD+FhVfWpgfxaZD/C1qpo7y/43YDLJeuApVXVns/0GDpxJL2WpOoWq+lKSSWbP\nwj/G8mft0qp5Jq5OyOzjkz/LbM/8zIX7q+oB4Exmw/ztSd42xGF/NPB5PwdOatY6aG8G3olVio4A\nQ1xts1SA/hHwZeAS4Pokxz/ui5KfAh6rqhuYDcy5s/nvA0vdjXLQ96qqPcD3k5zVbLr4MNY8t/29\nwOaq+vchjiWtinWK2uZJST7PbAAWcCuwDfgd4Kyq+kGSO4C3AZs50Fe/GHhnkh8D/wv8XrP9PcCt\nSX
Y3vfhgv71U1/1G4G+T7AfuYLYvX85LF6z57VX1obnjV9VuZt/JL605n9iUFkjy5Kp6tPl8BXBi\nVf3hiJclLcozcelgr0zyVmZ/Pv4DmB7paqRleCYuDSHJy4FrOFDBBPhqVb126a+SjjxDXJI6zLtT\nJKnDDHFJ6jBDXJI6zBCXpA4zxCWpw/4fn10SfexcgdUAAAAASUVORK5CYII=\n", 974 | "text/plain": [ 975 | "" 976 | ] 977 | }, 978 | "metadata": {}, 979 | "output_type": "display_data" 980 | } 981 | ], 982 | "source": [ 983 | "data.boxplot(column='Existing_EMI',return_type='axes')" 984 | ] 985 | }, 986 | { 987 | "cell_type": "code", 988 | "execution_count": 21, 989 | "metadata": { 990 | "collapsed": false 991 | }, 992 | "outputs": [ 993 | { 994 | "data": { 995 | "text/plain": [ 996 | "count 1.246260e+05\n", 997 | "mean 3.636342e+03\n", 998 | "std 3.369124e+04\n", 999 | "min 0.000000e+00\n", 1000 | "25% NaN\n", 1001 | "50% NaN\n", 1002 | "75% NaN\n", 1003 | "max 1.000000e+07\n", 1004 | "Name: Existing_EMI, dtype: float64" 1005 | ] 1006 | }, 1007 | "execution_count": 21, 1008 | "metadata": {}, 1009 | "output_type": "execute_result" 1010 | } 1011 | ], 1012 | "source": [ 1013 | "data['Existing_EMI'].describe()" 1014 | ] 1015 | }, 1016 | { 1017 | "cell_type": "code", 1018 | "execution_count": 22, 1019 | "metadata": { 1020 | "collapsed": false 1021 | }, 1022 | "outputs": [], 1023 | "source": [ 1024 | "#缺省值不多,用均值代替\n", 1025 | "data['Existing_EMI'].fillna(0, inplace=True)" 1026 | ] 1027 | }, 1028 | { 1029 | "cell_type": "markdown", 1030 | "metadata": {}, 1031 | "source": [ 1032 | "### Interest_Rate字段:" 1033 | ] 1034 | }, 1035 | { 1036 | "cell_type": "code", 1037 | "execution_count": 23, 1038 | "metadata": { 1039 | "collapsed": false 1040 | }, 1041 | "outputs": [ 1042 | { 1043 | "data": { 1044 | "text/plain": [ 1045 | "" 1046 | ] 1047 | }, 1048 | "execution_count": 23, 1049 | "metadata": {}, 1050 | "output_type": "execute_result" 1051 | }, 1052 | { 1053 | "data": { 1054 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAW0AAAEBCAYAAACzN/QDAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAEJhJREFUeJzt3W+MZXV9x/H3h8VQKGVnq8LSQp1gWqJJdbTSP5KWaauV\nWgPUKG3kwY7yQNuoiNaKTZtdTWMUGxrbxCeizkhqFUlTQaMQYS6JEKnCrlAgPJE1Me2u/7CiNI2B\nbx/M2WWYndm5d3Zm7jn3vF/JZc7v3HPu/d5l+XLmc8753VQVkqRuOGncBUiShmfTlqQOsWlLUofY\ntCWpQ2zaktQhNm1J6pChm3aSk5Lcl+TmZrwryW1JHklya5KdW1emJAlGO9K+Cnho2fga4CtVdT5w\nB/DezSxMknSsoZp2knOAVwPXL1t9KbDQLC8Al21uaZKklYY90v5H4N3A8tsnz6qqwwBVdQg4c5Nr\nkyStsG7TTvInwOGqOgDkOJt6P7wkbbGTh9jmQuCSJK8GTgV+IckNwKEkZ1XV4SS7ge+utnMSm7kk\nbUBVHXOgnFEmjEpyEfCuqrokybXAD6rqQ0neA+yqqmtW2aeclEpttG/fPvbt2zfuMqRVJVm1aZ/I\nddofBF6Z5BHgD5ux1BkHDx4cdwnSyIaJR46qqjuBO5vlHwKv2IqiJEmr845I9dbc3Ny4S5BGNlKm\nvaE3MNOWpJFtRaYtddpgMBh3CdLIbNqS1CHGI5LUQsYjkjQBbNrqLTNtdZFNW5I6xExbklrITFuS\nJoBNW71lpq0usmlLUoeYaUtSC5lpS9IEsGmrt8y01UXGI+qt5tfPcZchrcp4RJImgE1bkjpkpK8b\nk7ouyZpjoxJ1gU1bvbK8MZtpq4uMRySpQ2zaktQhNm311uLi4rhLkEZm05akDvHmGklqIW+ukaQJ\nYNNWb628ZlvqApu2JHXIuk07ySlJ7kmyP8mDST7QrN+b5DtJ7mseF299uZLUb0OdiExyWlU9kWQH\ncBfwLuAVwONVdd06+3oiUq1xvEjEv6dqk7VORA51G3tVPdEsnsLS0fljR153c8qTtoe3savrhsq0\nk5yUZD9wCBhU1UPNU29NciDJ9Ul2blmVkiRgyKZdVU9V1UuAc4DfS3IR8FHgvKqaYamZHzcmkSSd\nuJFm+auqHyf5IvCyqrpz2VMfA25Za7+5uTmmp6cBmJqaYmZmhtnZWeDpr3xy7Hi7x1XVqnoc93s8\nGAyYn58HONovV7PuicgkzwF+VlX/k+RU4FbgfcCDVXWo2eZq4IKqesMq+3siUq00GAyO/scjtc2J\nnIg8G1jI0mn3k4Abqur2JJ9KMgM8BRwE3ryZBUuSjuXcI5LUQs49IkkTwKat3jpyEkjqEpu2JHWI\nmbYktZCZtiRNAJu2estMW11k05akDjHTlqQWMtOWVvDrxtRFNm1J6hCbtiR1yEhTs0pdtzISWT72\n3Iu6wKatXvHrxtR1xiOS1CEeaatXjEfUdTZt9YrxiLrOeESSOsSmLUkdYtNWby0uLo67BGlkzj2i\n3jLTVps594gkTQCbtiR1iJf8qVe8TltdZ9NWr3idtrrOeESSOsSmLUkdYjyiXjHTVtfZtNUrZtrq\nOuMRSeqQdZt2klOS3JNkf5IHk3ygWb8ryW1JHklya5KdW1+udGKSHH2sNpbabqjb2JOcVlVPJNkB\n3AW8C7gE+EFVXZvkPcCuqrpmlX29jV2tZDyiNjuh29ir6olm8ZRmn8eAS4GFZv0CcNkm1ClJOo6h\nmnaSk5LsBw4Bg6p6CDirqg4DVNUh4MytK1OSBENePVJVTwEvSXIGcGuSWWDl75Vr/p45NzfH9PQ0\nAFNTU8zMzDA7OwvAYDAAcOx428eLi4utqsdxv8eDwYD5+XmAo/1yNSNPzZrk74D/Ba4EZqvqcJLd\nwGJVvWCV7c201Upm2mqzDWfaSZ5z5MqQJKcCrwT2AzcDc81me
4DPb1q1kqRVDROPnA0sZOmaqJOA\nG6rq9ibjvjHJm4BvA5dvYZ2SJPzmGvXM8a7H9u+p2mSteMTb2NUr3saurvM2dknqEJu2JHWI8Yh6\nxalZ1XU2bfWKmba6znhEkjrEI231ivGIus6mrV4xHlHXGY9IUofYtCWpQ2za6pUdO3as+nVjO3bs\nGHNl0nDMtNUrTz755NFlM211kUfaktQhNm1J6hDjEfWK12mr62za6hWv01bXGY9IUod4pK2Jcbxv\npdnMfTw61zjZtDUxRm2mCdh/1TXGI+qxwbgLkEZm05akDrFpq7f27p0ddwnSyLLVJ1WSlCduJGk0\nzSWpx5wp90hbvTUYDMZdgjQym7YkdYjxiCS1kPGIJE0Am7Z6a25uMO4SpJGt27STnJPkjiQPJnkg\nydua9XuTfCfJfc3j4q0vV9o8CwvjrkAa3bqZdpLdwO6qOpDkdOBe4FLgz4DHq+q6dfY301YreRu7\n2mytTHvduUeq6hBwqFn+SZKHgV8+8rqbWqUk6bhGyrSTTAMzwD3NqrcmOZDk+iQ7N7k2aYsNxl2A\nNLKhZ/lropGbgKuaI+6PAu+vqkry98B1wJWr7Ts3N8f09DQAU1NTzMzMMDs7Czx9g4Njx44d93k8\nGAyYn58HONovVzPUddpJTga+AHypqj6yyvPPA26pqhet8pyZtlpp376lh9RGa2XawzbtTwHfr6p3\nLlu3u8m7SXI1cEFVvWGVfW3akjSiDd9ck+RC4ArgD5LsX3Z537VJ7k9yALgIuHrTq5a20JFfTaUu\nGebqkbuAHas89eXNL0eSdDzOPSJJLeTcI5I0AWza6i3nHlEXGY+ot5IBVbPjLkNa1Qld8neCb2zT\nVis594jazExbkiaATVs9Nhh3AdLIbNqS1CE2bfXW3r2z4y5BGpknIiWphTwRKa3g3CPqIpu2JHWI\n8YgktZDxiCRNAJu2esu5R9RFxiPqLeceUZs594i0gnOPqM3MtCVpAti01WODcRcgjcymLUkdYtNW\nbzn3iLrIE5GS1EKeiJRWcO4RdZFNW5I6xHhEklrIeESSJoBNW73l3CPqIuMR9ZZzj6jNNhyPJDkn\nyR1JHkzyQJK3N+t3JbktySNJbk2ycysKl7bO7LgLkEa27pF2kt3A7qo6kOR04F7gUuCNwA+q6tok\n7wF2VdU1q+zvkbZayQmj1GYbPtKuqkNVdaBZ/gnwMHAOS417odlsAbhs88qVtsNg3AVIIxvpRGSS\naWAG+BpwVlUdhqXGDpy52cVJkp7p5GE3bKKRm4CrquonSVb+YrnmL5pzc3NMT08DMDU1xczMDLOz\ns8DTd6U5drx8/NrXzvLYY/D00fBs83Mzx7MkW/n6cPrpA265Zfx/no7bPx4MBszPzwMc7ZerGerq\nkSQnA18AvlRVH2nWPQzMVtXhJvderKoXrLKvmbZGNil586R8Dm2/E7255hPAQ0caduNmYK5Z3gN8\n/oQqlLbZkaMcqUvWjUeSXAhcATyQZD9LMcjfAB8CbkzyJuDbwOVbWagkyZtr1FKTEitMyufQ9nPu\nEUmaADZt9ZaZtrrIpi1JHWKmrVaalCx4Uj6Htp+ZtiRNAJu2estMW11k05akDjHTVitNShY8KZ9D\n289MW5ImgE1bvWWmrS4aempWaTsVgWN+MeyeWvZPaTOYaauVJiULnpTPoe1npi1JE8Cmrd4y01YX\n2bQlqUPMtNVKk5IFT8rn0PYz05akCWDTVm+ZaauLbNqS1CFm2mqlScmCJ+VzaPuZaUvSBLBpq7fM\ntNVFNm1J6hAzbbXSpGTBk/I5tP3MtCVpAti01Vtm2uoim7YkdYiZtlppUrLgSfkc2n4bzrSTfDzJ\n4ST3L1u3N8l3ktzXPC7e7IIlSccaJh75JPCqVdZfV1UvbR5f3uS6pC1npq0uWrdpV9VXgcdWeWoC\nvsFPkrrlRE5EvjXJgSTXJ
9m5aRVJ22R2dnbcJUgj22jT/ihwXlXNAIeA6zavJGlJ0v3Hrl3j/lPU\npDl5IztV1feWDT8G3HK87efm5piengZgamqKmZmZo0c5R3JFx46Xj6u2/v2SAYuLbPnnGQzG/+fp\nuP3jwWDA/Pw8wNF+uZqhLvlLMg3cUlW/3ox3V9WhZvlq4IKqesMa+3rJn1opGRz9n4PUNmtd8rdu\n007yaWAWeDZwGNgL/D4wAzwFHATeXFWH19jfpq1W8hpqtdmGm/YmvLFNW61k01abOWGUdIzBuAuQ\nRmbTVm/t2TPuCqTRGY9IUgsZj0jSBLBpq7eOXCMrdYlNW5I6xExbklrITFtaYd++cVcgjc4jbfWW\nt7GrzTzSlqQJ4JG2esvb2NVmHmlL0gSwaavHBuMuQBqZTVu95dwj6iIzbUlqITNtSZoANm31lnOP\nqIts2pLUIRv6NnapjZJj4r8t4TkajZNNWxPDZqo+MB5Rb5lpq4ts2pLUIV6nLUkt5HXakjQBbNrq\nLTNtdZFNW5I6xExbklrITFuSJsC6TTvJx5McTnL/snW7ktyW5JEktybZubVlSpvPTFtdNMyR9ieB\nV61Ydw3wlao6H7gDeO9mFyZttQMHDoy7BGlk6zbtqvoq8NiK1ZcCC83yAnDZJtclbbkf/ehH4y5B\nGtlGM+0zq+owQFUdAs7cvJIkSWvZrBORXh6izjl48OC4S5BGttFZ/g4nOauqDifZDXz3eBtv15SZ\n0qgWFhbW30hqkWGbdprHETcDc8CHgD3A59facbXrDCVJG7PuzTVJPg3MAs8GDgN7gX8HPgecC3wb\nuLyqPKsjSVtsy++IlCRtHu+IlKQOsWlrbJI8PsQ2VyX5uS2u48VJ/nidbfYk+W6S+5I8mOQvh3jd\ni5L8zuZVKtm0NV7DZHPvAE4b5UWTjPr3egZ49RDbfaaqXgr8LrA3yXPX2X4WePmItUjHZdPW2DVH\npItJPpfk4SQ3NOvfBvwSsJjk9mbdHyW5O8k3knw2yWnN+keTfDDJN4DXJTkvyZeSfD3JnUl+rdnu\n9UkeSLI/ySDJs4D3A5c3R9GvX6/eqvoh8C1gunnN1yT5WpJ7mzl5npvkecBbgHc0r3thkuckuSnJ\nPc3Dhq7RVZUPH2N5AD9ufl7E0lQJZ7N0aendwMub574F7GqWnw3cCZzajP8a+Ntm+VHgr5a99leA\n5zfLvwnc3izfD5zdLJ/R/NwD/NM6tR7dBvgV4BDwi81457LtrgQ+3CzvBd657Ll/Wfa5zgUeGve/\nAx/de2z05hpps/1HVf03QJIDLB3F3s0z7xH4beCFwF1ZumPrWc02R3y22f/nWYolPpen7+x6VvPz\nLmAhyY3Av41Y458nuQg4H3h3LR1xA5zbvN7Zzfs8usb+rwBesKym05OcVlVPjFiHesymrbb4v2XL\nT7L6380At1XVFWu8xk+bnycBj9VS/vwMVfUXSS4AXgPcm+SYbY7jM1X19iS/AdyY5BNV9VPgn4F/\nqKovNk197xr7B/itqvrZCO8pPYOZtsZpmLtlfwyc0Sx/DbgwyfMBkpyW5FdX7lBVjwOPJnnd0TdK\nXtT8PK+qvl5Ve1mafuFc4PFl77GuqrqXpbuCr2pWnQH8V7O8Z9mmK1/3tmX7kOTFw76ndIRNW+O0\n1tUjy9d/DPhyktur6vvAG4F/TfJNlqKR89d4rSuAK5McSPKfwCXN+g8nub/5Uo+7q+p+YBF44bAn\nIhvXAm9pToS+D7gpydeB7y3b5hbgT4+ciATeDrwsyTebmt485HtJR3lHpCR1iEfaktQhnoiUlkky\nx1LuvPxX0Luq6m3jqUh6JuMRSeoQ4xFJ6hCbtiR1iE1bkjrEpi1JHWLTlqQO+X+2nl4VRBTAYgAA\nAABJRU5ErkJggg==\n", 1055 | "text/plain": [ 1056 | "" 1057 | ] 1058 | 
}, 1059 | "metadata": {}, 1060 | "output_type": "display_data" 1061 | } 1062 | ], 1063 | "source": [ 1064 | "data.boxplot(column=['Interest_Rate'],return_type='axes')" 1065 | ] 1066 | }, 1067 | { 1068 | "cell_type": "code", 1069 | "execution_count": 24, 1070 | "metadata": { 1071 | "collapsed": false 1072 | }, 1073 | "outputs": [ 1074 | { 1075 | "name": "stdout", 1076 | "output_type": "stream", 1077 | "text": [ 1078 | " Interest_Rate Interest_Rate_Missing\n", 1079 | "0 NaN 1\n", 1080 | "1 13.25 0\n", 1081 | "2 NaN 1\n", 1082 | "3 NaN 1\n", 1083 | "4 NaN 1\n", 1084 | "5 13.99 0\n", 1085 | "6 NaN 1\n", 1086 | "7 NaN 1\n", 1087 | "8 14.85 0\n", 1088 | "9 18.25 0\n" 1089 | ] 1090 | } 1091 | ], 1092 | "source": [ 1093 | "# Too many values are missing, so build a 0/1 indicator column for whether Interest_Rate is missing.\n", 1094 | "data['Interest_Rate_Missing'] = data['Interest_Rate'].isnull().astype('int64')\n", 1095 | "print(data[['Interest_Rate','Interest_Rate_Missing']].head(10))" 1096 | ] 1097 | }, 1098 | { 1099 | "cell_type": "code", 1100 | "execution_count": 25, 1101 | "metadata": { 1102 | "collapsed": true 1103 | }, 1104 | "outputs": [], 1105 | "source": [ 1106 | "data.drop('Interest_Rate',axis=1,inplace=True)" 1107 | ] 1108 | }, 1109 | { 1110 | "cell_type": "markdown", 1111 | "metadata": {}, 1112 | "source": [ 1113 | "### Lead Creation Date字段" 1114 | ] 1115 | }, 1116 | { 1117 | "cell_type": "code", 1118 | "execution_count": 26, 1119 | "metadata": { 1120 | "collapsed": false 1121 | }, 1122 | "outputs": [ 1123 | { 1124 | "data": { 1125 | "text/html": [ 1126 | "
\n", 1127 | "\n", 1128 | " \n", 1129 | " \n", 1130 | " \n", 1131 | " \n", 1132 | " \n", 1133 | " \n", 1134 | " \n", 1135 | " \n", 1136 | " \n", 1137 | " \n", 1138 | " \n", 1139 | " \n", 1140 | " \n", 1141 | " \n", 1142 | " \n", 1143 | " \n", 1144 | " \n", 1145 | " \n", 1146 | " \n", 1147 | " \n", 1148 | " \n", 1149 | " \n", 1150 | " \n", 1151 | " \n", 1152 | " \n", 1153 | " \n", 1154 | " \n", 1155 | " \n", 1156 | " \n", 1157 | " \n", 1158 | " \n", 1159 | " \n", 1160 | " \n", 1161 | " \n", 1162 | " \n", 1163 | " \n", 1164 | " \n", 1165 | " \n", 1166 | " \n", 1167 | " \n", 1168 | " \n", 1169 | " \n", 1170 | " \n", 1171 | " \n", 1172 | " \n", 1173 | " \n", 1174 | " \n", 1175 | " \n", 1176 | " \n", 1177 | " \n", 1178 | " \n", 1179 | " \n", 1180 | " \n", 1181 | " \n", 1182 | " \n", 1183 | " \n", 1184 | " \n", 1185 | " \n", 1186 | " \n", 1187 | " \n", 1188 | " \n", 1189 | " \n", 1190 | " \n", 1191 | " \n", 1192 | " \n", 1193 | " \n", 1194 | " \n", 1195 | " \n", 1196 | " \n", 1197 | " \n", 1198 | " \n", 1199 | " \n", 1200 | " \n", 1201 | " \n", 1202 | " \n", 1203 | " \n", 1204 | " \n", 1205 | " \n", 1206 | " \n", 1207 | " \n", 1208 | " \n", 1209 | " \n", 1210 | " \n", 1211 | " \n", 1212 | " \n", 1213 | " \n", 1214 | " \n", 1215 | " \n", 1216 | " \n", 1217 | " \n", 1218 | " \n", 1219 | " \n", 1220 | " \n", 1221 | " \n", 1222 | " \n", 1223 | " \n", 1224 | " \n", 1225 | " \n", 1226 | " \n", 1227 | " \n", 1228 | " \n", 1229 | " \n", 1230 | " \n", 1231 | " \n", 1232 | " \n", 1233 | " \n", 1234 | " \n", 1235 | " \n", 1236 | " \n", 1237 | " \n", 1238 | " \n", 1239 | " \n", 1240 | " \n", 1241 | " \n", 1242 | " \n", 1243 | " \n", 1244 | " \n", 1245 | " \n", 1246 | " \n", 1247 | " \n", 1248 | " \n", 1249 | " \n", 1250 | " \n", 1251 | " \n", 1252 | " \n", 1253 | " \n", 1254 | " \n", 1255 | " \n", 1256 | " \n", 1257 | " \n", 1258 | " \n", 1259 | " \n", 1260 | " \n", 1261 | " \n", 1262 | " \n", 1263 | " \n", 1264 | " \n", 1265 | " \n", 1266 | " \n", 1267 | " \n", 1268 | " \n", 1269 | 
" \n", 1270 | " \n", 1271 | " \n", 1272 | " \n", 1273 | " \n", 1274 | " \n", 1275 | " \n", 1276 | "
Device_TypeDisbursedExisting_EMIFilled_FormGenderIDLoan_Amount_AppliedLoan_Amount_SubmittedLoan_Tenure_AppliedLoan_Tenure_Submitted...Salary_AccountSourceVar1Var2Var4Var5sourceAgeEMI_Loan_Submitted_MissingInterest_Rate_Missing
0Web-browser0.00.0NFemaleID000002C20300000.0NaN5.0NaN...HDFC BankS122HBXXG10train3711
1Web-browser0.00.0NMaleID000004E40200000.0200000.02.02.0...ICICI BankS122HBXAG313train3000
2Web-browser0.00.0NMaleID000007H20600000.0450000.04.04.0...State Bank of IndiaS143HBXXB10train3411
3Web-browser0.00.0NMaleID000008I301000000.0920000.05.05.0...State Bank of IndiaS143HBXXB310train2811
4Web-browser0.025000.0NMaleID000009J40500000.0500000.02.02.0...HDFC BankS134HBXXB317train3111
\n", 1277 | "

5 rows × 24 columns

\n", 1278 | "
" 1279 | ], 1280 | "text/plain": [ 1281 | " Device_Type Disbursed Existing_EMI Filled_Form Gender ID \\\n", 1282 | "0 Web-browser 0.0 0.0 N Female ID000002C20 \n", 1283 | "1 Web-browser 0.0 0.0 N Male ID000004E40 \n", 1284 | "2 Web-browser 0.0 0.0 N Male ID000007H20 \n", 1285 | "3 Web-browser 0.0 0.0 N Male ID000008I30 \n", 1286 | "4 Web-browser 0.0 25000.0 N Male ID000009J40 \n", 1287 | "\n", 1288 | " Loan_Amount_Applied Loan_Amount_Submitted Loan_Tenure_Applied \\\n", 1289 | "0 300000.0 NaN 5.0 \n", 1290 | "1 200000.0 200000.0 2.0 \n", 1291 | "2 600000.0 450000.0 4.0 \n", 1292 | "3 1000000.0 920000.0 5.0 \n", 1293 | "4 500000.0 500000.0 2.0 \n", 1294 | "\n", 1295 | " Loan_Tenure_Submitted ... Salary_Account Source \\\n", 1296 | "0 NaN ... HDFC Bank S122 \n", 1297 | "1 2.0 ... ICICI Bank S122 \n", 1298 | "2 4.0 ... State Bank of India S143 \n", 1299 | "3 5.0 ... State Bank of India S143 \n", 1300 | "4 2.0 ... HDFC Bank S134 \n", 1301 | "\n", 1302 | " Var1 Var2 Var4 Var5 source Age EMI_Loan_Submitted_Missing \\\n", 1303 | "0 HBXX G 1 0 train 37 1 \n", 1304 | "1 HBXA G 3 13 train 30 0 \n", 1305 | "2 HBXX B 1 0 train 34 1 \n", 1306 | "3 HBXX B 3 10 train 28 1 \n", 1307 | "4 HBXX B 3 17 train 31 1 \n", 1308 | "\n", 1309 | " Interest_Rate_Missing \n", 1310 | "0 1 \n", 1311 | "1 0 \n", 1312 | "2 1 \n", 1313 | "3 1 \n", 1314 | "4 1 \n", 1315 | "\n", 1316 | "[5 rows x 24 columns]" 1317 | ] 1318 | }, 1319 | "execution_count": 26, 1320 | "metadata": {}, 1321 | "output_type": "execute_result" 1322 | } 1323 | ], 1324 | "source": [ 1325 | "#不!要!了!,是的,不要了!!!\n", 1326 | "data.drop('Lead_Creation_Date',axis=1,inplace=True)\n", 1327 | "data.head()" 1328 | ] 1329 | }, 1330 | { 1331 | "cell_type": "markdown", 1332 | "metadata": {}, 1333 | "source": [ 1334 | "### Loan Amount and Tenure applied字段" 1335 | ] 1336 | }, 1337 | { 1338 | "cell_type": "code", 1339 | "execution_count": 27, 1340 | "metadata": { 1341 | "collapsed": true 1342 | }, 1343 | "outputs": [], 1344 | "source": [ 1345 | 
"#找中位数去填补缺省值(因为缺省的不多)\n", 1346 | "data['Loan_Amount_Applied'].fillna(data['Loan_Amount_Applied'].median(),inplace=True)\n", 1347 | "data['Loan_Tenure_Applied'].fillna(data['Loan_Tenure_Applied'].median(),inplace=True)" 1348 | ] 1349 | }, 1350 | { 1351 | "cell_type": "code", 1352 | "execution_count": 28, 1353 | "metadata": { 1354 | "collapsed": false 1355 | }, 1356 | "outputs": [ 1357 | { 1358 | "data": { 1359 | "text/html": [ 1360 | "
\n", 1361 | "\n", 1362 | " \n", 1363 | " \n", 1364 | " \n", 1365 | " \n", 1366 | " \n", 1367 | " \n", 1368 | " \n", 1369 | " \n", 1370 | " \n", 1371 | " \n", 1372 | " \n", 1373 | " \n", 1374 | " \n", 1375 | " \n", 1376 | " \n", 1377 | " \n", 1378 | " \n", 1379 | " \n", 1380 | " \n", 1381 | " \n", 1382 | " \n", 1383 | " \n", 1384 | " \n", 1385 | " \n", 1386 | " \n", 1387 | " \n", 1388 | " \n", 1389 | " \n", 1390 | " \n", 1391 | " \n", 1392 | " \n", 1393 | " \n", 1394 | " \n", 1395 | " \n", 1396 | " \n", 1397 | " \n", 1398 | " \n", 1399 | " \n", 1400 | " \n", 1401 | " \n", 1402 | " \n", 1403 | " \n", 1404 | " \n", 1405 | " \n", 1406 | " \n", 1407 | " \n", 1408 | " \n", 1409 | " \n", 1410 | " \n", 1411 | " \n", 1412 | " \n", 1413 | " \n", 1414 | " \n", 1415 | " \n", 1416 | " \n", 1417 | " \n", 1418 | " \n", 1419 | " \n", 1420 | " \n", 1421 | " \n", 1422 | " \n", 1423 | " \n", 1424 | " \n", 1425 | " \n", 1426 | " \n", 1427 | " \n", 1428 | " \n", 1429 | " \n", 1430 | " \n", 1431 | " \n", 1432 | " \n", 1433 | " \n", 1434 | " \n", 1435 | " \n", 1436 | " \n", 1437 | " \n", 1438 | " \n", 1439 | " \n", 1440 | " \n", 1441 | " \n", 1442 | " \n", 1443 | " \n", 1444 | " \n", 1445 | " \n", 1446 | " \n", 1447 | " \n", 1448 | " \n", 1449 | " \n", 1450 | " \n", 1451 | " \n", 1452 | " \n", 1453 | " \n", 1454 | " \n", 1455 | " \n", 1456 | " \n", 1457 | " \n", 1458 | " \n", 1459 | " \n", 1460 | " \n", 1461 | " \n", 1462 | " \n", 1463 | " \n", 1464 | " \n", 1465 | " \n", 1466 | " \n", 1467 | " \n", 1468 | " \n", 1469 | " \n", 1470 | " \n", 1471 | " \n", 1472 | " \n", 1473 | " \n", 1474 | " \n", 1475 | " \n", 1476 | " \n", 1477 | " \n", 1478 | " \n", 1479 | " \n", 1480 | " \n", 1481 | " \n", 1482 | " \n", 1483 | " \n", 1484 | " \n", 1485 | " \n", 1486 | " \n", 1487 | " \n", 1488 | " \n", 1489 | " \n", 1490 | " \n", 1491 | " \n", 1492 | " \n", 1493 | " \n", 1494 | " \n", 1495 | " \n", 1496 | " \n", 1497 | " \n", 1498 | " \n", 1499 | " \n", 1500 | " \n", 1501 | " \n", 1502 | " \n", 1503 | 
" \n", 1504 | " \n", 1505 | " \n", 1506 | " \n", 1507 | " \n", 1508 | " \n", 1509 | " \n", 1510 | "
Device_TypeDisbursedExisting_EMIFilled_FormGenderIDLoan_Amount_AppliedLoan_Amount_SubmittedLoan_Tenure_AppliedLoan_Tenure_Submitted...Salary_AccountSourceVar1Var2Var4Var5sourceAgeEMI_Loan_Submitted_MissingInterest_Rate_Missing
0Web-browser0.00.0NFemaleID000002C20300000.0NaN5.0NaN...HDFC BankS122HBXXG10train3711
1Web-browser0.00.0NMaleID000004E40200000.0200000.02.02.0...ICICI BankS122HBXAG313train3000
2Web-browser0.00.0NMaleID000007H20600000.0450000.04.04.0...State Bank of IndiaS143HBXXB10train3411
3Web-browser0.00.0NMaleID000008I301000000.0920000.05.05.0...State Bank of IndiaS143HBXXB310train2811
4Web-browser0.025000.0NMaleID000009J40500000.0500000.02.02.0...HDFC BankS134HBXXB317train3111
\n", 1511 | "

5 rows × 24 columns

\n", 1512 | "
" 1513 | ], 1514 | "text/plain": [ 1515 | " Device_Type Disbursed Existing_EMI Filled_Form Gender ID \\\n", 1516 | "0 Web-browser 0.0 0.0 N Female ID000002C20 \n", 1517 | "1 Web-browser 0.0 0.0 N Male ID000004E40 \n", 1518 | "2 Web-browser 0.0 0.0 N Male ID000007H20 \n", 1519 | "3 Web-browser 0.0 0.0 N Male ID000008I30 \n", 1520 | "4 Web-browser 0.0 25000.0 N Male ID000009J40 \n", 1521 | "\n", 1522 | " Loan_Amount_Applied Loan_Amount_Submitted Loan_Tenure_Applied \\\n", 1523 | "0 300000.0 NaN 5.0 \n", 1524 | "1 200000.0 200000.0 2.0 \n", 1525 | "2 600000.0 450000.0 4.0 \n", 1526 | "3 1000000.0 920000.0 5.0 \n", 1527 | "4 500000.0 500000.0 2.0 \n", 1528 | "\n", 1529 | " Loan_Tenure_Submitted ... Salary_Account Source \\\n", 1530 | "0 NaN ... HDFC Bank S122 \n", 1531 | "1 2.0 ... ICICI Bank S122 \n", 1532 | "2 4.0 ... State Bank of India S143 \n", 1533 | "3 5.0 ... State Bank of India S143 \n", 1534 | "4 2.0 ... HDFC Bank S134 \n", 1535 | "\n", 1536 | " Var1 Var2 Var4 Var5 source Age EMI_Loan_Submitted_Missing \\\n", 1537 | "0 HBXX G 1 0 train 37 1 \n", 1538 | "1 HBXA G 3 13 train 30 0 \n", 1539 | "2 HBXX B 1 0 train 34 1 \n", 1540 | "3 HBXX B 3 10 train 28 1 \n", 1541 | "4 HBXX B 3 17 train 31 1 \n", 1542 | "\n", 1543 | " Interest_Rate_Missing \n", 1544 | "0 1 \n", 1545 | "1 0 \n", 1546 | "2 1 \n", 1547 | "3 1 \n", 1548 | "4 1 \n", 1549 | "\n", 1550 | "[5 rows x 24 columns]" 1551 | ] 1552 | }, 1553 | "execution_count": 28, 1554 | "metadata": {}, 1555 | "output_type": "execute_result" 1556 | } 1557 | ], 1558 | "source": [ 1559 | "data.head()" 1560 | ] 1561 | }, 1562 | { 1563 | "cell_type": "markdown", 1564 | "metadata": {}, 1565 | "source": [ 1566 | "### Loan Amount and Tenure selected" 1567 | ] 1568 | }, 1569 | { 1570 | "cell_type": "code", 1571 | "execution_count": 29, 1572 | "metadata": { 1573 | "collapsed": true 1574 | }, 1575 | "outputs": [], 1576 | "source": [ 1577 | "# 缺省值太多。。。是否缺省。。。\n", 1578 | "data['Loan_Amount_Submitted_Missing'] = 
data['Loan_Amount_Submitted'].apply(lambda x: 1 if pd.isnull(x) else 0)\n", 1579 | "data['Loan_Tenure_Submitted_Missing'] = data['Loan_Tenure_Submitted'].apply(lambda x: 1 if pd.isnull(x) else 0)" 1580 | ] 1581 | }, 1582 | { 1583 | "cell_type": "code", 1584 | "execution_count": 45, 1585 | "metadata": { 1586 | "collapsed": false 1587 | }, 1588 | "outputs": [ 1589 | { 1590 | "data": { 1591 | "text/html": [ 1592 | "
\n", 1593 | "\n", 1594 | " \n", 1595 | " \n", 1596 | " \n", 1597 | " \n", 1598 | " \n", 1599 | " \n", 1600 | " \n", 1601 | " \n", 1602 | " \n", 1603 | " \n", 1604 | " \n", 1605 | " \n", 1606 | " \n", 1607 | " \n", 1608 | " \n", 1609 | " \n", 1610 | " \n", 1611 | " \n", 1612 | " \n", 1613 | " \n", 1614 | " \n", 1615 | " \n", 1616 | " \n", 1617 | " \n", 1618 | " \n", 1619 | " \n", 1620 | " \n", 1621 | " \n", 1622 | " \n", 1623 | " \n", 1624 | " \n", 1625 | " \n", 1626 | " \n", 1627 | " \n", 1628 | " \n", 1629 | " \n", 1630 | " \n", 1631 | " \n", 1632 | " \n", 1633 | " \n", 1634 | " \n", 1635 | " \n", 1636 | " \n", 1637 | " \n", 1638 | " \n", 1639 | " \n", 1640 | " \n", 1641 | " \n", 1642 | " \n", 1643 | " \n", 1644 | " \n", 1645 | " \n", 1646 | " \n", 1647 | " \n", 1648 | " \n", 1649 | " \n", 1650 | " \n", 1651 | " \n", 1652 | " \n", 1653 | " \n", 1654 | " \n", 1655 | " \n", 1656 | " \n", 1657 | " \n", 1658 | " \n", 1659 | " \n", 1660 | " \n", 1661 | " \n", 1662 | " \n", 1663 | " \n", 1664 | " \n", 1665 | " \n", 1666 | " \n", 1667 | " \n", 1668 | " \n", 1669 | " \n", 1670 | " \n", 1671 | " \n", 1672 | " \n", 1673 | " \n", 1674 | " \n", 1675 | " \n", 1676 | " \n", 1677 | " \n", 1678 | " \n", 1679 | " \n", 1680 | " \n", 1681 | " \n", 1682 | " \n", 1683 | " \n", 1684 | " \n", 1685 | " \n", 1686 | " \n", 1687 | " \n", 1688 | " \n", 1689 | " \n", 1690 | " \n", 1691 | " \n", 1692 | " \n", 1693 | " \n", 1694 | " \n", 1695 | " \n", 1696 | " \n", 1697 | " \n", 1698 | " \n", 1699 | " \n", 1700 | " \n", 1701 | " \n", 1702 | " \n", 1703 | " \n", 1704 | " \n", 1705 | " \n", 1706 | " \n", 1707 | " \n", 1708 | " \n", 1709 | " \n", 1710 | " \n", 1711 | " \n", 1712 | " \n", 1713 | " \n", 1714 | " \n", 1715 | " \n", 1716 | " \n", 1717 | " \n", 1718 | " \n", 1719 | " \n", 1720 | " \n", 1721 | " \n", 1722 | " \n", 1723 | " \n", 1724 | " \n", 1725 | " \n", 1726 | " \n", 1727 | " \n", 1728 | " \n", 1729 | " \n", 1730 | " \n", 1731 | " \n", 1732 | " \n", 1733 | " \n", 1734 | " \n", 1735 | 
" \n", 1736 | " \n", 1737 | " \n", 1738 | " \n", 1739 | " \n", 1740 | " \n", 1741 | " \n", 1742 | "
Device_TypeDisbursedEmployer_NameExisting_EMIFilled_FormGenderIDLoan_Amount_AppliedLoan_Amount_SubmittedLoan_Tenure_Applied...Var1Var2Var4Var5sourceAgeEMI_Loan_Submitted_MissingInterest_Rate_MissingLoan_Amount_Submitted_MissingLoan_Tenure_Submitted_Missing
0Web-browser0.0CYBOSOL0.0NFemaleID000002C20300000.0NaN5.0...HBXXG10train371111
1Web-browser0.0TATA CONSULTANCY SERVICES LTD (TCS)0.0NMaleID000004E40200000.0200000.02.0...HBXAG313train300000
2Web-browser0.0ALCHEMIST HOSPITALS LTD0.0NMaleID000007H20600000.0450000.04.0...HBXXB10train341100
3Web-browser0.0BIHAR GOVERNMENT0.0NMaleID000008I301000000.0920000.05.0...HBXXB310train281100
4Web-browser0.0GLOBAL EDGE SOFTWARE25000.0NMaleID000009J40500000.0500000.02.0...HBXXB317train311100
\n", 1743 | "

5 rows × 27 columns

\n", 1744 | "
" 1745 | ], 1746 | "text/plain": [ 1747 | " Device_Type Disbursed Employer_Name Existing_EMI \\\n", 1748 | "0 Web-browser 0.0 CYBOSOL 0.0 \n", 1749 | "1 Web-browser 0.0 TATA CONSULTANCY SERVICES LTD (TCS) 0.0 \n", 1750 | "2 Web-browser 0.0 ALCHEMIST HOSPITALS LTD 0.0 \n", 1751 | "3 Web-browser 0.0 BIHAR GOVERNMENT 0.0 \n", 1752 | "4 Web-browser 0.0 GLOBAL EDGE SOFTWARE 25000.0 \n", 1753 | "\n", 1754 | " Filled_Form Gender ID Loan_Amount_Applied \\\n", 1755 | "0 N Female ID000002C20 300000.0 \n", 1756 | "1 N Male ID000004E40 200000.0 \n", 1757 | "2 N Male ID000007H20 600000.0 \n", 1758 | "3 N Male ID000008I30 1000000.0 \n", 1759 | "4 N Male ID000009J40 500000.0 \n", 1760 | "\n", 1761 | " Loan_Amount_Submitted Loan_Tenure_Applied ... \\\n", 1762 | "0 NaN 5.0 ... \n", 1763 | "1 200000.0 2.0 ... \n", 1764 | "2 450000.0 4.0 ... \n", 1765 | "3 920000.0 5.0 ... \n", 1766 | "4 500000.0 2.0 ... \n", 1767 | "\n", 1768 | " Var1 Var2 Var4 Var5 source Age EMI_Loan_Submitted_Missing \\\n", 1769 | "0 HBXX G 1 0 train 37 1 \n", 1770 | "1 HBXA G 3 13 train 30 0 \n", 1771 | "2 HBXX B 1 0 train 34 1 \n", 1772 | "3 HBXX B 3 10 train 28 1 \n", 1773 | "4 HBXX B 3 17 train 31 1 \n", 1774 | "\n", 1775 | " Interest_Rate_Missing Loan_Amount_Submitted_Missing \\\n", 1776 | "0 1 1 \n", 1777 | "1 0 0 \n", 1778 | "2 1 0 \n", 1779 | "3 1 0 \n", 1780 | "4 1 0 \n", 1781 | "\n", 1782 | " Loan_Tenure_Submitted_Missing \n", 1783 | "0 1 \n", 1784 | "1 0 \n", 1785 | "2 0 \n", 1786 | "3 0 \n", 1787 | "4 0 \n", 1788 | "\n", 1789 | "[5 rows x 27 columns]" 1790 | ] 1791 | }, 1792 | "execution_count": 45, 1793 | "metadata": {}, 1794 | "output_type": "execute_result" 1795 | } 1796 | ], 1797 | "source": [ 1798 | "data.head()" 1799 | ] 1800 | }, 1801 | { 1802 | "cell_type": "code", 1803 | "execution_count": 30, 1804 | "metadata": { 1805 | "collapsed": true 1806 | }, 1807 | "outputs": [], 1808 | "source": [ 1809 | "#原来的字段就没用了\n", 1810 | 
"data.drop(['Loan_Amount_Submitted','Loan_Tenure_Submitted'],axis=1,inplace=True)" 1811 | ] 1812 | }, 1813 | { 1814 | "cell_type": "markdown", 1815 | "metadata": {}, 1816 | "source": [ 1817 | "### LoggedIn" 1818 | ] 1819 | }, 1820 | { 1821 | "cell_type": "code", 1822 | "execution_count": 31, 1823 | "metadata": { 1824 | "collapsed": true 1825 | }, 1826 | "outputs": [], 1827 | "source": [ 1828 | "#没想好怎么用。。。不要了。。。\n", 1829 | "data.drop('LoggedIn',axis=1,inplace=True)" 1830 | ] 1831 | }, 1832 | { 1833 | "cell_type": "markdown", 1834 | "metadata": {}, 1835 | "source": [ 1836 | "### salary account" 1837 | ] 1838 | }, 1839 | { 1840 | "cell_type": "code", 1841 | "execution_count": 32, 1842 | "metadata": { 1843 | "collapsed": true 1844 | }, 1845 | "outputs": [], 1846 | "source": [ 1847 | "# 可能对接多个银行,所以也不要了\n", 1848 | "data.drop('Salary_Account',axis=1,inplace=True)" 1849 | ] 1850 | }, 1851 | { 1852 | "cell_type": "markdown", 1853 | "metadata": {}, 1854 | "source": [ 1855 | "### Processing_Fee" 1856 | ] 1857 | }, 1858 | { 1859 | "cell_type": "code", 1860 | "execution_count": 33, 1861 | "metadata": { 1862 | "collapsed": true 1863 | }, 1864 | "outputs": [], 1865 | "source": [ 1866 | "#和之前一样的处理,有或者没有\n", 1867 | "data['Processing_Fee_Missing'] = data['Processing_Fee'].apply(lambda x: 1 if pd.isnull(x) else 0)\n", 1868 | "#旧的字段不要了\n", 1869 | "data.drop('Processing_Fee',axis=1,inplace=True)" 1870 | ] 1871 | }, 1872 | { 1873 | "cell_type": "markdown", 1874 | "metadata": {}, 1875 | "source": [ 1876 | "### Source" 1877 | ] 1878 | }, 1879 | { 1880 | "cell_type": "code", 1881 | "execution_count": 34, 1882 | "metadata": { 1883 | "collapsed": false 1884 | }, 1885 | "outputs": [ 1886 | { 1887 | "data": { 1888 | "text/plain": [ 1889 | "S122 55249\n", 1890 | "S133 42900\n", 1891 | "others 26588\n", 1892 | "Name: Source, dtype: int64" 1893 | ] 1894 | }, 1895 | "execution_count": 34, 1896 | "metadata": {}, 1897 | "output_type": "execute_result" 1898 | } 1899 | ], 1900 | "source": [ 1901 | 
"data['Source'] = data['Source'].apply(lambda x: 'others' if x not in ['S122','S133'] else x)\n", 1902 | "data['Source'].value_counts()" 1903 | ] 1904 | }, 1905 | { 1906 | "cell_type": "markdown", 1907 | "metadata": {}, 1908 | "source": [ 1909 | "## 最终的数据样式" 1910 | ] 1911 | }, 1912 | { 1913 | "cell_type": "code", 1914 | "execution_count": 35, 1915 | "metadata": { 1916 | "collapsed": false 1917 | }, 1918 | "outputs": [ 1919 | { 1920 | "data": { 1921 | "text/html": [ 1922 | "
\n", 1923 | "\n", 1924 | " \n", 1925 | " \n", 1926 | " \n", 1927 | " \n", 1928 | " \n", 1929 | " \n", 1930 | " \n", 1931 | " \n", 1932 | " \n", 1933 | " \n", 1934 | " \n", 1935 | " \n", 1936 | " \n", 1937 | " \n", 1938 | " \n", 1939 | " \n", 1940 | " \n", 1941 | " \n", 1942 | " \n", 1943 | " \n", 1944 | " \n", 1945 | " \n", 1946 | " \n", 1947 | " \n", 1948 | " \n", 1949 | " \n", 1950 | " \n", 1951 | " \n", 1952 | " \n", 1953 | " \n", 1954 | " \n", 1955 | " \n", 1956 | " \n", 1957 | " \n", 1958 | " \n", 1959 | " \n", 1960 | " \n", 1961 | " \n", 1962 | " \n", 1963 | " \n", 1964 | " \n", 1965 | " \n", 1966 | " \n", 1967 | " \n", 1968 | " \n", 1969 | " \n", 1970 | " \n", 1971 | " \n", 1972 | " \n", 1973 | " \n", 1974 | " \n", 1975 | " \n", 1976 | " \n", 1977 | " \n", 1978 | " \n", 1979 | " \n", 1980 | " \n", 1981 | " \n", 1982 | " \n", 1983 | " \n", 1984 | " \n", 1985 | " \n", 1986 | " \n", 1987 | " \n", 1988 | " \n", 1989 | " \n", 1990 | " \n", 1991 | " \n", 1992 | " \n", 1993 | " \n", 1994 | " \n", 1995 | " \n", 1996 | " \n", 1997 | " \n", 1998 | " \n", 1999 | " \n", 2000 | " \n", 2001 | " \n", 2002 | " \n", 2003 | " \n", 2004 | " \n", 2005 | " \n", 2006 | " \n", 2007 | " \n", 2008 | " \n", 2009 | " \n", 2010 | " \n", 2011 | " \n", 2012 | " \n", 2013 | " \n", 2014 | " \n", 2015 | " \n", 2016 | " \n", 2017 | " \n", 2018 | " \n", 2019 | " \n", 2020 | " \n", 2021 | " \n", 2022 | " \n", 2023 | " \n", 2024 | " \n", 2025 | " \n", 2026 | " \n", 2027 | " \n", 2028 | " \n", 2029 | " \n", 2030 | " \n", 2031 | " \n", 2032 | " \n", 2033 | " \n", 2034 | " \n", 2035 | " \n", 2036 | " \n", 2037 | " \n", 2038 | " \n", 2039 | " \n", 2040 | " \n", 2041 | " \n", 2042 | " \n", 2043 | " \n", 2044 | " \n", 2045 | " \n", 2046 | " \n", 2047 | " \n", 2048 | " \n", 2049 | " \n", 2050 | " \n", 2051 | " \n", 2052 | " \n", 2053 | " \n", 2054 | " \n", 2055 | " \n", 2056 | " \n", 2057 | " \n", 2058 | " \n", 2059 | " \n", 2060 | " \n", 2061 | " \n", 2062 | " \n", 2063 | " \n", 2064 | " \n", 2065 | 
" \n", 2066 | " \n", 2067 | " \n", 2068 | " \n", 2069 | " \n", 2070 | " \n", 2071 | " \n", 2072 | "
Device_TypeDisbursedExisting_EMIFilled_FormGenderIDLoan_Amount_AppliedLoan_Tenure_AppliedMobile_VerifiedMonthly_Income...Var2Var4Var5sourceAgeEMI_Loan_Submitted_MissingInterest_Rate_MissingLoan_Amount_Submitted_MissingLoan_Tenure_Submitted_MissingProcessing_Fee_Missing
0Web-browser0.00.0NFemaleID000002C20300000.05.0N20000...G10train3711111
1Web-browser0.00.0NMaleID000004E40200000.02.0Y35000...G313train3000001
2Web-browser0.00.0NMaleID000007H20600000.04.0Y22500...B10train3411001
3Web-browser0.00.0NMaleID000008I301000000.05.0Y35000...B310train2811001
4Web-browser0.025000.0NMaleID000009J40500000.02.0Y100000...B317train3111001
\n", 2073 | "

5 rows × 22 columns

\n", 2074 | "
" 2075 | ], 2076 | "text/plain": [ 2077 | " Device_Type Disbursed Existing_EMI Filled_Form Gender ID \\\n", 2078 | "0 Web-browser 0.0 0.0 N Female ID000002C20 \n", 2079 | "1 Web-browser 0.0 0.0 N Male ID000004E40 \n", 2080 | "2 Web-browser 0.0 0.0 N Male ID000007H20 \n", 2081 | "3 Web-browser 0.0 0.0 N Male ID000008I30 \n", 2082 | "4 Web-browser 0.0 25000.0 N Male ID000009J40 \n", 2083 | "\n", 2084 | " Loan_Amount_Applied Loan_Tenure_Applied Mobile_Verified Monthly_Income \\\n", 2085 | "0 300000.0 5.0 N 20000 \n", 2086 | "1 200000.0 2.0 Y 35000 \n", 2087 | "2 600000.0 4.0 Y 22500 \n", 2088 | "3 1000000.0 5.0 Y 35000 \n", 2089 | "4 500000.0 2.0 Y 100000 \n", 2090 | "\n", 2091 | " ... Var2 Var4 Var5 source Age \\\n", 2092 | "0 ... G 1 0 train 37 \n", 2093 | "1 ... G 3 13 train 30 \n", 2094 | "2 ... B 1 0 train 34 \n", 2095 | "3 ... B 3 10 train 28 \n", 2096 | "4 ... B 3 17 train 31 \n", 2097 | "\n", 2098 | " EMI_Loan_Submitted_Missing Interest_Rate_Missing \\\n", 2099 | "0 1 1 \n", 2100 | "1 0 0 \n", 2101 | "2 1 1 \n", 2102 | "3 1 1 \n", 2103 | "4 1 1 \n", 2104 | "\n", 2105 | " Loan_Amount_Submitted_Missing Loan_Tenure_Submitted_Missing \\\n", 2106 | "0 1 1 \n", 2107 | "1 0 0 \n", 2108 | "2 0 0 \n", 2109 | "3 0 0 \n", 2110 | "4 0 0 \n", 2111 | "\n", 2112 | " Processing_Fee_Missing \n", 2113 | "0 1 \n", 2114 | "1 1 \n", 2115 | "2 1 \n", 2116 | "3 1 \n", 2117 | "4 1 \n", 2118 | "\n", 2119 | "[5 rows x 22 columns]" 2120 | ] 2121 | }, 2122 | "execution_count": 35, 2123 | "metadata": {}, 2124 | "output_type": "execute_result" 2125 | } 2126 | ], 2127 | "source": [ 2128 | "data.head()" 2129 | ] 2130 | }, 2131 | { 2132 | "cell_type": "code", 2133 | "execution_count": 36, 2134 | "metadata": { 2135 | "collapsed": false 2136 | }, 2137 | "outputs": [ 2138 | { 2139 | "data": { 2140 | "text/html": [ 2141 | "
\n", 2142 | "\n", 2143 | " \n", 2144 | " \n", 2145 | " \n", 2146 | " \n", 2147 | " \n", 2148 | " \n", 2149 | " \n", 2150 | " \n", 2151 | " \n", 2152 | " \n", 2153 | " \n", 2154 | " \n", 2155 | " \n", 2156 | " \n", 2157 | " \n", 2158 | " \n", 2159 | " \n", 2160 | " \n", 2161 | " \n", 2162 | " \n", 2163 | " \n", 2164 | " \n", 2165 | " \n", 2166 | " \n", 2167 | " \n", 2168 | " \n", 2169 | " \n", 2170 | " \n", 2171 | " \n", 2172 | " \n", 2173 | " \n", 2174 | " \n", 2175 | " \n", 2176 | " \n", 2177 | " \n", 2178 | " \n", 2179 | " \n", 2180 | " \n", 2181 | " \n", 2182 | " \n", 2183 | " \n", 2184 | " \n", 2185 | " \n", 2186 | " \n", 2187 | " \n", 2188 | " \n", 2189 | " \n", 2190 | " \n", 2191 | " \n", 2192 | " \n", 2193 | " \n", 2194 | " \n", 2195 | " \n", 2196 | " \n", 2197 | " \n", 2198 | " \n", 2199 | " \n", 2200 | " \n", 2201 | " \n", 2202 | " \n", 2203 | " \n", 2204 | " \n", 2205 | " \n", 2206 | " \n", 2207 | " \n", 2208 | " \n", 2209 | " \n", 2210 | " \n", 2211 | " \n", 2212 | " \n", 2213 | " \n", 2214 | " \n", 2215 | " \n", 2216 | " \n", 2217 | " \n", 2218 | " \n", 2219 | " \n", 2220 | " \n", 2221 | " \n", 2222 | " \n", 2223 | " \n", 2224 | " \n", 2225 | " \n", 2226 | " \n", 2227 | " \n", 2228 | " \n", 2229 | " \n", 2230 | " \n", 2231 | " \n", 2232 | " \n", 2233 | " \n", 2234 | " \n", 2235 | " \n", 2236 | " \n", 2237 | " \n", 2238 | " \n", 2239 | " \n", 2240 | " \n", 2241 | " \n", 2242 | " \n", 2243 | " \n", 2244 | " \n", 2245 | " \n", 2246 | " \n", 2247 | " \n", 2248 | " \n", 2249 | " \n", 2250 | " \n", 2251 | " \n", 2252 | " \n", 2253 | " \n", 2254 | " \n", 2255 | " \n", 2256 | " \n", 2257 | " \n", 2258 | " \n", 2259 | " \n", 2260 | " \n", 2261 | " \n", 2262 | " \n", 2263 | " \n", 2264 | " \n", 2265 | " \n", 2266 | " \n", 2267 | " \n", 2268 | " \n", 2269 | " \n", 2270 | " \n", 2271 | " \n", 2272 | " \n", 2273 | " \n", 2274 | " \n", 2275 | " \n", 2276 | " \n", 2277 | " \n", 2278 | " \n", 2279 | " \n", 2280 | " \n", 2281 | " \n", 2282 | " \n", 2283 | " \n", 2284 | 
" \n", 2285 | " \n", 2286 | " \n", 2287 | " \n", 2288 | " \n", 2289 | " \n", 2290 | " \n", 2291 | "
DisbursedExisting_EMILoan_Amount_AppliedLoan_Tenure_AppliedMonthly_IncomeVar4Var5AgeEMI_Loan_Submitted_MissingInterest_Rate_MissingLoan_Amount_Submitted_MissingLoan_Tenure_Submitted_MissingProcessing_Fee_Missing
count87020.0000001.247370e+051.247370e+05124737.0000001.247370e+05124737.000000124737.000000124737.000000124737.000000124737.000000124737.000000124737.000000124737.000000
mean0.0146293.633107e+032.298744e+052.1380755.309073e+042.9505604.96477430.9069960.6806400.6806400.3971160.3971160.684208
std0.1200623.367642e+043.539938e+052.0148741.823394e+061.6952615.6697847.1378600.4662310.4662310.4893020.4893020.464833
min0.0000000.000000e+000.000000e+000.0000000.000000e+000.0000000.00000018.0000000.0000000.0000000.0000000.0000000.000000
25%NaN0.000000e+000.000000e+000.0000001.650000e+041.0000000.00000026.0000000.0000000.0000000.0000000.0000000.000000
50%NaN0.000000e+001.000000e+052.0000002.500000e+043.0000002.00000029.0000001.0000001.0000000.0000000.0000001.000000
75%NaN3.500000e+033.000000e+054.0000004.000000e+045.00000011.00000034.0000001.0000001.0000001.0000001.0000001.000000
max1.0000001.000000e+071.500000e+0710.0000004.445544e+087.00000018.000000100.0000001.0000001.0000001.0000001.0000001.000000
\n", 2292 | "
" 2293 | ], 2294 | "text/plain": [ 2295 | " Disbursed Existing_EMI Loan_Amount_Applied Loan_Tenure_Applied \\\n", 2296 | "count 87020.000000 1.247370e+05 1.247370e+05 124737.000000 \n", 2297 | "mean 0.014629 3.633107e+03 2.298744e+05 2.138075 \n", 2298 | "std 0.120062 3.367642e+04 3.539938e+05 2.014874 \n", 2299 | "min 0.000000 0.000000e+00 0.000000e+00 0.000000 \n", 2300 | "25% NaN 0.000000e+00 0.000000e+00 0.000000 \n", 2301 | "50% NaN 0.000000e+00 1.000000e+05 2.000000 \n", 2302 | "75% NaN 3.500000e+03 3.000000e+05 4.000000 \n", 2303 | "max 1.000000 1.000000e+07 1.500000e+07 10.000000 \n", 2304 | "\n", 2305 | " Monthly_Income Var4 Var5 Age \\\n", 2306 | "count 1.247370e+05 124737.000000 124737.000000 124737.000000 \n", 2307 | "mean 5.309073e+04 2.950560 4.964774 30.906996 \n", 2308 | "std 1.823394e+06 1.695261 5.669784 7.137860 \n", 2309 | "min 0.000000e+00 0.000000 0.000000 18.000000 \n", 2310 | "25% 1.650000e+04 1.000000 0.000000 26.000000 \n", 2311 | "50% 2.500000e+04 3.000000 2.000000 29.000000 \n", 2312 | "75% 4.000000e+04 5.000000 11.000000 34.000000 \n", 2313 | "max 4.445544e+08 7.000000 18.000000 100.000000 \n", 2314 | "\n", 2315 | " EMI_Loan_Submitted_Missing Interest_Rate_Missing \\\n", 2316 | "count 124737.000000 124737.000000 \n", 2317 | "mean 0.680640 0.680640 \n", 2318 | "std 0.466231 0.466231 \n", 2319 | "min 0.000000 0.000000 \n", 2320 | "25% 0.000000 0.000000 \n", 2321 | "50% 1.000000 1.000000 \n", 2322 | "75% 1.000000 1.000000 \n", 2323 | "max 1.000000 1.000000 \n", 2324 | "\n", 2325 | " Loan_Amount_Submitted_Missing Loan_Tenure_Submitted_Missing \\\n", 2326 | "count 124737.000000 124737.000000 \n", 2327 | "mean 0.397116 0.397116 \n", 2328 | "std 0.489302 0.489302 \n", 2329 | "min 0.000000 0.000000 \n", 2330 | "25% 0.000000 0.000000 \n", 2331 | "50% 0.000000 0.000000 \n", 2332 | "75% 1.000000 1.000000 \n", 2333 | "max 1.000000 1.000000 \n", 2334 | "\n", 2335 | " Processing_Fee_Missing \n", 2336 | "count 124737.000000 \n", 2337 | "mean 0.684208 
\n", 2338 | "std 0.464833 \n", 2339 | "min 0.000000 \n", 2340 | "25% 0.000000 \n", 2341 | "50% 1.000000 \n", 2342 | "75% 1.000000 \n", 2343 | "max 1.000000 " 2344 | ] 2345 | }, 2346 | "execution_count": 36, 2347 | "metadata": {}, 2348 | "output_type": "execute_result" 2349 | } 2350 | ], 2351 | "source": [ 2352 | "data.describe()" 2353 | ] 2354 | }, 2355 | { 2356 | "cell_type": "code", 2357 | "execution_count": 37, 2358 | "metadata": { 2359 | "collapsed": false 2360 | }, 2361 | "outputs": [ 2362 | { 2363 | "data": { 2364 | "text/plain": [ 2365 | "Device_Type 0\n", 2366 | "Disbursed 37717\n", 2367 | "Existing_EMI 0\n", 2368 | "Filled_Form 0\n", 2369 | "Gender 0\n", 2370 | "ID 0\n", 2371 | "Loan_Amount_Applied 0\n", 2372 | "Loan_Tenure_Applied 0\n", 2373 | "Mobile_Verified 0\n", 2374 | "Monthly_Income 0\n", 2375 | "Source 0\n", 2376 | "Var1 0\n", 2377 | "Var2 0\n", 2378 | "Var4 0\n", 2379 | "Var5 0\n", 2380 | "source 0\n", 2381 | "Age 0\n", 2382 | "EMI_Loan_Submitted_Missing 0\n", 2383 | "Interest_Rate_Missing 0\n", 2384 | "Loan_Amount_Submitted_Missing 0\n", 2385 | "Loan_Tenure_Submitted_Missing 0\n", 2386 | "Processing_Fee_Missing 0\n", 2387 | "dtype: int64" 2388 | ] 2389 | }, 2390 | "execution_count": 37, 2391 | "metadata": {}, 2392 | "output_type": "execute_result" 2393 | } 2394 | ], 2395 | "source": [ 2396 | "data.apply(lambda x: sum(x.isnull()))" 2397 | ] 2398 | }, 2399 | { 2400 | "cell_type": "code", 2401 | "execution_count": 38, 2402 | "metadata": { 2403 | "collapsed": false 2404 | }, 2405 | "outputs": [ 2406 | { 2407 | "data": { 2408 | "text/plain": [ 2409 | "Device_Type object\n", 2410 | "Disbursed float64\n", 2411 | "Existing_EMI float64\n", 2412 | "Filled_Form object\n", 2413 | "Gender object\n", 2414 | "ID object\n", 2415 | "Loan_Amount_Applied float64\n", 2416 | "Loan_Tenure_Applied float64\n", 2417 | "Mobile_Verified object\n", 2418 | "Monthly_Income int64\n", 2419 | "Source object\n", 2420 | "Var1 object\n", 2421 | "Var2 object\n", 2422 | "Var4 
int64\n", 2423 | "Var5 int64\n", 2424 | "source object\n", 2425 | "Age int64\n", 2426 | "EMI_Loan_Submitted_Missing int64\n", 2427 | "Interest_Rate_Missing int64\n", 2428 | "Loan_Amount_Submitted_Missing int64\n", 2429 | "Loan_Tenure_Submitted_Missing int64\n", 2430 | "Processing_Fee_Missing int64\n", 2431 | "dtype: object" 2432 | ] 2433 | }, 2434 | "execution_count": 38, 2435 | "metadata": {}, 2436 | "output_type": "execute_result" 2437 | } 2438 | ], 2439 | "source": [ 2440 | "data.dtypes" 2441 | ] 2442 | }, 2443 | { 2444 | "cell_type": "markdown", 2445 | "metadata": {}, 2446 | "source": [ 2447 | "### 数值编码" 2448 | ] 2449 | }, 2450 | { 2451 | "cell_type": "code", 2452 | "execution_count": 39, 2453 | "metadata": { 2454 | "collapsed": true 2455 | }, 2456 | "outputs": [], 2457 | "source": [ 2458 | "from sklearn.preprocessing import LabelEncoder\n", 2459 | "le = LabelEncoder()\n", 2460 | "var_to_encode = ['Device_Type','Filled_Form','Gender','Var1','Var2','Mobile_Verified','Source']\n", 2461 | "for col in var_to_encode:\n", 2462 | " data[col] = le.fit_transform(data[col])" 2463 | ] 2464 | }, 2465 | { 2466 | "cell_type": "code", 2467 | "execution_count": 40, 2468 | "metadata": { 2469 | "collapsed": false 2470 | }, 2471 | "outputs": [ 2472 | { 2473 | "data": { 2474 | "text/html": [ 2475 | "
\n", 2476 | "\n", 2477 | " \n", 2478 | " \n", 2479 | " \n", 2480 | " \n", 2481 | " \n", 2482 | " \n", 2483 | " \n", 2484 | " \n", 2485 | " \n", 2486 | " \n", 2487 | " \n", 2488 | " \n", 2489 | " \n", 2490 | " \n", 2491 | " \n", 2492 | " \n", 2493 | " \n", 2494 | " \n", 2495 | " \n", 2496 | " \n", 2497 | " \n", 2498 | " \n", 2499 | " \n", 2500 | " \n", 2501 | " \n", 2502 | " \n", 2503 | " \n", 2504 | " \n", 2505 | " \n", 2506 | " \n", 2507 | " \n", 2508 | " \n", 2509 | " \n", 2510 | " \n", 2511 | " \n", 2512 | " \n", 2513 | " \n", 2514 | " \n", 2515 | " \n", 2516 | " \n", 2517 | " \n", 2518 | " \n", 2519 | " \n", 2520 | " \n", 2521 | " \n", 2522 | " \n", 2523 | " \n", 2524 | " \n", 2525 | " \n", 2526 | " \n", 2527 | " \n", 2528 | " \n", 2529 | " \n", 2530 | " \n", 2531 | " \n", 2532 | " \n", 2533 | " \n", 2534 | " \n", 2535 | " \n", 2536 | " \n", 2537 | " \n", 2538 | " \n", 2539 | " \n", 2540 | " \n", 2541 | " \n", 2542 | " \n", 2543 | " \n", 2544 | " \n", 2545 | " \n", 2546 | " \n", 2547 | " \n", 2548 | " \n", 2549 | " \n", 2550 | " \n", 2551 | " \n", 2552 | " \n", 2553 | " \n", 2554 | " \n", 2555 | " \n", 2556 | " \n", 2557 | " \n", 2558 | " \n", 2559 | " \n", 2560 | " \n", 2561 | " \n", 2562 | " \n", 2563 | " \n", 2564 | " \n", 2565 | " \n", 2566 | " \n", 2567 | " \n", 2568 | " \n", 2569 | " \n", 2570 | " \n", 2571 | " \n", 2572 | " \n", 2573 | " \n", 2574 | " \n", 2575 | " \n", 2576 | " \n", 2577 | " \n", 2578 | " \n", 2579 | " \n", 2580 | " \n", 2581 | " \n", 2582 | " \n", 2583 | " \n", 2584 | " \n", 2585 | " \n", 2586 | " \n", 2587 | " \n", 2588 | " \n", 2589 | " \n", 2590 | " \n", 2591 | " \n", 2592 | " \n", 2593 | " \n", 2594 | " \n", 2595 | " \n", 2596 | " \n", 2597 | " \n", 2598 | " \n", 2599 | " \n", 2600 | " \n", 2601 | " \n", 2602 | " \n", 2603 | " \n", 2604 | " \n", 2605 | " \n", 2606 | " \n", 2607 | " \n", 2608 | " \n", 2609 | " \n", 2610 | " \n", 2611 | " \n", 2612 | " \n", 2613 | " \n", 2614 | " \n", 2615 | " \n", 2616 | " \n", 2617 | " \n", 2618 | 
" \n", 2619 | " \n", 2620 | " \n", 2621 | " \n", 2622 | " \n", 2623 | " \n", 2624 | " \n", 2625 | "
Device_TypeDisbursedExisting_EMIFilled_FormGenderIDLoan_Amount_AppliedLoan_Tenure_AppliedMobile_VerifiedMonthly_Income...Var2Var4Var5sourceAgeEMI_Loan_Submitted_MissingInterest_Rate_MissingLoan_Amount_Submitted_MissingLoan_Tenure_Submitted_MissingProcessing_Fee_Missing
010.00.000ID000002C20300000.05.0020000...610train3711111
110.00.001ID000004E40200000.02.0135000...6313train3000001
210.00.001ID000007H20600000.04.0122500...110train3411001
310.00.001ID000008I301000000.05.0135000...1310train2811001
410.025000.001ID000009J40500000.02.01100000...1317train3111001
\n", 2626 | "

5 rows × 22 columns

\n", 2627 | "
" 2628 | ], 2629 | "text/plain": [ 2630 | " Device_Type Disbursed Existing_EMI Filled_Form Gender ID \\\n", 2631 | "0 1 0.0 0.0 0 0 ID000002C20 \n", 2632 | "1 1 0.0 0.0 0 1 ID000004E40 \n", 2633 | "2 1 0.0 0.0 0 1 ID000007H20 \n", 2634 | "3 1 0.0 0.0 0 1 ID000008I30 \n", 2635 | "4 1 0.0 25000.0 0 1 ID000009J40 \n", 2636 | "\n", 2637 | " Loan_Amount_Applied Loan_Tenure_Applied Mobile_Verified Monthly_Income \\\n", 2638 | "0 300000.0 5.0 0 20000 \n", 2639 | "1 200000.0 2.0 1 35000 \n", 2640 | "2 600000.0 4.0 1 22500 \n", 2641 | "3 1000000.0 5.0 1 35000 \n", 2642 | "4 500000.0 2.0 1 100000 \n", 2643 | "\n", 2644 | " ... Var2 Var4 Var5 source Age \\\n", 2645 | "0 ... 6 1 0 train 37 \n", 2646 | "1 ... 6 3 13 train 30 \n", 2647 | "2 ... 1 1 0 train 34 \n", 2648 | "3 ... 1 3 10 train 28 \n", 2649 | "4 ... 1 3 17 train 31 \n", 2650 | "\n", 2651 | " EMI_Loan_Submitted_Missing Interest_Rate_Missing \\\n", 2652 | "0 1 1 \n", 2653 | "1 0 0 \n", 2654 | "2 1 1 \n", 2655 | "3 1 1 \n", 2656 | "4 1 1 \n", 2657 | "\n", 2658 | " Loan_Amount_Submitted_Missing Loan_Tenure_Submitted_Missing \\\n", 2659 | "0 1 1 \n", 2660 | "1 0 0 \n", 2661 | "2 0 0 \n", 2662 | "3 0 0 \n", 2663 | "4 0 0 \n", 2664 | "\n", 2665 | " Processing_Fee_Missing \n", 2666 | "0 1 \n", 2667 | "1 1 \n", 2668 | "2 1 \n", 2669 | "3 1 \n", 2670 | "4 1 \n", 2671 | "\n", 2672 | "[5 rows x 22 columns]" 2673 | ] 2674 | }, 2675 | "execution_count": 40, 2676 | "metadata": {}, 2677 | "output_type": "execute_result" 2678 | } 2679 | ], 2680 | "source": [ 2681 | "data.head()" 2682 | ] 2683 | }, 2684 | { 2685 | "cell_type": "code", 2686 | "execution_count": 41, 2687 | "metadata": { 2688 | "collapsed": false 2689 | }, 2690 | "outputs": [ 2691 | { 2692 | "data": { 2693 | "text/plain": [ 2694 | "Device_Type int64\n", 2695 | "Disbursed float64\n", 2696 | "Existing_EMI float64\n", 2697 | "Filled_Form int64\n", 2698 | "Gender int64\n", 2699 | "ID object\n", 2700 | "Loan_Amount_Applied float64\n", 2701 | "Loan_Tenure_Applied float64\n", 
2702 | "Mobile_Verified int64\n", 2703 | "Monthly_Income int64\n", 2704 | "Source int64\n", 2705 | "Var1 int64\n", 2706 | "Var2 int64\n", 2707 | "Var4 int64\n", 2708 | "Var5 int64\n", 2709 | "source object\n", 2710 | "Age int64\n", 2711 | "EMI_Loan_Submitted_Missing int64\n", 2712 | "Interest_Rate_Missing int64\n", 2713 | "Loan_Amount_Submitted_Missing int64\n", 2714 | "Loan_Tenure_Submitted_Missing int64\n", 2715 | "Processing_Fee_Missing int64\n", 2716 | "dtype: object" 2717 | ] 2718 | }, 2719 | "execution_count": 41, 2720 | "metadata": {}, 2721 | "output_type": "execute_result" 2722 | } 2723 | ], 2724 | "source": [ 2725 | "data.dtypes" 2726 | ] 2727 | }, 2728 | { 2729 | "cell_type": "markdown", 2730 | "metadata": {}, 2731 | "source": [ 2732 | "### 类别型的One-Hot 编码" 2733 | ] 2734 | }, 2735 | { 2736 | "cell_type": "code", 2737 | "execution_count": 42, 2738 | "metadata": { 2739 | "collapsed": false 2740 | }, 2741 | "outputs": [ 2742 | { 2743 | "data": { 2744 | "text/plain": [ 2745 | "Index([u'Disbursed', u'Existing_EMI', u'ID', u'Loan_Amount_Applied',\n", 2746 | " u'Loan_Tenure_Applied', u'Monthly_Income', u'Var4', u'Var5', u'source',\n", 2747 | " u'Age', u'EMI_Loan_Submitted_Missing', u'Interest_Rate_Missing',\n", 2748 | " u'Loan_Amount_Submitted_Missing', u'Loan_Tenure_Submitted_Missing',\n", 2749 | " u'Processing_Fee_Missing', u'Device_Type_0', u'Device_Type_1',\n", 2750 | " u'Filled_Form_0', u'Filled_Form_1', u'Gender_0', u'Gender_1', u'Var1_0',\n", 2751 | " u'Var1_1', u'Var1_2', u'Var1_3', u'Var1_4', u'Var1_5', u'Var1_6',\n", 2752 | " u'Var1_7', u'Var1_8', u'Var1_9', u'Var1_10', u'Var1_11', u'Var1_12',\n", 2753 | " u'Var1_13', u'Var1_14', u'Var1_15', u'Var1_16', u'Var1_17', u'Var1_18',\n", 2754 | " u'Var2_0', u'Var2_1', u'Var2_2', u'Var2_3', u'Var2_4', u'Var2_5',\n", 2755 | " u'Var2_6', u'Mobile_Verified_0', u'Mobile_Verified_1', u'Source_0',\n", 2756 | " u'Source_1', u'Source_2'],\n", 2757 | " dtype='object')" 2758 | ] 2759 | }, 2760 | "execution_count": 42, 
2761 | "metadata": {}, 2762 | "output_type": "execute_result" 2763 | } 2764 | ], 2765 | "source": [ 2766 | "data = pd.get_dummies(data, columns=var_to_encode)\n", 2767 | "data.columns" 2768 | ] 2769 | }, 2770 | { 2771 | "cell_type": "markdown", 2772 | "metadata": {}, 2773 | "source": [ 2774 | "### 区分训练和测试数据" 2775 | ] 2776 | }, 2777 | { 2778 | "cell_type": "code", 2779 | "execution_count": 43, 2780 | "metadata": { 2781 | "collapsed": true 2782 | }, 2783 | "outputs": [], 2784 | "source": [ 2785 | "train = data.loc[data['source']=='train'].copy()  # independent frame rather than a selection of `data`, so the in-place drop in the next cell does not raise SettingWithCopyWarning\n", 2786 | "test = data.loc[data['source']=='test'].copy()  # same reason as for train" 2787 | ] 2788 | }, 2789 | { 2790 | "cell_type": "code", 2791 | "execution_count": 44, 2792 | "metadata": { 2793 | "collapsed": false 2794 | }, 2795 | "outputs": [], 2813 | "source": [ 2814 | "train.drop('source',axis=1,inplace=True)\n", 2815 | "test.drop(['source','Disbursed'],axis=1,inplace=True)" 2816 | ] 2817 | }, 2818 | { 2819 | "cell_type": "code", 2820 | "execution_count": 45, 2821 | "metadata": { 2822 | "collapsed": true 2823 | }, 2824 | "outputs": [], 2825 | "source": [ 2826 | "train.to_csv('train_modified.csv',index=False)\n", 2827 | 
"test.to_csv('test_modified.csv',index=False)" 2828 | ] 2829 | }, 2830 | { 2831 | "cell_type": "code", 2832 | "execution_count": null, 2833 | "metadata": { 2834 | "collapsed": true 2835 | }, 2836 | "outputs": [], 2837 | "source": [] 2838 | } 2839 | ], 2840 | "metadata": { 2841 | "kernelspec": { 2842 | "display_name": "Python 2", 2843 | "language": "python", 2844 | "name": "python2" 2845 | }, 2846 | "language_info": { 2847 | "codemirror_mode": { 2848 | "name": "ipython", 2849 | "version": 2 2850 | }, 2851 | "file_extension": ".py", 2852 | "mimetype": "text/x-python", 2853 | "name": "python", 2854 | "nbconvert_exporter": "python", 2855 | "pygments_lexer": "ipython2", 2856 | "version": "2.7.10" 2857 | } 2858 | }, 2859 | "nbformat": 4, 2860 | "nbformat_minor": 0 2861 | } 2862 | --------------------------------------------------------------------------------