├── .idea
│   ├── deployment.xml
│   ├── inspectionProfiles
│   │   └── profiles_settings.xml
│   ├── misc.xml
│   ├── modules.xml
│   ├── preferred-vcs.xml
│   ├── vcs.xml
│   ├── woe.iml
│   └── workspace.xml
├── LICENSE.txt
├── MANIFEST.in
├── README.rst
├── build
│   └── lib
│       └── woe
│           ├── GridSearch.py
│           ├── __init__.py
│           ├── config.py
│           ├── eval.py
│           ├── feature_process.py
│           └── ftrl.py
├── dist
│   ├── woe-0.1.4-py2-none-any.tar.gz
│   ├── woe-0.1.4-py2-none-any.whl
│   ├── woe-0.1.4-py2.7.egg
│   ├── woe-0.1.4-py3-none-any.tar.gz
│   ├── woe-0.1.4-py3-none-any.whl
│   └── woe-0.1.4.tar.gz
├── examples
│   ├── HereWeGo.py
│   ├── README.rst
│   ├── UCI_Credit_Card.csv
│   └── config.csv
├── setup.py
├── woe.egg-info
│   ├── PKG-INFO
│   ├── SOURCES.txt
│   ├── dependency_links.txt
│   ├── requires.txt
│   └── top_level.txt
└── woe
    ├── GridSearch.py
    ├── __init__.py
    ├── config.py
    ├── eval.py
    ├── feature_process.py
    └── ftrl.py
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | Copyright (c) 2017 The Python Packaging Authority (PyPA)
2 | 
3 | Permission is hereby granted, free of charge, to any person obtaining a copy
4 | of this software and associated documentation files (the "Software"), to deal
5 | in the Software without restriction, including without limitation the rights
6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | copies of the Software, and to permit persons to whom the Software is
8 | furnished to do so, subject to the following conditions:
9 | 
10 | The above copyright notice and this permission notice shall be included in all
11 | copies or substantial portions of the Software.
12 | 
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19 | SOFTWARE.
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include README.rst
2 | include *.txt
3 | include *.py
4 | recursive-include examples *.csv *.py *.rst
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | woe
2 | ===
3 | 
4 | .. image:: https://travis-ci.org/justdoit0823/pywxclient.svg?branch=master
5 |     :target: https://travis-ci.org/justdoit0823/pywxclient
6 | 
7 | version: 0.1.4
8 | 
9 | Tools for WoE (Weight of Evidence) transformation, mostly used in scorecard models for credit rating.
10 | 
11 | Installation
12 | --------------------------------
13 | 
14 | Install with pip:
15 | 
16 | .. code-block:: bash
17 | 
18 |     $ pip install woe
19 | 
20 | or install from git:
21 | 
22 | .. code-block:: bash
23 | 
24 |     $ pip install git+https://github.com/boredbird/woe
25 | 
26 | 
27 | Features
28 | ========
29 | 
30 | * Tree-based binning (splitting) with the IV (Information Value) criterion
31 | 
32 | * Rich model evaluation methods
33 | 
34 | * Unified output format that is easy to export
35 | 
36 | * Storage of the IV tree for follow-up use
37 | 
38 | 
39 | 
40 | **woe** module function tree
41 | ============================
42 | 
43 | ::
44 | 
45 |     |- __init__
46 |     |- config.py
47 |     |   |-- config
48 |     |   |-- __init__
49 |     |   |-- change_config_var_dtype()
50 |     |   |-- load_file()
51 |     |- eval.py
52 |     |   |-- compute_ks()
53 |     |   |-- eval_data_summary()
54 |     |   |-- eval_feature_detail()
55 |     |   |-- eval_feature_stability()
56 |     |   |-- eval_feature_summary()
57 |     |   |-- eval_model_stability()
58 |     |   |-- eval_model_summary()
59 |     |   |-- eval_segment_metrics()
60 |     |   |-- plot_ks()
61 |     |   |-- proc_cor_eval()
62 |     |   |-- proc_validation()
63 |     |   |-- wald_test()
64 |     |- feature_process.py
65 |     |   |-- binning_data_split()
66 |     |   |-- calculate_iv_split()
67 |     |   |-- calulate_iv()
68 |     |   |-- change_feature_dtype()
69 |     |   |-- check_point()
70 |     |   |-- fillna()
71 |     |   |-- format_iv_split()
72 |     |   |-- proc_woe_continuous()
73 |     |   |-- proc_woe_discrete()
74 |     |   |-- process_train_woe()
75 |     |   |-- process_woe_trans()
76 |     |   |-- search()
77 |     |   |-- woe_trans()
78 |     |- ftrl.py
79 |     |   |-- FTRL()
80 |     |   |-- LR()
81 |     |- GridSearch.py
82 |     |   |-- fit_single_lr()
83 |     |   |-- grid_search_lr_c()
84 |     |   |-- grid_search_lr_c_main()
85 |     |   |-- grid_search_lr_c_validation()
86 | 
87 | 
88 | Examples
89 | ========
90 | 
91 | The examples directory contains a simple WoE transformation program that serves as a tutorial.
92 | 
93 | You can also write more complex programs with the `woe` package.
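A minimal sketch of the typical workflow is shown below. The call signatures follow the source files in this repository, but several details are assumptions: the example file paths, the naive fill values for missing data, the presence of a ``target`` column in the training CSV (required by ``config.load_file()``), and the assumption that ``proc_woe_continuous()`` takes the same arguments as ``proc_woe_discrete()``.

.. code-block:: python

    import woe.config as config
    import woe.eval as eval_tools
    import woe.feature_process as fp

    cfg = config.config()
    cfg.load_file('examples/config.csv', 'examples/UCI_Credit_Card.csv')

    # cast every column to the dtype declared in the config file
    fp.change_feature_dtype(cfg.dataset_train, cfg.variable_type)

    civ_list = []
    for var in cfg.bin_var_list:        # continuous variables to be binned
        cfg.dataset_train.loc[cfg.dataset_train[var].isnull(), (var)] = 0        # naive fillna
        civ_list.append(fp.proc_woe_continuous(cfg.dataset_train, var, cfg.global_bt,
                                               cfg.global_gt, cfg.min_sample, alpha=0.01))

    for var in cfg.discrete_var_list:   # discrete variables to be merged
        cfg.dataset_train.loc[cfg.dataset_train[var].isnull(), (var)] = 'missing'
        civ_list.append(fp.proc_woe_discrete(cfg.dataset_train, var, cfg.global_bt,
                                             cfg.global_gt, cfg.min_sample, alpha=0.01))

    # replace raw values with their WoE values and dump the binning detail
    for civ in civ_list:
        cfg.dataset_train[civ.var_name] = fp.woe_trans(cfg.dataset_train[civ.var_name], civ)
    eval_tools.eval_feature_detail(civ_list, 'output_feature_detail.csv')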
94 | 
95 | Version Records
96 | ================
97 | woe 0.1.4 2018-03-01
98 | * support Python 3
99 | 
100 | woe 0.1.3 2018-02-09
101 | 
102 | * woe.feature_process.proc_woe_discrete(): fix a bug in the handling of discrete variables
103 | * woe.eval.eval_feature_detail(): fix bug: write the output file in UTF-8
104 | * woe.GridSearch.grid_search_lr_c_main(): add a wrapper function for convenience and efficiency (see the usage sketch below)
105 | * woe.GridSearch.grid_search_lr_c_validation(): monitor the KS performance on the training and test sets for different values of 'c'
106 | * add example test scripts
107 | 
108 | 
109 | woe 0.1.2 2017-12-05
110 | 
111 | * woe.ftrl.FTRL(): add an online learning module
112 | 
113 | woe 0.1.1 2017-11-28
114 | 
115 | * woe.config.load_file(): make the data_path parameter optional
116 | * woe.eval.eval_feature_stability(): fix bug: psi_dict['stability_index'] computation error
117 | * woe.feature_process.change_feature_dtype(): print friendly tips when an error is encountered
118 | * woe.feature_process.calulate_iv(): refactor the code
119 | * woe.feature_process.calculate_iv_split(): refactor the code
120 | * woe.feature_process.binning_data_split(): reduce the number of len() calls by using __len__() and the shape attribute; replace namedtuple with dict
121 | * woe.feature_process.fillna(): new function to fill null values
122 | * woe.GridSearch.grid_search_lr_c(): the list of regularization parameters c, previously hard-coded inside the function, is now specified by the user
123 | 
124 | woe 0.0.9 2017-11-21
125 | 
126 | * Add module: GridSearch, for searching the optimal hyperparameter C of LogisticRegression
127 | * Code refactoring: functions compute_ks and plot_ks
128 | 
129 | woe 0.0.8 2017-09-28
130 | 
131 | * More flexible: remove the conditional restriction in feature_process.change_feature_dtype()
132 | * Fix bug: the wrong use of deepcopy in feature_process.woe_trans()
133 | 
134 | woe 0.0.7 2017-09-19
135 | 
136 | * Fix bug: eval.eval_feature_detail raises ValueError('arrays must all be same length')
137 | * Add parameter interface: alpha specifies the step learning rate, default 0.01
138 | 
139 | How to Contribute
140 | --------------------------------
141 | 
142 | Email me: 1002937942@qq.com
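GridSearch usage sketch
--------------------------------

The ``woe.GridSearch.grid_search_lr_c_main()`` wrapper mentioned in the 0.1.3 notes above is driven by a single ``params`` dict. The key names below are the ones read by the function in this repository; the file paths and the grid of ``c`` values are placeholders you would replace with your own:

.. code-block:: python

    import numpy as np
    from woe.GridSearch import grid_search_lr_c_main, fit_single_lr

    params = {
        'dataset_path': 'dataset_train_woed.csv',          # WoE-transformed training set with a 'target' column
        'validation_path': 'dataset_validation_woed.csv',
        'config_path': 'config.csv',                       # must contain 'var_name' and 'is_modelfeature' columns
        'df_coef_path': 'lr_coef_path.csv',
        'pic_coefpath': 'lr_coef_path.png',
        'pic_coefpath_title': 'Logistic Regression Path',
        'pic_performance': 'lr_ks_performance.png',
        'pic_performance_title': 'Logistic Regression Performance',
        'var_list_specfied': [],                           # empty list keeps all model features
        'cs': list(np.logspace(-4, 0, 20)),                # candidate regularization strengths C
    }

    best_c, best_ks = grid_search_lr_c_main(params)
    model = fit_single_lr('dataset_train_woed.csv', 'config.csv', [], 'lr_model.pkl', c=best_c)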
143 | -------------------------------------------------------------------------------- /build/lib/woe/GridSearch.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | __author__ = 'boredbird' 3 | import pandas as pd 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | from sklearn.linear_model import LogisticRegression 7 | from datetime import datetime 8 | from sklearn.svm import l1_min_c 9 | from woe.eval import compute_ks 10 | import pickle 11 | import time 12 | 13 | """ 14 | Search for optimal hyper parametric C in LogisticRegression 15 | """ 16 | def grid_search_lr_c(X_train,y_train,cs,df_coef_path=False 17 | ,pic_coefpath_title='Logistic Regression Path',pic_coefpath=False 18 | ,pic_performance_title='Logistic Regression Performance',pic_performance=False): 19 | """ 20 | grid search optimal hyper parameters c with the best ks performance 21 | :param X_train: features dataframe 22 | :param y_train: target 23 | :param cs: list of regularization parameter c 24 | :param df_coef_path: the file path for logistic regression coefficient dataframe 25 | :param pic_coefpath_title: the pic title for coefficient path picture 26 | :param pic_coefpath: the file path for coefficient path picture 27 | :param pic_performance_title: the pic title for ks performance picture 28 | :param pic_performance: the file path for ks performance picture 29 | :return: a tuple of c and ks value with the best ks performance 30 | """ 31 | # init a LogisticRegression model 32 | clf_l1_LR = LogisticRegression(C=0.1, penalty='l1', tol=0.01,class_weight='balanced') 33 | # cs = l1_min_c(X_train, y_train, loss='log') * np.logspace(0, 9,200) 34 | 35 | print("Computing regularization path ...") 36 | start = datetime.now() 37 | print(start) 38 | coefs_ = [] 39 | ks = [] 40 | for c in cs: 41 | clf_l1_LR.set_params(C=c) 42 | clf_l1_LR.fit(X_train, y_train) 43 | coefs_.append(clf_l1_LR.coef_.ravel().copy()) 44 | 45 | proba = clf_l1_LR.predict_proba(X_train)[:,1] 46 | ks.append(compute_ks(proba,y_train)) 47 | 48 | end = datetime.now() 49 | print(end) 50 | print("This took ", end - start) 51 | coef_cv_df = pd.DataFrame(coefs_,columns=X_train.columns) 52 | coef_cv_df['ks'] = ks 53 | coef_cv_df['c'] = cs 54 | 55 | if df_coef_path: 56 | file_name = df_coef_path if isinstance(df_coef_path, str) else None 57 | coef_cv_df.to_csv(file_name) 58 | 59 | coefs_ = np.array(coefs_) 60 | 61 | fig1 = plt.figure('fig1') 62 | plt.plot(np.log10(cs), coefs_) 63 | ymin, ymax = plt.ylim() 64 | plt.xlabel('log(C)') 65 | plt.ylabel('Coefficients') 66 | plt.title(pic_coefpath_title) 67 | plt.axis('tight') 68 | if pic_coefpath: 69 | file_name = pic_coefpath if isinstance(pic_coefpath, str) else None 70 | plt.savefig(file_name) 71 | else: 72 | plt.show() 73 | 74 | fig2 = plt.figure('fig2') 75 | plt.plot(np.log10(cs), ks) 76 | plt.xlabel('log(C)') 77 | plt.ylabel('ks score') 78 | plt.title(pic_performance_title) 79 | plt.axis('tight') 80 | if pic_performance: 81 | file_name = pic_performance if isinstance(pic_performance, str) else None 82 | plt.savefig(file_name) 83 | else: 84 | plt.show() 85 | 86 | flag = coefs_<0 87 | idx = np.array(ks)[flag.sum(axis=1) == 0].argmax() 88 | 89 | return (cs[idx],ks[idx]) 90 | 91 | 92 | def grid_search_lr_c_validation(X_train,y_train,validation_dataset_list,cs=[0.01],df_coef_path=False 93 | ,pic_coefpath_title='Logistic Regression Path',pic_coefpath=False 94 | ,pic_performance_title='Logistic Regression Performance',pic_performance=False): 95 | """ 96 | 
grid search optimal hyper parameters c with the best ks performance 97 | :param X_train: features dataframe 98 | :param y_train: target 99 | :param cs: list of c value 100 | :param df_coef_path: the file path for logistic regression coefficient dataframe 101 | :param pic_coefpath_title: the pic title for coefficient path picture 102 | :param pic_coefpath: the file path for coefficient path picture 103 | :param pic_performance_title: the pic title for ks performance picture 104 | :param pic_performance: the file path for ks performance picture 105 | :return: a tuple of c and ks value with the best ks performance 106 | """ 107 | # init a LogisticRegression model 108 | clf_l1_LR = LogisticRegression(C=0.1, penalty='l1', tol=0.01,class_weight='balanced') 109 | 110 | print("Computing regularization path ...") 111 | start = datetime.now() 112 | print(start) 113 | coefs_ = [] 114 | ks = [] 115 | ks_validation1 = [] 116 | ks_validation2 = [] 117 | counter = 0 118 | for c in cs: 119 | print('time: ',time.asctime(time.localtime(time.time())),'counter: ',counter, ' c: ',c) 120 | clf_l1_LR.set_params(C=c) 121 | clf_l1_LR.fit(X_train, y_train) 122 | coefs_.append(clf_l1_LR.coef_.ravel().copy()) 123 | 124 | proba = clf_l1_LR.predict_proba(X_train)[:,1] 125 | validation_proba1 = clf_l1_LR.predict_proba(validation_dataset_list[0][X_train.columns])[:,1] 126 | 127 | ks.append(compute_ks(proba,y_train)) 128 | ks_validation1.append(compute_ks(validation_proba1,validation_dataset_list[0]['target'])) 129 | 130 | print('ks:\t',ks[-1],'ks_validation1:\t',ks_validation1[-1]) 131 | counter += 1 132 | 133 | end = datetime.now() 134 | print(end) 135 | print("This took ", end - start) 136 | coef_cv_df = pd.DataFrame(coefs_,columns=X_train.columns) 137 | coef_cv_df['ks'] = ks 138 | coef_cv_df['ks_validation1'] = ks_validation1 139 | coef_cv_df['c'] = cs 140 | 141 | 142 | if df_coef_path: 143 | file_name = df_coef_path if isinstance(df_coef_path, str) else None 144 | coef_cv_df.to_csv(file_name) 145 | 146 | coefs_ = np.array(coefs_) 147 | 148 | fig1 = plt.figure('fig1') 149 | plt.plot(np.log10(cs), coefs_) 150 | ymin, ymax = plt.ylim() 151 | plt.xlabel('log(C)') 152 | plt.ylabel('Coefficients') 153 | plt.title(pic_coefpath_title) 154 | plt.axis('tight') 155 | if pic_coefpath: 156 | file_name = pic_coefpath if isinstance(pic_coefpath, str) else None 157 | plt.savefig(file_name) 158 | plt.close() 159 | else: 160 | pass 161 | # plt.show() 162 | # plt.close() 163 | 164 | fig2 = plt.figure('fig2') 165 | plt.plot(np.log10(cs), ks) 166 | plt.xlabel('log(C)') 167 | plt.ylabel('ks score') 168 | plt.title(pic_performance_title) 169 | plt.axis('tight') 170 | if pic_performance: 171 | file_name = pic_performance if isinstance(pic_performance, str) else None 172 | plt.savefig(file_name) 173 | plt.close() 174 | else: 175 | pass 176 | # plt.show() 177 | # plt.close() 178 | 179 | flag = coefs_<0 180 | if np.array(ks)[flag.sum(axis=1) == 0].__len__()>0: 181 | idx = np.array(ks)[flag.sum(axis=1) == 0].argmax() 182 | else: 183 | idx = np.array(ks).argmax() 184 | 185 | return (cs[idx],ks[idx]) 186 | 187 | 188 | def grid_search_lr_c_main(params): 189 | print('run into grid_search_lr_c_main:') 190 | dataset_path = params['dataset_path'] 191 | validation_path = params['validation_path'] 192 | config_path = params['config_path'] 193 | df_coef_path = params['df_coef_path'] 194 | pic_coefpath = params['pic_coefpath'] 195 | pic_performance = params['pic_performance'] 196 | pic_coefpath_title = params['pic_coefpath_title'] 197 | 
pic_performance_title = params['pic_performance_title'] 198 | 199 | dataset_train = pd.read_csv(dataset_path) 200 | cfg = pd.read_csv(config_path) 201 | candidate_var_list = cfg[cfg['is_modelfeature'] == 1]['var_name'] 202 | 203 | b = [var for var in dataset_train.columns if sum(dataset_train[var].isnull()) == 0] 204 | candidate_var_list = list(set(candidate_var_list).intersection(set(b))) 205 | 206 | var_list_specfied = params['var_list_specfied'] 207 | if var_list_specfied.__len__()>0: 208 | candidate_var_list = list(set(candidate_var_list).intersection(set(var_list_specfied))) 209 | 210 | print('candidate_var_list length:\n',candidate_var_list.__len__()) 211 | print('candidate_var_list:\n',candidate_var_list) 212 | 213 | print('change dtypes:float64 to float32') 214 | for var in candidate_var_list: 215 | dataset_train[var] = dataset_train[var].astype(np.float32) 216 | 217 | X_train = dataset_train[dataset_train.target >=0][candidate_var_list] 218 | y_train = dataset_train[dataset_train.target >=0]['target'] 219 | 220 | validation_cols_keep = [var for var in candidate_var_list] 221 | validation_cols_keep.append('target') 222 | validation_dataset_list = [] 223 | 224 | validation_dataset = pd.read_csv(validation_path) 225 | # fillna 226 | for var in candidate_var_list: 227 | validation_dataset.loc[validation_dataset[var].isnull(), (var)] = 0 228 | validation_dataset_list.append(validation_dataset[validation_cols_keep]) 229 | 230 | cs = params['cs'] 231 | print('cs',cs) 232 | c,ks = grid_search_lr_c_validation(X_train,y_train,validation_dataset_list,cs,df_coef_path,pic_coefpath_title,pic_coefpath 233 | ,pic_performance_title,pic_performance) 234 | print('pic_coefpath:\n',pic_coefpath) 235 | print('pic_performance:\n',pic_performance) 236 | print('ks performance on the c:') 237 | print(c,ks) 238 | 239 | return (c,ks) 240 | 241 | 242 | def fit_single_lr(dataset_path,config_path,var_list_specfied,out_model_path,c=0.01): 243 | dataset_train = pd.read_csv(dataset_path) 244 | cfg = pd.read_csv(config_path) 245 | candidate_var_list = cfg[cfg['is_modelfeature'] == 1]['var_name'] 246 | 247 | b = [var for var in dataset_train.columns if sum(dataset_train[var].isnull()) == 0] 248 | candidate_var_list = list(set(candidate_var_list).intersection(set(b))) 249 | 250 | if var_list_specfied.__len__()>0: 251 | candidate_var_list = list(set(candidate_var_list).intersection(set(var_list_specfied))) 252 | 253 | print('candidate_var_list length:\n',candidate_var_list.__len__()) 254 | print('candidate_var_list:\n',candidate_var_list) 255 | 256 | print('change dtypes:float64 to float32') 257 | for var in candidate_var_list: 258 | dataset_train[var] = dataset_train[var].astype(np.float32) 259 | 260 | X_train = dataset_train[dataset_train.target >=0][candidate_var_list] 261 | y_train = dataset_train[dataset_train.target >=0]['target'] 262 | 263 | print('c:',c) 264 | clf_lr_a = LogisticRegression(C=c, penalty='l1', tol=0.01,class_weight='balanced') 265 | 266 | clf_lr_a.fit(X_train, y_train) 267 | coefs = clf_lr_a.coef_.ravel().copy() 268 | 269 | proba = clf_lr_a.predict_proba(X_train)[:,1] 270 | ks = compute_ks(proba,y_train) 271 | 272 | model = {} 273 | model['clf'] = clf_lr_a 274 | model['features_list'] = candidate_var_list 275 | model['coefs'] = coefs 276 | model['ks'] = ks 277 | 278 | output = open(out_model_path, 'wb') 279 | pickle.dump(model,output) 280 | output.close() 281 | 282 | return model 283 | -------------------------------------------------------------------------------- /build/lib/woe/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/boredbird/woe/335e9ec2a521d3bbccb0ad5d915128119e4d0ca6/build/lib/woe/__init__.py -------------------------------------------------------------------------------- /build/lib/woe/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | __author__ = 'boredbird' 3 | import pandas as pd 4 | 5 | class config: 6 | 7 | def __init__(self): 8 | self.config = None 9 | self.dataset_train = None 10 | self.variable_type = None 11 | self.bin_var_list = None 12 | self.discrete_var_list = None 13 | self.candidate_var_list = None 14 | self.dataset_len = None 15 | self.min_sample = None 16 | self.global_bt = None 17 | self.global_gt = None 18 | 19 | def load_file(self,config_path,data_path=False): 20 | self.config = pd.read_csv(config_path) 21 | # specify variable dtypes 22 | self.variable_type = self.config[['var_name', 'var_dtype']] 23 | self.variable_type = self.variable_type.rename(columns={'var_name': 'v_name', 'var_dtype': 'v_type'}) 24 | self.variable_type = self.variable_type.set_index(['v_name']) 25 | 26 | # specify the list of continuous variable to be splitted into bin 27 | self.bin_var_list = self.config[self.config['is_tobe_bin'] == 1]['var_name'] 28 | # specify the list of discrete variable to be merged into supper classes 29 | self.discrete_var_list = self.config[(self.config['is_candidate'] == 1) & (self.config['var_dtype'] == 'object')]['var_name'] 30 | 31 | # specify the list of model input variable 32 | self.candidate_var_list = self.config[self.config['is_candidate'] == 1]['var_name'] 33 | 34 | if data_path: 35 | data_path = data_path if isinstance(data_path, str) else None 36 | 37 | # load dataset train 38 | self.dataset_train = pd.read_csv(data_path) 39 | self.dataset_train.columns = [col.split('.')[-1] for col in self.dataset_train.columns] 40 | 41 | # specify some other global variables about the training dataset 42 | self.dataset_len = len(self.dataset_train) 43 | self.min_sample = int(self.dataset_len * 0.05) 44 | self.global_bt = sum(self.dataset_train['target']) 45 | self.global_gt = len(self.dataset_train) - sum(self.dataset_train['target']) 46 | 47 | def change_config_var_dtype(self,var_name,type,inplace_file=True): 48 | if type in ['object','string','int64','uint8','float64','bool1','bool2','dates','category']: 49 | self.variable_type.loc[var_name,'v_type'] = type 50 | else: 51 | raise KeyError("Invalid dtype specified! 
") -------------------------------------------------------------------------------- /build/lib/woe/eval.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | __author__ = 'boredbird' 3 | import pandas as pd 4 | import numpy as np 5 | import scipy 6 | import matplotlib.pyplot as plt 7 | from scipy.stats import ks_2samp 8 | import woe.config as config 9 | import pickle 10 | 11 | def compute_ks(proba,target): 12 | ''' 13 | target: numpy array of shape (1,) 14 | proba: numpy array of shape (1,), predicted probability of the sample being positive 15 | returns: 16 | ks: float, ks score estimation 17 | ''' 18 | get_ks = lambda proba, target: ks_2samp(proba[target == 1], proba[target != 1]).statistic 19 | 20 | return get_ks(proba, target) 21 | 22 | 23 | def eval_feature_detail(Info_Value_list,out_path=False): 24 | """ 25 | format InfoValue list to Dataframe 26 | :param Info_Value_list: Instance list of Class InfoValue 27 | :param out_path:specify the Dataframe to csv file path ,default False 28 | :return:DataFrame about feature detail 29 | """ 30 | rst = Info_Value_list 31 | format_rst = [] 32 | 33 | for kk in range(0,len(rst)): 34 | print(rst[kk].var_name) 35 | split_list = [] 36 | if rst[kk].split_list != []: 37 | if not rst[kk].is_discrete: 38 | #deal with split_list 39 | split_list.append('(-INF,'+str(rst[kk].split_list[0])+']') 40 | for i in range(0,len(rst[kk].split_list)-1): 41 | split_list.append('(' + str(rst[kk].split_list[i])+','+ str(rst[kk].split_list[i+1]) + ']') 42 | 43 | split_list.append('(' + str(rst[kk].split_list[len(rst[kk].split_list)-1]) + ',+INF)') 44 | else: 45 | split_list = rst[kk].split_list 46 | else: 47 | split_list.append('(-INF,+INF)') 48 | 49 | # merge into dataframe 50 | columns = ['var_name','split_list','sub_total_sample_num','positive_sample_num' 51 | ,'negative_sample_num','sub_total_num_percentage','positive_rate_in_sub_total' 52 | ,'woe_list','iv_list','iv'] 53 | rowcnt = len(rst[kk].iv_list) 54 | if rowcnt < len(split_list): 55 | split_list = split_list[:rowcnt] 56 | 57 | var_name = [rst[kk].var_name] * rowcnt 58 | iv = [rst[kk].iv] * rowcnt 59 | iv_list = rst[kk].iv_list 60 | woe_list = rst[kk].woe_list 61 | a = pd.DataFrame({'var_name':var_name,'iv_list':iv_list,'woe_list':woe_list 62 | ,'split_list':split_list,'iv':iv,'sub_total_sample_num':rst[kk].sub_total_sample_num 63 | ,'positive_sample_num':rst[kk].positive_sample_num,'negative_sample_num':rst[kk].negative_sample_num 64 | ,'sub_total_num_percentage':rst[kk].sub_total_num_percentage 65 | ,'positive_rate_in_sub_total':rst[kk].positive_rate_in_sub_total 66 | ,'negative_rate_in_sub_total':rst[kk].negative_rate_in_sub_total},columns=columns) 67 | format_rst.append(a) 68 | 69 | # merge dataframe list into one dataframe vertically 70 | cformat_rst = pd.concat(format_rst) 71 | 72 | if out_path: 73 | file_name = out_path if isinstance(out_path, str) else None 74 | cformat_rst.to_csv(file_name, index=False,encoding='utf-8') 75 | 76 | return cformat_rst 77 | 78 | 79 | def eval_data_summary(df_list,source_name,out_path=False): 80 | ''' 81 | :param df_list: A dataset DataFrame 82 | :param source_name: string type 83 | :param out_path: specify the Dataframe to csv file path ,default False 84 | :return: DataFrame about dataset summary info 85 | ''' 86 | train_validation_data_summary = [] 87 | for i in range(len(source_name)): 88 | a = dict() 89 | a['source'] = source_name[i] 90 | a['total_sample_cnt'] = len(df_list[i]) 91 | a['positive_sample_cnt'] = 
df_list[i]['target'].sum()
92 |         a['negative_sample_cnt'] = a['total_sample_cnt'] - a['positive_sample_cnt']
93 |         a['positive_rate'] = a['positive_sample_cnt']*1.0/a['total_sample_cnt']
94 |         train_validation_data_summary.append(a)
95 | 
96 |     train_validation_data_summary = pd.DataFrame(train_validation_data_summary)
97 | 
98 |     if out_path:
99 |         file_name = out_path if isinstance(out_path, str) else None
100 |         train_validation_data_summary.to_csv(file_name, index=False)
101 | 
102 |     return train_validation_data_summary
103 | 
104 | 
105 | def eval_model_summary(list_dict,out_path=False):
106 |     '''
107 |     :param list_dict: a list of dict
108 |     :param out_path: specify the Dataframe to csv file path ,default False
109 |     :return: DataFrame about model summary info
110 |     '''
111 |     model_summary = pd.DataFrame([list_dict[0]])
112 |     if len(list_dict)>1:
113 |         for i in range(len(list_dict)-1):
114 |             b = pd.DataFrame([list_dict[i+1]])
115 |             model_summary = pd.merge(model_summary, b, how='outer')
116 | 
117 |     if out_path:
118 |         file_name = out_path if isinstance(out_path, str) else None
119 |         model_summary.to_csv(file_name, index=False)
120 | 
121 |     return model_summary
122 | 
123 | 
124 | def wald_test(model,X):
125 |     '''
126 |     :param model: a model file that should have predict_proba() function
127 |     :param X: dataset features DataFrame
128 |     :return: the value of wald_stats,p_value
129 |     '''
130 |     pred_probs = np.matrix(model.predict_proba(X))
131 |     X_design = np.hstack((np.ones(shape=(X.shape[0], 1)), X))
132 |     diag_array = np.multiply(pred_probs[:, 0], pred_probs[:, 1]).A1
133 |     V = scipy.sparse.diags(diag_array)
134 |     m1 = X_design.T * V
135 |     m2 = m1.dot(X_design)
136 |     cov_mat = np.linalg.inv(m2)
137 | 
138 |     model_params = np.hstack((model.intercept_[0], model.coef_[0]))
139 |     wald_stats = (model_params / np.sqrt(np.diag(cov_mat))) ** 2
140 | 
141 |     # each Wald statistic follows a chi-square distribution with 1 degree of freedom
142 |     p_value = scipy.stats.chi2.sf(wald_stats, 1)
143 | 
144 |     return wald_stats,p_value
145 | 
146 | 
147 | def eval_feature_summary(train_X,model,civ_list,candidate_var_list,out_path=False):
148 |     '''
149 |     :param train_X: training dataset features DataFrame
150 |     :param model: model file
151 |     :param civ_list: list of InfoValue Class instances
152 |     :param candidate_var_list: the list of model input variable
153 |     :param out_path: specify the Dataframe to csv file path ,default False
154 |     :return: DataFrame about feature summary
155 |     '''
156 |     feature_summary = {}
157 |     feature_summary['feature_name'] = list(['Intercept'])
158 |     feature_summary['feature_name'].extend(list(candidate_var_list))
159 |     feature_summary['coef'] = [model['classifier'].intercept_]
160 |     feature_summary['coef'].extend(model['classifier'].coef_[0])
161 |     var_name = [civ.var_name for civ in civ_list]
162 |     feature_summary['iv'] = [0]
163 |     feature_summary['iv'].extend([civ_list[var_name.index(var)].iv for var in candidate_var_list])
164 |     feature_summary['wald_stats'], feature_summary['p_value'] = wald_test(model['classifier'], train_X)
165 | 
166 |     feature_summary = pd.DataFrame(feature_summary)
167 |     if out_path:
168 |         file_name = out_path if isinstance(out_path, str) else None
169 |         feature_summary.to_csv(file_name, index=False)
170 | 
171 |     return feature_summary
172 | 
173 | 
174 | def eval_segment_metrics(target, predict_proba, segment_cnt = 20,out_path=False):
175 |     '''
176 |     :param target: the list of actual target value
177 |     :param predict_proba: the list of predicted probability
178 |     :param segment_cnt: the segment number
179 |     :param out_path: specify the
Dataframe to csv file path ,default False 180 | :return: DataFrame about segment metrics 181 | ''' 182 | proba_descend_idx = np.argsort(predict_proba) 183 | proba_descend_idx = proba_descend_idx[::-1] 184 | 185 | grp_idx = 1 186 | start_idx = 0 187 | total_sample_cnt = len(predict_proba) 188 | total_positive_sample_cnt = target.sum() 189 | total_negative_sample_cnt = total_sample_cnt - total_positive_sample_cnt 190 | 191 | segment_sample_cnt = int(len(predict_proba) / segment_cnt) 192 | cumulative_sample_percentage = 0.0 193 | cumulative_positive_percentage = 0.0 194 | cumulative_negative_percentage = 0.0 195 | 196 | segment_list = [] 197 | columns = ['grp_idx', 'segment_sample_cnt', 'segment_sample_percentage', 'cumulative_sample_percentage', 198 | 'in_segment_positive_percentage', 'positive_percentage_in_total', 'cumulative_positive_percentage', 199 | 'cumulative_negative_percentage', 'ks'] 200 | 201 | while start_idx < total_sample_cnt: 202 | s = {} 203 | s['grp_idx'] = grp_idx 204 | segment_idx_list = proba_descend_idx[start_idx : start_idx + segment_sample_cnt] 205 | segment_target = target[segment_idx_list] 206 | 207 | segment_sample_cnt = len(segment_idx_list) 208 | s['segment_sample_cnt'] = segment_sample_cnt 209 | 210 | segment_pos_cnt = segment_target.sum() 211 | segment_neg_cnt = segment_sample_cnt - segment_pos_cnt 212 | 213 | segment_sample_percentage = segment_sample_cnt*1.0/total_sample_cnt 214 | s['segment_sample_percentage'] = segment_sample_percentage 215 | 216 | pos_percentage_in_total = float(segment_pos_cnt * 100) / total_positive_sample_cnt 217 | neg_percentage_in_total = float(segment_neg_cnt * 100) / total_negative_sample_cnt 218 | s['positive_percentage_in_total'] = pos_percentage_in_total 219 | 220 | in_segment_positive_percentage = float(segment_pos_cnt) / segment_sample_cnt 221 | s['in_segment_positive_percentage'] = in_segment_positive_percentage 222 | 223 | cumulative_sample_percentage += segment_sample_percentage 224 | s['cumulative_sample_percentage'] = cumulative_sample_percentage 225 | 226 | cumulative_positive_percentage += pos_percentage_in_total 227 | cumulative_negative_percentage += neg_percentage_in_total 228 | s['cumulative_positive_percentage'] = cumulative_positive_percentage 229 | s['cumulative_negative_percentage'] = cumulative_negative_percentage 230 | 231 | ks = cumulative_positive_percentage - cumulative_negative_percentage 232 | s['ks'] = ks 233 | 234 | segment_list.append(s) 235 | grp_idx += 1 236 | start_idx += segment_sample_cnt 237 | 238 | segment_list = pd.DataFrame(segment_list,columns=columns) 239 | if out_path: 240 | file_name = out_path if isinstance(out_path, str) else None 241 | segment_list.to_csv(file_name, index=False) 242 | 243 | return segment_list 244 | 245 | 246 | def eval_model_stability(proba_train, proba_validation, segment_cnt = 10,out_path=False): 247 | ''' 248 | :param proba_train: the list of predicted probability on training dataset 249 | :param proba_validation: the list of predicted probability on validation dataset 250 | :param segment_cnt: the segment number 251 | :param out_path: specify the Dataframe to csv file path ,default False 252 | :return: DataFrame about model stability 253 | ''' 254 | step = 1.0/segment_cnt 255 | flag = 0.0 256 | model_stability = [] 257 | len_train = len(proba_train) 258 | len_validation = len(proba_validation) 259 | 260 | columns = ['score_range','segment_train_percentage','segment_validation_percentage','difference', 261 | 'variance','ln_variance','stability_index'] 262 | 263 | 
while flag < 1.0:
264 |         temp = {}
265 | 
266 |         score_range = '['+str(flag)+','+str(flag + step)+')'
267 |         segment_train_cnt = proba_train[(proba_train >= flag) & (proba_train < flag + step)].count()
268 |         segment_train_percentage = segment_train_cnt*1.0/len_train
269 |         segment_validation_cnt = proba_validation[(proba_validation >= flag) & (proba_validation < flag + step)].count()
270 |         segment_validation_percentage = segment_validation_cnt * 1.0 / len_validation
271 |         difference = segment_validation_percentage - segment_train_percentage
272 |         variance = float(segment_validation_percentage)/segment_train_percentage
273 |         ln_variance = np.log(variance)  # the PSI formula uses the natural log of the ratio
274 |         stability_index = difference * ln_variance
275 | 
276 |         temp['score_range'] = score_range
277 |         temp['segment_train_percentage'] = segment_train_percentage
278 |         temp['segment_validation_percentage'] = segment_validation_percentage
279 |         temp['difference'] = difference
280 |         temp['variance'] = variance
281 |         temp['ln_variance'] = ln_variance
282 |         temp['stability_index'] = stability_index
283 | 
284 |         model_stability.append(temp)
285 |         flag += step
286 | 
287 |     model_stability = pd.DataFrame(model_stability,columns=columns)
288 |     if out_path:
289 |         file_name = out_path if isinstance(out_path, str) else None
290 |         model_stability.to_csv(file_name, index=False)
291 | 
292 |     return model_stability
293 | 
294 | def eval_feature_stability(civ_list, df_train, df_validation,candidate_var_list,out_path=False):
295 |     '''
296 |     :param civ_list: List of InfoValue Class instances
297 |     :param df_train: DataFrame of training dataset
298 |     :param df_validation: DataFrame of validation dataset
299 |     :param candidate_var_list: the list of model input variable
300 |     :param out_path: specify the Dataframe to csv file path ,default False
301 |     :return: DataFrame about features stability
302 |     '''
303 |     psi_dict = {}
304 | 
305 |     civ_var_list = [civ_list[i].var_name for i in range(len(civ_list))]
306 |     intersection = list(set(civ_var_list).intersection(set(candidate_var_list)))
307 |     civ_idx_list = [civ_var_list.index(var) for var in intersection]
308 | 
309 |     len_train = len(df_train)
310 |     len_validation = len(df_validation)
311 | 
312 |     psi_dict['feature_name'] = []
313 |     psi_dict['group'] = []
314 |     psi_dict['segment_train_cnt'] = []
315 |     psi_dict['segment_train_percentage'] = []
316 |     psi_dict['segment_validation_cnt'] = []
317 |     psi_dict['segment_validation_percentage'] = []
318 | 
319 |     for i in civ_idx_list:
320 |         if civ_list[i].is_discrete:
321 |             for j in range(len(civ_list[i].split_list)):
322 |                 psi_dict['feature_name'].append(civ_list[i].var_name)
323 |                 psi_dict['group'].append(civ_list[i].split_list[j])
324 | 
325 |                 civ_split_list = civ_list[i].split_list[j]
326 |                 segment_train_cnt = 0
327 |                 for m in civ_split_list:
328 |                     segment_train_cnt += df_train[civ_list[i].var_name][df_train[civ_list[i].var_name] == m].count()
329 | 
330 |                 psi_dict['segment_train_cnt'].append(segment_train_cnt)
331 |                 psi_dict['segment_train_percentage'].append(float(segment_train_cnt)/len_train)
332 | 
333 |                 segment_validation_cnt = 0
334 |                 for m in civ_split_list:
335 |                     segment_validation_cnt += df_validation[civ_list[i].var_name][df_validation[civ_list[i].var_name] == m].count()
336 | 
337 |                 psi_dict['segment_validation_cnt'].append(segment_validation_cnt)
338 |                 psi_dict['segment_validation_percentage'].append(float(segment_validation_cnt)/len_validation)
339 | 
340 |         else:
341 |             split_list = []
342 |             split_list.append(float("-inf"))
343 |             split_list.extend([temp for temp in
civ_list[i].split_list]) 344 | split_list.append(float("inf")) 345 | var_name = civ_list[i].var_name 346 | 347 | for j in range(len(split_list)-3): 348 | psi_dict['feature_name'].append(civ_list[i].var_name) 349 | psi_dict['group'].append('('+str(split_list[j])+','+str(split_list[j+1])+']') 350 | 351 | segment_train_cnt = df_train[var_name][(df_train[var_name] > split_list[j])&(df_train[var_name] <= split_list[j+1])].count() 352 | 353 | psi_dict['segment_train_cnt'].append(segment_train_cnt) 354 | psi_dict['segment_train_percentage'].append(float(segment_train_cnt)/len_train) 355 | 356 | segment_validation_cnt = df_validation[var_name][(df_validation[var_name] > split_list[j])& 357 | (df_validation[var_name] <= split_list[j+1])].count() 358 | 359 | psi_dict['segment_validation_cnt'].append(segment_validation_cnt) 360 | psi_dict['segment_validation_percentage'].append(float(segment_validation_cnt)/len_validation) 361 | 362 | psi_dict['feature_name'].append(var_name) 363 | psi_dict['group'].append('(' + str(split_list[len(split_list)-2]) + ',+INF)') 364 | 365 | segment_train_cnt = df_train[var_name][df_train[var_name] > split_list[len(split_list)-1]].count() 366 | psi_dict['segment_train_cnt'].append(segment_train_cnt) 367 | psi_dict['segment_train_percentage'].append(float(segment_train_cnt) / len_train) 368 | 369 | segment_validation_cnt = df_validation[var_name][df_validation[var_name] > split_list[len(split_list)-1]].count() 370 | psi_dict['segment_validation_cnt'].append(segment_validation_cnt) 371 | psi_dict['segment_validation_percentage'].append(float(segment_validation_cnt) / len_validation) 372 | 373 | psi_dict['difference'] = pd.Series(psi_dict['segment_validation_percentage']) - pd.Series(psi_dict['segment_train_percentage']) 374 | psi_dict['variance'] = list(map(lambda x_y: x_y[0] / (x_y[1]+0.000000001), zip(psi_dict['segment_validation_percentage'], psi_dict['segment_train_percentage']))) 375 | psi_dict['Ln(variance)'] = np.log(np.array(psi_dict['variance'])+0.000000001) 376 | psi_dict['stability_index'] = np.array(psi_dict['difference']) * np.array(psi_dict['Ln(variance)']) 377 | 378 | columns = ['feature_name','group','segment_train_cnt','segment_train_percentage', 379 | 'segment_validation_cnt','segment_validation_percentage','difference', 380 | 'variance','Ln(variance)','stability_index'] 381 | 382 | psi_df = pd.DataFrame(psi_dict, columns=columns) 383 | if out_path: 384 | file_name = out_path if isinstance(out_path, str) else None 385 | psi_df.to_csv(file_name, index=False) 386 | 387 | return psi_df 388 | 389 | 390 | def plot_ks(proba,target,axistype='pct',out_path=False): 391 | """ 392 | plot k-s figure 393 | :param proba: 1-d array,prediction probability values 394 | :param target: 1-d array,the list of actual target value 395 | :param axistype: specify x axis :'axistype' must be either 'pct' (sample percent) or 'proba' (prediction probability) 396 | :param out_path: specify the file path to store ks plot figure,default False 397 | :return: DataFrame, figure summary 398 | """ 399 | assert axistype in ['pct','proba'] , "KS Plot TypeError: Attribute 'axistype' must be either 'pct' or 'proba' !" 
400 | 401 | a = pd.DataFrame(np.array([proba,target]).T,columns=['proba','target']) 402 | a.sort_values(by='proba',ascending=False,inplace=True) 403 | a['sum_Times']=a['target'].cumsum() 404 | total_1 = a['target'].sum() 405 | total_0 = len(a) - a['target'].sum() 406 | 407 | a['temp'] = 1 408 | a['Times']=a['temp'].cumsum() 409 | a['cdf1'] = a['sum_Times']/total_1 410 | a['cdf0'] = (a['Times'] - a['sum_Times'])/total_0 411 | a['ks'] = a['cdf1'] - a['cdf0'] 412 | a['percent'] = a['Times']*1.0/len(a) 413 | 414 | idx = np.argmax(a['ks']) 415 | # print(a.loc[idx]) 416 | 417 | if axistype == 'pct': 418 | ''' 419 | KS曲线,横轴为按照输出的概率值排序后的观察样本比例 420 | ''' 421 | plt.figure() 422 | plt.plot(a['percent'],a['cdf1'], label="CDF_positive") 423 | plt.plot(a['percent'],a['cdf0'],label="CDF_negative") 424 | plt.plot(a['percent'],a['ks'],label="K-S") 425 | 426 | sx = np.linspace(0,1,10) 427 | sy = sx 428 | plt.plot(sx,sy,linestyle='--',color='darkgrey',linewidth=1.2) 429 | 430 | plt.legend() 431 | plt.grid(True) 432 | ymin, ymax = plt.ylim() 433 | plt.xlabel('Sample percent') 434 | plt.ylabel('Cumulative probability') 435 | plt.title('Model Evaluation Index K-S') 436 | plt.axis('tight') 437 | 438 | # 虚线 439 | t = a.loc[idx]['percent'] 440 | yb = round(a.loc[idx]['cdf1'],4) 441 | yg = round(a.loc[idx]['cdf0'],4) 442 | 443 | plt.plot([t,t],[yb,yg], color ='red', linewidth=1.4, linestyle="--") 444 | plt.scatter([t,],[yb,], 20, color ='dodgerblue') 445 | plt.annotate(r'$recall_p=%s$' % round(a.loc[idx]['cdf1'],4), xy=(t, yb), xycoords='data', xytext=(+10, -5), 446 | textcoords='offset points', fontsize=8, 447 | arrowprops=dict(arrowstyle='->', connectionstyle="arc3,rad=.1")) 448 | 449 | plt.scatter([t,],[yg,], 20, color ='darkorange') 450 | plt.annotate(r'$recall_n=%s$' % round(a.loc[idx]['cdf0'],4), xy=(t, yg), xycoords='data', xytext=(+10, -10), 451 | textcoords='offset points', fontsize=8, 452 | arrowprops=dict(arrowstyle='->', connectionstyle="arc3,rad=.1")) 453 | # K-S曲线峰值 454 | plt.scatter([t,],[a.loc[idx]['ks'],], 20, color ='limegreen') 455 | plt.annotate(r'$ks=%s,p=%s$' % (round(a.loc[idx]['ks'],4) 456 | ,round(a.loc[idx]['proba'],4)) 457 | , xy=(a.loc[idx]['percent'], a.loc[idx]['ks']) 458 | , xycoords='data' 459 | , xytext=(+15, -15), 460 | textcoords='offset points' 461 | , fontsize=8 462 | ,arrowprops=dict(arrowstyle='->', connectionstyle="arc3,rad=.1")) 463 | plt.annotate(r'$percent=%s,cnt=%s$' % (round(a.loc[idx]['percent'],4) 464 | ,round(a.loc[idx]['Times'],0)) 465 | , xy=(a.loc[idx]['percent'], a.loc[idx]['ks']) 466 | , xycoords='data' 467 | , xytext=(+25, -25), 468 | textcoords='offset points' 469 | , fontsize=8 470 | ) 471 | 472 | else: 473 | ''' 474 | 改变横轴,横轴为模型输出的概率值 475 | ''' 476 | plt.figure() 477 | plt.grid(True) 478 | plt.plot(1-a['proba'],a['cdf1'], label="CDF_bad") 479 | plt.plot(1-a['proba'],a['cdf0'],label="CDF_good") 480 | plt.plot(1-a['proba'],a['ks'],label="ks") 481 | 482 | plt.legend() 483 | ymin, ymax = plt.ylim() 484 | plt.xlabel('1-[Predicted probability]') 485 | plt.ylabel('Cumulative probability') 486 | plt.title('Model Evaluation Index K-S') 487 | plt.axis('tight') 488 | plt.show() 489 | # 虚线 490 | t = 1 - a.loc[idx]['proba'] 491 | yb = round(a.loc[idx]['cdf1'],4) 492 | yg = round(a.loc[idx]['cdf0'],4) 493 | 494 | plt.plot([t,t],[yb,yg], color ='red', linewidth=1.4, linestyle="--") 495 | plt.scatter([t,],[yb,], 20, color ='dodgerblue') 496 | plt.annotate(r'$recall_p=%s$' % round(a.loc[idx]['cdf1'],4), xy=(t, yb), xycoords='data', xytext=(+10, -5), 497 | textcoords='offset 
points', fontsize=8, 498 | arrowprops=dict(arrowstyle='->', connectionstyle="arc3,rad=.1")) 499 | 500 | plt.scatter([t,],[yg,], 20, color ='darkorange') 501 | plt.annotate(r'$recall_n=%s$' % round(a.loc[idx]['cdf0'],4), xy=(t, yg), xycoords='data', xytext=(+10, -10), 502 | textcoords='offset points', fontsize=8, 503 | arrowprops=dict(arrowstyle='->', connectionstyle="arc3,rad=.1")) 504 | # K-S曲线峰值 505 | plt.scatter([t,],[a.loc[idx]['ks'],], 20, color ='limegreen') 506 | plt.annotate(r'$ks=%s,p=%s$' % (round(a.loc[idx]['ks'],4) 507 | ,round(a.loc[idx]['proba'],4)) 508 | , xy=(t, a.loc[idx]['ks']) 509 | , xycoords='data' 510 | , xytext=(+15, -15), 511 | textcoords='offset points' 512 | , fontsize=8 513 | ,arrowprops=dict(arrowstyle='->', connectionstyle="arc3,rad=.1")) 514 | plt.annotate(r'$percent=%s,cnt=%s$' % (round(a.loc[idx]['percent'],4) 515 | ,round(a.loc[idx]['Times'],0)) 516 | , xy=(t, a.loc[idx]['ks']) 517 | , xycoords='data' 518 | , xytext=(+25, -25), 519 | textcoords='offset points' 520 | , fontsize=8 521 | ) 522 | 523 | if out_path: 524 | file_name = out_path if isinstance(out_path, str) else None 525 | plt.savefig(file_name) 526 | else: 527 | plt.show() 528 | 529 | return a.loc[idx] 530 | 531 | 532 | def proc_validattion(dataset_path,config_path,model_path): 533 | print('####PROC VALIDATION#####') 534 | print('dataset_path:\n',dataset_path) 535 | print('config_path:\n',config_path) 536 | print('model_path:\n',model_path) 537 | #fillna 538 | config_path = r'E:\Code\Python_ML_Code\cs_model\config\config_cs_model.csv' 539 | cfg = config.config() 540 | cfg.load_file(config_path, dataset_path) 541 | 542 | for var in [tmp for tmp in cfg.bin_var_list if tmp in list(cfg.dataset_train.columns)]: 543 | # fill null 544 | cfg.dataset_train.loc[cfg.dataset_train[var].isnull(), (var)] = 0 545 | 546 | for var in [tmp for tmp in cfg.discrete_var_list if tmp in list(cfg.dataset_train.columns)]: 547 | # fill null 548 | cfg.dataset_train.loc[cfg.dataset_train[var].isnull(), (var)] = 0 549 | 550 | output = open(model_path, 'rb') 551 | clf_model = pickle.load(output) 552 | output.close() 553 | 554 | clf = clf_model['clf'] 555 | X_test = cfg.dataset_train[clf_model['features_list']] 556 | y_test = cfg.dataset_train['target'] 557 | 558 | y_hat = clf.predict_proba(X_test)[:,1] 559 | ks = compute_ks(y_hat,y_test) 560 | print('global_bt:',cfg.global_bt) 561 | print('global_gt:', cfg.global_gt) 562 | print('ks:',ks) 563 | return ks 564 | 565 | 566 | def proc_cor_eval(dataset_path,config_path,var_list_specfied,out_file_path): 567 | dataset = pd.read_csv(dataset_path) 568 | cfg = pd.read_csv(config_path) 569 | candidate_var_list = cfg[cfg['is_modelfeature'] == 1]['var_name'] 570 | 571 | b = [var for var in dataset.columns if sum(dataset[var].isnull()) == 0] 572 | candidate_var_list = list(set(candidate_var_list).intersection(set(b))) 573 | 574 | if var_list_specfied.__len__()>0: 575 | candidate_var_list = list(set(candidate_var_list).intersection(set(var_list_specfied))) 576 | 577 | print('candidate_var_list length:\n',candidate_var_list.__len__()) 578 | print('candidate_var_list:\n',candidate_var_list) 579 | 580 | cor = np.corrcoef(dataset[candidate_var_list].values,rowvar=0) 581 | pd.DataFrame(cor,columns=candidate_var_list).to_csv(out_file_path,index=False) -------------------------------------------------------------------------------- /build/lib/woe/feature_process.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | __author__ = 
'boredbird' 3 | import numpy as np 4 | import woe.config as config 5 | import woe.eval as eval 6 | import copy 7 | import pickle 8 | import time 9 | 10 | class node: 11 | '''Tree Node Class 12 | ''' 13 | def __init__(self,var_name=None,iv=0,split_point=None,right=None,left=None): 14 | self.var_name = var_name # The column index value of the attributes that are used to split data sets 15 | self.iv = iv # The info value of the node 16 | self.split_point = split_point # Store split points list 17 | self.right = right # Right sub tree 18 | self.left = left # Left sub tree 19 | 20 | 21 | class InfoValue(object): 22 | ''' 23 | InfoValue Class 24 | ''' 25 | def __init__(self): 26 | self.var_name = [] 27 | self.split_list = [] 28 | self.iv = 0 29 | self.woe_list = [] 30 | self.iv_list = [] 31 | self.is_discrete = 0 32 | self.sub_total_sample_num = [] 33 | self.positive_sample_num = [] 34 | self.negative_sample_num = [] 35 | self.sub_total_num_percentage = [] 36 | self.positive_rate_in_sub_total = [] 37 | self.negative_rate_in_sub_total = [] 38 | 39 | def init(self,civ): 40 | self.var_name = civ.var_name 41 | self.split_list = civ.split_list 42 | self.iv = civ.iv 43 | self.woe_list = civ.woe_list 44 | self.iv_list = civ.iv_list 45 | self.is_discrete = civ.is_discrete 46 | self.sub_total_sample_num = civ.sub_total_sample_num 47 | self.positive_sample_num = civ.positive_sample_num 48 | self.negative_sample_num = civ.negative_sample_num 49 | self.sub_total_num_percentage = civ.sub_total_num_percentage 50 | self.positive_rate_in_sub_total = civ.positive_rate_in_sub_total 51 | self.negative_rate_in_sub_total = civ.negative_rate_in_sub_total 52 | 53 | 54 | class DisInfoValue(object): 55 | ''' 56 | A Class for the storage of discrete variables transformation information 57 | ''' 58 | def __init__(self): 59 | self.var_name = None 60 | self.origin_value = [] 61 | self.woe_before = [] 62 | 63 | 64 | def change_feature_dtype(df,variable_type): 65 | ''' 66 | change feature data type by the variable_type DataFrame 67 | :param df: dataset DataFrame 68 | :param variable_type: the DataFrame about variables dtypes 69 | :return: None 70 | ''' 71 | s = 'Changing Feature Dtypes' 72 | print(s.center(60,'-')) 73 | for vname in df.columns: 74 | try: 75 | df[vname] = df[vname].astype(variable_type.loc[vname,'v_type']) 76 | print(vname,' '*(40-len(vname)),'{0: >10}'.format(variable_type.loc[vname,'v_type'])) 77 | except Exception: 78 | print('[error]',vname) 79 | print('[original dtype] ',df.dtypes[vname],' [astype] ',variable_type.loc[vname,'v_type']) 80 | print('[unique value]',np.unique(df[vname])) 81 | 82 | s = 'Variable Dtypes Have Been Specified' 83 | print(s.center(60,'-')) 84 | 85 | return 86 | 87 | def check_point(df,var,split,min_sample): 88 | """ 89 | Check whether the segmentation points cause some packet samples to be too small; 90 | If there is a packet sample size of less than 5% of the total sample size, 91 | then merge with the adjacent packet until more than 5%; 92 | Applies only to continuous values 93 | :param df: Dataset DataFrame 94 | :param var: Variables list 95 | :param split: Split points list 96 | :param min_sample: Minimum packet sample size 97 | :return: The split points list checked out 98 | """ 99 | new_split = [] 100 | if split is not None and split.__len__()>0: 101 | # print('run into if line:98') 102 | new_split.append(split[0]) 103 | # print(new_split) 104 | # Try the left section of the first split point partition; 105 | # If not meet the conditions then the split point will be removed 
106 |         pdf = df[df[var] <= split[0]]
107 |         if (pdf.shape[0] < min_sample) or (len(np.unique(pdf['target']))<=1):
108 |             # print('run into if line:105')
109 |             new_split.pop()
110 |             # print(new_split)
111 |         for i in range(0,split.__len__()-1):
112 |             pdf = df[(df[var] > split[i]) & (df[var] <= split[i+1])]
113 |             if (pdf.shape[0] < min_sample) or (np.unique(pdf['target']).__len__()<=1):
114 |                 # print('run into if line:112')
115 |                 continue
116 |             else:
117 |                 # print('run into if line:115')
118 |                 new_split.append(split[i+1])
119 |                 # print(new_split)
120 | 
121 |         # If the remaining sample is too small then remove the last split point
122 |         # print(new_split)
123 |         # print(new_split.__len__())
124 |         if new_split.__len__()>1 and len(df[df[var] >= new_split[new_split.__len__()-1]]) < min_sample:
125 |             new_split.pop()
126 |             # print(new_split)
127 | 
128 |         # If the remaining samples contain only one class of target then remove the last split point
129 |         if new_split.__len__()>1 and np.unique(df[df[var] >= new_split[new_split.__len__()-1]]['target']).__len__()<=1:
130 |             # print(split)
131 |             # print(split[split.__len__()-1])
132 |             # print(df[df[var] >= new_split[new_split.__len__()-1]].shape)
133 |             # print(np.unique(df[df[new_split] > new_split[new_split.__len__()-1]]['target']))
134 |             # print('run into if line:125')
135 |             new_split.pop()
136 |             # print(new_split)
137 |         # If the split list has only one value, and no smaller than this value
138 |         if new_split == []:
139 |             new_split = split
140 |     else:
141 |         pass
142 |     return new_split
143 | 
144 | def calulate_iv(df,var,global_bt,global_gt):
145 |     '''
146 |     calculate the iv and woe value without split
147 |     :param df:
148 |     :param var:
149 |     :param global_bt:
150 |     :param global_gt:
151 |     :return:
152 |     '''
153 |     # a = df.groupby(['target']).count()
154 |     groupdetail = {}
155 |     bt_sub = sum(df['target'])
156 |     bri = (bt_sub + 0.0001)* 1.0 / global_bt
157 |     gt_sub = df.shape[0] - bt_sub
158 |     gri = (gt_sub + 0.0001)* 1.0 / global_gt
159 | 
160 |     groupdetail['woei'] = np.log(bri / gri)
161 |     groupdetail['ivi'] = (bri - gri) * np.log(bri / gri)
162 |     groupdetail['sub_total_num_percentage'] = df.shape[0]*1.0/(global_bt+global_gt)
163 |     groupdetail['positive_sample_num'] = bt_sub
164 |     groupdetail['negative_sample_num'] = gt_sub
165 |     groupdetail['positive_rate_in_sub_total'] = bt_sub*1.0/df.shape[0]
166 |     groupdetail['negative_rate_in_sub_total'] = gt_sub*1.0/df.shape[0]
167 | 
168 |     return groupdetail
169 | 
170 | 
171 | def calculate_iv_split(df,var,split_point,global_bt,global_gt):
172 |     """
173 |     calculate the iv value with the specified split point
174 |     note:
175 |         the dataset should contain a 'target' column (to be encapsulated as a parameter if time permits)
176 |     :return:
177 |     """
178 |     #split dataset
179 |     dataset_r = df[df.loc[:,var] > split_point][[var,'target']]
180 |     dataset_l = df[df.loc[:,var] <= split_point][[var,'target']]
181 | 
182 |     r1_cnt = sum(dataset_r['target'])
183 |     r0_cnt = dataset_r.shape[0] - r1_cnt
184 | 
185 |     l1_cnt = sum(dataset_l['target'])
186 |     l0_cnt = dataset_l.shape[0] - l1_cnt
187 | 
188 |     if r0_cnt == 0 or r1_cnt == 0 or l0_cnt == 0 or l1_cnt ==0:
189 |         return 0,0,0,dataset_l,dataset_r,0,0
190 | 
191 |     lbr = (l1_cnt+ 0.0001)*1.0/global_bt
192 |     lgr = (l0_cnt+ 0.0001)*1.0/global_gt
193 |     woel = np.log(lbr/lgr)
194 |     ivl = (lbr-lgr)*woel
195 |     rbr = (r1_cnt+ 0.0001)*1.0/global_bt
196 |     rgr = (r0_cnt+ 0.0001)*1.0/global_gt
197 |     woer = np.log(rbr/rgr)
198 |     ivr = (rbr-rgr)*woer
199 |     iv = ivl+ivr
200 | 
201 |     return woel,woer,iv,dataset_l,dataset_r,ivl,ivr
202 | 
203 | 
204 | def binning_data_split(df,var,global_bt,global_gt,min_sample,alpha=0.01):
205 |     """
206 |     Specify the data split level and return the split value list
207 |     :return:
208 |     """
209 | 
iv_var = InfoValue() 210 | # Calculates the IV of the current node before splitted 211 | gd = calulate_iv(df, var,global_bt,global_gt) 212 | 213 | woei, ivi = gd['woei'],gd['ivi'] 214 | 215 | if np.unique(df[var]).__len__() <=8: 216 | # print('running into if') 217 | split = list(np.unique(df[var])) 218 | split.sort() 219 | # print('split:',split) 220 | #Segmentation point checking and processing 221 | split = check_point(df, var, split, min_sample) 222 | split.sort() 223 | # print('after check:',split) 224 | iv_var.split_list = split 225 | return node(split_point=split,iv=ivi) 226 | 227 | percent_value = list(np.unique(np.percentile(df[var], range(100)))) 228 | percent_value.sort() 229 | 230 | if percent_value.__len__() <=2: 231 | iv_var.split_list = list(np.unique(percent_value)).sort() 232 | return node(split_point=percent_value,iv=ivi) 233 | 234 | # A sentry that attempts to split the current node 235 | # Init bestSplit_iv with zero 236 | bestSplit_iv = 0 237 | bestSplit_woel = [] 238 | bestSplit_woer = [] 239 | bestSplit_ivl = 0 240 | bestSplit_ivr = 0 241 | bestSplit_point = [] 242 | 243 | #remove max value and min value in case dataset_r or dataset_l will be null 244 | for point in percent_value[0:percent_value.__len__()-1]: 245 | # If there is only a sample or a negative sample, skip 246 | if set(df[df[var] > point]['target']).__len__() == 1 or set(df[df[var] <= point]['target']).__len__() == 1 \ 247 | or df[df[var] > point].shape[0] < min_sample or df[df[var] <= point].shape[0] < min_sample : 248 | continue 249 | 250 | woel, woer, iv, dataset_l, dataset_r, ivl, ivr = calculate_iv_split(df,var,point,global_bt,global_gt) 251 | 252 | if iv > bestSplit_iv: 253 | bestSplit_woel = woel 254 | bestSplit_woer = woer 255 | bestSplit_iv = iv 256 | bestSplit_point = point 257 | bestSplit_dataset_r = dataset_r 258 | bestSplit_dataset_l = dataset_l 259 | bestSplit_ivl = ivl 260 | bestSplit_ivr = ivr 261 | 262 | # If the IV after division is greater than the IV value before the current segmentation, the segmentation is valid and recursive 263 | # specified step learning rate 0.01 264 | if bestSplit_iv > ivi*(1+alpha) and bestSplit_dataset_r.shape[0] > min_sample and bestSplit_dataset_l.shape[0] > min_sample: 265 | presplit_right = node() 266 | presplit_left = node() 267 | 268 | # Determine whether the right node satisfies the segmentation prerequisite 269 | if bestSplit_dataset_r.shape[0] < min_sample or set(bestSplit_dataset_r['target']).__len__() == 1: 270 | presplit_right.iv = bestSplit_ivr 271 | right = presplit_right 272 | else: 273 | right = binning_data_split(bestSplit_dataset_r,var,global_bt,global_gt,min_sample,alpha=0.01) 274 | 275 | # Determine whether the left node satisfies the segmentation prerequisite 276 | if bestSplit_dataset_l.shape[0] < min_sample or np.unique(bestSplit_dataset_l['target']).__len__() == 1: 277 | presplit_left.iv = bestSplit_ivl 278 | left = presplit_left 279 | else: 280 | left = binning_data_split(bestSplit_dataset_l,var,global_bt,global_gt,min_sample,alpha=0.01) 281 | 282 | return node(var_name=var,split_point=bestSplit_point,iv=ivi,left=left,right=right) 283 | else: 284 | # Returns the current node as the final leaf node 285 | return node(var_name=var,iv=ivi) 286 | 287 | 288 | def search(tree,split_list): 289 | ''' 290 | search the tree node 291 | :param tree: a instance of Tree Node Class 292 | :return: split points list 293 | ''' 294 | if isinstance(tree.split_point, list): 295 | split_list.extend(tree.split_point) 296 | else: 297 | 
split_list.append(tree.split_point) 298 | 299 | if tree.left is not None: 300 | search(tree.left,split_list) 301 | 302 | if tree.right is not None: 303 | search(tree.right,split_list) 304 | 305 | return split_list 306 | 307 | 308 | def format_iv_split(df,var,split_list,global_bt,global_gt): 309 | ''' 310 | Given the dataset DataFrame and split points list then return a InfoValue instance; 311 | Just for continuous variable 312 | :param df: 313 | :param var: 314 | :param split_list: 315 | :param global_bt: 316 | :param global_gt: 317 | :return: 318 | ''' 319 | civ = InfoValue() 320 | civ.var_name = var 321 | civ.split_list = split_list 322 | dfcp = df[:] 323 | 324 | civ.sub_total_sample_num = [] 325 | civ.positive_sample_num = [] 326 | civ.negative_sample_num = [] 327 | civ.sub_total_num_percentage = [] 328 | civ.positive_rate_in_sub_total = [] 329 | 330 | for i in range(0, split_list.__len__()): 331 | dfi = dfcp[dfcp[var] <= split_list[i]] 332 | dfcp = dfcp[dfcp[var] > split_list[i]] 333 | gd = calulate_iv(dfi, var,global_bt,global_gt) 334 | woei, ivi = gd['woei'],gd['ivi'] 335 | civ.woe_list.append(woei) 336 | civ.iv_list.append(ivi) 337 | civ.sub_total_sample_num.append(dfi.shape[0]) 338 | civ.positive_sample_num.append(gd['positive_sample_num']) 339 | civ.negative_sample_num.append(gd['negative_sample_num']) 340 | civ.sub_total_num_percentage.append(gd['sub_total_num_percentage']) 341 | civ.positive_rate_in_sub_total.append(gd['positive_rate_in_sub_total']) 342 | civ.negative_rate_in_sub_total.append(gd['negative_rate_in_sub_total']) 343 | 344 | if dfcp.shape[0]>0: 345 | gd = calulate_iv(dfcp, var,global_bt,global_gt) 346 | woei, ivi = gd['woei'],gd['ivi'] 347 | civ.woe_list.append(woei) 348 | civ.iv_list.append(ivi) 349 | civ.sub_total_sample_num.append(dfcp.shape[0]) 350 | civ.positive_sample_num.append(gd['positive_sample_num']) 351 | civ.negative_sample_num.append(gd['negative_sample_num']) 352 | civ.sub_total_num_percentage.append(gd['sub_total_num_percentage']) 353 | civ.positive_rate_in_sub_total.append(gd['positive_rate_in_sub_total']) 354 | civ.negative_rate_in_sub_total.append(gd['negative_rate_in_sub_total']) 355 | 356 | civ.iv = sum(civ.iv_list) 357 | return civ 358 | 359 | 360 | def woe_trans(dvar,civ): 361 | # replace the var value with the given woe value 362 | var = copy.deepcopy(dvar) 363 | if not civ.is_discrete: 364 | if civ.woe_list.__len__()>1: 365 | split_list = [] 366 | split_list.append(float("-inf")) 367 | split_list.extend([i for i in civ.split_list]) 368 | split_list.append(float("inf")) 369 | 370 | for i in range(civ.woe_list.__len__()): 371 | var[(dvar > split_list[i]) & (dvar <= split_list[i+1])] = civ.woe_list[i] 372 | else: 373 | var[:] = civ.woe_list[0] 374 | else: 375 | split_map = {} 376 | for i in range(civ.split_list.__len__()): 377 | for j in range(civ.split_list[i].__len__()): 378 | split_map[civ.split_list[i][j]] = civ.woe_list[i] 379 | 380 | var = var.map(split_map) 381 | 382 | return var 383 | 384 | def proc_woe_discrete(df,var,global_bt,global_gt,min_sample,alpha=0.01): 385 | ''' 386 | process woe transformation of discrete variables 387 | :param df: 388 | :param var: 389 | :param global_bt: 390 | :param global_gt: 391 | :param min_sample: 392 | :return: 393 | ''' 394 | s = 'process discrete variable:'+str(var) 395 | print(s.center(60, '-')) 396 | 397 | df = df[[var,'target']] 398 | div = DisInfoValue() 399 | div.var_name = var 400 | rdict = {} 401 | cpvar = df[var] 402 | # print('np.unique(df[var]):',np.unique(df[var])) 403 | for var_value in 
np.unique(df[var]): 404 | # Here come with a '==',in case type error you must do Nan filling process firstly 405 | df_temp = df[df[var] == var_value] 406 | gd = calulate_iv(df_temp,var,global_bt,global_gt) 407 | woei, ivi = gd['woei'],gd['ivi'] 408 | div.origin_value.append(var_value) 409 | div.woe_before.append(woei) 410 | rdict[var_value] = woei 411 | # print(var_value,woei,ivi) 412 | 413 | cpvar = cpvar.map(rdict) 414 | df[var] = cpvar 415 | 416 | iv_tree = binning_data_split(df,var,global_bt,global_gt,min_sample,alpha) 417 | 418 | # Traversal tree, get the segmentation point 419 | split_list = [] 420 | search(iv_tree, split_list) 421 | split_list = list(np.unique([1.0 * x for x in split_list if x is not None])) 422 | split_list.sort() 423 | 424 | # Segmentation point checking and processing 425 | split_list = check_point(df, var, split_list, min_sample) 426 | split_list.sort() 427 | 428 | civ = format_iv_split(df, var, split_list,global_bt,global_gt) 429 | civ.is_discrete = 1 430 | 431 | split_list_temp = [] 432 | split_list_temp.append(float("-inf")) 433 | split_list_temp.extend([i for i in split_list]) 434 | split_list_temp.append(float("inf")) 435 | 436 | a = [] 437 | for i in range(split_list_temp.__len__() - 1): 438 | temp = [] 439 | for j in range(div.origin_value.__len__()): 440 | if (div.woe_before[j]>split_list_temp[i]) & (div.woe_before[j]<=split_list_temp[i+1]): 441 | temp.append(div.origin_value[j]) 442 | 443 | if temp != [] : 444 | a.append(temp) 445 | 446 | civ.split_list = a 447 | 448 | return civ 449 | 450 | 451 | def proc_woe_continuous(df,var,global_bt,global_gt,min_sample,alpha=0.01): 452 | ''' 453 | process woe transformation of discrete variables 454 | :param df: 455 | :param var: 456 | :param global_bt: 457 | :param global_gt: 458 | :param min_sample: 459 | :return: 460 | ''' 461 | s = 'process continuous variable:'+str(var) 462 | print(s.center(60, '-')) 463 | df = df[[var,'target']] 464 | iv_tree = binning_data_split(df, var,global_bt,global_gt,min_sample,alpha) 465 | 466 | # Traversal tree, get the segmentation point 467 | split_list = [] 468 | search(iv_tree, split_list) 469 | split_list = list(np.unique([1.0 * x for x in split_list if x is not None])) 470 | split_list.sort() 471 | 472 | # Segmentation point checking and processing 473 | split_list = check_point(df, var, split_list, min_sample) 474 | split_list.sort() 475 | 476 | civ = format_iv_split(df, var,split_list,global_bt,global_gt) 477 | 478 | return civ 479 | 480 | def fillna(dataset,bin_var_list,discrete_var_list,continuous_filler=-1,discrete_filler='missing'): 481 | """ 482 | fill the null value in the dataframe inpalce 483 | :param dataset: input dataset ,pandas.DataFrame type 484 | :param bin_var_list: continuous variables name list 485 | :param discrete_var_list: discretevvvv variables name list 486 | :param continuous_filler: the value to fill the null value in continuous variables 487 | :param discrete_filler: the value to fill the null value in discrete variables 488 | :return: null value,replace null value inplace 489 | """ 490 | for var in [tmp for tmp in bin_var_list if tmp in list(dataset.columns)]: 491 | # fill null 492 | dataset.loc[dataset[var].isnull(), (var)] = continuous_filler 493 | 494 | for var in [tmp for tmp in discrete_var_list if tmp in list(dataset.columns)]: 495 | # fill null 496 | dataset.loc[dataset[var].isnull(), (var)] = discrete_filler 497 | 498 | 499 | def process_train_woe(infile_path=None,outfile_path=None,rst_path=None,config_path=None): 500 | print('run into 
process_train_woe: \n',time.asctime(time.localtime(time.time()))) 501 | data_path = infile_path 502 | cfg = config.config() 503 | cfg.load_file(config_path,data_path) 504 | bin_var_list = [tmp for tmp in cfg.bin_var_list if tmp in list(cfg.dataset_train.columns)] 505 | 506 | for var in bin_var_list: 507 | # fill null 508 | cfg.dataset_train.loc[cfg.dataset_train[var].isnull(), (var)] = -1 509 | 510 | # change feature dtypes 511 | change_feature_dtype(cfg.dataset_train, cfg.variable_type) 512 | rst = [] 513 | 514 | # process woe transformation of continuous variables 515 | print('process woe transformation of continuous variables: \n',time.asctime(time.localtime(time.time()))) 516 | print('cfg.global_bt',cfg.global_bt) 517 | print('cfg.global_gt', cfg.global_gt) 518 | 519 | for var in bin_var_list: 520 | rst.append(proc_woe_continuous(cfg.dataset_train,var,cfg.global_bt,cfg.global_gt,cfg.min_sample,alpha=0.05)) 521 | 522 | # process woe transformation of discrete variables 523 | print('process woe transformation of discrete variables: \n',time.asctime(time.localtime(time.time()))) 524 | for var in [tmp for tmp in cfg.discrete_var_list if tmp in list(cfg.dataset_train.columns)]: 525 | # fill null 526 | cfg.dataset_train.loc[cfg.dataset_train[var].isnull(), (var)] = 'missing' 527 | rst.append(proc_woe_discrete(cfg.dataset_train,var,cfg.global_bt,cfg.global_gt,cfg.min_sample,alpha=0.05)) 528 | 529 | feature_detail = eval.eval_feature_detail(rst, outfile_path) 530 | 531 | print('save woe transformation rule into pickle: \n',time.asctime(time.localtime(time.time()))) 532 | output = open(rst_path, 'wb') 533 | pickle.dump(rst,output) 534 | output.close() 535 | 536 | return feature_detail,rst 537 | 538 | 539 | def process_woe_trans(in_data_path=None,rst_path=None,out_path=None,config_path=None): 540 | cfg = config.config() 541 | cfg.load_file(config_path, in_data_path) 542 | 543 | for var in [tmp for tmp in cfg.bin_var_list if tmp in list(cfg.dataset_train.columns)]: 544 | # fill null 545 | cfg.dataset_train.loc[cfg.dataset_train[var].isnull(), (var)] = -1 546 | 547 | for var in [tmp for tmp in cfg.discrete_var_list if tmp in list(cfg.dataset_train.columns)]: 548 | # fill null 549 | cfg.dataset_train.loc[cfg.dataset_train[var].isnull(), (var)] = 'missing' 550 | 551 | change_feature_dtype(cfg.dataset_train, cfg.variable_type) 552 | 553 | output = open(rst_path, 'rb') 554 | rst = pickle.load(output) 555 | output.close() 556 | 557 | # Training dataset Woe Transformation 558 | for r in rst: 559 | cfg.dataset_train[r.var_name] = woe_trans(cfg.dataset_train[r.var_name], r) 560 | 561 | cfg.dataset_train.to_csv(out_path) 562 | -------------------------------------------------------------------------------- /build/lib/woe/ftrl.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | __author__ = 'boredbird' 3 | import numpy as np 4 | 5 | class LR(object): 6 | @staticmethod 7 | def fn(w, x): 8 | '''sigmoid function 9 | ''' 10 | return 1.0 / (1.0 + np.exp(-w.dot(x))) 11 | 12 | @staticmethod 13 | def loss(y, y_hat): 14 | '''Cross entropy loss function 15 | ''' 16 | return np.sum(np.nan_to_num(-y * np.log(y_hat) - (1 - y) * np.log(1 - y_hat))) 17 | 18 | @staticmethod 19 | def grad(y, y_hat, x): 20 | '''The first derivative of the cross entropy loss function to the weight W 21 | ''' 22 | return (y_hat - y) * x 23 | 24 | 25 | class FTRL(object): 26 | def __init__(self, dim, l1, l2, alpha, beta, decisionFunc=LR): 27 | self.dim = dim 28 | self.decisionFunc 
= decisionFunc 29 | self.z = np.zeros(dim) 30 | self.n = np.zeros(dim) 31 | self.w = np.zeros(dim) 32 | self.w_list = [] 33 | self.loss_list = [] 34 | self.l1 = l1 35 | self.l2 = l2 36 | self.alpha = alpha 37 | self.beta = beta 38 | 39 | def predict(self, x): 40 | return self.decisionFunc.fn(self.w, x) 41 | 42 | def update(self, x, y): 43 | self.w = np.array([0 if np.abs(self.z[i]) <= self.l1 else (np.sign( 44 | self.z[i]) * self.l1 - self.z[i]) / (self.l2 + (self.beta + np.sqrt(self.n[i])) / self.alpha) for i in xrange(self.dim)]) 45 | y_hat = self.predict(x) 46 | g = self.decisionFunc.grad(y, y_hat, x) 47 | sigma = (np.sqrt(self.n + g * g) - np.sqrt(self.n)) / self.alpha 48 | self.z += g - sigma * self.w 49 | self.n += g * g 50 | return self.decisionFunc.loss(y, y_hat) 51 | 52 | def train(self, trainSet, verbos=False, max_itr=10000000000, eta=0.01, epochs=100): 53 | itr = 0 54 | n = 0 55 | while True: 56 | for x, y in trainSet: 57 | loss = self.update(x, y) 58 | if verbos and n%verbos==0: 59 | print("itr=" + str(n) + "\tloss=" + str(loss)) 60 | self.w_list.append(self.w) 61 | self.loss_list.append(loss) 62 | if loss < eta: 63 | itr += 1 64 | else: 65 | itr = 0 66 | if itr >= epochs: # when the loss function has been continuously epochs iterations less than eta 67 | print("loss have less than", eta, " continuously for ", itr, "iterations") 68 | return 69 | n += 1 70 | if n >= max_itr: 71 | print("reach max iteration", max_itr) 72 | return -------------------------------------------------------------------------------- /dist/woe-0.1.4-py2-none-any.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/boredbird/woe/335e9ec2a521d3bbccb0ad5d915128119e4d0ca6/dist/woe-0.1.4-py2-none-any.tar.gz -------------------------------------------------------------------------------- /dist/woe-0.1.4-py2-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/boredbird/woe/335e9ec2a521d3bbccb0ad5d915128119e4d0ca6/dist/woe-0.1.4-py2-none-any.whl -------------------------------------------------------------------------------- /dist/woe-0.1.4-py2.7.egg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/boredbird/woe/335e9ec2a521d3bbccb0ad5d915128119e4d0ca6/dist/woe-0.1.4-py2.7.egg -------------------------------------------------------------------------------- /dist/woe-0.1.4-py3-none-any.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/boredbird/woe/335e9ec2a521d3bbccb0ad5d915128119e4d0ca6/dist/woe-0.1.4-py3-none-any.tar.gz -------------------------------------------------------------------------------- /dist/woe-0.1.4-py3-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/boredbird/woe/335e9ec2a521d3bbccb0ad5d915128119e4d0ca6/dist/woe-0.1.4-py3-none-any.whl -------------------------------------------------------------------------------- /dist/woe-0.1.4.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/boredbird/woe/335e9ec2a521d3bbccb0ad5d915128119e4d0ca6/dist/woe-0.1.4.tar.gz -------------------------------------------------------------------------------- /examples/HereWeGo.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 
-*- 2 | __author__ = 'boredbird' 3 | import os 4 | import numpy as np 5 | import woe.feature_process as fp 6 | import woe.GridSearch as gs 7 | 8 | if __name__ == '__main__': 9 | config_path = os.getcwd()+'\\config.csv' 10 | data_path = os.getcwd()+'\\UCI_Credit_Card.csv' 11 | feature_detail_path = os.getcwd()+'\\features_detail.csv' 12 | rst_pkl_path = os.getcwd()+'\\woe_rule.pkl' 13 | # train woe rule 14 | feature_detail,rst = fp.process_train_woe(infile_path=data_path 15 | ,outfile_path=feature_detail_path 16 | ,rst_path=rst_pkl_path 17 | ,config_path=config_path) 18 | # proc woe transformation 19 | woe_train_path = os.getcwd()+'\\dataset_train_woed.csv' 20 | fp.process_woe_trans(data_path,rst_pkl_path,woe_train_path,config_path) 21 | # here i take the same dataset as test dataset 22 | woe_test_path = os.getcwd()+'\\dataset_test_woed.csv' 23 | fp.process_woe_trans(data_path,rst_pkl_path,woe_test_path,config_path) 24 | 25 | print('###TRAIN SCORECARD MODEL###') 26 | params = {} 27 | params['dataset_path'] = woe_train_path 28 | params['validation_path'] = woe_test_path 29 | params['config_path'] = config_path 30 | 31 | params['df_coef_path'] = os.getcwd()+'\\df_model_coef_path.csv' 32 | params['pic_coefpath'] = os.getcwd()+'\\model_coefpath.png' 33 | params['pic_performance'] = os.getcwd()+'\\model_performance_path.png' 34 | params['pic_coefpath_title'] = 'model_coefpath' 35 | params['pic_performance_title'] = 'model_performance_path' 36 | 37 | params['var_list_specfied'] = [] 38 | params['cs'] = np.logspace(-4, -1,40) 39 | for key,value in params.items(): 40 | print(key,': ',value) 41 | gs.grid_search_lr_c_main(params) 42 | -------------------------------------------------------------------------------- /examples/README.rst: -------------------------------------------------------------------------------- 1 | Dataset Information 2 | =================== 3 | 4 | This dataset contains information on default payments, demographic factors, credit data, history of payment, and bill statements of credit card clients in Taiwan from April 2005 to September 2005. 5 | 6 | **YOU SHOULD SPECIFY THE VARIABLES DTYPES WITH config.csv** 7 | 8 | Appointment: 9 | 10 | continuous variables: is_tobe_bin=1 and is_candidate=1 11 | 12 | discrete variables: is_tobe_bin=0 and is_candidate=1 13 | 14 | Content 15 | ======= 16 | 17 | There are 25 variables: 18 | 19 | * ID: ID of each client 20 | * LIMIT_BAL: Amount of given credit in NT dollars (includes individual and family/supplementary credit 21 | * SEX: Gender (1=male, 2=female) 22 | * EDUCATION: (1=graduate school, 2=university, 3=high school, 4=others, 5=unknown, 6=unknown) 23 | * MARRIAGE: Marital status (1=married, 2=single, 3=others) 24 | * AGE: Age in years 25 | * PAY_0: Repayment status in September, 2005 (-1=pay duly, 1=payment delay for one month, 2=payment delay for two months, ... 
8=payment delay for eight months, 9=payment delay for nine months and above) 26 | * PAY_2: Repayment status in August, 2005 (scale same as above) 27 | * PAY_3: Repayment status in July, 2005 (scale same as above) 28 | * PAY_4: Repayment status in June, 2005 (scale same as above) 29 | * PAY_5: Repayment status in May, 2005 (scale same as above) 30 | * PAY_6: Repayment status in April, 2005 (scale same as above) 31 | * BILL_AMT1: Amount of bill statement in September, 2005 (NT dollar) 32 | * BILL_AMT2: Amount of bill statement in August, 2005 (NT dollar) 33 | * BILL_AMT3: Amount of bill statement in July, 2005 (NT dollar) 34 | * BILL_AMT4: Amount of bill statement in June, 2005 (NT dollar) 35 | * BILL_AMT5: Amount of bill statement in May, 2005 (NT dollar) 36 | * BILL_AMT6: Amount of bill statement in April, 2005 (NT dollar) 37 | * PAY_AMT1: Amount of previous payment in September, 2005 (NT dollar) 38 | * PAY_AMT2: Amount of previous payment in August, 2005 (NT dollar) 39 | * PAY_AMT3: Amount of previous payment in July, 2005 (NT dollar) 40 | * PAY_AMT4: Amount of previous payment in June, 2005 (NT dollar) 41 | * PAY_AMT5: Amount of previous payment in May, 2005 (NT dollar) 42 | * PAY_AMT6: Amount of previous payment in April, 2005 (NT dollar) 43 | * default.payment.next.month: Default payment (1=yes, 0=no) 44 | -------------------------------------------------------------------------------- /examples/config.csv: -------------------------------------------------------------------------------- 1 | var_name,var_dtype,is_tobe_bin,is_candidate,is_modelfeature 2 | ID,object,0,0,0 3 | LIMIT_BAL,int64,1,1,1 4 | SEX,object,0,1,1 5 | EDUCATION,object,0,1,1 6 | MARRIAGE,object,0,1,1 7 | AGE,int64,1,1,1 8 | PAY_0,int64,1,1,1 9 | PAY_2,int64,1,1,1 10 | PAY_3,int64,1,1,1 11 | PAY_4,int64,1,1,1 12 | PAY_5,int64,1,1,1 13 | PAY_6,int64,1,1,1 14 | BILL_AMT1,int64,1,1,1 15 | BILL_AMT2,int64,1,1,1 16 | BILL_AMT3,int64,1,1,1 17 | BILL_AMT4,int64,1,1,1 18 | BILL_AMT5,int64,1,1,1 19 | BILL_AMT6,int64,1,1,1 20 | PAY_AMT1,int64,1,1,1 21 | PAY_AMT2,int64,1,1,1 22 | PAY_AMT3,int64,1,1,1 23 | PAY_AMT4,int64,1,1,1 24 | PAY_AMT5,int64,1,1,1 25 | PAY_AMT6,int64,1,1,1 26 | target,int64,0,0,0 27 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | __author__ = 'boredbird' 3 | 4 | from setuptools import setup, find_packages 5 | 6 | setup( 7 | name = 'woe', 8 | version = '0.1.4', 9 | description = ( 10 | 'Tools for WoE Transformation mostly used in ScoreCard Model for credit rating' 11 | ), 12 | long_description = open('README.rst').read(), 13 | author = 'boredbird', 14 | author_email = '1002937942@qq.com', 15 | maintainer = 'boredbird', 16 | maintainer_email = '1002937942@qq.com', 17 | license = 'MIT', 18 | packages = ['woe'], 19 | platforms = ["all"], 20 | url = 'https://github.com/boredbird/woe', 21 | classifiers = [ 22 | 'Operating System :: OS Independent', 23 | 'Intended Audience :: Developers', 24 | 'License :: OSI Approved :: MIT License', 25 | 'Programming Language :: Python', 26 | 'Programming Language :: Python :: Implementation', 27 | 'Programming Language :: Python :: 2', 28 | 'Programming Language :: Python :: 2.7', 29 | 'Programming Language :: Python :: 3', 30 | 'Programming Language :: Python :: 3.5', 31 | 'Topic :: Software Development :: Libraries' 32 | ], 33 | keywords = ["math","finance","scorecard","woe",'iv'], 34 | install_requires = [ 35 | 
'pandas>=0.19.2', 36 | 'numpy>=1.11.3', 37 | 'scipy>=0.18.1', 38 | 'matplotlib>=2.0.0', 39 | ] 40 | ) 41 | -------------------------------------------------------------------------------- /woe.egg-info/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 1.1 2 | Name: woe 3 | Version: 0.1.4 4 | Summary: Tools for WoE Transformation mostly used in ScoreCard Model for credit rating 5 | Home-page: https://github.com/boredbird/woe 6 | Author: boredbird 7 | Author-email: 1002937942@qq.com 8 | License: MIT 9 | Description: woe 10 | === 11 | 12 | .. image:: https://travis-ci.org/justdoit0823/pywxclient.svg?branch=master 13 | :target: https://travis-ci.org/justdoit0823/pywxclient 14 | 15 | version: 0.1.4 16 | 17 | Tools for WoE Transformation mostly used in ScoreCard Model for credit rating 18 | 19 | Installation 20 | -------------------------------- 21 | 22 | We can simply use pip to install, as the following: 23 | 24 | .. code-block:: bash 25 | 26 | $ pip install woe 27 | 28 | or installing from git 29 | 30 | .. code-block:: bash 31 | 32 | $ pip install git+https://github.com/boredbird/woe 33 | 34 | 35 | Features 36 | ======== 37 | 38 | * Split tree with IV criterion 39 | 40 | * Rich and plentiful model eval methods 41 | 42 | * Unified format and easy for output 43 | 44 | * Storage of IV tree for follow-up use 45 | 46 | 47 | 48 | **woe** module function tree 49 | ============================ 50 | 51 | :: 52 | 53 | |- __init__ 54 | |- config.py 55 | | |-- config 56 | | |-- __init__ 57 | | |-- change_config_var_dtype() 58 | | |-- load_file() 59 | |- eval.py 60 | | |-- compute_ks() 61 | | |-- eval_data_summary() 62 | | |-- eval_feature_detail() 63 | | |-- eval_feature_stability() 64 | | |-- eval_feature_summary() 65 | | |-- eval_model_stability() 66 | | |-- eval_model_summary() 67 | | |-- eval_segment_metrics() 68 | | |-- plot_ks() 69 | | |-- proc_cor_eval() 70 | | |-- proc_validation() 71 | | |-- wald_test() 72 | |- feature_process.py 73 | | |-- binning_data_split() 74 | | |-- calculate_iv_split() 75 | | |-- calulate_iv() 76 | | |-- change_feature_dtype() 77 | | |-- check_point() 78 | | |-- fillna() 79 | | |-- format_iv_split() 80 | | |-- proc_woe_continuous() 81 | | |-- proc_woe_discrete() 82 | | |-- process_train_woe() 83 | | |-- process_woe_trans() 84 | | |-- search() 85 | | |-- woe_trans() 86 | |- ftrl.py 87 | | |-- FTRL() 88 | | |-- LR() 89 | |- GridSearch.py 90 | | |-- fit_single_lr() 91 | | |-- grid_search_lr_c() 92 | | |-- grid_search_lr_c_main() 93 | | |-- grid_search_lr_validation() 94 | 95 | 96 | Examples 97 | ======== 98 | 99 | In the examples directory, there is a simple woe transformation program as tutorials. 100 | 101 | Or you can write a more complex program with this `woe` package. 
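For instance, a minimal end-to-end run, sketched after the examples/HereWeGo.py script shipped in the examples directory (the file paths below are illustrative placeholders, not fixed by the package), might look like:

.. code-block:: python

    import os
    import woe.feature_process as fp

    # illustrative paths, mirroring the files used in examples/HereWeGo.py
    config_path = os.path.join(os.getcwd(), 'config.csv')                   # variable dtypes and roles
    data_path = os.path.join(os.getcwd(), 'UCI_Credit_Card.csv')            # raw training data
    feature_detail_path = os.path.join(os.getcwd(), 'features_detail.csv')  # per-bin WoE/IV report
    rst_pkl_path = os.path.join(os.getcwd(), 'woe_rule.pkl')                # pickled split rules

    # learn the WoE split rules from the training data and dump them to a pickle file
    feature_detail, rst = fp.process_train_woe(infile_path=data_path,
                                               outfile_path=feature_detail_path,
                                               rst_path=rst_pkl_path,
                                               config_path=config_path)

    # apply the learned rules to a dataset and write the WoE-encoded result to csv
    woe_train_path = os.path.join(os.getcwd(), 'dataset_train_woed.csv')
    fp.process_woe_trans(data_path, rst_pkl_path, woe_train_path, config_path)

This writes the per-bin WoE/IV detail report to features_detail.csv and a WoE-encoded copy of the dataset to dataset_train_woed.csv.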
102 | 103 | Version Records 104 | ================ 105 | woe 0.1.4 2018-03-01 106 | * support py3 107 | 108 | woe 0.1.3 2018-02-09 109 | 110 | * woe.feature_process.proc_woe_discrete(): fix bug when dealing with discrete variables 111 | * woe.eval.eval_feature_detail(): fix bug: utf-8 output file format 112 | * woe.GridSearch.grid_search_lr_c_main(): add function wrapper for convenience and efficiency 113 | * woe.GridSearch.grid_search_lr_c_validation(): monitor the ks performance of training sets and test sets on different values of 'c' 114 | * supplement example test scripts 115 | 116 | 117 | woe 0.1.2 2017-12-05 118 | 119 | * woe.ftrl.FTRL(): add online learning module 120 | 121 | woe 0.1.1 2017-11-28 122 | 123 | * woe.config.load_file(): change param data_path to be optional 124 | * woe.eval.eval_feature_stability(): fix bug: psi_dict['stability_index'] computation error 125 | * woe.feature_process.change_feature_dtype(): add friendly tips when encountering an error 126 | * woe.feature_process.calulate_iv(): refactor the code 127 | * woe.feature_process.calculate_iv_split(): refactor the code 128 | * woe.feature_process.binning_data_split(): reduce the number of len() function calls with __len__() and shape attributes; replace namedtuple with dict 129 | * woe.feature_process.fillna(): newly added function to fill null values 130 | * woe.GridSearch.grid_search_lr_c(): the list of regularization parameter c values is now specified by the user instead of being hard-coded inside the function 131 | 132 | woe 0.0.9 2017-11-21 133 | 134 | * Add module: GridSearch, for searching the optimal hyperparameter C in LogisticRegression 135 | * Code refactoring: functions compute_ks and plot_ks 136 | 137 | woe 0.0.8 2017-09-28 138 | 139 | * More flexible: remove the conditional restriction in feature_process.change_feature_dtype() 140 | * Fix bug: wrong use of deepcopy in feature_process.woe_trans() 141 | 142 | woe 0.0.7 2017-09-19 143 | 144 | * Fix bug: eval.eval_feature_detail raises ValueError('arrays must all be same length') 145 | * Add parameter interface: alpha, the specified step learning rate, default 0.01 146 | 147 | How to Contribute 148 | -------------------------------- 149 | 150 | Email me: 1002937942@qq.com. 
151 | 152 | Keywords: math,finance,scorecard,woe,iv 153 | Platform: all 154 | Classifier: Operating System :: OS Independent 155 | Classifier: Intended Audience :: Developers 156 | Classifier: License :: OSI Approved :: MIT License 157 | Classifier: Programming Language :: Python 158 | Classifier: Programming Language :: Python :: Implementation 159 | Classifier: Programming Language :: Python :: 2 160 | Classifier: Programming Language :: Python :: 2.7 161 | Classifier: Programming Language :: Python :: 3 162 | Classifier: Programming Language :: Python :: 3.5 163 | Classifier: Topic :: Software Development :: Libraries 164 | -------------------------------------------------------------------------------- /woe.egg-info/SOURCES.txt: -------------------------------------------------------------------------------- 1 | LICENSE.txt 2 | MANIFEST.in 3 | README.rst 4 | setup.py 5 | examples/HereWeGo.py 6 | examples/README.rst 7 | examples/UCI_Credit_Card.csv 8 | examples/config.csv 9 | woe/GridSearch.py 10 | woe/__init__.py 11 | woe/config.py 12 | woe/eval.py 13 | woe/feature_process.py 14 | woe/ftrl.py 15 | woe.egg-info/PKG-INFO 16 | woe.egg-info/SOURCES.txt 17 | woe.egg-info/dependency_links.txt 18 | woe.egg-info/requires.txt 19 | woe.egg-info/top_level.txt -------------------------------------------------------------------------------- /woe.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /woe.egg-info/requires.txt: -------------------------------------------------------------------------------- 1 | pandas>=0.19.2 2 | numpy>=1.11.3 3 | scipy>=0.18.1 4 | matplotlib>=2.0.0 5 | -------------------------------------------------------------------------------- /woe.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | woe 2 | -------------------------------------------------------------------------------- /woe/GridSearch.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | __author__ = 'boredbird' 3 | import pandas as pd 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | from sklearn.linear_model import LogisticRegression 7 | from datetime import datetime 8 | from sklearn.svm import l1_min_c 9 | from woe.eval import compute_ks 10 | import pickle 11 | import time 12 | 13 | """ 14 | Search for optimal hyper parametric C in LogisticRegression 15 | """ 16 | def grid_search_lr_c(X_train,y_train,cs,df_coef_path=False 17 | ,pic_coefpath_title='Logistic Regression Path',pic_coefpath=False 18 | ,pic_performance_title='Logistic Regression Performance',pic_performance=False): 19 | """ 20 | grid search optimal hyper parameters c with the best ks performance 21 | :param X_train: features dataframe 22 | :param y_train: target 23 | :param cs: list of regularization parameter c 24 | :param df_coef_path: the file path for logistic regression coefficient dataframe 25 | :param pic_coefpath_title: the pic title for coefficient path picture 26 | :param pic_coefpath: the file path for coefficient path picture 27 | :param pic_performance_title: the pic title for ks performance picture 28 | :param pic_performance: the file path for ks performance picture 29 | :return: a tuple of c and ks value with the best ks performance 30 | """ 31 | # init a LogisticRegression model 32 | clf_l1_LR = LogisticRegression(C=0.1, penalty='l1', 
tol=0.01,class_weight='balanced') 33 | # cs = l1_min_c(X_train, y_train, loss='log') * np.logspace(0, 9,200) 34 | 35 | print("Computing regularization path ...") 36 | start = datetime.now() 37 | print(start) 38 | coefs_ = [] 39 | ks = [] 40 | for c in cs: 41 | clf_l1_LR.set_params(C=c) 42 | clf_l1_LR.fit(X_train, y_train) 43 | coefs_.append(clf_l1_LR.coef_.ravel().copy()) 44 | 45 | proba = clf_l1_LR.predict_proba(X_train)[:,1] 46 | ks.append(compute_ks(proba,y_train)) 47 | 48 | end = datetime.now() 49 | print(end) 50 | print("This took ", end - start) 51 | coef_cv_df = pd.DataFrame(coefs_,columns=X_train.columns) 52 | coef_cv_df['ks'] = ks 53 | coef_cv_df['c'] = cs 54 | 55 | if df_coef_path: 56 | file_name = df_coef_path if isinstance(df_coef_path, str) else None 57 | coef_cv_df.to_csv(file_name) 58 | 59 | coefs_ = np.array(coefs_) 60 | 61 | fig1 = plt.figure('fig1') 62 | plt.plot(np.log10(cs), coefs_) 63 | ymin, ymax = plt.ylim() 64 | plt.xlabel('log(C)') 65 | plt.ylabel('Coefficients') 66 | plt.title(pic_coefpath_title) 67 | plt.axis('tight') 68 | if pic_coefpath: 69 | file_name = pic_coefpath if isinstance(pic_coefpath, str) else None 70 | plt.savefig(file_name) 71 | else: 72 | plt.show() 73 | 74 | fig2 = plt.figure('fig2') 75 | plt.plot(np.log10(cs), ks) 76 | plt.xlabel('log(C)') 77 | plt.ylabel('ks score') 78 | plt.title(pic_performance_title) 79 | plt.axis('tight') 80 | if pic_performance: 81 | file_name = pic_performance if isinstance(pic_performance, str) else None 82 | plt.savefig(file_name) 83 | else: 84 | plt.show() 85 | 86 | flag = coefs_<0 87 | idx = np.array(ks)[flag.sum(axis=1) == 0].argmax() 88 | 89 | return (cs[idx],ks[idx]) 90 | 91 | 92 | def grid_search_lr_c_validation(X_train,y_train,validation_dataset_list,cs=[0.01],df_coef_path=False 93 | ,pic_coefpath_title='Logistic Regression Path',pic_coefpath=False 94 | ,pic_performance_title='Logistic Regression Performance',pic_performance=False): 95 | """ 96 | grid search optimal hyper parameters c with the best ks performance 97 | :param X_train: features dataframe 98 | :param y_train: target 99 | :param cs: list of c value 100 | :param df_coef_path: the file path for logistic regression coefficient dataframe 101 | :param pic_coefpath_title: the pic title for coefficient path picture 102 | :param pic_coefpath: the file path for coefficient path picture 103 | :param pic_performance_title: the pic title for ks performance picture 104 | :param pic_performance: the file path for ks performance picture 105 | :return: a tuple of c and ks value with the best ks performance 106 | """ 107 | # init a LogisticRegression model 108 | clf_l1_LR = LogisticRegression(C=0.1, penalty='l1', tol=0.01,class_weight='balanced') 109 | 110 | print("Computing regularization path ...") 111 | start = datetime.now() 112 | print(start) 113 | coefs_ = [] 114 | ks = [] 115 | ks_validation1 = [] 116 | ks_validation2 = [] 117 | counter = 0 118 | for c in cs: 119 | print('time: ',time.asctime(time.localtime(time.time())),'counter: ',counter, ' c: ',c) 120 | clf_l1_LR.set_params(C=c) 121 | clf_l1_LR.fit(X_train, y_train) 122 | coefs_.append(clf_l1_LR.coef_.ravel().copy()) 123 | 124 | proba = clf_l1_LR.predict_proba(X_train)[:,1] 125 | validation_proba1 = clf_l1_LR.predict_proba(validation_dataset_list[0][X_train.columns])[:,1] 126 | 127 | ks.append(compute_ks(proba,y_train)) 128 | ks_validation1.append(compute_ks(validation_proba1,validation_dataset_list[0]['target'])) 129 | 130 | print('ks:\t',ks[-1],'ks_validation1:\t',ks_validation1[-1]) 131 | counter += 1 
132 | 133 | end = datetime.now() 134 | print(end) 135 | print("This took ", end - start) 136 | coef_cv_df = pd.DataFrame(coefs_,columns=X_train.columns) 137 | coef_cv_df['ks'] = ks 138 | coef_cv_df['ks_validation1'] = ks_validation1 139 | coef_cv_df['c'] = cs 140 | 141 | 142 | if df_coef_path: 143 | file_name = df_coef_path if isinstance(df_coef_path, str) else None 144 | coef_cv_df.to_csv(file_name) 145 | 146 | coefs_ = np.array(coefs_) 147 | 148 | fig1 = plt.figure('fig1') 149 | plt.plot(np.log10(cs), coefs_) 150 | ymin, ymax = plt.ylim() 151 | plt.xlabel('log(C)') 152 | plt.ylabel('Coefficients') 153 | plt.title(pic_coefpath_title) 154 | plt.axis('tight') 155 | if pic_coefpath: 156 | file_name = pic_coefpath if isinstance(pic_coefpath, str) else None 157 | plt.savefig(file_name) 158 | plt.close() 159 | else: 160 | pass 161 | # plt.show() 162 | # plt.close() 163 | 164 | fig2 = plt.figure('fig2') 165 | plt.plot(np.log10(cs), ks) 166 | plt.xlabel('log(C)') 167 | plt.ylabel('ks score') 168 | plt.title(pic_performance_title) 169 | plt.axis('tight') 170 | if pic_performance: 171 | file_name = pic_performance if isinstance(pic_performance, str) else None 172 | plt.savefig(file_name) 173 | plt.close() 174 | else: 175 | pass 176 | # plt.show() 177 | # plt.close() 178 | 179 | flag = coefs_<0 180 | if np.array(ks)[flag.sum(axis=1) == 0].__len__()>0: 181 | idx = np.array(ks)[flag.sum(axis=1) == 0].argmax() 182 | else: 183 | idx = np.array(ks).argmax() 184 | 185 | return (cs[idx],ks[idx]) 186 | 187 | 188 | def grid_search_lr_c_main(params): 189 | print('run into grid_search_lr_c_main:') 190 | dataset_path = params['dataset_path'] 191 | validation_path = params['validation_path'] 192 | config_path = params['config_path'] 193 | df_coef_path = params['df_coef_path'] 194 | pic_coefpath = params['pic_coefpath'] 195 | pic_performance = params['pic_performance'] 196 | pic_coefpath_title = params['pic_coefpath_title'] 197 | pic_performance_title = params['pic_performance_title'] 198 | 199 | dataset_train = pd.read_csv(dataset_path) 200 | cfg = pd.read_csv(config_path) 201 | candidate_var_list = cfg[cfg['is_modelfeature'] == 1]['var_name'] 202 | 203 | b = [var for var in dataset_train.columns if sum(dataset_train[var].isnull()) == 0] 204 | candidate_var_list = list(set(candidate_var_list).intersection(set(b))) 205 | 206 | var_list_specfied = params['var_list_specfied'] 207 | if var_list_specfied.__len__()>0: 208 | candidate_var_list = list(set(candidate_var_list).intersection(set(var_list_specfied))) 209 | 210 | print('candidate_var_list length:\n',candidate_var_list.__len__()) 211 | print('candidate_var_list:\n',candidate_var_list) 212 | 213 | print('change dtypes:float64 to float32') 214 | for var in candidate_var_list: 215 | dataset_train[var] = dataset_train[var].astype(np.float32) 216 | 217 | X_train = dataset_train[dataset_train.target >=0][candidate_var_list] 218 | y_train = dataset_train[dataset_train.target >=0]['target'] 219 | 220 | validation_cols_keep = [var for var in candidate_var_list] 221 | validation_cols_keep.append('target') 222 | validation_dataset_list = [] 223 | 224 | validation_dataset = pd.read_csv(validation_path) 225 | # fillna 226 | for var in candidate_var_list: 227 | validation_dataset.loc[validation_dataset[var].isnull(), (var)] = 0 228 | validation_dataset_list.append(validation_dataset[validation_cols_keep]) 229 | 230 | cs = params['cs'] 231 | print('cs',cs) 232 | c,ks = 
grid_search_lr_c_validation(X_train,y_train,validation_dataset_list,cs,df_coef_path,pic_coefpath_title,pic_coefpath 233 | ,pic_performance_title,pic_performance) 234 | print('pic_coefpath:\n',pic_coefpath) 235 | print('pic_performance:\n',pic_performance) 236 | print('ks performance on the c:') 237 | print(c,ks) 238 | 239 | return (c,ks) 240 | 241 | 242 | def fit_single_lr(dataset_path,config_path,var_list_specfied,out_model_path,c=0.01): 243 | dataset_train = pd.read_csv(dataset_path) 244 | cfg = pd.read_csv(config_path) 245 | candidate_var_list = cfg[cfg['is_modelfeature'] == 1]['var_name'] 246 | 247 | b = [var for var in dataset_train.columns if sum(dataset_train[var].isnull()) == 0] 248 | candidate_var_list = list(set(candidate_var_list).intersection(set(b))) 249 | 250 | if var_list_specfied.__len__()>0: 251 | candidate_var_list = list(set(candidate_var_list).intersection(set(var_list_specfied))) 252 | 253 | print('candidate_var_list length:\n',candidate_var_list.__len__()) 254 | print('candidate_var_list:\n',candidate_var_list) 255 | 256 | print('change dtypes:float64 to float32') 257 | for var in candidate_var_list: 258 | dataset_train[var] = dataset_train[var].astype(np.float32) 259 | 260 | X_train = dataset_train[dataset_train.target >=0][candidate_var_list] 261 | y_train = dataset_train[dataset_train.target >=0]['target'] 262 | 263 | print('c:',c) 264 | clf_lr_a = LogisticRegression(C=c, penalty='l1', tol=0.01,class_weight='balanced') 265 | 266 | clf_lr_a.fit(X_train, y_train) 267 | coefs = clf_lr_a.coef_.ravel().copy() 268 | 269 | proba = clf_lr_a.predict_proba(X_train)[:,1] 270 | ks = compute_ks(proba,y_train) 271 | 272 | model = {} 273 | model['clf'] = clf_lr_a 274 | model['features_list'] = candidate_var_list 275 | model['coefs'] = coefs 276 | model['ks'] = ks 277 | 278 | output = open(out_model_path, 'wb') 279 | pickle.dump(model,output) 280 | output.close() 281 | 282 | return model 283 | -------------------------------------------------------------------------------- /woe/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/boredbird/woe/335e9ec2a521d3bbccb0ad5d915128119e4d0ca6/woe/__init__.py -------------------------------------------------------------------------------- /woe/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | __author__ = 'boredbird' 3 | import pandas as pd 4 | 5 | class config: 6 | 7 | def __init__(self): 8 | self.config = None 9 | self.dataset_train = None 10 | self.variable_type = None 11 | self.bin_var_list = None 12 | self.discrete_var_list = None 13 | self.candidate_var_list = None 14 | self.dataset_len = None 15 | self.min_sample = None 16 | self.global_bt = None 17 | self.global_gt = None 18 | 19 | def load_file(self,config_path,data_path=False): 20 | self.config = pd.read_csv(config_path) 21 | # specify variable dtypes 22 | self.variable_type = self.config[['var_name', 'var_dtype']] 23 | self.variable_type = self.variable_type.rename(columns={'var_name': 'v_name', 'var_dtype': 'v_type'}) 24 | self.variable_type = self.variable_type.set_index(['v_name']) 25 | 26 | # specify the list of continuous variable to be splitted into bin 27 | self.bin_var_list = self.config[self.config['is_tobe_bin'] == 1]['var_name'] 28 | # specify the list of discrete variable to be merged into supper classes 29 | self.discrete_var_list = self.config[(self.config['is_candidate'] == 1) & (self.config['var_dtype'] == 
'object')]['var_name'] 30 | 31 | # specify the list of model input variable 32 | self.candidate_var_list = self.config[self.config['is_candidate'] == 1]['var_name'] 33 | 34 | if data_path: 35 | data_path = data_path if isinstance(data_path, str) else None 36 | 37 | # load dataset train 38 | self.dataset_train = pd.read_csv(data_path) 39 | self.dataset_train.columns = [col.split('.')[-1] for col in self.dataset_train.columns] 40 | 41 | # specify some other global variables about the training dataset 42 | self.dataset_len = len(self.dataset_train) 43 | self.min_sample = int(self.dataset_len * 0.05) 44 | self.global_bt = sum(self.dataset_train['target']) 45 | self.global_gt = len(self.dataset_train) - sum(self.dataset_train['target']) 46 | 47 | def change_config_var_dtype(self,var_name,type,inplace_file=True): 48 | if type in ['object','string','int64','uint8','float64','bool1','bool2','dates','category']: 49 | self.variable_type.loc[var_name,'v_type'] = type 50 | else: 51 | raise KeyError("Invalid dtype specified! ") -------------------------------------------------------------------------------- /woe/eval.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | __author__ = 'boredbird' 3 | import pandas as pd 4 | import numpy as np 5 | import scipy 6 | import matplotlib.pyplot as plt 7 | from scipy.stats import ks_2samp 8 | import woe.config as config 9 | import pickle 10 | 11 | def compute_ks(proba,target): 12 | ''' 13 | target: numpy array of shape (1,) 14 | proba: numpy array of shape (1,), predicted probability of the sample being positive 15 | returns: 16 | ks: float, ks score estimation 17 | ''' 18 | get_ks = lambda proba, target: ks_2samp(proba[target == 1], proba[target != 1]).statistic 19 | 20 | return get_ks(proba, target) 21 | 22 | 23 | def eval_feature_detail(Info_Value_list,out_path=False): 24 | """ 25 | format InfoValue list to Dataframe 26 | :param Info_Value_list: Instance list of Class InfoValue 27 | :param out_path:specify the Dataframe to csv file path ,default False 28 | :return:DataFrame about feature detail 29 | """ 30 | rst = Info_Value_list 31 | format_rst = [] 32 | 33 | for kk in range(0,len(rst)): 34 | print(rst[kk].var_name) 35 | split_list = [] 36 | if rst[kk].split_list != []: 37 | if not rst[kk].is_discrete: 38 | #deal with split_list 39 | split_list.append('(-INF,'+str(rst[kk].split_list[0])+']') 40 | for i in range(0,len(rst[kk].split_list)-1): 41 | split_list.append('(' + str(rst[kk].split_list[i])+','+ str(rst[kk].split_list[i+1]) + ']') 42 | 43 | split_list.append('(' + str(rst[kk].split_list[len(rst[kk].split_list)-1]) + ',+INF)') 44 | else: 45 | split_list = rst[kk].split_list 46 | else: 47 | split_list.append('(-INF,+INF)') 48 | 49 | # merge into dataframe 50 | columns = ['var_name','split_list','sub_total_sample_num','positive_sample_num' 51 | ,'negative_sample_num','sub_total_num_percentage','positive_rate_in_sub_total' 52 | ,'woe_list','iv_list','iv'] 53 | rowcnt = len(rst[kk].iv_list) 54 | if rowcnt < len(split_list): 55 | split_list = split_list[:rowcnt] 56 | 57 | var_name = [rst[kk].var_name] * rowcnt 58 | iv = [rst[kk].iv] * rowcnt 59 | iv_list = rst[kk].iv_list 60 | woe_list = rst[kk].woe_list 61 | a = pd.DataFrame({'var_name':var_name,'iv_list':iv_list,'woe_list':woe_list 62 | ,'split_list':split_list,'iv':iv,'sub_total_sample_num':rst[kk].sub_total_sample_num 63 | ,'positive_sample_num':rst[kk].positive_sample_num,'negative_sample_num':rst[kk].negative_sample_num 64 | 
,'sub_total_num_percentage':rst[kk].sub_total_num_percentage 65 | ,'positive_rate_in_sub_total':rst[kk].positive_rate_in_sub_total 66 | ,'negative_rate_in_sub_total':rst[kk].negative_rate_in_sub_total},columns=columns) 67 | format_rst.append(a) 68 | 69 | # merge dataframe list into one dataframe vertically 70 | cformat_rst = pd.concat(format_rst) 71 | 72 | if out_path: 73 | file_name = out_path if isinstance(out_path, str) else None 74 | cformat_rst.to_csv(file_name, index=False,encoding='utf-8') 75 | 76 | return cformat_rst 77 | 78 | 79 | def eval_data_summary(df_list,source_name,out_path=False): 80 | ''' 81 | :param df_list: A dataset DataFrame 82 | :param source_name: string type 83 | :param out_path: specify the Dataframe to csv file path ,default False 84 | :return: DataFrame about dataset summary info 85 | ''' 86 | train_validation_data_summary = [] 87 | for i in range(len(source_name)): 88 | a = dict() 89 | a['source'] = source_name[i] 90 | a['total_sample_cnt'] = len(df_list[i]) 91 | a['positive_sample_cnt'] = df_list[i]['target'].sum() 92 | a['negative_sample_cnt'] = a['total_sample_cnt'] - a['positive_sample_cnt'] 93 | a['positive_rate'] = a['positive_sample_cnt']*1.0/a['total_sample_cnt'] 94 | train_validation_data_summary.append(a) 95 | 96 | train_validation_data_summary = pd.DataFrame(train_validation_data_summary) 97 | 98 | if out_path: 99 | file_name = out_path if isinstance(out_path, str) else None 100 | train_validation_data_summary.to_csv(file_name, index=False) 101 | 102 | return train_validation_data_summary 103 | 104 | 105 | def eval_model_summary(list_dict,out_path=False): 106 | ''' 107 | :param list_dict: a list of dict 108 | :param out_path: specify the Dataframe to csv file path ,default False 109 | :return: DataFrame about model summary info 110 | ''' 111 | model_summary = pd.DataFrame([list_dict[0]]) 112 | if len(list_dict)>1: 113 | for i in range(len(list_dict)-1): 114 | b = pd.DataFrame([list_dict[i+1]]) 115 | model_summary = pd.merge(model_summary, b, how='outer') 116 | 117 | if out_path: 118 | file_name = out_path if isinstance(out_path, str) else None 119 | model_summary.to_csv(file_name, index=False) 120 | 121 | return model_summary 122 | 123 | 124 | def wald_test(model,X): 125 | ''' 126 | :param model: a model file that should have predict_proba() function 127 | :param X: dataset features DataFrame 128 | :return: the value of wald_stats,p_value 129 | ''' 130 | pred_probs = np.matrix(model.predict_proba(X)) 131 | X_design = np.hstack((np.ones(shape=(X.shape[0], 1)), X)) 132 | diag_array = np.multiply(pred_probs[:, 0], pred_probs[:, 1]).A1 133 | V = scipy.sparse.diags(diag_array) 134 | m1 = X_design.T * V 135 | m2 = m1.dot(X_design) 136 | cov_mat = np.linalg.inv(m2) 137 | 138 | model_params = np.hstack((model.intercept_[0], model.coef_[0])) 139 | wald_stats = (model_params / np.sqrt(np.diag(cov_mat))) ** 2 140 | 141 | wald = scipy.stats.wald() 142 | p_value = wald.pdf(wald_stats) 143 | 144 | return wald_stats,p_value 145 | 146 | 147 | def eval_feature_summary(train_X,model,civ_list,candidate_var_list,out_path=False): 148 | ''' 149 | :param train_X: training dataset features DataFrame 150 | :param model: model file 151 | :param civ_list: list of InfoValue Class instances 152 | :param candidate_var_list: the list of model input variable 153 | :param out_path: specify the Dataframe to csv file path ,default False 154 | :return: DataFrame about feature summary 155 | ''' 156 | feature_summary = {} 157 | feature_summary['feature_name'] = list(['Intercept']) 158 | 
feature_summary['feature_name'].extend(list(candidate_var_list)) 159 | feature_summary['coef'] = [model['classifier'].intercept_] 160 | feature_summary['coef'].extend(model['classifier'].coef_[0]) 161 | var_name = [civ.var_name for civ in civ_list] 162 | feature_summary['iv'] = [0] 163 | feature_summary['iv'].extend([civ_list[var_name.index(var)].iv for var in candidate_var_list]) 164 | feature_summary['wald_stats'], feature_summary['p_value'] = wald_test(model['classifier'], train_X) 165 | 166 | feature_summary = pd.DataFrame(feature_summary) 167 | if out_path: 168 | file_name = out_path if isinstance(out_path, str) else None 169 | feature_summary.to_csv(file_name, index=False) 170 | 171 | return feature_summary 172 | 173 | 174 | def eval_segment_metrics(target, predict_proba, segment_cnt = 20,out_path=False): 175 | ''' 176 | :param target: the list of actual target value 177 | :param predict_proba: the list of predicted probability 178 | :param segment_cnt: the segment number 179 | :param out_path: specify the Dataframe to csv file path ,default False 180 | :return: DataFrame about segment metrics 181 | ''' 182 | proba_descend_idx = np.argsort(predict_proba) 183 | proba_descend_idx = proba_descend_idx[::-1] 184 | 185 | grp_idx = 1 186 | start_idx = 0 187 | total_sample_cnt = len(predict_proba) 188 | total_positive_sample_cnt = target.sum() 189 | total_negative_sample_cnt = total_sample_cnt - total_positive_sample_cnt 190 | 191 | segment_sample_cnt = int(len(predict_proba) / segment_cnt) 192 | cumulative_sample_percentage = 0.0 193 | cumulative_positive_percentage = 0.0 194 | cumulative_negative_percentage = 0.0 195 | 196 | segment_list = [] 197 | columns = ['grp_idx', 'segment_sample_cnt', 'segment_sample_percentage', 'cumulative_sample_percentage', 198 | 'in_segment_positive_percentage', 'positive_percentage_in_total', 'cumulative_positive_percentage', 199 | 'cumulative_negative_percentage', 'ks'] 200 | 201 | while start_idx < total_sample_cnt: 202 | s = {} 203 | s['grp_idx'] = grp_idx 204 | segment_idx_list = proba_descend_idx[start_idx : start_idx + segment_sample_cnt] 205 | segment_target = target[segment_idx_list] 206 | 207 | segment_sample_cnt = len(segment_idx_list) 208 | s['segment_sample_cnt'] = segment_sample_cnt 209 | 210 | segment_pos_cnt = segment_target.sum() 211 | segment_neg_cnt = segment_sample_cnt - segment_pos_cnt 212 | 213 | segment_sample_percentage = segment_sample_cnt*1.0/total_sample_cnt 214 | s['segment_sample_percentage'] = segment_sample_percentage 215 | 216 | pos_percentage_in_total = float(segment_pos_cnt * 100) / total_positive_sample_cnt 217 | neg_percentage_in_total = float(segment_neg_cnt * 100) / total_negative_sample_cnt 218 | s['positive_percentage_in_total'] = pos_percentage_in_total 219 | 220 | in_segment_positive_percentage = float(segment_pos_cnt) / segment_sample_cnt 221 | s['in_segment_positive_percentage'] = in_segment_positive_percentage 222 | 223 | cumulative_sample_percentage += segment_sample_percentage 224 | s['cumulative_sample_percentage'] = cumulative_sample_percentage 225 | 226 | cumulative_positive_percentage += pos_percentage_in_total 227 | cumulative_negative_percentage += neg_percentage_in_total 228 | s['cumulative_positive_percentage'] = cumulative_positive_percentage 229 | s['cumulative_negative_percentage'] = cumulative_negative_percentage 230 | 231 | ks = cumulative_positive_percentage - cumulative_negative_percentage 232 | s['ks'] = ks 233 | 234 | segment_list.append(s) 235 | grp_idx += 1 236 | start_idx += segment_sample_cnt 
237 | 238 | segment_list = pd.DataFrame(segment_list,columns=columns) 239 | if out_path: 240 | file_name = out_path if isinstance(out_path, str) else None 241 | segment_list.to_csv(file_name, index=False) 242 | 243 | return segment_list 244 | 245 | 246 | def eval_model_stability(proba_train, proba_validation, segment_cnt = 10,out_path=False): 247 | ''' 248 | :param proba_train: the list of predicted probability on training dataset 249 | :param proba_validation: the list of predicted probability on validation dataset 250 | :param segment_cnt: the segment number 251 | :param out_path: specify the Dataframe to csv file path ,default False 252 | :return: DataFrame about model stability 253 | ''' 254 | step = 1.0/segment_cnt 255 | flag = 0.0 256 | model_stability = [] 257 | len_train = len(proba_train) 258 | len_validation = len(proba_validation) 259 | 260 | columns = ['score_range','segment_train_percentage','segment_validation_percentage','difference', 261 | 'variance','ln_variance','stability_index'] 262 | 263 | while flag < 1.0: 264 | temp = {} 265 | 266 | score_range = '['+str(flag)+','+str(flag + step)+')' 267 | segment_train_cnt = proba_train[(proba_train >= flag) & (proba_train < flag + step)].count() 268 | segment_train_percentage = segment_train_cnt*1.0/len_train 269 | segment_validation_cnt = proba_validation[(proba_validation >= flag) & (proba_validation < flag + step)].count() 270 | segment_validation_percentage = segment_validation_cnt * 1.0 / len_validation 271 | difference = segment_validation_percentage - segment_train_percentage 272 | variance = float(segment_validation_percentage)/segment_train_percentage 273 | ln_variance = variance 274 | stability_index = difference * ln_variance 275 | 276 | temp['score_range'] = score_range 277 | temp['segment_train_percentage'] = segment_train_percentage 278 | temp['segment_validation_percentage'] = segment_validation_percentage 279 | temp['difference'] = difference 280 | temp['variance'] = variance 281 | temp['ln_variance'] = ln_variance 282 | temp['stability_index'] = stability_index 283 | 284 | model_stability.append(temp) 285 | flag += step 286 | 287 | model_stability = pd.DataFrame(model_stability,columns=columns) 288 | if out_path: 289 | file_name = out_path if isinstance(out_path, str) else None 290 | model_stability.to_csv(file_name, index=False) 291 | 292 | return model_stability 293 | 294 | def eval_feature_stability(civ_list, df_train, df_validation,candidate_var_list,out_path=False): 295 | ''' 296 | :param civ_list: List of InfoValue Class instances 297 | :param df_train: DataFrame of training dataset 298 | :param df_validation: DataFrame of validation dataset 299 | :param candidate_var_list: the list of model input variable 300 | :param out_path: specify the Dataframe to csv file path ,default False 301 | :return: DataFrame about features stability 302 | ''' 303 | psi_dict = {} 304 | 305 | civ_var_list = [civ_list[i].var_name for i in range(len(civ_list))] 306 | intersection = list(set(civ_var_list).intersection(set(candidate_var_list))) 307 | civ_idx_list = [civ_var_list.index(var) for var in intersection] 308 | 309 | len_train = len(df_train) 310 | len_validation = len(df_validation) 311 | 312 | psi_dict['feature_name'] = [] 313 | psi_dict['group'] = [] 314 | psi_dict['segment_train_cnt'] = [] 315 | psi_dict['segment_train_percentage'] = [] 316 | psi_dict['segment_validation_cnt'] = [] 317 | psi_dict['segment_validation_percentage'] = [] 318 | 319 | for i in civ_idx_list: 320 | if civ_list[i].is_discrete: 321 | for j in 
range(len(civ_list[i].split_list)): 322 | psi_dict['feature_name'].append(civ_list[i].var_name) 323 | psi_dict['group'].append(civ_list[i].split_list[j]) 324 | 325 | civ_split_list = civ_list[i].split_list[j] 326 | segment_train_cnt = 0 327 | for m in civ_split_list: 328 | segment_train_cnt += df_train[civ_list[i].var_name][df_train[civ_list[i].var_name] == m].count() 329 | 330 | psi_dict['segment_train_cnt'].append(segment_train_cnt) 331 | psi_dict['segment_train_percentage'].append(float(segment_train_cnt)/len_train) 332 | 333 | segment_validation_cnt = 0 334 | for m in civ_split_list: 335 | segment_validation_cnt += df_validation[civ_list[i].var_name][df_validation[civ_list[i].var_name] == m].count() 336 | 337 | psi_dict['segment_validation_cnt'].append(segment_validation_cnt) 338 | psi_dict['segment_validation_percentage'].append(float(segment_validation_cnt)/len_validation) 339 | 340 | else: 341 | split_list = [] 342 | split_list.append(float("-inf")) 343 | split_list.extend([temp for temp in civ_list[i].split_list]) 344 | split_list.append(float("inf")) 345 | var_name = civ_list[i].var_name 346 | 347 | for j in range(len(split_list)-2):  # one half-open bin per pair of adjacent split points; the open-ended top bin is handled below 348 | psi_dict['feature_name'].append(civ_list[i].var_name) 349 | psi_dict['group'].append('('+str(split_list[j])+','+str(split_list[j+1])+']') 350 | 351 | segment_train_cnt = df_train[var_name][(df_train[var_name] > split_list[j])&(df_train[var_name] <= split_list[j+1])].count() 352 | 353 | psi_dict['segment_train_cnt'].append(segment_train_cnt) 354 | psi_dict['segment_train_percentage'].append(float(segment_train_cnt)/len_train) 355 | 356 | segment_validation_cnt = df_validation[var_name][(df_validation[var_name] > split_list[j])& 357 | (df_validation[var_name] <= split_list[j+1])].count() 358 | 359 | psi_dict['segment_validation_cnt'].append(segment_validation_cnt) 360 | psi_dict['segment_validation_percentage'].append(float(segment_validation_cnt)/len_validation) 361 | 362 | psi_dict['feature_name'].append(var_name) 363 | psi_dict['group'].append('(' + str(split_list[len(split_list)-2]) + ',+INF)') 364 | 365 | segment_train_cnt = df_train[var_name][df_train[var_name] > split_list[len(split_list)-2]].count()  # samples above the last finite split point 366 | psi_dict['segment_train_cnt'].append(segment_train_cnt) 367 | psi_dict['segment_train_percentage'].append(float(segment_train_cnt) / len_train) 368 | 369 | segment_validation_cnt = df_validation[var_name][df_validation[var_name] > split_list[len(split_list)-2]].count() 370 | psi_dict['segment_validation_cnt'].append(segment_validation_cnt) 371 | psi_dict['segment_validation_percentage'].append(float(segment_validation_cnt) / len_validation) 372 | 373 | psi_dict['difference'] = pd.Series(psi_dict['segment_validation_percentage']) - pd.Series(psi_dict['segment_train_percentage']) 374 | psi_dict['variance'] = list(map(lambda x_y: x_y[0] / (x_y[1]+0.000000001), zip(psi_dict['segment_validation_percentage'], psi_dict['segment_train_percentage']))) 375 | psi_dict['Ln(variance)'] = np.log(np.array(psi_dict['variance'])+0.000000001) 376 | psi_dict['stability_index'] = np.array(psi_dict['difference']) * np.array(psi_dict['Ln(variance)'])  # per-group PSI term; these are commonly summed per feature, with a total above 0.25 usually read as a significant shift 377 | 378 | columns = ['feature_name','group','segment_train_cnt','segment_train_percentage', 379 | 'segment_validation_cnt','segment_validation_percentage','difference', 380 | 'variance','Ln(variance)','stability_index'] 381 | 382 | psi_df = pd.DataFrame(psi_dict, columns=columns) 383 | if out_path: 384 | file_name = out_path if isinstance(out_path, str) else None 385 | psi_df.to_csv(file_name, index=False) 386 |
387 | return psi_df 388 | 389 | 390 | def plot_ks(proba,target,axistype='pct',out_path=False): 391 | """ 392 | plot k-s figure 393 | :param proba: 1-d array,prediction probability values 394 | :param target: 1-d array,the list of actual target value 395 | :param axistype: specify x axis :'axistype' must be either 'pct' (sample percent) or 'proba' (prediction probability) 396 | :param out_path: specify the file path to store ks plot figure,default False 397 | :return: DataFrame, figure summary 398 | """ 399 | assert axistype in ['pct','proba'] , "KS Plot TypeError: Attribute 'axistype' must be either 'pct' or 'proba' !" 400 | 401 | a = pd.DataFrame(np.array([proba,target]).T,columns=['proba','target']) 402 | a.sort_values(by='proba',ascending=False,inplace=True) 403 | a['sum_Times']=a['target'].cumsum() 404 | total_1 = a['target'].sum() 405 | total_0 = len(a) - a['target'].sum() 406 | 407 | a['temp'] = 1 408 | a['Times']=a['temp'].cumsum() 409 | a['cdf1'] = a['sum_Times']/total_1 410 | a['cdf0'] = (a['Times'] - a['sum_Times'])/total_0 411 | a['ks'] = a['cdf1'] - a['cdf0'] 412 | a['percent'] = a['Times']*1.0/len(a) 413 | 414 | idx = np.argmax(a['ks']) 415 | # print(a.loc[idx]) 416 | 417 | if axistype == 'pct': 418 | ''' 419 | KS曲线,横轴为按照输出的概率值排序后的观察样本比例 420 | ''' 421 | plt.figure() 422 | plt.plot(a['percent'],a['cdf1'], label="CDF_positive") 423 | plt.plot(a['percent'],a['cdf0'],label="CDF_negative") 424 | plt.plot(a['percent'],a['ks'],label="K-S") 425 | 426 | sx = np.linspace(0,1,10) 427 | sy = sx 428 | plt.plot(sx,sy,linestyle='--',color='darkgrey',linewidth=1.2) 429 | 430 | plt.legend() 431 | plt.grid(True) 432 | ymin, ymax = plt.ylim() 433 | plt.xlabel('Sample percent') 434 | plt.ylabel('Cumulative probability') 435 | plt.title('Model Evaluation Index K-S') 436 | plt.axis('tight') 437 | 438 | # 虚线 439 | t = a.loc[idx]['percent'] 440 | yb = round(a.loc[idx]['cdf1'],4) 441 | yg = round(a.loc[idx]['cdf0'],4) 442 | 443 | plt.plot([t,t],[yb,yg], color ='red', linewidth=1.4, linestyle="--") 444 | plt.scatter([t,],[yb,], 20, color ='dodgerblue') 445 | plt.annotate(r'$recall_p=%s$' % round(a.loc[idx]['cdf1'],4), xy=(t, yb), xycoords='data', xytext=(+10, -5), 446 | textcoords='offset points', fontsize=8, 447 | arrowprops=dict(arrowstyle='->', connectionstyle="arc3,rad=.1")) 448 | 449 | plt.scatter([t,],[yg,], 20, color ='darkorange') 450 | plt.annotate(r'$recall_n=%s$' % round(a.loc[idx]['cdf0'],4), xy=(t, yg), xycoords='data', xytext=(+10, -10), 451 | textcoords='offset points', fontsize=8, 452 | arrowprops=dict(arrowstyle='->', connectionstyle="arc3,rad=.1")) 453 | # K-S曲线峰值 454 | plt.scatter([t,],[a.loc[idx]['ks'],], 20, color ='limegreen') 455 | plt.annotate(r'$ks=%s,p=%s$' % (round(a.loc[idx]['ks'],4) 456 | ,round(a.loc[idx]['proba'],4)) 457 | , xy=(a.loc[idx]['percent'], a.loc[idx]['ks']) 458 | , xycoords='data' 459 | , xytext=(+15, -15), 460 | textcoords='offset points' 461 | , fontsize=8 462 | ,arrowprops=dict(arrowstyle='->', connectionstyle="arc3,rad=.1")) 463 | plt.annotate(r'$percent=%s,cnt=%s$' % (round(a.loc[idx]['percent'],4) 464 | ,round(a.loc[idx]['Times'],0)) 465 | , xy=(a.loc[idx]['percent'], a.loc[idx]['ks']) 466 | , xycoords='data' 467 | , xytext=(+25, -25), 468 | textcoords='offset points' 469 | , fontsize=8 470 | ) 471 | 472 | else: 473 | ''' 474 | 改变横轴,横轴为模型输出的概率值 475 | ''' 476 | plt.figure() 477 | plt.grid(True) 478 | plt.plot(1-a['proba'],a['cdf1'], label="CDF_bad") 479 | plt.plot(1-a['proba'],a['cdf0'],label="CDF_good") 480 | plt.plot(1-a['proba'],a['ks'],label="ks") 481 
| 482 | plt.legend() 483 | ymin, ymax = plt.ylim() 484 | plt.xlabel('1-[Predicted probability]') 485 | plt.ylabel('Cumulative probability') 486 | plt.title('Model Evaluation Index K-S') 487 | plt.axis('tight') 488 | plt.show() 489 | # 虚线 490 | t = 1 - a.loc[idx]['proba'] 491 | yb = round(a.loc[idx]['cdf1'],4) 492 | yg = round(a.loc[idx]['cdf0'],4) 493 | 494 | plt.plot([t,t],[yb,yg], color ='red', linewidth=1.4, linestyle="--") 495 | plt.scatter([t,],[yb,], 20, color ='dodgerblue') 496 | plt.annotate(r'$recall_p=%s$' % round(a.loc[idx]['cdf1'],4), xy=(t, yb), xycoords='data', xytext=(+10, -5), 497 | textcoords='offset points', fontsize=8, 498 | arrowprops=dict(arrowstyle='->', connectionstyle="arc3,rad=.1")) 499 | 500 | plt.scatter([t,],[yg,], 20, color ='darkorange') 501 | plt.annotate(r'$recall_n=%s$' % round(a.loc[idx]['cdf0'],4), xy=(t, yg), xycoords='data', xytext=(+10, -10), 502 | textcoords='offset points', fontsize=8, 503 | arrowprops=dict(arrowstyle='->', connectionstyle="arc3,rad=.1")) 504 | # K-S曲线峰值 505 | plt.scatter([t,],[a.loc[idx]['ks'],], 20, color ='limegreen') 506 | plt.annotate(r'$ks=%s,p=%s$' % (round(a.loc[idx]['ks'],4) 507 | ,round(a.loc[idx]['proba'],4)) 508 | , xy=(t, a.loc[idx]['ks']) 509 | , xycoords='data' 510 | , xytext=(+15, -15), 511 | textcoords='offset points' 512 | , fontsize=8 513 | ,arrowprops=dict(arrowstyle='->', connectionstyle="arc3,rad=.1")) 514 | plt.annotate(r'$percent=%s,cnt=%s$' % (round(a.loc[idx]['percent'],4) 515 | ,round(a.loc[idx]['Times'],0)) 516 | , xy=(t, a.loc[idx]['ks']) 517 | , xycoords='data' 518 | , xytext=(+25, -25), 519 | textcoords='offset points' 520 | , fontsize=8 521 | ) 522 | 523 | if out_path: 524 | file_name = out_path if isinstance(out_path, str) else None 525 | plt.savefig(file_name) 526 | else: 527 | plt.show() 528 | 529 | return a.loc[idx] 530 | 531 | 532 | def proc_validattion(dataset_path,config_path,model_path): 533 | print('####PROC VALIDATION#####') 534 | print('dataset_path:\n',dataset_path) 535 | print('config_path:\n',config_path) 536 | print('model_path:\n',model_path) 537 | #fillna 538 | config_path = r'E:\Code\Python_ML_Code\cs_model\config\config_cs_model.csv' 539 | cfg = config.config() 540 | cfg.load_file(config_path, dataset_path) 541 | 542 | for var in [tmp for tmp in cfg.bin_var_list if tmp in list(cfg.dataset_train.columns)]: 543 | # fill null 544 | cfg.dataset_train.loc[cfg.dataset_train[var].isnull(), (var)] = 0 545 | 546 | for var in [tmp for tmp in cfg.discrete_var_list if tmp in list(cfg.dataset_train.columns)]: 547 | # fill null 548 | cfg.dataset_train.loc[cfg.dataset_train[var].isnull(), (var)] = 0 549 | 550 | output = open(model_path, 'rb') 551 | clf_model = pickle.load(output) 552 | output.close() 553 | 554 | clf = clf_model['clf'] 555 | X_test = cfg.dataset_train[clf_model['features_list']] 556 | y_test = cfg.dataset_train['target'] 557 | 558 | y_hat = clf.predict_proba(X_test)[:,1] 559 | ks = compute_ks(y_hat,y_test) 560 | print('global_bt:',cfg.global_bt) 561 | print('global_gt:', cfg.global_gt) 562 | print('ks:',ks) 563 | return ks 564 | 565 | 566 | def proc_cor_eval(dataset_path,config_path,var_list_specfied,out_file_path): 567 | dataset = pd.read_csv(dataset_path) 568 | cfg = pd.read_csv(config_path) 569 | candidate_var_list = cfg[cfg['is_modelfeature'] == 1]['var_name'] 570 | 571 | b = [var for var in dataset.columns if sum(dataset[var].isnull()) == 0] 572 | candidate_var_list = list(set(candidate_var_list).intersection(set(b))) 573 | 574 | if var_list_specfied.__len__()>0: 575 | 
candidate_var_list = list(set(candidate_var_list).intersection(set(var_list_specfied))) 576 | 577 | print('candidate_var_list length:\n',candidate_var_list.__len__()) 578 | print('candidate_var_list:\n',candidate_var_list) 579 | 580 | cor = np.corrcoef(dataset[candidate_var_list].values,rowvar=0) 581 | pd.DataFrame(cor,columns=candidate_var_list).to_csv(out_file_path,index=False) -------------------------------------------------------------------------------- /woe/feature_process.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | __author__ = 'boredbird' 3 | import numpy as np 4 | import woe.config as config 5 | import woe.eval as eval 6 | import copy 7 | import pickle 8 | import time 9 | 10 | class node: 11 | '''Tree Node Class 12 | ''' 13 | def __init__(self,var_name=None,iv=0,split_point=None,right=None,left=None): 14 | self.var_name = var_name # The column index value of the attributes that are used to split data sets 15 | self.iv = iv # The info value of the node 16 | self.split_point = split_point # Store split points list 17 | self.right = right # Right sub tree 18 | self.left = left # Left sub tree 19 | 20 | 21 | class InfoValue(object): 22 | ''' 23 | InfoValue Class 24 | ''' 25 | def __init__(self): 26 | self.var_name = [] 27 | self.split_list = [] 28 | self.iv = 0 29 | self.woe_list = [] 30 | self.iv_list = [] 31 | self.is_discrete = 0 32 | self.sub_total_sample_num = [] 33 | self.positive_sample_num = [] 34 | self.negative_sample_num = [] 35 | self.sub_total_num_percentage = [] 36 | self.positive_rate_in_sub_total = [] 37 | self.negative_rate_in_sub_total = [] 38 | 39 | def init(self,civ): 40 | self.var_name = civ.var_name 41 | self.split_list = civ.split_list 42 | self.iv = civ.iv 43 | self.woe_list = civ.woe_list 44 | self.iv_list = civ.iv_list 45 | self.is_discrete = civ.is_discrete 46 | self.sub_total_sample_num = civ.sub_total_sample_num 47 | self.positive_sample_num = civ.positive_sample_num 48 | self.negative_sample_num = civ.negative_sample_num 49 | self.sub_total_num_percentage = civ.sub_total_num_percentage 50 | self.positive_rate_in_sub_total = civ.positive_rate_in_sub_total 51 | self.negative_rate_in_sub_total = civ.negative_rate_in_sub_total 52 | 53 | 54 | class DisInfoValue(object): 55 | ''' 56 | A Class for the storage of discrete variables transformation information 57 | ''' 58 | def __init__(self): 59 | self.var_name = None 60 | self.origin_value = [] 61 | self.woe_before = [] 62 | 63 | 64 | def change_feature_dtype(df,variable_type): 65 | ''' 66 | change feature data type by the variable_type DataFrame 67 | :param df: dataset DataFrame 68 | :param variable_type: the DataFrame about variables dtypes 69 | :return: None 70 | ''' 71 | s = 'Changing Feature Dtypes' 72 | print(s.center(60,'-')) 73 | for vname in df.columns: 74 | try: 75 | df[vname] = df[vname].astype(variable_type.loc[vname,'v_type']) 76 | print(vname,' '*(40-len(vname)),'{0: >10}'.format(variable_type.loc[vname,'v_type'])) 77 | except Exception: 78 | print('[error]',vname) 79 | print('[original dtype] ',df.dtypes[vname],' [astype] ',variable_type.loc[vname,'v_type']) 80 | print('[unique value]',np.unique(df[vname])) 81 | 82 | s = 'Variable Dtypes Have Been Specified' 83 | print(s.center(60,'-')) 84 | 85 | return 86 | 87 | def check_point(df,var,split,min_sample): 88 | """ 89 | Check whether the segmentation points cause some packet samples to be too small; 90 | If there is a packet sample size of less than 5% of the total sample size, 
91 | then merge with the adjacent packet until more than 5%; 92 | Applies only to continuous values 93 | :param df: Dataset DataFrame 94 | :param var: variable name 95 | :param split: Split points list 96 | :param min_sample: Minimum packet sample size 97 | :return: The split points list checked out 98 | """ 99 | new_split = [] 100 | if split is not None and split.__len__()>0: 101 | # print('run into if line:98') 102 | new_split.append(split[0]) 103 | # print(new_split) 104 | # Try the left section of the first split point partition; 105 | # If it does not meet the conditions then the split point will be removed 106 | pdf = df[df[var] <= split[0]] 107 | if (pdf.shape[0] < min_sample) or (len(np.unique(pdf['target']))<=1): 108 | # print('run into if line:105') 109 | new_split.pop() 110 | # print(new_split) 111 | for i in range(0,split.__len__()-1): 112 | pdf = df[(df[var] > split[i]) & (df[var] <= split[i+1])] 113 | if (pdf.shape[0] < min_sample) or (np.unique(pdf['target']).__len__()<=1): 114 | # print('run into if line:112') 115 | continue 116 | else: 117 | # print('run into if line:115') 118 | new_split.append(split[i+1]) 119 | # print(new_split) 120 | 121 | #If the remaining sample is too small then remove the last split point 122 | # print(new_split) 123 | # print(new_split.__len__()) 124 | if new_split.__len__()>1 and len(df[df[var] >= new_split[new_split.__len__()-1]]) < min_sample: 125 | new_split.pop() 126 | # print(new_split) 127 | 128 | #If the remaining sample has only one unique target value then remove the last split point 129 | if new_split.__len__()>1 and np.unique(df[df[var] >= new_split[new_split.__len__()-1]]['target']).__len__()<=1: 130 | # print(split) 131 | # print(split[split.__len__()-1]) 132 | # print(df[df[var] >= new_split[new_split.__len__()-1]].shape) 133 | # print(np.unique(df[df[new_split] > new_split[new_split.__len__()-1]]['target'])) 134 | # print('run into if line:125') 135 | new_split.pop() 136 | # print(new_split) 137 | # If all candidate split points have been removed, fall back to the original split list 138 | if new_split == []: 139 | new_split = split 140 | else: 141 | pass 142 | return new_split 143 | 144 | def calulate_iv(df,var,global_bt,global_gt): 145 | ''' 146 | calculate the iv and woe value without split 147 | :param df: 148 | :param var: 149 | :param global_bt: 150 | :param global_gt: 151 | :return: 152 | ''' 153 | # a = df.groupby(['target']).count() 154 | groupdetail = {} 155 | bt_sub = sum(df['target']) 156 | bri = (bt_sub + 0.0001)* 1.0 / global_bt 157 | gt_sub = df.shape[0] - bt_sub 158 | gri = (gt_sub + 0.0001)* 1.0 / global_gt 159 | 160 | groupdetail['woei'] = np.log(bri / gri) 161 | groupdetail['ivi'] = (bri - gri) * np.log(bri / gri) 162 | groupdetail['sub_total_num_percentage'] = df.shape[0]*1.0/(global_bt+global_gt) 163 | groupdetail['positive_sample_num'] = bt_sub 164 | groupdetail['negative_sample_num'] = gt_sub 165 | groupdetail['positive_rate_in_sub_total'] = bt_sub*1.0/df.shape[0] 166 | groupdetail['negative_rate_in_sub_total'] = gt_sub*1.0/df.shape[0] 167 | 168 | return groupdetail 169 | 170 | 171 | def calculate_iv_split(df,var,split_point,global_bt,global_gt): 172 | """ 173 | calculate the iv value with the specified split point 174 | note: 175 | the dataset must contain a 'target' column; to be encapsulated when time permits 176 | :return: 177 | """ 178 | #split dataset 179 | dataset_r = df[df.loc[:,var] > split_point][[var,'target']] 180 | dataset_l = df[df.loc[:,var] <= split_point][[var,'target']] 181 | 182 | r1_cnt = sum(dataset_r['target']) 183 | r0_cnt = dataset_r.shape[0] - r1_cnt 184 | 185 | l1_cnt = sum(dataset_l['target']) 186 | l0_cnt = dataset_l.shape[0] - l1_cnt 187 | 188 | if r0_cnt == 0 or r1_cnt == 0 or l0_cnt == 0 or l1_cnt
==0: 189 | return 0,0,0,dataset_l,dataset_r,0,0 190 | 191 | lbr = (l1_cnt+ 0.0001)*1.0/global_bt 192 | lgr = (l0_cnt+ 0.0001)*1.0/global_gt 193 | woel = np.log(lbr/lgr) 194 | ivl = (lbr-lgr)*woel 195 | rbr = (r1_cnt+ 0.0001)*1.0/global_bt 196 | rgr = (r0_cnt+ 0.0001)*1.0/global_gt 197 | woer = np.log(rbr/rgr) 198 | ivr = (rbr-rgr)*woer 199 | iv = ivl+ivr 200 | 201 | return woel,woer,iv,dataset_l,dataset_r,ivl,ivr 202 | 203 | 204 | def binning_data_split(df,var,global_bt,global_gt,min_sample,alpha=0.01): 205 | """ 206 | Specify the data split level and return the split value list 207 | :return: 208 | """ 209 | iv_var = InfoValue() 210 | # Calculates the IV of the current node before splitted 211 | gd = calulate_iv(df, var,global_bt,global_gt) 212 | 213 | woei, ivi = gd['woei'],gd['ivi'] 214 | 215 | if np.unique(df[var]).__len__() <=8: 216 | # print('running into if') 217 | split = list(np.unique(df[var])) 218 | split.sort() 219 | # print('split:',split) 220 | #Segmentation point checking and processing 221 | split = check_point(df, var, split, min_sample) 222 | split.sort() 223 | # print('after check:',split) 224 | iv_var.split_list = split 225 | return node(split_point=split,iv=ivi) 226 | 227 | percent_value = list(np.unique(np.percentile(df[var], range(100)))) 228 | percent_value.sort() 229 | 230 | if percent_value.__len__() <=2: 231 | iv_var.split_list = list(np.unique(percent_value)).sort() 232 | return node(split_point=percent_value,iv=ivi) 233 | 234 | # A sentry that attempts to split the current node 235 | # Init bestSplit_iv with zero 236 | bestSplit_iv = 0 237 | bestSplit_woel = [] 238 | bestSplit_woer = [] 239 | bestSplit_ivl = 0 240 | bestSplit_ivr = 0 241 | bestSplit_point = [] 242 | 243 | #remove max value and min value in case dataset_r or dataset_l will be null 244 | for point in percent_value[0:percent_value.__len__()-1]: 245 | # If there is only a sample or a negative sample, skip 246 | if set(df[df[var] > point]['target']).__len__() == 1 or set(df[df[var] <= point]['target']).__len__() == 1 \ 247 | or df[df[var] > point].shape[0] < min_sample or df[df[var] <= point].shape[0] < min_sample : 248 | continue 249 | 250 | woel, woer, iv, dataset_l, dataset_r, ivl, ivr = calculate_iv_split(df,var,point,global_bt,global_gt) 251 | 252 | if iv > bestSplit_iv: 253 | bestSplit_woel = woel 254 | bestSplit_woer = woer 255 | bestSplit_iv = iv 256 | bestSplit_point = point 257 | bestSplit_dataset_r = dataset_r 258 | bestSplit_dataset_l = dataset_l 259 | bestSplit_ivl = ivl 260 | bestSplit_ivr = ivr 261 | 262 | # If the IV after division is greater than the IV value before the current segmentation, the segmentation is valid and recursive 263 | # specified step learning rate 0.01 264 | if bestSplit_iv > ivi*(1+alpha) and bestSplit_dataset_r.shape[0] > min_sample and bestSplit_dataset_l.shape[0] > min_sample: 265 | presplit_right = node() 266 | presplit_left = node() 267 | 268 | # Determine whether the right node satisfies the segmentation prerequisite 269 | if bestSplit_dataset_r.shape[0] < min_sample or set(bestSplit_dataset_r['target']).__len__() == 1: 270 | presplit_right.iv = bestSplit_ivr 271 | right = presplit_right 272 | else: 273 | right = binning_data_split(bestSplit_dataset_r,var,global_bt,global_gt,min_sample,alpha=0.01) 274 | 275 | # Determine whether the left node satisfies the segmentation prerequisite 276 | if bestSplit_dataset_l.shape[0] < min_sample or np.unique(bestSplit_dataset_l['target']).__len__() == 1: 277 | presplit_left.iv = bestSplit_ivl 278 | left = presplit_left 
279 | else: 280 | left = binning_data_split(bestSplit_dataset_l,var,global_bt,global_gt,min_sample,alpha=0.01) 281 | 282 | return node(var_name=var,split_point=bestSplit_point,iv=ivi,left=left,right=right) 283 | else: 284 | # Returns the current node as the final leaf node 285 | return node(var_name=var,iv=ivi) 286 | 287 | 288 | def search(tree,split_list): 289 | ''' 290 | search the tree node 291 | :param tree: a instance of Tree Node Class 292 | :return: split points list 293 | ''' 294 | if isinstance(tree.split_point, list): 295 | split_list.extend(tree.split_point) 296 | else: 297 | split_list.append(tree.split_point) 298 | 299 | if tree.left is not None: 300 | search(tree.left,split_list) 301 | 302 | if tree.right is not None: 303 | search(tree.right,split_list) 304 | 305 | return split_list 306 | 307 | 308 | def format_iv_split(df,var,split_list,global_bt,global_gt): 309 | ''' 310 | Given the dataset DataFrame and split points list then return a InfoValue instance; 311 | Just for continuous variable 312 | :param df: 313 | :param var: 314 | :param split_list: 315 | :param global_bt: 316 | :param global_gt: 317 | :return: 318 | ''' 319 | civ = InfoValue() 320 | civ.var_name = var 321 | civ.split_list = split_list 322 | dfcp = df[:] 323 | 324 | civ.sub_total_sample_num = [] 325 | civ.positive_sample_num = [] 326 | civ.negative_sample_num = [] 327 | civ.sub_total_num_percentage = [] 328 | civ.positive_rate_in_sub_total = [] 329 | 330 | for i in range(0, split_list.__len__()): 331 | dfi = dfcp[dfcp[var] <= split_list[i]] 332 | dfcp = dfcp[dfcp[var] > split_list[i]] 333 | gd = calulate_iv(dfi, var,global_bt,global_gt) 334 | woei, ivi = gd['woei'],gd['ivi'] 335 | civ.woe_list.append(woei) 336 | civ.iv_list.append(ivi) 337 | civ.sub_total_sample_num.append(dfi.shape[0]) 338 | civ.positive_sample_num.append(gd['positive_sample_num']) 339 | civ.negative_sample_num.append(gd['negative_sample_num']) 340 | civ.sub_total_num_percentage.append(gd['sub_total_num_percentage']) 341 | civ.positive_rate_in_sub_total.append(gd['positive_rate_in_sub_total']) 342 | civ.negative_rate_in_sub_total.append(gd['negative_rate_in_sub_total']) 343 | 344 | if dfcp.shape[0]>0: 345 | gd = calulate_iv(dfcp, var,global_bt,global_gt) 346 | woei, ivi = gd['woei'],gd['ivi'] 347 | civ.woe_list.append(woei) 348 | civ.iv_list.append(ivi) 349 | civ.sub_total_sample_num.append(dfcp.shape[0]) 350 | civ.positive_sample_num.append(gd['positive_sample_num']) 351 | civ.negative_sample_num.append(gd['negative_sample_num']) 352 | civ.sub_total_num_percentage.append(gd['sub_total_num_percentage']) 353 | civ.positive_rate_in_sub_total.append(gd['positive_rate_in_sub_total']) 354 | civ.negative_rate_in_sub_total.append(gd['negative_rate_in_sub_total']) 355 | 356 | civ.iv = sum(civ.iv_list) 357 | return civ 358 | 359 | 360 | def woe_trans(dvar,civ): 361 | # replace the var value with the given woe value 362 | var = copy.deepcopy(dvar) 363 | if not civ.is_discrete: 364 | if civ.woe_list.__len__()>1: 365 | split_list = [] 366 | split_list.append(float("-inf")) 367 | split_list.extend([i for i in civ.split_list]) 368 | split_list.append(float("inf")) 369 | 370 | for i in range(civ.woe_list.__len__()): 371 | var[(dvar > split_list[i]) & (dvar <= split_list[i+1])] = civ.woe_list[i] 372 | else: 373 | var[:] = civ.woe_list[0] 374 | else: 375 | split_map = {} 376 | for i in range(civ.split_list.__len__()): 377 | for j in range(civ.split_list[i].__len__()): 378 | split_map[civ.split_list[i][j]] = civ.woe_list[i] 379 | 380 | var = 
var.map(split_map) 381 | 382 | return var 383 | 384 | def proc_woe_discrete(df,var,global_bt,global_gt,min_sample,alpha=0.01): 385 | ''' 386 | process woe transformation of discrete variables 387 | :param df: 388 | :param var: 389 | :param global_bt: 390 | :param global_gt: 391 | :param min_sample: 392 | :return: 393 | ''' 394 | s = 'process discrete variable:'+str(var) 395 | print(s.center(60, '-')) 396 | 397 | df = df[[var,'target']] 398 | div = DisInfoValue() 399 | div.var_name = var 400 | rdict = {} 401 | cpvar = df[var] 402 | # print('np.unique(df[var]):',np.unique(df[var])) 403 | for var_value in np.unique(df[var]): 404 | # Here come with a '==',in case type error you must do Nan filling process firstly 405 | df_temp = df[df[var] == var_value] 406 | gd = calulate_iv(df_temp,var,global_bt,global_gt) 407 | woei, ivi = gd['woei'],gd['ivi'] 408 | div.origin_value.append(var_value) 409 | div.woe_before.append(woei) 410 | rdict[var_value] = woei 411 | # print(var_value,woei,ivi) 412 | 413 | cpvar = cpvar.map(rdict) 414 | df[var] = cpvar 415 | 416 | iv_tree = binning_data_split(df,var,global_bt,global_gt,min_sample,alpha) 417 | 418 | # Traversal tree, get the segmentation point 419 | split_list = [] 420 | search(iv_tree, split_list) 421 | split_list = list(np.unique([1.0 * x for x in split_list if x is not None])) 422 | split_list.sort() 423 | 424 | # Segmentation point checking and processing 425 | split_list = check_point(df, var, split_list, min_sample) 426 | split_list.sort() 427 | 428 | civ = format_iv_split(df, var, split_list,global_bt,global_gt) 429 | civ.is_discrete = 1 430 | 431 | split_list_temp = [] 432 | split_list_temp.append(float("-inf")) 433 | split_list_temp.extend([i for i in split_list]) 434 | split_list_temp.append(float("inf")) 435 | 436 | a = [] 437 | for i in range(split_list_temp.__len__() - 1): 438 | temp = [] 439 | for j in range(div.origin_value.__len__()): 440 | if (div.woe_before[j]>split_list_temp[i]) & (div.woe_before[j]<=split_list_temp[i+1]): 441 | temp.append(div.origin_value[j]) 442 | 443 | if temp != [] : 444 | a.append(temp) 445 | 446 | civ.split_list = a 447 | 448 | return civ 449 | 450 | 451 | def proc_woe_continuous(df,var,global_bt,global_gt,min_sample,alpha=0.01): 452 | ''' 453 | process woe transformation of discrete variables 454 | :param df: 455 | :param var: 456 | :param global_bt: 457 | :param global_gt: 458 | :param min_sample: 459 | :return: 460 | ''' 461 | s = 'process continuous variable:'+str(var) 462 | print(s.center(60, '-')) 463 | df = df[[var,'target']] 464 | iv_tree = binning_data_split(df, var,global_bt,global_gt,min_sample,alpha) 465 | 466 | # Traversal tree, get the segmentation point 467 | split_list = [] 468 | search(iv_tree, split_list) 469 | split_list = list(np.unique([1.0 * x for x in split_list if x is not None])) 470 | split_list.sort() 471 | 472 | # Segmentation point checking and processing 473 | split_list = check_point(df, var, split_list, min_sample) 474 | split_list.sort() 475 | 476 | civ = format_iv_split(df, var,split_list,global_bt,global_gt) 477 | 478 | return civ 479 | 480 | def fillna(dataset,bin_var_list,discrete_var_list,continuous_filler=-1,discrete_filler='missing'): 481 | """ 482 | fill the null value in the dataframe inpalce 483 | :param dataset: input dataset ,pandas.DataFrame type 484 | :param bin_var_list: continuous variables name list 485 | :param discrete_var_list: discretevvvv variables name list 486 | :param continuous_filler: the value to fill the null value in continuous variables 487 | :param 
discrete_filler: the value to fill the null value in discrete variables 488 | :return: null value,replace null value inplace 489 | """ 490 | for var in [tmp for tmp in bin_var_list if tmp in list(dataset.columns)]: 491 | # fill null 492 | dataset.loc[dataset[var].isnull(), (var)] = continuous_filler 493 | 494 | for var in [tmp for tmp in discrete_var_list if tmp in list(dataset.columns)]: 495 | # fill null 496 | dataset.loc[dataset[var].isnull(), (var)] = discrete_filler 497 | 498 | 499 | def process_train_woe(infile_path=None,outfile_path=None,rst_path=None,config_path=None): 500 | print('run into process_train_woe: \n',time.asctime(time.localtime(time.time()))) 501 | data_path = infile_path 502 | cfg = config.config() 503 | cfg.load_file(config_path,data_path) 504 | bin_var_list = [tmp for tmp in cfg.bin_var_list if tmp in list(cfg.dataset_train.columns)] 505 | 506 | for var in bin_var_list: 507 | # fill null 508 | cfg.dataset_train.loc[cfg.dataset_train[var].isnull(), (var)] = -1 509 | 510 | # change feature dtypes 511 | change_feature_dtype(cfg.dataset_train, cfg.variable_type) 512 | rst = [] 513 | 514 | # process woe transformation of continuous variables 515 | print('process woe transformation of continuous variables: \n',time.asctime(time.localtime(time.time()))) 516 | print('cfg.global_bt',cfg.global_bt) 517 | print('cfg.global_gt', cfg.global_gt) 518 | 519 | for var in bin_var_list: 520 | rst.append(proc_woe_continuous(cfg.dataset_train,var,cfg.global_bt,cfg.global_gt,cfg.min_sample,alpha=0.05)) 521 | 522 | # process woe transformation of discrete variables 523 | print('process woe transformation of discrete variables: \n',time.asctime(time.localtime(time.time()))) 524 | for var in [tmp for tmp in cfg.discrete_var_list if tmp in list(cfg.dataset_train.columns)]: 525 | # fill null 526 | cfg.dataset_train.loc[cfg.dataset_train[var].isnull(), (var)] = 'missing' 527 | rst.append(proc_woe_discrete(cfg.dataset_train,var,cfg.global_bt,cfg.global_gt,cfg.min_sample,alpha=0.05)) 528 | 529 | feature_detail = eval.eval_feature_detail(rst, outfile_path) 530 | 531 | print('save woe transformation rule into pickle: \n',time.asctime(time.localtime(time.time()))) 532 | output = open(rst_path, 'wb') 533 | pickle.dump(rst,output) 534 | output.close() 535 | 536 | return feature_detail,rst 537 | 538 | 539 | def process_woe_trans(in_data_path=None,rst_path=None,out_path=None,config_path=None): 540 | cfg = config.config() 541 | cfg.load_file(config_path, in_data_path) 542 | 543 | for var in [tmp for tmp in cfg.bin_var_list if tmp in list(cfg.dataset_train.columns)]: 544 | # fill null 545 | cfg.dataset_train.loc[cfg.dataset_train[var].isnull(), (var)] = -1 546 | 547 | for var in [tmp for tmp in cfg.discrete_var_list if tmp in list(cfg.dataset_train.columns)]: 548 | # fill null 549 | cfg.dataset_train.loc[cfg.dataset_train[var].isnull(), (var)] = 'missing' 550 | 551 | change_feature_dtype(cfg.dataset_train, cfg.variable_type) 552 | 553 | output = open(rst_path, 'rb') 554 | rst = pickle.load(output) 555 | output.close() 556 | 557 | # Training dataset Woe Transformation 558 | for r in rst: 559 | cfg.dataset_train[r.var_name] = woe_trans(cfg.dataset_train[r.var_name], r) 560 | 561 | cfg.dataset_train.to_csv(out_path) 562 | -------------------------------------------------------------------------------- /woe/ftrl.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | __author__ = 'boredbird' 3 | import numpy as np 4 | 5 | class LR(object): 6 | 
@staticmethod 7 | def fn(w, x): 8 | '''sigmoid function 9 | ''' 10 | return 1.0 / (1.0 + np.exp(-w.dot(x))) 11 | 12 | @staticmethod 13 | def loss(y, y_hat): 14 | '''Cross entropy loss function 15 | ''' 16 | return np.sum(np.nan_to_num(-y * np.log(y_hat) - (1 - y) * np.log(1 - y_hat))) 17 | 18 | @staticmethod 19 | def grad(y, y_hat, x): 20 | '''First derivative of the cross entropy loss with respect to the weights w 21 | ''' 22 | return (y_hat - y) * x 23 | 24 | 25 | class FTRL(object): 26 | def __init__(self, dim, l1, l2, alpha, beta, decisionFunc=LR): 27 | self.dim = dim 28 | self.decisionFunc = decisionFunc 29 | self.z = np.zeros(dim) 30 | self.n = np.zeros(dim) 31 | self.w = np.zeros(dim) 32 | self.w_list = [] 33 | self.loss_list = [] 34 | self.l1 = l1 35 | self.l2 = l2 36 | self.alpha = alpha 37 | self.beta = beta 38 | 39 | def predict(self, x): 40 | return self.decisionFunc.fn(self.w, x) 41 | 42 | def update(self, x, y): 43 | self.w = np.array([0 if np.abs(self.z[i]) <= self.l1 else (np.sign( 44 | self.z[i]) * self.l1 - self.z[i]) / (self.l2 + (self.beta + np.sqrt(self.n[i])) / self.alpha) for i in range(self.dim)]) 45 | y_hat = self.predict(x) 46 | g = self.decisionFunc.grad(y, y_hat, x) 47 | sigma = (np.sqrt(self.n + g * g) - np.sqrt(self.n)) / self.alpha 48 | self.z += g - sigma * self.w 49 | self.n += g * g 50 | return self.decisionFunc.loss(y, y_hat) 51 | 52 | def train(self, trainSet, verbos=False, max_itr=10000000000, eta=0.01, epochs=100): 53 | itr = 0 54 | n = 0 55 | while True: 56 | for x, y in trainSet: 57 | loss = self.update(x, y) 58 | if verbos and n%verbos==0: 59 | print("itr=" + str(n) + "\tloss=" + str(loss)) 60 | self.w_list.append(self.w) 61 | self.loss_list.append(loss) 62 | if loss < eta: 63 | itr += 1 64 | else: 65 | itr = 0 66 | if itr >= epochs: # stop when the loss has stayed below eta for 'epochs' consecutive iterations 67 | print("loss has been less than", eta, " continuously for ", itr, "iterations") 68 | return 69 | n += 1 70 | if n >= max_itr: 71 | print("reach max iteration", max_itr) 72 | return --------------------------------------------------------------------------------
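A minimal usage sketch for the FTRL class above (not part of the original package). The toy feature matrix, labels and hyper-parameter values are invented purely for illustration, and the import line assumes the package layout shown in this repository:

import numpy as np
from woe.ftrl import FTRL, LR   # assumed import path

X = np.array([[1.0, 0.0], [0.0, 1.0], [1.0, 1.0], [0.0, 0.0]])   # toy feature matrix, already numeric (e.g. WOE-transformed)
y = np.array([1, 0, 1, 0])                                       # toy binary target
train_set = list(zip(X, y))                                      # FTRL.train iterates over (x, y) pairs repeatedly

model = FTRL(dim=2, l1=1.0, l2=1.0, alpha=0.1, beta=1.0, decisionFunc=LR)
model.train(train_set, verbos=1000, max_itr=100000, eta=0.01, epochs=100)
print(model.w)              # learned weight vector
print(model.predict(X[0]))  # predicted probability for the first sample

Because train() only stops once the loss has stayed below eta for 'epochs' consecutive updates or max_itr is reached, passing a finite max_itr as above keeps the sketch from looping indefinitely on data the model cannot fit.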