├── .idea
│   ├── deployment.xml
│   ├── inspectionProfiles
│   │   └── profiles_settings.xml
│   ├── misc.xml
│   ├── modules.xml
│   ├── preferred-vcs.xml
│   ├── vcs.xml
│   ├── woe.iml
│   └── workspace.xml
├── LICENSE.txt
├── MANIFEST.in
├── README.rst
├── build
│   └── lib
│       └── woe
│           ├── GridSearch.py
│           ├── __init__.py
│           ├── config.py
│           ├── eval.py
│           ├── feature_process.py
│           └── ftrl.py
├── dist
│   ├── woe-0.1.4-py2-none-any.tar.gz
│   ├── woe-0.1.4-py2-none-any.whl
│   ├── woe-0.1.4-py2.7.egg
│   ├── woe-0.1.4-py3-none-any.tar.gz
│   ├── woe-0.1.4-py3-none-any.whl
│   └── woe-0.1.4.tar.gz
├── examples
│   ├── HereWeGo.py
│   ├── README.rst
│   ├── UCI_Credit_Card.csv
│   └── config.csv
├── setup.py
├── woe.egg-info
│   ├── PKG-INFO
│   ├── SOURCES.txt
│   ├── dependency_links.txt
│   ├── requires.txt
│   └── top_level.txt
└── woe
    ├── GridSearch.py
    ├── __init__.py
    ├── config.py
    ├── eval.py
    ├── feature_process.py
    └── ftrl.py
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | Copyright (c) 2017 The Python Packaging Authority (PyPA)
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy
4 | of this software and associated documentation files (the "Software"), to deal
5 | in the Software without restriction, including without limitation the rights
6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | copies of the Software, and to permit persons to whom the Software is
8 | furnished to do so, subject to the following conditions:
9 |
10 | The above copyright notice and this permission notice shall be included in all
11 | copies or substantial portions of the Software.
12 |
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19 | SOFTWARE.
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include README.rst
2 | include *.txt
3 | include *.py
4 | recursive-include examples *.csv *.py *.rst
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | woe
2 | ===
3 |
4 | .. image:: https://travis-ci.org/justdoit0823/pywxclient.svg?branch=master
5 | :target: https://travis-ci.org/justdoit0823/pywxclient
6 |
7 | version: 0.1.4
8 |
9 | Tools for WoE transformation, mostly used in scorecard models for credit rating
10 |
11 | Installation
12 | --------------------------------
13 |
14 | You can simply install it with pip, as follows:
15 |
16 | .. code-block:: bash
17 |
18 | $ pip install woe
19 |
20 | or install it from git:
21 |
22 | .. code-block:: bash
23 |
24 | $ pip install git+https://github.com/boredbird/woe
25 |
26 |
27 | Features
28 | ========
29 |
30 | * Binning by splitting a tree with the IV criterion
31 |
32 | * A rich set of model evaluation methods
33 |
34 | * Unified output format that is easy to export
35 |
36 | * Storage of the IV tree for follow-up use
37 |
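For reference, the per-bin WoE and the IV that ``woe.feature_process.calulate_iv``
computes follow the standard definitions (the code also adds a small 0.0001
smoothing term to both rates to avoid division by zero):

.. math::

   WoE_i = \ln\frac{bad_i / bad_{total}}{good_i / good_{total}}, \qquad
   IV = \sum_i \left(\frac{bad_i}{bad_{total}} - \frac{good_i}{good_{total}}\right) \cdot WoE_i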
38 |
39 |
40 | **woe** module function tree
41 | ============================
42 |
43 | ::
44 |
45 | |- __init__
46 | |- config.py
47 | | |-- config
48 | | |-- __init__
49 | | |-- change_config_var_dtype()
50 | | |-- load_file()
51 | |- eval.py
52 | | |-- compute_ks()
53 | | |-- eval_data_summary()
54 | | |-- eval_feature_detail()
55 | | |-- eval_feature_stability()
56 | | |-- eval_feature_summary()
57 | | |-- eval_model_stability()
58 | | |-- eval_model_summary()
59 | | |-- eval_segment_metrics()
60 | | |-- plot_ks()
61 | | |-- proc_cor_eval()
62 | | |-- proc_validation()
63 | | |-- wald_test()
64 | |- feature_process.py
65 | | |-- binning_data_split()
66 | | |-- calculate_iv_split()
67 | | |-- calulate_iv()
68 | | |-- change_feature_dtype()
69 | | |-- check_point()
70 | | |-- fillna()
71 | | |-- format_iv_split()
72 | | |-- proc_woe_continuous()
73 | | |-- proc_woe_discrete()
74 | | |-- process_train_woe()
75 | | |-- process_woe_trans()
76 | | |-- search()
77 | | |-- woe_trans()
78 | |- ftrl.py
79 | | |-- FTRL()
80 | | |-- LR()
81 | |- GridSearch.py
82 | | |-- fit_single_lr()
83 | | |-- grid_search_lr_c()
84 | | |-- grid_search_lr_c_main()
85 | | |-- grid_search_lr_validation()
86 |
87 |
88 | Examples
89 | ========
90 |
91 | In the examples directory, there is a simple WoE transformation program that serves as a tutorial.
92 |
93 | Or you can write a more complex program with this `woe` package.
94 |
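As a tiny illustration of the evaluation helpers (a hedged sketch with made-up toy
scores, not the full scorecard workflow shown in the examples directory):

.. code-block:: python

    import numpy as np
    from woe.eval import compute_ks

    # toy data: positives tend to receive higher scores than negatives
    target = np.array([0, 0, 0, 0, 1, 1, 1, 1])
    proba = np.array([0.1, 0.2, 0.3, 0.6, 0.4, 0.7, 0.8, 0.9])

    print(compute_ks(proba, target))  # KS distance between the two score distributions
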
95 | Version Records
96 | ================
97 | woe 0.1.4 2018-03-01
98 | * support py3
99 |
100 | woe 0.1.3 2018-02-09
101 |
102 | * woe.feature_process.proc_woe_discrete(): fix a bug when dealing with discrete variables
103 | * woe.eval.eval_feature_detail(): fix a bug in the utf-8 output file format
104 | * woe.GridSearch.grid_search_lr_c_main(): add a function wrapper for convenience and efficiency
105 | * woe.GridSearch.grid_search_lr_c_validation(): monitor the KS performance of training and test sets over different values of 'c'
106 | * add example test scripts
107 |
108 |
109 | woe 0.1.2 2017-12-05
110 |
111 | * woe.ftrl.FTRL(): add online learning module
112 |
113 | woe 0.1.1 2017-11-28
114 |
115 | * woe.config.load_file(): make the data_path parameter optional
116 | * woe.eval.eval_feature_stability(): fix a bug in the psi_dict['stability_index'] computation
117 | * woe.feature_process.change_feature_dtype(): add friendly tips when an error is encountered
118 | * woe.feature_process.calulate_iv(): refactor the code
119 | * woe.feature_process.calculate_iv_split(): refactor the code
120 | * woe.feature_process.binning_data_split(): reduce the number of len() calls by using __len__() and shape attributes; replace namedtuple with dict
121 | * woe.feature_process.fillna(): newly added function to fill null values
122 | * woe.GridSearch.grid_search_lr_c(): the list of regularization parameters c is now specified by the user instead of hard-coded inside the function
123 |
124 | woe 0.0.9 2017-11-21
125 |
126 | * Add module: GridSearch, for searching for the optimal hyperparameter C in LogisticRegression
127 | * Code refactoring: functions compute_ks and plot_ks
128 |
129 | woe 0.0.8 2017-09-28
130 |
131 | * More flexible: remove the conditional restriction in feature_process.change_feature_dtype()
132 | * Fix bug: incorrect use of deepcopy in feature_process.woe_trans()
133 |
134 | woe 0.0.7 2017-09-19
135 |
136 | * Fix bug: eval.eval_feature_detail raises ValueError('arrays must all be same length')
137 | * Add parameter interface: alpha specifies the learning-rate step, default 0.01
138 |
139 | How to Contribute
140 | --------------------------------
141 |
142 | Email me: 1002937942@qq.com.
143 |
--------------------------------------------------------------------------------
/build/lib/woe/GridSearch.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | __author__ = 'boredbird'
3 | import pandas as pd
4 | import numpy as np
5 | import matplotlib.pyplot as plt
6 | from sklearn.linear_model import LogisticRegression
7 | from datetime import datetime
8 | from sklearn.svm import l1_min_c
9 | from woe.eval import compute_ks
10 | import pickle
11 | import time
12 |
13 | """
14 | Search for the optimal hyperparameter C in LogisticRegression
15 | """
16 | def grid_search_lr_c(X_train,y_train,cs,df_coef_path=False
17 | ,pic_coefpath_title='Logistic Regression Path',pic_coefpath=False
18 | ,pic_performance_title='Logistic Regression Performance',pic_performance=False):
19 | """
20 | grid search for the hyperparameter c with the best ks performance
21 | :param X_train: features dataframe
22 | :param y_train: target
23 | :param cs: list of regularization parameter c
24 | :param df_coef_path: the file path for logistic regression coefficient dataframe
25 | :param pic_coefpath_title: the pic title for coefficient path picture
26 | :param pic_coefpath: the file path for coefficient path picture
27 | :param pic_performance_title: the pic title for ks performance picture
28 | :param pic_performance: the file path for ks performance picture
29 | :return: a tuple of c and ks value with the best ks performance
30 | """
31 | # init a LogisticRegression model
32 | clf_l1_LR = LogisticRegression(C=0.1, penalty='l1', tol=0.01,class_weight='balanced')
33 | # cs = l1_min_c(X_train, y_train, loss='log') * np.logspace(0, 9,200)
34 |
35 | print("Computing regularization path ...")
36 | start = datetime.now()
37 | print(start)
38 | coefs_ = []
39 | ks = []
40 | for c in cs:
41 | clf_l1_LR.set_params(C=c)
42 | clf_l1_LR.fit(X_train, y_train)
43 | coefs_.append(clf_l1_LR.coef_.ravel().copy())
44 |
45 | proba = clf_l1_LR.predict_proba(X_train)[:,1]
46 | ks.append(compute_ks(proba,y_train))
47 |
48 | end = datetime.now()
49 | print(end)
50 | print("This took ", end - start)
51 | coef_cv_df = pd.DataFrame(coefs_,columns=X_train.columns)
52 | coef_cv_df['ks'] = ks
53 | coef_cv_df['c'] = cs
54 |
55 | if df_coef_path:
56 | file_name = df_coef_path if isinstance(df_coef_path, str) else None
57 | coef_cv_df.to_csv(file_name)
58 |
59 | coefs_ = np.array(coefs_)
60 |
61 | fig1 = plt.figure('fig1')
62 | plt.plot(np.log10(cs), coefs_)
63 | ymin, ymax = plt.ylim()
64 | plt.xlabel('log(C)')
65 | plt.ylabel('Coefficients')
66 | plt.title(pic_coefpath_title)
67 | plt.axis('tight')
68 | if pic_coefpath:
69 | file_name = pic_coefpath if isinstance(pic_coefpath, str) else None
70 | plt.savefig(file_name)
71 | else:
72 | plt.show()
73 |
74 | fig2 = plt.figure('fig2')
75 | plt.plot(np.log10(cs), ks)
76 | plt.xlabel('log(C)')
77 | plt.ylabel('ks score')
78 | plt.title(pic_performance_title)
79 | plt.axis('tight')
80 | if pic_performance:
81 | file_name = pic_performance if isinstance(pic_performance, str) else None
82 | plt.savefig(file_name)
83 | else:
84 | plt.show()
85 |
86 | valid_idx = np.where((coefs_ < 0).sum(axis=1) == 0)[0]
87 | idx = valid_idx[np.array(ks)[valid_idx].argmax()]
88 |
89 | return (cs[idx],ks[idx])
90 |
91 |
92 | def grid_search_lr_c_validation(X_train,y_train,validation_dataset_list,cs=[0.01],df_coef_path=False
93 | ,pic_coefpath_title='Logistic Regression Path',pic_coefpath=False
94 | ,pic_performance_title='Logistic Regression Performance',pic_performance=False):
95 | """
96 | grid search for the hyperparameter c with the best ks performance
97 | :param X_train: features dataframe
98 | :param y_train: target
99 | :param cs: list of c value
100 | :param df_coef_path: the file path for logistic regression coefficient dataframe
101 | :param pic_coefpath_title: the pic title for coefficient path picture
102 | :param pic_coefpath: the file path for coefficient path picture
103 | :param pic_performance_title: the pic title for ks performance picture
104 | :param pic_performance: the file path for ks performance picture
105 | :return: a tuple of c and ks value with the best ks performance
106 | """
107 | # init a LogisticRegression model
108 | clf_l1_LR = LogisticRegression(C=0.1, penalty='l1', tol=0.01,class_weight='balanced')
109 |
110 | print("Computing regularization path ...")
111 | start = datetime.now()
112 | print(start)
113 | coefs_ = []
114 | ks = []
115 | ks_validation1 = []
116 | ks_validation2 = []
117 | counter = 0
118 | for c in cs:
119 | print('time: ',time.asctime(time.localtime(time.time())),'counter: ',counter, ' c: ',c)
120 | clf_l1_LR.set_params(C=c)
121 | clf_l1_LR.fit(X_train, y_train)
122 | coefs_.append(clf_l1_LR.coef_.ravel().copy())
123 |
124 | proba = clf_l1_LR.predict_proba(X_train)[:,1]
125 | validation_proba1 = clf_l1_LR.predict_proba(validation_dataset_list[0][X_train.columns])[:,1]
126 |
127 | ks.append(compute_ks(proba,y_train))
128 | ks_validation1.append(compute_ks(validation_proba1,validation_dataset_list[0]['target']))
129 |
130 | print('ks:\t',ks[-1],'ks_validation1:\t',ks_validation1[-1])
131 | counter += 1
132 |
133 | end = datetime.now()
134 | print(end)
135 | print("This took ", end - start)
136 | coef_cv_df = pd.DataFrame(coefs_,columns=X_train.columns)
137 | coef_cv_df['ks'] = ks
138 | coef_cv_df['ks_validation1'] = ks_validation1
139 | coef_cv_df['c'] = cs
140 |
141 |
142 | if df_coef_path:
143 | file_name = df_coef_path if isinstance(df_coef_path, str) else None
144 | coef_cv_df.to_csv(file_name)
145 |
146 | coefs_ = np.array(coefs_)
147 |
148 | fig1 = plt.figure('fig1')
149 | plt.plot(np.log10(cs), coefs_)
150 | ymin, ymax = plt.ylim()
151 | plt.xlabel('log(C)')
152 | plt.ylabel('Coefficients')
153 | plt.title(pic_coefpath_title)
154 | plt.axis('tight')
155 | if pic_coefpath:
156 | file_name = pic_coefpath if isinstance(pic_coefpath, str) else None
157 | plt.savefig(file_name)
158 | plt.close()
159 | else:
160 | pass
161 | # plt.show()
162 | # plt.close()
163 |
164 | fig2 = plt.figure('fig2')
165 | plt.plot(np.log10(cs), ks)
166 | plt.xlabel('log(C)')
167 | plt.ylabel('ks score')
168 | plt.title(pic_performance_title)
169 | plt.axis('tight')
170 | if pic_performance:
171 | file_name = pic_performance if isinstance(pic_performance, str) else None
172 | plt.savefig(file_name)
173 | plt.close()
174 | else:
175 | pass
176 | # plt.show()
177 | # plt.close()
178 |
179 | valid_idx = np.where((coefs_ < 0).sum(axis=1) == 0)[0]
180 | if valid_idx.__len__() > 0:
181 | idx = valid_idx[np.array(ks)[valid_idx].argmax()]
182 | else:
183 | idx = np.array(ks).argmax()
184 |
185 | return (cs[idx],ks[idx])
186 |
187 |
188 | def grid_search_lr_c_main(params):
189 | print('run into grid_search_lr_c_main:')
190 | dataset_path = params['dataset_path']
191 | validation_path = params['validation_path']
192 | config_path = params['config_path']
193 | df_coef_path = params['df_coef_path']
194 | pic_coefpath = params['pic_coefpath']
195 | pic_performance = params['pic_performance']
196 | pic_coefpath_title = params['pic_coefpath_title']
197 | pic_performance_title = params['pic_performance_title']
198 |
199 | dataset_train = pd.read_csv(dataset_path)
200 | cfg = pd.read_csv(config_path)
201 | candidate_var_list = cfg[cfg['is_modelfeature'] == 1]['var_name']
202 |
203 | b = [var for var in dataset_train.columns if sum(dataset_train[var].isnull()) == 0]
204 | candidate_var_list = list(set(candidate_var_list).intersection(set(b)))
205 |
206 | var_list_specfied = params['var_list_specfied']
207 | if var_list_specfied.__len__()>0:
208 | candidate_var_list = list(set(candidate_var_list).intersection(set(var_list_specfied)))
209 |
210 | print('candidate_var_list length:\n',candidate_var_list.__len__())
211 | print('candidate_var_list:\n',candidate_var_list)
212 |
213 | print('change dtypes:float64 to float32')
214 | for var in candidate_var_list:
215 | dataset_train[var] = dataset_train[var].astype(np.float32)
216 |
217 | X_train = dataset_train[dataset_train.target >=0][candidate_var_list]
218 | y_train = dataset_train[dataset_train.target >=0]['target']
219 |
220 | validation_cols_keep = [var for var in candidate_var_list]
221 | validation_cols_keep.append('target')
222 | validation_dataset_list = []
223 |
224 | validation_dataset = pd.read_csv(validation_path)
225 | # fillna
226 | for var in candidate_var_list:
227 | validation_dataset.loc[validation_dataset[var].isnull(), (var)] = 0
228 | validation_dataset_list.append(validation_dataset[validation_cols_keep])
229 |
230 | cs = params['cs']
231 | print('cs',cs)
232 | c,ks = grid_search_lr_c_validation(X_train,y_train,validation_dataset_list,cs,df_coef_path,pic_coefpath_title,pic_coefpath
233 | ,pic_performance_title,pic_performance)
234 | print('pic_coefpath:\n',pic_coefpath)
235 | print('pic_performance:\n',pic_performance)
236 | print('ks performance on the c:')
237 | print(c,ks)
238 |
239 | return (c,ks)
240 |
241 |
242 | def fit_single_lr(dataset_path,config_path,var_list_specfied,out_model_path,c=0.01):
243 | dataset_train = pd.read_csv(dataset_path)
244 | cfg = pd.read_csv(config_path)
245 | candidate_var_list = cfg[cfg['is_modelfeature'] == 1]['var_name']
246 |
247 | b = [var for var in dataset_train.columns if sum(dataset_train[var].isnull()) == 0]
248 | candidate_var_list = list(set(candidate_var_list).intersection(set(b)))
249 |
250 | if var_list_specfied.__len__()>0:
251 | candidate_var_list = list(set(candidate_var_list).intersection(set(var_list_specfied)))
252 |
253 | print('candidate_var_list length:\n',candidate_var_list.__len__())
254 | print('candidate_var_list:\n',candidate_var_list)
255 |
256 | print('change dtypes:float64 to float32')
257 | for var in candidate_var_list:
258 | dataset_train[var] = dataset_train[var].astype(np.float32)
259 |
260 | X_train = dataset_train[dataset_train.target >=0][candidate_var_list]
261 | y_train = dataset_train[dataset_train.target >=0]['target']
262 |
263 | print('c:',c)
264 | clf_lr_a = LogisticRegression(C=c, penalty='l1', tol=0.01,class_weight='balanced')
265 |
266 | clf_lr_a.fit(X_train, y_train)
267 | coefs = clf_lr_a.coef_.ravel().copy()
268 |
269 | proba = clf_lr_a.predict_proba(X_train)[:,1]
270 | ks = compute_ks(proba,y_train)
271 |
272 | model = {}
273 | model['clf'] = clf_lr_a
274 | model['features_list'] = candidate_var_list
275 | model['coefs'] = coefs
276 | model['ks'] = ks
277 |
278 | output = open(out_model_path, 'wb')
279 | pickle.dump(model,output)
280 | output.close()
281 |
282 | return model
283 |
--------------------------------------------------------------------------------
/build/lib/woe/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/boredbird/woe/335e9ec2a521d3bbccb0ad5d915128119e4d0ca6/build/lib/woe/__init__.py
--------------------------------------------------------------------------------
/build/lib/woe/config.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | __author__ = 'boredbird'
3 | import pandas as pd
4 |
5 | class config:
6 |
7 | def __init__(self):
8 | self.config = None
9 | self.dataset_train = None
10 | self.variable_type = None
11 | self.bin_var_list = None
12 | self.discrete_var_list = None
13 | self.candidate_var_list = None
14 | self.dataset_len = None
15 | self.min_sample = None
16 | self.global_bt = None
17 | self.global_gt = None
18 |
19 | def load_file(self,config_path,data_path=False):
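# Columns the config CSV is expected to provide, judging from the lookups below:
# var_name, var_dtype, is_tobe_bin, is_candidate (the training CSV must also carry a 'target' column).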
20 | self.config = pd.read_csv(config_path)
21 | # specify variable dtypes
22 | self.variable_type = self.config[['var_name', 'var_dtype']]
23 | self.variable_type = self.variable_type.rename(columns={'var_name': 'v_name', 'var_dtype': 'v_type'})
24 | self.variable_type = self.variable_type.set_index(['v_name'])
25 |
26 | # specify the list of continuous variable to be splitted into bin
27 | self.bin_var_list = self.config[self.config['is_tobe_bin'] == 1]['var_name']
28 | # specify the list of discrete variable to be merged into supper classes
29 | self.discrete_var_list = self.config[(self.config['is_candidate'] == 1) & (self.config['var_dtype'] == 'object')]['var_name']
30 |
31 | # specify the list of model input variable
32 | self.candidate_var_list = self.config[self.config['is_candidate'] == 1]['var_name']
33 |
34 | if data_path:
35 | data_path = data_path if isinstance(data_path, str) else None
36 |
37 | # load dataset train
38 | self.dataset_train = pd.read_csv(data_path)
39 | self.dataset_train.columns = [col.split('.')[-1] for col in self.dataset_train.columns]
40 |
41 | # specify some other global variables about the training dataset
42 | self.dataset_len = len(self.dataset_train)
43 | self.min_sample = int(self.dataset_len * 0.05)
44 | self.global_bt = sum(self.dataset_train['target'])
45 | self.global_gt = len(self.dataset_train) - sum(self.dataset_train['target'])
46 |
47 | def change_config_var_dtype(self,var_name,type,inplace_file=True):
48 | if type in ['object','string','int64','uint8','float64','bool1','bool2','dates','category']:
49 | self.variable_type.loc[var_name,'v_type'] = type
50 | else:
51 | raise KeyError("Invalid dtype specified! ")
--------------------------------------------------------------------------------
/build/lib/woe/eval.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | __author__ = 'boredbird'
3 | import pandas as pd
4 | import numpy as np
5 | import scipy
6 | import matplotlib.pyplot as plt
7 | from scipy.stats import ks_2samp
8 | import woe.config as config
9 | import pickle
10 |
11 | def compute_ks(proba,target):
12 | '''
13 | target: 1-d numpy array of actual target values
14 | proba: 1-d numpy array of predicted probabilities of the sample being positive
15 | returns:
16 | ks: float, ks score estimation
17 | '''
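# KS is the maximum gap between the score distributions of positives and negatives;
# scipy's two-sample KS statistic on the two groups of scores gives exactly that value.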
18 | get_ks = lambda proba, target: ks_2samp(proba[target == 1], proba[target != 1]).statistic
19 |
20 | return get_ks(proba, target)
21 |
22 |
23 | def eval_feature_detail(Info_Value_list,out_path=False):
24 | """
25 | format InfoValue list to Dataframe
26 | :param Info_Value_list: Instance list of Class InfoValue
27 | :param out_path:specify the Dataframe to csv file path ,default False
28 | :return:DataFrame about feature detail
29 | """
30 | rst = Info_Value_list
31 | format_rst = []
32 |
33 | for kk in range(0,len(rst)):
34 | print(rst[kk].var_name)
35 | split_list = []
36 | if rst[kk].split_list != []:
37 | if not rst[kk].is_discrete:
38 | #deal with split_list
39 | split_list.append('(-INF,'+str(rst[kk].split_list[0])+']')
40 | for i in range(0,len(rst[kk].split_list)-1):
41 | split_list.append('(' + str(rst[kk].split_list[i])+','+ str(rst[kk].split_list[i+1]) + ']')
42 |
43 | split_list.append('(' + str(rst[kk].split_list[len(rst[kk].split_list)-1]) + ',+INF)')
44 | else:
45 | split_list = rst[kk].split_list
46 | else:
47 | split_list.append('(-INF,+INF)')
48 |
49 | # merge into dataframe
50 | columns = ['var_name','split_list','sub_total_sample_num','positive_sample_num'
51 | ,'negative_sample_num','sub_total_num_percentage','positive_rate_in_sub_total'
52 | ,'woe_list','iv_list','iv']
53 | rowcnt = len(rst[kk].iv_list)
54 | if rowcnt < len(split_list):
55 | split_list = split_list[:rowcnt]
56 |
57 | var_name = [rst[kk].var_name] * rowcnt
58 | iv = [rst[kk].iv] * rowcnt
59 | iv_list = rst[kk].iv_list
60 | woe_list = rst[kk].woe_list
61 | a = pd.DataFrame({'var_name':var_name,'iv_list':iv_list,'woe_list':woe_list
62 | ,'split_list':split_list,'iv':iv,'sub_total_sample_num':rst[kk].sub_total_sample_num
63 | ,'positive_sample_num':rst[kk].positive_sample_num,'negative_sample_num':rst[kk].negative_sample_num
64 | ,'sub_total_num_percentage':rst[kk].sub_total_num_percentage
65 | ,'positive_rate_in_sub_total':rst[kk].positive_rate_in_sub_total
66 | ,'negative_rate_in_sub_total':rst[kk].negative_rate_in_sub_total},columns=columns)
67 | format_rst.append(a)
68 |
69 | # merge dataframe list into one dataframe vertically
70 | cformat_rst = pd.concat(format_rst)
71 |
72 | if out_path:
73 | file_name = out_path if isinstance(out_path, str) else None
74 | cformat_rst.to_csv(file_name, index=False,encoding='utf-8')
75 |
76 | return cformat_rst
77 |
78 |
79 | def eval_data_summary(df_list,source_name,out_path=False):
80 | '''
81 | :param df_list: a list of dataset DataFrames
82 | :param source_name: a list of source name strings
83 | :param out_path: specify the Dataframe to csv file path ,default False
84 | :return: DataFrame about dataset summary info
85 | '''
86 | train_validation_data_summary = []
87 | for i in range(len(source_name)):
88 | a = dict()
89 | a['source'] = source_name[i]
90 | a['total_sample_cnt'] = len(df_list[i])
91 | a['positive_sample_cnt'] = df_list[i]['target'].sum()
92 | a['negative_sample_cnt'] = a['total_sample_cnt'] - a['positive_sample_cnt']
93 | a['positive_rate'] = a['positive_sample_cnt']*1.0/a['total_sample_cnt']
94 | train_validation_data_summary.append(a)
95 |
96 | train_validation_data_summary = pd.DataFrame(train_validation_data_summary)
97 |
98 | if out_path:
99 | file_name = out_path if isinstance(out_path, str) else None
100 | train_validation_data_summary.to_csv(file_name, index=False)
101 |
102 | return train_validation_data_summary
103 |
104 |
105 | def eval_model_summary(list_dict,out_path=False):
106 | '''
107 | :param list_dict: a list of dict
108 | :param out_path: specify the Dataframe to csv file path ,default False
109 | :return: DataFrame about model summary info
110 | '''
111 | model_summary = pd.DataFrame([list_dict[0]])
112 | if len(list_dict)>1:
113 | for i in range(len(list_dict)-1):
114 | b = pd.DataFrame([list_dict[i+1]])
115 | model_summary = pd.merge(model_summary, b, how='outer')
116 |
117 | if out_path:
118 | file_name = out_path if isinstance(out_path, str) else None
119 | model_summary.to_csv(file_name, index=False)
120 |
121 | return model_summary
122 |
123 |
124 | def wald_test(model,X):
125 | '''
126 | :param model: a model file that should have predict_proba() function
127 | :param X: dataset features DataFrame
128 | :return: the value of wald_stats,p_value
129 | '''
130 | pred_probs = np.matrix(model.predict_proba(X))
131 | X_design = np.hstack((np.ones(shape=(X.shape[0], 1)), X))
132 | diag_array = np.multiply(pred_probs[:, 0], pred_probs[:, 1]).A1
133 | V = scipy.sparse.diags(diag_array)
134 | m1 = X_design.T * V
135 | m2 = m1.dot(X_design)
136 | cov_mat = np.linalg.inv(m2)
137 |
138 | model_params = np.hstack((model.intercept_[0], model.coef_[0]))
139 | wald_stats = (model_params / np.sqrt(np.diag(cov_mat))) ** 2
140 |
141 | wald = scipy.stats.wald()
142 | p_value = wald.pdf(wald_stats)
143 |
144 | return wald_stats,p_value
145 |
146 |
147 | def eval_feature_summary(train_X,model,civ_list,candidate_var_list,out_path=False):
148 | '''
149 | :param train_X: training dataset features DataFrame
150 | :param model: model file
151 | :param civ_list: list of InfoValue Class instances
152 | :param candidate_var_list: the list of model input variable
153 | :param out_path: specify the Dataframe to csv file path ,default False
154 | :return: DataFrame about feature summary
155 | '''
156 | feature_summary = {}
157 | feature_summary['feature_name'] = list(['Intercept'])
158 | feature_summary['feature_name'].extend(list(candidate_var_list))
159 | feature_summary['coef'] = [model['classifier'].intercept_]
160 | feature_summary['coef'].extend(model['classifier'].coef_[0])
161 | var_name = [civ.var_name for civ in civ_list]
162 | feature_summary['iv'] = [0]
163 | feature_summary['iv'].extend([civ_list[var_name.index(var)].iv for var in candidate_var_list])
164 | feature_summary['wald_stats'], feature_summary['p_value'] = wald_test(model['classifier'], train_X)
165 |
166 | feature_summary = pd.DataFrame(feature_summary)
167 | if out_path:
168 | file_name = out_path if isinstance(out_path, str) else None
169 | feature_summary.to_csv(file_name, index=False)
170 |
171 | return feature_summary
172 |
173 |
174 | def eval_segment_metrics(target, predict_proba, segment_cnt = 20,out_path=False):
175 | '''
176 | :param target: the list of actual target value
177 | :param predict_proba: the list of predicted probability
178 | :param segment_cnt: the segment number
179 | :param out_path: specify the Dataframe to csv file path ,default False
180 | :return: DataFrame about segment metrics
181 | '''
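# Samples are ranked by descending predicted probability and cut into segment_cnt groups
# of (roughly) equal size; cumulative positive and negative capture rates per segment yield the KS.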
182 | proba_descend_idx = np.argsort(predict_proba)
183 | proba_descend_idx = proba_descend_idx[::-1]
184 |
185 | grp_idx = 1
186 | start_idx = 0
187 | total_sample_cnt = len(predict_proba)
188 | total_positive_sample_cnt = target.sum()
189 | total_negative_sample_cnt = total_sample_cnt - total_positive_sample_cnt
190 |
191 | segment_sample_cnt = int(len(predict_proba) / segment_cnt)
192 | cumulative_sample_percentage = 0.0
193 | cumulative_positive_percentage = 0.0
194 | cumulative_negative_percentage = 0.0
195 |
196 | segment_list = []
197 | columns = ['grp_idx', 'segment_sample_cnt', 'segment_sample_percentage', 'cumulative_sample_percentage',
198 | 'in_segment_positive_percentage', 'positive_percentage_in_total', 'cumulative_positive_percentage',
199 | 'cumulative_negative_percentage', 'ks']
200 |
201 | while start_idx < total_sample_cnt:
202 | s = {}
203 | s['grp_idx'] = grp_idx
204 | segment_idx_list = proba_descend_idx[start_idx : start_idx + segment_sample_cnt]
205 | segment_target = target[segment_idx_list]
206 |
207 | segment_sample_cnt = len(segment_idx_list)
208 | s['segment_sample_cnt'] = segment_sample_cnt
209 |
210 | segment_pos_cnt = segment_target.sum()
211 | segment_neg_cnt = segment_sample_cnt - segment_pos_cnt
212 |
213 | segment_sample_percentage = segment_sample_cnt*1.0/total_sample_cnt
214 | s['segment_sample_percentage'] = segment_sample_percentage
215 |
216 | pos_percentage_in_total = float(segment_pos_cnt * 100) / total_positive_sample_cnt
217 | neg_percentage_in_total = float(segment_neg_cnt * 100) / total_negative_sample_cnt
218 | s['positive_percentage_in_total'] = pos_percentage_in_total
219 |
220 | in_segment_positive_percentage = float(segment_pos_cnt) / segment_sample_cnt
221 | s['in_segment_positive_percentage'] = in_segment_positive_percentage
222 |
223 | cumulative_sample_percentage += segment_sample_percentage
224 | s['cumulative_sample_percentage'] = cumulative_sample_percentage
225 |
226 | cumulative_positive_percentage += pos_percentage_in_total
227 | cumulative_negative_percentage += neg_percentage_in_total
228 | s['cumulative_positive_percentage'] = cumulative_positive_percentage
229 | s['cumulative_negative_percentage'] = cumulative_negative_percentage
230 |
231 | ks = cumulative_positive_percentage - cumulative_negative_percentage
232 | s['ks'] = ks
233 |
234 | segment_list.append(s)
235 | grp_idx += 1
236 | start_idx += segment_sample_cnt
237 |
238 | segment_list = pd.DataFrame(segment_list,columns=columns)
239 | if out_path:
240 | file_name = out_path if isinstance(out_path, str) else None
241 | segment_list.to_csv(file_name, index=False)
242 |
243 | return segment_list
244 |
245 |
246 | def eval_model_stability(proba_train, proba_validation, segment_cnt = 10,out_path=False):
247 | '''
248 | :param proba_train: the list of predicted probability on training dataset
249 | :param proba_validation: the list of predicted probability on validation dataset
250 | :param segment_cnt: the segment number
251 | :param out_path: specify the Dataframe to csv file path ,default False
252 | :return: DataFrame about model stability
253 | '''
254 | step = 1.0/segment_cnt
255 | flag = 0.0
256 | model_stability = []
257 | len_train = len(proba_train)
258 | len_validation = len(proba_validation)
259 |
260 | columns = ['score_range','segment_train_percentage','segment_validation_percentage','difference',
261 | 'variance','ln_variance','stability_index']
262 |
263 | while flag < 1.0:
264 | temp = {}
265 |
266 | score_range = '['+str(flag)+','+str(flag + step)+')'
267 | segment_train_cnt = proba_train[(proba_train >= flag) & (proba_train < flag + step)].count()
268 | segment_train_percentage = segment_train_cnt*1.0/len_train
269 | segment_validation_cnt = proba_validation[(proba_validation >= flag) & (proba_validation < flag + step)].count()
270 | segment_validation_percentage = segment_validation_cnt * 1.0 / len_validation
271 | difference = segment_validation_percentage - segment_train_percentage
272 | variance = float(segment_validation_percentage)/segment_train_percentage
273 | ln_variance = np.log(variance)
274 | stability_index = difference * ln_variance
275 |
276 | temp['score_range'] = score_range
277 | temp['segment_train_percentage'] = segment_train_percentage
278 | temp['segment_validation_percentage'] = segment_validation_percentage
279 | temp['difference'] = difference
280 | temp['variance'] = variance
281 | temp['ln_variance'] = ln_variance
282 | temp['stability_index'] = stability_index
283 |
284 | model_stability.append(temp)
285 | flag += step
286 |
287 | model_stability = pd.DataFrame(model_stability,columns=columns)
288 | if out_path:
289 | file_name = out_path if isinstance(out_path, str) else None
290 | model_stability.to_csv(file_name, index=False)
291 |
292 | return model_stability
293 |
294 | def eval_feature_stability(civ_list, df_train, df_validation,candidate_var_list,out_path=False):
295 | '''
296 | :param civ_list: List of InfoValue Class instances
297 | :param df_train: DataFrame of training dataset
298 | :param df_validation: DataFrame of validation dataset
299 | :param candidate_var_list: the list of model input variable
300 | :param out_path: specify the Dataframe to csv file path ,default False
301 | :return: DataFrame about features stability
302 | '''
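# Per-group stability term (PSI-style): (p_validation - p_train) * ln(p_validation / p_train);
# the tiny 1e-9 constants used below only guard against division by zero and log(0).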
303 | psi_dict = {}
304 |
305 | civ_var_list = [civ_list[i].var_name for i in range(len(civ_list))]
306 | intersection = list(set(civ_var_list).intersection(set(candidate_var_list)))
307 | civ_idx_list = [civ_var_list.index(var) for var in intersection]
308 |
309 | len_train = len(df_train)
310 | len_validation = len(df_validation)
311 |
312 | psi_dict['feature_name'] = []
313 | psi_dict['group'] = []
314 | psi_dict['segment_train_cnt'] = []
315 | psi_dict['segment_train_percentage'] = []
316 | psi_dict['segment_validation_cnt'] = []
317 | psi_dict['segment_validation_percentage'] = []
318 |
319 | for i in civ_idx_list:
320 | if civ_list[i].is_discrete:
321 | for j in range(len(civ_list[i].split_list)):
322 | psi_dict['feature_name'].append(civ_list[i].var_name)
323 | psi_dict['group'].append(civ_list[i].split_list[j])
324 |
325 | civ_split_list = civ_list[i].split_list[j]
326 | segment_train_cnt = 0
327 | for m in civ_split_list:
328 | segment_train_cnt += df_train[civ_list[i].var_name][df_train[civ_list[i].var_name] == m].count()
329 |
330 | psi_dict['segment_train_cnt'].append(segment_train_cnt)
331 | psi_dict['segment_train_percentage'].append(float(segment_train_cnt)/len_train)
332 |
333 | segment_validation_cnt = 0
334 | for m in civ_split_list:
335 | segment_validation_cnt += df_validation[civ_list[i].var_name][df_validation[civ_list[i].var_name] == m].count()
336 |
337 | psi_dict['segment_validation_cnt'].append(segment_validation_cnt)
338 | psi_dict['segment_validation_percentage'].append(float(segment_validation_cnt)/len_validation)
339 |
340 | else:
341 | split_list = []
342 | split_list.append(float("-inf"))
343 | split_list.extend([temp for temp in civ_list[i].split_list])
344 | split_list.append(float("inf"))
345 | var_name = civ_list[i].var_name
346 |
347 | for j in range(len(split_list)-3):
348 | psi_dict['feature_name'].append(civ_list[i].var_name)
349 | psi_dict['group'].append('('+str(split_list[j])+','+str(split_list[j+1])+']')
350 |
351 | segment_train_cnt = df_train[var_name][(df_train[var_name] > split_list[j])&(df_train[var_name] <= split_list[j+1])].count()
352 |
353 | psi_dict['segment_train_cnt'].append(segment_train_cnt)
354 | psi_dict['segment_train_percentage'].append(float(segment_train_cnt)/len_train)
355 |
356 | segment_validation_cnt = df_validation[var_name][(df_validation[var_name] > split_list[j])&
357 | (df_validation[var_name] <= split_list[j+1])].count()
358 |
359 | psi_dict['segment_validation_cnt'].append(segment_validation_cnt)
360 | psi_dict['segment_validation_percentage'].append(float(segment_validation_cnt)/len_validation)
361 |
362 | psi_dict['feature_name'].append(var_name)
363 | psi_dict['group'].append('(' + str(split_list[len(split_list)-2]) + ',+INF)')
364 |
365 | segment_train_cnt = df_train[var_name][df_train[var_name] > split_list[len(split_list)-1]].count()
366 | psi_dict['segment_train_cnt'].append(segment_train_cnt)
367 | psi_dict['segment_train_percentage'].append(float(segment_train_cnt) / len_train)
368 |
369 | segment_validation_cnt = df_validation[var_name][df_validation[var_name] > split_list[len(split_list)-1]].count()
370 | psi_dict['segment_validation_cnt'].append(segment_validation_cnt)
371 | psi_dict['segment_validation_percentage'].append(float(segment_validation_cnt) / len_validation)
372 |
373 | psi_dict['difference'] = pd.Series(psi_dict['segment_validation_percentage']) - pd.Series(psi_dict['segment_train_percentage'])
374 | psi_dict['variance'] = list(map(lambda x_y: x_y[0] / (x_y[1]+0.000000001), zip(psi_dict['segment_validation_percentage'], psi_dict['segment_train_percentage'])))
375 | psi_dict['Ln(variance)'] = np.log(np.array(psi_dict['variance'])+0.000000001)
376 | psi_dict['stability_index'] = np.array(psi_dict['difference']) * np.array(psi_dict['Ln(variance)'])
377 |
378 | columns = ['feature_name','group','segment_train_cnt','segment_train_percentage',
379 | 'segment_validation_cnt','segment_validation_percentage','difference',
380 | 'variance','Ln(variance)','stability_index']
381 |
382 | psi_df = pd.DataFrame(psi_dict, columns=columns)
383 | if out_path:
384 | file_name = out_path if isinstance(out_path, str) else None
385 | psi_df.to_csv(file_name, index=False)
386 |
387 | return psi_df
388 |
389 |
390 | def plot_ks(proba,target,axistype='pct',out_path=False):
391 | """
392 | plot k-s figure
393 | :param proba: 1-d array,prediction probability values
394 | :param target: 1-d array,the list of actual target value
395 | :param axistype: specify x axis :'axistype' must be either 'pct' (sample percent) or 'proba' (prediction probability)
396 | :param out_path: specify the file path to store ks plot figure,default False
397 | :return: DataFrame, figure summary
398 | """
399 | assert axistype in ['pct','proba'] , "KS Plot TypeError: Attribute 'axistype' must be either 'pct' or 'proba' !"
400 |
401 | a = pd.DataFrame(np.array([proba,target]).T,columns=['proba','target'])
402 | a.sort_values(by='proba',ascending=False,inplace=True)
403 | a['sum_Times']=a['target'].cumsum()
404 | total_1 = a['target'].sum()
405 | total_0 = len(a) - a['target'].sum()
406 |
407 | a['temp'] = 1
408 | a['Times']=a['temp'].cumsum()
409 | a['cdf1'] = a['sum_Times']/total_1
410 | a['cdf0'] = (a['Times'] - a['sum_Times'])/total_0
411 | a['ks'] = a['cdf1'] - a['cdf0']
412 | a['percent'] = a['Times']*1.0/len(a)
413 |
414 | idx = np.argmax(a['ks'])
415 | # print(a.loc[idx])
416 |
417 | if axistype == 'pct':
418 | '''
419 | KS曲线,横轴为按照输出的概率值排序后的观察样本比例
420 | '''
421 | plt.figure()
422 | plt.plot(a['percent'],a['cdf1'], label="CDF_positive")
423 | plt.plot(a['percent'],a['cdf0'],label="CDF_negative")
424 | plt.plot(a['percent'],a['ks'],label="K-S")
425 |
426 | sx = np.linspace(0,1,10)
427 | sy = sx
428 | plt.plot(sx,sy,linestyle='--',color='darkgrey',linewidth=1.2)
429 |
430 | plt.legend()
431 | plt.grid(True)
432 | ymin, ymax = plt.ylim()
433 | plt.xlabel('Sample percent')
434 | plt.ylabel('Cumulative probability')
435 | plt.title('Model Evaluation Index K-S')
436 | plt.axis('tight')
437 |
438 | # 虚线
439 | t = a.loc[idx]['percent']
440 | yb = round(a.loc[idx]['cdf1'],4)
441 | yg = round(a.loc[idx]['cdf0'],4)
442 |
443 | plt.plot([t,t],[yb,yg], color ='red', linewidth=1.4, linestyle="--")
444 | plt.scatter([t,],[yb,], 20, color ='dodgerblue')
445 | plt.annotate(r'$recall_p=%s$' % round(a.loc[idx]['cdf1'],4), xy=(t, yb), xycoords='data', xytext=(+10, -5),
446 | textcoords='offset points', fontsize=8,
447 | arrowprops=dict(arrowstyle='->', connectionstyle="arc3,rad=.1"))
448 |
449 | plt.scatter([t,],[yg,], 20, color ='darkorange')
450 | plt.annotate(r'$recall_n=%s$' % round(a.loc[idx]['cdf0'],4), xy=(t, yg), xycoords='data', xytext=(+10, -10),
451 | textcoords='offset points', fontsize=8,
452 | arrowprops=dict(arrowstyle='->', connectionstyle="arc3,rad=.1"))
453 | # K-S曲线峰值
454 | plt.scatter([t,],[a.loc[idx]['ks'],], 20, color ='limegreen')
455 | plt.annotate(r'$ks=%s,p=%s$' % (round(a.loc[idx]['ks'],4)
456 | ,round(a.loc[idx]['proba'],4))
457 | , xy=(a.loc[idx]['percent'], a.loc[idx]['ks'])
458 | , xycoords='data'
459 | , xytext=(+15, -15),
460 | textcoords='offset points'
461 | , fontsize=8
462 | ,arrowprops=dict(arrowstyle='->', connectionstyle="arc3,rad=.1"))
463 | plt.annotate(r'$percent=%s,cnt=%s$' % (round(a.loc[idx]['percent'],4)
464 | ,round(a.loc[idx]['Times'],0))
465 | , xy=(a.loc[idx]['percent'], a.loc[idx]['ks'])
466 | , xycoords='data'
467 | , xytext=(+25, -25),
468 | textcoords='offset points'
469 | , fontsize=8
470 | )
471 |
472 | else:
473 | '''
474 | 改变横轴,横轴为模型输出的概率值
475 | '''
476 | plt.figure()
477 | plt.grid(True)
478 | plt.plot(1-a['proba'],a['cdf1'], label="CDF_bad")
479 | plt.plot(1-a['proba'],a['cdf0'],label="CDF_good")
480 | plt.plot(1-a['proba'],a['ks'],label="ks")
481 |
482 | plt.legend()
483 | ymin, ymax = plt.ylim()
484 | plt.xlabel('1-[Predicted probability]')
485 | plt.ylabel('Cumulative probability')
486 | plt.title('Model Evaluation Index K-S')
487 | plt.axis('tight')
488 | plt.show()
489 | # 虚线
490 | t = 1 - a.loc[idx]['proba']
491 | yb = round(a.loc[idx]['cdf1'],4)
492 | yg = round(a.loc[idx]['cdf0'],4)
493 |
494 | plt.plot([t,t],[yb,yg], color ='red', linewidth=1.4, linestyle="--")
495 | plt.scatter([t,],[yb,], 20, color ='dodgerblue')
496 | plt.annotate(r'$recall_p=%s$' % round(a.loc[idx]['cdf1'],4), xy=(t, yb), xycoords='data', xytext=(+10, -5),
497 | textcoords='offset points', fontsize=8,
498 | arrowprops=dict(arrowstyle='->', connectionstyle="arc3,rad=.1"))
499 |
500 | plt.scatter([t,],[yg,], 20, color ='darkorange')
501 | plt.annotate(r'$recall_n=%s$' % round(a.loc[idx]['cdf0'],4), xy=(t, yg), xycoords='data', xytext=(+10, -10),
502 | textcoords='offset points', fontsize=8,
503 | arrowprops=dict(arrowstyle='->', connectionstyle="arc3,rad=.1"))
504 | # K-S曲线峰值
505 | plt.scatter([t,],[a.loc[idx]['ks'],], 20, color ='limegreen')
506 | plt.annotate(r'$ks=%s,p=%s$' % (round(a.loc[idx]['ks'],4)
507 | ,round(a.loc[idx]['proba'],4))
508 | , xy=(t, a.loc[idx]['ks'])
509 | , xycoords='data'
510 | , xytext=(+15, -15),
511 | textcoords='offset points'
512 | , fontsize=8
513 | ,arrowprops=dict(arrowstyle='->', connectionstyle="arc3,rad=.1"))
514 | plt.annotate(r'$percent=%s,cnt=%s$' % (round(a.loc[idx]['percent'],4)
515 | ,round(a.loc[idx]['Times'],0))
516 | , xy=(t, a.loc[idx]['ks'])
517 | , xycoords='data'
518 | , xytext=(+25, -25),
519 | textcoords='offset points'
520 | , fontsize=8
521 | )
522 |
523 | if out_path:
524 | file_name = out_path if isinstance(out_path, str) else None
525 | plt.savefig(file_name)
526 | else:
527 | plt.show()
528 |
529 | return a.loc[idx]
530 |
531 |
532 | def proc_validattion(dataset_path,config_path,model_path):
533 | print('####PROC VALIDATION#####')
534 | print('dataset_path:\n',dataset_path)
535 | print('config_path:\n',config_path)
536 | print('model_path:\n',model_path)
537 | #fillna
538 | # config_path = r'E:\Code\Python_ML_Code\cs_model\config\config_cs_model.csv'
539 | cfg = config.config()
540 | cfg.load_file(config_path, dataset_path)
541 |
542 | for var in [tmp for tmp in cfg.bin_var_list if tmp in list(cfg.dataset_train.columns)]:
543 | # fill null
544 | cfg.dataset_train.loc[cfg.dataset_train[var].isnull(), (var)] = 0
545 |
546 | for var in [tmp for tmp in cfg.discrete_var_list if tmp in list(cfg.dataset_train.columns)]:
547 | # fill null
548 | cfg.dataset_train.loc[cfg.dataset_train[var].isnull(), (var)] = 0
549 |
550 | output = open(model_path, 'rb')
551 | clf_model = pickle.load(output)
552 | output.close()
553 |
554 | clf = clf_model['clf']
555 | X_test = cfg.dataset_train[clf_model['features_list']]
556 | y_test = cfg.dataset_train['target']
557 |
558 | y_hat = clf.predict_proba(X_test)[:,1]
559 | ks = compute_ks(y_hat,y_test)
560 | print('global_bt:',cfg.global_bt)
561 | print('global_gt:', cfg.global_gt)
562 | print('ks:',ks)
563 | return ks
564 |
565 |
566 | def proc_cor_eval(dataset_path,config_path,var_list_specfied,out_file_path):
567 | dataset = pd.read_csv(dataset_path)
568 | cfg = pd.read_csv(config_path)
569 | candidate_var_list = cfg[cfg['is_modelfeature'] == 1]['var_name']
570 |
571 | b = [var for var in dataset.columns if sum(dataset[var].isnull()) == 0]
572 | candidate_var_list = list(set(candidate_var_list).intersection(set(b)))
573 |
574 | if var_list_specfied.__len__()>0:
575 | candidate_var_list = list(set(candidate_var_list).intersection(set(var_list_specfied)))
576 |
577 | print('candidate_var_list length:\n',candidate_var_list.__len__())
578 | print('candidate_var_list:\n',candidate_var_list)
579 |
580 | cor = np.corrcoef(dataset[candidate_var_list].values,rowvar=0)
581 | pd.DataFrame(cor,columns=candidate_var_list).to_csv(out_file_path,index=False)
--------------------------------------------------------------------------------
/build/lib/woe/feature_process.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | __author__ = 'boredbird'
3 | import numpy as np
4 | import woe.config as config
5 | import woe.eval as eval
6 | import copy
7 | import pickle
8 | import time
9 |
10 | class node:
11 | '''Tree Node Class
12 | '''
13 | def __init__(self,var_name=None,iv=0,split_point=None,right=None,left=None):
14 | self.var_name = var_name # The column index value of the attributes that are used to split data sets
15 | self.iv = iv # The info value of the node
16 | self.split_point = split_point # Store split points list
17 | self.right = right # Right sub tree
18 | self.left = left # Left sub tree
19 |
20 |
21 | class InfoValue(object):
22 | '''
23 | InfoValue Class
24 | '''
25 | def __init__(self):
26 | self.var_name = []
27 | self.split_list = []
28 | self.iv = 0
29 | self.woe_list = []
30 | self.iv_list = []
31 | self.is_discrete = 0
32 | self.sub_total_sample_num = []
33 | self.positive_sample_num = []
34 | self.negative_sample_num = []
35 | self.sub_total_num_percentage = []
36 | self.positive_rate_in_sub_total = []
37 | self.negative_rate_in_sub_total = []
38 |
39 | def init(self,civ):
40 | self.var_name = civ.var_name
41 | self.split_list = civ.split_list
42 | self.iv = civ.iv
43 | self.woe_list = civ.woe_list
44 | self.iv_list = civ.iv_list
45 | self.is_discrete = civ.is_discrete
46 | self.sub_total_sample_num = civ.sub_total_sample_num
47 | self.positive_sample_num = civ.positive_sample_num
48 | self.negative_sample_num = civ.negative_sample_num
49 | self.sub_total_num_percentage = civ.sub_total_num_percentage
50 | self.positive_rate_in_sub_total = civ.positive_rate_in_sub_total
51 | self.negative_rate_in_sub_total = civ.negative_rate_in_sub_total
52 |
53 |
54 | class DisInfoValue(object):
55 | '''
56 | A Class for the storage of discrete variables transformation information
57 | '''
58 | def __init__(self):
59 | self.var_name = None
60 | self.origin_value = []
61 | self.woe_before = []
62 |
63 |
64 | def change_feature_dtype(df,variable_type):
65 | '''
66 | change feature data type by the variable_type DataFrame
67 | :param df: dataset DataFrame
68 | :param variable_type: the DataFrame about variables dtypes
69 | :return: None
70 | '''
71 | s = 'Changing Feature Dtypes'
72 | print(s.center(60,'-'))
73 | for vname in df.columns:
74 | try:
75 | df[vname] = df[vname].astype(variable_type.loc[vname,'v_type'])
76 | print(vname,' '*(40-len(vname)),'{0: >10}'.format(variable_type.loc[vname,'v_type']))
77 | except Exception:
78 | print('[error]',vname)
79 | print('[original dtype] ',df.dtypes[vname],' [astype] ',variable_type.loc[vname,'v_type'])
80 | print('[unique value]',np.unique(df[vname]))
81 |
82 | s = 'Variable Dtypes Have Been Specified'
83 | print(s.center(60,'-'))
84 |
85 | return
86 |
87 | def check_point(df,var,split,min_sample):
88 | """
89 | Check whether the split points produce bins whose sample size is too small;
90 | if a bin holds less than 5% of the total sample size,
91 | it is merged with the adjacent bin until it exceeds 5%;
92 | applies only to continuous variables
93 | :param df: Dataset DataFrame
94 | :param var: Variable name
95 | :param split: Split points list
96 | :param min_sample: Minimum bin sample size
97 | :return: The checked split points list
98 | """
99 | new_split = []
100 | if split is not None and split.__len__()>0:
101 | # print('run into if line:98')
102 | new_split.append(split[0])
103 | # print(new_split)
104 | # Try the left section of the first split point partition;
105 | # If not meet the conditions then the split point will be removed
106 | pdf = df[df[var] <= split[0]]
107 | if (pdf.shape[0] < min_sample) or (len(np.unique(pdf['target']))<=1):
108 | # print('run into if line:105')
109 | new_split.pop()
110 | # print(new_split)
111 | for i in range(0,split.__len__()-1):
112 | pdf = df[(df[var] > split[i]) & (df[var] <= split[i+1])]
113 | if (pdf.shape[0] < min_sample) or (np.unique(pdf['target']).__len__()<=1):
114 | # print('run into if line:112')
115 | continue
116 | else:
117 | # print('run into if line:115')
118 | new_split.append(split[i+1])
119 | # print(new_split)
120 |
121 | #If the remaining sample is too small then remove the last one
122 | # print(new_split)
123 | # print(new_split.__len__())
124 | if new_split.__len__()>1 and len(df[df[var] >= new_split[new_split.__len__()-1]]) < min_sample:
125 | new_split.pop()
126 | # print(new_split)
127 |
128 | # If the remaining samples contain only one target class then remove the last split point
129 | if new_split.__len__()>1 and np.unique(df[df[var] >= new_split[new_split.__len__()-1]]['target']).__len__()<=1:
130 | # print(split)
131 | # print(split[split.__len__()-1])
132 | # print(df[df[var] >= new_split[new_split.__len__()-1]].shape)
133 | # print(np.unique(df[df[new_split] > new_split[new_split.__len__()-1]]['target']))
134 | # print('run into if line:125')
135 | new_split.pop()
136 | # print(new_split)
137 | #If the split list has only one value, and no smaller than this value
138 | if new_split == []:
139 | new_split = split
140 | else:
141 | pass
142 | return new_split
143 |
144 | def calulate_iv(df,var,global_bt,global_gt):
145 | '''
146 | calculate the iv and woe value without split
147 | :param df:
148 | :param var:
149 | :param global_bt:
150 | :param global_gt:
151 | :return:
152 | '''
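# Whole-group WoE/IV with 0.0001 smoothing:
# bri = (#bad + 0.0001) / global_bt, gri = (#good + 0.0001) / global_gt,
# woei = ln(bri / gri), ivi = (bri - gri) * ln(bri / gri)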
153 | # a = df.groupby(['target']).count()
154 | groupdetail = {}
155 | bt_sub = sum(df['target'])
156 | bri = (bt_sub + 0.0001)* 1.0 / global_bt
157 | gt_sub = df.shape[0] - bt_sub
158 | gri = (gt_sub + 0.0001)* 1.0 / global_gt
159 |
160 | groupdetail['woei'] = np.log(bri / gri)
161 | groupdetail['ivi'] = (bri - gri) * np.log(bri / gri)
162 | groupdetail['sub_total_num_percentage'] = df.shape[0]*1.0/(global_bt+global_gt)
163 | groupdetail['positive_sample_num'] = bt_sub
164 | groupdetail['negative_sample_num'] = gt_sub
165 | groupdetail['positive_rate_in_sub_total'] = bt_sub*1.0/df.shape[0]
166 | groupdetail['negative_rate_in_sub_total'] = gt_sub*1.0/df.shape[0]
167 |
168 | return groupdetail
169 |
170 |
171 | def calculate_iv_split(df,var,split_point,global_bt,global_gt):
172 | """
173 |     calculate the iv value with the specified split point
174 |     note:
175 |     the dataset must contain a 'target' column; this dependency could be parameterized in a future refactor
176 |     :return: woel, woer, iv, dataset_l, dataset_r, ivl, ivr
177 | """
178 | #split dataset
179 | dataset_r = df[df.loc[:,var] > split_point][[var,'target']]
180 | dataset_l = df[df.loc[:,var] <= split_point][[var,'target']]
181 |
182 | r1_cnt = sum(dataset_r['target'])
183 | r0_cnt = dataset_r.shape[0] - r1_cnt
184 |
185 | l1_cnt = sum(dataset_l['target'])
186 | l0_cnt = dataset_l.shape[0] - l1_cnt
187 |
188 | if r0_cnt == 0 or r1_cnt == 0 or l0_cnt == 0 or l1_cnt ==0:
189 | return 0,0,0,dataset_l,dataset_r,0,0
190 |
191 | lbr = (l1_cnt+ 0.0001)*1.0/global_bt
192 | lgr = (l0_cnt+ 0.0001)*1.0/global_gt
193 | woel = np.log(lbr/lgr)
194 | ivl = (lbr-lgr)*woel
195 | rbr = (r1_cnt+ 0.0001)*1.0/global_bt
196 | rgr = (r0_cnt+ 0.0001)*1.0/global_gt
197 | woer = np.log(rbr/rgr)
198 | ivr = (rbr-rgr)*woer
199 | iv = ivl+ivr
200 |
201 | return woel,woer,iv,dataset_l,dataset_r,ivl,ivr
202 |
203 |
204 | def binning_data_split(df,var,global_bt,global_gt,min_sample,alpha=0.01):
205 | """
206 |     Recursively split the data on var using the IV-gain criterion and return the resulting split tree
207 |     :return: a node instance (the root of the split tree)
208 | """
209 | iv_var = InfoValue()
210 |     # Calculate the IV of the current node before splitting
211 | gd = calulate_iv(df, var,global_bt,global_gt)
212 |
213 | woei, ivi = gd['woei'],gd['ivi']
214 |
215 | if np.unique(df[var]).__len__() <=8:
216 | # print('running into if')
217 | split = list(np.unique(df[var]))
218 | split.sort()
219 | # print('split:',split)
220 | #Segmentation point checking and processing
221 | split = check_point(df, var, split, min_sample)
222 | split.sort()
223 | # print('after check:',split)
224 | iv_var.split_list = split
225 | return node(split_point=split,iv=ivi)
226 |
227 | percent_value = list(np.unique(np.percentile(df[var], range(100))))
228 | percent_value.sort()
229 |
230 | if percent_value.__len__() <=2:
231 |         iv_var.split_list = list(np.unique(percent_value))  # note: .sort() returns None; percent_value is already unique and sorted
232 | return node(split_point=percent_value,iv=ivi)
233 |
234 |     # Sentinel for the best split found while scanning the candidate points;
235 |     # init bestSplit_iv with zero
236 | bestSplit_iv = 0
237 | bestSplit_woel = []
238 | bestSplit_woer = []
239 | bestSplit_ivl = 0
240 | bestSplit_ivr = 0
241 | bestSplit_point = []
242 |
243 |     # drop the largest candidate point so that dataset_r cannot be empty
244 | for point in percent_value[0:percent_value.__len__()-1]:
245 |         # If either side holds only one target class, or has fewer than min_sample rows, skip this candidate point
246 | if set(df[df[var] > point]['target']).__len__() == 1 or set(df[df[var] <= point]['target']).__len__() == 1 \
247 | or df[df[var] > point].shape[0] < min_sample or df[df[var] <= point].shape[0] < min_sample :
248 | continue
249 |
250 | woel, woer, iv, dataset_l, dataset_r, ivl, ivr = calculate_iv_split(df,var,point,global_bt,global_gt)
251 |
252 | if iv > bestSplit_iv:
253 | bestSplit_woel = woel
254 | bestSplit_woer = woer
255 | bestSplit_iv = iv
256 | bestSplit_point = point
257 | bestSplit_dataset_r = dataset_r
258 | bestSplit_dataset_l = dataset_l
259 | bestSplit_ivl = ivl
260 | bestSplit_ivr = ivr
261 |
262 |     # If the IV after the split exceeds (1 + alpha) times the IV before the split, the split is accepted and both children are split recursively
263 |     # alpha is the minimum relative IV gain required to accept a split, default 0.01
264 | if bestSplit_iv > ivi*(1+alpha) and bestSplit_dataset_r.shape[0] > min_sample and bestSplit_dataset_l.shape[0] > min_sample:
265 | presplit_right = node()
266 | presplit_left = node()
267 |
268 | # Determine whether the right node satisfies the segmentation prerequisite
269 | if bestSplit_dataset_r.shape[0] < min_sample or set(bestSplit_dataset_r['target']).__len__() == 1:
270 | presplit_right.iv = bestSplit_ivr
271 | right = presplit_right
272 | else:
273 |             right = binning_data_split(bestSplit_dataset_r,var,global_bt,global_gt,min_sample,alpha=alpha)  # propagate the caller's alpha instead of resetting it to 0.01
274 |
275 | # Determine whether the left node satisfies the segmentation prerequisite
276 | if bestSplit_dataset_l.shape[0] < min_sample or np.unique(bestSplit_dataset_l['target']).__len__() == 1:
277 | presplit_left.iv = bestSplit_ivl
278 | left = presplit_left
279 | else:
280 |             left = binning_data_split(bestSplit_dataset_l,var,global_bt,global_gt,min_sample,alpha=alpha)  # propagate the caller's alpha instead of resetting it to 0.01
281 |
282 | return node(var_name=var,split_point=bestSplit_point,iv=ivi,left=left,right=right)
283 | else:
284 | # Returns the current node as the final leaf node
285 | return node(var_name=var,iv=ivi)
286 |
287 |
288 | def search(tree,split_list):
289 | '''
290 |     recursively search the tree nodes and collect their split points
291 |     :param tree: an instance of the node class; split_list accumulates the collected split points
292 |     :return: split points list
293 | '''
294 | if isinstance(tree.split_point, list):
295 | split_list.extend(tree.split_point)
296 | else:
297 | split_list.append(tree.split_point)
298 |
299 | if tree.left is not None:
300 | search(tree.left,split_list)
301 |
302 | if tree.right is not None:
303 | search(tree.right,split_list)
304 |
305 | return split_list
306 |
307 |
308 | def format_iv_split(df,var,split_list,global_bt,global_gt):
309 | '''
310 |     Given the dataset DataFrame and the split points list, return an InfoValue instance;
311 |     only for continuous variables
312 |     :param df: dataset DataFrame containing var and the 'target' column
313 |     :param var: variable name
314 |     :param split_list: list of split points
315 |     :param global_bt: global count of positive samples
316 |     :param global_gt: global count of negative samples
317 |     :return: an InfoValue instance
318 | '''
319 | civ = InfoValue()
320 | civ.var_name = var
321 | civ.split_list = split_list
322 | dfcp = df[:]
323 |
324 | civ.sub_total_sample_num = []
325 | civ.positive_sample_num = []
326 | civ.negative_sample_num = []
327 | civ.sub_total_num_percentage = []
328 | civ.positive_rate_in_sub_total = []
329 |
330 | for i in range(0, split_list.__len__()):
331 | dfi = dfcp[dfcp[var] <= split_list[i]]
332 | dfcp = dfcp[dfcp[var] > split_list[i]]
333 | gd = calulate_iv(dfi, var,global_bt,global_gt)
334 | woei, ivi = gd['woei'],gd['ivi']
335 | civ.woe_list.append(woei)
336 | civ.iv_list.append(ivi)
337 | civ.sub_total_sample_num.append(dfi.shape[0])
338 | civ.positive_sample_num.append(gd['positive_sample_num'])
339 | civ.negative_sample_num.append(gd['negative_sample_num'])
340 | civ.sub_total_num_percentage.append(gd['sub_total_num_percentage'])
341 | civ.positive_rate_in_sub_total.append(gd['positive_rate_in_sub_total'])
342 | civ.negative_rate_in_sub_total.append(gd['negative_rate_in_sub_total'])
343 |
344 | if dfcp.shape[0]>0:
345 | gd = calulate_iv(dfcp, var,global_bt,global_gt)
346 | woei, ivi = gd['woei'],gd['ivi']
347 | civ.woe_list.append(woei)
348 | civ.iv_list.append(ivi)
349 | civ.sub_total_sample_num.append(dfcp.shape[0])
350 | civ.positive_sample_num.append(gd['positive_sample_num'])
351 | civ.negative_sample_num.append(gd['negative_sample_num'])
352 | civ.sub_total_num_percentage.append(gd['sub_total_num_percentage'])
353 | civ.positive_rate_in_sub_total.append(gd['positive_rate_in_sub_total'])
354 | civ.negative_rate_in_sub_total.append(gd['negative_rate_in_sub_total'])
355 |
356 | civ.iv = sum(civ.iv_list)
357 | return civ
358 |
359 |
360 | def woe_trans(dvar,civ):
361 | # replace the var value with the given woe value
362 | var = copy.deepcopy(dvar)
363 | if not civ.is_discrete:
364 | if civ.woe_list.__len__()>1:
365 | split_list = []
366 | split_list.append(float("-inf"))
367 | split_list.extend([i for i in civ.split_list])
368 | split_list.append(float("inf"))
369 |
370 | for i in range(civ.woe_list.__len__()):
371 | var[(dvar > split_list[i]) & (dvar <= split_list[i+1])] = civ.woe_list[i]
372 | else:
373 | var[:] = civ.woe_list[0]
374 | else:
375 | split_map = {}
376 | for i in range(civ.split_list.__len__()):
377 | for j in range(civ.split_list[i].__len__()):
378 | split_map[civ.split_list[i][j]] = civ.woe_list[i]
379 |
380 | var = var.map(split_map)
381 |
382 | return var
383 |
384 | def proc_woe_discrete(df,var,global_bt,global_gt,min_sample,alpha=0.01):
385 | '''
386 |     process woe transformation of discrete variables
387 |     :param df: dataset DataFrame containing var and the 'target' column
388 |     :param var: variable name
389 |     :param global_bt: global count of positive samples
390 |     :param global_gt: global count of negative samples
391 |     :param min_sample: minimum bin sample size
392 |     :return: an InfoValue instance with is_discrete = 1
393 | '''
394 | s = 'process discrete variable:'+str(var)
395 | print(s.center(60, '-'))
396 |
397 | df = df[[var,'target']]
398 | div = DisInfoValue()
399 | div.var_name = var
400 | rdict = {}
401 | cpvar = df[var]
402 | # print('np.unique(df[var]):',np.unique(df[var]))
403 | for var_value in np.unique(df[var]):
404 |         # The comparison below uses '=='; fill NaN values beforehand to avoid type errors
405 | df_temp = df[df[var] == var_value]
406 | gd = calulate_iv(df_temp,var,global_bt,global_gt)
407 | woei, ivi = gd['woei'],gd['ivi']
408 | div.origin_value.append(var_value)
409 | div.woe_before.append(woei)
410 | rdict[var_value] = woei
411 | # print(var_value,woei,ivi)
412 |
413 | cpvar = cpvar.map(rdict)
414 | df[var] = cpvar
415 |
416 | iv_tree = binning_data_split(df,var,global_bt,global_gt,min_sample,alpha)
417 |
418 |     # Traverse the tree and collect the split points
419 | split_list = []
420 | search(iv_tree, split_list)
421 | split_list = list(np.unique([1.0 * x for x in split_list if x is not None]))
422 | split_list.sort()
423 |
424 | # Segmentation point checking and processing
425 | split_list = check_point(df, var, split_list, min_sample)
426 | split_list.sort()
427 |
428 | civ = format_iv_split(df, var, split_list,global_bt,global_gt)
429 | civ.is_discrete = 1
430 |
431 | split_list_temp = []
432 | split_list_temp.append(float("-inf"))
433 | split_list_temp.extend([i for i in split_list])
434 | split_list_temp.append(float("inf"))
435 |
436 | a = []
437 | for i in range(split_list_temp.__len__() - 1):
438 | temp = []
439 | for j in range(div.origin_value.__len__()):
440 | if (div.woe_before[j]>split_list_temp[i]) & (div.woe_before[j]<=split_list_temp[i+1]):
441 | temp.append(div.origin_value[j])
442 |
443 | if temp != [] :
444 | a.append(temp)
445 |
446 | civ.split_list = a
447 |
448 | return civ
449 |
450 |
451 | def proc_woe_continuous(df,var,global_bt,global_gt,min_sample,alpha=0.01):
452 | '''
453 |     process woe transformation of continuous variables
454 |     :param df: dataset DataFrame containing var and the 'target' column
455 |     :param var: variable name
456 |     :param global_bt: global count of positive samples
457 |     :param global_gt: global count of negative samples
458 |     :param min_sample: minimum bin sample size
459 |     :return: an InfoValue instance
460 | '''
461 | s = 'process continuous variable:'+str(var)
462 | print(s.center(60, '-'))
463 | df = df[[var,'target']]
464 | iv_tree = binning_data_split(df, var,global_bt,global_gt,min_sample,alpha)
465 |
466 |     # Traverse the tree and collect the split points
467 | split_list = []
468 | search(iv_tree, split_list)
469 | split_list = list(np.unique([1.0 * x for x in split_list if x is not None]))
470 | split_list.sort()
471 |
472 | # Segmentation point checking and processing
473 | split_list = check_point(df, var, split_list, min_sample)
474 | split_list.sort()
475 |
476 | civ = format_iv_split(df, var,split_list,global_bt,global_gt)
477 |
478 | return civ
479 |
480 | def fillna(dataset,bin_var_list,discrete_var_list,continuous_filler=-1,discrete_filler='missing'):
481 | """
482 |     fill the null values in the dataframe inplace
483 |     :param dataset: input dataset, pandas.DataFrame type
484 |     :param bin_var_list: continuous variables name list
485 |     :param discrete_var_list: discrete variables name list
486 |     :param continuous_filler: the value used to fill nulls in continuous variables
487 |     :param discrete_filler: the value used to fill nulls in discrete variables
488 |     :return: None; null values are replaced inplace
489 | """
490 | for var in [tmp for tmp in bin_var_list if tmp in list(dataset.columns)]:
491 | # fill null
492 | dataset.loc[dataset[var].isnull(), (var)] = continuous_filler
493 |
494 | for var in [tmp for tmp in discrete_var_list if tmp in list(dataset.columns)]:
495 | # fill null
496 | dataset.loc[dataset[var].isnull(), (var)] = discrete_filler
497 |
498 |
499 | def process_train_woe(infile_path=None,outfile_path=None,rst_path=None,config_path=None):
500 | print('run into process_train_woe: \n',time.asctime(time.localtime(time.time())))
501 | data_path = infile_path
502 | cfg = config.config()
503 | cfg.load_file(config_path,data_path)
504 | bin_var_list = [tmp for tmp in cfg.bin_var_list if tmp in list(cfg.dataset_train.columns)]
505 |
506 | for var in bin_var_list:
507 | # fill null
508 | cfg.dataset_train.loc[cfg.dataset_train[var].isnull(), (var)] = -1
509 |
510 | # change feature dtypes
511 | change_feature_dtype(cfg.dataset_train, cfg.variable_type)
512 | rst = []
513 |
514 | # process woe transformation of continuous variables
515 | print('process woe transformation of continuous variables: \n',time.asctime(time.localtime(time.time())))
516 | print('cfg.global_bt',cfg.global_bt)
517 | print('cfg.global_gt', cfg.global_gt)
518 |
519 | for var in bin_var_list:
520 | rst.append(proc_woe_continuous(cfg.dataset_train,var,cfg.global_bt,cfg.global_gt,cfg.min_sample,alpha=0.05))
521 |
522 | # process woe transformation of discrete variables
523 | print('process woe transformation of discrete variables: \n',time.asctime(time.localtime(time.time())))
524 | for var in [tmp for tmp in cfg.discrete_var_list if tmp in list(cfg.dataset_train.columns)]:
525 | # fill null
526 | cfg.dataset_train.loc[cfg.dataset_train[var].isnull(), (var)] = 'missing'
527 | rst.append(proc_woe_discrete(cfg.dataset_train,var,cfg.global_bt,cfg.global_gt,cfg.min_sample,alpha=0.05))
528 |
529 | feature_detail = eval.eval_feature_detail(rst, outfile_path)
530 |
531 | print('save woe transformation rule into pickle: \n',time.asctime(time.localtime(time.time())))
532 | output = open(rst_path, 'wb')
533 | pickle.dump(rst,output)
534 | output.close()
535 |
536 | return feature_detail,rst
537 |
538 |
539 | def process_woe_trans(in_data_path=None,rst_path=None,out_path=None,config_path=None):
540 | cfg = config.config()
541 | cfg.load_file(config_path, in_data_path)
542 |
543 | for var in [tmp for tmp in cfg.bin_var_list if tmp in list(cfg.dataset_train.columns)]:
544 | # fill null
545 | cfg.dataset_train.loc[cfg.dataset_train[var].isnull(), (var)] = -1
546 |
547 | for var in [tmp for tmp in cfg.discrete_var_list if tmp in list(cfg.dataset_train.columns)]:
548 | # fill null
549 | cfg.dataset_train.loc[cfg.dataset_train[var].isnull(), (var)] = 'missing'
550 |
551 | change_feature_dtype(cfg.dataset_train, cfg.variable_type)
552 |
553 | output = open(rst_path, 'rb')
554 | rst = pickle.load(output)
555 | output.close()
556 |
557 | # Training dataset Woe Transformation
558 | for r in rst:
559 | cfg.dataset_train[r.var_name] = woe_trans(cfg.dataset_train[r.var_name], r)
560 |
561 | cfg.dataset_train.to_csv(out_path)
562 |
--------------------------------------------------------------------------------
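Note: the per-bin quantities produced by calulate_iv() above boil down to a few lines of arithmetic. Below is a minimal sketch on hypothetical counts (40 positives and 160 negatives in a bin, 1000 positives and 9000 negatives globally); the 0.0001 smoothing term mirrors the implementation above.

import numpy as np

# hypothetical bin and global counts
bt_sub, gt_sub = 40, 160            # positive / negative samples inside the bin
global_bt, global_gt = 1000, 9000   # positive / negative samples in the whole dataset

bri = (bt_sub + 0.0001) / global_bt   # share of all positives that fall in this bin
gri = (gt_sub + 0.0001) / global_gt   # share of all negatives that fall in this bin

woei = np.log(bri / gri)              # weight of evidence of the bin (~0.81 here)
ivi = (bri - gri) * woei              # the bin's information value contribution (~0.018 here)
print(woei, ivi)

The total IV of a variable, as computed in format_iv_split(), is simply the sum of these per-bin ivi values.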
/build/lib/woe/ftrl.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | __author__ = 'boredbird'
3 | import numpy as np
4 |
5 | class LR(object):
6 | @staticmethod
7 | def fn(w, x):
8 | '''sigmoid function
9 | '''
10 | return 1.0 / (1.0 + np.exp(-w.dot(x)))
11 |
12 | @staticmethod
13 | def loss(y, y_hat):
14 | '''Cross entropy loss function
15 | '''
16 | return np.sum(np.nan_to_num(-y * np.log(y_hat) - (1 - y) * np.log(1 - y_hat)))
17 |
18 | @staticmethod
19 | def grad(y, y_hat, x):
20 |         '''The first derivative of the cross entropy loss with respect to the weights w
21 | '''
22 | return (y_hat - y) * x
23 |
24 |
25 | class FTRL(object):
26 | def __init__(self, dim, l1, l2, alpha, beta, decisionFunc=LR):
27 | self.dim = dim
28 | self.decisionFunc = decisionFunc
29 | self.z = np.zeros(dim)
30 | self.n = np.zeros(dim)
31 | self.w = np.zeros(dim)
32 | self.w_list = []
33 | self.loss_list = []
34 | self.l1 = l1
35 | self.l2 = l2
36 | self.alpha = alpha
37 | self.beta = beta
38 |
39 | def predict(self, x):
40 | return self.decisionFunc.fn(self.w, x)
41 |
42 | def update(self, x, y):
43 | self.w = np.array([0 if np.abs(self.z[i]) <= self.l1 else (np.sign(
44 |             self.z[i]) * self.l1 - self.z[i]) / (self.l2 + (self.beta + np.sqrt(self.n[i])) / self.alpha) for i in range(self.dim)])  # range() for Python 3 compatibility
45 | y_hat = self.predict(x)
46 | g = self.decisionFunc.grad(y, y_hat, x)
47 | sigma = (np.sqrt(self.n + g * g) - np.sqrt(self.n)) / self.alpha
48 | self.z += g - sigma * self.w
49 | self.n += g * g
50 | return self.decisionFunc.loss(y, y_hat)
51 |
52 | def train(self, trainSet, verbos=False, max_itr=10000000000, eta=0.01, epochs=100):
53 | itr = 0
54 | n = 0
55 | while True:
56 | for x, y in trainSet:
57 | loss = self.update(x, y)
58 | if verbos and n%verbos==0:
59 | print("itr=" + str(n) + "\tloss=" + str(loss))
60 | self.w_list.append(self.w)
61 | self.loss_list.append(loss)
62 | if loss < eta:
63 | itr += 1
64 | else:
65 | itr = 0
66 |                 if itr >= epochs:  # stop when the loss has stayed below eta for `epochs` consecutive iterations
67 |                     print("loss has been less than", eta, " continuously for ", itr, "iterations")
68 | return
69 | n += 1
70 | if n >= max_itr:
71 | print("reach max iteration", max_itr)
72 | return
--------------------------------------------------------------------------------
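Note: a minimal sketch of exercising the FTRL and LR classes above on a toy dataset. The feature matrix, labels, and hyperparameter values are illustrative assumptions only; the last feature plays the role of a bias term.

import numpy as np
from woe.ftrl import FTRL, LR

# toy training data: 4 samples, 3 features (the last column is a constant bias term)
X = np.array([[0.1, 1.2, 1.0],
              [0.8, 0.3, 1.0],
              [0.9, 1.1, 1.0],
              [0.2, 0.4, 1.0]])
y = np.array([0, 1, 1, 0])

model = FTRL(dim=3, l1=0.1, l2=1.0, alpha=0.1, beta=1.0, decisionFunc=LR)
# train() cycles over the (x, y) pairs until the loss stays below eta for
# `epochs` consecutive updates, or until max_itr updates have been made
model.train(list(zip(X, y)), verbos=False, max_itr=5000, eta=0.01, epochs=10)

print(model.w)               # learned weight vector
print(model.predict(X[0]))   # predicted probability for the first sample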
/dist/woe-0.1.4-py2-none-any.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/boredbird/woe/335e9ec2a521d3bbccb0ad5d915128119e4d0ca6/dist/woe-0.1.4-py2-none-any.tar.gz
--------------------------------------------------------------------------------
/dist/woe-0.1.4-py2-none-any.whl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/boredbird/woe/335e9ec2a521d3bbccb0ad5d915128119e4d0ca6/dist/woe-0.1.4-py2-none-any.whl
--------------------------------------------------------------------------------
/dist/woe-0.1.4-py2.7.egg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/boredbird/woe/335e9ec2a521d3bbccb0ad5d915128119e4d0ca6/dist/woe-0.1.4-py2.7.egg
--------------------------------------------------------------------------------
/dist/woe-0.1.4-py3-none-any.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/boredbird/woe/335e9ec2a521d3bbccb0ad5d915128119e4d0ca6/dist/woe-0.1.4-py3-none-any.tar.gz
--------------------------------------------------------------------------------
/dist/woe-0.1.4-py3-none-any.whl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/boredbird/woe/335e9ec2a521d3bbccb0ad5d915128119e4d0ca6/dist/woe-0.1.4-py3-none-any.whl
--------------------------------------------------------------------------------
/dist/woe-0.1.4.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/boredbird/woe/335e9ec2a521d3bbccb0ad5d915128119e4d0ca6/dist/woe-0.1.4.tar.gz
--------------------------------------------------------------------------------
/examples/HereWeGo.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | __author__ = 'boredbird'
3 | import os
4 | import numpy as np
5 | import woe.feature_process as fp
6 | import woe.GridSearch as gs
7 |
8 | if __name__ == '__main__':
9 | config_path = os.getcwd()+'\\config.csv'
10 | data_path = os.getcwd()+'\\UCI_Credit_Card.csv'
11 | feature_detail_path = os.getcwd()+'\\features_detail.csv'
12 | rst_pkl_path = os.getcwd()+'\\woe_rule.pkl'
13 | # train woe rule
14 | feature_detail,rst = fp.process_train_woe(infile_path=data_path
15 | ,outfile_path=feature_detail_path
16 | ,rst_path=rst_pkl_path
17 | ,config_path=config_path)
18 | # proc woe transformation
19 | woe_train_path = os.getcwd()+'\\dataset_train_woed.csv'
20 | fp.process_woe_trans(data_path,rst_pkl_path,woe_train_path,config_path)
21 |     # here the same dataset is reused as the test dataset
22 | woe_test_path = os.getcwd()+'\\dataset_test_woed.csv'
23 | fp.process_woe_trans(data_path,rst_pkl_path,woe_test_path,config_path)
24 |
25 | print('###TRAIN SCORECARD MODEL###')
26 | params = {}
27 | params['dataset_path'] = woe_train_path
28 | params['validation_path'] = woe_test_path
29 | params['config_path'] = config_path
30 |
31 | params['df_coef_path'] = os.getcwd()+'\\df_model_coef_path.csv'
32 | params['pic_coefpath'] = os.getcwd()+'\\model_coefpath.png'
33 | params['pic_performance'] = os.getcwd()+'\\model_performance_path.png'
34 | params['pic_coefpath_title'] = 'model_coefpath'
35 | params['pic_performance_title'] = 'model_performance_path'
36 |
37 | params['var_list_specfied'] = []
38 | params['cs'] = np.logspace(-4, -1,40)
39 | for key,value in params.items():
40 | print(key,': ',value)
41 | gs.grid_search_lr_c_main(params)
42 |
--------------------------------------------------------------------------------
/examples/README.rst:
--------------------------------------------------------------------------------
1 | Dataset Information
2 | ===================
3 |
4 | This dataset contains information on default payments, demographic factors, credit data, history of payment, and bill statements of credit card clients in Taiwan from April 2005 to September 2005.
5 |
6 | **YOU SHOULD SPECIFY THE VARIABLE DTYPES WITH config.csv**
7 |
8 | Convention:
9 |
10 | continuous variables: is_tobe_bin=1 and is_candidate=1
11 |
12 | discrete variables: is_tobe_bin=0 and is_candidate=1
13 |
14 | Content
15 | =======
16 |
17 | There are 25 variables:
18 |
19 | * ID: ID of each client
20 | * LIMIT_BAL: Amount of given credit in NT dollars (includes individual and family/supplementary credit)
21 | * SEX: Gender (1=male, 2=female)
22 | * EDUCATION: (1=graduate school, 2=university, 3=high school, 4=others, 5=unknown, 6=unknown)
23 | * MARRIAGE: Marital status (1=married, 2=single, 3=others)
24 | * AGE: Age in years
25 | * PAY_0: Repayment status in September, 2005 (-1=pay duly, 1=payment delay for one month, 2=payment delay for two months, ... 8=payment delay for eight months, 9=payment delay for nine months and above)
26 | * PAY_2: Repayment status in August, 2005 (scale same as above)
27 | * PAY_3: Repayment status in July, 2005 (scale same as above)
28 | * PAY_4: Repayment status in June, 2005 (scale same as above)
29 | * PAY_5: Repayment status in May, 2005 (scale same as above)
30 | * PAY_6: Repayment status in April, 2005 (scale same as above)
31 | * BILL_AMT1: Amount of bill statement in September, 2005 (NT dollar)
32 | * BILL_AMT2: Amount of bill statement in August, 2005 (NT dollar)
33 | * BILL_AMT3: Amount of bill statement in July, 2005 (NT dollar)
34 | * BILL_AMT4: Amount of bill statement in June, 2005 (NT dollar)
35 | * BILL_AMT5: Amount of bill statement in May, 2005 (NT dollar)
36 | * BILL_AMT6: Amount of bill statement in April, 2005 (NT dollar)
37 | * PAY_AMT1: Amount of previous payment in September, 2005 (NT dollar)
38 | * PAY_AMT2: Amount of previous payment in August, 2005 (NT dollar)
39 | * PAY_AMT3: Amount of previous payment in July, 2005 (NT dollar)
40 | * PAY_AMT4: Amount of previous payment in June, 2005 (NT dollar)
41 | * PAY_AMT5: Amount of previous payment in May, 2005 (NT dollar)
42 | * PAY_AMT6: Amount of previous payment in April, 2005 (NT dollar)
43 | * default.payment.next.month: Default payment (1=yes, 0=no)
44 |
--------------------------------------------------------------------------------
/examples/config.csv:
--------------------------------------------------------------------------------
1 | var_name,var_dtype,is_tobe_bin,is_candidate,is_modelfeature
2 | ID,object,0,0,0
3 | LIMIT_BAL,int64,1,1,1
4 | SEX,object,0,1,1
5 | EDUCATION,object,0,1,1
6 | MARRIAGE,object,0,1,1
7 | AGE,int64,1,1,1
8 | PAY_0,int64,1,1,1
9 | PAY_2,int64,1,1,1
10 | PAY_3,int64,1,1,1
11 | PAY_4,int64,1,1,1
12 | PAY_5,int64,1,1,1
13 | PAY_6,int64,1,1,1
14 | BILL_AMT1,int64,1,1,1
15 | BILL_AMT2,int64,1,1,1
16 | BILL_AMT3,int64,1,1,1
17 | BILL_AMT4,int64,1,1,1
18 | BILL_AMT5,int64,1,1,1
19 | BILL_AMT6,int64,1,1,1
20 | PAY_AMT1,int64,1,1,1
21 | PAY_AMT2,int64,1,1,1
22 | PAY_AMT3,int64,1,1,1
23 | PAY_AMT4,int64,1,1,1
24 | PAY_AMT5,int64,1,1,1
25 | PAY_AMT6,int64,1,1,1
26 | target,int64,0,0,0
27 |
--------------------------------------------------------------------------------
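Note: a short sketch of how the flags in this config file partition the variables, mirroring the selections made in woe/config.py further below. The file name 'config.csv' is assumed to sit in the working directory.

import pandas as pd

cfg = pd.read_csv('config.csv')

# continuous variables to be binned: is_tobe_bin == 1
bin_var_list = cfg[cfg['is_tobe_bin'] == 1]['var_name'].tolist()

# discrete variables: candidates declared with dtype object
discrete_var_list = cfg[(cfg['is_candidate'] == 1)
                        & (cfg['var_dtype'] == 'object')]['var_name'].tolist()

# variables that enter the final model: is_modelfeature == 1
model_var_list = cfg[cfg['is_modelfeature'] == 1]['var_name'].tolist()

print(bin_var_list)       # LIMIT_BAL, AGE, PAY_0 ... PAY_AMT6
print(discrete_var_list)  # SEX, EDUCATION, MARRIAGE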
/setup.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | __author__ = 'boredbird'
3 |
4 | from setuptools import setup, find_packages
5 |
6 | setup(
7 | name = 'woe',
8 | version = '0.1.4',
9 | description = (
10 | 'Tools for WoE Transformation mostly used in ScoreCard Model for credit rating'
11 | ),
12 | long_description = open('README.rst').read(),
13 | author = 'boredbird',
14 | author_email = '1002937942@qq.com',
15 | maintainer = 'boredbird',
16 | maintainer_email = '1002937942@qq.com',
17 | license = 'MIT',
18 | packages = ['woe'],
19 | platforms = ["all"],
20 | url = 'https://github.com/boredbird/woe',
21 | classifiers = [
22 | 'Operating System :: OS Independent',
23 | 'Intended Audience :: Developers',
24 | 'License :: OSI Approved :: MIT License',
25 | 'Programming Language :: Python',
26 | 'Programming Language :: Python :: Implementation',
27 | 'Programming Language :: Python :: 2',
28 | 'Programming Language :: Python :: 2.7',
29 | 'Programming Language :: Python :: 3',
30 | 'Programming Language :: Python :: 3.5',
31 | 'Topic :: Software Development :: Libraries'
32 | ],
33 | keywords = ["math","finance","scorecard","woe",'iv'],
34 | install_requires = [
35 | 'pandas>=0.19.2',
36 | 'numpy>=1.11.3',
37 | 'scipy>=0.18.1',
38 | 'matplotlib>=2.0.0',
39 | ]
40 | )
41 |
--------------------------------------------------------------------------------
/woe.egg-info/PKG-INFO:
--------------------------------------------------------------------------------
1 | Metadata-Version: 1.1
2 | Name: woe
3 | Version: 0.1.4
4 | Summary: Tools for WoE Transformation mostly used in ScoreCard Model for credit rating
5 | Home-page: https://github.com/boredbird/woe
6 | Author: boredbird
7 | Author-email: 1002937942@qq.com
8 | License: MIT
9 | Description: woe
10 | ===
11 |
12 | .. image:: https://travis-ci.org/justdoit0823/pywxclient.svg?branch=master
13 | :target: https://travis-ci.org/justdoit0823/pywxclient
14 |
15 | version: 0.1.4
16 |
17 | Tools for WoE Transformation mostly used in ScoreCard Model for credit rating
18 |
19 | Installation
20 | --------------------------------
21 |
22 | We can simply use pip to install, as the following:
23 |
24 | .. code-block:: bash
25 |
26 | $ pip install woe
27 |
28 | or installing from git
29 |
30 | .. code-block:: bash
31 |
32 | $ pip install git+https://github.com/boredbird/woe
33 |
34 |
35 | Features
36 | ========
37 |
38 | * Split tree with IV criterion
39 |
40 | * Rich and plentiful model eval methods
41 |
42 | * Unified format and easy for output
43 |
44 | * Storage of IV tree for follow-up use
45 |
46 |
47 |
48 | **woe** module function tree
49 | ============================
50 |
51 | ::
52 |
53 | |- __init__
54 | |- config.py
55 | | |-- config
56 | | |-- __init__
57 | | |-- change_config_var_dtype()
58 | | |-- load_file()
59 | |- eval.py
60 | | |-- compute_ks()
61 | | |-- eval_data_summary()
62 | | |-- eval_feature_detail()
63 | | |-- eval_feature_stability()
64 | | |-- eval_feature_summary()
65 | | |-- eval_model_stability()
66 | | |-- eval_model_summary()
67 | | |-- eval_segment_metrics()
68 | | |-- plot_ks()
69 | | |-- proc_cor_eval()
70 | | |-- proc_validation()
71 | | |-- wald_test()
72 | |- feature_process.py
73 | | |-- binning_data_split()
74 | | |-- calculate_iv_split()
75 | | |-- calulate_iv()
76 | | |-- change_feature_dtype()
77 | | |-- check_point()
78 | | |-- fillna()
79 | | |-- format_iv_split()
80 | | |-- proc_woe_continuous()
81 | | |-- proc_woe_discrete()
82 | | |-- process_train_woe()
83 | | |-- process_woe_trans()
84 | | |-- search()
85 | | |-- woe_trans()
86 | |- ftrl.py
87 | | |-- FTRL()
88 | | |-- LR()
89 | |- GridSearch.py
90 | | |-- fit_single_lr()
91 | | |-- grid_search_lr_c()
92 | | |-- grid_search_lr_c_main()
93 | | |-- grid_search_lr_validation()
94 |
95 |
96 | Examples
97 | ========
98 |
99 | In the examples directory, there is a simple woe transformation program as tutorials.
100 |
101 | Or you can write a more complex program with this `woe` package.
102 |
103 | Version Records
104 | ================
105 | woe 0.1.4 2018-03-01
106 | * support py3
107 |
108 | woe 0.1.3 2018-02-09
109 |
110 |         * woe.feature_process.proc_woe_discrete(): fix bug when dealing with discrete variables
111 | * woe.eval.eval_feature_detail(): fix bug : utf-8 output file format
112 |         * woe.GridSearch.grid_search_lr_c_main(): add function wrapper for convenience and efficiency
113 | * woe.GridSearch.grid_search_lr_c_validation(): monitor the ks performance of training sets and test sets on different 'c'
114 | * supplement examples test scripts
115 |
116 |
117 | woe 0.1.2 2017-12-05
118 |
119 | * woe.ftrl.FTRL(): add online learning module
120 |
121 | woe 0.1.1 2017-11-28
122 |
123 | * woe.config.load_file(): change param data_path to be optional
124 | * woe.eval.eval_feature_stability(): fix bug : psi_dict['stability_index'] computation error
125 |         * woe.feature_process.change_feature_dtype(): add friendly tips when encountering an error
126 | * woe.feature_process.calulate_iv(): refactor the code
127 | * woe.feature_process.calculate_iv_split(): refactor the code
128 |         * woe.feature_process.binning_data_split(): reduce the number of len() function calls with __len__() and shape attributes; replace namedtuple with dict
129 | * woe.feature_process.fillna(): new added function to fill null value
130 |         * woe.GridSearch.grid_search_lr_c(): the list of regularization parameters c, previously hard-coded inside the function, is now user specified
131 |
132 | woe 0.0.9 2017-11-21
133 |
134 | * Add module : GridSearch for the search of optimal hyper parametric C in LogisticRegression
135 | * Code refactoring: function compute_ks and plot_ks
136 |
137 | woe 0.0.8 2017-09-28
138 |
139 | * More flexible: cancel conditional restriction in function feature_process.change_feature_dtype()
140 | * Fix bug: the wrong use of deepcopy in function feature_process.woe_trans()
141 |
142 | woe 0.0.7 2017-09-19
143 |
144 | * Fix bug: eval.eval_feature_detail raises ValueError('arrays must all be same length')
145 |         * Add parameter interface: alpha specifies the step learning rate, default 0.01
146 |
147 | How to Contribute
148 | --------------------------------
149 |
150 | Email me,1002937942@qq.com.
151 |
152 | Keywords: math,finance,scorecard,woe,iv
153 | Platform: all
154 | Classifier: Operating System :: OS Independent
155 | Classifier: Intended Audience :: Developers
156 | Classifier: License :: OSI Approved :: MIT License
157 | Classifier: Programming Language :: Python
158 | Classifier: Programming Language :: Python :: Implementation
159 | Classifier: Programming Language :: Python :: 2
160 | Classifier: Programming Language :: Python :: 2.7
161 | Classifier: Programming Language :: Python :: 3
162 | Classifier: Programming Language :: Python :: 3.5
163 | Classifier: Topic :: Software Development :: Libraries
164 |
--------------------------------------------------------------------------------
/woe.egg-info/SOURCES.txt:
--------------------------------------------------------------------------------
1 | LICENSE.txt
2 | MANIFEST.in
3 | README.rst
4 | setup.py
5 | examples/HereWeGo.py
6 | examples/README.rst
7 | examples/UCI_Credit_Card.csv
8 | examples/config.csv
9 | woe/GridSearch.py
10 | woe/__init__.py
11 | woe/config.py
12 | woe/eval.py
13 | woe/feature_process.py
14 | woe/ftrl.py
15 | woe.egg-info/PKG-INFO
16 | woe.egg-info/SOURCES.txt
17 | woe.egg-info/dependency_links.txt
18 | woe.egg-info/requires.txt
19 | woe.egg-info/top_level.txt
--------------------------------------------------------------------------------
/woe.egg-info/dependency_links.txt:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/woe.egg-info/requires.txt:
--------------------------------------------------------------------------------
1 | pandas>=0.19.2
2 | numpy>=1.11.3
3 | scipy>=0.18.1
4 | matplotlib>=2.0.0
5 |
--------------------------------------------------------------------------------
/woe.egg-info/top_level.txt:
--------------------------------------------------------------------------------
1 | woe
2 |
--------------------------------------------------------------------------------
/woe/GridSearch.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | __author__ = 'boredbird'
3 | import pandas as pd
4 | import numpy as np
5 | import matplotlib.pyplot as plt
6 | from sklearn.linear_model import LogisticRegression
7 | from datetime import datetime
8 | from sklearn.svm import l1_min_c
9 | from woe.eval import compute_ks
10 | import pickle
11 | import time
12 |
13 | """
14 | Search for the optimal hyperparameter C in LogisticRegression
15 | """
16 | def grid_search_lr_c(X_train,y_train,cs,df_coef_path=False
17 | ,pic_coefpath_title='Logistic Regression Path',pic_coefpath=False
18 | ,pic_performance_title='Logistic Regression Performance',pic_performance=False):
19 | """
20 |     grid search for the optimal hyperparameter c with the best ks performance
21 | :param X_train: features dataframe
22 | :param y_train: target
23 | :param cs: list of regularization parameter c
24 | :param df_coef_path: the file path for logistic regression coefficient dataframe
25 | :param pic_coefpath_title: the pic title for coefficient path picture
26 | :param pic_coefpath: the file path for coefficient path picture
27 | :param pic_performance_title: the pic title for ks performance picture
28 | :param pic_performance: the file path for ks performance picture
29 | :return: a tuple of c and ks value with the best ks performance
30 | """
31 | # init a LogisticRegression model
32 | clf_l1_LR = LogisticRegression(C=0.1, penalty='l1', tol=0.01,class_weight='balanced')
33 | # cs = l1_min_c(X_train, y_train, loss='log') * np.logspace(0, 9,200)
34 |
35 | print("Computing regularization path ...")
36 | start = datetime.now()
37 | print(start)
38 | coefs_ = []
39 | ks = []
40 | for c in cs:
41 | clf_l1_LR.set_params(C=c)
42 | clf_l1_LR.fit(X_train, y_train)
43 | coefs_.append(clf_l1_LR.coef_.ravel().copy())
44 |
45 | proba = clf_l1_LR.predict_proba(X_train)[:,1]
46 | ks.append(compute_ks(proba,y_train))
47 |
48 | end = datetime.now()
49 | print(end)
50 | print("This took ", end - start)
51 | coef_cv_df = pd.DataFrame(coefs_,columns=X_train.columns)
52 | coef_cv_df['ks'] = ks
53 | coef_cv_df['c'] = cs
54 |
55 | if df_coef_path:
56 | file_name = df_coef_path if isinstance(df_coef_path, str) else None
57 | coef_cv_df.to_csv(file_name)
58 |
59 | coefs_ = np.array(coefs_)
60 |
61 | fig1 = plt.figure('fig1')
62 | plt.plot(np.log10(cs), coefs_)
63 | ymin, ymax = plt.ylim()
64 | plt.xlabel('log(C)')
65 | plt.ylabel('Coefficients')
66 | plt.title(pic_coefpath_title)
67 | plt.axis('tight')
68 | if pic_coefpath:
69 | file_name = pic_coefpath if isinstance(pic_coefpath, str) else None
70 | plt.savefig(file_name)
71 | else:
72 | plt.show()
73 |
74 | fig2 = plt.figure('fig2')
75 | plt.plot(np.log10(cs), ks)
76 | plt.xlabel('log(C)')
77 | plt.ylabel('ks score')
78 | plt.title(pic_performance_title)
79 | plt.axis('tight')
80 | if pic_performance:
81 | file_name = pic_performance if isinstance(pic_performance, str) else None
82 | plt.savefig(file_name)
83 | else:
84 | plt.show()
85 |
86 | flag = coefs_<0
87 | idx = np.array(ks)[flag.sum(axis=1) == 0].argmax()
88 |
89 | return (cs[idx],ks[idx])
90 |
91 |
92 | def grid_search_lr_c_validation(X_train,y_train,validation_dataset_list,cs=[0.01],df_coef_path=False
93 | ,pic_coefpath_title='Logistic Regression Path',pic_coefpath=False
94 | ,pic_performance_title='Logistic Regression Performance',pic_performance=False):
95 | """
96 |     grid search for the optimal hyperparameter c with the best ks performance
97 | :param X_train: features dataframe
98 | :param y_train: target
99 | :param cs: list of c value
100 | :param df_coef_path: the file path for logistic regression coefficient dataframe
101 | :param pic_coefpath_title: the pic title for coefficient path picture
102 | :param pic_coefpath: the file path for coefficient path picture
103 | :param pic_performance_title: the pic title for ks performance picture
104 | :param pic_performance: the file path for ks performance picture
105 | :return: a tuple of c and ks value with the best ks performance
106 | """
107 | # init a LogisticRegression model
108 | clf_l1_LR = LogisticRegression(C=0.1, penalty='l1', tol=0.01,class_weight='balanced')
109 |
110 | print("Computing regularization path ...")
111 | start = datetime.now()
112 | print(start)
113 | coefs_ = []
114 | ks = []
115 | ks_validation1 = []
116 | ks_validation2 = []
117 | counter = 0
118 | for c in cs:
119 | print('time: ',time.asctime(time.localtime(time.time())),'counter: ',counter, ' c: ',c)
120 | clf_l1_LR.set_params(C=c)
121 | clf_l1_LR.fit(X_train, y_train)
122 | coefs_.append(clf_l1_LR.coef_.ravel().copy())
123 |
124 | proba = clf_l1_LR.predict_proba(X_train)[:,1]
125 | validation_proba1 = clf_l1_LR.predict_proba(validation_dataset_list[0][X_train.columns])[:,1]
126 |
127 | ks.append(compute_ks(proba,y_train))
128 | ks_validation1.append(compute_ks(validation_proba1,validation_dataset_list[0]['target']))
129 |
130 | print('ks:\t',ks[-1],'ks_validation1:\t',ks_validation1[-1])
131 | counter += 1
132 |
133 | end = datetime.now()
134 | print(end)
135 | print("This took ", end - start)
136 | coef_cv_df = pd.DataFrame(coefs_,columns=X_train.columns)
137 | coef_cv_df['ks'] = ks
138 | coef_cv_df['ks_validation1'] = ks_validation1
139 | coef_cv_df['c'] = cs
140 |
141 |
142 | if df_coef_path:
143 | file_name = df_coef_path if isinstance(df_coef_path, str) else None
144 | coef_cv_df.to_csv(file_name)
145 |
146 | coefs_ = np.array(coefs_)
147 |
148 | fig1 = plt.figure('fig1')
149 | plt.plot(np.log10(cs), coefs_)
150 | ymin, ymax = plt.ylim()
151 | plt.xlabel('log(C)')
152 | plt.ylabel('Coefficients')
153 | plt.title(pic_coefpath_title)
154 | plt.axis('tight')
155 | if pic_coefpath:
156 | file_name = pic_coefpath if isinstance(pic_coefpath, str) else None
157 | plt.savefig(file_name)
158 | plt.close()
159 | else:
160 | pass
161 | # plt.show()
162 | # plt.close()
163 |
164 | fig2 = plt.figure('fig2')
165 | plt.plot(np.log10(cs), ks)
166 | plt.xlabel('log(C)')
167 | plt.ylabel('ks score')
168 | plt.title(pic_performance_title)
169 | plt.axis('tight')
170 | if pic_performance:
171 | file_name = pic_performance if isinstance(pic_performance, str) else None
172 | plt.savefig(file_name)
173 | plt.close()
174 | else:
175 | pass
176 | # plt.show()
177 | # plt.close()
178 |
179 | flag = coefs_<0
180 | if np.array(ks)[flag.sum(axis=1) == 0].__len__()>0:
181 | idx = np.array(ks)[flag.sum(axis=1) == 0].argmax()
182 | else:
183 | idx = np.array(ks).argmax()
184 |
185 | return (cs[idx],ks[idx])
186 |
187 |
188 | def grid_search_lr_c_main(params):
189 | print('run into grid_search_lr_c_main:')
190 | dataset_path = params['dataset_path']
191 | validation_path = params['validation_path']
192 | config_path = params['config_path']
193 | df_coef_path = params['df_coef_path']
194 | pic_coefpath = params['pic_coefpath']
195 | pic_performance = params['pic_performance']
196 | pic_coefpath_title = params['pic_coefpath_title']
197 | pic_performance_title = params['pic_performance_title']
198 |
199 | dataset_train = pd.read_csv(dataset_path)
200 | cfg = pd.read_csv(config_path)
201 | candidate_var_list = cfg[cfg['is_modelfeature'] == 1]['var_name']
202 |
203 | b = [var for var in dataset_train.columns if sum(dataset_train[var].isnull()) == 0]
204 | candidate_var_list = list(set(candidate_var_list).intersection(set(b)))
205 |
206 | var_list_specfied = params['var_list_specfied']
207 | if var_list_specfied.__len__()>0:
208 | candidate_var_list = list(set(candidate_var_list).intersection(set(var_list_specfied)))
209 |
210 | print('candidate_var_list length:\n',candidate_var_list.__len__())
211 | print('candidate_var_list:\n',candidate_var_list)
212 |
213 | print('change dtypes:float64 to float32')
214 | for var in candidate_var_list:
215 | dataset_train[var] = dataset_train[var].astype(np.float32)
216 |
217 | X_train = dataset_train[dataset_train.target >=0][candidate_var_list]
218 | y_train = dataset_train[dataset_train.target >=0]['target']
219 |
220 | validation_cols_keep = [var for var in candidate_var_list]
221 | validation_cols_keep.append('target')
222 | validation_dataset_list = []
223 |
224 | validation_dataset = pd.read_csv(validation_path)
225 | # fillna
226 | for var in candidate_var_list:
227 | validation_dataset.loc[validation_dataset[var].isnull(), (var)] = 0
228 | validation_dataset_list.append(validation_dataset[validation_cols_keep])
229 |
230 | cs = params['cs']
231 | print('cs',cs)
232 | c,ks = grid_search_lr_c_validation(X_train,y_train,validation_dataset_list,cs,df_coef_path,pic_coefpath_title,pic_coefpath
233 | ,pic_performance_title,pic_performance)
234 | print('pic_coefpath:\n',pic_coefpath)
235 | print('pic_performance:\n',pic_performance)
236 | print('ks performance on the c:')
237 | print(c,ks)
238 |
239 | return (c,ks)
240 |
241 |
242 | def fit_single_lr(dataset_path,config_path,var_list_specfied,out_model_path,c=0.01):
243 | dataset_train = pd.read_csv(dataset_path)
244 | cfg = pd.read_csv(config_path)
245 | candidate_var_list = cfg[cfg['is_modelfeature'] == 1]['var_name']
246 |
247 | b = [var for var in dataset_train.columns if sum(dataset_train[var].isnull()) == 0]
248 | candidate_var_list = list(set(candidate_var_list).intersection(set(b)))
249 |
250 | if var_list_specfied.__len__()>0:
251 | candidate_var_list = list(set(candidate_var_list).intersection(set(var_list_specfied)))
252 |
253 | print('candidate_var_list length:\n',candidate_var_list.__len__())
254 | print('candidate_var_list:\n',candidate_var_list)
255 |
256 | print('change dtypes:float64 to float32')
257 | for var in candidate_var_list:
258 | dataset_train[var] = dataset_train[var].astype(np.float32)
259 |
260 | X_train = dataset_train[dataset_train.target >=0][candidate_var_list]
261 | y_train = dataset_train[dataset_train.target >=0]['target']
262 |
263 | print('c:',c)
264 | clf_lr_a = LogisticRegression(C=c, penalty='l1', tol=0.01,class_weight='balanced')
265 |
266 | clf_lr_a.fit(X_train, y_train)
267 | coefs = clf_lr_a.coef_.ravel().copy()
268 |
269 | proba = clf_lr_a.predict_proba(X_train)[:,1]
270 | ks = compute_ks(proba,y_train)
271 |
272 | model = {}
273 | model['clf'] = clf_lr_a
274 | model['features_list'] = candidate_var_list
275 | model['coefs'] = coefs
276 | model['ks'] = ks
277 |
278 | output = open(out_model_path, 'wb')
279 | pickle.dump(model,output)
280 | output.close()
281 |
282 | return model
283 |
--------------------------------------------------------------------------------
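Note: a minimal sketch of calling fit_single_lr() defined above. All file paths and the value of c are placeholder assumptions; the dataset CSV is expected to be a WoE-transformed training set with a 'target' column, such as the output of woe.feature_process.process_woe_trans().

import os
import woe.GridSearch as gs

# hypothetical paths
dataset_path = os.path.join(os.getcwd(), 'dataset_train_woed.csv')
config_path = os.path.join(os.getcwd(), 'config.csv')
out_model_path = os.path.join(os.getcwd(), 'lr_model.pkl')

# fit a single L1-regularized logistic regression at a fixed regularization strength c
model = gs.fit_single_lr(dataset_path, config_path,
                         var_list_specfied=[],
                         out_model_path=out_model_path,
                         c=0.01)

print(model['features_list'])   # variables used by the model
print(model['ks'])              # KS score on the training set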
/woe/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/boredbird/woe/335e9ec2a521d3bbccb0ad5d915128119e4d0ca6/woe/__init__.py
--------------------------------------------------------------------------------
/woe/config.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | __author__ = 'boredbird'
3 | import pandas as pd
4 |
5 | class config:
6 |
7 | def __init__(self):
8 | self.config = None
9 | self.dataset_train = None
10 | self.variable_type = None
11 | self.bin_var_list = None
12 | self.discrete_var_list = None
13 | self.candidate_var_list = None
14 | self.dataset_len = None
15 | self.min_sample = None
16 | self.global_bt = None
17 | self.global_gt = None
18 |
19 | def load_file(self,config_path,data_path=False):
20 | self.config = pd.read_csv(config_path)
21 | # specify variable dtypes
22 | self.variable_type = self.config[['var_name', 'var_dtype']]
23 | self.variable_type = self.variable_type.rename(columns={'var_name': 'v_name', 'var_dtype': 'v_type'})
24 | self.variable_type = self.variable_type.set_index(['v_name'])
25 |
26 |         # specify the list of continuous variables to be split into bins
27 | self.bin_var_list = self.config[self.config['is_tobe_bin'] == 1]['var_name']
28 |         # specify the list of discrete variables to be merged into super classes
29 | self.discrete_var_list = self.config[(self.config['is_candidate'] == 1) & (self.config['var_dtype'] == 'object')]['var_name']
30 |
31 |         # specify the list of model input variables
32 | self.candidate_var_list = self.config[self.config['is_candidate'] == 1]['var_name']
33 |
34 | if data_path:
35 | data_path = data_path if isinstance(data_path, str) else None
36 |
37 | # load dataset train
38 | self.dataset_train = pd.read_csv(data_path)
39 | self.dataset_train.columns = [col.split('.')[-1] for col in self.dataset_train.columns]
40 |
41 | # specify some other global variables about the training dataset
42 | self.dataset_len = len(self.dataset_train)
43 | self.min_sample = int(self.dataset_len * 0.05)
44 | self.global_bt = sum(self.dataset_train['target'])
45 | self.global_gt = len(self.dataset_train) - sum(self.dataset_train['target'])
46 |
47 | def change_config_var_dtype(self,var_name,type,inplace_file=True):
48 | if type in ['object','string','int64','uint8','float64','bool1','bool2','dates','category']:
49 | self.variable_type.loc[var_name,'v_type'] = type
50 | else:
51 | raise KeyError("Invalid dtype specified! ")
--------------------------------------------------------------------------------
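Note: a short sketch of driving the config class above. File names are placeholder assumptions; the data CSV is expected to contain a 'target' column, as the examples/UCI_Credit_Card.csv shipped with the package does.

import woe.config as config

cfg = config.config()
# the config CSV declares each variable's dtype and role; the second argument is the training data
cfg.load_file('config.csv', 'UCI_Credit_Card.csv')

print(list(cfg.bin_var_list))        # continuous variables to be binned
print(list(cfg.discrete_var_list))   # discrete candidate variables
print(cfg.min_sample)                # 5% of the training set size
print(cfg.global_bt, cfg.global_gt)  # global positive / negative sample counts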
/woe/eval.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | __author__ = 'boredbird'
3 | import pandas as pd
4 | import numpy as np
5 | import scipy
6 | import matplotlib.pyplot as plt
7 | from scipy.stats import ks_2samp
8 | import woe.config as config
9 | import pickle
10 |
11 | def compute_ks(proba,target):
12 | '''
13 |     target: 1-D numpy array of binary labels
14 |     proba: 1-D numpy array of predicted probabilities of the sample being positive
15 |     returns:
16 |     ks: float, KS score estimation
17 | '''
18 | get_ks = lambda proba, target: ks_2samp(proba[target == 1], proba[target != 1]).statistic
19 |
20 | return get_ks(proba, target)
21 |
22 |
23 | def eval_feature_detail(Info_Value_list,out_path=False):
24 | """
25 | format InfoValue list to Dataframe
26 | :param Info_Value_list: Instance list of Class InfoValue
27 |     :param out_path: specify the DataFrame to csv file path, default False
28 | :return:DataFrame about feature detail
29 | """
30 | rst = Info_Value_list
31 | format_rst = []
32 |
33 | for kk in range(0,len(rst)):
34 | print(rst[kk].var_name)
35 | split_list = []
36 | if rst[kk].split_list != []:
37 | if not rst[kk].is_discrete:
38 | #deal with split_list
39 | split_list.append('(-INF,'+str(rst[kk].split_list[0])+']')
40 | for i in range(0,len(rst[kk].split_list)-1):
41 | split_list.append('(' + str(rst[kk].split_list[i])+','+ str(rst[kk].split_list[i+1]) + ']')
42 |
43 | split_list.append('(' + str(rst[kk].split_list[len(rst[kk].split_list)-1]) + ',+INF)')
44 | else:
45 | split_list = rst[kk].split_list
46 | else:
47 | split_list.append('(-INF,+INF)')
48 |
49 | # merge into dataframe
50 | columns = ['var_name','split_list','sub_total_sample_num','positive_sample_num'
51 | ,'negative_sample_num','sub_total_num_percentage','positive_rate_in_sub_total'
52 | ,'woe_list','iv_list','iv']
53 | rowcnt = len(rst[kk].iv_list)
54 | if rowcnt < len(split_list):
55 | split_list = split_list[:rowcnt]
56 |
57 | var_name = [rst[kk].var_name] * rowcnt
58 | iv = [rst[kk].iv] * rowcnt
59 | iv_list = rst[kk].iv_list
60 | woe_list = rst[kk].woe_list
61 | a = pd.DataFrame({'var_name':var_name,'iv_list':iv_list,'woe_list':woe_list
62 | ,'split_list':split_list,'iv':iv,'sub_total_sample_num':rst[kk].sub_total_sample_num
63 | ,'positive_sample_num':rst[kk].positive_sample_num,'negative_sample_num':rst[kk].negative_sample_num
64 | ,'sub_total_num_percentage':rst[kk].sub_total_num_percentage
65 | ,'positive_rate_in_sub_total':rst[kk].positive_rate_in_sub_total
66 | ,'negative_rate_in_sub_total':rst[kk].negative_rate_in_sub_total},columns=columns)
67 | format_rst.append(a)
68 |
69 | # merge dataframe list into one dataframe vertically
70 | cformat_rst = pd.concat(format_rst)
71 |
72 | if out_path:
73 | file_name = out_path if isinstance(out_path, str) else None
74 | cformat_rst.to_csv(file_name, index=False,encoding='utf-8')
75 |
76 | return cformat_rst
77 |
78 |
79 | def eval_data_summary(df_list,source_name,out_path=False):
80 | '''
81 |     :param df_list: a list of dataset DataFrames
82 |     :param source_name: a list of source name strings
83 |     :param out_path: specify the DataFrame to csv file path, default False
84 | :return: DataFrame about dataset summary info
85 | '''
86 | train_validation_data_summary = []
87 | for i in range(len(source_name)):
88 | a = dict()
89 | a['source'] = source_name[i]
90 | a['total_sample_cnt'] = len(df_list[i])
91 | a['positive_sample_cnt'] = df_list[i]['target'].sum()
92 | a['negative_sample_cnt'] = a['total_sample_cnt'] - a['positive_sample_cnt']
93 | a['positive_rate'] = a['positive_sample_cnt']*1.0/a['total_sample_cnt']
94 | train_validation_data_summary.append(a)
95 |
96 | train_validation_data_summary = pd.DataFrame(train_validation_data_summary)
97 |
98 | if out_path:
99 | file_name = out_path if isinstance(out_path, str) else None
100 | train_validation_data_summary.to_csv(file_name, index=False)
101 |
102 | return train_validation_data_summary
103 |
104 |
105 | def eval_model_summary(list_dict,out_path=False):
106 | '''
107 | :param list_dict: a list of dict
108 |     :param out_path: specify the DataFrame to csv file path, default False
109 | :return: DataFrame about model summary info
110 | '''
111 | model_summary = pd.DataFrame([list_dict[0]])
112 | if len(list_dict)>1:
113 | for i in range(len(list_dict)-1):
114 | b = pd.DataFrame([list_dict[i+1]])
115 | model_summary = pd.merge(model_summary, b, how='outer')
116 |
117 | if out_path:
118 | file_name = out_path if isinstance(out_path, str) else None
119 | model_summary.to_csv(file_name, index=False)
120 |
121 | return model_summary
122 |
123 |
124 | def wald_test(model,X):
125 | '''
126 | :param model: a model file that should have predict_proba() function
127 | :param X: dataset features DataFrame
128 | :return: the value of wald_stats,p_value
129 | '''
130 | pred_probs = np.matrix(model.predict_proba(X))
131 | X_design = np.hstack((np.ones(shape=(X.shape[0], 1)), X))
132 | diag_array = np.multiply(pred_probs[:, 0], pred_probs[:, 1]).A1
133 | V = scipy.sparse.diags(diag_array)
134 | m1 = X_design.T * V
135 | m2 = m1.dot(X_design)
136 | cov_mat = np.linalg.inv(m2)
137 |
138 | model_params = np.hstack((model.intercept_[0], model.coef_[0]))
139 | wald_stats = (model_params / np.sqrt(np.diag(cov_mat))) ** 2
140 |
141 | wald = scipy.stats.wald()
142 | p_value = wald.pdf(wald_stats)
143 |
144 | return wald_stats,p_value
145 |
146 |
147 | def eval_feature_summary(train_X,model,civ_list,candidate_var_list,out_path=False):
148 | '''
149 | :param train_X: training dataset features DataFrame
150 | :param model: model file
151 | :param civ_list: list of InfoValue Class instances
152 | :param candidate_var_list: the list of model input variable
153 |     :param out_path: specify the DataFrame to csv file path, default False
154 | :return: DataFrame about feature summary
155 | '''
156 | feature_summary = {}
157 | feature_summary['feature_name'] = list(['Intercept'])
158 | feature_summary['feature_name'].extend(list(candidate_var_list))
159 | feature_summary['coef'] = [model['classifier'].intercept_]
160 | feature_summary['coef'].extend(model['classifier'].coef_[0])
161 | var_name = [civ.var_name for civ in civ_list]
162 | feature_summary['iv'] = [0]
163 | feature_summary['iv'].extend([civ_list[var_name.index(var)].iv for var in candidate_var_list])
164 | feature_summary['wald_stats'], feature_summary['p_value'] = wald_test(model['classifier'], train_X)
165 |
166 | feature_summary = pd.DataFrame(feature_summary)
167 | if out_path:
168 | file_name = out_path if isinstance(out_path, str) else None
169 | feature_summary.to_csv(file_name, index=False)
170 |
171 | return feature_summary
172 |
173 |
174 | def eval_segment_metrics(target, predict_proba, segment_cnt = 20,out_path=False):
175 | '''
176 | :param target: the list of actual target value
177 | :param predict_proba: the list of predicted probability
178 | :param segment_cnt: the segment number
179 |     :param out_path: specify the DataFrame to csv file path, default False
180 | :return: DataFrame about segment metrics
181 | '''
182 | proba_descend_idx = np.argsort(predict_proba)
183 | proba_descend_idx = proba_descend_idx[::-1]
184 |
185 | grp_idx = 1
186 | start_idx = 0
187 | total_sample_cnt = len(predict_proba)
188 | total_positive_sample_cnt = target.sum()
189 | total_negative_sample_cnt = total_sample_cnt - total_positive_sample_cnt
190 |
191 | segment_sample_cnt = int(len(predict_proba) / segment_cnt)
192 | cumulative_sample_percentage = 0.0
193 | cumulative_positive_percentage = 0.0
194 | cumulative_negative_percentage = 0.0
195 |
196 | segment_list = []
197 | columns = ['grp_idx', 'segment_sample_cnt', 'segment_sample_percentage', 'cumulative_sample_percentage',
198 | 'in_segment_positive_percentage', 'positive_percentage_in_total', 'cumulative_positive_percentage',
199 | 'cumulative_negative_percentage', 'ks']
200 |
201 | while start_idx < total_sample_cnt:
202 | s = {}
203 | s['grp_idx'] = grp_idx
204 | segment_idx_list = proba_descend_idx[start_idx : start_idx + segment_sample_cnt]
205 | segment_target = target[segment_idx_list]
206 |
207 | segment_sample_cnt = len(segment_idx_list)
208 | s['segment_sample_cnt'] = segment_sample_cnt
209 |
210 | segment_pos_cnt = segment_target.sum()
211 | segment_neg_cnt = segment_sample_cnt - segment_pos_cnt
212 |
213 | segment_sample_percentage = segment_sample_cnt*1.0/total_sample_cnt
214 | s['segment_sample_percentage'] = segment_sample_percentage
215 |
216 | pos_percentage_in_total = float(segment_pos_cnt * 100) / total_positive_sample_cnt
217 | neg_percentage_in_total = float(segment_neg_cnt * 100) / total_negative_sample_cnt
218 | s['positive_percentage_in_total'] = pos_percentage_in_total
219 |
220 | in_segment_positive_percentage = float(segment_pos_cnt) / segment_sample_cnt
221 | s['in_segment_positive_percentage'] = in_segment_positive_percentage
222 |
223 | cumulative_sample_percentage += segment_sample_percentage
224 | s['cumulative_sample_percentage'] = cumulative_sample_percentage
225 |
226 | cumulative_positive_percentage += pos_percentage_in_total
227 | cumulative_negative_percentage += neg_percentage_in_total
228 | s['cumulative_positive_percentage'] = cumulative_positive_percentage
229 | s['cumulative_negative_percentage'] = cumulative_negative_percentage
230 |
231 | ks = cumulative_positive_percentage - cumulative_negative_percentage
232 | s['ks'] = ks
233 |
234 | segment_list.append(s)
235 | grp_idx += 1
236 | start_idx += segment_sample_cnt
237 |
238 | segment_list = pd.DataFrame(segment_list,columns=columns)
239 | if out_path:
240 | file_name = out_path if isinstance(out_path, str) else None
241 | segment_list.to_csv(file_name, index=False)
242 |
243 | return segment_list
244 |
245 |
246 | def eval_model_stability(proba_train, proba_validation, segment_cnt = 10,out_path=False):
247 | '''
248 | :param proba_train: the list of predicted probability on training dataset
249 | :param proba_validation: the list of predicted probability on validation dataset
250 | :param segment_cnt: the segment number
251 |     :param out_path: specify the DataFrame to csv file path, default False
252 | :return: DataFrame about model stability
253 | '''
254 | step = 1.0/segment_cnt
255 | flag = 0.0
256 | model_stability = []
257 | len_train = len(proba_train)
258 | len_validation = len(proba_validation)
259 |
260 | columns = ['score_range','segment_train_percentage','segment_validation_percentage','difference',
261 | 'variance','ln_variance','stability_index']
262 |
263 | while flag < 1.0:
264 | temp = {}
265 |
266 | score_range = '['+str(flag)+','+str(flag + step)+')'
267 | segment_train_cnt = proba_train[(proba_train >= flag) & (proba_train < flag + step)].count()
268 | segment_train_percentage = segment_train_cnt*1.0/len_train
269 | segment_validation_cnt = proba_validation[(proba_validation >= flag) & (proba_validation < flag + step)].count()
270 | segment_validation_percentage = segment_validation_cnt * 1.0 / len_validation
271 | difference = segment_validation_percentage - segment_train_percentage
272 |         variance = float(segment_validation_percentage)/(segment_train_percentage + 0.000000001)
273 |         ln_variance = np.log(variance + 0.000000001)  # PSI uses ln(validation% / train%); cf. eval_feature_stability below
274 |         stability_index = difference * ln_variance
275 |
276 | temp['score_range'] = score_range
277 | temp['segment_train_percentage'] = segment_train_percentage
278 | temp['segment_validation_percentage'] = segment_validation_percentage
279 | temp['difference'] = difference
280 | temp['variance'] = variance
281 | temp['ln_variance'] = ln_variance
282 | temp['stability_index'] = stability_index
283 |
284 | model_stability.append(temp)
285 | flag += step
286 |
287 | model_stability = pd.DataFrame(model_stability,columns=columns)
288 | if out_path:
289 | file_name = out_path if isinstance(out_path, str) else None
290 | model_stability.to_csv(file_name, index=False)
291 |
292 | return model_stability
293 |
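# --- Editorial note (not part of the original module): the stability formula used above ---
# For each score bucket, eval_model_stability computes a Population Stability Index (PSI) style term
#     stability_index = (validation% - train%) * ln(validation% / train%)
# and the bucket terms can be summed into an overall PSI. For example, with train% = 0.20 and
# validation% = 0.25 the term is (0.25 - 0.20) * ln(0.25 / 0.20) ≈ 0.0112. A total PSI below
# roughly 0.1 is conventionally read as stable, and one above 0.25 as a significant shift.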
294 | def eval_feature_stability(civ_list, df_train, df_validation,candidate_var_list,out_path=False):
295 | '''
296 | :param civ_list: List of InfoValue Class instances
297 | :param df_train: DataFrame of training dataset
298 | :param df_validation: DataFrame of validation dataset
299 |     :param candidate_var_list: the list of model input variables
300 |     :param out_path: csv file path to write the DataFrame to; default False (no file output)
301 | :return: DataFrame about features stability
302 | '''
303 | psi_dict = {}
304 |
305 | civ_var_list = [civ_list[i].var_name for i in range(len(civ_list))]
306 | intersection = list(set(civ_var_list).intersection(set(candidate_var_list)))
307 | civ_idx_list = [civ_var_list.index(var) for var in intersection]
308 |
309 | len_train = len(df_train)
310 | len_validation = len(df_validation)
311 |
312 | psi_dict['feature_name'] = []
313 | psi_dict['group'] = []
314 | psi_dict['segment_train_cnt'] = []
315 | psi_dict['segment_train_percentage'] = []
316 | psi_dict['segment_validation_cnt'] = []
317 | psi_dict['segment_validation_percentage'] = []
318 |
319 | for i in civ_idx_list:
320 | if civ_list[i].is_discrete:
321 | for j in range(len(civ_list[i].split_list)):
322 | psi_dict['feature_name'].append(civ_list[i].var_name)
323 | psi_dict['group'].append(civ_list[i].split_list[j])
324 |
325 | civ_split_list = civ_list[i].split_list[j]
326 | segment_train_cnt = 0
327 | for m in civ_split_list:
328 | segment_train_cnt += df_train[civ_list[i].var_name][df_train[civ_list[i].var_name] == m].count()
329 |
330 | psi_dict['segment_train_cnt'].append(segment_train_cnt)
331 | psi_dict['segment_train_percentage'].append(float(segment_train_cnt)/len_train)
332 |
333 | segment_validation_cnt = 0
334 | for m in civ_split_list:
335 | segment_validation_cnt += df_validation[civ_list[i].var_name][df_validation[civ_list[i].var_name] == m].count()
336 |
337 | psi_dict['segment_validation_cnt'].append(segment_validation_cnt)
338 | psi_dict['segment_validation_percentage'].append(float(segment_validation_cnt)/len_validation)
339 |
340 | else:
341 | split_list = []
342 | split_list.append(float("-inf"))
343 | split_list.extend([temp for temp in civ_list[i].split_list])
344 | split_list.append(float("inf"))
345 | var_name = civ_list[i].var_name
346 |
347 |             for j in range(len(split_list)-2):  # one closed bin per pair of adjacent split points; the open-ended last bin is handled below
348 | psi_dict['feature_name'].append(civ_list[i].var_name)
349 | psi_dict['group'].append('('+str(split_list[j])+','+str(split_list[j+1])+']')
350 |
351 | segment_train_cnt = df_train[var_name][(df_train[var_name] > split_list[j])&(df_train[var_name] <= split_list[j+1])].count()
352 |
353 | psi_dict['segment_train_cnt'].append(segment_train_cnt)
354 | psi_dict['segment_train_percentage'].append(float(segment_train_cnt)/len_train)
355 |
356 | segment_validation_cnt = df_validation[var_name][(df_validation[var_name] > split_list[j])&
357 | (df_validation[var_name] <= split_list[j+1])].count()
358 |
359 | psi_dict['segment_validation_cnt'].append(segment_validation_cnt)
360 | psi_dict['segment_validation_percentage'].append(float(segment_validation_cnt)/len_validation)
361 |
362 | psi_dict['feature_name'].append(var_name)
363 | psi_dict['group'].append('(' + str(split_list[len(split_list)-2]) + ',+INF)')
364 |
365 |             segment_train_cnt = df_train[var_name][df_train[var_name] > split_list[len(split_list)-2]].count()
366 | psi_dict['segment_train_cnt'].append(segment_train_cnt)
367 | psi_dict['segment_train_percentage'].append(float(segment_train_cnt) / len_train)
368 |
369 |             segment_validation_cnt = df_validation[var_name][df_validation[var_name] > split_list[len(split_list)-2]].count()
370 | psi_dict['segment_validation_cnt'].append(segment_validation_cnt)
371 | psi_dict['segment_validation_percentage'].append(float(segment_validation_cnt) / len_validation)
372 |
373 | psi_dict['difference'] = pd.Series(psi_dict['segment_validation_percentage']) - pd.Series(psi_dict['segment_train_percentage'])
374 | psi_dict['variance'] = list(map(lambda x_y: x_y[0] / (x_y[1]+0.000000001), zip(psi_dict['segment_validation_percentage'], psi_dict['segment_train_percentage'])))
375 | psi_dict['Ln(variance)'] = np.log(np.array(psi_dict['variance'])+0.000000001)
376 | psi_dict['stability_index'] = np.array(psi_dict['difference']) * np.array(psi_dict['Ln(variance)'])
377 |
378 | columns = ['feature_name','group','segment_train_cnt','segment_train_percentage',
379 | 'segment_validation_cnt','segment_validation_percentage','difference',
380 | 'variance','Ln(variance)','stability_index']
381 |
382 | psi_df = pd.DataFrame(psi_dict, columns=columns)
383 | if out_path:
384 | file_name = out_path if isinstance(out_path, str) else None
385 | psi_df.to_csv(file_name, index=False)
386 |
387 | return psi_df
388 |
389 |
390 | def plot_ks(proba,target,axistype='pct',out_path=False):
391 | """
392 | plot k-s figure
393 | :param proba: 1-d array,prediction probability values
394 | :param target: 1-d array,the list of actual target value
395 |     :param axistype: x-axis type; must be either 'pct' (cumulative sample percent) or 'proba' (predicted probability)
396 |     :param out_path: file path to save the K-S figure; default False (show the figure instead)
397 | :return: DataFrame, figure summary
398 | """
399 | assert axistype in ['pct','proba'] , "KS Plot TypeError: Attribute 'axistype' must be either 'pct' or 'proba' !"
400 |
401 | a = pd.DataFrame(np.array([proba,target]).T,columns=['proba','target'])
402 | a.sort_values(by='proba',ascending=False,inplace=True)
403 | a['sum_Times']=a['target'].cumsum()
404 | total_1 = a['target'].sum()
405 | total_0 = len(a) - a['target'].sum()
406 |
407 | a['temp'] = 1
408 | a['Times']=a['temp'].cumsum()
409 | a['cdf1'] = a['sum_Times']/total_1
410 | a['cdf0'] = (a['Times'] - a['sum_Times'])/total_0
411 | a['ks'] = a['cdf1'] - a['cdf0']
412 | a['percent'] = a['Times']*1.0/len(a)
413 |
414 |     idx = a['ks'].idxmax()  # label of the row with the maximum K-S statistic
415 | # print(a.loc[idx])
416 |
417 | if axistype == 'pct':
418 | '''
419 |         K-S curve: the x axis is the cumulative sample percentage, with samples sorted by predicted probability in descending order
420 | '''
421 | plt.figure()
422 | plt.plot(a['percent'],a['cdf1'], label="CDF_positive")
423 | plt.plot(a['percent'],a['cdf0'],label="CDF_negative")
424 | plt.plot(a['percent'],a['ks'],label="K-S")
425 |
426 | sx = np.linspace(0,1,10)
427 | sy = sx
428 | plt.plot(sx,sy,linestyle='--',color='darkgrey',linewidth=1.2)
429 |
430 | plt.legend()
431 | plt.grid(True)
432 | ymin, ymax = plt.ylim()
433 | plt.xlabel('Sample percent')
434 | plt.ylabel('Cumulative probability')
435 | plt.title('Model Evaluation Index K-S')
436 | plt.axis('tight')
437 |
438 |         # dashed line marking the K-S point
439 | t = a.loc[idx]['percent']
440 | yb = round(a.loc[idx]['cdf1'],4)
441 | yg = round(a.loc[idx]['cdf0'],4)
442 |
443 | plt.plot([t,t],[yb,yg], color ='red', linewidth=1.4, linestyle="--")
444 | plt.scatter([t,],[yb,], 20, color ='dodgerblue')
445 | plt.annotate(r'$recall_p=%s$' % round(a.loc[idx]['cdf1'],4), xy=(t, yb), xycoords='data', xytext=(+10, -5),
446 | textcoords='offset points', fontsize=8,
447 | arrowprops=dict(arrowstyle='->', connectionstyle="arc3,rad=.1"))
448 |
449 | plt.scatter([t,],[yg,], 20, color ='darkorange')
450 | plt.annotate(r'$recall_n=%s$' % round(a.loc[idx]['cdf0'],4), xy=(t, yg), xycoords='data', xytext=(+10, -10),
451 | textcoords='offset points', fontsize=8,
452 | arrowprops=dict(arrowstyle='->', connectionstyle="arc3,rad=.1"))
453 |         # peak of the K-S curve
454 | plt.scatter([t,],[a.loc[idx]['ks'],], 20, color ='limegreen')
455 | plt.annotate(r'$ks=%s,p=%s$' % (round(a.loc[idx]['ks'],4)
456 | ,round(a.loc[idx]['proba'],4))
457 | , xy=(a.loc[idx]['percent'], a.loc[idx]['ks'])
458 | , xycoords='data'
459 | , xytext=(+15, -15),
460 | textcoords='offset points'
461 | , fontsize=8
462 | ,arrowprops=dict(arrowstyle='->', connectionstyle="arc3,rad=.1"))
463 | plt.annotate(r'$percent=%s,cnt=%s$' % (round(a.loc[idx]['percent'],4)
464 | ,round(a.loc[idx]['Times'],0))
465 | , xy=(a.loc[idx]['percent'], a.loc[idx]['ks'])
466 | , xycoords='data'
467 | , xytext=(+25, -25),
468 | textcoords='offset points'
469 | , fontsize=8
470 | )
471 |
472 | else:
473 | '''
474 |         Alternative x axis: 1 minus the predicted probability output by the model
475 | '''
476 | plt.figure()
477 | plt.grid(True)
478 | plt.plot(1-a['proba'],a['cdf1'], label="CDF_bad")
479 | plt.plot(1-a['proba'],a['cdf0'],label="CDF_good")
480 | plt.plot(1-a['proba'],a['ks'],label="ks")
481 |
482 | plt.legend()
483 | ymin, ymax = plt.ylim()
484 | plt.xlabel('1-[Predicted probability]')
485 | plt.ylabel('Cumulative probability')
486 | plt.title('Model Evaluation Index K-S')
487 | plt.axis('tight')
488 |         # the figure is shown or saved at the end of the function, after the annotations below are drawn
489 |         # dashed line marking the K-S point
490 | t = 1 - a.loc[idx]['proba']
491 | yb = round(a.loc[idx]['cdf1'],4)
492 | yg = round(a.loc[idx]['cdf0'],4)
493 |
494 | plt.plot([t,t],[yb,yg], color ='red', linewidth=1.4, linestyle="--")
495 | plt.scatter([t,],[yb,], 20, color ='dodgerblue')
496 | plt.annotate(r'$recall_p=%s$' % round(a.loc[idx]['cdf1'],4), xy=(t, yb), xycoords='data', xytext=(+10, -5),
497 | textcoords='offset points', fontsize=8,
498 | arrowprops=dict(arrowstyle='->', connectionstyle="arc3,rad=.1"))
499 |
500 | plt.scatter([t,],[yg,], 20, color ='darkorange')
501 | plt.annotate(r'$recall_n=%s$' % round(a.loc[idx]['cdf0'],4), xy=(t, yg), xycoords='data', xytext=(+10, -10),
502 | textcoords='offset points', fontsize=8,
503 | arrowprops=dict(arrowstyle='->', connectionstyle="arc3,rad=.1"))
504 |         # peak of the K-S curve
505 | plt.scatter([t,],[a.loc[idx]['ks'],], 20, color ='limegreen')
506 | plt.annotate(r'$ks=%s,p=%s$' % (round(a.loc[idx]['ks'],4)
507 | ,round(a.loc[idx]['proba'],4))
508 | , xy=(t, a.loc[idx]['ks'])
509 | , xycoords='data'
510 | , xytext=(+15, -15),
511 | textcoords='offset points'
512 | , fontsize=8
513 | ,arrowprops=dict(arrowstyle='->', connectionstyle="arc3,rad=.1"))
514 | plt.annotate(r'$percent=%s,cnt=%s$' % (round(a.loc[idx]['percent'],4)
515 | ,round(a.loc[idx]['Times'],0))
516 | , xy=(t, a.loc[idx]['ks'])
517 | , xycoords='data'
518 | , xytext=(+25, -25),
519 | textcoords='offset points'
520 | , fontsize=8
521 | )
522 |
523 | if out_path:
524 | file_name = out_path if isinstance(out_path, str) else None
525 | plt.savefig(file_name)
526 | else:
527 | plt.show()
528 |
529 | return a.loc[idx]
530 |
531 |
532 | def proc_validattion(dataset_path,config_path,model_path):
533 | print('####PROC VALIDATION#####')
534 | print('dataset_path:\n',dataset_path)
535 | print('config_path:\n',config_path)
536 | print('model_path:\n',model_path)
537 |     # fill null values before scoring; the config_path argument is used as passed in
538 |
539 | cfg = config.config()
540 | cfg.load_file(config_path, dataset_path)
541 |
542 | for var in [tmp for tmp in cfg.bin_var_list if tmp in list(cfg.dataset_train.columns)]:
543 | # fill null
544 | cfg.dataset_train.loc[cfg.dataset_train[var].isnull(), (var)] = 0
545 |
546 | for var in [tmp for tmp in cfg.discrete_var_list if tmp in list(cfg.dataset_train.columns)]:
547 | # fill null
548 | cfg.dataset_train.loc[cfg.dataset_train[var].isnull(), (var)] = 0
549 |
550 | output = open(model_path, 'rb')
551 | clf_model = pickle.load(output)
552 | output.close()
553 |
554 | clf = clf_model['clf']
555 | X_test = cfg.dataset_train[clf_model['features_list']]
556 | y_test = cfg.dataset_train['target']
557 |
558 | y_hat = clf.predict_proba(X_test)[:,1]
559 | ks = compute_ks(y_hat,y_test)
560 | print('global_bt:',cfg.global_bt)
561 | print('global_gt:', cfg.global_gt)
562 | print('ks:',ks)
563 | return ks
564 |
565 |
566 | def proc_cor_eval(dataset_path,config_path,var_list_specfied,out_file_path):
567 | dataset = pd.read_csv(dataset_path)
568 | cfg = pd.read_csv(config_path)
569 | candidate_var_list = cfg[cfg['is_modelfeature'] == 1]['var_name']
570 |
571 | b = [var for var in dataset.columns if sum(dataset[var].isnull()) == 0]
572 | candidate_var_list = list(set(candidate_var_list).intersection(set(b)))
573 |
574 | if var_list_specfied.__len__()>0:
575 | candidate_var_list = list(set(candidate_var_list).intersection(set(var_list_specfied)))
576 |
577 | print('candidate_var_list length:\n',candidate_var_list.__len__())
578 | print('candidate_var_list:\n',candidate_var_list)
579 |
580 | cor = np.corrcoef(dataset[candidate_var_list].values,rowvar=0)
581 | pd.DataFrame(cor,columns=candidate_var_list).to_csv(out_file_path,index=False)
--------------------------------------------------------------------------------
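A minimal usage sketch for the evaluation helpers in woe/eval.py above. The data below is synthetic and every variable name is illustrative only; out_path is left at its default, so nothing is written to disk.

import numpy as np
import pandas as pd
import woe.eval as eval_tools

np.random.seed(0)
proba_train = pd.Series(np.random.uniform(0, 1, 5000))       # hypothetical scores on the training set
proba_validation = pd.Series(np.random.uniform(0, 1, 2000))  # hypothetical scores on the validation set
target = np.random.binomial(1, proba_train)                  # synthetic labels correlated with the scores

# PSI-style comparison of the two score distributions over 10 equal-width probability buckets
stability = eval_tools.eval_model_stability(proba_train, proba_validation, segment_cnt=10)
print(stability[['score_range', 'stability_index']])

# K-S plot plus the summary row at the maximum K-S statistic
ks_summary = eval_tools.plot_ks(proba_train.values, target, axistype='pct')
print(ks_summary)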
/woe/feature_process.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | __author__ = 'boredbird'
3 | import numpy as np
4 | import woe.config as config
5 | import woe.eval as eval
6 | import copy
7 | import pickle
8 | import time
9 |
10 | class node:
11 | '''Tree Node Class
12 | '''
13 | def __init__(self,var_name=None,iv=0,split_point=None,right=None,left=None):
14 | self.var_name = var_name # The column index value of the attributes that are used to split data sets
15 | self.iv = iv # The info value of the node
16 | self.split_point = split_point # Store split points list
17 | self.right = right # Right sub tree
18 | self.left = left # Left sub tree
19 |
20 |
21 | class InfoValue(object):
22 | '''
23 | InfoValue Class
24 | '''
25 | def __init__(self):
26 | self.var_name = []
27 | self.split_list = []
28 | self.iv = 0
29 | self.woe_list = []
30 | self.iv_list = []
31 | self.is_discrete = 0
32 | self.sub_total_sample_num = []
33 | self.positive_sample_num = []
34 | self.negative_sample_num = []
35 | self.sub_total_num_percentage = []
36 | self.positive_rate_in_sub_total = []
37 | self.negative_rate_in_sub_total = []
38 |
39 | def init(self,civ):
40 | self.var_name = civ.var_name
41 | self.split_list = civ.split_list
42 | self.iv = civ.iv
43 | self.woe_list = civ.woe_list
44 | self.iv_list = civ.iv_list
45 | self.is_discrete = civ.is_discrete
46 | self.sub_total_sample_num = civ.sub_total_sample_num
47 | self.positive_sample_num = civ.positive_sample_num
48 | self.negative_sample_num = civ.negative_sample_num
49 | self.sub_total_num_percentage = civ.sub_total_num_percentage
50 | self.positive_rate_in_sub_total = civ.positive_rate_in_sub_total
51 | self.negative_rate_in_sub_total = civ.negative_rate_in_sub_total
52 |
53 |
54 | class DisInfoValue(object):
55 | '''
56 | A Class for the storage of discrete variables transformation information
57 | '''
58 | def __init__(self):
59 | self.var_name = None
60 | self.origin_value = []
61 | self.woe_before = []
62 |
63 |
64 | def change_feature_dtype(df,variable_type):
65 | '''
66 | change feature data type by the variable_type DataFrame
67 | :param df: dataset DataFrame
68 | :param variable_type: the DataFrame about variables dtypes
69 | :return: None
70 | '''
71 | s = 'Changing Feature Dtypes'
72 | print(s.center(60,'-'))
73 | for vname in df.columns:
74 | try:
75 | df[vname] = df[vname].astype(variable_type.loc[vname,'v_type'])
76 | print(vname,' '*(40-len(vname)),'{0: >10}'.format(variable_type.loc[vname,'v_type']))
77 | except Exception:
78 | print('[error]',vname)
79 | print('[original dtype] ',df.dtypes[vname],' [astype] ',variable_type.loc[vname,'v_type'])
80 | print('[unique value]',np.unique(df[vname]))
81 |
82 | s = 'Variable Dtypes Have Been Specified'
83 | print(s.center(60,'-'))
84 |
85 | return
86 |
87 | def check_point(df,var,split,min_sample):
88 | """
 89 |     Check whether any split point produces a bin with too few samples;
 90 |     if a bin contains fewer than min_sample observations, or only one target class,
 91 |     it is merged with the adjacent bin.
 92 |     Applies only to continuous variables.
 93 |     :param df: dataset DataFrame
 94 |     :param var: variable name
 95 |     :param split: split points list
 96 |     :param min_sample: minimum bin sample size
 97 |     :return: the checked split points list
 98 |     """
99 | new_split = []
100 | if split is not None and split.__len__()>0:
101 | # print('run into if line:98')
102 | new_split.append(split[0])
103 | # print(new_split)
104 | # Try the left section of the first split point partition;
105 | # If not meet the conditions then the split point will be removed
106 | pdf = df[df[var] <= split[0]]
107 | if (pdf.shape[0] < min_sample) or (len(np.unique(pdf['target']))<=1):
108 | # print('run into if line:105')
109 | new_split.pop()
110 | # print(new_split)
111 | for i in range(0,split.__len__()-1):
112 | pdf = df[(df[var] > split[i]) & (df[var] <= split[i+1])]
113 | if (pdf.shape[0] < min_sample) or (np.unique(pdf['target']).__len__()<=1):
114 | # print('run into if line:112')
115 | continue
116 | else:
117 | # print('run into if line:115')
118 | new_split.append(split[i+1])
119 | # print(new_split)
120 |
121 | #If the remaining sample is too small then remove the last one
122 | # print(new_split)
123 | # print(new_split.__len__())
124 |         if new_split.__len__()>1 and len(df[df[var] >= new_split[new_split.__len__()-1]]) < min_sample:
125 |             new_split.pop()
126 |             # print(new_split)
127 |
128 |         # If the remaining samples contain only one target class then remove the last split point
129 |         if new_split.__len__()>1 and np.unique(df[df[var] >= new_split[new_split.__len__()-1]]['target']).__len__()<=1:
130 | # print(split)
131 | # print(split[split.__len__()-1])
132 | # print(df[df[var] >= new_split[new_split.__len__()-1]].shape)
133 | # print(np.unique(df[df[new_split] > new_split[new_split.__len__()-1]]['target']))
134 | # print('run into if line:125')
135 | new_split.pop()
136 | # print(new_split)
137 |         # If every split point was removed by the checks above, fall back to the original split list
138 | if new_split == []:
139 | new_split = split
140 | else:
141 | pass
142 | return new_split
143 |
144 | def calulate_iv(df,var,global_bt,global_gt):
145 | '''
146 |     calculate the woe and iv value of a single group (no further splitting)
147 |     :param df: DataFrame containing the variable and the 'target' column
148 |     :param var: variable name
149 |     :param global_bt: total number of positive (bad) samples in the whole dataset
150 |     :param global_gt: total number of negative (good) samples in the whole dataset
151 |     :return: dict of woe, iv and group statistics
152 | '''
153 | # a = df.groupby(['target']).count()
154 | groupdetail = {}
155 | bt_sub = sum(df['target'])
156 | bri = (bt_sub + 0.0001)* 1.0 / global_bt
157 | gt_sub = df.shape[0] - bt_sub
158 | gri = (gt_sub + 0.0001)* 1.0 / global_gt
159 |
160 | groupdetail['woei'] = np.log(bri / gri)
161 | groupdetail['ivi'] = (bri - gri) * np.log(bri / gri)
162 | groupdetail['sub_total_num_percentage'] = df.shape[0]*1.0/(global_bt+global_gt)
163 | groupdetail['positive_sample_num'] = bt_sub
164 | groupdetail['negative_sample_num'] = gt_sub
165 | groupdetail['positive_rate_in_sub_total'] = bt_sub*1.0/df.shape[0]
166 | groupdetail['negative_rate_in_sub_total'] = gt_sub*1.0/df.shape[0]
167 |
168 | return groupdetail
169 |
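# --- Editorial note (not part of the original module): the WOE/IV formula implemented above ---
# For a single group, with bad_i / good_i the positive / negative counts in the group and
# global_bt / global_gt the totals over the whole dataset:
#     woe_i = ln( (bad_i / global_bt) / (good_i / global_gt) )
#     iv_i  = ( bad_i / global_bt - good_i / global_gt ) * woe_i
# calulate_iv adds 0.0001 to the counts so the ratio and the logarithm stay defined for empty groups.
# Example: bad_i = 30, good_i = 70 with global_bt = 300, global_gt = 700 gives
# woe_i = ln(0.1 / 0.1) = 0 and iv_i = 0: the group mirrors the global bad/good ratio
# and therefore carries no information about the target.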
170 |
171 | def calculate_iv_split(df,var,split_point,global_bt,global_gt):
172 | """
173 | calculate the iv value with the specified split point
174 | note:
175 |         the dataset must contain a 'target' column (this assumption should be encapsulated when time permits)
176 | :return:
177 | """
178 | #split dataset
179 | dataset_r = df[df.loc[:,var] > split_point][[var,'target']]
180 | dataset_l = df[df.loc[:,var] <= split_point][[var,'target']]
181 |
182 | r1_cnt = sum(dataset_r['target'])
183 | r0_cnt = dataset_r.shape[0] - r1_cnt
184 |
185 | l1_cnt = sum(dataset_l['target'])
186 | l0_cnt = dataset_l.shape[0] - l1_cnt
187 |
188 | if r0_cnt == 0 or r1_cnt == 0 or l0_cnt == 0 or l1_cnt ==0:
189 | return 0,0,0,dataset_l,dataset_r,0,0
190 |
191 | lbr = (l1_cnt+ 0.0001)*1.0/global_bt
192 | lgr = (l0_cnt+ 0.0001)*1.0/global_gt
193 | woel = np.log(lbr/lgr)
194 | ivl = (lbr-lgr)*woel
195 | rbr = (r1_cnt+ 0.0001)*1.0/global_bt
196 | rgr = (r0_cnt+ 0.0001)*1.0/global_gt
197 | woer = np.log(rbr/rgr)
198 | ivr = (rbr-rgr)*woer
199 | iv = ivl+ivr
200 |
201 | return woel,woer,iv,dataset_l,dataset_r,ivl,ivr
202 |
203 |
204 | def binning_data_split(df,var,global_bt,global_gt,min_sample,alpha=0.01):
205 | """
206 | Specify the data split level and return the split value list
207 | :return:
208 | """
209 | iv_var = InfoValue()
210 |     # Calculate the IV of the current node before it is split
211 | gd = calulate_iv(df, var,global_bt,global_gt)
212 |
213 | woei, ivi = gd['woei'],gd['ivi']
214 |
215 | if np.unique(df[var]).__len__() <=8:
216 | # print('running into if')
217 | split = list(np.unique(df[var]))
218 | split.sort()
219 | # print('split:',split)
220 | #Segmentation point checking and processing
221 | split = check_point(df, var, split, min_sample)
222 | split.sort()
223 | # print('after check:',split)
224 | iv_var.split_list = split
225 | return node(split_point=split,iv=ivi)
226 |
227 | percent_value = list(np.unique(np.percentile(df[var], range(100))))
228 | percent_value.sort()
229 |
230 | if percent_value.__len__() <=2:
231 |         iv_var.split_list = sorted(np.unique(percent_value))  # list.sort() returns None, so sorted() is used here
232 | return node(split_point=percent_value,iv=ivi)
233 |
234 | # A sentry that attempts to split the current node
235 | # Init bestSplit_iv with zero
236 | bestSplit_iv = 0
237 | bestSplit_woel = []
238 | bestSplit_woer = []
239 | bestSplit_ivl = 0
240 | bestSplit_ivr = 0
241 | bestSplit_point = []
242 |
243 |     # Skip the largest candidate value so that neither dataset_r nor dataset_l ends up empty
244 |     for point in percent_value[0:percent_value.__len__()-1]:
245 |         # If either side contains only one target class, or has fewer than min_sample rows, skip this candidate point
246 | if set(df[df[var] > point]['target']).__len__() == 1 or set(df[df[var] <= point]['target']).__len__() == 1 \
247 | or df[df[var] > point].shape[0] < min_sample or df[df[var] <= point].shape[0] < min_sample :
248 | continue
249 |
250 | woel, woer, iv, dataset_l, dataset_r, ivl, ivr = calculate_iv_split(df,var,point,global_bt,global_gt)
251 |
252 | if iv > bestSplit_iv:
253 | bestSplit_woel = woel
254 | bestSplit_woer = woer
255 | bestSplit_iv = iv
256 | bestSplit_point = point
257 | bestSplit_dataset_r = dataset_r
258 | bestSplit_dataset_l = dataset_l
259 | bestSplit_ivl = ivl
260 | bestSplit_ivr = ivr
261 |
262 |     # If the IV after the split exceeds the pre-split IV by more than the relative margin alpha,
263 |     # the split is accepted and both child nodes are split recursively (alpha defaults to 0.01)
264 | if bestSplit_iv > ivi*(1+alpha) and bestSplit_dataset_r.shape[0] > min_sample and bestSplit_dataset_l.shape[0] > min_sample:
265 | presplit_right = node()
266 | presplit_left = node()
267 |
268 | # Determine whether the right node satisfies the segmentation prerequisite
269 | if bestSplit_dataset_r.shape[0] < min_sample or set(bestSplit_dataset_r['target']).__len__() == 1:
270 | presplit_right.iv = bestSplit_ivr
271 | right = presplit_right
272 | else:
273 | right = binning_data_split(bestSplit_dataset_r,var,global_bt,global_gt,min_sample,alpha=0.01)
274 |
275 | # Determine whether the left node satisfies the segmentation prerequisite
276 | if bestSplit_dataset_l.shape[0] < min_sample or np.unique(bestSplit_dataset_l['target']).__len__() == 1:
277 | presplit_left.iv = bestSplit_ivl
278 | left = presplit_left
279 | else:
280 | left = binning_data_split(bestSplit_dataset_l,var,global_bt,global_gt,min_sample,alpha=0.01)
281 |
282 | return node(var_name=var,split_point=bestSplit_point,iv=ivi,left=left,right=right)
283 | else:
284 | # Returns the current node as the final leaf node
285 | return node(var_name=var,iv=ivi)
286 |
287 |
288 | def search(tree,split_list):
289 | '''
290 |     traverse the split tree and collect all split points
291 |     :param tree: an instance of the tree node class
292 |     :return: split points list (accumulated in split_list)
293 | '''
294 | if isinstance(tree.split_point, list):
295 | split_list.extend(tree.split_point)
296 | else:
297 | split_list.append(tree.split_point)
298 |
299 | if tree.left is not None:
300 | search(tree.left,split_list)
301 |
302 | if tree.right is not None:
303 | search(tree.right,split_list)
304 |
305 | return split_list
306 |
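# --- Editorial sketch (not part of the original module): how the split tree is flattened ---
# binning_data_split returns a binary tree of `node` objects; search() collects every split_point
# in that tree into one flat list. A hand-built example with hypothetical split values:
def _search_demo():
    leaf_l = node(var_name='age', split_point=25.0, iv=0.02)
    leaf_r = node(var_name='age', split_point=55.0, iv=0.03)
    root = node(var_name='age', split_point=40.0, iv=0.10, left=leaf_l, right=leaf_r)
    return sorted(x for x in search(root, []) if x is not None)   # -> [25.0, 40.0, 55.0]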
307 |
308 | def format_iv_split(df,var,split_list,global_bt,global_gt):
309 | '''
310 | Given the dataset DataFrame and split points list then return a InfoValue instance;
311 | Just for continuous variable
312 | :param df:
313 | :param var:
314 | :param split_list:
315 | :param global_bt:
316 | :param global_gt:
317 | :return:
318 | '''
319 | civ = InfoValue()
320 | civ.var_name = var
321 | civ.split_list = split_list
322 | dfcp = df[:]
323 |
324 | civ.sub_total_sample_num = []
325 | civ.positive_sample_num = []
326 | civ.negative_sample_num = []
327 | civ.sub_total_num_percentage = []
328 | civ.positive_rate_in_sub_total = []
329 |
330 | for i in range(0, split_list.__len__()):
331 | dfi = dfcp[dfcp[var] <= split_list[i]]
332 | dfcp = dfcp[dfcp[var] > split_list[i]]
333 | gd = calulate_iv(dfi, var,global_bt,global_gt)
334 | woei, ivi = gd['woei'],gd['ivi']
335 | civ.woe_list.append(woei)
336 | civ.iv_list.append(ivi)
337 | civ.sub_total_sample_num.append(dfi.shape[0])
338 | civ.positive_sample_num.append(gd['positive_sample_num'])
339 | civ.negative_sample_num.append(gd['negative_sample_num'])
340 | civ.sub_total_num_percentage.append(gd['sub_total_num_percentage'])
341 | civ.positive_rate_in_sub_total.append(gd['positive_rate_in_sub_total'])
342 | civ.negative_rate_in_sub_total.append(gd['negative_rate_in_sub_total'])
343 |
344 | if dfcp.shape[0]>0:
345 | gd = calulate_iv(dfcp, var,global_bt,global_gt)
346 | woei, ivi = gd['woei'],gd['ivi']
347 | civ.woe_list.append(woei)
348 | civ.iv_list.append(ivi)
349 | civ.sub_total_sample_num.append(dfcp.shape[0])
350 | civ.positive_sample_num.append(gd['positive_sample_num'])
351 | civ.negative_sample_num.append(gd['negative_sample_num'])
352 | civ.sub_total_num_percentage.append(gd['sub_total_num_percentage'])
353 | civ.positive_rate_in_sub_total.append(gd['positive_rate_in_sub_total'])
354 | civ.negative_rate_in_sub_total.append(gd['negative_rate_in_sub_total'])
355 |
356 | civ.iv = sum(civ.iv_list)
357 | return civ
358 |
359 |
360 | def woe_trans(dvar,civ):
361 | # replace the var value with the given woe value
362 | var = copy.deepcopy(dvar)
363 | if not civ.is_discrete:
364 | if civ.woe_list.__len__()>1:
365 | split_list = []
366 | split_list.append(float("-inf"))
367 | split_list.extend([i for i in civ.split_list])
368 | split_list.append(float("inf"))
369 |
370 | for i in range(civ.woe_list.__len__()):
371 | var[(dvar > split_list[i]) & (dvar <= split_list[i+1])] = civ.woe_list[i]
372 | else:
373 | var[:] = civ.woe_list[0]
374 | else:
375 | split_map = {}
376 | for i in range(civ.split_list.__len__()):
377 | for j in range(civ.split_list[i].__len__()):
378 | split_map[civ.split_list[i][j]] = civ.woe_list[i]
379 |
380 | var = var.map(split_map)
381 |
382 | return var
383 |
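# --- Editorial sketch (not part of the original module): applying woe_trans to a continuous variable ---
# A hypothetical InfoValue with split points [10.0, 20.0] defines the bins (-inf,10], (10,20], (20,+inf),
# and each value is replaced by the WOE of its bin:
def _woe_trans_demo():
    import pandas as pd
    civ_demo = InfoValue()
    civ_demo.var_name = 'income'
    civ_demo.is_discrete = 0
    civ_demo.split_list = [10.0, 20.0]
    civ_demo.woe_list = [-0.5, 0.1, 0.8]
    return woe_trans(pd.Series([5.0, 15.0, 25.0]), civ_demo)      # -> [-0.5, 0.1, 0.8]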
384 | def proc_woe_discrete(df,var,global_bt,global_gt,min_sample,alpha=0.01):
385 | '''
386 | process woe transformation of discrete variables
387 | :param df:
388 | :param var:
389 | :param global_bt:
390 | :param global_gt:
391 | :param min_sample:
392 | :return:
393 | '''
394 | s = 'process discrete variable:'+str(var)
395 | print(s.center(60, '-'))
396 |
397 | df = df[[var,'target']]
398 | div = DisInfoValue()
399 | div.var_name = var
400 | rdict = {}
401 | cpvar = df[var]
402 | # print('np.unique(df[var]):',np.unique(df[var]))
403 | for var_value in np.unique(df[var]):
404 |         # The comparison below uses '=='; fill NaN values beforehand, otherwise a type error may occur
405 | df_temp = df[df[var] == var_value]
406 | gd = calulate_iv(df_temp,var,global_bt,global_gt)
407 | woei, ivi = gd['woei'],gd['ivi']
408 | div.origin_value.append(var_value)
409 | div.woe_before.append(woei)
410 | rdict[var_value] = woei
411 | # print(var_value,woei,ivi)
412 |
413 | cpvar = cpvar.map(rdict)
414 | df[var] = cpvar
415 |
416 | iv_tree = binning_data_split(df,var,global_bt,global_gt,min_sample,alpha)
417 |
418 | # Traversal tree, get the segmentation point
419 | split_list = []
420 | search(iv_tree, split_list)
421 | split_list = list(np.unique([1.0 * x for x in split_list if x is not None]))
422 | split_list.sort()
423 |
424 | # Segmentation point checking and processing
425 | split_list = check_point(df, var, split_list, min_sample)
426 | split_list.sort()
427 |
428 | civ = format_iv_split(df, var, split_list,global_bt,global_gt)
429 | civ.is_discrete = 1
430 |
431 | split_list_temp = []
432 | split_list_temp.append(float("-inf"))
433 | split_list_temp.extend([i for i in split_list])
434 | split_list_temp.append(float("inf"))
435 |
436 | a = []
437 | for i in range(split_list_temp.__len__() - 1):
438 | temp = []
439 | for j in range(div.origin_value.__len__()):
440 | if (div.woe_before[j]>split_list_temp[i]) & (div.woe_before[j]<=split_list_temp[i+1]):
441 | temp.append(div.origin_value[j])
442 |
443 | if temp != [] :
444 | a.append(temp)
445 |
446 | civ.split_list = a
447 |
448 | return civ
449 |
450 |
451 | def proc_woe_continuous(df,var,global_bt,global_gt,min_sample,alpha=0.01):
452 | '''
453 |     process woe transformation of continuous variables
454 | :param df:
455 | :param var:
456 | :param global_bt:
457 | :param global_gt:
458 | :param min_sample:
459 | :return:
460 | '''
461 | s = 'process continuous variable:'+str(var)
462 | print(s.center(60, '-'))
463 | df = df[[var,'target']]
464 | iv_tree = binning_data_split(df, var,global_bt,global_gt,min_sample,alpha)
465 |
466 | # Traversal tree, get the segmentation point
467 | split_list = []
468 | search(iv_tree, split_list)
469 | split_list = list(np.unique([1.0 * x for x in split_list if x is not None]))
470 | split_list.sort()
471 |
472 | # Segmentation point checking and processing
473 | split_list = check_point(df, var, split_list, min_sample)
474 | split_list.sort()
475 |
476 | civ = format_iv_split(df, var,split_list,global_bt,global_gt)
477 |
478 | return civ
479 |
480 | def fillna(dataset,bin_var_list,discrete_var_list,continuous_filler=-1,discrete_filler='missing'):
481 | """
482 |     fill the null values in the DataFrame in place
483 |     :param dataset: input dataset, pandas.DataFrame type
484 |     :param bin_var_list: continuous variable name list
485 |     :param discrete_var_list: discrete variable name list
486 |     :param continuous_filler: the value used to fill nulls in continuous variables
487 |     :param discrete_filler: the value used to fill nulls in discrete variables
488 |     :return: None; null values are replaced in place
489 | """
490 | for var in [tmp for tmp in bin_var_list if tmp in list(dataset.columns)]:
491 | # fill null
492 | dataset.loc[dataset[var].isnull(), (var)] = continuous_filler
493 |
494 | for var in [tmp for tmp in discrete_var_list if tmp in list(dataset.columns)]:
495 | # fill null
496 | dataset.loc[dataset[var].isnull(), (var)] = discrete_filler
497 |
498 |
499 | def process_train_woe(infile_path=None,outfile_path=None,rst_path=None,config_path=None):
500 | print('run into process_train_woe: \n',time.asctime(time.localtime(time.time())))
501 | data_path = infile_path
502 | cfg = config.config()
503 | cfg.load_file(config_path,data_path)
504 | bin_var_list = [tmp for tmp in cfg.bin_var_list if tmp in list(cfg.dataset_train.columns)]
505 |
506 | for var in bin_var_list:
507 | # fill null
508 | cfg.dataset_train.loc[cfg.dataset_train[var].isnull(), (var)] = -1
509 |
510 | # change feature dtypes
511 | change_feature_dtype(cfg.dataset_train, cfg.variable_type)
512 | rst = []
513 |
514 | # process woe transformation of continuous variables
515 | print('process woe transformation of continuous variables: \n',time.asctime(time.localtime(time.time())))
516 | print('cfg.global_bt',cfg.global_bt)
517 | print('cfg.global_gt', cfg.global_gt)
518 |
519 | for var in bin_var_list:
520 | rst.append(proc_woe_continuous(cfg.dataset_train,var,cfg.global_bt,cfg.global_gt,cfg.min_sample,alpha=0.05))
521 |
522 | # process woe transformation of discrete variables
523 | print('process woe transformation of discrete variables: \n',time.asctime(time.localtime(time.time())))
524 | for var in [tmp for tmp in cfg.discrete_var_list if tmp in list(cfg.dataset_train.columns)]:
525 | # fill null
526 | cfg.dataset_train.loc[cfg.dataset_train[var].isnull(), (var)] = 'missing'
527 | rst.append(proc_woe_discrete(cfg.dataset_train,var,cfg.global_bt,cfg.global_gt,cfg.min_sample,alpha=0.05))
528 |
529 | feature_detail = eval.eval_feature_detail(rst, outfile_path)
530 |
531 | print('save woe transformation rule into pickle: \n',time.asctime(time.localtime(time.time())))
532 | output = open(rst_path, 'wb')
533 | pickle.dump(rst,output)
534 | output.close()
535 |
536 | return feature_detail,rst
537 |
538 |
539 | def process_woe_trans(in_data_path=None,rst_path=None,out_path=None,config_path=None):
540 | cfg = config.config()
541 | cfg.load_file(config_path, in_data_path)
542 |
543 | for var in [tmp for tmp in cfg.bin_var_list if tmp in list(cfg.dataset_train.columns)]:
544 | # fill null
545 | cfg.dataset_train.loc[cfg.dataset_train[var].isnull(), (var)] = -1
546 |
547 | for var in [tmp for tmp in cfg.discrete_var_list if tmp in list(cfg.dataset_train.columns)]:
548 | # fill null
549 | cfg.dataset_train.loc[cfg.dataset_train[var].isnull(), (var)] = 'missing'
550 |
551 | change_feature_dtype(cfg.dataset_train, cfg.variable_type)
552 |
553 | output = open(rst_path, 'rb')
554 | rst = pickle.load(output)
555 | output.close()
556 |
557 | # Training dataset Woe Transformation
558 | for r in rst:
559 | cfg.dataset_train[r.var_name] = woe_trans(cfg.dataset_train[r.var_name], r)
560 |
561 | cfg.dataset_train.to_csv(out_path)
562 |
--------------------------------------------------------------------------------
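A condensed usage sketch of the training pipeline implemented in woe/feature_process.py above. Every file path below is a placeholder; the config csv must follow the layout expected by woe.config.config, and the training dataset must contain a 'target' column.

import woe.feature_process as fp

config_path = 'config.csv'              # placeholder: variable-type / binning configuration
data_path = 'train.csv'                 # placeholder: training dataset with a 'target' column
feature_detail_path = 'features_detail.csv'
rst_pkl_path = 'woe_rule.pkl'

# derive WOE rules (split points, WOE and IV per bin) from the training data and persist them
feature_detail, rst = fp.process_train_woe(infile_path=data_path,
                                           outfile_path=feature_detail_path,
                                           rst_path=rst_pkl_path,
                                           config_path=config_path)

# apply the persisted rules to a dataset and write the WOE-transformed result
fp.process_woe_trans(in_data_path=data_path,
                     rst_path=rst_pkl_path,
                     out_path='dataset_woed.csv',
                     config_path=config_path)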
/woe/ftrl.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | __author__ = 'boredbird'
3 | import numpy as np
4 |
5 | class LR(object):
6 | @staticmethod
7 | def fn(w, x):
8 | '''sigmoid function
9 | '''
10 | return 1.0 / (1.0 + np.exp(-w.dot(x)))
11 |
12 | @staticmethod
13 | def loss(y, y_hat):
14 | '''Cross entropy loss function
15 | '''
16 | return np.sum(np.nan_to_num(-y * np.log(y_hat) - (1 - y) * np.log(1 - y_hat)))
17 |
18 | @staticmethod
19 | def grad(y, y_hat, x):
20 |         '''The first derivative of the cross entropy loss with respect to the weights w
21 | '''
22 | return (y_hat - y) * x
23 |
24 |
25 | class FTRL(object):
26 | def __init__(self, dim, l1, l2, alpha, beta, decisionFunc=LR):
27 | self.dim = dim
28 | self.decisionFunc = decisionFunc
29 | self.z = np.zeros(dim)
30 | self.n = np.zeros(dim)
31 | self.w = np.zeros(dim)
32 | self.w_list = []
33 | self.loss_list = []
34 | self.l1 = l1
35 | self.l2 = l2
36 | self.alpha = alpha
37 | self.beta = beta
38 |
39 | def predict(self, x):
40 | return self.decisionFunc.fn(self.w, x)
41 |
42 | def update(self, x, y):
43 | self.w = np.array([0 if np.abs(self.z[i]) <= self.l1 else (np.sign(
44 |             self.z[i]) * self.l1 - self.z[i]) / (self.l2 + (self.beta + np.sqrt(self.n[i])) / self.alpha) for i in range(self.dim)])  # range: works on both Python 2 and 3
45 | y_hat = self.predict(x)
46 | g = self.decisionFunc.grad(y, y_hat, x)
47 | sigma = (np.sqrt(self.n + g * g) - np.sqrt(self.n)) / self.alpha
48 | self.z += g - sigma * self.w
49 | self.n += g * g
50 | return self.decisionFunc.loss(y, y_hat)
51 |
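# --- Editorial note (not part of the original module): the per-coordinate update implemented above ---
# update() follows the FTRL-Proximal rule. For each coordinate i, with accumulated statistics z_i, n_i:
#     w_i = 0                                                              if |z_i| <= l1
#     w_i = -(z_i - sign(z_i) * l1) / (l2 + (beta + sqrt(n_i)) / alpha)    otherwise
# After computing the prediction y_hat and the gradient g = (y_hat - y) * x:
#     sigma_i = (sqrt(n_i + g_i^2) - sqrt(n_i)) / alpha
#     z_i    += g_i - sigma_i * w_i
#     n_i    += g_i^2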
52 | def train(self, trainSet, verbos=False, max_itr=10000000000, eta=0.01, epochs=100):
53 | itr = 0
54 | n = 0
55 | while True:
56 | for x, y in trainSet:
57 | loss = self.update(x, y)
58 | if verbos and n%verbos==0:
59 | print("itr=" + str(n) + "\tloss=" + str(loss))
60 | self.w_list.append(self.w)
61 | self.loss_list.append(loss)
62 | if loss < eta:
63 | itr += 1
64 | else:
65 | itr = 0
66 |                 if itr >= epochs:  # stop once the loss has stayed below eta for `epochs` consecutive iterations
67 |                     print("loss has been less than", eta, "for", itr, "consecutive iterations")
68 | return
69 | n += 1
70 | if n >= max_itr:
71 | print("reach max iteration", max_itr)
72 | return
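# --- Editorial sketch (not part of the original module): minimal FTRL usage on synthetic data ---
# All hyper-parameter values and array shapes below are illustrative only.
if __name__ == '__main__':
    rng = np.random.RandomState(0)
    X = rng.randn(200, 3)
    X[:, 2] = 1.0                                    # constant column acting as the bias term
    true_w = np.array([1.5, -2.0, 0.3])
    y = (1.0 / (1.0 + np.exp(-X.dot(true_w))) > 0.5).astype(float)

    model = FTRL(dim=3, l1=1.0, l2=1.0, alpha=0.1, beta=1.0)
    model.train(list(zip(X, y)), verbos=50, max_itr=5000, eta=0.01, epochs=100)
    print('learned weights:', model.w)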
--------------------------------------------------------------------------------