├── __init__.py
├── utils
    ├── __init__.py
    ├── simpleMethods.py
    ├── mapa.py
    ├── backwardSplit.py
    ├── forwardSplit.py
    ├── trendSplit.py
    └── trendDiscretization.py
├── autoBinning
    ├── __init__.py
    └── utils
    │   ├── __init__.py
    │   ├── simpleMethods.py
    │   ├── mapa.py
    │   ├── backwardSplit.py
    │   ├── forwardSplit.py
    │   ├── trendSplit.py
    │   └── trendDiscretization.py
├── setup.cfg
├── doc
    ├── woe1.JPG
    └── woe2.JPG
├── setup.py
├── LICENSE
├── .gitignore
├── README.md
└── test.py


/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-


--------------------------------------------------------------------------------
/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-


--------------------------------------------------------------------------------
/autoBinning/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-


--------------------------------------------------------------------------------
/autoBinning/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | description-file = README.md


--------------------------------------------------------------------------------
/doc/woe1.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kaiwang0112006/autoBinning/HEAD/doc/woe1.JPG


--------------------------------------------------------------------------------
/doc/woe2.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kaiwang0112006/autoBinning/HEAD/doc/woe2.JPG


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | import setuptools
 2 | 
 3 | with open("README.md", "r", encoding='utf-8') as fh:
 4 |     long_description = fh.read()
 5 | 
 6 | setuptools.setup(
 7 |     name="autoBinning",
 8 |     version="0.1.7",
 9 |     author="Kai Wang",
10 |     author_email="wangkai0112006@163.com",
11 |     description="A small package for feature autoBinning",
12 |     long_description=long_description,
13 |     long_description_content_type="text/markdown",
14 |     url="https://github.com/kaiwang0112006/autoBinning",
15 |     packages=setuptools.find_packages(),
16 |     install_requires=[
17 |         'numpy',
18 |         'scipy',
19 |     ],
20 |     classifiers=[
21 |         "Programming Language :: Python :: 3",
22 |         "License :: OSI Approved :: MIT License",
23 |         "Operating System :: OS Independent",
24 |     ],
25 | )
26 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2019 Kai Wang
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | share/python-wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | MANIFEST
 28 | 
 29 | # PyInstaller
 30 | #  Usually these files are written by a python script from a template
 31 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 32 | *.manifest
 33 | *.spec
 34 | 
 35 | # Installer logs
 36 | pip-log.txt
 37 | pip-delete-this-directory.txt
 38 | 
 39 | # Unit test / coverage reports
 40 | htmlcov/
 41 | .tox/
 42 | .nox/
 43 | .coverage
 44 | .coverage.*
 45 | .cache
 46 | nosetests.xml
 47 | coverage.xml
 48 | *.cover
 49 | .hypothesis/
 50 | .pytest_cache/
 51 | 
 52 | # Translations
 53 | *.mo
 54 | *.pot
 55 | 
 56 | # Django stuff:
 57 | *.log
 58 | local_settings.py
 59 | db.sqlite3
 60 | 
 61 | # Flask stuff:
 62 | instance/
 63 | .webassets-cache
 64 | 
 65 | # Scrapy stuff:
 66 | .scrapy
 67 | 
 68 | # Sphinx documentation
 69 | docs/_build/
 70 | 
 71 | # PyBuilder
 72 | target/
 73 | 
 74 | # Jupyter Notebook
 75 | .ipynb_checkpoints
 76 | 
 77 | # IPython
 78 | profile_default/
 79 | ipython_config.py
 80 | 
 81 | # pyenv
 82 | .python-version
 83 | 
 84 | # celery beat schedule file
 85 | celerybeat-schedule
 86 | 
 87 | # SageMath parsed files
 88 | *.sage.py
 89 | 
 90 | # Environments
 91 | .env
 92 | .venv
 93 | env/
 94 | venv/
 95 | ENV/
 96 | env.bak/
 97 | venv.bak/
 98 | 
 99 | # Spyder project settings
100 | .spyderproject
101 | .spyproject
102 | 
103 | # Rope project settings
104 | .ropeproject
105 | 
106 | # mkdocs documentation
107 | /site
108 | 
109 | # mypy
110 | .mypy_cache/
111 | .dmypy.json
112 | dmypy.json
113 | 
114 | # Pyre type checker
115 | .pyre/


--------------------------------------------------------------------------------
/utils/simpleMethods.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import numpy as np 
 3 | import pandas as pd
 4 | import math
 5 | import copy
 6 | 
 7 | class simpleMethods:
 8 |     def __init__(self,x, missing=None,force=False, categorical=False):
 9 |         self.x_org = x
10 |         self.range_dict = {}
11 |         self.missing=missing
12 |         self.categorical = categorical
13 |         if self.missing == None:
14 |             self.x = copy.deepcopy(self.x_org)
15 |             self.x_miss = None
16 |         else:
17 |             self.x = self.x_org[self.x_org!=self.missing]
18 |             self.x_miss = self.x_org[self.x_org==self.missing]
19 |         self.force=force
20 |         
21 |     def equalValue(self,size):
22 |         '''
23 |         x 等间距划分分箱  -> (0-0.1,0.1-0.2...)
24 |         :param size:
25 |         :return:
26 |         '''
27 |         self.range_dict = {}
28 | 
29 |         self.bins = np.linspace(min(self.x), max(self.x), size+1)
30 | 
31 |         for i in range(len(self.bins)-1):
32 |             self.range_dict[(self.bins[i],self.bins[i+1])] = i
33 | 
34 |         return self
35 | 
36 |     def equalHist(self,size):
37 |         '''
38 |         基于np.histogram分箱
39 |         :param size: bin数目
40 |         :return:
41 |         '''
42 |         self.down = {}
43 |         self.hist, self.bins = np.histogram(self.x, bins=size)
44 | 
45 | 
46 |         for i in range(len(self.bins)-1):
47 |             start = self.bins[i]
48 |             end = self.bins[i+1]
49 | 
50 |             self.range_dict[(start, end)] = i       
51 | 
52 |         return self
53 |     
54 |     def equalSize(self,size):
55 |         '''
56 |         每个分箱样本数平均
57 |         :param size:
58 |         :return:
59 |         '''
60 |         self.range_dict = {}
61 |         # use pandas qcut
62 |         #_, self.bins = pd.qcut(self.x,size,retbins='True',duplicates='drop')
63 |         #self.bins = sorted(list(self.bins))
64 |         # use numpy instead
65 |         breakpoints = np.arange(0, size + 1) / (size) * 100
66 |         self.bins = [np.percentile(self.x, b) for b in breakpoints]
67 | 
68 |         for i in range(len(self.bins)-1):
69 |             start = self.bins[i]
70 |             end = self.bins[i+1]
71 | 
72 |             self.range_dict[(start, end)] = i       
73 | 
74 |         self.bins = np.array(self.bins)
75 |         return self
76 | 
77 |     def everysplit(self):
78 |         '''
79 |         最细粒度切分
80 |         :return:
81 |         '''
82 |         if (len(set(self.x))<=10 and not self.force) or self.categorical:
83 |             self.bins = sorted(list(set(self.x)))
84 |             self.bins.append(max(self.bins)+1)
85 |             self.bins = np.array(self.bins)
86 |         else:
87 |             x_sort = sorted(list(set(self.x)),reverse=False)
88 |             bins = [x_sort[0]]
89 |             for i in range(len(x_sort)-1):
90 |                 bins.append((x_sort[i]+x_sort[i+1])/2)
91 |             bins.append(x_sort[-1]+1)
92 |             self.bins = np.array(bins)
93 | 
94 |         self.range_dict = {}
95 |         for i in range(len(self.bins)-1):
96 |             start = self.bins[i]
97 |             end = self.bins[i+1]
98 | 
99 |             self.range_dict[(start, end)] = i


--------------------------------------------------------------------------------
/autoBinning/utils/simpleMethods.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import numpy as np 
 3 | import pandas as pd
 4 | import math
 5 | import copy
 6 | 
 7 | class simpleMethods:
 8 |     def __init__(self,x, missing=None,force=False, categorical=False):
 9 |         self.x_org = x
10 |         self.range_dict = {}
11 |         self.missing=missing
12 |         self.categorical = categorical
13 |         if self.missing == None:
14 |             self.x = copy.deepcopy(self.x_org)
15 |             self.x_miss = None
16 |         else:
17 |             self.x = self.x_org[self.x_org!=self.missing]
18 |             self.x_miss = self.x_org[self.x_org==self.missing]
19 |         self.force=force
20 |         
21 |     def equalValue(self,size):
22 |         '''
23 |         x 等间距划分分箱  -> (0-0.1,0.1-0.2...)
24 |         :param size:
25 |         :return:
26 |         '''
27 |         self.range_dict = {}
28 | 
29 |         self.bins = np.linspace(min(self.x), max(self.x), size+1)
30 | 
31 |         for i in range(len(self.bins)-1):
32 |             self.range_dict[(self.bins[i],self.bins[i+1])] = i
33 | 
34 |         return self
35 | 
36 |     def equalHist(self,size):
37 |         '''
38 |         基于np.histogram分箱
39 |         :param size: bin数目
40 |         :return:
41 |         '''
42 |         self.down = {}
43 |         self.hist, self.bins = np.histogram(self.x, bins=size)
44 | 
45 | 
46 |         for i in range(len(self.bins)-1):
47 |             start = self.bins[i]
48 |             end = self.bins[i+1]
49 | 
50 |             self.range_dict[(start, end)] = i       
51 | 
52 |         return self
53 |     
54 |     def equalSize(self,size):
55 |         '''
56 |         每个分箱样本数平均
57 |         :param size:
58 |         :return:
59 |         '''
60 |         self.range_dict = {}
61 |         # use pandas qcut
62 |         #_, self.bins = pd.qcut(self.x,size,retbins='True',duplicates='drop')
63 |         #self.bins = sorted(list(self.bins))
64 |         # use numpy instead
65 |         breakpoints = np.arange(0, size + 1) / (size) * 100
66 |         self.bins = [np.percentile(self.x, b) for b in breakpoints]
67 | 
68 |         for i in range(len(self.bins)-1):
69 |             start = self.bins[i]
70 |             end = self.bins[i+1]
71 | 
72 |             self.range_dict[(start, end)] = i       
73 | 
74 |         self.bins = np.array(self.bins)
75 |         return self
76 | 
77 |     def everysplit(self):
78 |         '''
79 |         最细粒度切分
80 |         :return:
81 |         '''
82 |         if (len(set(self.x))<=10 and not self.force) or self.categorical:
83 |             self.bins = sorted(list(set(self.x)))
84 |             self.bins.append(max(self.bins)+1)
85 |             self.bins = np.array(self.bins)
86 |         else:
87 |             x_sort = sorted(list(set(self.x)),reverse=False)
88 |             bins = [x_sort[0]]
89 |             for i in range(len(x_sort)-1):
90 |                 bins.append((x_sort[i]+x_sort[i+1])/2)
91 |             bins.append(x_sort[-1]+1)
92 |             self.bins = np.array(bins)
93 | 
94 |         self.range_dict = {}
95 |         for i in range(len(self.bins)-1):
96 |             start = self.bins[i]
97 |             end = self.bins[i+1]
98 | 
99 |             self.range_dict[(start, end)] = i


--------------------------------------------------------------------------------
/utils/mapa.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | from .trendSplit import *
  3 | import numpy as np
  4 | import copy
  5 | import math
  6 | 
  7 | 
  8 | class MAPA(trendSplit):
  9 |     def __init__(self, x, y, bad=1):
 10 |         trendSplit.__init__(self, x, y, bad)
 11 | 
 12 |     def fit(self,trend='up',sby='woe'):
 13 |         '''
 14 |         :param num_split: 最大切割点数,不包含最大最小值
 15 |         :param minv: 最小分裂所需数值，woe/iv
 16 |         :param sby: 'woe','iv','woeiv'
 17 |         :param min_sample: 每个分箱最小样本数
 18 |         :return: numpy array -- 切割点数组
 19 |         '''
 20 |         self.set_init()
 21 |         self.everysplit()
 22 |         if trend == 'auto':
 23 |             self.candidateTrend()
 24 |         else:
 25 |             self.trend = trend
 26 |         self.test = {}
 27 |         candidate = []
 28 |         for r in self.range_dict:
 29 |             candidate.append(r[0])
 30 |             candidate.append(r[1])
 31 | 
 32 |         if self.trend == 'up':
 33 |             self.candidate = sorted(list(set(candidate)),reverse=False)
 34 |         else:
 35 |             self.candidate = sorted(list(set(candidate)), reverse=True)
 36 | 
 37 |         cut_list, v = self.find_cut(sby=sby)
 38 |         self.cut_range = [cut_list[-1]]
 39 |         self.cut_range.append(self.candidate[0])
 40 |         self.cut_range.append(self.candidate[-1])
 41 |         for d in cut_list:
 42 |             self.candidate.remove(d)
 43 | 
 44 |         while True:
 45 |             cut_list, v = self.find_cut(sby=sby)
 46 |             if len(cut_list)>0:
 47 |                 self.cut_range.append(cut_list[-1])
 48 |                 self.cut_range = sorted(list(set(self.cut_range)))
 49 |                 for d in cut_list:
 50 |                     self.candidate.remove(d)
 51 |             else:
 52 |                 break
 53 |         self.bins = np.array(sorted(list(set(self.cut_range))))
 54 |         print(self.test)
 55 | 
 56 |     def find_cut(self,trend='up',sby='woe'):
 57 |         '''
 58 |         :param minv: 最小分裂所需数值，woe/iv
 59 |         :param sby: 'woe','iv','woeiv'
 60 |         :param iv_base: 上一轮的iv值，sby='woe'时不用考虑
 61 |         :return:
 62 |         '''
 63 |         cut_list = []
 64 |         cut = None
 65 |         minv = 0 # bad rate
 66 |         for c in self.candidate:
 67 |             if c != self.candidate[0]:
 68 |                 if trend == 'up':
 69 |                     v = self.value[(self.x<c) & (self.x>=self.candidate[0])]
 70 |                 else:
 71 |                     v = self.value[(self.x >=c) & (self.x < self.candidate[0])]
 72 | 
 73 |                 if len(v)>0:
 74 |                     if sby == 'woe':
 75 |                         badr = self._cal_woe(v)
 76 |                     elif badr == 'bad':
 77 |                         badr = len(v[v == self.bad]) / len(v)
 78 |                     else:
 79 |                         badr = 0
 80 |                 else:
 81 |                     badr = 0
 82 |                 self.test[c] = badr
 83 |                 if badr>=minv:
 84 |                     minv = badr
 85 |                     cut_list.append(c)
 86 |                 else:
 87 |                     break
 88 |         return cut_list, minv
 89 | 
 90 |     def candidateTrend(self):
 91 |         trend_up = 0
 92 |         trend_down = 0
 93 | 
 94 |         candidate_list = copy.deepcopy(self.candidate)
 95 |         for i in range(1, len(candidate_list) - 1):
 96 |             v_up = self.value[(self.x < candidate_list[i]) & (self.x >= candidate_list[i - 1])]
 97 |             v_down = self.value[(self.x < candidate_list[i + 1]) & (self.x >= candidate_list[i])]
 98 |             woe_up = self._cal_woe(v_up)
 99 |             woe_down = self._cal_woe(v_down)
100 |             iv_up = self._cal_iv(v_up)
101 |             iv_down = self._cal_iv(v_down)
102 |             if woe_up > woe_down:
103 |                 trend_up += 1
104 |             elif woe_up < woe_down:
105 |                 trend_down += 1
106 |         if trend_up > trend_down:
107 |             self.trend = 'up'
108 |         else:
109 |             self.trend = 'down'
110 | 


--------------------------------------------------------------------------------
/autoBinning/utils/mapa.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | from .trendSplit import *
  3 | import numpy as np
  4 | import copy
  5 | import math
  6 | 
  7 | 
  8 | class MAPA(trendSplit):
  9 |     def __init__(self, x, y, bad=1):
 10 |         trendSplit.__init__(self, x, y, bad)
 11 | 
 12 |     def fit(self,trend='up',sby='woe'):
 13 |         '''
 14 |         :param num_split: 最大切割点数,不包含最大最小值
 15 |         :param minv: 最小分裂所需数值，woe/iv
 16 |         :param sby: 'woe','iv','woeiv'
 17 |         :param min_sample: 每个分箱最小样本数
 18 |         :return: numpy array -- 切割点数组
 19 |         '''
 20 |         self.set_init()
 21 |         self.everysplit()
 22 |         if trend == 'auto':
 23 |             self.candidateTrend()
 24 |         else:
 25 |             self.trend = trend
 26 |         self.test = {}
 27 |         candidate = []
 28 |         for r in self.range_dict:
 29 |             candidate.append(r[0])
 30 |             candidate.append(r[1])
 31 | 
 32 |         if self.trend == 'up':
 33 |             self.candidate = sorted(list(set(candidate)),reverse=False)
 34 |         else:
 35 |             self.candidate = sorted(list(set(candidate)), reverse=True)
 36 | 
 37 |         cut_list, v = self.find_cut(sby=sby)
 38 |         self.cut_range = [cut_list[-1]]
 39 |         self.cut_range.append(self.candidate[0])
 40 |         self.cut_range.append(self.candidate[-1])
 41 |         for d in cut_list:
 42 |             self.candidate.remove(d)
 43 | 
 44 |         while True:
 45 |             cut_list, v = self.find_cut(sby=sby)
 46 |             if len(cut_list)>0:
 47 |                 self.cut_range.append(cut_list[-1])
 48 |                 self.cut_range = sorted(list(set(self.cut_range)))
 49 |                 for d in cut_list:
 50 |                     self.candidate.remove(d)
 51 |             else:
 52 |                 break
 53 |         self.bins = np.array(sorted(list(set(self.cut_range))))
 54 |         print(self.test)
 55 | 
 56 |     def find_cut(self,trend='up',sby='woe'):
 57 |         '''
 58 |         :param minv: 最小分裂所需数值，woe/iv
 59 |         :param sby: 'woe','iv','woeiv'
 60 |         :param iv_base: 上一轮的iv值，sby='woe'时不用考虑
 61 |         :return:
 62 |         '''
 63 |         cut_list = []
 64 |         cut = None
 65 |         minv = 0 # bad rate
 66 |         for c in self.candidate:
 67 |             if c != self.candidate[0]:
 68 |                 if trend == 'up':
 69 |                     v = self.value[(self.x<c) & (self.x>=self.candidate[0])]
 70 |                 else:
 71 |                     v = self.value[(self.x >=c) & (self.x < self.candidate[0])]
 72 | 
 73 |                 if len(v)>0:
 74 |                     if sby == 'woe':
 75 |                         badr = self._cal_woe(v)
 76 |                     elif badr == 'bad':
 77 |                         badr = len(v[v == self.bad]) / len(v)
 78 |                     else:
 79 |                         badr = 0
 80 |                 else:
 81 |                     badr = 0
 82 |                 self.test[c] = badr
 83 |                 if badr>=minv:
 84 |                     minv = badr
 85 |                     cut_list.append(c)
 86 |                 else:
 87 |                     break
 88 |         return cut_list, minv
 89 | 
 90 |     def candidateTrend(self):
 91 |         trend_up = 0
 92 |         trend_down = 0
 93 | 
 94 |         candidate_list = copy.deepcopy(self.candidate)
 95 |         for i in range(1, len(candidate_list) - 1):
 96 |             v_up = self.value[(self.x < candidate_list[i]) & (self.x >= candidate_list[i - 1])]
 97 |             v_down = self.value[(self.x < candidate_list[i + 1]) & (self.x >= candidate_list[i])]
 98 |             woe_up = self._cal_woe(v_up)
 99 |             woe_down = self._cal_woe(v_down)
100 |             iv_up = self._cal_iv(v_up)
101 |             iv_down = self._cal_iv(v_down)
102 |             if woe_up > woe_down:
103 |                 trend_up += 1
104 |             elif woe_up < woe_down:
105 |                 trend_down += 1
106 |         if trend_up > trend_down:
107 |             self.trend = 'up'
108 |         else:
109 |             self.trend = 'down'
110 | 


--------------------------------------------------------------------------------
/utils/backwardSplit.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | from .trendSplit import *
  3 | import numpy as np
  4 | import copy
  5 | import math
  6 | from scipy import stats
  7 | 
  8 | class backwardSplit(trendSplit):
  9 |     def __init__(self, x, y, bad=1,missing=None, force=False):
 10 |         trendSplit.__init__(self, x, y, bad, missing, force)
 11 | 
 12 |     def fit(self, init_split=0, num_split=0, minv=0, sby='iv'):
 13 |         '''
 14 |         :param init_split: 备选初始切割点数
 15 |         :param num_split: 最大切割点数,不包含最大最小值
 16 |         :param minv: 最小分裂所需数值，woe/iv
 17 |         :param sby: 'iv','chi'
 18 |         :param min_sample: 每个分箱最小样本数
 19 |         :return: numpy array -- 切割点数组
 20 |         '''
 21 |         self.set_init()
 22 | 
 23 |         if init_split == 0 or len(self.x) <= init_split:
 24 |             self.everysplit()
 25 |         else:
 26 |             self.equalSize(init_split)
 27 | 
 28 |         candidate = []
 29 |         for r in self.range_dict:
 30 |             candidate.append(r[0])
 31 |             candidate.append(r[1])
 32 |         self.candidate = sorted(list(set(candidate)))
 33 | 
 34 |         param = {'minv': minv, 'sby': sby}
 35 |         cut = self.find_cut(**param)
 36 | 
 37 |         if cut:
 38 |             self.candidate.remove(cut)
 39 |             while True:
 40 |                 cut = self.find_cut(**param)
 41 | 
 42 |                 if cut:
 43 |                     self.candidate.remove(cut)
 44 |                 else:
 45 |                     break
 46 | 
 47 |                 if num_split:
 48 |                     #print(len(set(self.candidate)),num_split, self.candidate)
 49 |                     #print()
 50 |                     if len(set(self.candidate)) <= num_split:
 51 |                         break
 52 | 
 53 |             self.bins = np.array(sorted(list(set(self.candidate))))
 54 |         else:
 55 |             self.bins = None
 56 | 
 57 | 
 58 |     def find_cut(self,minv=0, num_split=0, sby='iv'):
 59 |         '''
 60 |         :param minv: 最小分裂所需数值，woe/iv
 61 |         :param sby: 'iv','chi'
 62 |         :param num_split: 最大切割点数,不包含最大最小值
 63 |         :return:
 64 |         '''
 65 | 
 66 |         cut = None
 67 |         if sby == 'chi':
 68 |             minv = 10**7
 69 |         if len(self.candidate)>num_split:
 70 |             for i in range(1, len(self.candidate)-1):
 71 |                 candidate_list = [c for c in self.candidate if c!=self.candidate[i]]
 72 |                 down_idx = candidate_list.index(self.candidate[i+1])
 73 |                 iv_range = tuple(candidate_list)
 74 |                 near_range_before = (self.candidate[i-1], self.candidate[i], self.candidate[i+1])
 75 | 
 76 |                 if sby=='iv':
 77 |                     iv = self.cal_iv_by_range(iv_range)
 78 |                     if iv > minv:
 79 |                         minv = iv
 80 |                         cut = self.candidate[i]
 81 |                 elif sby=='chi':
 82 |                     chi_v = self.cal_chisquare_by_range(near_range_before)
 83 |                     if chi_v < minv:
 84 |                         minv = chi_v
 85 |                         cut = self.candidate[i]
 86 |         return cut
 87 | 
 88 |     def fit_by_spearman(self, init_split=0, min_v=10):
 89 |         '''
 90 |         :param init_split:
 91 |         :param max_v:
 92 |         :return:
 93 |         '''
 94 |         target_dict = None
 95 |         self.set_init()
 96 |         if init_split == 0 or len(self.x) <= init_split:
 97 |             n_split = int(len(self.x)/2)
 98 |         else:
 99 |             n_split = init_split
100 | 
101 |         while n_split>=min_v:
102 |             self.equalSize(n_split)
103 |             x_mean = []
104 |             y_mean = []
105 |             for r in self.range_dict:
106 |                 if self.range_dict[r] == max(self.range_dict.values()):
107 |                     x_mean.append(np.nanmean(self.x[(self.x <= r[1]) & (self.x >= r[0])]))
108 |                     y_mean.append(np.nanmean(self.value[(self.x <= r[1]) & (self.x >= r[0])]))
109 |                 else:
110 |                     x_mean.append(np.nanmean(self.x[(self.x < r[1]) & (self.x >= r[0])]))
111 |                     y_mean.append(np.nanmean(self.value[(self.x < r[1]) & (self.x >= r[0])]))
112 | 
113 |             #print(stats.spearmanr(x_mean, y_mean))
114 |             if abs(stats.spearmanr(x_mean, y_mean)[0]) > 0.999:
115 |                 target_dict = self.range_dict
116 |             n_split -= 1
117 | 
118 |         candidate = []
119 |         for r in target_dict:
120 |             candidate.append(r[0])
121 |             candidate.append(r[1])
122 |         self.candidate = sorted(list(set(candidate)))
123 |         self.bins = np.array(sorted(list(set(self.candidate))))


--------------------------------------------------------------------------------
/autoBinning/utils/backwardSplit.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | from .trendSplit import *
  3 | import numpy as np
  4 | import copy
  5 | import math
  6 | from scipy import stats
  7 | 
  8 | class backwardSplit(trendSplit):
  9 |     def __init__(self, x, y, bad=1,missing=None, force=False):
 10 |         trendSplit.__init__(self, x, y, bad, missing, force)
 11 | 
 12 |     def fit(self, init_split=0, num_split=0, minv=0, sby='iv'):
 13 |         '''
 14 |         :param init_split: 备选初始切割点数
 15 |         :param num_split: 最大切割点数,不包含最大最小值
 16 |         :param minv: 最小分裂所需数值，woe/iv
 17 |         :param sby: 'iv','chi'
 18 |         :param min_sample: 每个分箱最小样本数
 19 |         :return: numpy array -- 切割点数组
 20 |         '''
 21 |         self.set_init()
 22 | 
 23 |         if init_split == 0 or len(self.x) <= init_split:
 24 |             self.everysplit()
 25 |         else:
 26 |             self.equalSize(init_split)
 27 | 
 28 |         candidate = []
 29 |         for r in self.range_dict:
 30 |             candidate.append(r[0])
 31 |             candidate.append(r[1])
 32 |         self.candidate = sorted(list(set(candidate)))
 33 | 
 34 |         param = {'minv': minv, 'sby': sby}
 35 |         cut = self.find_cut(**param)
 36 | 
 37 |         if cut:
 38 |             self.candidate.remove(cut)
 39 |             while True:
 40 |                 cut = self.find_cut(**param)
 41 | 
 42 |                 if cut:
 43 |                     self.candidate.remove(cut)
 44 |                 else:
 45 |                     break
 46 | 
 47 |                 if num_split:
 48 |                     #print(len(set(self.candidate)),num_split, self.candidate)
 49 |                     #print()
 50 |                     if len(set(self.candidate)) <= num_split:
 51 |                         break
 52 | 
 53 |             self.bins = np.array(sorted(list(set(self.candidate))))
 54 |         else:
 55 |             self.bins = None
 56 | 
 57 | 
 58 |     def find_cut(self,minv=0, num_split=0, sby='iv'):
 59 |         '''
 60 |         :param minv: 最小分裂所需数值，woe/iv
 61 |         :param sby: 'iv','chi'
 62 |         :param num_split: 最大切割点数,不包含最大最小值
 63 |         :return:
 64 |         '''
 65 | 
 66 |         cut = None
 67 |         if sby == 'chi':
 68 |             minv = 10**7
 69 |         if len(self.candidate)>num_split:
 70 |             for i in range(1, len(self.candidate)-1):
 71 |                 candidate_list = [c for c in self.candidate if c!=self.candidate[i]]
 72 |                 down_idx = candidate_list.index(self.candidate[i+1])
 73 |                 iv_range = tuple(candidate_list)
 74 |                 near_range_before = (self.candidate[i-1], self.candidate[i], self.candidate[i+1])
 75 | 
 76 |                 if sby=='iv':
 77 |                     iv = self.cal_iv_by_range(iv_range)
 78 |                     if iv > minv:
 79 |                         minv = iv
 80 |                         cut = self.candidate[i]
 81 |                 elif sby=='chi':
 82 |                     chi_v = self.cal_chisquare_by_range(near_range_before)
 83 |                     if chi_v < minv:
 84 |                         minv = chi_v
 85 |                         cut = self.candidate[i]
 86 |         return cut
 87 | 
 88 |     def fit_by_spearman(self, init_split=0, min_v=10):
 89 |         '''
 90 |         :param init_split:
 91 |         :param max_v:
 92 |         :return:
 93 |         '''
 94 |         target_dict = None
 95 |         self.set_init()
 96 |         if init_split == 0 or len(self.x) <= init_split:
 97 |             n_split = int(len(self.x)/2)
 98 |         else:
 99 |             n_split = init_split
100 | 
101 |         while n_split>=min_v:
102 |             self.equalSize(n_split)
103 |             x_mean = []
104 |             y_mean = []
105 |             for r in self.range_dict:
106 |                 if self.range_dict[r] == max(self.range_dict.values()):
107 |                     x_mean.append(np.nanmean(self.x[(self.x <= r[1]) & (self.x >= r[0])]))
108 |                     y_mean.append(np.nanmean(self.value[(self.x <= r[1]) & (self.x >= r[0])]))
109 |                 else:
110 |                     x_mean.append(np.nanmean(self.x[(self.x < r[1]) & (self.x >= r[0])]))
111 |                     y_mean.append(np.nanmean(self.value[(self.x < r[1]) & (self.x >= r[0])]))
112 | 
113 |             #print(stats.spearmanr(x_mean, y_mean))
114 |             if abs(stats.spearmanr(x_mean, y_mean)[0]) > 0.999:
115 |                 target_dict = self.range_dict
116 |             n_split -= 1
117 | 
118 |         candidate = []
119 |         for r in target_dict:
120 |             candidate.append(r[0])
121 |             candidate.append(r[1])
122 |         self.candidate = sorted(list(set(candidate)))
123 |         self.bins = np.array(sorted(list(set(self.candidate))))


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # auto binning 分箱工具
  2 | 
  3 | ## 安装
  4 | 
  5 |     pip install autoBinning
  6 | 
  7 | ## 基础工具 (simpleMethods)
  8 | 
  9 |     from autoBinning.utils.simpleMethods import *
 10 |     my_list = [1,1,2,2,2,2,3,3,4,5,6,7,8,9,10,10,20,20,20,20,30,30,40,50,60,70,80,90,100]
 11 |     my_list_y = [1,1,2,2,2,2,1,1,1,2,2,2,1,1]
 12 |     t = simpleMethods(my_list)
 13 |     t.equalSize(3)
 14 |     # 每个分箱样本数平均
 15 |     print(t.bins) # [  1.           5.33333333  20.         100.        ]
 16 |     # 等间距划分分箱
 17 |     t.equalValue(4)
 18 |     print(t.bins) # [  1.    25.75  50.5   75.25 100.  ]
 19 |     # 基于numpy histogram分箱
 20 |     t.equalHist(4)
 21 |     print(t.bins) # [  1.    25.75  50.5   75.25 100.  ]
 22 | 
 23 | ## 基于标签的有监督自动分箱
 24 | 
 25 | ### 向前迭代方法 (forward method)
 26 | 
 27 |     # load data
 28 |     import pandas as pd
 29 |     df = pd.read_csv('credit_old.csv')
 30 |     df = df[['Age','target']]
 31 |     df = df.dropna()
 32 | 
 33 | #### 基于最大woe分裂分箱
 34 | 
 35 | 在得到尽可能细粒度的细分箱之后，寻找上下分箱woe差异最大的初始切割点，并得到woe趋势，之后迭代找到下一个woe差异最大且趋势相同的切割点，直到满足woe差异不大于一个阈值或分箱数（切割点数）满足要求
 36 | 
 37 |     from autoBinning.utils.forwardSplit import *
 38 |     t = forwardSplit(df['Age'], df['target'])
 39 |     t.fit(sby='woe',minv=0.01,init_split=20)
 40 |     print(t.bins) # [16. 25. 29. 33. 36. 38. 40. 42. 44. 46. 48. 50. 52. 54. 55. 58. 60. 63. 72. 94.]
 41 |     t = forwardSplit(df['Age'], df['target'])
 42 |     t.fit(sby='woe',num_split=4,init_split=20)
 43 |     print(t.bins) # [16. 42. 44. 48. 50. 94.]
 44 |     print("bin\twoe")
 45 |     for i in range(len(t.bins)-1):
 46 |         v = t.value[(t.x < t.bins[i+1]) & (t.x >= t.bins[i])]
 47 |         woe = t._cal_woe(v)
 48 |         print((t.bins[i], t.bins[i+1]),woe)
 49 | 
 50 |     bin	woe
 51 |     (16.0, 25.0) 0.11373232830301286
 52 |     (25.0, 42.0) 0.07217546872710079
 53 |     (42.0, 50.0) 0.04972042405868509
 54 |     (50.0, 72.0) -0.07172614369435065
 55 |     (72.0, 94.0) -0.13778318584223453
 56 | 
 57 | ![avatar](https://github.com/kaiwang0112006/autoBinning/blob/master/doc/woe1.JPG)
 58 | ![avatar](https://github.com/kaiwang0112006/autoBinning/blob/master/doc/woe2.JPG)
 59 | 
 60 | #### 基于最大iv分裂分箱
 61 | 
 62 | 与最大woe分裂分箱方法类似，在得到尽可能细粒度的细分箱之后，寻找iv值最大的切割点，并得到woe趋势，之后迭代找到下一个iv最大且woe趋势相同的切割点，直到分箱数（切割点数）满足要求
 63 | 
 64 |     from autoBinning.utils.forwardSplit import *
 65 |     # sby='woeiv'时考虑woe趋势，sby='iv'时不考虑woe趋势
 66 |     t = forwardSplit(df['Age'], df['target'])
 67 |     t.fit(sby='iv',minv=0.1,init_split=20)
 68 |     print(t.bins) # [16. 25. 29. 33. 36. 38. 40. 42. 44. 46. 48. 50. 58. 60. 63. 94.]
 69 |     t = forwardSplit(df['Age'], df['target'])
 70 |     t.fit(sby='iv',num_split=4,init_split=20)
 71 |     print(t.bins) # [16. 25. 33. 36. 38. 94.]
 72 |     t.fit(sby='woeiv',num_split=4,init_split=20)
 73 |     print(t.bins) # [16. 25. 33. 36. 38. 94.]
 74 | 
 75 |     print("bin\twoe")
 76 |     for i in range(len(t.bins)-1):
 77 |         v = t.value[(t.x < t.bins[i+1]) & (t.x >= t.bins[i])]
 78 |         woe = t._cal_woe(v)
 79 |         print((t.bins[i], t.bins[i+1]),woe)
 80 | 
 81 |     bin	woe
 82 |     (16.0, 25.0) 0.11373232830301286
 83 |     (25.0, 33.0) 0.06679187564362839
 84 |     (33.0, 40.0) 0.06638021747875023
 85 |     (40.0, 50.0) 0.05894173616389541
 86 |     (50.0, 94.0) -0.07934608583946329
 87 | 
 88 |     t = forwardSplit(df['Branch'], df['target'],missing=-1,categorical=True)
 89 |     t.fit(sby='woeiv',minv=0,init_split=0,num_split=4) # [['B19'], ['B15'], ['B14'], ['B16'], ['B7', 'B18', 'B2', 'B9', 'B5', 'B6', 'B1', 'B17', 'B4', 'B10', 'B8', 'B3', 'B12', 'B13', 'B11']]
 90 | 
 91 | ### 向后迭代方法 (backward method)
 92 | 
 93 | #### 基于最大iv合并分箱
 94 | 
 95 | 每次迭代删除一个分箱切点，使去掉该切点后整体iv最大
 96 | 
 97 |     from autoBinning.utils.backwardSplit import *
 98 |     t = backwardSplit(df['Age'], df['target'])
 99 |     t.fit(sby='iv',num_split=5)
100 |     print(t.bins) # [16.  17.5 18.5 85.5 95. ]
101 | 
102 | #### 基于卡方检验的合并分箱
103 | 
104 | 1\. 得到尽可能细粒度的细分箱切点
105 | 
106 | 2\. 每个切点计算上下相邻分箱的卡方检验值
107 | 
108 | 3\. 将卡方检验值最低的两个分箱合并
109 | 
110 | 4\. 重复第2、3步，直到分箱切点数达到预设的阈值
111 | 
112 | 1\. First the input range is initialized by splitting
113 | it into sub-intervals with each sample
114 | getting its own interval.
115 | 
116 | 2\. For every pair of adjacent sub-intervals a
117 | chi-square value is computed.
118 | 
119 | 3\. Merge pair with lowest chi-square into single bin.
120 | 
121 | 4\. Repeat steps 2 and 3 until the number of bins meets the predefined threshold.
122 | 
123 |     from autoBinning.utils.backwardSplit import *
124 |     t = backwardSplit(df['Age'], df['target'])
125 |     t.fit(sby='chi',num_split=7)
126 |     print(t.bins) # [16.  72.5 73.5 87.5 89.5 90.5 95. ]
127 | 
128 | #### 基于spearman相关性做向后等频分箱
129 | 
130 |     from autoBinning.utils.backwardSplit import *
131 |     t = backwardSplit(df['Age'], df['target'])
132 |     t.fit_by_spearman(min_v=5, init_split=20)
133 | 


--------------------------------------------------------------------------------
/utils/forwardSplit.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | from .trendSplit import *
  3 | import numpy as np
  4 | import copy
  5 | import math
  6 | 
  7 | 
  8 | class forwardSplit(trendSplit):
  9 |     def __init__(self, x, y, bad=1,missing=None, force=False, categorical=False):
 10 |         trendSplit.__init__(self, x, y, bad, missing, force, categorical=categorical)
 11 | 
 12 |     def fit(self, init_split=0, num_split=0, minv=0, sby='woe', min_sample=0):
 13 |         '''
 14 |         :param num_split: 最大切割点数,不包含最大最小值
 15 |         :param minv: 最小分裂所需数值，woe/iv
 16 |         :param sby: 'woe','iv','woeiv'
 17 |         :param min_sample: 每个分箱最小样本数
 18 |         :return: numpy array -- 切割点数组
 19 |         '''
 20 |         self.set_init()
 21 | 
 22 |         if init_split == 0 or len(self.x) <= init_split or self.categorical:
 23 |             self.everysplit()
 24 |         else:
 25 |             self.equalSize(init_split)
 26 | 
 27 |         candidate = []
 28 |         for r in self.range_dict:
 29 |             candidate.append(r[0])
 30 |             candidate.append(r[1])
 31 |         self.candidate = sorted(list(set(candidate)))
 32 | 
 33 |         param = {'minv': minv, 'sby': sby, 'min_sample':min_sample}
 34 |         cut, iv = self.find_cut(**param)
 35 |         param['iv_base'] = iv
 36 |         self.cut_range = [cut]
 37 |         if cut!=None:
 38 |             self.candidate.remove(cut)
 39 |             while True:
 40 |                 cut, iv = self.find_cut(**param)
 41 |                 param['iv_base'] = iv
 42 |                 if cut:
 43 |                     self.cut_range.append(cut)
 44 |                     self.cut_range = sorted(list(set(self.cut_range)))
 45 |                     self.candidate.remove(cut)
 46 |                 else:
 47 |                     break
 48 | 
 49 |                 if num_split:
 50 |                     if len(set(self.cut_range)) >= num_split:
 51 |                         break
 52 | 
 53 |             self.cut_range.append(self.candidate[0])
 54 |             self.cut_range.append(self.candidate[-1])
 55 |             self.bins = sorted(list(set(self.cut_range)))
 56 |             if self.categorical:
 57 |                 binslist = list(self.bins)
 58 |                 binsvalue = []
 59 |                 for i in range(len(binslist)-1):
 60 |                     v = []
 61 |                     for k in self.xmap:
 62 |                         if self.xmap[k]>=binslist[i] and self.xmap[k]<binslist[i+1]:
 63 |                             v.append(k)
 64 |                     binsvalue.append(v)
 65 |                 self.bins = copy.deepcopy(binsvalue)
 66 |             else:
 67 |                 self.bins = np.array(self.bins)
 68 |         else:
 69 |             self.cut_range = None
 70 |             self.bins = None
 71 | 
 72 | 
 73 |     def find_cut(self,minv=None, sby='iv',iv_base=0, min_sample=0):
 74 |         '''
 75 |         :param minv: 最小分裂所需数值，woe/iv
 76 |         :param sby: 'woe','iv','woeiv'
 77 |         :param iv_base: 上一轮的iv值，sby='woe'时不用考虑
 78 |         :return:
 79 |         '''
 80 |         if not minv:
 81 |             minv = 0
 82 |         if ((sby== 'woe') or (sby == 'woeiv')) and (not self.trend) and len(self.cut_range)>0:
 83 |             self.candidateTrend(self.cut_range)
 84 | 
 85 |         iv = 0
 86 | 
 87 |         cut = None
 88 |         result = {}
 89 | 
 90 |         for i in range(1, len(self.candidate) - 1):
 91 |             if len(self.cut_range) == 0:
 92 |                 woe_range = (self.candidate[0]-0.1, self.candidate[i], self.candidate[-1])
 93 |                 iv_range = (self.candidate[0]-0.1, self.candidate[i], self.candidate[-1])
 94 |             else:
 95 |                 range_list = sorted([self.candidate[0]-0.1, self.candidate[i], self.candidate[-1]]+list(self.cut_range))
 96 |                 canidx = range_list.index(self.candidate[i])
 97 |                 #woe_range = (self.candidate[0], self.candidate[i], self.candidate[-1]+1)
 98 |                 woe_range = (range_list[canidx-1], range_list[canidx], range_list[canidx+1])
 99 |                 iv_range = tuple(range_list)
100 | 
101 |             if len(self.value[(self.x < woe_range[1]) & (self.x >= woe_range[0])]) > min_sample and \
102 |                     len(self.value[(self.x < woe_range[2]) & (self.x >= woe_range[1])]) > min_sample:
103 |                 if sby == 'woe':
104 |                     woe = self.cal_woe_by_range(woe_range)
105 |                     if woe > minv:
106 |                         minv = woe
107 |                         cut = self.candidate[i]
108 |                 elif sby == 'iv':
109 |                     iv = self.cal_iv_by_range(iv_range)
110 |                     result[self.candidate[i]] = iv
111 |                     if iv > minv and iv > iv_base:
112 |                         minv = iv
113 |                         cut = self.candidate[i]
114 |                 else:
115 |                     is_trend_tag = False
116 |                     if self.trend in ('up','down'):
117 |                         up_count, down_count = self.candidateTrend(list(iv_range)[1:-1])
118 |                         if (self.trend == 'up' and down_count==0 and up_count>0) or \
119 |                            (self.trend == 'down' and down_count > 0 and up_count == 0):
120 |                             is_trend_tag = True
121 | 
122 |                     iv = self.cal_iv_by_range(iv_range)
123 |                     #print(is_trend_tag, self.trend not in ('up','down'), self.trend)
124 |                     #print((is_trend_tag or self.trend not in ('up','down')))
125 |                     if (is_trend_tag or self.trend not in ('up','down')) and iv > minv:
126 |                         minv = iv
127 |                         cut = self.candidate[i]
128 | 
129 |         return cut, iv
130 | 


--------------------------------------------------------------------------------
/autoBinning/utils/forwardSplit.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | from .trendSplit import *
  3 | import numpy as np
  4 | import copy
  5 | import math
  6 | 
  7 | 
  8 | class forwardSplit(trendSplit):
  9 |     def __init__(self, x, y, bad=1,missing=None, force=False, categorical=False):
 10 |         trendSplit.__init__(self, x, y, bad, missing, force, categorical=categorical)
 11 | 
 12 |     def fit(self, init_split=0, num_split=0, minv=0, sby='woe', min_sample=0):
 13 |         '''
 14 |         :param num_split: 最大切割点数,不包含最大最小值
 15 |         :param minv: 最小分裂所需数值，woe/iv
 16 |         :param sby: 'woe','iv','woeiv'
 17 |         :param min_sample: 每个分箱最小样本数
 18 |         :return: numpy array -- 切割点数组
 19 |         '''
 20 |         self.set_init()
 21 | 
 22 |         if init_split == 0 or len(self.x) <= init_split or self.categorical:
 23 |             self.everysplit()
 24 |         else:
 25 |             self.equalSize(init_split)
 26 | 
 27 |         candidate = []
 28 |         for r in self.range_dict:
 29 |             candidate.append(r[0])
 30 |             candidate.append(r[1])
 31 |         self.candidate = sorted(list(set(candidate)))
 32 | 
 33 |         param = {'minv': minv, 'sby': sby, 'min_sample':min_sample}
 34 |         cut, iv = self.find_cut(**param)
 35 |         param['iv_base'] = iv
 36 |         self.cut_range = [cut]
 37 |         if cut!=None:
 38 |             self.candidate.remove(cut)
 39 |             while True:
 40 |                 cut, iv = self.find_cut(**param)
 41 |                 param['iv_base'] = iv
 42 |                 if cut:
 43 |                     self.cut_range.append(cut)
 44 |                     self.cut_range = sorted(list(set(self.cut_range)))
 45 |                     self.candidate.remove(cut)
 46 |                 else:
 47 |                     break
 48 | 
 49 |                 if num_split:
 50 |                     if len(set(self.cut_range)) >= num_split:
 51 |                         break
 52 | 
 53 |             self.cut_range.append(self.candidate[0])
 54 |             self.cut_range.append(self.candidate[-1])
 55 |             self.bins = sorted(list(set(self.cut_range)))
 56 |             if self.categorical:
 57 |                 binslist = list(self.bins)
 58 |                 binsvalue = []
 59 |                 for i in range(len(binslist)-1):
 60 |                     v = []
 61 |                     for k in self.xmap:
 62 |                         if self.xmap[k]>=binslist[i] and self.xmap[k]<binslist[i+1]:
 63 |                             v.append(k)
 64 |                     binsvalue.append(v)
 65 |                 self.bins = copy.deepcopy(binsvalue)
 66 |             else:
 67 |                 self.bins = np.array(self.bins)
 68 |         else:
 69 |             self.cut_range = None
 70 |             self.bins = None
 71 | 
 72 | 
 73 |     def find_cut(self,minv=None, sby='iv',iv_base=0, min_sample=0):
 74 |         '''
 75 |         :param minv: 最小分裂所需数值，woe/iv
 76 |         :param sby: 'woe','iv','woeiv'
 77 |         :param iv_base: 上一轮的iv值，sby='woe'时不用考虑
 78 |         :return:
 79 |         '''
 80 |         if not minv:
 81 |             minv = 0
 82 |         if ((sby== 'woe') or (sby == 'woeiv')) and (not self.trend) and len(self.cut_range)>0:
 83 |             self.candidateTrend(self.cut_range)
 84 | 
 85 |         iv = 0
 86 | 
 87 |         cut = None
 88 |         result = {}
 89 | 
 90 |         for i in range(1, len(self.candidate) - 1):
 91 |             if len(self.cut_range) == 0:
 92 |                 woe_range = (self.candidate[0]-0.1, self.candidate[i], self.candidate[-1])
 93 |                 iv_range = (self.candidate[0]-0.1, self.candidate[i], self.candidate[-1])
 94 |             else:
 95 |                 range_list = sorted([self.candidate[0]-0.1, self.candidate[i], self.candidate[-1]]+list(self.cut_range))
 96 |                 canidx = range_list.index(self.candidate[i])
 97 |                 #woe_range = (self.candidate[0], self.candidate[i], self.candidate[-1]+1)
 98 |                 woe_range = (range_list[canidx-1], range_list[canidx], range_list[canidx+1])
 99 |                 iv_range = tuple(range_list)
100 | 
101 |             if len(self.value[(self.x < woe_range[1]) & (self.x >= woe_range[0])]) > min_sample and \
102 |                     len(self.value[(self.x < woe_range[2]) & (self.x >= woe_range[1])]) > min_sample:
103 |                 if sby == 'woe':
104 |                     woe = self.cal_woe_by_range(woe_range)
105 |                     if woe > minv:
106 |                         minv = woe
107 |                         cut = self.candidate[i]
108 |                 elif sby == 'iv':
109 |                     iv = self.cal_iv_by_range(iv_range)
110 |                     result[self.candidate[i]] = iv
111 |                     if iv > minv and iv > iv_base:
112 |                         minv = iv
113 |                         cut = self.candidate[i]
114 |                 else:
115 |                     is_trend_tag = False
116 |                     if self.trend in ('up','down'):
117 |                         up_count, down_count = self.candidateTrend(list(iv_range)[1:-1])
118 |                         if (self.trend == 'up' and down_count==0 and up_count>0) or \
119 |                            (self.trend == 'down' and down_count > 0 and up_count == 0):
120 |                             is_trend_tag = True
121 | 
122 |                     iv = self.cal_iv_by_range(iv_range)
123 |                     #print(is_trend_tag, self.trend not in ('up','down'), self.trend)
124 |                     #print((is_trend_tag or self.trend not in ('up','down')))
125 |                     if (is_trend_tag or self.trend not in ('up','down')) and iv > minv:
126 |                         minv = iv
127 |                         cut = self.candidate[i]
128 | 
129 |         return cut, iv
130 | 


--------------------------------------------------------------------------------
/utils/trendSplit.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | from .simpleMethods import *
  3 | import numpy as np
  4 | import copy
  5 | import math
  6 | 
  7 | 
  8 | class trendSplit(simpleMethods):
  9 |     def __init__(self, x, y, bad=1,missing=None,force=False, categorical=False):
 10 |         simpleMethods.__init__(self, x,  missing=missing,force=force, categorical=categorical)
 11 |         self.y = y
 12 |         self.xmap = {}
 13 |         self.categorical = categorical
 14 |         self.bad=bad
 15 |         self.set_init(categorical=categorical)
 16 | 
 17 |     def set_init(self, categorical=False):
 18 |         self.cut_range = []
 19 |         self.trend = None
 20 |         self.value = np.array(self.y)
 21 | 
 22 |         if categorical:
 23 |             xset = set(self.x)
 24 |             vmap = {}
 25 |             for v in xset:
 26 |                 v_filter = self.value[self.x_org == v]
 27 |                 vmap[v] = len(v_filter[v_filter==self.bad])/len(v_filter)
 28 |             v_sort = sorted(list(vmap.keys()), key=lambda x: vmap[x], reverse=True)
 29 |             self.xmap = dict(zip(v_sort, range(len(v_sort))))
 30 |             self.xmap_inverse = dict(zip(range(len(v_sort)),v_sort))
 31 |             self.x_idx = copy.deepcopy(self.x)
 32 |             self.x = np.array([self.xmap[i] for i in list(self.x)])
 33 | 
 34 |         if self.missing == None:
 35 |             self.value_miss = None
 36 |         else:
 37 |             self.value_miss = self.value[self.x_org == self.missing]
 38 |             self.value = self.value[self.x_org!=self.missing]
 39 | 
 40 |         self.allbad = len(self.value[self.value == self.bad])  # 好样本总数
 41 |         self.allgood = len(self.value) - self.allbad  # 坏样本总数
 42 |         self.candidate = []
 43 |         self.woe_cache = {}
 44 |         self.iv_cache = {}
 45 |         self.chisquare_cache = {}
 46 |         self.know_box = {}
 47 | 
 48 | 
 49 |     def cal_woe_by_range(self,wrange):
 50 |         '''
 51 |         根据切点范围(start, mid, end)计算woe
 52 |         :param wrange:
 53 |         :param trend:
 54 |         :return:
 55 |         '''
 56 |         woe_up = self.cal_woe_by_start_end(wrange[0], wrange[1])
 57 |         woe_down = self.cal_woe_by_start_end(wrange[1], wrange[2])
 58 | 
 59 |         if self.trend == 'up':
 60 |             woe_sub = woe_up - woe_down
 61 |         elif self.trend == 'down':
 62 |             woe_sub = woe_down - woe_up
 63 |         else:
 64 |             woe_sub = abs(woe_down - woe_up)
 65 |         return woe_sub
 66 | 
 67 |     def cal_iv_by_range(self,vrange):
 68 |         '''
 69 |         根据切点范围(start, mid, end)计算iv
 70 |         :param vrange:
 71 |         :param bad:
 72 |         :return:
 73 |         '''
 74 |         iv_split = 0
 75 |         result = []
 76 |         for j in range(len(vrange)-1):
 77 |             if (vrange[j], vrange[j+1]) not in self.iv_cache:
 78 |                 vvalue = self.value[(self.x < vrange[j+1]) & (self.x >= vrange[j])]
 79 |                 iv_box = self._cal_iv(vvalue)
 80 |                 self.iv_cache[(vrange[j], vrange[j+1])] = iv_box
 81 |             else:
 82 |                 iv_box = self.iv_cache[(vrange[j], vrange[j+1])]
 83 |             result.append(iv_box)
 84 |             iv_split += iv_box
 85 | 
 86 |         return iv_split
 87 | 
 88 |     def cal_woe_by_start_end(self, start, end):
 89 |         if (start, end) not in self.woe_cache:
 90 |             vvalue = self.value[(self.x < end) & (self.x >= start)]
 91 |             woe_box = self._cal_woe(vvalue)
 92 |             self.woe_cache[(start, end)] = woe_box
 93 |         else:
 94 |             woe_box = self.woe_cache[(start, end)]
 95 |         return woe_box
 96 | 
 97 |     def _cal_woe(self,v):
 98 |         '''
 99 |         计算woe
100 |         :param v:
101 |         :param bad:
102 |         :return:
103 |         '''
104 |         bad_num = len(v[v == self.bad])
105 |         count_num = len(v)
106 | 
107 |         if count_num-bad_num==0 or self.allgood==0 or bad_num==0:
108 |             woe = 0
109 |         else:
110 |             woe = math.log((bad_num / (count_num - bad_num)) / (self.allbad / self.allgood))
111 |         return woe
112 | 
113 |     def _cal_iv(self, v):
114 |         '''
115 |         计算iv
116 |         :param v:
117 |         :param bad:
118 |         :return:
119 |         '''
120 |         bad_num = len(v[v == self.bad])
121 |         count_num = len(v)
122 | 
123 |         if count_num-bad_num == 0 or self.allgood==0 or bad_num==0:
124 |             iv = 0
125 |         else:
126 |             iv = (bad_num / (count_num - bad_num))*math.log((bad_num / (count_num - bad_num)) / (self.allbad / self.allgood))
127 |         return iv
128 | 
129 |     def candidateTrend(self,cut_range):
130 | 
131 |         trend_up = 0
132 |         trend_down = 0
133 |         result = {}
134 |         if len(cut_range) == 0:
135 |             candidate_list = copy.deepcopy(self.candidate)
136 |         else:
137 |             candidate_list = [self.candidate[0]] + copy.deepcopy(cut_range) + [self.candidate[-1]]
138 | 
139 |         for i in range(1,len(candidate_list) - 1):
140 |             woe_up = self.cal_woe_by_start_end(candidate_list[i-1], candidate_list[i])
141 |             woe_down = self.cal_woe_by_start_end(candidate_list[i], candidate_list[i+1])
142 | 
143 |             if woe_up>woe_down:
144 |                 trend_up += 1
145 |             elif woe_up<woe_down:
146 |                 trend_down += 1
147 |             result[candidate_list[i]] = (woe_up, woe_down)
148 | 
149 |         if trend_up>trend_down:
150 |             self.trend = 'up'
151 |         elif trend_up<trend_down:
152 |             self.trend = 'down'
153 | 
154 |         return trend_up, trend_down
155 | 
156 |     def cal_chisquare_by_range(self, chi_range):
157 |         if chi_range not in self.chisquare_cache:
158 |             v_up = self.value[(self.x < chi_range[1]) & (self.x >= chi_range[0])]
159 |             v_down = self.value[(self.x < chi_range[2]) & (self.x >= chi_range[1])]
160 |             all_num = len(v_up)+len(v_down)
161 |             up_bad = len(v_up[v_up==self.bad])
162 |             up_good = len(v_up)-up_bad
163 |             down_bad = len(v_down[v_down==self.bad])
164 |             down_good = len(v_down)-down_bad
165 |             all_g = up_good + down_good
166 |             all_b = up_bad + down_bad
167 | 
168 |             if len(v_up)==0 or len(v_down)==0:
169 |                 chisquare_value = 10**7
170 |             else:
171 |                 chisquare_value = (up_bad-len(v_up)*all_b/all_num)/len(v_up)*all_b/all_num + \
172 |                      (up_good-len(v_up)*all_g/all_num)/len(v_up)*all_g/all_num + \
173 |                      (down_good-len(v_down)*all_g/all_num)/len(v_down)*all_g/all_num + \
174 |                      (down_bad-len(v_down)*all_b/all_num)/len(v_down)*all_b/all_num
175 |             self.chisquare_cache[chi_range] = chisquare_value
176 |         else:
177 |             chisquare_value = self.chisquare_cache[chi_range]
178 |         return chisquare_value
179 | 


--------------------------------------------------------------------------------
/autoBinning/utils/trendSplit.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | from .simpleMethods import *
  3 | import numpy as np
  4 | import copy
  5 | import math
  6 | 
  7 | 
  8 | class trendSplit(simpleMethods):
  9 |     def __init__(self, x, y, bad=1,missing=None,force=False, categorical=False):
 10 |         simpleMethods.__init__(self, x,  missing=missing,force=force, categorical=categorical)
 11 |         self.y = y
 12 |         self.xmap = {}
 13 |         self.categorical = categorical
 14 |         self.bad=bad
 15 |         self.set_init(categorical=categorical)
 16 | 
 17 |     def set_init(self, categorical=False):
 18 |         self.cut_range = []
 19 |         self.trend = None
 20 |         self.value = np.array(self.y)
 21 | 
 22 |         if categorical:
 23 |             xset = set(self.x)
 24 |             vmap = {}
 25 |             for v in xset:
 26 |                 v_filter = self.value[self.x_org == v]
 27 |                 vmap[v] = len(v_filter[v_filter==self.bad])/len(v_filter)
 28 |             v_sort = sorted(list(vmap.keys()), key=lambda x: vmap[x], reverse=True)
 29 |             self.xmap = dict(zip(v_sort, range(len(v_sort))))
 30 |             self.xmap_inverse = dict(zip(range(len(v_sort)),v_sort))
 31 |             self.x_idx = copy.deepcopy(self.x)
 32 |             self.x = np.array([self.xmap[i] for i in list(self.x)])
 33 | 
 34 |         if self.missing == None:
 35 |             self.value_miss = None
 36 |         else:
 37 |             self.value_miss = self.value[self.x_org == self.missing]
 38 |             self.value = self.value[self.x_org!=self.missing]
 39 | 
 40 |         self.allbad = len(self.value[self.value == self.bad])  # 好样本总数
 41 |         self.allgood = len(self.value) - self.allbad  # 坏样本总数
 42 |         self.candidate = []
 43 |         self.woe_cache = {}
 44 |         self.iv_cache = {}
 45 |         self.chisquare_cache = {}
 46 |         self.know_box = {}
 47 | 
 48 | 
 49 |     def cal_woe_by_range(self,wrange):
 50 |         '''
 51 |         根据切点范围(start, mid, end)计算woe
 52 |         :param wrange:
 53 |         :param trend:
 54 |         :return:
 55 |         '''
 56 |         woe_up = self.cal_woe_by_start_end(wrange[0], wrange[1])
 57 |         woe_down = self.cal_woe_by_start_end(wrange[1], wrange[2])
 58 | 
 59 |         if self.trend == 'up':
 60 |             woe_sub = woe_up - woe_down
 61 |         elif self.trend == 'down':
 62 |             woe_sub = woe_down - woe_up
 63 |         else:
 64 |             woe_sub = abs(woe_down - woe_up)
 65 |         return woe_sub
 66 | 
 67 |     def cal_iv_by_range(self,vrange):
 68 |         '''
 69 |         根据切点范围(start, mid, end)计算iv
 70 |         :param vrange:
 71 |         :param bad:
 72 |         :return:
 73 |         '''
 74 |         iv_split = 0
 75 |         result = []
 76 |         for j in range(len(vrange)-1):
 77 |             if (vrange[j], vrange[j+1]) not in self.iv_cache:
 78 |                 vvalue = self.value[(self.x < vrange[j+1]) & (self.x >= vrange[j])]
 79 |                 iv_box = self._cal_iv(vvalue)
 80 |                 self.iv_cache[(vrange[j], vrange[j+1])] = iv_box
 81 |             else:
 82 |                 iv_box = self.iv_cache[(vrange[j], vrange[j+1])]
 83 |             result.append(iv_box)
 84 |             iv_split += iv_box
 85 | 
 86 |         return iv_split
 87 | 
 88 |     def cal_woe_by_start_end(self, start, end):
 89 |         if (start, end) not in self.woe_cache:
 90 |             vvalue = self.value[(self.x < end) & (self.x >= start)]
 91 |             woe_box = self._cal_woe(vvalue)
 92 |             self.woe_cache[(start, end)] = woe_box
 93 |         else:
 94 |             woe_box = self.woe_cache[(start, end)]
 95 |         return woe_box
 96 | 
 97 |     def _cal_woe(self,v):
 98 |         '''
 99 |         计算woe
100 |         :param v:
101 |         :param bad:
102 |         :return:
103 |         '''
104 |         bad_num = len(v[v == self.bad])
105 |         count_num = len(v)
106 | 
107 |         if count_num-bad_num==0 or self.allgood==0 or bad_num==0:
108 |             woe = 0
109 |         else:
110 |             woe = math.log((bad_num / (count_num - bad_num)) / (self.allbad / self.allgood))
111 |         return woe
112 | 
113 |     def _cal_iv(self, v):
114 |         '''
115 |         计算iv
116 |         :param v:
117 |         :param bad:
118 |         :return:
119 |         '''
120 |         bad_num = len(v[v == self.bad])
121 |         count_num = len(v)
122 | 
123 |         if count_num-bad_num == 0 or self.allgood==0 or bad_num==0:
124 |             iv = 0
125 |         else:
126 |             iv = (bad_num / (count_num - bad_num))*math.log((bad_num / (count_num - bad_num)) / (self.allbad / self.allgood))
127 |         return iv
128 | 
129 |     def candidateTrend(self,cut_range):
130 | 
131 |         trend_up = 0
132 |         trend_down = 0
133 |         result = {}
134 |         if len(cut_range) == 0:
135 |             candidate_list = copy.deepcopy(self.candidate)
136 |         else:
137 |             candidate_list = [self.candidate[0]] + copy.deepcopy(cut_range) + [self.candidate[-1]]
138 | 
139 |         for i in range(1,len(candidate_list) - 1):
140 |             woe_up = self.cal_woe_by_start_end(candidate_list[i-1], candidate_list[i])
141 |             woe_down = self.cal_woe_by_start_end(candidate_list[i], candidate_list[i+1])
142 | 
143 |             if woe_up>woe_down:
144 |                 trend_up += 1
145 |             elif woe_up<woe_down:
146 |                 trend_down += 1
147 |             result[candidate_list[i]] = (woe_up, woe_down)
148 | 
149 |         if trend_up>trend_down:
150 |             self.trend = 'up'
151 |         elif trend_up<trend_down:
152 |             self.trend = 'down'
153 | 
154 |         return trend_up, trend_down
155 | 
156 |     def cal_chisquare_by_range(self, chi_range):
157 |         if chi_range not in self.chisquare_cache:
158 |             v_up = self.value[(self.x < chi_range[1]) & (self.x >= chi_range[0])]
159 |             v_down = self.value[(self.x < chi_range[2]) & (self.x >= chi_range[1])]
160 |             all_num = len(v_up)+len(v_down)
161 |             up_bad = len(v_up[v_up==self.bad])
162 |             up_good = len(v_up)-up_bad
163 |             down_bad = len(v_down[v_down==self.bad])
164 |             down_good = len(v_down)-down_bad
165 |             all_g = up_good + down_good
166 |             all_b = up_bad + down_bad
167 | 
168 |             if len(v_up)==0 or len(v_down)==0:
169 |                 chisquare_value = 10**7
170 |             else:
171 |                 chisquare_value = (up_bad-len(v_up)*all_b/all_num)/len(v_up)*all_b/all_num + \
172 |                      (up_good-len(v_up)*all_g/all_num)/len(v_up)*all_g/all_num + \
173 |                      (down_good-len(v_down)*all_g/all_num)/len(v_down)*all_g/all_num + \
174 |                      (down_bad-len(v_down)*all_b/all_num)/len(v_down)*all_b/all_num
175 |             self.chisquare_cache[chi_range] = chisquare_value
176 |         else:
177 |             chisquare_value = self.chisquare_cache[chi_range]
178 |         return chisquare_value
179 | 


--------------------------------------------------------------------------------
/utils/trendDiscretization.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | from .simpleMethods import *
  3 | import numpy as np
  4 | import copy 
  5 | 
  6 | class trendDisMethod(simpleMethods):
  7 |     def __init__(self,x,y):
  8 |         simpleMethods.__init__(self,x) 
  9 |         self.y = y
 10 |         self.cut_range = []
 11 |         
 12 |     def __stat(self,bad=1):
 13 |         '''
 14 |         统计数据集各个区间的坏样本比例
 15 |         :param bad:
 16 |         :return:
 17 |         '''
 18 |         n = 20 if len(self.x)>=10 else len(self.x)
 19 |         self.equalSize(n)
 20 |         self.value = np.array(self.y)
 21 |         #self.x_array = np.array(self.x)
 22 |         self.range_table = {}
 23 |         self.down = []
 24 |         self.bad_list = []
 25 |         self.count_list = []
 26 |         #print(self.range_dict)
 27 |         #print('for')
 28 |         for r in self.range_dict:
 29 |             range_value = self.value[(self.x<r[1]) & (self.x>=r[0])]
 30 |             bad_num = len(range_value[range_value==bad])
 31 |             count_num = len(range_value)
 32 |             if count_num == 0:
 33 |                 rate = 0
 34 |             else:
 35 |                 rate = bad_num/count_num
 36 |             
 37 |             self.range_table[r[1]] = {'bad_rate':rate,'count':count_num,'down':r[1],'range':r,
 38 |                                       'bad_num':bad_num,'count_num':count_num}
 39 |             self.down.append(r[1])
 40 |             self.bad_list.append(bad_num)
 41 |             self.count_list.append(count_num)
 42 |         print(self.range_table)
 43 | 
 44 |             
 45 |     def fit(self, bad=1, trend='up'):
 46 |         self.__stat(bad=bad)
 47 |         self.down = np.array(self.down)              # 各分区区间下限
 48 |         self.bad_list = np.array(self.bad_list)      # 各分区坏样本比例
 49 |         self.count_list = np.array(self.count_list)  # 各分区总样本比例
 50 |         
 51 |         self.cut_range = self.find_cut(trend=trend)  # 第一个切割点
 52 | 
 53 |         while True:
 54 |             cut_list = self.find_cut(self.cut_range,trend=trend)
 55 | 
 56 |             if len(cut_list)>0:
 57 |                 for c in cut_list:
 58 |                     self.cut_range.append(c)
 59 |                     self.cut_range = sorted(list(set(self.cut_range)))
 60 |             else:
 61 |                 break
 62 | 
 63 |         self.bins = np.array(sorted(list(set(self.cut_range))))
 64 |         
 65 |     
 66 |     def find_cut(self,cut_list=[],trend='up'):
 67 |         cuts = []
 68 |         if cut_list == []:
 69 |             candidate = sorted(copy.deepcopy(self.down))
 70 |             cut, rate = self.__find_cut(list(candidate),start='',end='',trend=trend)
 71 |             if rate:
 72 |                 cuts.append(cut)
 73 |         else:
 74 |             for i in range(len(cut_list)):
 75 |                 if i == 0:
 76 |                     candidate = list(self.down[self.down<cut_list[i]])
 77 |                     cut, rate = self.__find_cut(list(candidate),start='',end=cut_list[i],trend=trend)
 78 | 
 79 |                 else:
 80 |                     candidate = list(self.down[(self.down<cut_list[i]) & (self.down>cut_list[i-1])])
 81 |                     cut, rate = self.__find_cut(list(candidate),start=cut_list[i-1],end=cut_list[i],trend=trend)
 82 | 
 83 |                 if rate and cut not in self.cut_range:
 84 |                     cuts.append(cut)
 85 |                 if i == len(cut_list)-1:
 86 |                     candidate = list(self.down[self.down>cut_list[i]])
 87 |                     cut, rate = self.__find_cut(list(candidate),start=cut_list[i],end='',trend=trend)
 88 | 
 89 |                     if rate and cut not in self.cut_range:
 90 |                         cuts.append(cut)   
 91 |         return cuts                       
 92 |         
 93 |     def __find_cut(self,candidate,start='',end='',trend='up'):
 94 |         result_cut = None
 95 |         result_rate = None
 96 | 
 97 |         for i in range(len(candidate)):
 98 |             if start=='' and end=='':
 99 |                 bad_up = sum(self.bad_list[(self.down<=candidate[i])])
100 |                 count_up = sum(self.count_list[(self.down<=candidate[i])])                                                    
101 |                 bad_down = sum(self.bad_list[(self.down>candidate[i])])
102 |                 count_down = sum(self.count_list[(self.down>candidate[i])])    
103 |             elif start == '' and end != '':   
104 |                 bad_up = sum(self.bad_list[(self.down<=candidate[i])])
105 |                 count_up = sum(self.count_list[(self.down<=candidate[i])])                                                    
106 |                 bad_down = sum(self.bad_list[(self.down>candidate[i]) & (self.down<=end)])
107 |                 count_down = sum(self.count_list[(self.down>candidate[i]) & (self.down<=end)])
108 |             elif start != '' and end == '':
109 |                 bad_up = sum(self.bad_list[(self.down<=candidate[i]) & (self.down>start)])
110 |                 count_up = sum(self.count_list[(self.down<=candidate[i]) & (self.down>start)])                                                    
111 |                 bad_down = sum(self.bad_list[(self.down>candidate[i])])
112 |                 count_down = sum(self.count_list[(self.down>candidate[i])])    
113 |             elif start != '' and end != '':
114 |                 bad_up = sum(self.bad_list[(self.down<=candidate[i]) & (self.down>start)])
115 |                 count_up = sum(self.count_list[(self.down<=candidate[i]) & (self.down>start)])                                                    
116 |                 bad_down = sum(self.bad_list[(self.down>candidate[i]) & (self.down<=end)])
117 |                 count_down = sum(self.count_list[(self.down>candidate[i]) & (self.down<=end)])   
118 |                 
119 |             if count_down == 0:
120 |                 rate_down = 0
121 |             else:
122 |                 rate_down = bad_down/count_down
123 |             if count_up == 0:
124 |                 rate_up = 0
125 |             else:
126 |                 rate_up = bad_up/count_up    
127 |                 
128 |             rate = rate_down - rate_up
129 |             
130 |             if trend=='up':
131 |                 if rate >=0:
132 |                     if not result_rate:
133 |                         result_rate = rate
134 |                         result_cut = candidate[i] 
135 |                     if rate > result_rate and candidate[i] not in self.cut_range:
136 |                         result_rate = rate
137 |                         result_cut = candidate[i] 
138 |             elif trend=='down':
139 |                 if rate <=0:
140 |                     if not result_rate:
141 |                         result_rate = rate
142 |                         result_cut = candidate[i] 
143 |                     if rate < result_rate and candidate[i] not in self.cut_range:
144 |                         result_rate = rate
145 |                         result_cut = candidate[i]                 
146 |         #print(result_cut, result_rate)                                               
147 |         return result_cut, result_rate
148 | 


--------------------------------------------------------------------------------
/autoBinning/utils/trendDiscretization.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | from .simpleMethods import *
  3 | import numpy as np
  4 | import copy 
  5 | 
  6 | class trendDisMethod(simpleMethods):
  7 |     def __init__(self,x,y):
  8 |         simpleMethods.__init__(self,x) 
  9 |         self.y = y
 10 |         self.cut_range = []
 11 |         
 12 |     def __stat(self,bad=1):
 13 |         '''
 14 |         统计数据集各个区间的坏样本比例
 15 |         :param bad:
 16 |         :return:
 17 |         '''
 18 |         n = 20 if len(self.x)>=10 else len(self.x)
 19 |         self.equalSize(n)
 20 |         self.value = np.array(self.y)
 21 |         #self.x_array = np.array(self.x)
 22 |         self.range_table = {}
 23 |         self.down = []
 24 |         self.bad_list = []
 25 |         self.count_list = []
 26 |         #print(self.range_dict)
 27 |         #print('for')
 28 |         for r in self.range_dict:
 29 |             range_value = self.value[(self.x<r[1]) & (self.x>=r[0])]
 30 |             bad_num = len(range_value[range_value==bad])
 31 |             count_num = len(range_value)
 32 |             if count_num == 0:
 33 |                 rate = 0
 34 |             else:
 35 |                 rate = bad_num/count_num
 36 |             
 37 |             self.range_table[r[1]] = {'bad_rate':rate,'count':count_num,'down':r[1],'range':r,
 38 |                                       'bad_num':bad_num,'count_num':count_num}
 39 |             self.down.append(r[1])
 40 |             self.bad_list.append(bad_num)
 41 |             self.count_list.append(count_num)
 42 |         print(self.range_table)
 43 | 
 44 |             
 45 |     def fit(self, bad=1, trend='up'):
 46 |         self.__stat(bad=bad)
 47 |         self.down = np.array(self.down)              # 各分区区间下限
 48 |         self.bad_list = np.array(self.bad_list)      # 各分区坏样本比例
 49 |         self.count_list = np.array(self.count_list)  # 各分区总样本比例
 50 |         
 51 |         self.cut_range = self.find_cut(trend=trend)  # 第一个切割点
 52 | 
 53 |         while True:
 54 |             cut_list = self.find_cut(self.cut_range,trend=trend)
 55 | 
 56 |             if len(cut_list)>0:
 57 |                 for c in cut_list:
 58 |                     self.cut_range.append(c)
 59 |                     self.cut_range = sorted(list(set(self.cut_range)))
 60 |             else:
 61 |                 break
 62 | 
 63 |         self.bins = np.array(sorted(list(set(self.cut_range))))
 64 |         
 65 |     
 66 |     def find_cut(self,cut_list=[],trend='up'):
 67 |         cuts = []
 68 |         if cut_list == []:
 69 |             candidate = sorted(copy.deepcopy(self.down))
 70 |             cut, rate = self.__find_cut(list(candidate),start='',end='',trend=trend)
 71 |             if rate:
 72 |                 cuts.append(cut)
 73 |         else:
 74 |             for i in range(len(cut_list)):
 75 |                 if i == 0:
 76 |                     candidate = list(self.down[self.down<cut_list[i]])
 77 |                     cut, rate = self.__find_cut(list(candidate),start='',end=cut_list[i],trend=trend)
 78 | 
 79 |                 else:
 80 |                     candidate = list(self.down[(self.down<cut_list[i]) & (self.down>cut_list[i-1])])
 81 |                     cut, rate = self.__find_cut(list(candidate),start=cut_list[i-1],end=cut_list[i],trend=trend)
 82 | 
 83 |                 if rate and cut not in self.cut_range:
 84 |                     cuts.append(cut)
 85 |                 if i == len(cut_list)-1:
 86 |                     candidate = list(self.down[self.down>cut_list[i]])
 87 |                     cut, rate = self.__find_cut(list(candidate),start=cut_list[i],end='',trend=trend)
 88 | 
 89 |                     if rate and cut not in self.cut_range:
 90 |                         cuts.append(cut)   
 91 |         return cuts                       
 92 |         
 93 |     def __find_cut(self,candidate,start='',end='',trend='up'):
 94 |         result_cut = None
 95 |         result_rate = None
 96 | 
 97 |         for i in range(len(candidate)):
 98 |             if start=='' and end=='':
 99 |                 bad_up = sum(self.bad_list[(self.down<=candidate[i])])
100 |                 count_up = sum(self.count_list[(self.down<=candidate[i])])                                                    
101 |                 bad_down = sum(self.bad_list[(self.down>candidate[i])])
102 |                 count_down = sum(self.count_list[(self.down>candidate[i])])    
103 |             elif start == '' and end != '':   
104 |                 bad_up = sum(self.bad_list[(self.down<=candidate[i])])
105 |                 count_up = sum(self.count_list[(self.down<=candidate[i])])                                                    
106 |                 bad_down = sum(self.bad_list[(self.down>candidate[i]) & (self.down<=end)])
107 |                 count_down = sum(self.count_list[(self.down>candidate[i]) & (self.down<=end)])
108 |             elif start != '' and end == '':
109 |                 bad_up = sum(self.bad_list[(self.down<=candidate[i]) & (self.down>start)])
110 |                 count_up = sum(self.count_list[(self.down<=candidate[i]) & (self.down>start)])                                                    
111 |                 bad_down = sum(self.bad_list[(self.down>candidate[i])])
112 |                 count_down = sum(self.count_list[(self.down>candidate[i])])    
113 |             elif start != '' and end != '':
114 |                 bad_up = sum(self.bad_list[(self.down<=candidate[i]) & (self.down>start)])
115 |                 count_up = sum(self.count_list[(self.down<=candidate[i]) & (self.down>start)])                                                    
116 |                 bad_down = sum(self.bad_list[(self.down>candidate[i]) & (self.down<=end)])
117 |                 count_down = sum(self.count_list[(self.down>candidate[i]) & (self.down<=end)])   
118 |                 
119 |             if count_down == 0:
120 |                 rate_down = 0
121 |             else:
122 |                 rate_down = bad_down/count_down
123 |             if count_up == 0:
124 |                 rate_up = 0
125 |             else:
126 |                 rate_up = bad_up/count_up    
127 |                 
128 |             rate = rate_down - rate_up
129 |             
130 |             if trend=='up':
131 |                 if rate >=0:
132 |                     if not result_rate:
133 |                         result_rate = rate
134 |                         result_cut = candidate[i] 
135 |                     if rate > result_rate and candidate[i] not in self.cut_range:
136 |                         result_rate = rate
137 |                         result_cut = candidate[i] 
138 |             elif trend=='down':
139 |                 if rate <=0:
140 |                     if not result_rate:
141 |                         result_rate = rate
142 |                         result_cut = candidate[i] 
143 |                     if rate < result_rate and candidate[i] not in self.cut_range:
144 |                         result_rate = rate
145 |                         result_cut = candidate[i]                 
146 |         #print(result_cut, result_rate)                                               
147 |         return result_cut, result_rate
148 | 


--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | from autoBinning.utils.forwardSplit import *
  3 | from autoBinning.utils.trendSplit import *
  4 | from autoBinning.utils.simpleMethods import *
  5 | from autoBinning.utils.trendDiscretization import *
  6 | from autoBinning.utils.mapa import *
  7 | 
  8 | import numpy as np
  9 | import pandas as pd
 10 | 
 11 | 
 12 | def sampleTest():
 13 |     #my_list = [1,1,2,2,2,2,3,3,4,5,6,7,8,9]
 14 |     my_list = [1,1,2,2,2,2,3,3,4,5,6,7,8,9,10,10,20,20,20,20,30,30,40,50,60,70,80,90,100]
 15 |     my_list_y = [1,1,2,2,2,2,1,1,1,2,2,2,1,1]
 16 |     t = simpleMethods(my_list) # [  1.           5.33333333  20.         100.        ]
 17 |     t.equalSize(3)
 18 |     trans = np.digitize(my_list, t.bins)
 19 |     print(t.bins)
 20 |     print(trans)
 21 |     t.equalValue(4)
 22 |     trans = np.digitize(my_list, t.bins)
 23 |     print(t.bins)
 24 |     print(trans)
 25 |     t.equalHist(4)
 26 |     trans = np.digitize(my_list, t.bins)
 27 |     print(t.bins)
 28 |     print(trans)
 29 | 
 30 |     
 31 | def distest():
 32 |     my_list = [1,1,2,2,2,2,3,3,4,5,6,7,8,9,10,10,20,20,20,20,30,30,40,50,60,70,80,90,100]
 33 |     my_list_y = [1,1,0,0,0,1,0,0,1,1,1,0,1,1,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1]
 34 |     t = trendDisMethod(my_list, my_list_y)
 35 |     t.fit()
 36 |     trans = np.digitize(my_list, t.bins)
 37 |     print(t.bins)
 38 |     print(trans)   
 39 |     
 40 | def trend_test_by_data():
 41 |     df = pd.read_csv('credit_old.csv')
 42 |     df = df[['Age','target']]
 43 |     df = df.dropna()
 44 | 
 45 |     t = trendDisMethod(df['Age'], df['target'])
 46 |     t.fit(trend='down')
 47 |     print(df['Age'].describe())
 48 |     print(t.bins)
 49 |     #print(df['Age'].describe())
 50 |  
 51 | def forward_woe_test():
 52 |     df = pd.read_csv('credit_old.csv')
 53 |     df = df[['Age','target']]
 54 |     df = df.dropna()
 55 | 
 56 |     t = forwardSplit(df['Age'], df['target'])
 57 |     t.fit(sby='woe',minv=0.01,init_split=20)
 58 |     print(t.bins) # [16. 25. 29. 33. 36. 38. 40. 42. 44. 46. 48. 50. 52. 54. 55. 58. 60. 63. 72. 94.]
 59 |     t = forwardSplit(df['Age'], df['target'],missing=16)
 60 |     t.fit(sby='woe',num_split=4,init_split=20)
 61 |     print(t.bins)
 62 |     t = forwardSplit(df['Age'], df['target'])
 63 |     t.fit(sby='woe',num_split=4,init_split=20)
 64 |     print(t.bins) # [16. 42. 44. 48. 50. 94.]
 65 |     print("bin\twoe")
 66 |     for i in range(len(t.bins)-1):
 67 |         v = t.value[(t.x < t.bins[i+1]) & (t.x >= t.bins[i])]
 68 |         woe = t._cal_woe(v)
 69 |         print((t.bins[i], t.bins[i+1]),woe)
 70 |     # {(16.0, 25.0): 0.11373232830301286, (25.0, 42.0): 0.07217546872710079, (42.0, 50.0): 0.04972042405868509, (50.0, 72.0): -0.07172614369435065, (72.0, 94.0): -0.13778318584223453}
 71 | 
 72 | def forward_iv_test():
 73 |     df = pd.read_csv('credit_old.csv')
 74 |     df = df[['Age','target']]
 75 |     df = df.dropna()
 76 | 
 77 |     t = forwardSplit(df['Age'], df['target'])
 78 |     t.fit(sby='iv',minv=0.1,init_split=20)
 79 |     print(t.bins) # [16. 25. 29. 33. 36. 38. 40. 42. 44. 46. 48. 50. 58. 60. 63. 94.]
 80 |     t = forwardSplit(df['Age'], df['target'])
 81 |     t.fit(sby='iv',num_split=4,init_split=20,min_sample=len(df)*0.2)
 82 |     print(t.bins) # [16. 38. 50. 94.]
 83 |     t.fit(sby='woeiv',num_split=4,init_split=20)
 84 |     print(t.bins) # [16. 25. 33. 36. 38. 94.]
 85 |     print("bin\twoe")
 86 |     for i in range(len(t.bins)-1):
 87 |         v = t.value[(t.x < t.bins[i+1]) & (t.x >= t.bins[i])]
 88 |         woe = t._cal_woe(v)
 89 |         print((t.bins[i], t.bins[i+1]),woe)
 90 | 
 91 | def backward_iv_test():
 92 |     df = pd.read_csv('credit_old.csv')
 93 |     df = df[['Age','target']]
 94 |     df = df.dropna()
 95 | 
 96 |     t = backwardSplit(df['Age'], df['target'])
 97 |     t.fit(sby='iv',num_split=5)
 98 |     print(t.bins) # [16.  17.5 18.5 85.5 95. ]
 99 |     woe_dict = {}
100 |     for i in range(len(t.bins)-1):
101 |         v = t.value[(t.x < t.bins[i+1]) & (t.x >= t.bins[i])]
102 |         woe = t._cal_woe(v)
103 |         woe_dict[(t.bins[i], t.bins[i+1])] = woe
104 |     print(woe_dict)
105 | 
def backward_chi_test():
    """Run ``backwardSplit`` with ``sby='chi'`` on Age/target.

    Prints the bins and then the WOE of every ``[left, right)`` bin.
    """
    frame = pd.read_csv('credit_old.csv')
    frame = frame[['Age','target']].dropna()

    merger = backwardSplit(frame['Age'], frame['target'])
    merger.fit(sby='chi',num_split=7)
    print(merger.bins) # [16.  72.5 73.5 87.5 89.5 90.5 95. ]

    print("bin\twoe")
    for left, right in zip(merger.bins[:-1], merger.bins[1:]):
        in_bin = merger.value[(merger.x < right) & (merger.x >= left)]
        print((left, right), merger._cal_woe(in_bin))
120 | 
def forward_iv_test2():
    """Run ``forwardSplit`` with ``sby='woeiv'`` on columns x/y of resolution.csv.

    ``-1`` is passed as the missing marker. Prints the bins and then the WOE
    of every ``[left, right)`` bin.
    """
    frame = pd.read_csv('resolution.csv').dropna()

    splitter = forwardSplit(frame['x'], frame['y'], missing=-1)
    splitter.fit(sby='woeiv',minv=0.1,init_split=0,num_split=4)
    print(splitter.bins)

    print("bin\twoe")
    for left, right in zip(splitter.bins[:-1], splitter.bins[1:]):
        in_bin = splitter.value[(splitter.x < right) & (splitter.x >= left)]
        print((left, right), splitter._cal_woe(in_bin))
133 | 
def MAPA_test():
    """Run ``MAPA`` with ``trend='up'`` and ``sby='woe'`` on Age/target.

    Prints the bins and then the WOE of every ``[left, right)`` bin.
    """
    frame = pd.read_csv('credit_old.csv')
    frame = frame[['Age','target']].dropna()

    model = MAPA(frame['Age'], frame['target'])
    model.fit(trend='up',sby='woe')
    print(model.bins)

    print("bin\twoe")
    for left, right in zip(model.bins[:-1], model.bins[1:]):
        in_bin = model.value[(model.x < right) & (model.x >= left)]
        print((left, right), model._cal_woe(in_bin))
147 | 
def spearman_test():
    """Run ``backwardSplit.fit_by_spearman`` on Age/target.

    Prints the bins and then the WOE of every ``[left, right)`` bin.
    """
    frame = pd.read_csv('credit_old.csv')
    frame = frame[['Age','target']].dropna()

    merger = backwardSplit(frame['Age'], frame['target'])
    merger.fit_by_spearman(min_v=5, init_split=20)
    print(merger.bins)

    print("bin\twoe")
    for left, right in zip(merger.bins[:-1], merger.bins[1:]):
        in_bin = merger.value[(merger.x < right) & (merger.x >= left)]
        print((left, right), merger._cal_woe(in_bin))
161 | 
def forward_woe_test_cat():
    """Run categorical ``forwardSplit`` with ``sby='woe'`` on Branch/target.

    Prints the bins, then each bin together with the WOE of the samples whose
    ``x_idx`` falls in that bin.
    """
    frame = pd.read_csv('credit_old.csv')
    frame = frame[['Branch','target']].dropna()

    splitter = forwardSplit(frame['Branch'], frame['target'], categorical=True)
    splitter.fit(sby='woe',minv=0.01,init_split=0,num_split=4)
    print(splitter.bins)

    for group in splitter.bins:
        members = splitter.value[np.isin(splitter.x_idx, group)]
        print(group, splitter._cal_woe(members))
174 | 
def forward_iv_test2_cat():
    """Run categorical ``forwardSplit`` with ``sby='woeiv'`` on Branch/target.

    ``-1`` is passed as the missing marker. Prints the bins, then each bin
    together with the WOE of the samples whose ``x_idx`` falls in that bin.
    """
    frame = pd.read_csv('credit_old.csv')
    frame = frame[['Branch','target']].dropna()

    splitter = forwardSplit(frame['Branch'], frame['target'], missing=-1, categorical=True)
    splitter.fit(sby='woeiv',minv=0,init_split=0,num_split=4)
    print(splitter.bins)

    print("bin\twoe")
    for group in splitter.bins:
        members = splitter.value[np.isin(splitter.x_idx, group)]
        print(group, splitter._cal_woe(members))
188 | 
189 | 
def main():
    """Run the currently selected demo driver.

    Only one driver is enabled at a time; the others are left here,
    disabled, so they can be switched in by hand.
    """
    # forward_woe_test()
    # forward_iv_test2()
    # sampleTest()
    # forward_iv_test()
    # backward_chi_test()
    # MAPA_test()
    # spearman_test()
    forward_iv_test2_cat()


# Execute the selected driver only when run as a script, not on import.
if __name__ == "__main__":
    main()


--------------------------------------------------------------------------------