├── __init__.py ├── utils ├── __init__.py ├── simpleMethods.py ├── mapa.py ├── backwardSplit.py ├── forwardSplit.py ├── trendSplit.py └── trendDiscretization.py ├── autoBinning ├── __init__.py └── utils │ ├── __init__.py │ ├── simpleMethods.py │ ├── mapa.py │ ├── backwardSplit.py │ ├── forwardSplit.py │ ├── trendSplit.py │ └── trendDiscretization.py ├── setup.cfg ├── doc ├── woe1.JPG └── woe2.JPG ├── setup.py ├── LICENSE ├── .gitignore ├── README.md └── test.py /__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- -------------------------------------------------------------------------------- /autoBinning/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- -------------------------------------------------------------------------------- /autoBinning/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md -------------------------------------------------------------------------------- /doc/woe1.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kaiwang0112006/autoBinning/HEAD/doc/woe1.JPG -------------------------------------------------------------------------------- /doc/woe2.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kaiwang0112006/autoBinning/HEAD/doc/woe2.JPG -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | with open("README.md", "r", encoding='utf-8') as fh: 4 | long_description = fh.read() 5 | 6 | setuptools.setup( 7 | name="autoBinning", 8 | version="0.1.7", 9 | author="Kai Wang", 10 | author_email="wangkai0112006@163.com", 11 | description="A small package for feature autoBinning", 12 | long_description=long_description, 13 | long_description_content_type="text/markdown", 14 | url="https://github.com/kaiwang0112006/autoBinning", 15 | packages=setuptools.find_packages(), 16 | install_requires=[ 17 | 'numpy', 18 | 'scipy', 19 | ], 20 | classifiers=[ 21 | "Programming Language :: Python :: 3", 22 | "License :: OSI Approved :: MIT License", 23 | "Operating System :: OS Independent", 24 | ], 25 | ) 26 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Kai Wang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | .hypothesis/ 50 | .pytest_cache/ 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Django stuff: 57 | *.log 58 | local_settings.py 59 | db.sqlite3 60 | 61 | # Flask stuff: 62 | instance/ 63 | .webassets-cache 64 | 65 | # Scrapy stuff: 66 | .scrapy 67 | 68 | # Sphinx documentation 69 | docs/_build/ 70 | 71 | # PyBuilder 72 | target/ 73 | 74 | # Jupyter Notebook 75 | .ipynb_checkpoints 76 | 77 | # IPython 78 | profile_default/ 79 | ipython_config.py 80 | 81 | # pyenv 82 | .python-version 83 | 84 | # celery beat schedule file 85 | celerybeat-schedule 86 | 87 | # SageMath parsed files 88 | *.sage.py 89 | 90 | # Environments 91 | .env 92 | .venv 93 | env/ 94 | venv/ 95 | ENV/ 96 | env.bak/ 97 | venv.bak/ 98 | 99 | # Spyder project settings 100 | .spyderproject 101 | .spyproject 102 | 103 | # Rope project settings 104 | .ropeproject 105 | 106 | # mkdocs documentation 107 | /site 108 | 109 | # mypy 110 | .mypy_cache/ 111 | .dmypy.json 112 | dmypy.json 113 | 114 | # Pyre type checker 115 | .pyre/ -------------------------------------------------------------------------------- /utils/simpleMethods.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | import pandas as pd 4 | import math 5 | import copy 6 | 7 | class simpleMethods: 8 | def __init__(self,x, missing=None,force=False, categorical=False): 9 | self.x_org = x 10 | self.range_dict = {} 11 | self.missing=missing 12 | self.categorical = categorical 13 | if self.missing == None: 14 | self.x = copy.deepcopy(self.x_org) 15 | self.x_miss = None 16 | else: 17 | self.x = self.x_org[self.x_org!=self.missing] 18 | self.x_miss = self.x_org[self.x_org==self.missing] 19 | self.force=force 20 | 21 | def equalValue(self,size): 22 | ''' 23 | x 等间距划分分箱 -> (0-0.1,0.1-0.2...) 24 | :param size: 25 | :return: 26 | ''' 27 | self.range_dict = {} 28 | 29 | self.bins = np.linspace(min(self.x), max(self.x), size+1) 30 | 31 | for i in range(len(self.bins)-1): 32 | self.range_dict[(self.bins[i],self.bins[i+1])] = i 33 | 34 | return self 35 | 36 | def equalHist(self,size): 37 | ''' 38 | 基于np.histogram分箱 39 | :param size: bin数目 40 | :return: 41 | ''' 42 | self.down = {} 43 | self.hist, self.bins = np.histogram(self.x, bins=size) 44 | 45 | 46 | for i in range(len(self.bins)-1): 47 | start = self.bins[i] 48 | end = self.bins[i+1] 49 | 50 | self.range_dict[(start, end)] = i 51 | 52 | return self 53 | 54 | def equalSize(self,size): 55 | ''' 56 | 每个分箱样本数平均 57 | :param size: 58 | :return: 59 | ''' 60 | self.range_dict = {} 61 | # use pandas qcut 62 | #_, self.bins = pd.qcut(self.x,size,retbins='True',duplicates='drop') 63 | #self.bins = sorted(list(self.bins)) 64 | # use numpy instead 65 | breakpoints = np.arange(0, size + 1) / (size) * 100 66 | self.bins = [np.percentile(self.x, b) for b in breakpoints] 67 | 68 | for i in range(len(self.bins)-1): 69 | start = self.bins[i] 70 | end = self.bins[i+1] 71 | 72 | self.range_dict[(start, end)] = i 73 | 74 | self.bins = np.array(self.bins) 75 | return self 76 | 77 | def everysplit(self): 78 | ''' 79 | 最细粒度切分 80 | :return: 81 | ''' 82 | if (len(set(self.x))<=10 and not self.force) or self.categorical: 83 | self.bins = sorted(list(set(self.x))) 84 | self.bins.append(max(self.bins)+1) 85 | self.bins = np.array(self.bins) 86 | else: 87 | x_sort = sorted(list(set(self.x)),reverse=False) 88 | bins = [x_sort[0]] 89 | for i in range(len(x_sort)-1): 90 | bins.append((x_sort[i]+x_sort[i+1])/2) 91 | bins.append(x_sort[-1]+1) 92 | self.bins = np.array(bins) 93 | 94 | self.range_dict = {} 95 | for i in range(len(self.bins)-1): 96 | start = self.bins[i] 97 | end = self.bins[i+1] 98 | 99 | self.range_dict[(start, end)] = i -------------------------------------------------------------------------------- /autoBinning/utils/simpleMethods.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | import pandas as pd 4 | import math 5 | import copy 6 | 7 | class simpleMethods: 8 | def __init__(self,x, missing=None,force=False, categorical=False): 9 | self.x_org = x 10 | self.range_dict = {} 11 | self.missing=missing 12 | self.categorical = categorical 13 | if self.missing == None: 14 | self.x = copy.deepcopy(self.x_org) 15 | self.x_miss = None 16 | else: 17 | self.x = self.x_org[self.x_org!=self.missing] 18 | self.x_miss = self.x_org[self.x_org==self.missing] 19 | self.force=force 20 | 21 | def equalValue(self,size): 22 | ''' 23 | x 等间距划分分箱 -> (0-0.1,0.1-0.2...) 24 | :param size: 25 | :return: 26 | ''' 27 | self.range_dict = {} 28 | 29 | self.bins = np.linspace(min(self.x), max(self.x), size+1) 30 | 31 | for i in range(len(self.bins)-1): 32 | self.range_dict[(self.bins[i],self.bins[i+1])] = i 33 | 34 | return self 35 | 36 | def equalHist(self,size): 37 | ''' 38 | 基于np.histogram分箱 39 | :param size: bin数目 40 | :return: 41 | ''' 42 | self.down = {} 43 | self.hist, self.bins = np.histogram(self.x, bins=size) 44 | 45 | 46 | for i in range(len(self.bins)-1): 47 | start = self.bins[i] 48 | end = self.bins[i+1] 49 | 50 | self.range_dict[(start, end)] = i 51 | 52 | return self 53 | 54 | def equalSize(self,size): 55 | ''' 56 | 每个分箱样本数平均 57 | :param size: 58 | :return: 59 | ''' 60 | self.range_dict = {} 61 | # use pandas qcut 62 | #_, self.bins = pd.qcut(self.x,size,retbins='True',duplicates='drop') 63 | #self.bins = sorted(list(self.bins)) 64 | # use numpy instead 65 | breakpoints = np.arange(0, size + 1) / (size) * 100 66 | self.bins = [np.percentile(self.x, b) for b in breakpoints] 67 | 68 | for i in range(len(self.bins)-1): 69 | start = self.bins[i] 70 | end = self.bins[i+1] 71 | 72 | self.range_dict[(start, end)] = i 73 | 74 | self.bins = np.array(self.bins) 75 | return self 76 | 77 | def everysplit(self): 78 | ''' 79 | 最细粒度切分 80 | :return: 81 | ''' 82 | if (len(set(self.x))<=10 and not self.force) or self.categorical: 83 | self.bins = sorted(list(set(self.x))) 84 | self.bins.append(max(self.bins)+1) 85 | self.bins = np.array(self.bins) 86 | else: 87 | x_sort = sorted(list(set(self.x)),reverse=False) 88 | bins = [x_sort[0]] 89 | for i in range(len(x_sort)-1): 90 | bins.append((x_sort[i]+x_sort[i+1])/2) 91 | bins.append(x_sort[-1]+1) 92 | self.bins = np.array(bins) 93 | 94 | self.range_dict = {} 95 | for i in range(len(self.bins)-1): 96 | start = self.bins[i] 97 | end = self.bins[i+1] 98 | 99 | self.range_dict[(start, end)] = i -------------------------------------------------------------------------------- /utils/mapa.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from .trendSplit import * 3 | import numpy as np 4 | import copy 5 | import math 6 | 7 | 8 | class MAPA(trendSplit): 9 | def __init__(self, x, y, bad=1): 10 | trendSplit.__init__(self, x, y, bad) 11 | 12 | def fit(self,trend='up',sby='woe'): 13 | ''' 14 | :param num_split: 最大切割点数,不包含最大最小值 15 | :param minv: 最小分裂所需数值,woe/iv 16 | :param sby: 'woe','iv','woeiv' 17 | :param min_sample: 每个分箱最小样本数 18 | :return: numpy array -- 切割点数组 19 | ''' 20 | self.set_init() 21 | self.everysplit() 22 | if trend == 'auto': 23 | self.candidateTrend() 24 | else: 25 | self.trend = trend 26 | self.test = {} 27 | candidate = [] 28 | for r in self.range_dict: 29 | candidate.append(r[0]) 30 | candidate.append(r[1]) 31 | 32 | if self.trend == 'up': 33 | self.candidate = sorted(list(set(candidate)),reverse=False) 34 | else: 35 | self.candidate = sorted(list(set(candidate)), reverse=True) 36 | 37 | cut_list, v = self.find_cut(sby=sby) 38 | self.cut_range = [cut_list[-1]] 39 | self.cut_range.append(self.candidate[0]) 40 | self.cut_range.append(self.candidate[-1]) 41 | for d in cut_list: 42 | self.candidate.remove(d) 43 | 44 | while True: 45 | cut_list, v = self.find_cut(sby=sby) 46 | if len(cut_list)>0: 47 | self.cut_range.append(cut_list[-1]) 48 | self.cut_range = sorted(list(set(self.cut_range))) 49 | for d in cut_list: 50 | self.candidate.remove(d) 51 | else: 52 | break 53 | self.bins = np.array(sorted(list(set(self.cut_range)))) 54 | print(self.test) 55 | 56 | def find_cut(self,trend='up',sby='woe'): 57 | ''' 58 | :param minv: 最小分裂所需数值,woe/iv 59 | :param sby: 'woe','iv','woeiv' 60 | :param iv_base: 上一轮的iv值,sby='woe'时不用考虑 61 | :return: 62 | ''' 63 | cut_list = [] 64 | cut = None 65 | minv = 0 # bad rate 66 | for c in self.candidate: 67 | if c != self.candidate[0]: 68 | if trend == 'up': 69 | v = self.value[(self.x=self.candidate[0])] 70 | else: 71 | v = self.value[(self.x >=c) & (self.x < self.candidate[0])] 72 | 73 | if len(v)>0: 74 | if sby == 'woe': 75 | badr = self._cal_woe(v) 76 | elif badr == 'bad': 77 | badr = len(v[v == self.bad]) / len(v) 78 | else: 79 | badr = 0 80 | else: 81 | badr = 0 82 | self.test[c] = badr 83 | if badr>=minv: 84 | minv = badr 85 | cut_list.append(c) 86 | else: 87 | break 88 | return cut_list, minv 89 | 90 | def candidateTrend(self): 91 | trend_up = 0 92 | trend_down = 0 93 | 94 | candidate_list = copy.deepcopy(self.candidate) 95 | for i in range(1, len(candidate_list) - 1): 96 | v_up = self.value[(self.x < candidate_list[i]) & (self.x >= candidate_list[i - 1])] 97 | v_down = self.value[(self.x < candidate_list[i + 1]) & (self.x >= candidate_list[i])] 98 | woe_up = self._cal_woe(v_up) 99 | woe_down = self._cal_woe(v_down) 100 | iv_up = self._cal_iv(v_up) 101 | iv_down = self._cal_iv(v_down) 102 | if woe_up > woe_down: 103 | trend_up += 1 104 | elif woe_up < woe_down: 105 | trend_down += 1 106 | if trend_up > trend_down: 107 | self.trend = 'up' 108 | else: 109 | self.trend = 'down' 110 | -------------------------------------------------------------------------------- /autoBinning/utils/mapa.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from .trendSplit import * 3 | import numpy as np 4 | import copy 5 | import math 6 | 7 | 8 | class MAPA(trendSplit): 9 | def __init__(self, x, y, bad=1): 10 | trendSplit.__init__(self, x, y, bad) 11 | 12 | def fit(self,trend='up',sby='woe'): 13 | ''' 14 | :param num_split: 最大切割点数,不包含最大最小值 15 | :param minv: 最小分裂所需数值,woe/iv 16 | :param sby: 'woe','iv','woeiv' 17 | :param min_sample: 每个分箱最小样本数 18 | :return: numpy array -- 切割点数组 19 | ''' 20 | self.set_init() 21 | self.everysplit() 22 | if trend == 'auto': 23 | self.candidateTrend() 24 | else: 25 | self.trend = trend 26 | self.test = {} 27 | candidate = [] 28 | for r in self.range_dict: 29 | candidate.append(r[0]) 30 | candidate.append(r[1]) 31 | 32 | if self.trend == 'up': 33 | self.candidate = sorted(list(set(candidate)),reverse=False) 34 | else: 35 | self.candidate = sorted(list(set(candidate)), reverse=True) 36 | 37 | cut_list, v = self.find_cut(sby=sby) 38 | self.cut_range = [cut_list[-1]] 39 | self.cut_range.append(self.candidate[0]) 40 | self.cut_range.append(self.candidate[-1]) 41 | for d in cut_list: 42 | self.candidate.remove(d) 43 | 44 | while True: 45 | cut_list, v = self.find_cut(sby=sby) 46 | if len(cut_list)>0: 47 | self.cut_range.append(cut_list[-1]) 48 | self.cut_range = sorted(list(set(self.cut_range))) 49 | for d in cut_list: 50 | self.candidate.remove(d) 51 | else: 52 | break 53 | self.bins = np.array(sorted(list(set(self.cut_range)))) 54 | print(self.test) 55 | 56 | def find_cut(self,trend='up',sby='woe'): 57 | ''' 58 | :param minv: 最小分裂所需数值,woe/iv 59 | :param sby: 'woe','iv','woeiv' 60 | :param iv_base: 上一轮的iv值,sby='woe'时不用考虑 61 | :return: 62 | ''' 63 | cut_list = [] 64 | cut = None 65 | minv = 0 # bad rate 66 | for c in self.candidate: 67 | if c != self.candidate[0]: 68 | if trend == 'up': 69 | v = self.value[(self.x=self.candidate[0])] 70 | else: 71 | v = self.value[(self.x >=c) & (self.x < self.candidate[0])] 72 | 73 | if len(v)>0: 74 | if sby == 'woe': 75 | badr = self._cal_woe(v) 76 | elif badr == 'bad': 77 | badr = len(v[v == self.bad]) / len(v) 78 | else: 79 | badr = 0 80 | else: 81 | badr = 0 82 | self.test[c] = badr 83 | if badr>=minv: 84 | minv = badr 85 | cut_list.append(c) 86 | else: 87 | break 88 | return cut_list, minv 89 | 90 | def candidateTrend(self): 91 | trend_up = 0 92 | trend_down = 0 93 | 94 | candidate_list = copy.deepcopy(self.candidate) 95 | for i in range(1, len(candidate_list) - 1): 96 | v_up = self.value[(self.x < candidate_list[i]) & (self.x >= candidate_list[i - 1])] 97 | v_down = self.value[(self.x < candidate_list[i + 1]) & (self.x >= candidate_list[i])] 98 | woe_up = self._cal_woe(v_up) 99 | woe_down = self._cal_woe(v_down) 100 | iv_up = self._cal_iv(v_up) 101 | iv_down = self._cal_iv(v_down) 102 | if woe_up > woe_down: 103 | trend_up += 1 104 | elif woe_up < woe_down: 105 | trend_down += 1 106 | if trend_up > trend_down: 107 | self.trend = 'up' 108 | else: 109 | self.trend = 'down' 110 | -------------------------------------------------------------------------------- /utils/backwardSplit.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from .trendSplit import * 3 | import numpy as np 4 | import copy 5 | import math 6 | from scipy import stats 7 | 8 | class backwardSplit(trendSplit): 9 | def __init__(self, x, y, bad=1,missing=None, force=False): 10 | trendSplit.__init__(self, x, y, bad, missing, force) 11 | 12 | def fit(self, init_split=0, num_split=0, minv=0, sby='iv'): 13 | ''' 14 | :param init_split: 备选初始切割点数 15 | :param num_split: 最大切割点数,不包含最大最小值 16 | :param minv: 最小分裂所需数值,woe/iv 17 | :param sby: 'iv','chi' 18 | :param min_sample: 每个分箱最小样本数 19 | :return: numpy array -- 切割点数组 20 | ''' 21 | self.set_init() 22 | 23 | if init_split == 0 or len(self.x) <= init_split: 24 | self.everysplit() 25 | else: 26 | self.equalSize(init_split) 27 | 28 | candidate = [] 29 | for r in self.range_dict: 30 | candidate.append(r[0]) 31 | candidate.append(r[1]) 32 | self.candidate = sorted(list(set(candidate))) 33 | 34 | param = {'minv': minv, 'sby': sby} 35 | cut = self.find_cut(**param) 36 | 37 | if cut: 38 | self.candidate.remove(cut) 39 | while True: 40 | cut = self.find_cut(**param) 41 | 42 | if cut: 43 | self.candidate.remove(cut) 44 | else: 45 | break 46 | 47 | if num_split: 48 | #print(len(set(self.candidate)),num_split, self.candidate) 49 | #print() 50 | if len(set(self.candidate)) <= num_split: 51 | break 52 | 53 | self.bins = np.array(sorted(list(set(self.candidate)))) 54 | else: 55 | self.bins = None 56 | 57 | 58 | def find_cut(self,minv=0, num_split=0, sby='iv'): 59 | ''' 60 | :param minv: 最小分裂所需数值,woe/iv 61 | :param sby: 'iv','chi' 62 | :param num_split: 最大切割点数,不包含最大最小值 63 | :return: 64 | ''' 65 | 66 | cut = None 67 | if sby == 'chi': 68 | minv = 10**7 69 | if len(self.candidate)>num_split: 70 | for i in range(1, len(self.candidate)-1): 71 | candidate_list = [c for c in self.candidate if c!=self.candidate[i]] 72 | down_idx = candidate_list.index(self.candidate[i+1]) 73 | iv_range = tuple(candidate_list) 74 | near_range_before = (self.candidate[i-1], self.candidate[i], self.candidate[i+1]) 75 | 76 | if sby=='iv': 77 | iv = self.cal_iv_by_range(iv_range) 78 | if iv > minv: 79 | minv = iv 80 | cut = self.candidate[i] 81 | elif sby=='chi': 82 | chi_v = self.cal_chisquare_by_range(near_range_before) 83 | if chi_v < minv: 84 | minv = chi_v 85 | cut = self.candidate[i] 86 | return cut 87 | 88 | def fit_by_spearman(self, init_split=0, min_v=10): 89 | ''' 90 | :param init_split: 91 | :param max_v: 92 | :return: 93 | ''' 94 | target_dict = None 95 | self.set_init() 96 | if init_split == 0 or len(self.x) <= init_split: 97 | n_split = int(len(self.x)/2) 98 | else: 99 | n_split = init_split 100 | 101 | while n_split>=min_v: 102 | self.equalSize(n_split) 103 | x_mean = [] 104 | y_mean = [] 105 | for r in self.range_dict: 106 | if self.range_dict[r] == max(self.range_dict.values()): 107 | x_mean.append(np.nanmean(self.x[(self.x <= r[1]) & (self.x >= r[0])])) 108 | y_mean.append(np.nanmean(self.value[(self.x <= r[1]) & (self.x >= r[0])])) 109 | else: 110 | x_mean.append(np.nanmean(self.x[(self.x < r[1]) & (self.x >= r[0])])) 111 | y_mean.append(np.nanmean(self.value[(self.x < r[1]) & (self.x >= r[0])])) 112 | 113 | #print(stats.spearmanr(x_mean, y_mean)) 114 | if abs(stats.spearmanr(x_mean, y_mean)[0]) > 0.999: 115 | target_dict = self.range_dict 116 | n_split -= 1 117 | 118 | candidate = [] 119 | for r in target_dict: 120 | candidate.append(r[0]) 121 | candidate.append(r[1]) 122 | self.candidate = sorted(list(set(candidate))) 123 | self.bins = np.array(sorted(list(set(self.candidate)))) -------------------------------------------------------------------------------- /autoBinning/utils/backwardSplit.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from .trendSplit import * 3 | import numpy as np 4 | import copy 5 | import math 6 | from scipy import stats 7 | 8 | class backwardSplit(trendSplit): 9 | def __init__(self, x, y, bad=1,missing=None, force=False): 10 | trendSplit.__init__(self, x, y, bad, missing, force) 11 | 12 | def fit(self, init_split=0, num_split=0, minv=0, sby='iv'): 13 | ''' 14 | :param init_split: 备选初始切割点数 15 | :param num_split: 最大切割点数,不包含最大最小值 16 | :param minv: 最小分裂所需数值,woe/iv 17 | :param sby: 'iv','chi' 18 | :param min_sample: 每个分箱最小样本数 19 | :return: numpy array -- 切割点数组 20 | ''' 21 | self.set_init() 22 | 23 | if init_split == 0 or len(self.x) <= init_split: 24 | self.everysplit() 25 | else: 26 | self.equalSize(init_split) 27 | 28 | candidate = [] 29 | for r in self.range_dict: 30 | candidate.append(r[0]) 31 | candidate.append(r[1]) 32 | self.candidate = sorted(list(set(candidate))) 33 | 34 | param = {'minv': minv, 'sby': sby} 35 | cut = self.find_cut(**param) 36 | 37 | if cut: 38 | self.candidate.remove(cut) 39 | while True: 40 | cut = self.find_cut(**param) 41 | 42 | if cut: 43 | self.candidate.remove(cut) 44 | else: 45 | break 46 | 47 | if num_split: 48 | #print(len(set(self.candidate)),num_split, self.candidate) 49 | #print() 50 | if len(set(self.candidate)) <= num_split: 51 | break 52 | 53 | self.bins = np.array(sorted(list(set(self.candidate)))) 54 | else: 55 | self.bins = None 56 | 57 | 58 | def find_cut(self,minv=0, num_split=0, sby='iv'): 59 | ''' 60 | :param minv: 最小分裂所需数值,woe/iv 61 | :param sby: 'iv','chi' 62 | :param num_split: 最大切割点数,不包含最大最小值 63 | :return: 64 | ''' 65 | 66 | cut = None 67 | if sby == 'chi': 68 | minv = 10**7 69 | if len(self.candidate)>num_split: 70 | for i in range(1, len(self.candidate)-1): 71 | candidate_list = [c for c in self.candidate if c!=self.candidate[i]] 72 | down_idx = candidate_list.index(self.candidate[i+1]) 73 | iv_range = tuple(candidate_list) 74 | near_range_before = (self.candidate[i-1], self.candidate[i], self.candidate[i+1]) 75 | 76 | if sby=='iv': 77 | iv = self.cal_iv_by_range(iv_range) 78 | if iv > minv: 79 | minv = iv 80 | cut = self.candidate[i] 81 | elif sby=='chi': 82 | chi_v = self.cal_chisquare_by_range(near_range_before) 83 | if chi_v < minv: 84 | minv = chi_v 85 | cut = self.candidate[i] 86 | return cut 87 | 88 | def fit_by_spearman(self, init_split=0, min_v=10): 89 | ''' 90 | :param init_split: 91 | :param max_v: 92 | :return: 93 | ''' 94 | target_dict = None 95 | self.set_init() 96 | if init_split == 0 or len(self.x) <= init_split: 97 | n_split = int(len(self.x)/2) 98 | else: 99 | n_split = init_split 100 | 101 | while n_split>=min_v: 102 | self.equalSize(n_split) 103 | x_mean = [] 104 | y_mean = [] 105 | for r in self.range_dict: 106 | if self.range_dict[r] == max(self.range_dict.values()): 107 | x_mean.append(np.nanmean(self.x[(self.x <= r[1]) & (self.x >= r[0])])) 108 | y_mean.append(np.nanmean(self.value[(self.x <= r[1]) & (self.x >= r[0])])) 109 | else: 110 | x_mean.append(np.nanmean(self.x[(self.x < r[1]) & (self.x >= r[0])])) 111 | y_mean.append(np.nanmean(self.value[(self.x < r[1]) & (self.x >= r[0])])) 112 | 113 | #print(stats.spearmanr(x_mean, y_mean)) 114 | if abs(stats.spearmanr(x_mean, y_mean)[0]) > 0.999: 115 | target_dict = self.range_dict 116 | n_split -= 1 117 | 118 | candidate = [] 119 | for r in target_dict: 120 | candidate.append(r[0]) 121 | candidate.append(r[1]) 122 | self.candidate = sorted(list(set(candidate))) 123 | self.bins = np.array(sorted(list(set(self.candidate)))) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # auto binning 分箱工具 2 | 3 | ## 安装 4 | 5 | pip install autoBinning 6 | 7 | ## 基础工具 (simpleMethods) 8 | 9 | from autoBinning.utils.simpleMethods import * 10 | my_list = [1,1,2,2,2,2,3,3,4,5,6,7,8,9,10,10,20,20,20,20,30,30,40,50,60,70,80,90,100] 11 | my_list_y = [1,1,2,2,2,2,1,1,1,2,2,2,1,1] 12 | t = simpleMethods(my_list) 13 | t.equalSize(3) 14 | # 每个分箱样本数平均 15 | print(t.bins) # [ 1. 5.33333333 20. 100. ] 16 | # 等间距划分分箱 17 | t.equalValue(4) 18 | print(t.bins) # [ 1. 25.75 50.5 75.25 100. ] 19 | # 基于numpy histogram分箱 20 | t.equalHist(4) 21 | print(t.bins) # [ 1. 25.75 50.5 75.25 100. ] 22 | 23 | ## 基于标签的有监督自动分箱 24 | 25 | ### 向前迭代方法 (forward method) 26 | 27 | # load data 28 | import pandas as pd 29 | df = pd.read_csv('credit_old.csv') 30 | df = df[['Age','target']] 31 | df = df.dropna() 32 | 33 | #### 基于最大woe分裂分箱 34 | 35 | 在得到尽可能细粒度的细分箱之后,寻找上下分箱woe差异最大的初始切割点,并得到woe趋势,之后迭代找到下一个woe差异最大且趋势相同的切割点,直到满足woe差异不大于一个阈值或分箱数(切割点数)满足要求 36 | 37 | from autoBinning.utils.forwardSplit import * 38 | t = forwardSplit(df['Age'], df['target']) 39 | t.fit(sby='woe',minv=0.01,init_split=20) 40 | print(t.bins) # [16. 25. 29. 33. 36. 38. 40. 42. 44. 46. 48. 50. 52. 54. 55. 58. 60. 63. 72. 94.] 41 | t = forwardSplit(df['Age'], df['target']) 42 | t.fit(sby='woe',num_split=4,init_split=20) 43 | print(t.bins) # [16. 42. 44. 48. 50. 94.] 44 | print("bin\twoe") 45 | for i in range(len(t.bins)-1): 46 | v = t.value[(t.x < t.bins[i+1]) & (t.x >= t.bins[i])] 47 | woe = t._cal_woe(v) 48 | print((t.bins[i], t.bins[i+1]),woe) 49 | 50 | bin woe 51 | (16.0, 25.0) 0.11373232830301286 52 | (25.0, 42.0) 0.07217546872710079 53 | (42.0, 50.0) 0.04972042405868509 54 | (50.0, 72.0) -0.07172614369435065 55 | (72.0, 94.0) -0.13778318584223453 56 | 57 | ![avatar](https://github.com/kaiwang0112006/autoBinning/blob/master/doc/woe1.JPG) 58 | ![avatar](https://github.com/kaiwang0112006/autoBinning/blob/master/doc/woe2.JPG) 59 | 60 | #### 基于最大iv分裂分箱 61 | 62 | 与最大woe分裂分箱方法类似,在得到尽可能细粒度的细分箱之后,寻找iv值最大的切割点,并得到woe趋势,之后迭代找到下一个iv最大且woe趋势相同的切割点,直到分箱数(切割点数)满足要求 63 | 64 | from autoBinning.utils.forwardSplit import * 65 | # sby='woeiv'时考虑woe趋势,sby='iv'时不考虑woe趋势 66 | t = forwardSplit(df['Age'], df['target']) 67 | t.fit(sby='iv',minv=0.1,init_split=20) 68 | print(t.bins) # [16. 25. 29. 33. 36. 38. 40. 42. 44. 46. 48. 50. 58. 60. 63. 94.] 69 | t = forwardSplit(df['Age'], df['target']) 70 | t.fit(sby='iv',num_split=4,init_split=20) 71 | print(t.bins) # [16. 25. 33. 36. 38. 94.] 72 | t.fit(sby='woeiv',num_split=4,init_split=20) 73 | print(t.bins) # [16. 25. 33. 36. 38. 94.] 74 | 75 | print("bin\twoe") 76 | for i in range(len(t.bins)-1): 77 | v = t.value[(t.x < t.bins[i+1]) & (t.x >= t.bins[i])] 78 | woe = t._cal_woe(v) 79 | print((t.bins[i], t.bins[i+1]),woe) 80 | 81 | bin woe 82 | (16.0, 25.0) 0.11373232830301286 83 | (25.0, 33.0) 0.06679187564362839 84 | (33.0, 40.0) 0.06638021747875023 85 | (40.0, 50.0) 0.05894173616389541 86 | (50.0, 94.0) -0.07934608583946329 87 | 88 | t = forwardSplit(df['Branch'], df['target'],missing=-1,categorical=True) 89 | t.fit(sby='woeiv',minv=0,init_split=0,num_split=4) # [['B19'], ['B15'], ['B14'], ['B16'], ['B7', 'B18', 'B2', 'B9', 'B5', 'B6', 'B1', 'B17', 'B4', 'B10', 'B8', 'B3', 'B12', 'B13', 'B11']] 90 | 91 | ### 向后迭代方法 (backward method) 92 | 93 | #### 基于最大iv合并分箱 94 | 95 | 迭代每次删除一个分箱切点,是去掉后整体iv最大 96 | 97 | from autoBinning.utils.backwardSplit import * 98 | t = backwardSplit(df['Age'], df['target']) 99 | t.fit(sby='iv',num_split=5) 100 | print(t.bins) # [16. 17.5 18.5 85.5 95. ] 101 | 102 | #### 基于卡方检验的合并分箱 103 | 104 | 1\. 得到尽可能细粒度的细分箱切点 105 | 106 | 2\. 每个切点计算上下相邻分箱的卡方检验值 107 | 108 | 3\. 将卡方检验值最低的两个分箱合并 109 | 110 | 4\. 重复前两步直到达到分裂最小分裂切点数 111 | 112 | 1\. First the input range is initialized by splitting 113 | it into sub-intervals with each sample 114 | getting own interval. 115 | 116 | 2\. For every pair of adjacent sub-intervals a 117 | chi-square value is computed. 118 | 119 | 3\. Merge pair with lowest chi-square into single bin. 120 | 121 | 4\. Repeat 1 and 2 until number of bins meets predefined threshold. 122 | 123 | from autoBinning.utils.backwardSplit import * 124 | t = backwardSplit(df['Age'], df['target']) 125 | t.fit(sby='chi',num_split=7) 126 | print(t.bins) # [16. 72.5 73.5 87.5 89.5 90.5 95. ] 127 | 128 | #### 基于spearman相关性做向后等频分箱 129 | 130 | from autoBinning.utils.backwardSplit import * 131 | t = backwardSplit(df['Age'], df['target']) 132 | t.fit_by_spearman(min_v=5, init_split=20) 133 | -------------------------------------------------------------------------------- /utils/forwardSplit.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from .trendSplit import * 3 | import numpy as np 4 | import copy 5 | import math 6 | 7 | 8 | class forwardSplit(trendSplit): 9 | def __init__(self, x, y, bad=1,missing=None, force=False, categorical=False): 10 | trendSplit.__init__(self, x, y, bad, missing, force, categorical=categorical) 11 | 12 | def fit(self, init_split=0, num_split=0, minv=0, sby='woe', min_sample=0): 13 | ''' 14 | :param num_split: 最大切割点数,不包含最大最小值 15 | :param minv: 最小分裂所需数值,woe/iv 16 | :param sby: 'woe','iv','woeiv' 17 | :param min_sample: 每个分箱最小样本数 18 | :return: numpy array -- 切割点数组 19 | ''' 20 | self.set_init() 21 | 22 | if init_split == 0 or len(self.x) <= init_split or self.categorical: 23 | self.everysplit() 24 | else: 25 | self.equalSize(init_split) 26 | 27 | candidate = [] 28 | for r in self.range_dict: 29 | candidate.append(r[0]) 30 | candidate.append(r[1]) 31 | self.candidate = sorted(list(set(candidate))) 32 | 33 | param = {'minv': minv, 'sby': sby, 'min_sample':min_sample} 34 | cut, iv = self.find_cut(**param) 35 | param['iv_base'] = iv 36 | self.cut_range = [cut] 37 | if cut!=None: 38 | self.candidate.remove(cut) 39 | while True: 40 | cut, iv = self.find_cut(**param) 41 | param['iv_base'] = iv 42 | if cut: 43 | self.cut_range.append(cut) 44 | self.cut_range = sorted(list(set(self.cut_range))) 45 | self.candidate.remove(cut) 46 | else: 47 | break 48 | 49 | if num_split: 50 | if len(set(self.cut_range)) >= num_split: 51 | break 52 | 53 | self.cut_range.append(self.candidate[0]) 54 | self.cut_range.append(self.candidate[-1]) 55 | self.bins = sorted(list(set(self.cut_range))) 56 | if self.categorical: 57 | binslist = list(self.bins) 58 | binsvalue = [] 59 | for i in range(len(binslist)-1): 60 | v = [] 61 | for k in self.xmap: 62 | if self.xmap[k]>=binslist[i] and self.xmap[k]0: 83 | self.candidateTrend(self.cut_range) 84 | 85 | iv = 0 86 | 87 | cut = None 88 | result = {} 89 | 90 | for i in range(1, len(self.candidate) - 1): 91 | if len(self.cut_range) == 0: 92 | woe_range = (self.candidate[0]-0.1, self.candidate[i], self.candidate[-1]) 93 | iv_range = (self.candidate[0]-0.1, self.candidate[i], self.candidate[-1]) 94 | else: 95 | range_list = sorted([self.candidate[0]-0.1, self.candidate[i], self.candidate[-1]]+list(self.cut_range)) 96 | canidx = range_list.index(self.candidate[i]) 97 | #woe_range = (self.candidate[0], self.candidate[i], self.candidate[-1]+1) 98 | woe_range = (range_list[canidx-1], range_list[canidx], range_list[canidx+1]) 99 | iv_range = tuple(range_list) 100 | 101 | if len(self.value[(self.x < woe_range[1]) & (self.x >= woe_range[0])]) > min_sample and \ 102 | len(self.value[(self.x < woe_range[2]) & (self.x >= woe_range[1])]) > min_sample: 103 | if sby == 'woe': 104 | woe = self.cal_woe_by_range(woe_range) 105 | if woe > minv: 106 | minv = woe 107 | cut = self.candidate[i] 108 | elif sby == 'iv': 109 | iv = self.cal_iv_by_range(iv_range) 110 | result[self.candidate[i]] = iv 111 | if iv > minv and iv > iv_base: 112 | minv = iv 113 | cut = self.candidate[i] 114 | else: 115 | is_trend_tag = False 116 | if self.trend in ('up','down'): 117 | up_count, down_count = self.candidateTrend(list(iv_range)[1:-1]) 118 | if (self.trend == 'up' and down_count==0 and up_count>0) or \ 119 | (self.trend == 'down' and down_count > 0 and up_count == 0): 120 | is_trend_tag = True 121 | 122 | iv = self.cal_iv_by_range(iv_range) 123 | #print(is_trend_tag, self.trend not in ('up','down'), self.trend) 124 | #print((is_trend_tag or self.trend not in ('up','down'))) 125 | if (is_trend_tag or self.trend not in ('up','down')) and iv > minv: 126 | minv = iv 127 | cut = self.candidate[i] 128 | 129 | return cut, iv 130 | -------------------------------------------------------------------------------- /autoBinning/utils/forwardSplit.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from .trendSplit import * 3 | import numpy as np 4 | import copy 5 | import math 6 | 7 | 8 | class forwardSplit(trendSplit): 9 | def __init__(self, x, y, bad=1,missing=None, force=False, categorical=False): 10 | trendSplit.__init__(self, x, y, bad, missing, force, categorical=categorical) 11 | 12 | def fit(self, init_split=0, num_split=0, minv=0, sby='woe', min_sample=0): 13 | ''' 14 | :param num_split: 最大切割点数,不包含最大最小值 15 | :param minv: 最小分裂所需数值,woe/iv 16 | :param sby: 'woe','iv','woeiv' 17 | :param min_sample: 每个分箱最小样本数 18 | :return: numpy array -- 切割点数组 19 | ''' 20 | self.set_init() 21 | 22 | if init_split == 0 or len(self.x) <= init_split or self.categorical: 23 | self.everysplit() 24 | else: 25 | self.equalSize(init_split) 26 | 27 | candidate = [] 28 | for r in self.range_dict: 29 | candidate.append(r[0]) 30 | candidate.append(r[1]) 31 | self.candidate = sorted(list(set(candidate))) 32 | 33 | param = {'minv': minv, 'sby': sby, 'min_sample':min_sample} 34 | cut, iv = self.find_cut(**param) 35 | param['iv_base'] = iv 36 | self.cut_range = [cut] 37 | if cut!=None: 38 | self.candidate.remove(cut) 39 | while True: 40 | cut, iv = self.find_cut(**param) 41 | param['iv_base'] = iv 42 | if cut: 43 | self.cut_range.append(cut) 44 | self.cut_range = sorted(list(set(self.cut_range))) 45 | self.candidate.remove(cut) 46 | else: 47 | break 48 | 49 | if num_split: 50 | if len(set(self.cut_range)) >= num_split: 51 | break 52 | 53 | self.cut_range.append(self.candidate[0]) 54 | self.cut_range.append(self.candidate[-1]) 55 | self.bins = sorted(list(set(self.cut_range))) 56 | if self.categorical: 57 | binslist = list(self.bins) 58 | binsvalue = [] 59 | for i in range(len(binslist)-1): 60 | v = [] 61 | for k in self.xmap: 62 | if self.xmap[k]>=binslist[i] and self.xmap[k]0: 83 | self.candidateTrend(self.cut_range) 84 | 85 | iv = 0 86 | 87 | cut = None 88 | result = {} 89 | 90 | for i in range(1, len(self.candidate) - 1): 91 | if len(self.cut_range) == 0: 92 | woe_range = (self.candidate[0]-0.1, self.candidate[i], self.candidate[-1]) 93 | iv_range = (self.candidate[0]-0.1, self.candidate[i], self.candidate[-1]) 94 | else: 95 | range_list = sorted([self.candidate[0]-0.1, self.candidate[i], self.candidate[-1]]+list(self.cut_range)) 96 | canidx = range_list.index(self.candidate[i]) 97 | #woe_range = (self.candidate[0], self.candidate[i], self.candidate[-1]+1) 98 | woe_range = (range_list[canidx-1], range_list[canidx], range_list[canidx+1]) 99 | iv_range = tuple(range_list) 100 | 101 | if len(self.value[(self.x < woe_range[1]) & (self.x >= woe_range[0])]) > min_sample and \ 102 | len(self.value[(self.x < woe_range[2]) & (self.x >= woe_range[1])]) > min_sample: 103 | if sby == 'woe': 104 | woe = self.cal_woe_by_range(woe_range) 105 | if woe > minv: 106 | minv = woe 107 | cut = self.candidate[i] 108 | elif sby == 'iv': 109 | iv = self.cal_iv_by_range(iv_range) 110 | result[self.candidate[i]] = iv 111 | if iv > minv and iv > iv_base: 112 | minv = iv 113 | cut = self.candidate[i] 114 | else: 115 | is_trend_tag = False 116 | if self.trend in ('up','down'): 117 | up_count, down_count = self.candidateTrend(list(iv_range)[1:-1]) 118 | if (self.trend == 'up' and down_count==0 and up_count>0) or \ 119 | (self.trend == 'down' and down_count > 0 and up_count == 0): 120 | is_trend_tag = True 121 | 122 | iv = self.cal_iv_by_range(iv_range) 123 | #print(is_trend_tag, self.trend not in ('up','down'), self.trend) 124 | #print((is_trend_tag or self.trend not in ('up','down'))) 125 | if (is_trend_tag or self.trend not in ('up','down')) and iv > minv: 126 | minv = iv 127 | cut = self.candidate[i] 128 | 129 | return cut, iv 130 | -------------------------------------------------------------------------------- /utils/trendSplit.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from .simpleMethods import * 3 | import numpy as np 4 | import copy 5 | import math 6 | 7 | 8 | class trendSplit(simpleMethods): 9 | def __init__(self, x, y, bad=1,missing=None,force=False, categorical=False): 10 | simpleMethods.__init__(self, x, missing=missing,force=force, categorical=categorical) 11 | self.y = y 12 | self.xmap = {} 13 | self.categorical = categorical 14 | self.bad=bad 15 | self.set_init(categorical=categorical) 16 | 17 | def set_init(self, categorical=False): 18 | self.cut_range = [] 19 | self.trend = None 20 | self.value = np.array(self.y) 21 | 22 | if categorical: 23 | xset = set(self.x) 24 | vmap = {} 25 | for v in xset: 26 | v_filter = self.value[self.x_org == v] 27 | vmap[v] = len(v_filter[v_filter==self.bad])/len(v_filter) 28 | v_sort = sorted(list(vmap.keys()), key=lambda x: vmap[x], reverse=True) 29 | self.xmap = dict(zip(v_sort, range(len(v_sort)))) 30 | self.xmap_inverse = dict(zip(range(len(v_sort)),v_sort)) 31 | self.x_idx = copy.deepcopy(self.x) 32 | self.x = np.array([self.xmap[i] for i in list(self.x)]) 33 | 34 | if self.missing == None: 35 | self.value_miss = None 36 | else: 37 | self.value_miss = self.value[self.x_org == self.missing] 38 | self.value = self.value[self.x_org!=self.missing] 39 | 40 | self.allbad = len(self.value[self.value == self.bad]) # 好样本总数 41 | self.allgood = len(self.value) - self.allbad # 坏样本总数 42 | self.candidate = [] 43 | self.woe_cache = {} 44 | self.iv_cache = {} 45 | self.chisquare_cache = {} 46 | self.know_box = {} 47 | 48 | 49 | def cal_woe_by_range(self,wrange): 50 | ''' 51 | 根据切点范围(start, mid, end)计算woe 52 | :param wrange: 53 | :param trend: 54 | :return: 55 | ''' 56 | woe_up = self.cal_woe_by_start_end(wrange[0], wrange[1]) 57 | woe_down = self.cal_woe_by_start_end(wrange[1], wrange[2]) 58 | 59 | if self.trend == 'up': 60 | woe_sub = woe_up - woe_down 61 | elif self.trend == 'down': 62 | woe_sub = woe_down - woe_up 63 | else: 64 | woe_sub = abs(woe_down - woe_up) 65 | return woe_sub 66 | 67 | def cal_iv_by_range(self,vrange): 68 | ''' 69 | 根据切点范围(start, mid, end)计算iv 70 | :param vrange: 71 | :param bad: 72 | :return: 73 | ''' 74 | iv_split = 0 75 | result = [] 76 | for j in range(len(vrange)-1): 77 | if (vrange[j], vrange[j+1]) not in self.iv_cache: 78 | vvalue = self.value[(self.x < vrange[j+1]) & (self.x >= vrange[j])] 79 | iv_box = self._cal_iv(vvalue) 80 | self.iv_cache[(vrange[j], vrange[j+1])] = iv_box 81 | else: 82 | iv_box = self.iv_cache[(vrange[j], vrange[j+1])] 83 | result.append(iv_box) 84 | iv_split += iv_box 85 | 86 | return iv_split 87 | 88 | def cal_woe_by_start_end(self, start, end): 89 | if (start, end) not in self.woe_cache: 90 | vvalue = self.value[(self.x < end) & (self.x >= start)] 91 | woe_box = self._cal_woe(vvalue) 92 | self.woe_cache[(start, end)] = woe_box 93 | else: 94 | woe_box = self.woe_cache[(start, end)] 95 | return woe_box 96 | 97 | def _cal_woe(self,v): 98 | ''' 99 | 计算woe 100 | :param v: 101 | :param bad: 102 | :return: 103 | ''' 104 | bad_num = len(v[v == self.bad]) 105 | count_num = len(v) 106 | 107 | if count_num-bad_num==0 or self.allgood==0 or bad_num==0: 108 | woe = 0 109 | else: 110 | woe = math.log((bad_num / (count_num - bad_num)) / (self.allbad / self.allgood)) 111 | return woe 112 | 113 | def _cal_iv(self, v): 114 | ''' 115 | 计算iv 116 | :param v: 117 | :param bad: 118 | :return: 119 | ''' 120 | bad_num = len(v[v == self.bad]) 121 | count_num = len(v) 122 | 123 | if count_num-bad_num == 0 or self.allgood==0 or bad_num==0: 124 | iv = 0 125 | else: 126 | iv = (bad_num / (count_num - bad_num))*math.log((bad_num / (count_num - bad_num)) / (self.allbad / self.allgood)) 127 | return iv 128 | 129 | def candidateTrend(self,cut_range): 130 | 131 | trend_up = 0 132 | trend_down = 0 133 | result = {} 134 | if len(cut_range) == 0: 135 | candidate_list = copy.deepcopy(self.candidate) 136 | else: 137 | candidate_list = [self.candidate[0]] + copy.deepcopy(cut_range) + [self.candidate[-1]] 138 | 139 | for i in range(1,len(candidate_list) - 1): 140 | woe_up = self.cal_woe_by_start_end(candidate_list[i-1], candidate_list[i]) 141 | woe_down = self.cal_woe_by_start_end(candidate_list[i], candidate_list[i+1]) 142 | 143 | if woe_up>woe_down: 144 | trend_up += 1 145 | elif woe_uptrend_down: 150 | self.trend = 'up' 151 | elif trend_up= chi_range[0])] 159 | v_down = self.value[(self.x < chi_range[2]) & (self.x >= chi_range[1])] 160 | all_num = len(v_up)+len(v_down) 161 | up_bad = len(v_up[v_up==self.bad]) 162 | up_good = len(v_up)-up_bad 163 | down_bad = len(v_down[v_down==self.bad]) 164 | down_good = len(v_down)-down_bad 165 | all_g = up_good + down_good 166 | all_b = up_bad + down_bad 167 | 168 | if len(v_up)==0 or len(v_down)==0: 169 | chisquare_value = 10**7 170 | else: 171 | chisquare_value = (up_bad-len(v_up)*all_b/all_num)/len(v_up)*all_b/all_num + \ 172 | (up_good-len(v_up)*all_g/all_num)/len(v_up)*all_g/all_num + \ 173 | (down_good-len(v_down)*all_g/all_num)/len(v_down)*all_g/all_num + \ 174 | (down_bad-len(v_down)*all_b/all_num)/len(v_down)*all_b/all_num 175 | self.chisquare_cache[chi_range] = chisquare_value 176 | else: 177 | chisquare_value = self.chisquare_cache[chi_range] 178 | return chisquare_value 179 | -------------------------------------------------------------------------------- /autoBinning/utils/trendSplit.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from .simpleMethods import * 3 | import numpy as np 4 | import copy 5 | import math 6 | 7 | 8 | class trendSplit(simpleMethods): 9 | def __init__(self, x, y, bad=1,missing=None,force=False, categorical=False): 10 | simpleMethods.__init__(self, x, missing=missing,force=force, categorical=categorical) 11 | self.y = y 12 | self.xmap = {} 13 | self.categorical = categorical 14 | self.bad=bad 15 | self.set_init(categorical=categorical) 16 | 17 | def set_init(self, categorical=False): 18 | self.cut_range = [] 19 | self.trend = None 20 | self.value = np.array(self.y) 21 | 22 | if categorical: 23 | xset = set(self.x) 24 | vmap = {} 25 | for v in xset: 26 | v_filter = self.value[self.x_org == v] 27 | vmap[v] = len(v_filter[v_filter==self.bad])/len(v_filter) 28 | v_sort = sorted(list(vmap.keys()), key=lambda x: vmap[x], reverse=True) 29 | self.xmap = dict(zip(v_sort, range(len(v_sort)))) 30 | self.xmap_inverse = dict(zip(range(len(v_sort)),v_sort)) 31 | self.x_idx = copy.deepcopy(self.x) 32 | self.x = np.array([self.xmap[i] for i in list(self.x)]) 33 | 34 | if self.missing == None: 35 | self.value_miss = None 36 | else: 37 | self.value_miss = self.value[self.x_org == self.missing] 38 | self.value = self.value[self.x_org!=self.missing] 39 | 40 | self.allbad = len(self.value[self.value == self.bad]) # 好样本总数 41 | self.allgood = len(self.value) - self.allbad # 坏样本总数 42 | self.candidate = [] 43 | self.woe_cache = {} 44 | self.iv_cache = {} 45 | self.chisquare_cache = {} 46 | self.know_box = {} 47 | 48 | 49 | def cal_woe_by_range(self,wrange): 50 | ''' 51 | 根据切点范围(start, mid, end)计算woe 52 | :param wrange: 53 | :param trend: 54 | :return: 55 | ''' 56 | woe_up = self.cal_woe_by_start_end(wrange[0], wrange[1]) 57 | woe_down = self.cal_woe_by_start_end(wrange[1], wrange[2]) 58 | 59 | if self.trend == 'up': 60 | woe_sub = woe_up - woe_down 61 | elif self.trend == 'down': 62 | woe_sub = woe_down - woe_up 63 | else: 64 | woe_sub = abs(woe_down - woe_up) 65 | return woe_sub 66 | 67 | def cal_iv_by_range(self,vrange): 68 | ''' 69 | 根据切点范围(start, mid, end)计算iv 70 | :param vrange: 71 | :param bad: 72 | :return: 73 | ''' 74 | iv_split = 0 75 | result = [] 76 | for j in range(len(vrange)-1): 77 | if (vrange[j], vrange[j+1]) not in self.iv_cache: 78 | vvalue = self.value[(self.x < vrange[j+1]) & (self.x >= vrange[j])] 79 | iv_box = self._cal_iv(vvalue) 80 | self.iv_cache[(vrange[j], vrange[j+1])] = iv_box 81 | else: 82 | iv_box = self.iv_cache[(vrange[j], vrange[j+1])] 83 | result.append(iv_box) 84 | iv_split += iv_box 85 | 86 | return iv_split 87 | 88 | def cal_woe_by_start_end(self, start, end): 89 | if (start, end) not in self.woe_cache: 90 | vvalue = self.value[(self.x < end) & (self.x >= start)] 91 | woe_box = self._cal_woe(vvalue) 92 | self.woe_cache[(start, end)] = woe_box 93 | else: 94 | woe_box = self.woe_cache[(start, end)] 95 | return woe_box 96 | 97 | def _cal_woe(self,v): 98 | ''' 99 | 计算woe 100 | :param v: 101 | :param bad: 102 | :return: 103 | ''' 104 | bad_num = len(v[v == self.bad]) 105 | count_num = len(v) 106 | 107 | if count_num-bad_num==0 or self.allgood==0 or bad_num==0: 108 | woe = 0 109 | else: 110 | woe = math.log((bad_num / (count_num - bad_num)) / (self.allbad / self.allgood)) 111 | return woe 112 | 113 | def _cal_iv(self, v): 114 | ''' 115 | 计算iv 116 | :param v: 117 | :param bad: 118 | :return: 119 | ''' 120 | bad_num = len(v[v == self.bad]) 121 | count_num = len(v) 122 | 123 | if count_num-bad_num == 0 or self.allgood==0 or bad_num==0: 124 | iv = 0 125 | else: 126 | iv = (bad_num / (count_num - bad_num))*math.log((bad_num / (count_num - bad_num)) / (self.allbad / self.allgood)) 127 | return iv 128 | 129 | def candidateTrend(self,cut_range): 130 | 131 | trend_up = 0 132 | trend_down = 0 133 | result = {} 134 | if len(cut_range) == 0: 135 | candidate_list = copy.deepcopy(self.candidate) 136 | else: 137 | candidate_list = [self.candidate[0]] + copy.deepcopy(cut_range) + [self.candidate[-1]] 138 | 139 | for i in range(1,len(candidate_list) - 1): 140 | woe_up = self.cal_woe_by_start_end(candidate_list[i-1], candidate_list[i]) 141 | woe_down = self.cal_woe_by_start_end(candidate_list[i], candidate_list[i+1]) 142 | 143 | if woe_up>woe_down: 144 | trend_up += 1 145 | elif woe_uptrend_down: 150 | self.trend = 'up' 151 | elif trend_up= chi_range[0])] 159 | v_down = self.value[(self.x < chi_range[2]) & (self.x >= chi_range[1])] 160 | all_num = len(v_up)+len(v_down) 161 | up_bad = len(v_up[v_up==self.bad]) 162 | up_good = len(v_up)-up_bad 163 | down_bad = len(v_down[v_down==self.bad]) 164 | down_good = len(v_down)-down_bad 165 | all_g = up_good + down_good 166 | all_b = up_bad + down_bad 167 | 168 | if len(v_up)==0 or len(v_down)==0: 169 | chisquare_value = 10**7 170 | else: 171 | chisquare_value = (up_bad-len(v_up)*all_b/all_num)/len(v_up)*all_b/all_num + \ 172 | (up_good-len(v_up)*all_g/all_num)/len(v_up)*all_g/all_num + \ 173 | (down_good-len(v_down)*all_g/all_num)/len(v_down)*all_g/all_num + \ 174 | (down_bad-len(v_down)*all_b/all_num)/len(v_down)*all_b/all_num 175 | self.chisquare_cache[chi_range] = chisquare_value 176 | else: 177 | chisquare_value = self.chisquare_cache[chi_range] 178 | return chisquare_value 179 | -------------------------------------------------------------------------------- /utils/trendDiscretization.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from .simpleMethods import * 3 | import numpy as np 4 | import copy 5 | 6 | class trendDisMethod(simpleMethods): 7 | def __init__(self,x,y): 8 | simpleMethods.__init__(self,x) 9 | self.y = y 10 | self.cut_range = [] 11 | 12 | def __stat(self,bad=1): 13 | ''' 14 | 统计数据集各个区间的坏样本比例 15 | :param bad: 16 | :return: 17 | ''' 18 | n = 20 if len(self.x)>=10 else len(self.x) 19 | self.equalSize(n) 20 | self.value = np.array(self.y) 21 | #self.x_array = np.array(self.x) 22 | self.range_table = {} 23 | self.down = [] 24 | self.bad_list = [] 25 | self.count_list = [] 26 | #print(self.range_dict) 27 | #print('for') 28 | for r in self.range_dict: 29 | range_value = self.value[(self.x=r[0])] 30 | bad_num = len(range_value[range_value==bad]) 31 | count_num = len(range_value) 32 | if count_num == 0: 33 | rate = 0 34 | else: 35 | rate = bad_num/count_num 36 | 37 | self.range_table[r[1]] = {'bad_rate':rate,'count':count_num,'down':r[1],'range':r, 38 | 'bad_num':bad_num,'count_num':count_num} 39 | self.down.append(r[1]) 40 | self.bad_list.append(bad_num) 41 | self.count_list.append(count_num) 42 | print(self.range_table) 43 | 44 | 45 | def fit(self, bad=1, trend='up'): 46 | self.__stat(bad=bad) 47 | self.down = np.array(self.down) # 各分区区间下限 48 | self.bad_list = np.array(self.bad_list) # 各分区坏样本比例 49 | self.count_list = np.array(self.count_list) # 各分区总样本比例 50 | 51 | self.cut_range = self.find_cut(trend=trend) # 第一个切割点 52 | 53 | while True: 54 | cut_list = self.find_cut(self.cut_range,trend=trend) 55 | 56 | if len(cut_list)>0: 57 | for c in cut_list: 58 | self.cut_range.append(c) 59 | self.cut_range = sorted(list(set(self.cut_range))) 60 | else: 61 | break 62 | 63 | self.bins = np.array(sorted(list(set(self.cut_range)))) 64 | 65 | 66 | def find_cut(self,cut_list=[],trend='up'): 67 | cuts = [] 68 | if cut_list == []: 69 | candidate = sorted(copy.deepcopy(self.down)) 70 | cut, rate = self.__find_cut(list(candidate),start='',end='',trend=trend) 71 | if rate: 72 | cuts.append(cut) 73 | else: 74 | for i in range(len(cut_list)): 75 | if i == 0: 76 | candidate = list(self.down[self.downcut_list[i-1])]) 81 | cut, rate = self.__find_cut(list(candidate),start=cut_list[i-1],end=cut_list[i],trend=trend) 82 | 83 | if rate and cut not in self.cut_range: 84 | cuts.append(cut) 85 | if i == len(cut_list)-1: 86 | candidate = list(self.down[self.down>cut_list[i]]) 87 | cut, rate = self.__find_cut(list(candidate),start=cut_list[i],end='',trend=trend) 88 | 89 | if rate and cut not in self.cut_range: 90 | cuts.append(cut) 91 | return cuts 92 | 93 | def __find_cut(self,candidate,start='',end='',trend='up'): 94 | result_cut = None 95 | result_rate = None 96 | 97 | for i in range(len(candidate)): 98 | if start=='' and end=='': 99 | bad_up = sum(self.bad_list[(self.down<=candidate[i])]) 100 | count_up = sum(self.count_list[(self.down<=candidate[i])]) 101 | bad_down = sum(self.bad_list[(self.down>candidate[i])]) 102 | count_down = sum(self.count_list[(self.down>candidate[i])]) 103 | elif start == '' and end != '': 104 | bad_up = sum(self.bad_list[(self.down<=candidate[i])]) 105 | count_up = sum(self.count_list[(self.down<=candidate[i])]) 106 | bad_down = sum(self.bad_list[(self.down>candidate[i]) & (self.down<=end)]) 107 | count_down = sum(self.count_list[(self.down>candidate[i]) & (self.down<=end)]) 108 | elif start != '' and end == '': 109 | bad_up = sum(self.bad_list[(self.down<=candidate[i]) & (self.down>start)]) 110 | count_up = sum(self.count_list[(self.down<=candidate[i]) & (self.down>start)]) 111 | bad_down = sum(self.bad_list[(self.down>candidate[i])]) 112 | count_down = sum(self.count_list[(self.down>candidate[i])]) 113 | elif start != '' and end != '': 114 | bad_up = sum(self.bad_list[(self.down<=candidate[i]) & (self.down>start)]) 115 | count_up = sum(self.count_list[(self.down<=candidate[i]) & (self.down>start)]) 116 | bad_down = sum(self.bad_list[(self.down>candidate[i]) & (self.down<=end)]) 117 | count_down = sum(self.count_list[(self.down>candidate[i]) & (self.down<=end)]) 118 | 119 | if count_down == 0: 120 | rate_down = 0 121 | else: 122 | rate_down = bad_down/count_down 123 | if count_up == 0: 124 | rate_up = 0 125 | else: 126 | rate_up = bad_up/count_up 127 | 128 | rate = rate_down - rate_up 129 | 130 | if trend=='up': 131 | if rate >=0: 132 | if not result_rate: 133 | result_rate = rate 134 | result_cut = candidate[i] 135 | if rate > result_rate and candidate[i] not in self.cut_range: 136 | result_rate = rate 137 | result_cut = candidate[i] 138 | elif trend=='down': 139 | if rate <=0: 140 | if not result_rate: 141 | result_rate = rate 142 | result_cut = candidate[i] 143 | if rate < result_rate and candidate[i] not in self.cut_range: 144 | result_rate = rate 145 | result_cut = candidate[i] 146 | #print(result_cut, result_rate) 147 | return result_cut, result_rate 148 | -------------------------------------------------------------------------------- /autoBinning/utils/trendDiscretization.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from .simpleMethods import * 3 | import numpy as np 4 | import copy 5 | 6 | class trendDisMethod(simpleMethods): 7 | def __init__(self,x,y): 8 | simpleMethods.__init__(self,x) 9 | self.y = y 10 | self.cut_range = [] 11 | 12 | def __stat(self,bad=1): 13 | ''' 14 | 统计数据集各个区间的坏样本比例 15 | :param bad: 16 | :return: 17 | ''' 18 | n = 20 if len(self.x)>=10 else len(self.x) 19 | self.equalSize(n) 20 | self.value = np.array(self.y) 21 | #self.x_array = np.array(self.x) 22 | self.range_table = {} 23 | self.down = [] 24 | self.bad_list = [] 25 | self.count_list = [] 26 | #print(self.range_dict) 27 | #print('for') 28 | for r in self.range_dict: 29 | range_value = self.value[(self.x=r[0])] 30 | bad_num = len(range_value[range_value==bad]) 31 | count_num = len(range_value) 32 | if count_num == 0: 33 | rate = 0 34 | else: 35 | rate = bad_num/count_num 36 | 37 | self.range_table[r[1]] = {'bad_rate':rate,'count':count_num,'down':r[1],'range':r, 38 | 'bad_num':bad_num,'count_num':count_num} 39 | self.down.append(r[1]) 40 | self.bad_list.append(bad_num) 41 | self.count_list.append(count_num) 42 | print(self.range_table) 43 | 44 | 45 | def fit(self, bad=1, trend='up'): 46 | self.__stat(bad=bad) 47 | self.down = np.array(self.down) # 各分区区间下限 48 | self.bad_list = np.array(self.bad_list) # 各分区坏样本比例 49 | self.count_list = np.array(self.count_list) # 各分区总样本比例 50 | 51 | self.cut_range = self.find_cut(trend=trend) # 第一个切割点 52 | 53 | while True: 54 | cut_list = self.find_cut(self.cut_range,trend=trend) 55 | 56 | if len(cut_list)>0: 57 | for c in cut_list: 58 | self.cut_range.append(c) 59 | self.cut_range = sorted(list(set(self.cut_range))) 60 | else: 61 | break 62 | 63 | self.bins = np.array(sorted(list(set(self.cut_range)))) 64 | 65 | 66 | def find_cut(self,cut_list=[],trend='up'): 67 | cuts = [] 68 | if cut_list == []: 69 | candidate = sorted(copy.deepcopy(self.down)) 70 | cut, rate = self.__find_cut(list(candidate),start='',end='',trend=trend) 71 | if rate: 72 | cuts.append(cut) 73 | else: 74 | for i in range(len(cut_list)): 75 | if i == 0: 76 | candidate = list(self.down[self.downcut_list[i-1])]) 81 | cut, rate = self.__find_cut(list(candidate),start=cut_list[i-1],end=cut_list[i],trend=trend) 82 | 83 | if rate and cut not in self.cut_range: 84 | cuts.append(cut) 85 | if i == len(cut_list)-1: 86 | candidate = list(self.down[self.down>cut_list[i]]) 87 | cut, rate = self.__find_cut(list(candidate),start=cut_list[i],end='',trend=trend) 88 | 89 | if rate and cut not in self.cut_range: 90 | cuts.append(cut) 91 | return cuts 92 | 93 | def __find_cut(self,candidate,start='',end='',trend='up'): 94 | result_cut = None 95 | result_rate = None 96 | 97 | for i in range(len(candidate)): 98 | if start=='' and end=='': 99 | bad_up = sum(self.bad_list[(self.down<=candidate[i])]) 100 | count_up = sum(self.count_list[(self.down<=candidate[i])]) 101 | bad_down = sum(self.bad_list[(self.down>candidate[i])]) 102 | count_down = sum(self.count_list[(self.down>candidate[i])]) 103 | elif start == '' and end != '': 104 | bad_up = sum(self.bad_list[(self.down<=candidate[i])]) 105 | count_up = sum(self.count_list[(self.down<=candidate[i])]) 106 | bad_down = sum(self.bad_list[(self.down>candidate[i]) & (self.down<=end)]) 107 | count_down = sum(self.count_list[(self.down>candidate[i]) & (self.down<=end)]) 108 | elif start != '' and end == '': 109 | bad_up = sum(self.bad_list[(self.down<=candidate[i]) & (self.down>start)]) 110 | count_up = sum(self.count_list[(self.down<=candidate[i]) & (self.down>start)]) 111 | bad_down = sum(self.bad_list[(self.down>candidate[i])]) 112 | count_down = sum(self.count_list[(self.down>candidate[i])]) 113 | elif start != '' and end != '': 114 | bad_up = sum(self.bad_list[(self.down<=candidate[i]) & (self.down>start)]) 115 | count_up = sum(self.count_list[(self.down<=candidate[i]) & (self.down>start)]) 116 | bad_down = sum(self.bad_list[(self.down>candidate[i]) & (self.down<=end)]) 117 | count_down = sum(self.count_list[(self.down>candidate[i]) & (self.down<=end)]) 118 | 119 | if count_down == 0: 120 | rate_down = 0 121 | else: 122 | rate_down = bad_down/count_down 123 | if count_up == 0: 124 | rate_up = 0 125 | else: 126 | rate_up = bad_up/count_up 127 | 128 | rate = rate_down - rate_up 129 | 130 | if trend=='up': 131 | if rate >=0: 132 | if not result_rate: 133 | result_rate = rate 134 | result_cut = candidate[i] 135 | if rate > result_rate and candidate[i] not in self.cut_range: 136 | result_rate = rate 137 | result_cut = candidate[i] 138 | elif trend=='down': 139 | if rate <=0: 140 | if not result_rate: 141 | result_rate = rate 142 | result_cut = candidate[i] 143 | if rate < result_rate and candidate[i] not in self.cut_range: 144 | result_rate = rate 145 | result_cut = candidate[i] 146 | #print(result_cut, result_rate) 147 | return result_cut, result_rate 148 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from autoBinning.utils.forwardSplit import * 3 | from autoBinning.utils.trendSplit import * 4 | from autoBinning.utils.simpleMethods import * 5 | from autoBinning.utils.trendDiscretization import * 6 | from autoBinning.utils.mapa import * 7 | 8 | import numpy as np 9 | import pandas as pd 10 | 11 | 12 | def sampleTest(): 13 | #my_list = [1,1,2,2,2,2,3,3,4,5,6,7,8,9] 14 | my_list = [1,1,2,2,2,2,3,3,4,5,6,7,8,9,10,10,20,20,20,20,30,30,40,50,60,70,80,90,100] 15 | my_list_y = [1,1,2,2,2,2,1,1,1,2,2,2,1,1] 16 | t = simpleMethods(my_list) # [ 1. 5.33333333 20. 100. ] 17 | t.equalSize(3) 18 | trans = np.digitize(my_list, t.bins) 19 | print(t.bins) 20 | print(trans) 21 | t.equalValue(4) 22 | trans = np.digitize(my_list, t.bins) 23 | print(t.bins) 24 | print(trans) 25 | t.equalHist(4) 26 | trans = np.digitize(my_list, t.bins) 27 | print(t.bins) 28 | print(trans) 29 | 30 | 31 | def distest(): 32 | my_list = [1,1,2,2,2,2,3,3,4,5,6,7,8,9,10,10,20,20,20,20,30,30,40,50,60,70,80,90,100] 33 | my_list_y = [1,1,0,0,0,1,0,0,1,1,1,0,1,1,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1] 34 | t = trendDisMethod(my_list, my_list_y) 35 | t.fit() 36 | trans = np.digitize(my_list, t.bins) 37 | print(t.bins) 38 | print(trans) 39 | 40 | def trend_test_by_data(): 41 | df = pd.read_csv('credit_old.csv') 42 | df = df[['Age','target']] 43 | df = df.dropna() 44 | 45 | t = trendDisMethod(df['Age'], df['target']) 46 | t.fit(trend='down') 47 | print(df['Age'].describe()) 48 | print(t.bins) 49 | #print(df['Age'].describe()) 50 | 51 | def forward_woe_test(): 52 | df = pd.read_csv('credit_old.csv') 53 | df = df[['Age','target']] 54 | df = df.dropna() 55 | 56 | t = forwardSplit(df['Age'], df['target']) 57 | t.fit(sby='woe',minv=0.01,init_split=20) 58 | print(t.bins) # [16. 25. 29. 33. 36. 38. 40. 42. 44. 46. 48. 50. 52. 54. 55. 58. 60. 63. 72. 94.] 59 | t = forwardSplit(df['Age'], df['target'],missing=16) 60 | t.fit(sby='woe',num_split=4,init_split=20) 61 | print(t.bins) 62 | t = forwardSplit(df['Age'], df['target']) 63 | t.fit(sby='woe',num_split=4,init_split=20) 64 | print(t.bins) # [16. 42. 44. 48. 50. 94.] 65 | print("bin\twoe") 66 | for i in range(len(t.bins)-1): 67 | v = t.value[(t.x < t.bins[i+1]) & (t.x >= t.bins[i])] 68 | woe = t._cal_woe(v) 69 | print((t.bins[i], t.bins[i+1]),woe) 70 | # {(16.0, 25.0): 0.11373232830301286, (25.0, 42.0): 0.07217546872710079, (42.0, 50.0): 0.04972042405868509, (50.0, 72.0): -0.07172614369435065, (72.0, 94.0): -0.13778318584223453} 71 | 72 | def forward_iv_test(): 73 | df = pd.read_csv('credit_old.csv') 74 | df = df[['Age','target']] 75 | df = df.dropna() 76 | 77 | t = forwardSplit(df['Age'], df['target']) 78 | t.fit(sby='iv',minv=0.1,init_split=20) 79 | print(t.bins) # [16. 25. 29. 33. 36. 38. 40. 42. 44. 46. 48. 50. 58. 60. 63. 94.] 80 | t = forwardSplit(df['Age'], df['target']) 81 | t.fit(sby='iv',num_split=4,init_split=20,min_sample=len(df)*0.2) 82 | print(t.bins) # [16. 38. 50. 94.] 83 | t.fit(sby='woeiv',num_split=4,init_split=20) 84 | print(t.bins) # [16. 25. 33. 36. 38. 94.] 85 | print("bin\twoe") 86 | for i in range(len(t.bins)-1): 87 | v = t.value[(t.x < t.bins[i+1]) & (t.x >= t.bins[i])] 88 | woe = t._cal_woe(v) 89 | print((t.bins[i], t.bins[i+1]),woe) 90 | 91 | def backward_iv_test(): 92 | df = pd.read_csv('credit_old.csv') 93 | df = df[['Age','target']] 94 | df = df.dropna() 95 | 96 | t = backwardSplit(df['Age'], df['target']) 97 | t.fit(sby='iv',num_split=5) 98 | print(t.bins) # [16. 17.5 18.5 85.5 95. ] 99 | woe_dict = {} 100 | for i in range(len(t.bins)-1): 101 | v = t.value[(t.x < t.bins[i+1]) & (t.x >= t.bins[i])] 102 | woe = t._cal_woe(v) 103 | woe_dict[(t.bins[i], t.bins[i+1])] = woe 104 | print(woe_dict) 105 | 106 | def backward_chi_test(): 107 | df = pd.read_csv('credit_old.csv') 108 | df = df[['Age','target']] 109 | df = df.dropna() 110 | 111 | t = backwardSplit(df['Age'], df['target']) 112 | t.fit(sby='chi',num_split=7) 113 | print(t.bins) # [16. 72.5 73.5 87.5 89.5 90.5 95. ] 114 | print("bin\twoe") 115 | for i in range(len(t.bins)-1): 116 | v = t.value[(t.x < t.bins[i+1]) & (t.x >= t.bins[i])] 117 | woe = t._cal_woe(v) 118 | print((t.bins[i], t.bins[i+1]),woe) 119 | #woe_dict[(t.bins[i], t.bins[i+1])] = woe 120 | 121 | def forward_iv_test2(): 122 | df = pd.read_csv('resolution.csv') 123 | df = df.dropna() 124 | 125 | t = forwardSplit(df['x'], df['y'],missing=-1) 126 | t.fit(sby='woeiv',minv=0.1,init_split=0,num_split=4) 127 | print(t.bins) 128 | print("bin\twoe") 129 | for i in range(len(t.bins)-1): 130 | v = t.value[(t.x < t.bins[i+1]) & (t.x >= t.bins[i])] 131 | woe = t._cal_woe(v) 132 | print((t.bins[i], t.bins[i+1]),woe) 133 | 134 | def MAPA_test(): 135 | df = pd.read_csv('credit_old.csv') 136 | df = df[['Age','target']] 137 | df = df.dropna() 138 | 139 | t = MAPA(df['Age'], df['target']) 140 | t.fit(trend='up',sby='woe') 141 | print(t.bins) 142 | print("bin\twoe") 143 | for i in range(len(t.bins)-1): 144 | v = t.value[(t.x < t.bins[i+1]) & (t.x >= t.bins[i])] 145 | woe = t._cal_woe(v) 146 | print((t.bins[i], t.bins[i+1]),woe) 147 | 148 | def spearman_test(): 149 | df = pd.read_csv('credit_old.csv') 150 | df = df[['Age','target']] 151 | df = df.dropna() 152 | 153 | t = backwardSplit(df['Age'], df['target']) 154 | t.fit_by_spearman(min_v=5, init_split=20) 155 | print(t.bins) 156 | print("bin\twoe") 157 | for i in range(len(t.bins)-1): 158 | v = t.value[(t.x < t.bins[i+1]) & (t.x >= t.bins[i])] 159 | woe = t._cal_woe(v) 160 | print((t.bins[i], t.bins[i+1]),woe) 161 | 162 | def forward_woe_test_cat(): 163 | df = pd.read_csv('credit_old.csv') 164 | df = df[['Branch','target']] 165 | df = df.dropna() 166 | 167 | t = forwardSplit(df['Branch'], df['target'],categorical=True) 168 | t.fit(sby='woe',minv=0.01,init_split=0,num_split=4) 169 | print(t.bins) 170 | for i in range(len(t.bins)): 171 | v = t.value[np.isin(t.x_idx,t.bins[i])] 172 | woe = t._cal_woe(v) 173 | print(t.bins[i],woe) 174 | 175 | def forward_iv_test2_cat(): 176 | df = pd.read_csv('credit_old.csv') 177 | df = df[['Branch','target']] 178 | df = df.dropna() 179 | 180 | t = forwardSplit(df['Branch'], df['target'],missing=-1,categorical=True) 181 | t.fit(sby='woeiv',minv=0,init_split=0,num_split=4) 182 | print(t.bins) 183 | print("bin\twoe") 184 | for i in range(len(t.bins)): 185 | v = t.value[np.isin(t.x_idx,t.bins[i])] 186 | woe = t._cal_woe(v) 187 | print(t.bins[i],woe) 188 | 189 | 190 | def main(): 191 | #forward_woe_test() 192 | #forward_iv_test2() 193 | #sampleTest() 194 | #forward_iv_test() 195 | #backward_chi_test() 196 | #MAPA_test() 197 | #spearman_test() 198 | forward_iv_test2_cat() 199 | 200 | if __name__ == "__main__": 201 | main() --------------------------------------------------------------------------------