def _pair_chi2(left, right):
    """Return the 2x2 chi-square statistic for two adjacent intervals.

    ``left`` and ``right`` are rows of the working table laid out as
    ``[upper_bound, positive_count, negative_count]``.
    """
    a, b = left[1], left[2]
    c, d = right[1], right[2]
    denominator = (a + b) * (c + d) * (a + c) * (b + d)
    if denominator == 0:
        # A zero marginal means the two intervals cannot be told apart by
        # this statistic; report 0 so the pair is merged first instead of
        # letting a NaN/inf poison the minimum search.
        return 0.0
    return (a * d - b * c) ** 2 * (a + b + c + d) / denominator


def _merge_rows(table, index):
    """Fold row ``index + 1`` into row ``index`` and return the shrunk table.

    The merged row keeps the upper bound of the absorbed row and the summed
    positive/negative counts.
    """
    table[index, 1] = table[index, 1] + table[index + 1, 1]
    table[index, 2] = table[index, 2] + table[index + 1, 2]
    table[index, 0] = table[index + 1, 0]
    return np.delete(table, index + 1, 0)


def ChiMerge(df, variable, flag, confidenceVal=3.841, bin=10, sample=None):
    """Supervised chi-square (ChiMerge) binning of one variable.

    Adjacent intervals are merged, lowest chi-square first, until the number
    of intervals is at most ``bin`` AND every adjacent pair has a chi-square
    of at least ``confidenceVal`` (or until a single interval is left).

    Requires ``pandas`` as ``pd`` and ``numpy`` as ``np`` in scope.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the column to bin and the label column
        (positive sample = 1, negative sample = 0).
    variable : str
        Name of the column to bin.
    flag : str
        Name of the positive/negative label column.
    confidenceVal : float
        Chi-square threshold. The default 3.841 is the critical value at
        95% confidence with one degree of freedom.
    bin : int
        Maximum number of intervals to keep.
    sample : int or None
        If given, bin a random sample of this many rows (useful when the
        full data set is slow to process). ``None`` uses every row.

    Returns
    -------
    pandas.DataFrame
        One row per interval with columns ``variable`` (the variable name),
        ``interval`` (label text), ``flag_0`` (negative count) and
        ``flag_1`` (positive count). The first label always starts at the
        literal ``'0'``; the last label is open-ended (``'<lower>+'``).
    """
    # Optional down-sampling.
    if sample is not None:
        df = df.sample(n=sample)

    # Per distinct value of `variable`: positive count and negative count.
    grouped = df.groupby([variable])[flag]
    regroup = pd.DataFrame({
        'positive_class': grouped.sum(),
        'total_num': grouped.count(),
    })
    regroup['negative_class'] = regroup['total_num'] - regroup['positive_class']
    regroup = regroup.drop('total_num', axis=1).reset_index()
    # Working table columns: [upper_bound, positive_count, negative_count].
    np_regroup = np.array(regroup)
    print('已完成数据读入,正在计算数据初处理')

    # Pre-merge neighbours that both lack positives or both lack negatives,
    # otherwise their chi-square would have a zero denominator.
    i = 0
    while i <= np_regroup.shape[0] - 2:
        both_without_pos = np_regroup[i, 1] == 0 and np_regroup[i + 1, 1] == 0
        both_without_neg = np_regroup[i, 2] == 0 and np_regroup[i + 1, 2] == 0
        if both_without_pos or both_without_neg:
            # Stay on the same row: it must be re-checked against its new
            # right-hand neighbour.
            np_regroup = _merge_rows(np_regroup, i)
        else:
            i += 1

    # chi_table[j] is the chi-square of the pair (row j, row j + 1).
    chi_table = np.array(
        [_pair_chi2(np_regroup[j], np_regroup[j + 1])
         for j in range(np_regroup.shape[0] - 1)],
        dtype=float,
    )
    print('已完成数据初处理,正在进行卡方分箱核心操作')

    # Core loop: repeatedly merge the pair with the smallest chi-square.
    # An empty chi_table means a single interval is left, so there is
    # nothing more to merge (previously this raised on min() of an empty
    # sequence).
    while chi_table.size > 0:
        if chi_table.size <= bin - 1 and chi_table.min() >= confidenceVal:
            break
        k = int(np.argmin(chi_table))  # first occurrence of the minimum
        np_regroup = _merge_rows(np_regroup, k)
        # The pair (k, k + 1) no longer exists; dropping its entry shifts
        # the later pairs down so chi_table[j] again describes (j, j + 1).
        chi_table = np.delete(chi_table, k)
        # Refresh the two pairs that touch the merged row. The guards matter:
        # for k == 0 there is no left neighbour, and an unguarded
        # chi_table[k - 1] would overwrite the LAST entry via index -1.
        if k > 0:
            chi_table[k - 1] = _pair_chi2(np_regroup[k - 1], np_regroup[k])
        if k < chi_table.size:
            chi_table[k] = _pair_chi2(np_regroup[k], np_regroup[k + 1])
    print('已完成卡方分箱核心操作,正在保存结果')

    # Assemble the result frame.
    n_intervals = np_regroup.shape[0]
    labels = []
    for i in range(n_intervals):
        if i == 0:
            label = '0' + ',' + str(np_regroup[i, 0])
        elif i == n_intervals - 1:
            label = str(np_regroup[i - 1, 0]) + '+'
        else:
            label = str(np_regroup[i - 1, 0]) + ',' + str(np_regroup[i, 0])
        labels.append(label)

    result_data = pd.DataFrame()
    result_data['variable'] = [variable] * n_intervals  # column 1: variable name
    result_data['interval'] = labels                    # column 2: interval label
    result_data['flag_0'] = np_regroup[:, 2]            # column 3: negative count
    result_data['flag_1'] = np_regroup[:, 1]            # column 4: positive count

    return result_data
ChiMerge(temp, 'x','y', confidenceVal=3.841, bin=10,sample=None) 109 | bins -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ChiMerge 2 | 对数据框中的某个变量进行有监督的分箱操作 3 | --------------------------------------------------------------------------------