├── IGN_UI.py ├── README.md ├── RiskModeler 使用文档.docx ├── __init__.py ├── base.py ├── dataset ├── echarts.min.js ├── func.py ├── funcc.py ├── inputdata.py ├── interactive_grouping.py ├── load_node.py ├── model.py ├── model_result_ui.py ├── model_ui.py ├── policy.py ├── render.html ├── sampling.py ├── score_result_ui.py ├── score_ui.py ├── split.py ├── start.py ├── test.py └── var_clus.py /README.md: -------------------------------------------------------------------------------- 1 | # RiskModeler 2 | A risk scorecard development tool; welcome to use it 3 | For more content, follow the WeChat official account 人工智障风控 4 | -------------------------------------------------------------------------------- /RiskModeler 使用文档.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nothingyang/RiskModeler/9192aee2d886c6f109b07e25abf7c4ae66d6e209/RiskModeler 使用文档.docx -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | from .start import scorecard 2 | 3 | # h=scorecard() -------------------------------------------------------------------------------- /dataset: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /func.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from .base import group_func 3 | import numpy as np 4 | from joblib import Parallel, delayed 5 | import joblib 6 | 7 | import math 8 | group_func=group_func() 9 | 10 | class binning(): 11 | def fit_bin(self, data, varnum, varchar, target, s_bin_num=20, special_code=pd.DataFrame(), min_num=500, min_pct=0.05, n_job=None, criterion='entropy', splitter='best', max_depth=6, min_samples_leaf=500,max_leaf_nodes=9): 12 | if n_job == None: 13 | n_job = joblib.cpu_count() - 1 14 | # colnum=data[colmns].select_dtypes(include=['float','int8','int16','int32','int64']).columns.values.tolist() 15 | # colchar=data[colmns].select_dtypes(include=['object']).columns.values.tolist() 16 | column = varnum + varchar 17 | lenp = math.ceil(len(column) / (n_job)) 18 | 19 | def func(part): 20 | temp1 = pd.DataFrame() 21 | temp2 = [] 22 | for i in range(lenp * part, min(lenp * (part + 1), len(column))): 23 | col = column[i] 24 | if special_code.empty == False: 25 | specialcode_list = list(special_code[special_code['variable'] == col]['value']) 26 | else: 27 | specialcode_list = [] 28 | inputdata = data[[col, target]] 29 | if col in varchar: 30 | out, outdata = group_func.charvar(inputdata=inputdata, col=col, min_num=min_num, min_pct=min_pct, 31 | target=target, criterion=criterion, splitter=splitter, 32 | max_depth=max_depth, min_samples_leaf=min_samples_leaf, 33 | max_leaf_nodes=max_leaf_nodes) 34 | else: 35 | out, outdata = group_func.numericvar(inputdata=inputdata, col=col, 36 | specialcode_list=specialcode_list, s_bin_num=s_bin_num, 37 | target=target, criterion=criterion, splitter=splitter, 38 | max_depth=max_depth, min_samples_leaf=min_samples_leaf, 39 | max_leaf_nodes=max_leaf_nodes) 40 | out = out.drop_duplicates() 41 | outdata = outdata.drop_duplicates() 42 | temp1 = pd.concat([temp1, out]) 43 | temp2.append(outdata) 44 | return temp1, temp2 45 | 46 | # if len(data.columns)*len(data)>3*10**8: 47 | # print('your dataset is quite large, Parallel may not work; n_job=1 is recommended') 48 | results = Parallel(n_jobs=n_job, max_nbytes=None, 
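# Fan-out step: each of the n_job workers runs func(part) on its own slice of
# `column` (indices lenp*part up to lenp*(part+1)) and returns that slice's
# group_info rows plus a list of per-column grouped frames; max_nbytes=None
# disables joblib's memmapping threshold so the sliced DataFrames are pickled
# to the workers whole.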
verbose=5)(delayed(func)(part) for part in range(n_job)) 49 | self.group_info = pd.DataFrame() 50 | self.group_info_data = data.copy() 51 | for i in range(len(results)): 52 | te = results[i][0] 53 | self.group_info = pd.concat([self.group_info, te], sort=True) 54 | tf = results[i][1] 55 | for pp in range(len(tf)): 56 | one = tf[pp] 57 | self.group_info_data = pd.merge(self.group_info_data, one, how='left') 58 | return self.group_info, self.group_info_data 59 | 60 | def fit_bin_aprior(self, data, varnum, target, s_bin_num=20, special_code=pd.DataFrame(), n_job=None): 61 | if n_job == None: 62 | n_job = joblib.cpu_count() - 1 63 | # colnum=data[colmns].select_dtypes(include=['float','int8','int16','int32','int64']).columns.values.tolist() 64 | # colchar=data[colmns].select_dtypes(include=['object']).columns.values.tolist() 65 | column = varnum 66 | lenp = math.ceil(len(column) / (n_job)) 67 | 68 | def func(part): 69 | temp1 = pd.DataFrame() 70 | temp2 = [] 71 | for i in range(lenp * part, min(lenp * (part + 1), len(column))): 72 | col = column[i] 73 | if special_code.empty == False: 74 | specialcode_list = list(special_code[special_code['variable'] == col]['value']) 75 | else: 76 | specialcode_list = [] 77 | inputdata = data[[col, target]] 78 | 79 | out, outdata = group_func.numericvar_apior(inputdata=inputdata, col=col, 80 | specialcode_list=specialcode_list, s_bin_num=s_bin_num, 81 | target=target) 82 | out = out.drop_duplicates() 83 | outdata = outdata.drop_duplicates() 84 | temp1 = pd.concat([temp1, out]) 85 | temp2.append(outdata) 86 | return temp1, temp2 87 | 88 | # if len(data.columns)*len(data)>3*10**8: 89 | # print('your dataset is quiet large Parallel may not work , n_job=1 is recommend') 90 | results = Parallel(n_jobs=n_job, max_nbytes=None, verbose=5)(delayed(func)(part) for part in range(n_job)) 91 | self.group_info = pd.DataFrame() 92 | self.group_info_data = data.copy() 93 | for i in range(len(results)): 94 | te = results[i][0] 95 | self.group_info = pd.concat([self.group_info, te], sort=True) 96 | tf = results[i][1] 97 | for pp in range(len(tf)): 98 | one = tf[pp] 99 | self.group_info_data = pd.merge(self.group_info_data, one, how='left') 100 | return self.group_info, self.group_info_data 101 | 102 | def fit_bin_existing(self, data, varnum, varchar, target, group_info, data_only=False, n_job=None): 103 | 104 | if n_job == None: 105 | n_job = joblib.cpu_count() - 2 106 | data_col = list(data.columns) 107 | group_col = varnum + varchar 108 | column = list(set(data_col).intersection(set(group_col))) 109 | lenp = math.ceil(len(column) / (n_job)) 110 | 111 | 112 | def func(part): 113 | temp1 = pd.DataFrame() 114 | temp2 = [] 115 | for i in range(lenp * part, min(lenp * (part + 1), len(column))): 116 | col = column[i] 117 | if data_only == False: 118 | inputdata = data[[col, target]] 119 | else: 120 | inputdata = data[[col]] 121 | group_info_col = group_info[group_info['variable_name'] == col] 122 | if data_only == False: 123 | if col in varchar: 124 | 125 | out, outdata = group_func.charvarexist(group_info_old=group_info_col, data_only=data_only, 126 | inputdata=inputdata, col=col, target=target) 127 | else: 128 | out, outdata = group_func.numericexist(group_info_old=group_info_col, data_only=data_only, 129 | inputdata=inputdata, col=col, target=target, modify=False, 130 | add_value=0) 131 | out = out.drop_duplicates() 132 | outdata = outdata.drop_duplicates() 133 | temp1 = pd.concat([temp1, out]) 134 | temp2.append(outdata) 135 | else: 136 | if col in varchar: 137 | 138 | outdata 
= group_func.charvarexist(group_info_old=group_info_col, data_only=data_only, 139 | inputdata=inputdata, col=col, target=target) 140 | else: 141 | outdata = group_func.numericexist(group_info_old=group_info_col, data_only=data_only, 142 | inputdata=inputdata, col=col, target=target, modify=False, 143 | add_value=0) 144 | 145 | outdata = outdata.drop_duplicates() 146 | temp2.append(outdata) 147 | # return outdata 148 | if data_only==False: 149 | return temp1, temp2 150 | else : 151 | return temp2 152 | 153 | results = Parallel(n_jobs=n_job, max_nbytes=None, verbose=5)(delayed(func)(part) for part in range(n_job)) 154 | if data_only == False: 155 | self.group_info = pd.DataFrame() 156 | self.group_info_data = data.copy() 157 | for i in range(len(results)): 158 | te = results[i][0] 159 | self.group_info = pd.concat([self.group_info, te], sort=True) 160 | tf = results[i][1] 161 | for pp in range(len(tf)): 162 | one = tf[pp] 163 | self.group_info_data = pd.merge(self.group_info_data, one, how='left') 164 | return self.group_info, self.group_info_data 165 | else: 166 | 167 | self.group_info_data = data.copy() 168 | for i in range(len(results)): 169 | tf = results[i] 170 | for pp in range(len(tf)): 171 | one = tf[pp] 172 | self.group_info_data = pd.merge(self.group_info_data, one, how='left') 173 | return self.group_info_data 174 | 175 | 176 | def report(self, group_info, varnum, varchar): 177 | col=varnum+varchar 178 | 179 | group_info['miss_rate'] = group_info['miss_count'] / (group_info['miss_count'] + group_info['count']) 180 | group_info['total_count'] = (group_info['miss_count'] + group_info['count']) 181 | group_info = group_info[group_info['variable_name'].isin(col)] 182 | if len(varnum) < 1: 183 | base = group_info[['variable_name', 'f_group', 'f_Bad_rate', 'f_N_bad', 'f_N_obs', 'woe', 'iv', 184 | 'miss_rate']].drop_duplicates() 185 | 186 | else: 187 | base = group_info[ 188 | ['variable_name', 'f_group', 'f_Bad_rate', 'f_N_bad', 'f_N_obs', 'woe', 'iv', 'miss_rate']].drop_duplicates() 189 | if len(varnum) >= 1: 190 | num_data_report = group_info[group_info['variable_name'].isin(varnum)] 191 | label_num = num_data_report.groupby(['variable_name', 'f_group']).agg( 192 | {'s_min': 'min', 's_max': 'max', 'miss_s': 'max'}).reset_index().rename( 193 | {'s_min': 'f_minlabel', 's_max': 'f_maxlabel', 'miss_s': 'miss_f'}, 194 | axis=1) 195 | label_num['f_minlabel'] = round(label_num['f_minlabel'], 4) 196 | label_num['f_maxlabel'] = round(label_num['f_maxlabel'], 4) 197 | label_num['labelA'] = label_num.apply( 198 | lambda x: '%s < %s <= %s ' % (x['f_minlabel'], x['variable_name'], x['f_maxlabel']), axis=1) 199 | if len(num_data_report[num_data_report['value'].isnull() == False])>0: 200 | label_num_miss = num_data_report[num_data_report['value'].isnull() == False] 201 | label_num_miss['value'] = label_num_miss['value'].astype('str') 202 | label_num_miss['value'] = label_num_miss['value'] + ' ; ' 203 | label_num_miss = label_num_miss.groupby(['variable_name', 'f_group'])['value'].sum().reset_index().rename( 204 | {'value': 'labelB'}, axis=1) 205 | label = pd.merge(label_num, label_num_miss, how='left', on=['variable_name', 'f_group']) 206 | label = label.astype({'labelB': 'str'}) 207 | else: 208 | label = label_num 209 | label['label'] = label.apply( 210 | lambda x: x['labelB'] if (np.isnan(x['f_maxlabel'])) & (np.isnan(x['f_minlabel'])) & ( 211 | x['miss_f'] == True) 212 | else x['labelA'] if (np.isnan(x['f_maxlabel']) == False) & (np.isnan(x['f_minlabel']) == False) & ( 213 | x['miss_f'] == False) 
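# Coarse-bin (f_group) label selection: a group holding only special/missing
# values keeps labelB (the joined special-value list); a purely numeric group
# keeps labelA (the range string); the final branch below joins both for mixed
# groups, giving e.g. "[-999.0 ; ]0.5 < age <= 30" (column name illustrative).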
214 | else '[' + str(x['labelB']) + ']' + x['labelA'] if (np.isnan(x['f_maxlabel']) == False) & ( 215 | np.isnan(x['f_minlabel']) == False) & (x['miss_f'] == True) 216 | else '', axis=1) 217 | 218 | labellist_num = label[['variable_name', 'f_group', 'label']] 219 | else: 220 | labellist_num = pd.DataFrame() 221 | if len(varchar) >= 1: 222 | char_data_report = group_info[group_info['variable_name'].isin(varchar)] 223 | char_data_report['value'] = char_data_report['value'].astype('str') 224 | char_data_report['value'] = char_data_report['value'] + ' ; ' 225 | labellist_char = char_data_report.groupby(['variable_name', 'f_group'])['value'].sum().reset_index().rename( 226 | {'value': 'label'}, axis=1) 227 | else: 228 | labellist_char = pd.DataFrame() 229 | label = pd.concat([labellist_char, labellist_num]) 230 | label["f_group"] = pd.to_numeric(label["f_group"]) 231 | reportA = pd.merge(base, label, how='left', on=['variable_name', 'f_group']) 232 | reportA = reportA.sort_values(by='iv', ascending=False) 233 | reportA['f_Bad_rate'] = reportA.apply(lambda x: "%.2f%%" % (x['f_Bad_rate'] * 100), axis=1) 234 | reportA['miss_rate'] = reportA.apply(lambda x: "%.2f%%" % (x['miss_rate'] * 100), axis=1) 235 | 236 | f_group_report = reportA 237 | 238 | # s_group 239 | group_info['miss_rate'] = group_info['miss_count'] / (group_info['miss_count'] + group_info['count']) 240 | group_info['total_count'] = (group_info['miss_count'] + group_info['count']) 241 | 242 | 243 | if len(varnum) < 1: 244 | base = group_info[['variable_name', 's_Bad_rate', 's_N_bad', 's_N_obs', 'f_group', 'value', 245 | 'miss_s']].drop_duplicates() 246 | 247 | else: 248 | base = group_info[['variable_name', 's_group', 's_Bad_rate', 's_N_bad', 's_N_obs', 'f_group', 'value', 249 | 'miss_s']].drop_duplicates() 250 | 251 | if len(varnum) >= 1: 252 | num_data_report = group_info[group_info['variable_name'].isin(varnum)] 253 | label_num = num_data_report.groupby(['variable_name', 's_group', 'miss_s']).agg( 254 | {'s_min': 'min', 's_max': 'max'}).reset_index().rename({'s_min': 's_minlabel', 's_max': 's_maxlabel'}, 255 | axis=1) 256 | label_num['s_minlabel'] = round(label_num['s_minlabel'], 4) 257 | label_num['s_maxlabel'] = round(label_num['s_maxlabel'], 4) 258 | label_num['labelA'] = label_num.apply( 259 | lambda x: '%s < %s <= %s ' % (x['s_minlabel'], x['variable_name'], x['s_maxlabel']), axis=1) 260 | if len(num_data_report[num_data_report['value'].isnull() == False])>0: 261 | label_num_miss = num_data_report[num_data_report['value'].isnull() == False] 262 | label_num_miss['value'] = label_num_miss['value'].astype('str') 263 | label_num_miss['value'] = label_num_miss['value'] + ' ; ' 264 | label_num_miss = label_num_miss.groupby(['variable_name', 's_group'])['value'].sum().reset_index().rename( 265 | {'value': 'labelB'}, axis=1) 266 | label = pd.merge(label_num, label_num_miss, how='left', on=['variable_name', 's_group']) 267 | else: 268 | label = label_num 269 | label['label'] = label.apply( 270 | lambda x: x['labelB'] if (np.isnan(x['s_maxlabel'])) & (np.isnan(x['s_minlabel'])) & ( 271 | x['miss_s'] == True) 272 | else x['labelA'] if (np.isnan(x['s_maxlabel']) == False) & (np.isnan(x['s_minlabel']) == False) & ( 273 | x['miss_s'] == False) 274 | else '[' + str(x['labelB']) + ']' + x['labelA'] if (np.isnan(x['s_maxlabel']) == False) & ( 275 | np.isnan(x['s_minlabel']) == False) & (x['miss_s'] == True) 276 | else '', axis=1) 277 | labellist_num = label[['variable_name', 's_group', 'label']] 278 | labellist_num["s_group"] = 
pd.to_numeric(labellist_num["s_group"]) 279 | reportA = pd.merge(base, labellist_num, how='right', on=['variable_name', 's_group']) 280 | else: 281 | reportA = pd.DataFrame() 282 | if len(varchar) >= 1: 283 | char_data_report = group_info[group_info['variable_name'].isin(varchar)] 284 | char_data_report['value'] = char_data_report['value'].astype('str') 285 | labellist_char = char_data_report.groupby(['variable_name'])['value'].sum().reset_index() 286 | reportB = char_data_report[ 287 | ['variable_name', 'f_group', 's_Bad_rate', 's_N_bad', 's_N_obs', 'value', 'miss_s']] 288 | reportB['label'] = reportB['value'] 289 | else: 290 | reportB = pd.DataFrame() 291 | 292 | ss = pd.concat([reportA, reportB]) 293 | ss['s_Bad_rate'] = ss.apply(lambda x: "%.2f%%" % (x['s_Bad_rate'] * 100), axis=1) 294 | s_group_report = pd.merge(ss, f_group_report.drop(columns=['label']), how='left', 295 | on=['variable_name', 'f_group']) 296 | return s_group_report, f_group_report -------------------------------------------------------------------------------- /funcc.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from base import group_func 3 | import numpy as np 4 | from joblib import Parallel, delayed 5 | import joblib 6 | 7 | import math 8 | group_func=group_func() 9 | 10 | class binning(): 11 | def fit_bin(self, data, varnum, varchar, target, s_bin_num=20, special_code=pd.DataFrame(), min_num=500, min_pct=0.05, n_job=None, criterion='entropy', splitter='best', max_depth=6, min_samples_leaf=500,max_leaf_nodes=9): 12 | if n_job == None: 13 | n_job = joblib.cpu_count() - 1 14 | # colnum=data[colmns].select_dtypes(include=['float','int8','int16','int32','int64']).columns.values.tolist() 15 | # colchar=data[colmns].select_dtypes(include=['object']).columns.values.tolist() 16 | column = varnum + varchar 17 | lenp = math.ceil(len(column) / (n_job)) 18 | 19 | def func(part): 20 | temp1 = pd.DataFrame() 21 | temp2 = [] 22 | for i in range(lenp * part, min(lenp * (part + 1), len(column))): 23 | col = column[i] 24 | if special_code.empty == False: 25 | specialcode_list = list(special_code[special_code['variable'] == col]['value']) 26 | else: 27 | specialcode_list = [] 28 | inputdata = data[[col, target]] 29 | if col in varchar: 30 | out, outdata = group_func.charvar(inputdata=inputdata, col=col, min_num=min_num, min_pct=min_pct, 31 | target=target, criterion=criterion, splitter=splitter, 32 | max_depth=max_depth, min_samples_leaf=min_samples_leaf, 33 | max_leaf_nodes=max_leaf_nodes) 34 | else: 35 | out, outdata = group_func.numericvar(inputdata=inputdata, col=col, 36 | specialcode_list=specialcode_list, s_bin_num=s_bin_num, 37 | target=target, criterion=criterion, splitter=splitter, 38 | max_depth=max_depth, min_samples_leaf=min_samples_leaf, 39 | max_leaf_nodes=max_leaf_nodes) 40 | out = out.drop_duplicates() 41 | outdata = outdata.drop_duplicates() 42 | temp1 = pd.concat([temp1, out]) 43 | temp2.append(outdata) 44 | return temp1, temp2 45 | 46 | # if len(data.columns)*len(data)>3*10**8: 47 | # print('your dataset is quiet large Parallel may not work , n_job=1 is recommend') 48 | results = Parallel(n_jobs=n_job, max_nbytes=None, verbose=5)(delayed(func)(part) for part in range(n_job)) 49 | self.group_info = pd.DataFrame() 50 | self.group_info_data = data.copy() 51 | for i in range(len(results)): 52 | te = results[i][0] 53 | self.group_info = pd.concat([self.group_info, te], sort=True) 54 | tf = results[i][1] 55 | for pp in range(len(tf)): 56 | one = tf[pp] 
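# Each per-column result is left-merged back onto the copy of `data`; with no
# `on=` given, pandas joins on the columns the two frames share (the original
# variable), so each pass attaches that variable's bin assignment columns.
# A minimal usage sketch (the frame and column names are illustrative only):
#   b = binning()
#   group_info, grouped = b.fit_bin(df, varnum=['age', 'income'],
#                                   varchar=['city'], target='label', n_job=4)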
57 | self.group_info_data = pd.merge(self.group_info_data, one, how='left') 58 | return self.group_info, self.group_info_data 59 | 60 | def fit_bin_aprior(self, data, varnum, target, s_bin_num=20, special_code=pd.DataFrame(), n_job=None): 61 | if n_job == None: 62 | n_job = joblib.cpu_count() - 1 63 | # colnum=data[colmns].select_dtypes(include=['float','int8','int16','int32','int64']).columns.values.tolist() 64 | # colchar=data[colmns].select_dtypes(include=['object']).columns.values.tolist() 65 | column = varnum 66 | lenp = math.ceil(len(column) / (n_job)) 67 | 68 | def func(part): 69 | temp1 = pd.DataFrame() 70 | temp2 = [] 71 | for i in range(lenp * part, min(lenp * (part + 1), len(column))): 72 | col = column[i] 73 | if special_code.empty == False: 74 | specialcode_list = list(special_code[special_code['variable'] == col]['value']) 75 | else: 76 | specialcode_list = [] 77 | inputdata = data[[col, target]] 78 | 79 | out, outdata = group_func.numericvar_apior(inputdata=inputdata, col=col, 80 | specialcode_list=specialcode_list, s_bin_num=s_bin_num, 81 | target=target) 82 | out = out.drop_duplicates() 83 | outdata = outdata.drop_duplicates() 84 | temp1 = pd.concat([temp1, out]) 85 | temp2.append(outdata) 86 | return temp1, temp2 87 | 88 | # if len(data.columns)*len(data)>3*10**8: 89 | # print('your dataset is quiet large Parallel may not work , n_job=1 is recommend') 90 | results = Parallel(n_jobs=n_job, max_nbytes=None, verbose=5)(delayed(func)(part) for part in range(n_job)) 91 | self.group_info = pd.DataFrame() 92 | self.group_info_data = data.copy() 93 | for i in range(len(results)): 94 | te = results[i][0] 95 | self.group_info = pd.concat([self.group_info, te], sort=True) 96 | tf = results[i][1] 97 | for pp in range(len(tf)): 98 | one = tf[pp] 99 | self.group_info_data = pd.merge(self.group_info_data, one, how='left') 100 | return self.group_info, self.group_info_data 101 | 102 | def fit_bin_existing(self, data, varnum, varchar, target, group_info, data_only=False, n_job=None): 103 | 104 | if n_job == None: 105 | n_job = joblib.cpu_count() - 2 106 | data_col = list(data.columns) 107 | group_col = varnum + varchar 108 | column = list(set(data_col).intersection(set(group_col))) 109 | lenp = math.ceil(len(column) / (n_job)) 110 | 111 | 112 | def func(part): 113 | temp1 = pd.DataFrame() 114 | temp2 = [] 115 | for i in range(lenp * part, min(lenp * (part + 1), len(column))): 116 | col = column[i] 117 | if data_only == False: 118 | inputdata = data[[col, target]] 119 | else: 120 | inputdata = data[[col]] 121 | group_info_col = group_info[group_info['variable_name'] == col] 122 | if data_only == False: 123 | if col in varchar: 124 | 125 | out, outdata = group_func.charvarexist(group_info_old=group_info_col, data_only=data_only, 126 | inputdata=inputdata, col=col, target=target) 127 | else: 128 | out, outdata = group_func.numericexist(group_info_old=group_info_col, data_only=data_only, 129 | inputdata=inputdata, col=col, target=target, modify=False, 130 | add_value=0) 131 | out = out.drop_duplicates() 132 | outdata = outdata.drop_duplicates() 133 | temp1 = pd.concat([temp1, out]) 134 | temp2.append(outdata) 135 | else: 136 | if col in varchar: 137 | 138 | outdata = group_func.charvarexist(group_info_old=group_info_col, data_only=data_only, 139 | inputdata=inputdata, col=col, target=target) 140 | else: 141 | outdata = group_func.numericexist(group_info_old=group_info_col, data_only=data_only, 142 | inputdata=inputdata, col=col, target=target, modify=False, 143 | add_value=0) 144 | 145 | 
outdata = outdata.drop_duplicates() 146 | temp2.append(outdata) 147 | # return outdata 148 | if data_only==False: 149 | return temp1, temp2 150 | else : 151 | return temp2 152 | 153 | results = Parallel(n_jobs=n_job, max_nbytes=None, verbose=5)(delayed(func)(part) for part in range(n_job)) 154 | if data_only == False: 155 | self.group_info = pd.DataFrame() 156 | self.group_info_data = data.copy() 157 | for i in range(len(results)): 158 | te = results[i][0] 159 | self.group_info = pd.concat([self.group_info, te], sort=True) 160 | tf = results[i][1] 161 | for pp in range(len(tf)): 162 | one = tf[pp] 163 | self.group_info_data = pd.merge(self.group_info_data, one, how='left') 164 | return self.group_info, self.group_info_data 165 | else: 166 | 167 | self.group_info_data = data.copy() 168 | for i in range(len(results)): 169 | tf = results[i] 170 | for pp in range(len(tf)): 171 | one = tf[pp] 172 | self.group_info_data = pd.merge(self.group_info_data, one, how='left') 173 | return self.group_info_data 174 | 175 | 176 | def report(self, group_info, varnum, varchar): 177 | col=varnum+varchar 178 | 179 | group_info['miss_rate'] = group_info['miss_count'] / (group_info['miss_count'] + group_info['count']) 180 | group_info['total_count'] = (group_info['miss_count'] + group_info['count']) 181 | group_info = group_info[group_info['variable_name'].isin(col)] 182 | if len(varnum) < 1: 183 | base = group_info[['variable_name', 'f_group', 'f_Bad_rate', 'f_N_bad', 'f_N_obs', 'woe', 'iv', 184 | 'miss_rate']].drop_duplicates() 185 | 186 | else: 187 | base = group_info[ 188 | ['variable_name', 'f_group', 'f_Bad_rate', 'f_N_bad', 'f_N_obs', 'woe', 'iv', 'miss_rate']].drop_duplicates() 189 | if len(varnum) >= 1: 190 | num_data_report = group_info[group_info['variable_name'].isin(varnum)] 191 | label_num = num_data_report.groupby(['variable_name', 'f_group']).agg( 192 | {'s_min': 'min', 's_max': 'max', 'miss_s': 'max'}).reset_index().rename( 193 | {'s_min': 'f_minlabel', 's_max': 'f_maxlabel', 'miss_s': 'miss_f'}, 194 | axis=1) 195 | label_num['f_minlabel'] = round(label_num['f_minlabel'], 4) 196 | label_num['f_maxlabel'] = round(label_num['f_maxlabel'], 4) 197 | label_num['labelA'] = label_num.apply( 198 | lambda x: '%s < %s <= %s ' % (x['f_minlabel'], x['variable_name'], x['f_maxlabel']), axis=1) 199 | if len(num_data_report[num_data_report['value'].isnull() == False])>0: 200 | label_num_miss = num_data_report[num_data_report['value'].isnull() == False] 201 | label_num_miss['value'] = label_num_miss['value'].astype('str') 202 | label_num_miss['value'] = label_num_miss['value'] + ' ; ' 203 | label_num_miss = label_num_miss.groupby(['variable_name', 'f_group'])['value'].sum().reset_index().rename( 204 | {'value': 'labelB'}, axis=1) 205 | label = pd.merge(label_num, label_num_miss, how='left', on=['variable_name', 'f_group']) 206 | label = label.astype({'labelB': 'str'}) 207 | else: 208 | label = label_num 209 | label['label'] = label.apply( 210 | lambda x: x['labelB'] if (np.isnan(x['f_maxlabel'])) & (np.isnan(x['f_minlabel'])) & ( 211 | x['miss_f'] == True) 212 | else x['labelA'] if (np.isnan(x['f_maxlabel']) == False) & (np.isnan(x['f_minlabel']) == False) & ( 213 | x['miss_f'] == False) 214 | else '[' + str(x['labelB']) + ']' + x['labelA'] if (np.isnan(x['f_maxlabel']) == False) & ( 215 | np.isnan(x['f_minlabel']) == False) & (x['miss_f'] == True) 216 | else '', axis=1) 217 | 218 | labellist_num = label[['variable_name', 'f_group', 'label']] 219 | else: 220 | labellist_num = pd.DataFrame() 221 | if 
len(varchar) >= 1: 222 | char_data_report = group_info[group_info['variable_name'].isin(varchar)] 223 | char_data_report['value'] = char_data_report['value'].astype('str') 224 | char_data_report['value'] = char_data_report['value'] + ' ; ' 225 | labellist_char = char_data_report.groupby(['variable_name', 'f_group'])['value'].sum().reset_index().rename( 226 | {'value': 'label'}, axis=1) 227 | else: 228 | labellist_char = pd.DataFrame() 229 | label = pd.concat([labellist_char, labellist_num]) 230 | label["f_group"] = pd.to_numeric(label["f_group"]) 231 | reportA = pd.merge(base, label, how='left', on=['variable_name', 'f_group']) 232 | reportA.sort_values(by='iv', ascending=False) 233 | reportA['f_Bad_rate'] = reportA.apply(lambda x: "%.2f%%" % (x['f_Bad_rate'] * 100), axis=1) 234 | reportA['miss_rate'] = reportA.apply(lambda x: "%.2f%%" % (x['miss_rate'] * 100), axis=1) 235 | 236 | f_group_report = reportA 237 | 238 | # s_group 239 | group_info['miss_rate'] = group_info['miss_count'] / (group_info['miss_count'] + group_info['count']) 240 | group_info['total_count'] = (group_info['miss_count'] + group_info['count']) 241 | 242 | 243 | if len(varnum) < 1: 244 | base = group_info[['variable_name', 's_Bad_rate', 's_N_bad', 's_N_obs', 'f_group', 'value', 245 | 'miss_s']].drop_duplicates() 246 | 247 | else: 248 | base = group_info[['variable_name', 's_group', 's_Bad_rate', 's_N_bad', 's_N_obs', 'f_group', 'value', 249 | 'miss_s']].drop_duplicates() 250 | 251 | if len(varnum) >= 1: 252 | num_data_report = group_info[group_info['variable_name'].isin(varnum)] 253 | label_num = num_data_report.groupby(['variable_name', 's_group', 'miss_s']).agg( 254 | {'s_min': 'min', 's_max': 'max'}).reset_index().rename({'s_min': 's_minlabel', 's_max': 's_maxlabel'}, 255 | axis=1) 256 | label_num['s_minlabel'] = round(label_num['s_minlabel'], 4) 257 | label_num['s_maxlabel'] = round(label_num['s_maxlabel'], 4) 258 | label_num['labelA'] = label_num.apply( 259 | lambda x: '%s < %s <= %s ' % (x['s_minlabel'], x['variable_name'], x['s_maxlabel']), axis=1) 260 | if len(num_data_report[num_data_report['value'].isnull() == False])>0: 261 | label_num_miss = num_data_report[num_data_report['value'].isnull() == False] 262 | label_num_miss['value'] = label_num_miss['value'].astype('str') 263 | label_num_miss['value'] = label_num_miss['value'] + ' ; ' 264 | label_num_miss = label_num_miss.groupby(['variable_name', 's_group'])['value'].sum().reset_index().rename( 265 | {'value': 'labelB'}, axis=1) 266 | label = pd.merge(label_num, label_num_miss, how='left', on=['variable_name', 's_group']) 267 | else: 268 | label = label_num 269 | label['label'] = label.apply( 270 | lambda x: x['labelB'] if (np.isnan(x['s_maxlabel'])) & (np.isnan(x['s_minlabel'])) & ( 271 | x['miss_s'] == True) 272 | else x['labelA'] if (np.isnan(x['s_maxlabel']) == False) & (np.isnan(x['s_minlabel']) == False) & ( 273 | x['miss_s'] == False) 274 | else '[' + x['labelB'] + ']' + x['labelA'] if (np.isnan(x['s_maxlabel']) == False) & ( 275 | np.isnan(x['s_minlabel']) == False) & (x['miss_s'] == True) 276 | else '', axis=1) 277 | labellist_num = label[['variable_name', 's_group', 'label']] 278 | labellist_num["s_group"] = pd.to_numeric(labellist_num["s_group"]) 279 | reportA = pd.merge(base, labellist_num, how='right', on=['variable_name', 's_group']) 280 | else: 281 | reportA = pd.DataFrame() 282 | if len(varchar) >= 1: 283 | char_data_report = group_info[group_info['variable_name'].isin(varchar)] 284 | char_data_report['value'] = 
char_data_report['value'].astype('str') 285 | labellist_char = char_data_report.groupby(['variable_name'])['value'].sum().reset_index() 286 | reportB = char_data_report[ 287 | ['variable_name', 'f_group', 's_Bad_rate', 's_N_bad', 's_N_obs', 'value', 'miss_s']] 288 | reportB['label'] = reportB['value'] 289 | else: 290 | reportB = pd.DataFrame() 291 | 292 | ss = pd.concat([reportA, reportB]) 293 | ss['s_Bad_rate'] = ss.apply(lambda x: "%.2f%%" % (x['s_Bad_rate'] * 100), axis=1) 294 | s_group_report = pd.merge(ss, f_group_report.drop(columns=['label']), how='left', 295 | on=['variable_name', 'f_group']) 296 | return s_group_report, f_group_report -------------------------------------------------------------------------------- /inputdata.py: -------------------------------------------------------------------------------- 1 | import tkinter as tk 2 | from tkinter import ttk 3 | from tkinter import * 4 | import pandas as pd 5 | from pandastable import Table 6 | import pickle as pickle 7 | import datetime 8 | import os 9 | from tkinter import filedialog 10 | class inputdata(): 11 | def __init__(self, mainfram, project_info): 12 | self.data_set = pd.DataFrame() 13 | self.data_name = None 14 | self.data_path = None 15 | self.data_role = None 16 | self.data_coding = None 17 | self.first_load = 'Y' 18 | 19 | self.save = 'N' 20 | self.data_variable_setting = pd.DataFrame() 21 | self.summary = pd.DataFrame 22 | 23 | self.node_setting = {'node_type': 'DATA', 24 | 'data_path': self.data_path, 25 | 'data_role': self.data_role, 26 | 'data_variable_setting': self.data_variable_setting.copy(), 27 | 'check_change': [], 28 | 'data_coding': self.data_coding, 29 | 'data_name': self.data_name, 30 | 'node_name': self.data_name, 31 | 'time': None, 32 | 'node_save_path': None, 33 | 'use_node': []} 34 | 35 | self.project_path =os.path.split(project_info[project_info['模块类型'] == 'project']['保存地址'][0])[0] 36 | self.exist_data = list(project_info['模块名字']) 37 | self.master = mainfram 38 | 39 | # self.data_variable_set_ui=Toplevel(self.master) 40 | # self.data_variable_set_ui.withdraw() 41 | def load(self, node_info): 42 | 43 | self.data_path = node_info[0]['data_path'] 44 | self.data_role = node_info[0]['data_role'] 45 | self.data_variable_setting = node_info[0]['data_variable_setting'] 46 | self.data_coding = node_info[0]['data_coding'] 47 | self.data_name = node_info[0]['data_name'] 48 | self.data_time = node_info[0]['time'] 49 | self.check_time = node_info[0]['check_change'] 50 | node_save_path = node_info[0]['node_save_path'] 51 | self.data_set = node_info[1] 52 | self.node_setting = {'node_type': 'DATA', 53 | 'data_path': self.data_path, 54 | 'data_role': self.data_role, 55 | 'data_variable_setting': self.data_variable_setting.copy(), 56 | 'data_coding': self.data_coding, 57 | 'data_name': self.data_name, 58 | 'node_name': self.data_name, 59 | 'time': self.data_time, 60 | 'check_change': self.check_time, 61 | 'node_save_path': node_save_path, 62 | 'use_node': [], 63 | 'use_node_path':[]} 64 | 65 | self.first_load = 'N' 66 | self.save = 'Y' 67 | 68 | 69 | def data_detail(self, event): 70 | tt = Toplevel(self.master) 71 | tt.title(self.data_name) 72 | width = 500 73 | height = 200 74 | screenwidth = tt.winfo_screenwidth() 75 | screenheight = tt.winfo_screenheight() 76 | alignstr = '%dx%d+%d+%d' % (width, height, (screenwidth - width) / 2, (screenheight - height) / 2) 77 | tt.geometry(alignstr) 78 | 79 | 80 | L1 = Label(tt, width=20, text="数据集路径(CSV):") 81 | L1.grid(column=0, row=0, sticky=(W)) 82 | L2 = Label(tt, 
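# data_detail opens a fixed 500x200 Toplevel summarising the node: each grid
# row pairs a caption Label (column 0) with a value Label (column 1), covering
# path, name, encoding, role, row/column counts and variable-type tallies.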
text=self.data_path) 83 | L2.grid(column=1, row=0, sticky=(W)) 84 | 85 | L3 = Label(tt, width=20, text="数据集名称:") 86 | L3.grid(column=0, row=1, sticky=(W)) 87 | L4 = Label(tt, text=self.data_name) 88 | L4.grid(column=1, row=1, sticky=(W)) 89 | 90 | L5 = Label(tt, width=20, text="数据集编码:") 91 | L5.grid(column=0, row=2, sticky=(W)) 92 | L6 = Label(tt, text=self.data_coding) 93 | L6.grid(column=1, row=2, sticky=(W)) 94 | 95 | L7 = Label(tt, width=20, text="数据集角色:") 96 | L7.grid(column=0, row=3, sticky=(W)) 97 | L8 = Label(tt, text=self.data_role) 98 | L8.grid(column=1, row=3, sticky=(W)) 99 | 100 | L9 = Label(tt, width=20, text="数据集样本数:") 101 | L9.grid(column=0, row=4, sticky=(W)) 102 | L10 = Label(tt, text=len(self.data_set)) 103 | L10.grid(column=1, row=4, sticky=(W)) 104 | 105 | L11 = Label(tt, width=20, text="数据集列数:") 106 | L11.grid(column=0, row=5, sticky=(W)) 107 | L12 = Label(tt, text=len(list(self.data_set.columns))) 108 | L12.grid(column=1, row=5, sticky=(W)) 109 | 110 | L12 = Label(tt, width=20, text="数值型自变量:") 111 | L12.grid(column=0, row=6, sticky=(W)) 112 | L13 = Label(tt, text=len(self.data_variable_setting[(self.data_variable_setting['变量角色'] == '自变量') & ( 113 | self.data_variable_setting['变量类型'] == '数值型')])) 114 | L13.grid(column=1, row=6, sticky=(W)) 115 | 116 | L14 = Label(tt, width=20, text="字符型自变量:") 117 | L14.grid(column=0, row=7, sticky=(W)) 118 | L15 = Label(tt, text=len(self.data_variable_setting[(self.data_variable_setting['变量角色'] == '自变量') & ( 119 | self.data_variable_setting['变量类型'] == '字符型')])) 120 | L15.grid(column=1, row=7, sticky=(W)) 121 | 122 | if len(self.data_variable_setting[self.data_variable_setting['变量角色'] == '目标']) == 1: 123 | target = list(self.data_variable_setting[self.data_variable_setting['变量角色'] == '目标']['变量名称'])[0] 124 | 125 | L11 = Label(tt, width=20, text="坏样本数:") 126 | L11.grid(column=0, row=8, sticky=(W)) 127 | L12 = Label(tt, text=self.data_set[target].sum()) 128 | L12.grid(column=1, row=8, sticky=(W)) 129 | 130 | def newdatainput(self): 131 | width = 500 132 | height = 250 133 | screenwidth = self.master.winfo_screenwidth() 134 | screenheight = self.master.winfo_screenheight() 135 | alignstr = '%dx%d+%d+%d' % (width, height, (screenwidth - width) / 2, (screenheight - height) / 2) 136 | self.master.geometry(alignstr) 137 | 138 | def selectExcelfile(): 139 | sfname = filedialog.askopenfilename(title='选择CSV文件', filetypes=[('CSV', '*.csv')]) 140 | self.E1.delete(0, 'end') 141 | self.E1.insert(INSERT, sfname) 142 | entry_name = os.path.basename(sfname).replace('.csv','') 143 | self.E2.delete(0, 'end') 144 | self.E2.insert(INSERT, entry_name) 145 | 146 | L1 = Label(self.master, text="数据集路径(CSV)") 147 | L1.grid(column=0, row=0, columnspan=2, sticky=(W)) 148 | self.E1 = Entry(self.master, width=50, bd=1) 149 | self.E1.grid(column=1, row=0, sticky=(W)) 150 | button1 = ttk.Button(self.master, text='浏览', width=8, command=selectExcelfile) 151 | button1.grid(column=2, row=0, sticky=(W)) 152 | 153 | L1 = Label(self.master, text="数据集名称") 154 | L1.grid(column=0, row=1, columnspan=2, sticky=(W)) 155 | self.E2 = Entry(self.master, width=23, bd=1) 156 | self.E2.grid(column=1, row=1, sticky=(W)) 157 | 158 | L3 = Label(self.master, text="数据集编码") 159 | L3.grid(column=0, row=2, sticky=(W)) 160 | self.E3 = ttk.Combobox(self.master) 161 | self.E3["value"] = ['utf-8', 'gbk'] 162 | self.E3.current(0) 163 | self.E3.grid(column=1, row=2, sticky=(W)) 164 | 165 | L2 = Label(self.master, text="数据集角色") 166 | L2.grid(column=0, row=3, sticky=(W)) 167 | self.role = 
ttk.Combobox(self.master) 168 | self.role["value"] = ['Training model', 'Reject', 'out of time sample','Score'] 169 | self.role.current(0) 170 | self.role.grid(column=1, row=3, sticky=(W)) 171 | 172 | test_button4 = ttk.Button(self.master, text='确定') 173 | test_button4.grid(column=1, row=5, sticky=(W)) 174 | test_button4.bind("", self.readdata) 175 | 176 | def data_explore(self, event): 177 | dd = self.data_set.describe() 178 | 179 | de = dd.T 180 | de['变量名称'] = de.index 181 | df = de 182 | self.tt = Toplevel() 183 | self.tt.title(self.data_name) 184 | f = Frame(self.tt) 185 | 186 | f.grid(column=0, row=1, rowspan=len(df), sticky=(E, W)) 187 | screen_width = f.winfo_screenwidth() * 0.8 188 | screen_height = f.winfo_screenheight() * 0.8 189 | table = ptm = Table(f, dataframe=df, height=screen_height, width=screen_width) 190 | ptm.show() 191 | 192 | def data_preview(self, event): 193 | data_len = min(200, len(self.data_set)) 194 | de = self.data_set[0:data_len] 195 | df = de 196 | self.tt1 = Toplevel() 197 | self.tt1.title(self.data_name) 198 | f = Frame(self.tt1) 199 | f.grid(column=0, row=1, rowspan=len(df), sticky=(E, W)) 200 | screen_width = f.winfo_screenwidth() * 0.8 201 | screen_height = f.winfo_screenheight() * 0.8 202 | table = ptm = Table(f, dataframe=df, height=screen_height, width=screen_width) 203 | ptm.show() 204 | 205 | def closeall(self, event): 206 | if self.save != 'Y': 207 | tk.messagebox.showwarning('错误', "错误:请先保存您的设置") 208 | elif self.data_variable_setting.equals(self.node_setting['data_variable_setting']) == False: 209 | self.close_tip = Toplevel(self.data_variable_set_ui) 210 | screenwidth = self.data_variable_set_ui.winfo_screenwidth() 211 | screenheight = self.data_variable_set_ui.winfo_screenheight() 212 | self.close_tip.geometry('%dx%d+%d+%d' % (400, 100, (screenwidth - 150) / 2, (screenheight - 100) / 2)) 213 | L2 = Label(self.close_tip, text="参数设置以更改,是否保存更改") 214 | L2.grid(column=0, row=0, columnspan=3) 215 | test_button4 = ttk.Button(self.close_tip, text='保存') 216 | test_button4.grid(column=0, row=1, sticky=(W)) 217 | test_button4.bind("", self.colse_save) 218 | 219 | test_button4 = ttk.Button(self.close_tip, text='不保存(关闭)') 220 | test_button4.grid(column=2, row=1, sticky=(W)) 221 | test_button4.bind("", self.final_close) 222 | # if self.save=='N'and self.update=='Y': 223 | else: 224 | self.final_close(event) 225 | 226 | def colse_save(self, event): 227 | try: 228 | self.close_tip.destroy() 229 | except: 230 | pass 231 | 232 | self.save_d() 233 | 234 | def final_close(self, event): 235 | try: 236 | self.tt.destroy() 237 | except: 238 | pass 239 | try: 240 | self.tt1.destroy() 241 | except: 242 | pass 243 | try: 244 | self.data_variable_set_ui.destroy() 245 | except: 246 | pass 247 | try: 248 | self.master.destroy() 249 | except: 250 | pass 251 | 252 | def oneclike(self, event): 253 | try: 254 | self.comboxlist_modify_f_group.destroy() 255 | 256 | except: 257 | pass 258 | self.table.handle_left_click(event) 259 | 260 | def readdata(self, event): 261 | path = self.E1.get() 262 | name = self.E2.get() 263 | coding = self.E3.get() 264 | datarole = self.role.get() 265 | if name in self.exist_data: 266 | tk.messagebox.showwarning('错误', "该数据集集名称已存在") 267 | 268 | else: 269 | 270 | try: 271 | data = pd.read_csv(r'%s' % path, encoding='%s' % coding, low_memory=False) 272 | if data.empty == True: 273 | tk.messagebox.showwarning('错误', "错误:数据集为空") 274 | else: 275 | settingdata = pd.DataFrame() 276 | settingdata['变量名称'] = data.columns 277 | 278 | colnum = data.select_dtypes( 
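# Variable typing is inferred from pandas dtypes: float/int columns are marked
# 数值型 (numeric) and object columns 字符型 (character); columns with a single
# value or with more than 50 character levels are auto-rejected further down.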
279 | include=['float', 'int8', 'int16', 'int32', 'int64']).columns.values.tolist() 280 | colchar = data.select_dtypes(include=['object']).columns.values.tolist() 281 | settingdata['变量类型'] = settingdata.apply(lambda x: '数值型' if x['变量名称'] in colnum else '字符型', axis=1) 282 | remove = [] 283 | remove_char=[] 284 | for col in list(data.columns): 285 | if len(data[col].unique()) < 2: 286 | remove.append(col) 287 | if (len(data[col].unique()) >50) and (list(settingdata[settingdata['变量名称']==col]['变量类型'])[0]=='字符型'): 288 | remove_char.append(col) 289 | settingdata['是否使用'] = settingdata.apply(lambda x: '不使用' if x['变量名称'] in remove+remove_char else '使用', axis=1) 290 | settingdata['变量角色'] = settingdata.apply(lambda x: '拒绝' if x['变量名称'] in remove+remove_char else '自变量', axis=1) 291 | settingdata['备注'] = settingdata.apply(lambda x: '只有一个值' if x['变量名称'] in remove else None, axis=1) 292 | settingdata['备注'] = settingdata.apply(lambda x: '字符值太多' if x['变量名称'] in remove_char else x['备注'], axis=1) 293 | self.data_set = data 294 | self.data_path = path 295 | self.data_role = datarole 296 | self.data_name = name 297 | self.data_variable_setting = settingdata 298 | self.data_coding = coding 299 | self.variable_seting_ui() 300 | except Exception as e: 301 | tk.messagebox.showwarning('错误', e) 302 | 303 | def variable_seting_ui(self): 304 | for child in self.master.winfo_children(): 305 | child.destroy() 306 | # self.data_variable_set_ui=Toplevel(self.master) 307 | 308 | self.data_variable_set_ui = self.master 309 | self.data_variable_set_ui.withdraw() 310 | self.data_variable_set_ui.title(self.data_name) 311 | self.data_variable_set_ui.update() 312 | self.data_variable_set_ui.deiconify() 313 | self.data_variable_set_ui.title('%s' % self.data_name) 314 | screenwidth = self.master.winfo_screenwidth() 315 | screenheight = self.master.winfo_screenheight() 316 | alignstr = '%dx%d+%d+%d' % ( 317 | screenwidth * 0.8, screenheight * 0.8, (screenwidth * 0.2) / 2, (screenheight * 0.2) / 2) 318 | self.data_variable_set_ui.geometry(alignstr) 319 | test_button4 = ttk.Button(self.data_variable_set_ui, text='保存') 320 | test_button4.grid(column=0, row=0, sticky=(W)) 321 | test_button4.bind("", self.variable_role_check) 322 | 323 | test_button5 = ttk.Button(self.data_variable_set_ui, text='关闭') 324 | test_button5.grid(column=1, row=0, sticky=(W)) 325 | test_button5.bind("", self.closeall) 326 | 327 | test_button6 = ttk.Button(self.data_variable_set_ui, text='数据探索') 328 | test_button6.grid(column=2, row=0, sticky=(W)) 329 | test_button6.bind("", self.data_explore) 330 | 331 | test_button7 = ttk.Button(self.data_variable_set_ui, text='数据预览') 332 | test_button7.grid(column=3, row=0, sticky=(W)) 333 | test_button7.bind("", self.data_preview) 334 | 335 | test_button8 = ttk.Button(self.data_variable_set_ui, text='数据介绍') 336 | test_button8.grid(column=4, row=0, sticky=(W)) 337 | test_button8.bind("", self.data_detail) 338 | self.refresh_df(mianfram=self.data_variable_set_ui, df=self.data_variable_setting) 339 | 340 | def variable_role_check(self, event): 341 | # 判断是否需要设置target,以及设置是否正确 342 | error=0 343 | if 'SCORE' in self.data_variable_setting['变量名称']: 344 | error=1 345 | tk.messagebox.showwarning('错误', "SCORE 将用在以好打分中请更改变量名") 346 | elif 'SCORECARD_LR_p_1' in self.data_variable_setting['变量名称']: 347 | error = 1 348 | tk.messagebox.showwarning('错误', "SCORECARD_LR_p_1 将用在以好打分中请更改变量名") 349 | elif 'const' in self.data_variable_setting['变量名称']: 350 | error = 1 351 | tk.messagebox.showwarning('错误', "const 将用在以后模型训练中请更改变量名") 352 | elif 
len(self.data_variable_setting[self.data_variable_setting['变量角色'] == 'TimeID']) == 1: 353 | timeid=list(self.data_variable_setting[self.data_variable_setting['变量角色'] == 'TimeID']['变量名称'])[0] 354 | if len(list(self.data_set[timeid].unique()))>30: 355 | error=1 356 | tk.messagebox.showwarning('错误', "TimeID 数量太多,请合并日期") 357 | 358 | 359 | if error==0 and len(self.data_variable_setting[self.data_variable_setting['变量角色'] == '目标']) == 0: 360 | if self.data_role == 'Training model': 361 | tk.messagebox.showwarning('错误', "训练集中必须有且只有一个目标") 362 | else: 363 | if len(self.data_variable_setting[self.data_variable_setting['变量角色'] == 'TimeID']) > 1: 364 | tk.messagebox.showwarning('错误', "最多只能有一个TimeID") 365 | else: 366 | self.save_d() 367 | elif error==0 and len(self.data_variable_setting[self.data_variable_setting['变量角色'] == '目标']) == 1: 368 | target = list(self.data_variable_setting[self.data_variable_setting['变量角色'] == '目标']['变量名称'])[0] 369 | if set(self.data_set[target].unique()) != set([0, 1]): 370 | tk.messagebox.showwarning('错误', "目标角色只能有【0,1】两个值") 371 | else: 372 | if self.data_role == 'Training model': 373 | if (len(self.data_variable_setting[self.data_variable_setting['变量角色'] == '自变量']) == 0) | ( 374 | len(self.data_variable_setting[self.data_variable_setting['是否使用'] == '使用']) == 0): 375 | tk.messagebox.showwarning('错误', "训练集中至少要有一个可以使用的自变量") 376 | else: 377 | if len(self.data_variable_setting[self.data_variable_setting['变量角色'] == 'TimeID']) > 1: 378 | tk.messagebox.showwarning('错误', "最多只能有一个TimeID") 379 | else: 380 | self.save_d() 381 | else: 382 | if len(self.data_variable_setting[self.data_variable_setting['变量角色'] == 'TimeID']) > 1: 383 | tk.messagebox.showwarning('错误', "最多只能有一个TimeID") 384 | else: 385 | self.save_d() 386 | else: 387 | tk.messagebox.showwarning('错误', "变量角色中必须有且只有一个目标") 388 | 389 | def save_d(self): 390 | 391 | var_num = list(self.data_variable_setting[self.data_variable_setting['变量类型'] == '数值型']['变量名称']) 392 | var_char = list(self.data_variable_setting[self.data_variable_setting['变量类型'] == '字符型']['变量名称']) 393 | try: 394 | for var in var_num: 395 | self.data_set[var] = self.data_set[var].astype('float') 396 | for var in var_char: 397 | self.data_set[var] = self.data_set[var].astype('object') 398 | 399 | nowTime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f') 400 | node_save_path = self.project_path + '/' + '%s.dataset' % self.data_name 401 | self.node_setting = {'node_type': 'DATA', 402 | 'data_path': self.data_path, 403 | 'data_role': self.data_role, 404 | 'data_variable_setting': self.data_variable_setting.copy(), 405 | 'data_coding': self.data_coding, 406 | 'data_name': self.data_name, 407 | 'node_name': self.data_name, 408 | 'time': nowTime, 409 | 'node_save_path': node_save_path, 410 | 'check_change': [{'node_name': self.data_name, 'node_time': nowTime}], 411 | 'use_node': [self.data_name], 412 | 'use_node_path':[self.data_path]} 413 | data_save = (self.node_setting, self.data_set) 414 | 415 | error2 = Toplevel(self.data_variable_set_ui) 416 | screenwidth = self.data_variable_set_ui.winfo_screenwidth() 417 | screenheight = self.data_variable_set_ui.winfo_screenheight() 418 | 419 | error2.geometry('%dx%d+%d+%d' % (150, 100, (screenwidth - 150) / 2, (screenheight - 100) / 2)) 420 | L2 = Label(error2, text="保存中") 421 | L2.grid() 422 | 423 | self.data_variable_set_ui.update() 424 | filename = self.project_path + '/' + '%s.dataset' % self.data_name 425 | fw = open(filename, 'wb') 426 | pickle.dump(data_save, fw, 1) 427 | fw.close() 428 | 429 | self.save = 'Y' 430 | 
try: 431 | error2.destroy() 432 | except: 433 | pass 434 | self.data_variable_set_ui.update() 435 | # self.variable_seting_ui() 436 | except Exception as e: 437 | tk.messagebox.showwarning('错误', "%s:%s" %(var ,e)) 438 | 439 | def refresh_df(self, mianfram, df): 440 | 441 | f = Frame(mianfram) 442 | f.grid(column=0, row=1, rowspan=len(df), 443 | columnspan=6, sticky=(E, W)) 444 | screen_width = f.winfo_screenwidth() * 0.8 445 | screen_height = f.winfo_screenheight() * 0.8 446 | self.table = self.ptm = Table(f, dataframe=df, colspan=7, height=screen_height, width=screen_width) 447 | self.ptm.show() 448 | self.table.bind("", self.modify_variable_role) 449 | self.table.bind("", self.modify_variable_role) 450 | self.table.bind("", self.oneclike) 451 | self.table.bind("", self.modify_variable_role) 452 | self.table.bind("", self.modify_variable_role) 453 | self.table.bind("", self.modify_variable_role) 454 | self.table.bind("", self.modify_variable_role) 455 | self.table.bind("", self.modify_variable_role) 456 | self.table.bind("", self.modify_variable_role) 457 | 458 | def variable_role_update(self, event): 459 | if (self.comboxlist_modify_f_group.get() == '自变量') & ( 460 | self.data_variable_setting.iloc[self.rowclicked]['备注'] == '只有一个值') & ( 461 | list(self.data_variable_setting.columns)[self.colclicked] == '变量角色'): 462 | self.comboxlist_modify_f_group.destroy() 463 | tk.messagebox.showwarning('错误', "该变量只有一个值,不能设置为自变量") 464 | else: 465 | value = self.comboxlist_modify_f_group.get() 466 | self.data_variable_setting.iloc[self.rowclicked, self.colclicked] = value 467 | self.comboxlist_modify_f_group.destroy() 468 | 469 | self.refresh_df(mianfram=self.data_variable_set_ui, df=self.data_variable_setting) 470 | 471 | def modify_variable_role(self, event): 472 | try: 473 | self.comboxlist_modify_f_group.destroy() 474 | 475 | except: 476 | pass 477 | self.rowclicked = self.ptm.get_row_clicked(event) 478 | self.colclicked = self.ptm.get_col_clicked(event) 479 | 480 | if list(self.data_variable_setting.columns)[self.colclicked] == '是否使用': 481 | try: 482 | self.comboxlist_modify_f_group = ttk.Combobox(self.data_variable_set_ui) 483 | 484 | self.comboxlist_modify_f_group["value"] = ['使用', '不使用'] 485 | self.data_variable_set_ui.update() 486 | self.comboxlist_modify_f_group.place(x=event.x_root - self.data_variable_set_ui.winfo_rootx(), 487 | y=event.y_root - self.data_variable_set_ui.winfo_rooty()) 488 | self.comboxlist_modify_f_group.bind("<>", self.variable_role_update) 489 | 490 | except: 491 | pass 492 | 493 | elif list(self.data_variable_setting.columns)[self.colclicked] == '变量角色': 494 | try: 495 | self.comboxlist_modify_f_group = ttk.Combobox(self.data_variable_set_ui) 496 | 497 | self.comboxlist_modify_f_group["value"] = ['自变量', 'ID', 'TimeID', '目标', '以前模型分数'] 498 | self.data_variable_set_ui.update() 499 | self.comboxlist_modify_f_group.place(x=event.x_root - self.data_variable_set_ui.winfo_rootx(), 500 | y=event.y_root - self.data_variable_set_ui.winfo_rooty()) 501 | self.comboxlist_modify_f_group.bind("<>", self.variable_role_update) 502 | 503 | except: 504 | pass 505 | elif list(self.data_variable_setting.columns)[self.colclicked] == '变量类型': 506 | try: 507 | self.comboxlist_modify_f_group = ttk.Combobox(self.data_variable_set_ui) 508 | 509 | self.comboxlist_modify_f_group["value"] = ['数值型', '字符型'] 510 | self.data_variable_set_ui.update() 511 | self.comboxlist_modify_f_group.place(x=event.x_root - self.data_variable_set_ui.winfo_rootx(), 512 | y=event.y_root - 
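# event.x_root/y_root are screen coordinates; subtracting the Toplevel's
# winfo_rootx()/winfo_rooty() converts them to window-relative coordinates so
# the editing Combobox is placed right over the clicked cell.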
self.data_variable_set_ui.winfo_rooty()) 513 | self.comboxlist_modify_f_group.bind("<>", self.variable_role_update) 514 | 515 | except: 516 | pass 517 | else: 518 | pass 519 | -------------------------------------------------------------------------------- /load_node.py: -------------------------------------------------------------------------------- 1 | import tkinter as tk 2 | from tkinter import ttk 3 | from tkinter import * 4 | 5 | import pickle as pickle 6 | 7 | from tkinter import filedialog 8 | class import_node(): 9 | def __init__(self, mainfram, project_info): 10 | self.root2=mainfram 11 | self.node_name = None 12 | self.node_type = None 13 | self.node_save_path = None 14 | self.node_current_save_path=None 15 | self.project_path = project_info[project_info['模块类型'] == 'project']['保存地址'][0] 16 | self.exist_data = list(project_info['模块名字']) 17 | self.exist_add = list(project_info['保存地址']) 18 | self.master = mainfram 19 | self.load_node() 20 | self.save = 'N' 21 | def load_node(self): 22 | width = 500 23 | height = 250 24 | screenwidth = self.root2.winfo_screenwidth() 25 | screenheight = self.root2.winfo_screenheight() 26 | alignstr = '%dx%d+%d+%d' % (width, height, (screenwidth - width) / 2, (screenheight - height) / 2) 27 | self.root2.geometry(alignstr) 28 | 29 | def selectExcelfile(): 30 | sfname = filedialog.askopenfilename(title='选择模块文件', filetypes=[('IGN', '*.IGN'),('DATASET', '*.dataset'),('Sampling', '*.sample'),('Spliting', '*.spliting'),('Model','model')]) 31 | self.node_current_save_path=sfname 32 | self.nodeimport_E1.delete(0, 'end') 33 | self.nodeimport_E1.insert(INSERT, sfname) 34 | try: 35 | fr = open(sfname, 'rb') 36 | node_info = pickle.load(fr) 37 | self.node_data = node_info 38 | fr.close() 39 | try: 40 | self.node_name=node_info[0]['node_name'] 41 | self.node_type=node_info[0]['node_type'] 42 | self.use_node = node_info[0]['use_node'] 43 | self.node_time=node_info[0]['time'] 44 | self.node_save_path=node_info[0]['node_save_path'] 45 | self.nodeimport_E2.delete(0, 'end') 46 | self.nodeimport_E2.insert(INSERT, self.node_name) 47 | self.label_str.set(self.node_type) 48 | except Exception as e: 49 | tk.messagebox.showwarning('错误', e) 50 | except Exception as e: 51 | tk.messagebox.showwarning('错误', e) 52 | L1 = Label(self.root2, text="模块路径") 53 | L1.grid(column=0, row=0, columnspan=2, sticky=(W)) 54 | self.nodeimport_E1 = Entry(self.root2, width=50, bd=1) 55 | self.nodeimport_E1.grid(column=1, row=0, sticky=(W)) 56 | button1 = ttk.Button(self.root2, text='浏览', width=8, command=selectExcelfile) 57 | button1.grid(column=2, row=0, sticky=(W)) 58 | 59 | L1 = Label(self.root2, text="模块名称") 60 | L1.grid(column=0, row=1, columnspan=2, sticky=(W)) 61 | self.nodeimport_E2 = Entry(self.root2, width=23, bd=1) 62 | self.nodeimport_E2.grid(column=1, row=1, sticky=(W)) 63 | 64 | L3 = Label(self.root2, text="模块类型") 65 | L3.grid(column=0, row=2, sticky=(W)) 66 | self.label_str = StringVar() 67 | warning = Label(self.root2, textvariable=self.label_str) 68 | warning.grid(column=1, row=2, sticky=(W)) 69 | 70 | 71 | test_button4 = ttk.Button(self.root2, text='确定') 72 | test_button4.grid(column=1, row=5, sticky=(W)) 73 | test_button4.bind("", self.save_node) 74 | 75 | def save_node(self,event): 76 | flag_error=0 77 | if self.nodeimport_E1.get()!=self.node_current_save_path: 78 | self.node_current_save_path=self.nodeimport_E1.get() 79 | try: 80 | fr = open(self.nodeimport_E1.get(), 'rb') 81 | node_info = pickle.load(fr) 82 | self.node_data=node_info 83 | fr.close() 84 | try: 85 | 
self.node_name = node_info[0]['node_name'] 86 | self.node_type = node_info[0]['node_type'] 87 | self.use_node = node_info[0]['use_node'] 88 | self.node_time = node_info[0]['time'] 89 | # self.node_save_path = node_info[0]['node_save_path'] 90 | self.nodeimport_E2.delete(0, 'end') 91 | self.nodeimport_E2.insert(INSERT, self.node_name) 92 | self.label_str.set(self.node_type) 93 | except Exception as e: 94 | flag_error = 1 95 | tk.messagebox.showwarning('错误', e) 96 | except Exception as e: 97 | flag_error = 1 98 | tk.messagebox.showwarning('错误', e) 99 | if flag_error==0: 100 | if self.nodeimport_E2.get() in self.exist_data: 101 | tk.messagebox.showwarning('错误', '该名称已经在project中,请改名') 102 | elif self.node_current_save_path in self.exist_add: 103 | tk.messagebox.showwarning('错误', '该地址已经在project中,请勿重复导入') 104 | elif self.nodeimport_E2.get() != self.node_name: 105 | self.node_name= self.nodeimport_E2.get() 106 | else: 107 | self.node_setting={'node_type':self.node_type,'node_name':self.node_name,'use_node':self.use_node,'node_save_path':self.node_current_save_path,'time':self.node_time} 108 | self.save = 'Y' 109 | self.root2.destroy() 110 | -------------------------------------------------------------------------------- /render.html: -------------------------------------------------------------------------------- [pyecharts output page titled "Awesome-pyecharts"; the HTML markup and the inline ECharts option script (original lines 1-243) were stripped during extraction and are not recoverable] -------------------------------------------------------------------------------- /sampling.py: -------------------------------------------------------------------------------- 1 | import tkinter as tk 2 | from tkinter import ttk 3 | from tkinter import * 4 | import pandas as pd 5 | import pickle as pickle 6 | import datetime 7 | import matplotlib.pyplot as plt 8 | from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg 9 | from matplotlib_venn import venn2 10 | import os 11 | 12 | class sample(): 13 | def __init__(self, mainframe, project_info): 14 | self.master = mainframe 15 | self.project_info = project_info 16 | self.project_path = os.path.split(project_info[project_info['模块类型'] == 'project']['保存地址'][0])[0] 17 | self.train_data_list=[] 18 | self.method = '' 19 | self.train_pct = 0.7 20 | self.valid_pct = 0.00001 21 | self.seed = 123456 22 | self.bad_pct = 1 23 | self.sample_flag = '否' 24 | self.replace = 'False' 25 | self.bad_rate = 0.1 26 | self.par_train_data = pd.DataFrame() 27 | self.node_name = 'sample' 28 | self.exist_data = list(project_info['模块名字']) 29 | self.load = 'N' 30 | self.target = 'Y' 31 | self.save = 'N' 32 | 33 | def load_node(self, node_data, ac): 34 | # Re-enter the settings page, restoring this node's saved state 35 | self.load = 'Y' 36 | self.node_name = node_data[0]['node_name'] 37 | self.method = node_data[0]['method'] 38 | self.replace = node_data[0]['replace'] 39 | self.seed = node_data[0]['seed'] 40 | self.train_pct = node_data[0]['train_pct'] 41 | self.data_role=node_data[0]['data_role'] 42 | self.sample_flag = node_data[0]['adjuest_bad'] 43 | self.bad_pct = node_data[0]['bad_sample_pct'] 44 | self.bad_rate = node_data[0]['sample_bad_rate'] 45 | self.previous_node_name=node_data[0]['previous_node_name'] 46 | self.previous_node_time = node_data[0]['previous_node_time'] 47 | self.check_list=node_data[0]['check_change'] 48 | self.par_traindatavariable_setting=node_data[0]['data_variable_setting'] 49 | self.trainpart_data = node_data[1] 50 | self.par_train_data =node_data[2] 51 | 52 | 53 | if ac=='setting': 54 | path_list = self.project_info[self.project_info['创建时间'] == self.previous_node_time]['保存地址'] 55 | def continu(event): 56 | for child in self.master.winfo_children(): 57 | child.destroy() 58 | self.ui_start() 59 | def back(event): 60 | self.master.destroy() 61 | if len(path_list)==0: 62 | self.master.title('提示') 63 | L00 = Label(self.master, width=80, text="该模块引用的%s (创建于 %s)模块 没有在项目中找到,\n可能该模块已经更新,删除," 64 | "或未导入\n继续设置可能会导致以前结果丢失" %(self.previous_node_name,self.previous_node_time)) 65 | L00.grid(column=0, row=0, columnspan=3, sticky=(W)) 66 | button_contin = ttk.Button(self.master, text='继续设置') 67 | button_contin.grid(column=0, row=1, sticky=(W), padx=10, pady=10) 68 | button_contin.bind("<Button-1>", continu) 69 | button_back = ttk.Button(self.master, text='返回') 70 | button_back.grid(column=2, row=1, sticky=(W), padx=10, pady=10) 71 | button_back.bind("<Button-1>", back) 72 | else: 73 | path=path_list[0] 74 | try: 75 | fr = open(path,'rb') 76 | node_info = pickle.load(fr) 77 | fr.close() 78 | self.par_traindatavariable_setting = node_info[0]['data_variable_setting'] 79 | self.previous_check_change=node_info[0]['check_change'] 80 | self.previous_node_usedlist=node_info[0]['use_node'] 81 | self.previous_node_name=node_info[0]['node_name'] 82 | self.data_role=node_info[0]['data_role'] 83 | self.par_train_data=node_info[1] 84 | self.ui_start() 85 | except Exception as e: 86 | self.master.title('提示') 87 | L00 = Label(self.master, width=80, text="导入%s (创建于 %s)模块 发生错误,\n可能该模块已经被破坏或删除," 88 | "\n%s" % 
(self.previous_node_name, self.previous_node_time,e)) 89 | L00.grid(column=0, row=0, columnspan=3, sticky=(W)) 90 | button_contin = ttk.Button(self.master, text='继续设置') 91 | button_contin.grid(column=0, row=1, sticky=(W), padx=10, pady=10) 92 | button_contin.bind("", continu) 93 | button_back = ttk.Button(self.master, text='返回') 94 | button_back.grid(column=2, row=1, sticky=(W), padx=10, pady=10) 95 | button_back.bind("", back) 96 | else: 97 | self.result_ui(self.master,ac='re') 98 | 99 | # n = 0 100 | # for check_item in self.check_list: 101 | # # 为了读取失败以后可以进入设置页面 102 | # try: 103 | # path = self.project_info[self.project_info['模块名字'] == check_item['node_name']]['保存地址'][0] 104 | # fr = open(path, 'rb') 105 | # node_info = pickle.load(fr) 106 | # fr.close() 107 | # if node_info[0]['time'] != check_item['node_time']: 108 | # tk.messagebox.showwarning('错误', '之前引用的%s已经发生改变 \n 请重新设置更新结果' %check_item) 109 | # #为了重新设置更新结果的时候不用再选一遍数据集了 110 | # self.par_traindatavariable_setting = node_info[0]['data_variable_setting'] 111 | # self.previous_check_change=node_info[0]['check_change'] 112 | # self.previous_node_usedlist=node_info[0]['use_node'] 113 | # self.previous_node_name = node_info[0]['node_name'] 114 | # self.data_role=node_info[0]['data_role'] 115 | # self.par_train_data = node_info[1] 116 | # 117 | # n = n + 1 118 | # elif check_item['node_name'] == self.previous_node_name: 119 | # self.par_traindatavariable_setting = node_info[0]['data_variable_setting'] 120 | # self.previous_check_change=node_info[0]['check_change'] 121 | # self.previous_node_usedlist=node_info[0]['use_node'] 122 | # self.previous_node_name = node_info[0]['node_name'] 123 | # self.data_role=node_info[0]['data_role'] 124 | # self.par_train_data = node_info[1] 125 | # else: 126 | # pass 127 | # except Exception as e: 128 | # n=n+1 129 | # tk.messagebox.showwarning('错误', '%s error: %s' %(check_item['node_name'],e)) 130 | # if n > 1: 131 | # self.ui_start() 132 | # else: 133 | # if ac == 'setting': 134 | # self.ui_start() 135 | # else: 136 | # self.result_ui(self.master, ac='re') 137 | def pre_data(self): 138 | dd = list(self.project_info[(self.project_info['模块类型']=='DATA')&(self.project_info['状态']=='Good')]['保存地址']) 139 | for add in dd: 140 | try: 141 | fr = open(add, 'rb') 142 | node_info = pickle.load(fr) 143 | fr.close() 144 | data_name = node_info[0]['data_name'] 145 | self.train_data_list.append(data_name) 146 | except Exception as e: 147 | tk.messagebox.showwarning('错误', "%s数据集导入错误:%s" % (add, e)) 148 | def ui_start(self): 149 | # 初始页面设置 150 | if self.train_data_list==[]: 151 | self.pre_data() 152 | self.start_window_base = self.master 153 | width = self.master.winfo_screenwidth() * 0.15 154 | height = self.master.winfo_screenheight() * 0.3 155 | screenwidth = self.master.winfo_screenwidth() 156 | screenheight = self.master.winfo_screenheight() 157 | self.start_window_base.geometry( 158 | '%dx%d+%d+%d' % (width, height, (screenwidth - width) / 2, (screenheight - height) / 2)) 159 | self.start_window_base.title('样本抽样参数设置') 160 | # 参数设置 161 | self.start_window_data = LabelFrame(self.start_window_base , text='参数设置') 162 | 163 | L00 = Label(self.start_window_data, width=20, text="名称") 164 | L00.grid(column=0, row=0, sticky=(W)) 165 | if self.load == 'N': 166 | nodename = tk.StringVar(value=self.node_name) 167 | self.entry_node_name = Entry(self.start_window_data, textvariable=nodename, bd=1, width=18) 168 | self.entry_node_name.grid(column=1, row=0, sticky=(W)) 169 | else: 170 | L01 = Label(self.start_window_data, width=20, 
text=self.node_name, bd=2) 171 | L01.grid(column=1, row=0, sticky=(W)) 172 | L0 = Label(self.start_window_data, width=20, text="原始样本") 173 | L0.grid(column=0, row=1, sticky=(W)) 174 | self.comboxlist_train_data = ttk.Combobox(self.start_window_data, width=15) 175 | self.comboxlist_train_data["value"] = self.train_data_list 176 | #因为页面要根据选的数据集重新刷新所以要加一个数据集的判定 177 | if (self.load=='Y')|(self.par_train_data.empty==False): 178 | for i in range(len(self.train_data_list)): 179 | if self.train_data_list[i] == self.previous_node_name: 180 | self.comboxlist_train_data.current(i) 181 | self.comboxlist_train_data.bind("<>", lambda event: self.load_data(event, datatype='train')) 182 | self.comboxlist_train_data.grid(column=1, row=1, sticky=(W)) 183 | 184 | L1 = Label(self.start_window_data, width=20, text="抽样方法") 185 | L1.grid(column=0, row=2, sticky=(W)) 186 | self.comboxlist_method = ttk.Combobox(self.start_window_data, width=15) 187 | if self.target == 'Y': 188 | self.comboxlist_method["value"] = ['简单随机', '分层(好坏)'] 189 | if self.method=='简单随机': 190 | self.comboxlist_method.current(0) 191 | else: 192 | self.comboxlist_method.current(1) 193 | else: 194 | self.comboxlist_method["value"] = ['简单随机'] 195 | if self.load=='Y': 196 | if self.method=='简单随机': 197 | self.comboxlist_method.current(0) 198 | else: 199 | try: 200 | self.comboxlist_method.current(1) 201 | except: 202 | self.comboxlist_method.current(0) 203 | else: 204 | self.comboxlist_method.current(0) 205 | 206 | self.comboxlist_method.grid(column=1, row=2, sticky=(W)) 207 | 208 | L3 = Label(self.start_window_data, width=20, text="是否放回") 209 | L3.grid(column=0, row=3, sticky=(W)) 210 | self.comboxlist_replace = ttk.Combobox(self.start_window_data, width=15) 211 | self.comboxlist_replace["value"] = ['False', 'True'] 212 | 213 | if self.replace=='False': 214 | self.comboxlist_replace.current(0) 215 | else: 216 | self.comboxlist_replace.current(1) 217 | 218 | self.comboxlist_replace.grid(column=1, row=3, sticky=(W)) 219 | 220 | L2 = Label(self.start_window_data, width=20, text="随机种子") 221 | L2.grid(column=0, row=4, sticky=(W)) 222 | seed = tk.StringVar(value=self.seed) 223 | self.entry_seed = Entry(self.start_window_data, textvariable=seed, bd=1, width=18) 224 | self.entry_seed.grid(column=1, row=4, sticky=(W)) 225 | self.entry_seed.bind('', lambda event: self.int_num_check(event, 'seed', 'int')) 226 | self.start_window_data.grid(column=0, row=0, columnspan=2, padx=10, pady=10) 227 | 228 | # 分配比例 229 | 230 | self.start_window_pct = LabelFrame(self.start_window_base, text='数据集抽样比例') 231 | L3 = Label(self.start_window_pct, width=20, text="抽样样本比例") 232 | L3.grid(column=0, row=0, sticky=(W)) 233 | train_pct = tk.StringVar(value=self.train_pct) 234 | self.entry_train_pct = Entry(self.start_window_pct, textvariable=train_pct, bd=1, width=18) 235 | self.entry_train_pct.grid(column=1, row=0, sticky=(W)) 236 | self.entry_train_pct.bind('', lambda event: self.int_num_check(event, 'train_pct', 'g')) 237 | self.start_window_pct.grid(column=0, row=1, columnspan=2, padx=10, pady=10) 238 | 239 | # 是否调整样本坏账率 240 | 241 | self.start_window_sample = LabelFrame(self.start_window_base, text='设置样本坏账率') 242 | 243 | L3 = Label(self.start_window_sample, width=20, text="是否调整样本坏账率") 244 | L3.grid(column=0, row=0, sticky=(W)) 245 | if self.target == 'Y': 246 | self.comboxlist_sample_flag = ttk.Combobox(self.start_window_sample, width=15) 247 | self.comboxlist_sample_flag["value"] = ['否', '是'] 248 | if self.sample_flag=='是': 249 | self.comboxlist_sample_flag.current(1) 250 | 
else: 251 | self.comboxlist_sample_flag.current(0) 252 | self.comboxlist_sample_flag.grid(column=1, row=0, sticky=(W)) 253 | 254 | L3 = Label(self.start_window_sample, width=20, text="坏样本抽样比例") 255 | L3.grid(column=0, row=1, sticky=(W)) 256 | if self.target == 'Y': 257 | bad_pct = tk.StringVar(value=self.bad_pct) 258 | self.entry_bad_pct = Entry(self.start_window_sample, textvariable=bad_pct, bd=1, width=18) 259 | self.entry_bad_pct.grid(column=1, row=1, sticky=(W)) 260 | self.entry_bad_pct.bind('', lambda event: self.int_num_check(event, 'bad_pct', 'g')) 261 | 262 | L4 = Label(self.start_window_sample, width=20, text="抽样后坏账率") 263 | L4.grid(column=0, row=2, sticky=(W)) 264 | if self.target == 'Y': 265 | valid_pct = tk.StringVar(value=self.bad_rate) 266 | self.entry_bad_rate = Entry(self.start_window_sample, textvariable=valid_pct, bd=1, width=18) 267 | self.entry_bad_rate.grid(column=1, row=2, sticky=(W)) 268 | self.entry_bad_rate.bind('', lambda event: self.int_num_check(event, 'badrate', 'pct')) 269 | self.start_window_sample.grid(column=0, row=2, columnspan=2, padx=10, pady=10) 270 | 271 | if self.load == 'N': 272 | self.button_setting_save = ttk.Button(self.start_window_base, text='保存 确认') 273 | self.button_setting_save.grid(column=0, row=3, sticky=(W), padx=10, pady=10) 274 | self.button_setting_save.bind("", self.check_all_setting) 275 | else: 276 | self.button_setting_save = ttk.Button(self.start_window_base, text='更新结果') 277 | self.button_setting_save.grid(column=0, row=3, sticky=(W), padx=10, pady=10) 278 | self.button_setting_save.bind("", self.check_all_setting) 279 | 280 | def load_data(self, event, datatype): 281 | # 读取数据 282 | try: 283 | if (datatype == 'train') & (len(self.comboxlist_train_data.get()) >= 1): 284 | path = self.project_info[self.project_info['模块名字'] == self.comboxlist_train_data.get()]['保存地址'][0] 285 | fr = open(path, 'rb') 286 | node_info = pickle.load(fr) 287 | fr.close() 288 | self.par_traindatavariable_setting = node_info[0]['data_variable_setting'] 289 | self.previous_check_change=node_info[0]['check_change'] 290 | self.previous_node_usedlist=node_info[0]['use_node'] 291 | self.previous_node_name = node_info[0]['node_name'] 292 | self.previous_node_time = node_info[0]['time'] 293 | self.previous_node_path = path 294 | self.data_role=node_info[0]['data_role'] 295 | self.par_train_data = node_info[1] 296 | self.get_par() 297 | if len(self.par_traindatavariable_setting[self.par_traindatavariable_setting['变量角色'] == '目标']) == 0: 298 | self.get_par() 299 | self.target = 'N' 300 | for child in self.master.winfo_children(): 301 | child.destroy() 302 | self.ui_start() 303 | else: 304 | self.target = 'Y' 305 | for child in self.master.winfo_children(): 306 | child.destroy() 307 | self.ui_start() 308 | self.get_par() 309 | elif len(self.comboxlist_train_data.get()) <= 1: 310 | self.par_train_data = pd.DataFrame() 311 | else: 312 | pass 313 | except Exception as e: 314 | self.par_train_data = pd.DataFrame() 315 | tk.messagebox.showwarning('错误', "%s数据集导入错误:%s" % (self.comboxlist_train_data.get(), e)) 316 | 317 | def split_function(self): 318 | # 如果index有重复则重新reindex 319 | if self.par_train_data.index.is_unique == False: 320 | 321 | self.par_train_data = self.par_train_data.reset_index(drop=True) 322 | else: 323 | pass 324 | # 是否调整坏账率进行抽样 325 | if self.sample_flag == '是': 326 | target = list(self.par_traindatavariable_setting[self.par_traindatavariable_setting['变量角色'] == '目标']['变量名称'])[0] 327 | bad_data = self.par_train_data[self.par_train_data[target] == 1] 328 | if 
self.bad_pct <= 1:
329 | bad_part = bad_data.sample(frac=self.bad_pct, replace=False, random_state=self.seed)
330 | else:
331 | bad_part = bad_data.sample(frac=self.bad_pct, replace=True, random_state=self.seed)
332 | bad_num = len(bad_part)
333 | good_num = int(round((bad_num / self.bad_rate) * (1 - self.bad_rate), 0))
334 | good_data = self.par_train_data[self.par_train_data[target] == 0]
335 | if good_num > len(good_data):
336 | good_part = good_data.sample(n=good_num, replace=True, random_state=self.seed)
337 | else:
338 | good_part = good_data.sample(n=good_num, replace=False, random_state=self.seed)
339 | or_data = pd.concat([good_part, bad_part])
340 | self.trainpart_data = or_data
341 | else:
342 | or_data = self.par_train_data
343 | # simple random sampling
344 | if self.method == '简单随机':
345 | # a sampling fraction above 1 (oversampling) is only possible with replacement
346 | if ((self.train_pct > 1) | (self.valid_pct > 1)) & (self.replace == 'False'):
347 | tk.messagebox.showwarning('错误', '由于样本比例大于1 \n 进行有放回抽样')
348 | self.replace = 'True'
349 | if self.replace == 'False':
350 | self.trainpart_data = or_data.sample(frac=self.train_pct, replace=False, random_state=self.seed)
351 | else:
352 | self.trainpart_data = or_data.sample(frac=self.train_pct, replace=True, random_state=self.seed)
353 | else:
354 | # stratified sampling by good/bad
355 | target = list(self.par_traindatavariable_setting[self.par_traindatavariable_setting['变量角色'] == '目标']['变量名称'])[0]
356 | good_data = or_data[or_data[target] == 0]
357 | bad_data = or_data[or_data[target] == 1]
358 | if ((self.train_pct > 1) | (self.valid_pct > 1)) & (self.replace == 'False'):
359 | tk.messagebox.showwarning('错误', '由于抽样比例大于1 \n 进行有放回抽样')
360 | self.replace = 'True'
361 | if self.replace == 'False':
362 | trainpart_gdata = good_data.sample(frac=self.train_pct, replace=False, random_state=self.seed)
363 | trainpart_bdata = bad_data.sample(frac=self.train_pct, replace=False, random_state=self.seed)
364 | else:
365 | trainpart_gdata = good_data.sample(frac=self.train_pct, replace=True, random_state=self.seed)
366 | trainpart_bdata = bad_data.sample(frac=self.train_pct, replace=True, random_state=self.seed)
367 | self.trainpart_data = pd.concat([trainpart_gdata, trainpart_bdata])
402 | try:
403 | self.tt.destroy()
404 | except:
405 | pass
406 | result_main_frame = Toplevel(self.master)
407 | self.result_ui(result_main_frame, ac='setting')
408 
| self.master.wait_window(result_main_frame) 409 | 410 | def result_ui(self, mainframe, ac): 411 | 412 | self.tt = mainframe 413 | if ac != 're': 414 | self.button_result_save = ttk.Button(self.tt, text='保存 确认') 415 | self.button_result_save.grid(column=0, row=0, sticky=(W), padx=10, pady=10) 416 | self.button_result_save.bind("", self.save_data) 417 | 418 | self.button_reset = ttk.Button(self.tt, text='重新分区') 419 | self.button_reset.grid(column=3, row=0, sticky=(W), padx=10, pady=10) 420 | self.button_reset.bind("", self.all_reset) 421 | # 展示结果 422 | if self.target == 'Y': 423 | target = list(self.par_traindatavariable_setting[self.par_traindatavariable_setting['变量角色'] == '目标']['变量名称'])[0] 424 | t_data = {"data": '原始数据集', "obs": self.par_train_data[target].count(), 425 | "bad_num": self.par_train_data[target].sum(), 426 | "bad_rate": str(round(self.par_train_data[target].mean() * 100, 2)) + '%'} 427 | t_trian_data = {"data": '抽样数据集', "obs": self.trainpart_data[target].count(), 428 | "bad_num": self.trainpart_data[target].sum(), 429 | "bad_rate": str(round(self.trainpart_data[target].mean() * 100, 2)) + '%'} 430 | 431 | t = [] 432 | t.append(t_data) 433 | t.append(t_trian_data) 434 | 435 | summ = LabelFrame(self.tt, text='分区结果', width=20, height=20) 436 | tree = ttk.Treeview(summ, columns=["data", 'obs', 'bad_num', 'bad_rate'], show="headings", 437 | selectmode=tk.BROWSE) 438 | tree.column("data", width=80, minwidth=80, stretch=tk.NO, anchor="center") 439 | tree.column("obs", width=80, minwidth=80, stretch=tk.NO, anchor="center") 440 | tree.column("bad_num", width=80, minwidth=80, stretch=tk.NO, anchor="center") 441 | tree.column("bad_rate", width=80, minwidth=80, stretch=tk.NO, anchor="center") 442 | 443 | tree.heading("data", text="数据集") 444 | tree.heading("obs", text="样本数") 445 | tree.heading("bad_num", text="坏客户数") 446 | tree.heading("bad_rate", text="坏账率") 447 | 448 | i = 0 449 | for v in t: 450 | tree.insert('', i, values=(v.get("data"), v.get("obs"), v.get("bad_num"), v.get("bad_rate"))) 451 | i += 1 452 | 453 | tree.grid() 454 | summ.grid(column=0, row=1,columnspan=4, padx=8, pady=8) 455 | else: 456 | t_data = {"data": '原始数据集', "obs": len(self.par_train_data)} 457 | t_trian_data = {"data": '抽样数据集', "obs": len(self.trainpart_data)} 458 | 459 | t = [] 460 | t.append(t_data) 461 | t.append(t_trian_data) 462 | 463 | summ = LabelFrame(self.tt, text='分区结果', width=20, height=20) 464 | tree = ttk.Treeview(summ, columns=["data", 'obs', 'bad_num', 'bad_rate'], show="headings", 465 | selectmode=tk.BROWSE) 466 | tree.column("data", width=80, minwidth=80, stretch=tk.NO, anchor="center") 467 | tree.column("obs", width=80, minwidth=80, stretch=tk.NO, anchor="center") 468 | 469 | tree.heading("data", text="数据集") 470 | tree.heading("obs", text="样本数") 471 | 472 | i = 0 473 | for v in t: 474 | tree.insert('', i, values=(v.get("data"), v.get("obs"))) 475 | i += 1 476 | 477 | tree.grid() 478 | summ.grid(column=0, row=1,columnspan=4, padx=8, pady=8) 479 | # 总样本 480 | la = LabelFrame(self.tt, text='总样本分布') 481 | t = set(self.trainpart_data.index) 482 | 483 | t2 = set(self.par_train_data.index) 484 | canvas = tk.Canvas() 485 | g = plt.figure(figsize=(4, 4)) 486 | pp = venn2(subsets=[t, t2], set_labels=('train', 'total'), set_colors=('r', 'g')) 487 | canvas = FigureCanvasTkAgg(g, la) 488 | canvas.get_tk_widget().pack(side=tk.TOP, fill=tk.BOTH, expand=1) 489 | canvas._tkcanvas.pack(side=tk.TOP, fill=tk.BOTH, expand=1) 490 | la.grid(column=4, row=1,columnspan=4, padx=8, pady=8) 491 | if self.target == 'Y': 
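# The venn2 panels in this method visualise sample coverage: the index of
# trainpart_data is drawn from par_train_data, so without replacement the
# 'train' set is a subset of 'total' and its circle nests inside the other.
# A minimal self-contained sketch of the same idiom (illustrative data only,
# not from this project):
#
#     import matplotlib.pyplot as plt
#     from matplotlib_venn import venn2
#     sampled = {0, 1, 2}                     # indices kept by .sample()
#     total = {0, 1, 2, 3, 4, 5}              # original DataFrame index
#     venn2(subsets=[sampled, total], set_labels=('train', 'total'))
#     plt.show()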
492 | # 好样本 493 | target = list(self.par_traindatavariable_setting[self.par_traindatavariable_setting['变量角色'] == '目标']['变量名称'])[0] 494 | good = LabelFrame(self.tt, text='好样本分布', width=20, height=20) 495 | tg = set(self.trainpart_data[self.trainpart_data[target] == 0].index) 496 | 497 | tg2 = set(self.par_train_data[self.par_train_data[target] == 0].index) 498 | canvasg = tk.Canvas() 499 | gg = plt.figure(figsize=(4, 4)) 500 | pp = venn2(subsets=[tg, tg2], set_labels=('train', 'total'), set_colors=('r', 'g')) 501 | canvasg = FigureCanvasTkAgg(gg, good) 502 | canvasg.get_tk_widget().pack(side=tk.TOP, fill=tk.BOTH, expand=1) 503 | canvasg._tkcanvas.pack(side=tk.TOP, fill=tk.BOTH, expand=1) 504 | good.grid(column=0, row=2,columnspan=4, padx=8, pady=8) 505 | # 坏样本 506 | bad = LabelFrame(self.tt, text='坏样本分布', width=20, height=20) 507 | tg = set(self.trainpart_data[self.trainpart_data[target] == 1].index) 508 | 509 | tg2 = set(self.par_train_data[self.par_train_data[target] == 1].index) 510 | canvasg = tk.Canvas() 511 | gg = plt.figure(figsize=(4, 4)) 512 | pp = venn2(subsets=[tg, tg2], set_labels=('train', 'total'), set_colors=('r', 'g')) 513 | canvasg = FigureCanvasTkAgg(gg, bad) 514 | canvasg.get_tk_widget().pack(side=tk.TOP, fill=tk.BOTH, expand=1) 515 | canvasg._tkcanvas.pack(side=tk.TOP, fill=tk.BOTH, expand=1) 516 | bad.grid(column=4, row=2,columnspan=4, padx=8, pady=8) 517 | 518 | def all_reset(self, event): 519 | # 返回重新分区 520 | try: 521 | self.tt.destroy() 522 | except Exception as e: 523 | tk.messagebox.showwarning('错误', e) 524 | 525 | def save_data(self, event): 526 | # 保存数据 527 | try: 528 | node_save_path = self.project_path + '/' + '%s.sampling' % self.node_name 529 | nowTime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f') 530 | self.node_setting = {'node_type': 'SAMPLE', 531 | 'node_name': self.node_name, 532 | 'node_save_path': node_save_path, 533 | 'data_role': self.data_role, 534 | 'method': self.method, 535 | 'replace': self.replace, 536 | 'seed': self.seed, 537 | 'train_pct': self.train_pct, 538 | 'target': self.target, 539 | 'adjuest_bad': self.sample_flag, 540 | 'bad_sample_pct': self.bad_pct, 541 | 'sample_bad_rate': self.bad_rate, 542 | 'time': nowTime, 543 | 'check_change': [{'node_name': self.node_name, 544 | 'node_time': nowTime}] + self.previous_check_change, 545 | 'data_variable_setting': self.par_traindatavariable_setting, 546 | 'previous_node_name': self.previous_node_name, 547 | 'previous_node_time': self.previous_node_time, 548 | 'previous_node_path': [self.previous_node_path], 549 | 'use_node': [self.node_name] + self.previous_node_usedlist} 550 | 551 | data_save = (self.node_setting, self.trainpart_data,self.par_train_data) 552 | error2 = Toplevel(self.master) 553 | screenwidth = self.master.winfo_screenwidth() 554 | screenheight = self.master.winfo_screenheight() 555 | 556 | error2.geometry('%dx%d+%d+%d' % (150, 100, (screenwidth - 150) / 2, (screenheight - 100) / 2)) 557 | L2 = Label(error2, text="保存中") 558 | L2.grid() 559 | self.master.update() 560 | 561 | filename = node_save_path 562 | fw = open(filename, 'wb') 563 | pickle.dump(data_save, fw, 1) 564 | fw.close() 565 | 566 | self.save = 'Y' 567 | try: 568 | error2.destroy() 569 | except: 570 | pass 571 | self.master.destroy() 572 | except Exception as e: 573 | tk.messagebox.showwarning('错误', e) 574 | 575 | # 检查所有变量参数是否正确 576 | def check_all_setting(self, event): 577 | try: 578 | self.load_data(event, 'train') 579 | self.get_par() 580 | if (self.node_name in self.exist_data) & (self.load == 'N'): 
581 | tk.messagebox.showwarning('错误', "该名称已经被占用,请更改") 582 | else: 583 | e = 0 584 | if self.par_train_data.empty == True: 585 | tk.messagebox.showwarning('错误', "错误:训练样本为空") 586 | else: 587 | if self.target == 'Y': 588 | total = ['seed', 'train_pct', 'bad_pct', 'bad_rate'] 589 | else: 590 | total = ['seed', 'train_pct'] 591 | for p in total: 592 | if p in ['seed']: 593 | flag = 'int' 594 | entry_p = p 595 | er = self.int_num_check(event, entry_p, flag) 596 | elif p in ['bad_rate']: 597 | flag = 'pct' 598 | entry_p = p 599 | er = self.int_num_check(event, entry_p, flag) 600 | else: 601 | flag = 'g' 602 | entry_p = p 603 | er = self.int_num_check(event, entry_p, flag) 604 | e = e + er 605 | 606 | if e == 0: 607 | self.split_function() 608 | except Exception as e: 609 | tk.messagebox.showwarning('错误', e) 610 | 611 | def get_par(self): 612 | # 更新得到的设置 613 | if self.target == 'Y': 614 | self.sample_flag = self.comboxlist_sample_flag.get() 615 | self.bad_pct = float(self.entry_bad_pct.get()) 616 | self.bad_rate = float(self.entry_bad_rate.get()) 617 | self.train_pct = float(self.entry_train_pct.get()) 618 | self.seed = int(self.entry_seed.get()) 619 | self.replace = self.comboxlist_replace.get() 620 | self.par_train_data = self.par_train_data 621 | self.method = self.comboxlist_method.get() 622 | try: 623 | self.node_name = self.entry_node_name.get() 624 | except: 625 | pass 626 | 627 | def int_num_check(self, event, entry_p, flag): 628 | # 检查数字是否正确 629 | flag_er = 0 630 | if entry_p == 'seed': 631 | inputnum = self.entry_seed.get() 632 | tip = '随机种子' 633 | elif entry_p == 'train_pct': 634 | inputnum = self.entry_train_pct.get() 635 | tip = '抽样样本比例' 636 | 637 | elif (self.target == 'Y') & (entry_p == 'bad_rate'): 638 | inputnum = self.entry_bad_rate.get() 639 | tip = '整体样本坏账率' 640 | elif (self.target == 'Y') & (entry_p == 'bad_pct'): 641 | inputnum = self.entry_bad_pct.get() 642 | tip = '坏样本抽样比例' 643 | else: 644 | pass 645 | 646 | try: 647 | if float(inputnum) <= 0: 648 | 649 | tk.messagebox.showwarning('错误', '%s:输入值不能小于等于0' % tip) 650 | flag_er = flag_er + 1 651 | else: 652 | if flag == 'int': 653 | try: 654 | int(inputnum) 655 | except Exception as e: 656 | tk.messagebox.showwarning('错误','%s:%s'%(tip, e)) 657 | flag_er = flag_er + 1 658 | elif flag == 'pct': 659 | try: 660 | num = float(inputnum) 661 | if num > 1: 662 | tk.messagebox.showwarning('错误', '%s:输入值不能大于1' % tip) 663 | flag_er = flag_er + 1 664 | else: 665 | pass 666 | except Exception as e: 667 | tk.messagebox.showwarning('错误', '%s:%s' % (tip, e)) 668 | flag_er = flag_er + 1 669 | else: 670 | try: 671 | num = float(inputnum) 672 | except Exception as e: 673 | tk.messagebox.showwarning('错误', '%s:%s' % (tip, e)) 674 | flag_er = flag_er + 1 675 | except Exception as e: 676 | tk.messagebox.showwarning('错误', '%s:%s' % (tip, e)) 677 | flag_er = flag_er + 1 678 | return flag_er 679 | -------------------------------------------------------------------------------- /score_ui.py: -------------------------------------------------------------------------------- 1 | import tkinter as tk 2 | from tkinter import ttk 3 | from tkinter import * 4 | import pandas as pd 5 | 6 | import pickle as pickle 7 | from .func import binning 8 | import math 9 | from .model import lrmodel 10 | 11 | lrmodel = lrmodel() 12 | import datetime 13 | 14 | binning = binning() 15 | import statsmodels.api as sm 16 | 17 | from .score_result_ui import score_result_ui 18 | 19 | import os 20 | 21 | 22 | 23 | class scoreing(): 24 | def __init__(self, mianframe, project_info): 25 | 
self.master = mianframe 26 | # project参数 27 | self.save = 'N' 28 | self.node_type = 'scoring' 29 | self.project_info = project_info 30 | self.project_path = os.path.split(project_info[project_info['模块类型'] == 'project']['保存地址'][0])[0] 31 | self.node_name = 'Score' 32 | self.exist_data = list(project_info['模块名字']) 33 | self.load = 'N' 34 | self.finsh = 'N' 35 | # IGN参数 36 | self.IGN_f_group_report = pd.DataFrame() 37 | self.IGN_grouped_train_data = pd.DataFrame() 38 | self.predict_score_data=pd.DataFrame() 39 | self.IGN_par_traindatavariable_setting = None 40 | self.IGN_node_time = None 41 | 42 | self.timeid_train = None 43 | self.target_train = None 44 | self.timeid_score = None 45 | self.target_score=None 46 | # 模型参数 47 | self.par_intercept_flag = True 48 | self.par_variable_type = 'WOE' 49 | self.model_start_flag = 'N' 50 | 51 | # 分组过程参数 52 | # 评分卡变量 53 | self.predict_score_data=pd.DataFrame() 54 | self.predict_train_data = pd.DataFrame() 55 | self.IGN_grouping_data=pd.DataFrame() 56 | self.model_ppp = [] 57 | self.scorecard_df = pd.DataFrame() 58 | self.lasso_df = pd.DataFrame() 59 | self.par_score_data=pd.DataFrame() 60 | self.pre_data() 61 | self.pre_model() 62 | 63 | # 模块参数 64 | def pre_data(self): 65 | dd = list(self.project_info[self.project_info['模块类型'] == 'DATA']['保存地址']) 66 | self.scoredf_list = [] 67 | for add in dd: 68 | try: 69 | fr = open(add, 'rb') 70 | node_info = pickle.load(fr) 71 | fr.close() 72 | data_role = node_info[0]['data_role'] 73 | node_name = node_info[0]['node_name'] 74 | if data_role == 'Score': 75 | self.scoredf_list.append(node_name) 76 | except Exception as e: 77 | tk.messagebox.showwarning('错误', "%s数据集导入错误:%s" % (add, e)) 78 | def pre_model(self): 79 | try: 80 | dd = list(self.project_info[(self.project_info['模块类型'] == 'SCR') &(self.project_info['状态'] == 'Good')]['模块名字']) 81 | self.SCR_list = dd 82 | except Exception as e: 83 | tk.messagebox.showwarning('错误', e) 84 | def load_node(self,node_data,ac): 85 | print('ccc') 86 | self.finsh = 'Y' 87 | self.node_setting=node_data[0] 88 | self.predict_score_data=node_data[1] 89 | self.node_model_data=node_data[2] 90 | self.node_score_data=node_data[3] 91 | 92 | self.node_model_name=self.node_setting['model_nodename'] 93 | self.par_score_dataname=self.node_setting['model_dataname'] 94 | self.par_score_data=self.node_setting['score_data_node'] 95 | self.node_save_path=self.node_setting['node_save_path'] 96 | previous_node_name=self.node_setting['previous_node_name'] 97 | previous_node_time=self.node_setting['previous_node_time'] 98 | self.node_name=self.node_setting['node_name'] 99 | 100 | self.par_scoredatavariable_setting = self.node_score_data[0]['data_variable_setting'] 101 | self.par_score_dataname = self.node_score_data[0]['node_name'] 102 | self.par_score_data_time = self.node_score_data[0]['time'] 103 | self.par_score_data = self.node_score_data[1] 104 | if len(self.par_scoredatavariable_setting[self.par_scoredatavariable_setting['变量角色'] == 'TimeID']) == 1: 105 | self.flag_timeid_score = True 106 | self.timeid_score = \ 107 | self.par_scoredatavariable_setting.loc[self.par_scoredatavariable_setting['变量角色'] == 'TimeID'][ 108 | '变量名称'].values[0] 109 | if len(self.par_scoredatavariable_setting[self.par_scoredatavariable_setting['变量角色'] == '目标']) == 1: 110 | self.flag_target_score = True 111 | self.target_score = \ 112 | self.par_scoredatavariable_setting.loc[self.par_scoredatavariable_setting['变量角色'] == '目标']['变量名称'].values[0] 113 | 114 | 115 | self.par_variable_type = 
self.node_model_data[0]['par_variable_type'] 116 | self.predict_train_data = self.node_model_data[0]['predict_train_data'] 117 | self.model_ppp = self.node_model_data[0]['model'] 118 | self.f_scorecard = self.node_model_data[0]['scorecard_df'] 119 | self.target_train = self.node_model_data[0]['report_para']['train_target'] 120 | self.timeid_train = self.node_model_data[0]['report_para']['timeid_train'] 121 | self.IGN_f_group_report = self.node_model_data[0]['report_para']['f_group_report'] 122 | self.IGN_grouping_data = self.node_model_data[0]['IGN_grouping_data'] 123 | if ac == 'setting': 124 | error_list=[] 125 | print('a') 126 | for i in range(0,2): 127 | if previous_node_name[i]!=None: 128 | path_list = self.project_info[self.project_info['创建时间'] == previous_node_time[i]]['保存地址'] 129 | if len(path_list) == 0: 130 | print(previous_node_time) 131 | print({'name':previous_node_name[i],'time':previous_node_time[i]}) 132 | error_list=error_list+[{'name':previous_node_name[i],'time':previous_node_time[i]}] 133 | def continu(event): 134 | for child in self.master.winfo_children(): 135 | child.destroy() 136 | # 以前数据集更新了就重新更新结果 137 | self.load = 'N' 138 | self.Start_UI() 139 | self.adjustsetting() 140 | 141 | def back(event): 142 | self.master.destroy() 143 | 144 | if len(error_list) > 0: 145 | self.master.title('提示') 146 | L00 = Label(self.master, width=80, text="该模块引用的%s 模块 没有在项目中找到,\n可能该模块已经更新,删除," 147 | "或未导入\n继续设置可能会导致以前结果丢失" % (error_list)) 148 | L00.grid(column=0, row=0, columnspan=3, sticky=(W)) 149 | button_contin = ttk.Button(self.master, text='继续设置') 150 | button_contin.grid(column=0, row=1, sticky=(W), padx=10, pady=10) 151 | button_contin.bind("", continu) 152 | button_back = ttk.Button(self.master, text='返回') 153 | button_back.grid(column=2, row=1, sticky=(W), padx=10, pady=10) 154 | button_back.bind("", back) 155 | else: 156 | self.load='Y' 157 | self.Start_UI() 158 | self.adjustsetting() 159 | else: 160 | print('b') 161 | self.reult_show_only(self.master) 162 | 163 | def import_data_node(self,event): 164 | path = self.project_info[self.project_info['模块名字'] == self.comboxlist_score_data.get()]['保存地址'][0] 165 | fr = open(path, 'rb') 166 | node_info = pickle.load(fr) 167 | fr.close() 168 | self.node_score_data=node_info 169 | self.par_scoredatavariable_setting = node_info[0]['data_variable_setting'] 170 | self.par_score_dataname = node_info[0]['node_name'] 171 | self.par_score_data_time = node_info[0]['time'] 172 | self.par_score_data = node_info[1] 173 | # self.previous_reject_check_change = node_info[0]['check_change'] 174 | self.previous_reject_node_usedlist = node_info[0]['use_node'] 175 | def load_model_data(self, event): 176 | try: 177 | path = self.project_info[self.project_info['模块名字'] == self.comboxlist_SCR.get()]['保存地址'][0] 178 | fr = open(path, 'rb') 179 | node_data = pickle.load(fr) 180 | fr.close() 181 | self.node_model_data=node_data 182 | self.node_setting=node_data[0] 183 | self.node_type = node_data[0]['node_type'] 184 | self.node_model_name=node_data[0]['node_name'] 185 | self.node_model_time=node_data[0]['time'] 186 | self.node_save_path=node_data[0]['node_save_path'] 187 | self.par_variable_type=node_data[0]['par_variable_type'] 188 | self.predict_train_data=node_data[0]['predict_train_data'] 189 | self.model_ppp=node_data[0]['model'] 190 | self.f_scorecard=node_data[0]['scorecard_df'] 191 | self.target_train=node_data[0]['report_para']['train_target'] 192 | self.timeid_train=node_data[0]['report_para']['timeid_train'] 193 | 
self.IGN_f_group_report=node_data[0]['report_para']['f_group_report'] 194 | self.IGN_grouping_data=node_data[0]['IGN_grouping_data'] 195 | except Exception as e: 196 | tk.messagebox.showwarning('错误', "%s数据集导入错误:%s" % (self.comboxlist_SCR.get(), e)) 197 | def Start_UI(self): 198 | self.start_window_base = self.master 199 | width = self.master.winfo_screenwidth() * 0.2 200 | height = self.master.winfo_screenheight() * 0.4 201 | screenwidth = self.master.winfo_screenwidth() 202 | screenheight = self.master.winfo_screenheight() 203 | self.start_window_base.geometry( 204 | '%dx%d+%d+%d' % (width, height, (screenwidth - width) / 2, (screenheight - height) / 2)) 205 | self.start_window_base.title('数据集打分') 206 | def adjustsetting(self): 207 | # 导入数据 208 | self.node_intro = LabelFrame(self.start_window_base, text='模块名称:') 209 | L8 = Label(self.node_intro, width=25, text="模块名称:", justify="left") 210 | L8.grid(column=0, row=0, sticky=(W)) 211 | if (self.load == 'N') & (self.finsh == 'N'): 212 | node_name = tk.StringVar(value=self.node_name) 213 | self.entry_node_name = Entry(self.node_intro, textvariable=node_name, bd=1, width=18) 214 | self.entry_node_name.grid(column=1, row=0, sticky=(W)) 215 | else: 216 | L88 = Label(self.node_intro, width=25, text="%s" % self.node_name, justify="left") 217 | L88.grid(column=1, row=0, sticky=(W)) 218 | self.node_intro.grid(columnspan=3, sticky=(W), padx=10, pady=10) 219 | 220 | self.start_window_data = LabelFrame(self.start_window_base, text='导入打分数据:') 221 | L1 = Label(self.start_window_data, width=25, text="分组数据:", justify="left") 222 | L1.grid(column=0, row=0, sticky=(W)) 223 | self.comboxlist_score_data = ttk.Combobox(self.start_window_data, width=15) 224 | self.comboxlist_score_data["value"] = self.scoredf_list 225 | if self.par_score_data.empty != True: 226 | for i in range(len(self.scoredf_list)): 227 | if self.scoredf_list[i]==self.par_score_dataname: 228 | self.comboxlist_score_data.current(i) 229 | self.comboxlist_score_data.bind("<>", lambda event: self.import_data_node(event)) 230 | self.comboxlist_score_data.grid(column=1, row=0, sticky=(W)) 231 | 232 | L3 = Label(self.start_window_data, width=25, text="导入模型:", justify="left") 233 | L3.grid(column=0, row=2, sticky=(W)) 234 | self.comboxlist_SCR = ttk.Combobox(self.start_window_data, width=15) 235 | self.comboxlist_SCR["value"] = self.SCR_list 236 | if self.IGN_grouping_data.empty != True: 237 | for i in range(len(self.SCR_list)): 238 | if self.SCR_list[i] == self.node_model_name: 239 | self.comboxlist_SCR.current(i) 240 | self.comboxlist_SCR.grid(column=1, row=2, sticky=(W)) 241 | self.comboxlist_SCR.bind("<>", lambda event: self.load_model_data(event)) 242 | self.start_window_data.grid(columnspan=3, sticky=(W), padx=10, pady=10) 243 | 244 | self.button_setting_save = ttk.Button(self.start_window_base, text='(保存)退出') 245 | self.button_setting_save.grid(column=0, row=7, sticky=(W), padx=10, pady=10) 246 | self.button_setting_save.bind("", self.save_project) 247 | if (self.load == 'Y') | (self.finsh == 'Y'): 248 | self.check_result = ttk.Button(self.start_window_base, text='查看结果') 249 | self.check_result.grid(column=1, row=7, sticky=(W), padx=10, pady=10) 250 | self.check_result.bind("", self.scorecard_result_show_ui) 251 | if (self.load == 'N') & (self.finsh == 'N'): 252 | self.button_setting_run = ttk.Button(self.start_window_base, text='应用') 253 | self.button_setting_run.grid(column=2, row=7, sticky=(W)) 254 | self.button_setting_run.bind("", self.Scoring) 255 | else: 256 | self.button_refresh_run = 
ttk.Button(self.start_window_base, text='刷新结果') 257 | self.button_refresh_run.grid(column=2, row=7, sticky=(W)) 258 | self.button_refresh_run.bind("", self.Scoring) 259 | self.button_output = ttk.Button(self.start_window_base, text='导出数据集') 260 | self.button_output.grid(column=0, row=8, sticky=(W), padx=10, pady=10) 261 | self.button_output.bind("", self.out_dataset) 262 | def out_dataset(self, event): 263 | try: 264 | word = '导出数据集:\n Score数据集:%s/%s_train.csv \n' % (self.project_path, self.node_name) 265 | self.predict_score_data.to_csv(self.project_path + '/' + '%s_score.csv' % self.node_name, index=False, 266 | encoding='utf-8') 267 | tk.messagebox.showwarning('成功', word) 268 | except Exception as e: 269 | tk.messagebox.showwarning('错误', e) 270 | def save_project(self, event): 271 | try: 272 | node_save_path = self.project_path + '/' + '%s.model' % self.node_name 273 | error2 = Toplevel(self.master) 274 | screenwidth = self.master.winfo_screenwidth() 275 | screenheight = self.master.winfo_screenheight() 276 | error2.geometry('%dx%d+%d+%d' % (150, 100, (screenwidth - 150) / 2, (screenheight - 100) / 2)) 277 | L2 = Label(error2, text="保存中") 278 | L2.grid() 279 | self.master.update() 280 | filename = node_save_path 281 | fw = open(filename, 'wb') 282 | pickle.dump([self.node_setting,self.predict_score_data,self.node_model_data,self.node_score_data], fw, protocol=4) 283 | fw.close() 284 | self.save = 'Y' 285 | try: 286 | error2.destroy() 287 | except: 288 | pass 289 | try: 290 | self.master.destroy() 291 | except: 292 | pass 293 | except Exception as e: 294 | tk.messagebox.showwarning('错误', e) 295 | def Scoring(self, event): 296 | # 检查各个数据集变量情况 297 | try: 298 | if self.model_start_flag=='N': 299 | self.model_start_flag='Y' 300 | variable_list = list(self.model_ppp[1].params.reset_index()['index']) 301 | try: 302 | variable_list.remove('const') 303 | except: 304 | pass 305 | variable_df=self.model_ppp[2] 306 | inmodel_df = variable_df[variable_df['model_var'].isin(variable_list)] 307 | include_var = list(inmodel_df[inmodel_df['ori_var'].isnull() == False]['ori_var']) 308 | if len(inmodel_df[inmodel_df['var_type'] == 'add']) > 0: 309 | include_var = include_var + list(inmodel_df[inmodel_df['var_type'] == 'add']['vara']) 310 | include_var = include_var + list(inmodel_df[inmodel_df['var_type'] == 'add']['varb']) 311 | unique_varable = list(set(include_var)) 312 | model_var_num = list(set(self.IGN_grouping_data[(self.IGN_grouping_data['variable_name'].isin(unique_varable)) & ( 313 | self.IGN_grouping_data['variable_type'] == 'num')]['variable_name'])) 314 | model_var_char = list(set(self.IGN_grouping_data[(self.IGN_grouping_data['variable_name'].isin(unique_varable)) & ( 315 | self.IGN_grouping_data['variable_type'] == 'char')]['variable_name'])) 316 | 317 | if len(self.par_scoredatavariable_setting[self.par_scoredatavariable_setting['变量角色'] == 'TimeID']) == 1: 318 | self.flag_timeid_score = True 319 | self.timeid_score = self.par_scoredatavariable_setting.loc[self.par_scoredatavariable_setting['变量角色'] == 'TimeID']['变量名称'].values[0] 320 | if len(self.par_scoredatavariable_setting[self.par_scoredatavariable_setting['变量角色'] == '目标']) == 1: 321 | self.flag_target_score = True 322 | self.target_score = self.par_scoredatavariable_setting.loc[self.par_scoredatavariable_setting['变量角色'] == '目标']['变量名称'].values[0] 323 | 324 | self.varchar_score = list(self.par_scoredatavariable_setting[self.par_scoredatavariable_setting['变量类型'] == '字符型']['变量名称']) 325 | self.varnum_score = 
list(self.par_scoredatavariable_setting[self.par_scoredatavariable_setting['变量类型'] == '数值型']['变量名称'])
326 | 
327 | if list(set(model_var_num) - set(self.varnum_score)) != []:
328 | tk.messagebox.showwarning('错误',"打分集中没有如下数值型变量%s" % (list(set(model_var_num) - set(self.varnum_score))))
329 | self.model_start_flag = 'N'
330 | elif list(set(model_var_char) - set(self.varchar_score)) != []:
331 | tk.messagebox.showwarning('错误',"打分集中没有如下字符型变量%s" % (list(set(model_var_char) - set(self.varchar_score))))
332 | self.model_start_flag = 'N'
333 | else:
334 | self.grouped_score_data = binning.fit_bin_existing(data=self.par_score_data,
335 | varnum=model_var_num,
336 | varchar=model_var_char,
337 | target=self.target_train,
338 | group_info=self.IGN_grouping_data,
339 | data_only=True)
340 | self.scorecard_data_pre(self.model_ppp)
341 | 
342 | node_save_path = self.project_path + '/' + '%s.model' % self.node_name
343 | nowTime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')
344 | self.node_setting = {'node_type':'Scoring',
345 | 'node_name':self.node_name,
346 | 'model_nodename':self.node_model_name,
347 | 'model_dataname':self.par_score_dataname,
348 | 'score_data_node': self.par_score_data,
349 | 'scored_data':self.predict_score_data,
350 | 'time':nowTime,
351 | 'node_save_path':node_save_path,
352 | 'use_node':[self.node_name,self.node_model_name,self.par_score_dataname],
353 | 'previous_node_name': [self.node_model_name,self.par_score_dataname],
354 | 'previous_node_time': [self.node_model_time,self.par_score_data_time]
355 | }
356 | self.finsh = 'Y'
357 | for child in self.master.winfo_children():
358 | child.destroy()
359 | self.adjustsetting()
360 | self.model_start_flag = 'N'
361 | except Exception as e:
362 | tk.messagebox.showwarning('错误', e)
363 | self.model_start_flag = 'N'
364 | def scorecard_result_show_ui(self, event):
365 | try:
366 | if self.result_page.state() == 'normal':
367 | tk.messagebox.showwarning('错误', "请先处理当前打开窗口")
368 | except:
369 | self.result_page = Toplevel(self.master)
370 | score_result_ui(mainframe=self.result_page,
371 | predict_train_data=self.predict_train_data,
372 | predict_score_data=self.predict_score_data,
373 | train_target=self.target_train,
374 | score_target=self.target_score,
375 | train_time_id=self.timeid_train, score_time_id=self.timeid_score,
376 | record_list=self.model_ppp[0], model=self.model_ppp[1], scorecarddf=self.f_scorecard,
377 | f_group_report=self.IGN_f_group_report,
378 | model_var_type=self.par_variable_type)
379 | def scorecard_data_pre(self, model_re):
380 | def score_predict(scorecard, df):
381 | if len(scorecard[scorecard['variable_name']=='const'])==1:
382 | df['SCORE']=list(scorecard[scorecard['variable_name'] =='const']['scorecard'])[0]
383 | else:
384 | df['SCORE'] = 0
385 | for var in list(scorecard['variable_name'].unique()):
386 | if var != 'const':
387 | df['SCR_%s' % var] = 0
388 | for group in scorecard[scorecard['variable_name'] == var]['f_group']:
389 | df['SCR_%s' % var][df['f_group_%s' % var] == group] = \
390 | list(scorecard[(scorecard['variable_name'] == var) & (scorecard['f_group'] == group)]['scorecard'])[0]
391 | df['SCORE'] = df['SCORE'] + df['SCR_%s' % var]
392 | return df
393 | if self.par_variable_type == 'WOE':
394 | # WOE scorecard
395 | def woe_predict(model, intercept, df, woe_score):
396 | if len(woe_score[(woe_score['var_type'] == 'add') & (woe_score['coff'].isnull() == False)]) > 0:
397 | add_variabile_df = woe_score[(woe_score['var_type'] == 'add') & (woe_score['coff'].isnull() == False)]
398 | 
add_variabile_df['group_a'] = add_variabile_df['group_a'].astype('int') 399 | add_variabile_df['group_b'] = add_variabile_df['group_b'].astype('int') 400 | 401 | def add_indictor(vara, varb, groupa, df, groupb): 402 | df['ind_f_group_%s_%s_f_group_%s_%s' % (vara, int(groupa), varb, int(groupb))] = df.apply( 403 | lambda x: 1 if (x['f_group_%s' % vara] == groupa) & ( 404 | x['f_group_%s' % varb] == groupb) else 0, axis=1) 405 | df['f_group_ind_f_group_%s_%s_f_group_%s_%s' % (vara, int(groupa), varb, int(groupb))] = df[ 406 | 'ind_f_group_%s_%s_f_group_%s_%s' % (vara, int(groupa), varb, int(groupb))].astype('int8') 407 | 408 | add_variabile_df.apply( 409 | lambda x: add_indictor(df=df, vara=x['vara'], varb=x['varb'], groupa=x['group_a'], 410 | groupb=x['group_b']), axis=1) 411 | 412 | input_list = list( 413 | pd.DataFrame(model.params).reset_index().rename({'index': 'woe_variable_name', 0: 'coff'}, axis=1)[ 414 | 'woe_variable_name']) 415 | 416 | try: 417 | input_list.remove('const') 418 | except: 419 | pass 420 | if intercept == True: 421 | df['SCORECARD_LR_p_1'] = model.predict(sm.add_constant(df[input_list])) 422 | else: 423 | df['SCORECARD_LR_p_1'] = model.predict(df[input_list]) 424 | return df 425 | 426 | woe_model_re = model_re[1] 427 | cof = pd.DataFrame(woe_model_re.params).reset_index().rename({'index': 'woe_variable_name', 0: 'coff'}, 428 | axis=1) 429 | 430 | variable_df = model_re[2] 431 | woe_score = pd.merge(variable_df, cof, how='outer', left_on='model_var', right_on='woe_variable_name') 432 | 433 | 434 | # 给数据集打分 435 | self.predict_score_data = woe_predict(model=woe_model_re, intercept=self.par_intercept_flag, 436 | df=self.grouped_score_data, woe_score=woe_score) 437 | self.predict_score_data = score_predict(self.f_scorecard, self.predict_score_data) 438 | else: 439 | # group 评分卡 440 | grp_ppp = model_re 441 | grp_model = grp_ppp[1] 442 | cof = pd.DataFrame(grp_model.params).reset_index().rename({'index': 'grp_variable_name', 0: 'coff'}, axis=1) 443 | group_report = self.IGN_f_group_report 444 | variable_df = grp_ppp[2] 445 | total = group_report.groupby(['variable_name'])['f_N_obs'].sum().reset_index().rename( 446 | {'f_N_obs': 'total_count'}, axis=1) 447 | group_report = pd.merge(group_report, total, how='left', on='variable_name') 448 | group_report['pct_f_N_obs'] = group_report['f_N_obs'] / group_report['total_count'] 449 | grp_score = pd.merge(variable_df, cof, how='outer', left_on='model_var', right_on='grp_variable_name') 450 | grp_score['variable'][grp_score['grp_variable_name'] == 'const'] = 'const' 451 | use = grp_score.groupby('variable')['coff'].max().reset_index() 452 | use = list(use[use['coff'].isnull() == False]['variable']) 453 | grp_model_df = grp_score[grp_score['variable'].isin(use)].fillna(0) 454 | grp_model_df = grp_model_df.rename({'group': 'f_group'}, axis=1) 455 | grp_model_df['variable_name'] = grp_model_df['variable'].apply(lambda x: 'const' if x == 'const' else x[8:]) 456 | scorecard = pd.merge(grp_model_df, group_report, how='left', on=['variable_name', 'f_group'])[ 457 | ['variable_name', 'f_group', 'var_type', 'f_N_obs', 'label', 'f_Bad_rate', 'pct_f_N_obs', 'coff', 458 | 'woe']] 459 | B = self.par_odds_double_score / math.log(2) 460 | A = self.par_odds_score_ratio + B * math.log(self.par_odds_ratio) 461 | scorecard['SCORE'] = scorecard.apply( 462 | lambda x: A - B * x['coff'] if x['variable_name'] == 'const' else -B * x['coff'], axis=1) 463 | score_adjust = scorecard.groupby('variable_name')['SCORE'].min().reset_index().rename( 464 | 
{'SCORE': 'score_min'}, axis=1) 465 | adjust_num = score_adjust[score_adjust['score_min'] < 0]['score_min'].sum() 466 | score_adjust['score_min'][score_adjust['variable_name'] == 'const'] = -adjust_num 467 | f_scorecard = pd.merge(scorecard, score_adjust, how='left', on='variable_name') 468 | 469 | 470 | # 给数据集打分 471 | def grp_predict(model, intercept, df): 472 | input_list = list(pd.DataFrame(model.params).reset_index().rename({'index': 'grp_variable_name', 0: 'coff'}, axis=1)['grp_variable_name']) 473 | try: 474 | input_list.remove('const') 475 | except: 476 | pass 477 | if intercept == True: 478 | df['SCORECARD_LR_p_1'] = model.predict(sm.add_constant(df[input_list])) 479 | else: 480 | df['SCORECARD_LR_p_1'] = model.predict(df[input_list]) 481 | return df 482 | 483 | def group_data_pre(df, f_scorecard): 484 | for varable in list(set(f_scorecard[f_scorecard['variable_name'] != 'const']['variable_name'])): 485 | grouplist = list(set(f_scorecard[f_scorecard['variable_name'] == varable]['f_group'])) 486 | for value in grouplist: 487 | df['f_group_%s_%s' % (varable, int(value))] = df['f_group_%s' % varable].apply( 488 | lambda x: 1 if x == int(value) else 0) 489 | df['f_group_%s_%s' % (varable, int(value))] = df[ 490 | 'f_group_%s_%s' % (varable, int(value))].astype('int8') 491 | return df 492 | self.predict_score_data = grp_predict(model=grp_model, intercept=self.par_intercept_flag, 493 | df=group_data_pre(self.grouped_score_data, f_scorecard)) 494 | self.predict_score_data = score_predict(self.f_scorecard, self.predict_score_data) 495 | def reult_show_only(self, result_page): 496 | score_result_ui(mainframe=result_page, 497 | predict_train_data=self.predict_train_data, 498 | predict_score_data=self.predict_score_data, 499 | train_target=self.target_train, 500 | score_target=self.target_score, 501 | train_time_id=self.timeid_train, score_time_id=self.timeid_score, 502 | record_list=self.model_ppp[0], model=self.model_ppp[1], scorecarddf=self.f_scorecard, 503 | f_group_report=self.IGN_f_group_report, 504 | model_var_type=self.par_variable_type) 505 | 506 | -------------------------------------------------------------------------------- /split.py: -------------------------------------------------------------------------------- 1 | import tkinter as tk 2 | from tkinter import ttk 3 | from tkinter import * 4 | import pandas as pd 5 | import pickle as pickle 6 | import datetime 7 | import os 8 | 9 | import matplotlib.pyplot as plt 10 | 11 | from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg 12 | from matplotlib_venn import venn3 13 | class spliting(): 14 | def __init__(self,mainframe,project_info): 15 | self.master=mainframe 16 | self.project_info=project_info 17 | self.project_path=os.path.split(project_info[project_info['模块类型'] == 'project']['保存地址'][0])[0] 18 | self.method='' 19 | self.train_pct=0.7 20 | self.valid_pct=0.3 21 | self.seed=123456 22 | self.bad_pct=1 23 | self.sample_flag='否' 24 | self.replace='False' 25 | self.bad_rate=0.1 26 | self.par_train_data=pd.DataFrame() 27 | self.node_name='split' 28 | self.exist_data=list(project_info['模块名字']) 29 | self.load='N' 30 | self.save='N' 31 | def load_node(self,node_data,ac): 32 | #重新进去页面 33 | self.load='Y' 34 | self.node_name= node_data[0]['node_name'] 35 | self.previous_node_name= node_data[0]['previous_node_name'] 36 | self.previous_node_time = node_data[0]['previous_node_time'] 37 | self.method= node_data[0]['method'] 38 | self.replace= node_data[0]['replace'] 39 | self.seed= node_data[0]['seed'] 40 | self.train_pct= 
node_data[0]['train_pct'] 41 | self.valid_pct= node_data[0]['valid_pct'] 42 | self.sample_flag= node_data[0]['adjuest_bad'] 43 | self.bad_pct= node_data[0]['bad_sample_pct'] 44 | self.bad_rate= node_data[0]['sample_bad_rate'] 45 | #self.check_list=node_data[0]['check_change'] 46 | self.par_traindatavariable_setting=node_data[0]['data_variable_setting'] 47 | self.trainpart_data= node_data[1] 48 | self.validpart_data= node_data[2] 49 | self.par_train_data= node_data[3] 50 | if ac=='setting': 51 | path_list = self.project_info[self.project_info['创建时间'] == self.previous_node_time]['保存地址'] 52 | def continu(event): 53 | for child in self.master.winfo_children(): 54 | child.destroy() 55 | self.ui_start() 56 | def back(event): 57 | self.master.destroy() 58 | if len(path_list)==0: 59 | self.master.title('提示') 60 | L00 = Label(self.master, width=80, text="该模块引用的%s (创建于 %s)模块 没有在项目中找到,\n可能该模块已经更新,删除," 61 | "或未导入\n继续设置可能会导致以前结果丢失" %(self.previous_node_name,self.previous_node_time)) 62 | L00.grid(column=0, row=0, columnspan=3, sticky=(W)) 63 | button_contin = ttk.Button(self.master, text='继续设置') 64 | button_contin.grid(column=0, row=1, sticky=(W), padx=10, pady=10) 65 | button_contin.bind("", continu) 66 | button_back = ttk.Button(self.master, text='返回') 67 | button_back.grid(column=2, row=1, sticky=(W), padx=10, pady=10) 68 | button_back.bind("", back) 69 | else: 70 | path=path_list[0] 71 | try: 72 | fr = open(path,'rb') 73 | node_info = pickle.load(fr) 74 | fr.close() 75 | # self.previous_check_change=node_info[0]['check_change'] 76 | self.previous_node_usedlist=node_info[0]['use_node'] 77 | self.previous_node_name=node_info[0]['node_name'] 78 | self.data_role=node_info[0]['data_role'] 79 | self.par_train_data=node_info[1] 80 | self.ui_start() 81 | except Exception as e: 82 | self.master.title('提示') 83 | L00 = Label(self.master, width=80, text="导入%s (创建于 %s)模块 发生错误,\n可能该模块已经被破坏或删除," 84 | "\n%s" % (self.previous_node_name, self.previous_node_time,e)) 85 | L00.grid(column=0, row=0, columnspan=3, sticky=(W)) 86 | button_contin = ttk.Button(self.master, text='继续设置') 87 | button_contin.grid(column=0, row=1, sticky=(W), padx=10, pady=10) 88 | button_contin.bind("", continu) 89 | button_back = ttk.Button(self.master, text='返回') 90 | button_back.grid(column=2, row=1, sticky=(W), padx=10, pady=10) 91 | button_back.bind("", back) 92 | else: 93 | self.result_ui(self.master,ac='re') 94 | def ui_start(self): 95 | #初始页面设置 96 | dd=list(self.project_info[((self.project_info['模块类型']=='DATA')|(self.project_info['模块类型']=='SAMPLE')) 97 | &(self.project_info['状态']=='Good')]['保存地址']) 98 | train_data_list=[] 99 | 100 | for add in dd: 101 | try: 102 | fr = open(add,'rb') 103 | node_info = pickle.load(fr) 104 | fr.close() 105 | data_role=node_info[0]['data_role'] 106 | if data_role=='Training model': 107 | data_name=self.project_info[(self.project_info['保存地址'] == add) ]['模块名字'][0] 108 | train_data_list.append(data_name) 109 | else : 110 | pass 111 | except Exception as e: 112 | tk.messagebox.showwarning('错误', "%s数据集导入错误:%s" % (add, e)) 113 | self.start_window_base=self.master 114 | width = self.master.winfo_screenwidth()*0.15 115 | height = self.master.winfo_screenheight()*0.3 116 | screenwidth = self.master.winfo_screenwidth() 117 | screenheight = self.master.winfo_screenheight() 118 | self.start_window_base.geometry('%dx%d+%d+%d' % (width, height, (screenwidth-width)/2, (screenheight-height)/2)) 119 | self.start_window_base.title('样本分区参数设置') 120 | #参数设置 121 | 
self.start_window_data=LabelFrame(self.start_window_base,text='参数设置') 122 | L00= Label(self.start_window_data,width=20, text="名称") 123 | L00.grid( column=0, row=0,sticky=(W)) 124 | if self.load=='N': 125 | nodename=tk.StringVar(value=self.node_name) 126 | self.entry_node_name = Entry(self.start_window_data ,textvariable=nodename,bd =1,width=18) 127 | self.entry_node_name.grid( column=1, row=0,sticky=(W)) 128 | else: 129 | L01= Label(self.start_window_data,width=20, text=self.node_name,bd=2) 130 | L01.grid( column=1, row=0,sticky=(W)) 131 | L0 = Label(self.start_window_data,width=20, text="训练样本") 132 | L0.grid( column=0, row=1,sticky=(W)) 133 | self.comboxlist_train_data = ttk.Combobox(self.start_window_data,width=15) 134 | self.comboxlist_train_data["value"] = train_data_list 135 | if self.load=='Y': 136 | for i in range(len(train_data_list)): 137 | if train_data_list[i]==self.previous_node_name: 138 | self.comboxlist_train_data.current(i) 139 | self.comboxlist_train_data.bind("<>",lambda event:self.load_data(event,datatype='train')) 140 | self.comboxlist_train_data.grid( column=1, row=1,sticky=(W)) 141 | 142 | L1 = Label(self.start_window_data,width=20, text="方法") 143 | L1.grid( column=0, row=2,sticky=(W)) 144 | self.comboxlist_method = ttk.Combobox(self.start_window_data,width=15) 145 | self.comboxlist_method["value"] = ['简单随机','分层(好坏)'] 146 | if self.load=='Y': 147 | if self.method=='简单随机': 148 | self.comboxlist_method.current(0) 149 | else: 150 | self.comboxlist_method.current(1) 151 | else: 152 | self.comboxlist_method.current(0) 153 | self.comboxlist_method.grid( column=1, row=2,sticky=(W)) 154 | 155 | L3 = Label(self.start_window_data,width=20, text="是否放回") 156 | L3.grid( column=0, row=3,sticky=(W)) 157 | self.comboxlist_replace = ttk.Combobox(self.start_window_data,width=15) 158 | self.comboxlist_replace["value"] = ['False','True'] 159 | if self.load=='Y': 160 | if self.replace=='True': 161 | self.comboxlist_replace.current(1) 162 | else: 163 | self.comboxlist_replace.current(0) 164 | else: 165 | self.comboxlist_replace.current(0) 166 | self.comboxlist_replace.grid( column=1, row=3,sticky=(W)) 167 | 168 | 169 | L2 = Label(self.start_window_data,width=20, text="随机种子") 170 | L2.grid( column=0, row=4,sticky=(W)) 171 | seed=tk.StringVar(value=self.seed) 172 | self.entry_seed = Entry(self.start_window_data ,textvariable=seed,bd =1,width=18) 173 | self.entry_seed.grid( column=1, row=4,sticky=(W)) 174 | self.entry_seed.bind('',lambda event : self.int_num_check(event,'seed','int')) 175 | self.start_window_data.grid(columnspan=2, padx=10, pady=10) 176 | 177 | #分配比例 178 | 179 | self.start_window_pct=LabelFrame(self.start_window_base,text='数据集分配比例') 180 | L3 = Label(self.start_window_pct,width=20, text="训练样本比例") 181 | L3.grid( column=0, row=0,sticky=(W)) 182 | train_pct=tk.StringVar(value=self.train_pct) 183 | self.entry_train_pct = Entry(self.start_window_pct ,textvariable=train_pct,bd =1,width=18) 184 | self.entry_train_pct.grid( column=1, row=0,sticky=(W)) 185 | self.entry_train_pct.bind('',lambda event : self.int_num_check(event,'train_pct','g')) 186 | 187 | L4 = Label(self.start_window_pct,width=20, text="验证样本比例") 188 | L4.grid( column=0, row=1,sticky=(W)) 189 | valid_pct=tk.StringVar(value=self.valid_pct) 190 | self.entry_valid_pct = Entry(self.start_window_pct ,textvariable=valid_pct,bd =1,width=18) 191 | self.entry_valid_pct.grid( column=1, row=1,sticky=(W)) 192 | self.entry_valid_pct.bind('',lambda event : self.int_num_check(event,'valid_pct','g')) 193 | 
self.start_window_pct.grid(columnspan=2, padx=10, pady=10) 194 | 195 | 196 | #是否调整样本坏账率 197 | 198 | 199 | self.start_window_sample=LabelFrame(self.start_window_base,text='设置样本坏账率') 200 | 201 | L3 = Label(self.start_window_sample,width=20, text="是否调整样本坏账率") 202 | L3.grid( column=0, row=0,sticky=(W)) 203 | self.comboxlist_sample_flag = ttk.Combobox(self.start_window_sample,width=15) 204 | self.comboxlist_sample_flag["value"] = ['否','是'] 205 | if self.load == 'Y': 206 | if self.sample_flag =='否': 207 | self.comboxlist_sample_flag.current(0) 208 | else: 209 | self.comboxlist_sample_flag.current(1) 210 | else: 211 | self.comboxlist_sample_flag.current(0) 212 | self.comboxlist_sample_flag.grid( column=1, row=0,sticky=(W)) 213 | 214 | 215 | L3 = Label(self.start_window_sample,width=20, text="坏样本抽样比例") 216 | L3.grid( column=0, row=1,sticky=(W)) 217 | bad_pct=tk.StringVar(value=self.bad_pct) 218 | self.entry_bad_pct = Entry(self.start_window_sample ,textvariable=bad_pct,bd =1,width=18) 219 | self.entry_bad_pct.grid( column=1, row=1,sticky=(W)) 220 | self.entry_bad_pct.bind('',lambda event : self.int_num_check(event,'bad_pct','g')) 221 | 222 | L4 = Label(self.start_window_sample,width=20, text="整体样本坏账率") 223 | L4.grid( column=0, row=2,sticky=(W)) 224 | valid_pct=tk.StringVar(value=self.bad_rate) 225 | self.entry_bad_rate = Entry(self.start_window_sample ,textvariable=valid_pct,bd =1,width=18) 226 | self.entry_bad_rate.grid( column=1, row=2,sticky=(W)) 227 | self.entry_bad_rate.bind('',lambda event : self.int_num_check(event,'badrate','pct')) 228 | self.start_window_sample.grid(columnspan=2, padx=10, pady=10) 229 | 230 | if self.load=='N': 231 | self.button_setting_save = ttk.Button(self.start_window_base,text='保存 确认') 232 | self.button_setting_save.grid( column=0, row=7,sticky=(W), padx=10, pady=10) 233 | self.button_setting_save.bind("",self.check_all_setting) 234 | else: 235 | self.button_setting_save = ttk.Button(self.start_window_base,text='更新结果') 236 | self.button_setting_save.grid( column=0, row=7,sticky=(W), padx=10, pady=10) 237 | self.button_setting_save.bind("",self.check_all_setting) 238 | def load_data(self,event,datatype): 239 | #读取数据 240 | try: 241 | print(datatype,self.comboxlist_train_data.get()) 242 | if (datatype=='train')&(len(self.comboxlist_train_data.get())>=1): 243 | path=self.project_info[self.project_info['模块名字']==self.comboxlist_train_data.get()]['保存地址'][0] 244 | 245 | fr = open(path,'rb') 246 | node_info = pickle.load(fr) 247 | fr.close() 248 | 249 | self.par_traindatavariable_setting=node_info[0]['data_variable_setting'] 250 | # self.previous_check_change=node_info[0]['check_change'] 251 | self.previous_node_usedlist=node_info[0]['use_node'] 252 | self.previous_node_name=node_info[0]['node_name'] 253 | self.previous_node_time=node_info[0]['time'] 254 | self.data_role=node_info[0]['data_role'] 255 | self.par_train_data=node_info[1] 256 | elif len(self.comboxlist_train_data.get())<=1: 257 | self.par_train_data = pd.DataFrame() 258 | else: 259 | pass 260 | except Exception as e : 261 | self.par_train_data = pd.DataFrame() 262 | tk.messagebox.showwarning('错误', "%s数据集导入错误:%s" % (self.comboxlist_train_data.get(), e)) 263 | def split_function(self): 264 | #如果index有重复则重新reindex 265 | if self.par_train_data.index.is_unique==False: 266 | 267 | self.par_train_data=self.par_train_data.reset_index(drop=True) 268 | else: 269 | pass 270 | #是否调整坏账率进行抽样 271 | if self.sample_flag=='是': 272 | 273 | target=list(self.par_traindatavariable_setting[ 
263 | def split_function(self):
264 | #如果index有重复则重新reindex
265 | if self.par_train_data.index.is_unique==False:
266 | 
267 | self.par_train_data=self.par_train_data.reset_index(drop=True)
268 | else:
269 | pass
270 | #是否调整坏账率进行抽样
271 | if self.sample_flag=='是':
272 | 
273 | target=list(self.par_traindatavariable_setting[ self.par_traindatavariable_setting['变量角色']=='目标']['变量名称'])[0]
274 | bad_data=self.par_train_data[self.par_train_data[target]==1]
275 | if self.bad_pct<=1:
276 | 
277 | bad_part=bad_data.sample(frac=self.bad_pct, replace=False, random_state=self.seed)
278 | else:
279 | 
280 | bad_part=bad_data.sample(frac=self.bad_pct, replace=True, random_state=self.seed)
281 | bad_num=len(bad_part)
282 | good_num=int(round((bad_num/self.bad_rate)*(1-self.bad_rate),0))
283 | good_data=self.par_train_data[self.par_train_data[target]==0]
284 | if good_num>len(good_data):
285 | 
286 | good_part=good_data.sample(n=good_num, replace=True, random_state=self.seed)
287 | else:
288 | 
289 | good_part=good_data.sample(n=good_num, replace=False, random_state=self.seed)
290 | or_data=pd.concat([good_part,bad_part])
291 | else:
292 | 
293 | or_data=self.par_train_data
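# The block above resamples the good rows so that the pooled data hits the
# requested bad rate: with bad_num bad rows and a target rate r, choosing
# good_num = bad_num * (1 - r) / r makes bad_num / (bad_num + good_num) == r.
# A quick standalone check of that identity:
bad_num, r = 200, 0.25
good_num = int(round((bad_num / r) * (1 - r), 0))  # same formula as above -> 600
assert abs(bad_num / (bad_num + good_num) - r) < 1e-9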
294 | #简单随机
295 | if self.method=='简单随机':
296 | #过抽样
297 | if ((self.train_pct>1)|(self.valid_pct>1)) & (self.replace=='False'):
298 | tk.messagebox.showwarning('错误','由于验证集或训练集比例大于1 \n 进行有放回抽样')
299 | self.replace='True'
300 | #不放回抽样
301 | if (self.train_pct+self.valid_pct==1)&(self.replace=='False'):
302 | 
303 | self.trainpart_data = or_data.sample(frac=self.train_pct, replace=False, random_state=self.seed)
304 | self.validpart_data = or_data.drop(self.trainpart_data.index)
305 | elif (self.train_pct+self.valid_pct<1)&(self.replace=='False'):
306 | 
307 | self.trainpart_data = or_data.sample(frac=self.train_pct, replace=False, random_state=self.seed)
308 | rest_data = or_data.drop(self.trainpart_data.index)
309 | self.validpart_data = rest_data.sample(frac=(self.valid_pct/(1-self.train_pct)), replace=False, random_state=self.seed)
310 | elif (self.train_pct+self.valid_pct>1)&(self.replace=='False'):
311 | 
312 | self.trainpart_data = or_data.sample(frac=self.train_pct, replace=False, random_state=self.seed)
313 | rest_data = or_data.drop(self.trainpart_data.index)
314 | self.validpart_data = pd.concat([rest_data, self.trainpart_data.sample(frac=((self.valid_pct-(1-self.train_pct))/self.train_pct), replace=False, random_state=self.seed)])  # valid = all remaining rows plus an overlap drawn from train; the overlap fraction is taken out of train_pct
315 | else:
316 | 
317 | self.trainpart_data = or_data.sample(frac=self.train_pct, replace=True, random_state=self.seed)
318 | self.validpart_data = or_data.sample(frac=self.valid_pct, replace=True, random_state=self.seed*2)
319 | else:
320 | #根据好坏进行抽样
321 | target=list(self.par_traindatavariable_setting[ self.par_traindatavariable_setting['变量角色']=='目标']['变量名称'])[0]
322 | good_data=or_data[or_data[target]==0]
323 | bad_data=or_data[or_data[target]==1]
324 | if ((self.train_pct>1)|(self.valid_pct>1)) & (self.replace=='False'):
325 | 
326 | tk.messagebox.showwarning('错误','由于验证集或训练集比例大于1 \n 进行有放回抽样')
327 | self.replace='True'
328 | if (self.train_pct+self.valid_pct==1)&(self.replace=='False'):
329 | 
330 | #good
331 | trainpart_gdata = good_data.sample(frac=self.train_pct, replace=False, random_state=self.seed)
332 | validpart_gdata = good_data.drop(trainpart_gdata.index)
333 | #bad
334 | trainpart_bdata = bad_data.sample(frac=self.train_pct, replace=False, random_state=self.seed)
335 | validpart_bdata = bad_data.drop(trainpart_bdata.index)
336 | elif (self.train_pct+self.valid_pct<1)&(self.replace=='False'):
337 | 
338 | #good
339 | trainpart_gdata = good_data.sample(frac=self.train_pct, replace=False, random_state=self.seed)
340 | rest_data = good_data.drop(trainpart_gdata.index)
341 | validpart_gdata = rest_data.sample(frac=(self.valid_pct/(1-self.train_pct)), replace=False, random_state=self.seed)
342 | #bad
343 | trainpart_bdata = bad_data.sample(frac=self.train_pct, replace=False, random_state=self.seed)
344 | rest_data = bad_data.drop(trainpart_bdata.index)
345 | validpart_bdata = rest_data.sample(frac=(self.valid_pct/(1-self.train_pct)), replace=False, random_state=self.seed)
346 | elif (self.train_pct+self.valid_pct>1)&(self.replace=='False'):
347 | 
348 | #good
349 | trainpart_gdata = good_data.sample(frac=self.train_pct, replace=False, random_state=self.seed)
350 | rest_data = good_data.drop(trainpart_gdata.index)
351 | validpart_gdata1 = trainpart_gdata.sample(frac=((self.valid_pct-(1-self.train_pct))/self.train_pct), replace=False, random_state=self.seed)  # overlap fraction is taken out of train_pct
352 | validpart_gdata=pd.concat([rest_data,validpart_gdata1])
353 | #bad
354 | trainpart_bdata = bad_data.sample(frac=self.train_pct, replace=False, random_state=self.seed)
355 | rest_data = bad_data.drop(trainpart_bdata.index)
356 | validpart_bdata1 = trainpart_bdata.sample(frac=((self.valid_pct-(1-self.train_pct))/self.train_pct), replace=False, random_state=self.seed)
357 | validpart_bdata=pd.concat([rest_data,validpart_bdata1])
358 | else:
359 | 
360 | #good
361 | trainpart_gdata = good_data.sample(frac=self.train_pct, replace=True, random_state=self.seed)
362 | validpart_gdata = good_data.sample(frac=self.valid_pct, replace=True, random_state=self.seed*2)
363 | #bad
364 | trainpart_bdata = bad_data.sample(frac=self.train_pct, replace=True, random_state=self.seed)
365 | validpart_bdata = bad_data.sample(frac=self.valid_pct, replace=True, random_state=self.seed*2)
366 | self.trainpart_data=pd.concat([trainpart_gdata,trainpart_bdata])
367 | self.validpart_data=pd.concat([validpart_gdata,validpart_bdata])
368 | try:
369 | self.tt.destroy()
370 | except:
371 | pass
372 | result_main_frame=Toplevel(self.master)
373 | self.result_ui(result_main_frame,ac='setting')
374 | def result_ui(self,mainframe,ac):
375 | #展示结果
376 | 
377 | self.tt = mainframe
378 | target=list(self.par_traindatavariable_setting[ self.par_traindatavariable_setting['变量角色']=='目标']['变量名称'])[0]
379 | t_data = {"data":'原始数据集', "obs":self.par_train_data[target].count(),
380 | "bad_num":self.par_train_data[target].sum(),
381 | "bad_rate":str(round(self.par_train_data[target].mean() * 100, 2)) + '%'}
382 | t_train_data = {"data":'训练数据集',"obs": self.trainpart_data[target].count(),
383 | "bad_num": self.trainpart_data[target].sum(),
384 | "bad_rate":str(round(self.trainpart_data[target].mean() * 100, 2)) + '%'}
385 | t_valid_data = {"data":'验证数据集',"obs": self.validpart_data[target].count(),
386 | "bad_num": self.validpart_data[target].sum(),
387 | "bad_rate": str(round(self.validpart_data[target].mean() * 100, 2)) + '%'}
388 | t = []
389 | t.append(t_data)
390 | t.append(t_train_data)
391 | t.append(t_valid_data)
392 | 
393 | if ac!='re':
394 | self.button_result_save = ttk.Button(self.tt,text='保存 确认')
395 | self.button_result_save.grid( column=0, row=0,sticky=(W), padx=10, pady=10)
396 | self.button_result_save.bind("<Button-1>",self.save_data)  # '<Button-1>' assumed; original event name was lost
397 | 
398 | self.button_reset = ttk.Button(self.tt,text='重新分区')
399 | self.button_reset.grid( column=3, row=0,sticky=(W), padx=10, pady=10)
400 | self.button_reset.bind("<Button-1>",self.all_reset)
401 | 
402 | summ=LabelFrame(self.tt,text='分区结果',width=20,height=20)
403 | tree=ttk.Treeview(summ,columns=["data", 'obs', 'bad_num', 'bad_rate'],show="headings",selectmode = tk.BROWSE)
404 | tree.column("data", width=80, minwidth=80, stretch=tk.NO, anchor = "center")
405 | tree.column("obs", width=80, minwidth=80, stretch=tk.NO, anchor = "center")
406 | 
tree.column("bad_num", width=80, minwidth=80, stretch=tk.NO, anchor = "center") 407 | tree.column("bad_rate", width=80, minwidth=80, stretch=tk.NO,anchor = "center") 408 | 409 | tree.heading("data", text = "数据集") 410 | tree.heading("obs", text = "样本数") 411 | tree.heading("bad_num", text = "坏客户数") 412 | tree.heading("bad_rate", text = "坏账率") 413 | 414 | i = 0 415 | for v in t: 416 | tree.insert('', i, values = (v.get("data"), v.get("obs"), v.get("bad_num"), v.get("bad_rate"))) 417 | i += 1 418 | 419 | tree.grid() 420 | summ.grid(column=0,row=1,columnspan=4,padx=8,pady=8) 421 | 422 | 423 | la=LabelFrame(self.tt,text='总样本分布') 424 | t=set(self.trainpart_data.index) 425 | t1=set(self.validpart_data.index) 426 | t2=set(self.par_train_data.index) 427 | canvas=tk.Canvas() 428 | g=plt.figure(figsize=(4,4)) 429 | pp=venn3(subsets=[t,t1,t2],set_labels=('train','valid','total'),set_colors=('r','b','g')) 430 | canvas=FigureCanvasTkAgg(g,la) 431 | canvas.get_tk_widget().pack(side=tk.TOP, fill=tk.BOTH, expand=1) 432 | canvas._tkcanvas.pack(side=tk.TOP, fill=tk.BOTH, expand=1) 433 | la.grid(column=4,row=1,columnspan=4,padx=8,pady=8) 434 | 435 | good=LabelFrame(self.tt,text='好样本分布',width=20,height=20) 436 | tg=set(self.trainpart_data[self.trainpart_data[target]==0].index) 437 | tg1=set(self.validpart_data[self.validpart_data[target]==0].index) 438 | tg2=set(self.par_train_data[self.par_train_data[target]==0].index) 439 | canvasg=tk.Canvas() 440 | gg=plt.figure(figsize=(4,4)) 441 | pp=venn3(subsets=[tg,tg1,tg2],set_labels=('train','valid','total'),set_colors=('r','b','g')) 442 | canvasg=FigureCanvasTkAgg(gg,good) 443 | canvasg.get_tk_widget().pack(side=tk.TOP, fill=tk.BOTH, expand=1) 444 | canvasg._tkcanvas.pack(side=tk.TOP, fill=tk.BOTH, expand=1) 445 | good.grid(column=0,row=2,columnspan=4,padx=8,pady=8) 446 | 447 | bad=LabelFrame(self.tt,text='坏样本分布',width=20,height=20) 448 | tg=set(self.trainpart_data[self.trainpart_data[target]==1].index) 449 | tg1=set(self.validpart_data[self.validpart_data[target]==1].index) 450 | tg2=set(self.par_train_data[self.par_train_data[target]==1].index) 451 | canvasg=tk.Canvas() 452 | gg=plt.figure(figsize=(4,4)) 453 | pp=venn3(subsets=[tg,tg1,tg2],set_labels=('train','valid','total'),set_colors=('r','b','g')) 454 | canvasg=FigureCanvasTkAgg(gg,bad) 455 | canvasg.get_tk_widget().pack(side=tk.TOP, fill=tk.BOTH, expand=1) 456 | canvasg._tkcanvas.pack(side=tk.TOP, fill=tk.BOTH, expand=1) 457 | bad.grid(column=4,row=2,columnspan=4,padx=8,pady=8) 458 | def all_reset(self,event): 459 | #返回重新分区 460 | try: 461 | self.tt.destroy() 462 | except Exception as e: 463 | tk.messagebox.showwarning('错误',e) 464 | def save_data(self,event): 465 | #保存数据 466 | try: 467 | node_save_path=self.project_path+'/'+'%s.spliting' %self.node_name 468 | nowTime=datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f') 469 | self.node_setting={'node_type':'SPLIT', 470 | 'node_name':self.node_name, 471 | 'previous_node_name':self.previous_node_name, 472 | 'previous_node_time':self.previous_node_time, 473 | 'node_save_path':node_save_path, 474 | 'method':self.method, 475 | 'data_role':self.data_role, 476 | 'replace':self.replace, 477 | 'seed':self.seed, 478 | 'train_pct':self.train_pct, 479 | 'valid_pct':self.valid_pct, 480 | 'adjuest_bad':self.sample_flag, 481 | 'bad_sample_pct':self.bad_pct, 482 | 'sample_bad_rate':self.bad_rate, 483 | 'time':nowTime, 484 | #'check_change':[{'node_name': self.node_name,'node_time':nowTime}]+self.previous_check_change, 485 | 'data_variable_setting': 
self.par_traindatavariable_setting, 486 | 'use_node':[self.node_name]+self.previous_node_usedlist} 487 | 488 | 489 | 490 | data_save=(self.node_setting,self.trainpart_data,self.validpart_data,self.par_train_data) 491 | error2= Toplevel(self.master) 492 | screenwidth = self.master.winfo_screenwidth() 493 | screenheight = self.master.winfo_screenheight() 494 | 495 | error2.geometry( '%dx%d+%d+%d' % (150,100 ,(screenwidth-150)/2, (screenheight-100)/2)) 496 | L2 = Label(error2, text="保存中") 497 | L2.grid() 498 | self.master.update() 499 | 500 | filename=node_save_path 501 | fw = open(filename,'wb') 502 | pickle.dump(data_save, fw, 1) 503 | fw.close() 504 | 505 | self.save='Y' 506 | try: 507 | error2.destroy() 508 | except: 509 | pass 510 | self.master.destroy() 511 | 512 | except Exception as e: 513 | tk.messagebox.showwarning('错误',e) 514 | #检查所有变量参数是否正确 515 | def check_all_setting(self,event): 516 | try: 517 | self.get_par(event) 518 | if (self.node_name in self.exist_data)&(self.load=='N'): 519 | tk.messagebox.showwarning('错误',"该名称已经被占用,请更改") 520 | else: 521 | e=0 522 | if self.par_train_data.empty==True: 523 | tk.messagebox.showwarning('错误',"错误:训练样本为空") 524 | else: 525 | total=['seed','train_pct', 'valid_pct','bad_pct','bad_rate'] 526 | 527 | for p in total: 528 | if p in ['seed']: 529 | flag='int' 530 | entry_p=p 531 | er=self.int_num_check(event,entry_p,flag) 532 | elif p in ['bad_rate']: 533 | flag='pct' 534 | entry_p=p 535 | er=self.int_num_check(event,entry_p,flag) 536 | else: 537 | flag='g' 538 | entry_p=p 539 | er=self.int_num_check(event,entry_p,flag) 540 | e=e+er 541 | 542 | if e==0: 543 | self.split_function() 544 | except Exception as e: 545 | tk.messagebox.showwarning('错误',e) 546 | def get_par(self,event): 547 | #更新得到的设置 548 | self.load_data(event,'train') 549 | self.valid_pct=float(self.entry_valid_pct.get()) 550 | self.train_pct=float(self.entry_train_pct.get()) 551 | self.seed=int(self.entry_seed.get()) 552 | self.replace=self.comboxlist_replace.get() 553 | self.par_train_data=self.par_train_data 554 | self.bad_pct=float(self.entry_bad_pct.get()) 555 | self.bad_rate=float(self.entry_bad_rate.get()) 556 | self.sample_flag=self.comboxlist_sample_flag.get() 557 | self.method=self.comboxlist_method.get() 558 | if self.load=='N': 559 | self.node_name=self.entry_node_name.get() 560 | def int_num_check(self,event,entry_p,flag): 561 | # 检查数字是否正确 562 | flag_er=0 563 | if entry_p=='seed': 564 | inputnum= self.entry_seed.get() 565 | tip='随机种子' 566 | elif entry_p=='train_pct': 567 | inputnum= self.entry_train_pct.get() 568 | tip='训练集样本比例' 569 | elif entry_p=='valid_pct': 570 | inputnum=self.entry_valid_pct.get() 571 | tip='验证集样本比例' 572 | elif entry_p=='bad_rate': 573 | inputnum=self.entry_bad_rate.get() 574 | tip='整体样本坏账率' 575 | elif entry_p=='bad_pct': 576 | inputnum=self.entry_bad_pct.get() 577 | tip='坏样本抽样比例' 578 | else: 579 | pass 580 | 581 | try : 582 | if float(inputnum)<=0: 583 | 584 | tk.messagebox.showwarning('错误','%s:输入值不能小于等于0' %tip) 585 | flag_er=flag_er+1 586 | else: 587 | if flag=='int': 588 | try: 589 | int(inputnum) 590 | except Exception as e: 591 | tk.messagebox.showwarning('错误','%s:%s'%(tip, e)) 592 | flag_er=flag_er+1 593 | elif flag=='pct': 594 | try: 595 | num=float(inputnum) 596 | if num>1: 597 | tk.messagebox.showwarning('错误','%s:输入值不能大于1' %tip) 598 | flag_er=flag_er+1 599 | else : 600 | pass 601 | except Exception as e: 602 | tk.messagebox.showwarning('错误','%s:%s'%(tip, e)) 603 | flag_er=flag_er+1 604 | else : 605 | try: 606 | num=float(inputnum) 607 | except 
Exception as e: 608 | tk.messagebox.showwarning('错误','%s:%s'%(tip, e)) 609 | flag_er=flag_er+1 610 | except Exception as e: 611 | tk.messagebox.showwarning('错误','%s:%s'%(tip, e)) 612 | flag_er=flag_er+1 613 | return flag_er 614 | -------------------------------------------------------------------------------- /start.py: -------------------------------------------------------------------------------- 1 | import tkinter as tk 2 | from tkinter import ttk 3 | from tkinter import * 4 | import pandas as pd 5 | from pandastable import Table 6 | import pickle as pickle 7 | import os 8 | from .inputdata import inputdata 9 | from .split import spliting 10 | from .sampling import sample 11 | from .interactive_grouping import IGN 12 | from .policy import PLC 13 | from .model_ui import model 14 | from .score_ui import scoreing 15 | from .load_node import import_node 16 | from tkinter import filedialog 17 | from os.path import exists 18 | import datetime 19 | class scorecard(): 20 | def __init__(self): 21 | self.row = 0 22 | self.col = 0 23 | self.project_name = None 24 | self.project_path = None 25 | 26 | self.project_seting = {} 27 | self.project_detail = pd.DataFrame(columns=['模块类型', '模块名字', '引用模块', '保存地址', '状态','创建时间']) 28 | self.root = Tk() 29 | 30 | self.Start_UI() 31 | self.root.withdraw() 32 | self.root.mainloop() 33 | def Start_UI(self): 34 | self.start_window_base = Toplevel(self.root) 35 | self.start_window_base.title('项目') 36 | self.start_window = LabelFrame(self.start_window_base, text='创建新项目') 37 | name = tk.StringVar(value='scorecard1') 38 | width = 500 39 | height = 200 40 | 41 | def selectExcelfold(): 42 | sfname = filedialog.askdirectory() 43 | self.project_path_E.insert(INSERT, sfname) 44 | 45 | screenwidth = self.start_window.winfo_screenwidth() 46 | screenheight = self.start_window.winfo_screenheight() 47 | alignstr = '%dx%d+%d+%d' % (width, height, (screenwidth - width) / 2, (screenheight - height) / 2) 48 | self.start_window_base.geometry(alignstr) 49 | 50 | L1 = Label(self.start_window, text="项目路径") 51 | L1.grid(column=0, row=0, sticky=(W)) 52 | self.project_path_E = Entry(self.start_window, width=50, bd=1) 53 | self.project_path_E.grid(column=1, row=0, sticky=(W)) 54 | button1 = ttk.Button(self.start_window, text='浏览', width=8, command=selectExcelfold) 55 | button1.grid(column=2, row=0, sticky=(W)) 56 | 57 | L2 = Label(self.start_window, text="项目名称") 58 | L2.grid(column=0, row=1, sticky=(W)) 59 | self.project_name_E = Entry(self.start_window, textvariable=name, bd=1) 60 | self.project_name_E.grid(column=1, row=1, sticky=(W)) 61 | 62 | test_button4 = ttk.Button(self.start_window, text='确定') 63 | test_button4.grid(column=1, row=2, sticky=(W)) 64 | test_button4.bind("", self.new_project) 65 | self.start_window.grid(column=0, row=0, columnspan=2, rowspan=3) 66 | 67 | def selectExcelfile(): 68 | sfname = filedialog.askopenfilename(title='选择project文件', filetypes=[('project', '*.project')]) 69 | self.project_path_Ex.insert(INSERT, sfname) 70 | 71 | self.start_window_ex = LabelFrame(self.start_window_base, text='导入现有项目') 72 | 73 | L5 = Label(self.start_window_ex, text="项目路径") 74 | L5.grid(column=0, row=4, sticky=(W)) 75 | self.project_path_Ex = Entry(self.start_window_ex, width=50, bd=1) 76 | self.project_path_Ex.grid(column=1, row=4, sticky=(W)) 77 | button1 = ttk.Button(self.start_window_ex, text='浏览', width=8, command=selectExcelfile) 78 | button1.grid(column=2, row=4, sticky=(W)) 79 | 80 | test_button5 = ttk.Button(self.start_window_ex, text='导入') 81 | test_button5.grid(column=1, 
row=5, sticky=(W)) 82 | test_button5.bind("", self.load_project) 83 | self.start_window_ex.grid() 84 | 85 | def load_project(self, event): 86 | try: 87 | project_add = self.project_path_Ex.get() 88 | fr = open(project_add, 'rb') 89 | project_info = pickle.load(fr) 90 | fr.close() 91 | self.project_detail = project_info 92 | self.project_path = project_add 93 | self.project_name = self.project_detail[self.project_detail['模块类型'] == 'project']['模块名字'][0] 94 | self.project_detail['保存地址'][self.project_detail['模块类型'] == 'project']=self.project_path 95 | self.start_window_base.destroy() 96 | self.base_UI() 97 | except Exception as e: 98 | tk.messagebox.showwarning('错误', e) 99 | 100 | def new_project(self, event): 101 | self.project_name = self.project_name_E.get() 102 | self.project_path = self.project_path_E.get()+ '/' + '%s.project' % self.project_name 103 | if exists(self.project_path)==False: 104 | self.project_seting = {'project_name': self.project_name, 'project_path': self.project_path} 105 | tt = [{'模块类型': 'project', 106 | '模块名字': self.project_name, 107 | '引用模块': [], 108 | '保存地址': self.project_path, 109 | '状态': 'Good', 110 | '创建时间':datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') }] 111 | mm = pd.DataFrame(tt) 112 | self.project_detail = self.project_detail.append(mm) 113 | 114 | try: 115 | self.save_project() 116 | except Exception as e: 117 | tk.messagebox.showwarning('错误', e) 118 | self.start_window_base.destroy() 119 | self.root.destroy() 120 | self.__init__() 121 | self.start_window_base.destroy() 122 | self.base_UI() 123 | else: 124 | tk.messagebox.showwarning('错误', '在文件夹下有同名项目') 125 | 126 | def base_UI(self): 127 | 128 | self.root.update() 129 | self.root.deiconify() 130 | width = 1000 131 | height = 600 132 | screenwidth = self.root.winfo_screenwidth() 133 | screenheight = self.root.winfo_screenheight() 134 | alignstr = '%dx%d+%d+%d' % (width, height, (screenwidth - width) / 2, (screenheight - height) / 2) 135 | self.root.geometry(alignstr) 136 | 137 | self.root.title(self.project_name) 138 | menubar = Menu(self.root) 139 | sysmenu_inputdata = Menu(menubar, tearoff=False) 140 | sysmenu_save = Menu(menubar, tearoff=False) 141 | sysmenu_data_deal = Menu(menubar, tearoff=False) 142 | sysmenu_IGN = Menu(menubar, tearoff=False) 143 | sysmenu_model = Menu(menubar, tearoff=False) 144 | sysmenu_load_node = Menu(menubar, tearoff=False) 145 | sysmenu_policy= Menu(menubar, tearoff=False) 146 | 147 | menubar.add_cascade(label='保存/刷新', menu=sysmenu_save) 148 | sysmenu_save.add_command(label='保存项目', command=lambda: self.save_project()) 149 | sysmenu_save.add_command(label='刷新', command=lambda: self.refresh_df(self.root, self.project_detail)) 150 | 151 | menubar.add_cascade(label='导入模块', menu=sysmenu_load_node) 152 | sysmenu_load_node.add_command(label='导入', command=lambda: self.func_menu('load_node')) 153 | 154 | menubar.add_cascade(label='导入数据集', menu=sysmenu_inputdata) 155 | sysmenu_inputdata.add_command(label='添加', command=lambda: self.func_menu('importdata')) 156 | 157 | menubar.add_cascade(label='规则集', menu=sysmenu_policy) 158 | sysmenu_policy.add_command(label='添加', command=lambda: self.func_menu('policy')) 159 | 160 | menubar.add_cascade(label='数据集处理', menu=sysmenu_data_deal) 161 | sysmenu_data_deal.add_command(label='分区', command=lambda: self.func_menu('split')) 162 | sysmenu_data_deal.add_command(label='抽样', command=lambda: self.func_menu('sampling')) 163 | 164 | menubar.add_cascade(label='交互分组', menu=sysmenu_IGN) 165 | sysmenu_IGN.add_command(label='单变量分组', command=lambda: 
self.func_menu('IGN')) 166 | 167 | menubar.add_cascade(label='评分卡', menu=sysmenu_model) 168 | sysmenu_model.add_command(label='训练模型', command=lambda: self.func_menu('model')) 169 | sysmenu_model.add_command(label='数据集打分', command=lambda: self.func_menu('Scoring')) 170 | 171 | self.root.grid() 172 | self.root.config(menu=menubar) 173 | self.refresh_df(self.root, self.project_detail) 174 | self.root.update() 175 | 176 | def save_project(self): 177 | 178 | filename = self.project_path 179 | fw = open(filename, 'wb') 180 | pickle.dump(self.project_detail, fw, 1) 181 | fw.close() 182 | 183 | def func_menu(self, func): 184 | try: 185 | if self.root2.state() == 'normal': 186 | tk.messagebox.showwarning('错误', "请先处理当前打开窗口") 187 | except: 188 | self.root2 = Toplevel(self.root) 189 | if func == 'importdata': 190 | self.root2.title('导入数据集') 191 | new_node = inputdata(self.root2, self.project_detail) 192 | new_node.newdatainput() 193 | tip = '导入数据集' 194 | elif func == 'split': 195 | self.root2.title('数据集分区') 196 | new_node = spliting(self.root2, self.project_detail) 197 | new_node.ui_start() 198 | tip = '数据集分区' 199 | elif func == 'sampling': 200 | self.root2.title('数据集抽样') 201 | new_node = sample(self.root2, self.project_detail) 202 | new_node.ui_start() 203 | tip = '数据集抽样' 204 | elif func == 'IGN': 205 | self.root2.title('交互式分组') 206 | new_node = IGN(self.root2, self.project_detail) 207 | new_node.Start_UI() 208 | new_node.adjustsetting() 209 | tip = '交互式分组' 210 | elif func == 'model': 211 | self.root2.title('评分卡模型') 212 | new_node = model(self.root2, self.project_detail) 213 | new_node.Start_UI() 214 | new_node.adjustsetting() 215 | tip = '评分卡模型' 216 | elif func == 'Scoring': 217 | self.root2.title('数据集打分') 218 | new_node = scoreing(self.root2, self.project_detail) 219 | new_node.Start_UI() 220 | new_node.adjustsetting() 221 | tip = '数据集打分' 222 | elif func == 'load_node': 223 | new_node = import_node(self.root2, self.project_detail) 224 | tip = '导入模块' 225 | elif func =='policy': 226 | new_node = PLC(self.root2, self.project_detail) 227 | new_node.Start_UI() 228 | new_node.adjustsetting() 229 | tip = '规则集生成' 230 | self.root.wait_window(self.root2) 231 | 232 | if new_node.save != 'Y': 233 | tk.messagebox.showwarning('错误', "%s未完成" % tip) 234 | else: 235 | try: 236 | print(new_node.save) 237 | tt = [{'模块类型': new_node.node_setting['node_type'], 238 | '模块名字': new_node.node_setting['node_name'], 239 | '引用模块': new_node.node_setting['use_node'], 240 | '保存地址': new_node.node_setting['node_save_path'], 241 | '创建时间': new_node.node_setting['time'], 242 | '状态': 'Good'}] 243 | 244 | mm = pd.DataFrame(tt) 245 | print(mm) 246 | self.project_detail = self.project_detail.append(mm) 247 | # del new_node 248 | self.refresh_df(self.root, self.project_detail) 249 | except Exception as e: 250 | tk.messagebox.showwarning('错误', "%s未完成%s" % (tip, e)) 251 | def refresh_check(self, node_save_path): 252 | 253 | p2 = ttk.Label(self.root, text='checking.... 
\n wait.....')
254 | 
255 | p2.grid(row=0, column=0)
256 | self.root.update()
257 | try:
258 | fr = open(node_save_path, 'rb')
259 | 
260 | fr.close()
261 | 
262 | p2.destroy()
263 | return 'Good'
264 | 
265 | except Exception as e:
266 | 
267 | p2.destroy()
268 | return 'error'
269 | 
270 | def refresh_df(self, mainframe, df):
271 | try:
272 | self.save_project()
273 | except Exception as e:
274 | tk.messagebox.showwarning('错误', e)
275 | df['状态'] = df.apply(lambda x: 'Good' if x['模块类型'] == 'project' else self.refresh_check(x['保存地址']), axis=1)
276 | df = df[['模块类型', '模块名字', '引用模块', '保存地址', '状态','创建时间']]
277 | f = Frame(mainframe)
278 | f.grid(column=0, row=1, rowspan=1,
279 | columnspan=5, sticky=(E, W))
280 | screen_width = f.winfo_screenwidth() * 0.8
281 | screen_height = f.winfo_screenheight() * 0.8
282 | self.table = self.ptm = Table(f, dataframe=df, height=screen_height, width=screen_width)
283 | self.ptm.show()
284 | self.table.grid()
285 | self.table.bind("<Button-3>", self.right_click_menu)  # right-click (Windows/Linux)
286 | self.table.bind("<Button-2>", self.right_click_menu)  # right-click (macOS)
287 | # self.table.bind("<Double-Button-1>", self.right_click_menu)
288 | self.table.bind("<Control-Button-1>", self.right_click_menu)  # Ctrl+click (macOS)
289 | # the event names above are assumed; the original bound several more events
290 | # whose names are not recoverable from this listing
291 | 
292 | 
293 | 
294 | def right_click_menu(self, event):
295 | rowclicked = self.ptm.get_row_clicked(event)
296 | colclicked = self.ptm.get_col_clicked(event)
297 | menu = Menu(self.root)
298 | sysmenu_inputdata = Menu(menu, tearoff=False)
299 | menu.add_command(label="设置", command=lambda: self.setting(rowclicked, colclicked))
300 | menu.add_separator()
301 | menu.add_command(label="结果", command=lambda: self.result(rowclicked, colclicked))
302 | menu.add_separator()
303 | menu.add_command(label="删除", command=lambda: self.delet(rowclicked, colclicked))
304 | 
305 | menu.post(event.x_root, event.y_root)
306 | # self.root.update()
307 | 
308 | def setting(self, rowclicked, colclicked):
309 | 
310 | # data_variable_setting.iloc[self.rowclicked,self.colclicked]
311 | node_type = self.project_detail.iloc[rowclicked]['模块类型']
312 | node_name = self.project_detail.iloc[rowclicked]['模块名字']
313 | node_save_path = self.project_detail.iloc[rowclicked]['保存地址']
314 | try:
315 | fr = open(node_save_path, 'rb')
316 | node_info = pickle.load(fr)
317 | fr.close()
318 | flag_error = 0
319 | except Exception as e:
320 | flag_error = 1
321 | tk.messagebox.showwarning('错误', e)
322 | if flag_error != 1:
323 | try:
324 | if self.root2.state() == 'normal':
325 | tk.messagebox.showwarning('错误', "请先处理当前打开窗口")
326 | except:
327 | self.root2 = Toplevel(self.root)
328 | self.root2.title(node_name)
329 | # try:
330 | if node_type == 'DATA':
331 | new_node = inputdata(self.root2, self.project_detail)
332 | new_node.load(node_info)
333 | new_node.variable_seting_ui()
334 | elif node_type == 'SPLIT':
335 | new_node = spliting(self.root2, self.project_detail)
336 | new_node.load_node(node_data=node_info, ac='setting')
337 | elif node_type == 'SAMPLE':
338 | new_node = sample(self.root2, self.project_detail)
339 | new_node.load_node(node_data=node_info, ac='setting')
340 | elif node_type == 'IGN':
341 | new_node = IGN(self.root2, self.project_detail)
342 | new_node.load_node(node_info,ac='setting')
343 | elif node_type == 'PLC':
344 | new_node = PLC(self.root2, self.project_detail)
345 | new_node.load_node(node_info,ac='setting')
346 | elif node_type == 'SCR':
347 | new_node = 
model(self.root2, self.project_detail) 348 | new_node.import_node( node_info,ac='setting') 349 | elif node_type == 'Scoring': 350 | new_node = scoreing(self.root2, self.project_detail) 351 | new_node.load_node(node_info,ac='setting') 352 | 353 | self.root.wait_window(self.root2) 354 | try: 355 | tt = [{'模块类型': new_node.node_setting['node_type'], 356 | '模块名字': new_node.node_setting['node_name'], 357 | '引用模块': new_node.node_setting['use_node'], 358 | '保存地址': new_node.node_setting['node_save_path'], 359 | '创建时间': new_node.node_setting['time'], 360 | '状态': 'Good'}] 361 | 362 | mm = pd.DataFrame(tt) 363 | print(mm) 364 | self.project_detail = self.project_detail[self.project_detail['模块名字'] != node_name] 365 | self.project_detail = self.project_detail.append(mm) 366 | self.refresh_df(self.root, self.project_detail) 367 | except: 368 | pass 369 | # except Exception as e: 370 | # tk.messagebox.showwarning('错误', e) 371 | 372 | def result(self, rowclicked, colclicked): 373 | node_type = self.project_detail.iloc[rowclicked]['模块类型'] 374 | node_name = self.project_detail.iloc[rowclicked]['模块名字'] 375 | node_save_path = self.project_detail.iloc[rowclicked]['保存地址'] 376 | try: 377 | fr = open(node_save_path, 'rb') 378 | node_info = pickle.load(fr) 379 | fr.close() 380 | flag_error = 0 381 | except Exception as e: 382 | flag_error = 1 383 | tk.messagebox.showwarning('错误', e) 384 | if flag_error != 1: 385 | try: 386 | if self.root2.state() == 'normal': 387 | tk.messagebox.showwarning('错误', "请先处理当前打开窗口") 388 | except: 389 | self.root2 = Toplevel(self.root) 390 | self.root2.title(node_name) 391 | if node_type == 'SPLIT': 392 | new_node = spliting(self.root2, self.project_detail) 393 | new_node.load_node(node_data=node_info, ac='result') 394 | elif node_type == 'DATA': 395 | new_node = inputdata(self.root2, self.project_detail) 396 | new_node.load(node_info) 397 | new_node.variable_seting_ui() 398 | elif node_type == 'SAMPLE': 399 | new_node = sample(self.root2, self.project_detail) 400 | new_node.load_node(node_data=node_info, ac='result') 401 | elif node_type == 'IGN': 402 | new_node = IGN(self.root2, self.project_detail) 403 | new_node.load_node(node_info, ac='result') 404 | new_node.result() 405 | elif node_type == 'PLC': 406 | new_node = PLC(self.root2, self.project_detail) 407 | new_node.load_node(node_info,ac='setting') 408 | 409 | elif node_type=='SCR': 410 | new_node = model(self.root2, self.project_detail) 411 | new_node.import_node(node_info,ac='result') 412 | new_node.reult_show_only(self.root2) 413 | elif node_type == 'Scoring': 414 | new_node = scoreing(self.root2, self.project_detail) 415 | new_node.load_node( node_info,ac='result') 416 | self.root.wait_window(self.root2) 417 | try: 418 | tt = [{'模块类型': new_node.node_setting['node_type'], 419 | '模块名字': new_node.node_setting['node_name'], 420 | '引用模块': new_node.node_setting['use_node'], 421 | '保存地址': new_node.node_setting['node_save_path'], 422 | '创建时间': new_node.node_setting['time'], 423 | '状态': 'Good'}] 424 | mm = pd.DataFrame(tt) 425 | print(mm) 426 | self.project_detail = self.project_detail[self.project_detail['模块名字'] != node_name] 427 | self.project_detail = self.project_detail.append(mm) 428 | self.refresh_df(self.root, self.project_detail) 429 | except: 430 | pass 431 | 432 | def delet(self, rowclicked, colclicked): 433 | 434 | node_name = self.project_detail.iloc[rowclicked]['模块名字'] 435 | node_save_path = self.project_detail.iloc[rowclicked]['保存地址'] 436 | try: 437 | os.remove(node_save_path) 438 | self.project_detail = 
self.project_detail[self.project_detail['模块名字'] != node_name] 439 | self.refresh_df(self.root, self.project_detail) 440 | except Exception as e: 441 | tk.messagebox.showwarning('错误', e) 442 | self.project_detail = self.project_detail[self.project_detail['模块名字'] != node_name] 443 | self.refresh_df(self.root, self.project_detail) 444 | 445 | 446 | # %% 447 | 448 | 449 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | #from base import group_func 4 | from funcc import binning 5 | 6 | data=pd.read_csv('D:\\SynologyDrive\\temp\\application_train.csv') 7 | target='TARGET' 8 | data['date']=np.random.choice(['2019-09','2019-10','2019-11','2019-12','2020-01','2020-02','2020-03','2020-04'], size=len(data), replace=True) 9 | specialcode_list=[] 10 | 11 | colnum=['EXT_SOURCE_1','EXT_SOURCE_2','EXT_SOURCE_3'] 12 | #data.select_dtypes(include=['float','int8','int16','int32','int64']).columns.values.tolist()[3:] 13 | colchar=['NAME_CONTRACT_TYPE','CODE_GENDER','FLAG_OWN_CAR'] 14 | #data.select_dtypes(include=['object']).columns.values.tolist() 15 | 16 | min_size=10 17 | min_lift=1.4 18 | min_bad_rate=0.2 19 | s_group_map,s_group_data=binning.fit_bin_aprior(binning,data=data,varnum=colnum,target='TARGET', s_bin_num=40) 20 | print(s_group_map) 21 | s_group_map['iv']=0 22 | s_group_map['woe']=0 23 | s_group_map['f_Bad_rate']=0 24 | s_group_map['f_N_obs']=0 25 | s_group_map['f_N_bad']=0 26 | s_group_map['f_group']=0 27 | 28 | s_group_report,f_group_report = binning.report(binning,group_info=s_group_map, 29 | varchar=colchar, 30 | varnum=colnum) 31 | merge_data=s_group_report[['variable_name','s_group','value','label']] 32 | merge_data.loc[merge_data['value'].isnull()==False,'label']=merge_data.loc[merge_data['value'].isnull()==False,'variable_name']+'_'+merge_data.loc[merge_data['value'].isnull()==False,'value'] 33 | merge_data['variable_name']='s_group_'+merge_data['variable_name'] 34 | aprior_data=s_group_data[colchar+list(merge_data['variable_name'].unique())+[target]] 35 | 36 | for var in merge_data['variable_name'].unique(): 37 | aprior_data=pd.merge(merge_data[merge_data['variable_name']==var][['s_group','label']],aprior_data,left_on='s_group',right_on=var,how='right') 38 | aprior_data['ap_%s'%var]=aprior_data['label'] 39 | aprior_data=aprior_data.drop(columns=[var,'label','s_group']) 40 | for var in colchar: 41 | aprior_data['ap_%s'%var]=var+aprior_data[var] 42 | aprior_data.loc[aprior_data['ap_%s' % var].isnull(),'ap_%s' % var]='miss' 43 | aprior_data=aprior_data.drop(columns=[var]) 44 | aprior_data[target]=s_group_data[target] 45 | 46 | 47 | from itertools import combinations 48 | print (list(combinations(aprior_data.columns, 2))) 49 | 50 | pair_list=list(aprior_data.columns) 51 | pair_list.remove('TARGET') 52 | 53 | pair_list=list(combinations(pair_list, 1)) 54 | #+list(combinations(pair_list, 2))+list(combinations(pair_list, 3)) 55 | print(pair_list) 56 | 57 | re_df=pd.DataFrame() 58 | for i in range(len(pair_list)): 59 | pair=pair_list[i] 60 | tt=aprior_data[list(pair)+[target]] 61 | re=tt.groupby(list(pair))[target].agg(['sum','mean','count']).reset_index() 62 | re.head() 63 | re['rule']=re.apply(lambda x: [x[t] for t in list(pair)],axis=1) 64 | re['variable']=re.apply(lambda x: list(pair) , axis=1) 65 | re=re[['sum','mean','count','rule','variable']] 66 | re_df=re_df.append(re) 67 | 68 | 69 | 70 | 
re_df['lift']=re_df['mean']/np.mean(aprior_data[target]) 71 | re_df=re_df.sort_values(by='lift',ascending=False) 72 | final_rule=re_df[(re_df['count']>min_size)&(re_df['mean']>min_bad_rate)&(re_df['lift']>min_lift)] 73 | final_rule=final_rule.reset_index(drop=True) 74 | final_rule=final_rule.reset_index() 75 | for i in range(len(final_rule)): 76 | rule_t=final_rule.iloc[i] 77 | merge_data=pd.DataFrame(data=np.array([rule_t['rule']+[1]]),columns=np.array(rule_t['variable']+['flag_rule_'+str(rule_t['index'])])) 78 | aprior_data=pd.merge(aprior_data,merge_data,how='left') 79 | aprior_data.loc[aprior_data['flag_rule_'+str(rule_t['index'])].isnull()==True,'flag_rule_'+str(rule_t['index'])]=0 80 | 81 | final_rule['rule_name']=final_rule['index'].apply(lambda x: 'flag_rule_'+str(x) ) 82 | 83 | plot_df=aprior_data.copy() 84 | plot_df['rm__order__']=plot_df.index 85 | 86 | 87 | 88 | tt=aprior_data.groupby(['flag_rule_'+str(x) for x in range(len(final_rule)) ])[target].agg({'count','mean'}).reset_index() 89 | 90 | tt['target']=tt.apply(lambda x: "-".join(set([int(x[t])*t for t in ['flag_rule_'+str(m) for m in range(len(final_rule))]])),axis=1) 91 | 92 | tt['label']=tt.apply(lambda x: "&".join(set([int(x['flag_rule_%s' %m])*str(final_rule['rule'][m]) for m in range(len(final_rule))])),axis=1) 93 | 94 | 95 | tt['start']=tt.apply(lambda x: set([int(x[t])*t for t in ['flag_rule_'+str(m) for m in range(len(final_rule))]]),axis=1) 96 | 97 | 98 | links_list =[] 99 | 100 | nodes_list =[] 101 | for i in range(len(tt)): 102 | for s in tt.iloc[i]['start']: 103 | if s !='': 104 | links_list.append({'source':s,'target':tt.iloc[i]['target'],'value':tt.iloc[i]['count'],'badrate':tt.iloc[i]['mean']}) 105 | nodes_list.append(s) 106 | nodes_list.append(tt.iloc[i]['target']) 107 | 108 | #nodes_list.append({'name':s}) 109 | #nodes_list.append({'name':tt.iloc[i]['target']}) 110 | nodes_list=list(set(nodes_list)) 111 | nodes_list=[{'name':x } for x in nodes_list] 112 | 113 | 114 | 115 | import plotly.graph_objects as go 116 | import plotly.io as pio 117 | pio.renderers.default='browser' 118 | 119 | def rgb_to_hex(rgb): 120 | return '#%02x%02x%02x' % rgb 121 | dd=pd.DataFrame(links_list) 122 | labels=list(set(list(dd['source'])+list(dd['target']))) 123 | dd['num']=dd['value']*dd['badrate'] 124 | fi=dd.groupby('source')['num','value'].sum().reset_index() 125 | fi['rate']=fi['num']/fi['value'] 126 | 127 | badrate_df=pd.concat([fi[['source','rate']].rename({'source':'label','rate':'rate'},axis=1), 128 | dd[['target','badrate']].rename({'target':'label','badrate':'rate'},axis=1)]) 129 | 130 | comment_df=pd.concat([final_rule[['rule_name','rule']].rename({'rule_name':'label','rule':'comment'},axis=1), 131 | tt[['target','label']].rename({'target':'label','label':'comment'},axis=1)]) 132 | 133 | 134 | 135 | bad_rate_list=[list(badrate_df.loc[badrate_df['label']==m,'rate'])[0] for m in labels] 136 | 137 | commend=[list(comment_df.loc[comment_df['label']==m,'comment'])[0] for m in labels] 138 | 139 | 140 | 141 | col_code=[rgb_to_hex((256-int(x*255), 256-int(x*255), 256-int(x*255))) for x in bad_rate_list] 142 | mergedd=pd.DataFrame(labels) 143 | mergedd['order']=mergedd.index 144 | mergedd=mergedd.rename({0:'var'},axis=1) 145 | 146 | sankey_df=pd.merge(dd,mergedd,left_on='source',right_on='var',how='left') 147 | sankey_df=sankey_df.rename({'order':'source_order'},axis=1) 148 | 149 | sankey_df=pd.merge(sankey_df,mergedd,left_on='target',right_on='var',how='left') 150 | sankey_df=sankey_df.rename({'order':'target_order'},axis=1) 
151 | 152 | fig = go.Figure(data=[go.Sankey( 153 | node = dict( 154 | pad = 15, 155 | thickness = 20, 156 | line = dict(color = "black", width = 0.5), 157 | label = [commend[m] for m in range(len(commend))], 158 | color = col_code, 159 | customdata =[[commend[m],round(bad_rate_list[m],3)] for m in range(len(commend))], 160 | hovertemplate='Node: %{label}
<br>Rules:%{customdata[0]}<br>
badrate: %{customdata[1]}', 161 | ), 162 | link = dict( 163 | source = list(sankey_df['source_order']), # indices correspond to labels, eg A1, A2, A1, B1, ... 164 | target = list(sankey_df['target_order']), 165 | value=list(sankey_df['value']), 166 | color=[rgb_to_hex((256-int(x*255), 256-int(x*255), 256-int(x*255))) for x in list(sankey_df['badrate']) ] 167 | ))]) 168 | fig.update_layout(title_text="Basic Sankey Diagram", font_size=10) 169 | fig.show() 170 | 171 | 172 | 173 | -------------------------------------------------------------------------------- /var_clus.py: -------------------------------------------------------------------------------- 1 | """ 2 | Class definitions used for VarClus 3 | """ 4 | 5 | import pandas as pd 6 | import numpy as np 7 | from sklearn.decomposition import PCA 8 | from sklearn.preprocessing import scale 9 | 10 | 11 | 12 | 13 | class Cluster: 14 | """ 15 | A tree-node type container that is capable of decomposing itself based on PCA and holds the 16 | following information 17 | - features in this cluster 18 | - first n PCA components and their corresponding eigenvalues 19 | """ 20 | 21 | def __init__(self, dataframe, n_split=2, feature_list=None, parents=None, children=None, 22 | name=None): 23 | """ 24 | 25 | :param dataframe: A pandas dataframe 26 | :param n_split: Number of sub-clusters every time a cluster is split 27 | :param feature_list: A list of feature names 28 | :param parents: A list of parents to this cluster, if any 29 | :param children: A list of children to this cluster, if any 30 | :param name: Name of the cluster 31 | """ 32 | 33 | # Using dataframe.columns will generate an index-list which is not convertible to set 34 | self.features = feature_list or list(dataframe) 35 | self.dataframe = dataframe[self.features] 36 | self.n_split = n_split 37 | self.parents = parents or [] 38 | self.children = children or [] 39 | self.name = name or '' 40 | self.pca = None 41 | self._pca_features = [] 42 | self._pca_corr = [] 43 | 44 | self.input_check() 45 | 46 | def run_pca(self): 47 | """ 48 | A wrapper around sklearn.decomposition.PCA.fit(). 49 | 50 | Additionally, it calculates the first n_split PCA components 51 | 52 | :return: 53 | """ 54 | 55 | if not self.features: 56 | print('No features to conduct PCA') 57 | return 58 | elif len(self.features) < self.n_split: 59 | print('Number of features is smaller than n_split, reducing n_split temporarily') 60 | n_split = len(self.features) 61 | else: 62 | n_split = self.n_split 63 | 64 | self.pca = PCA(n_components=n_split).fit(self.dataframe) 65 | 66 | for i in range(n_split): 67 | self._pca_features.append(self.dataframe.dot(self.pca.components_[i])) 68 | self._pca_corr.append(self.dataframe.corrwith(self._pca_features[i])) 69 | 70 | def input_check(self): 71 | """ 72 | Checks the input against below rules 73 | 1. If the features is a list 74 | 75 | :return: 76 | """ 77 | 78 | if type(self.features) is not list: 79 | print('Input argument features is not a list. Wrapping it in a list') 80 | self.features = [self.features] 81 | 82 | def return_all_leaves(self): 83 | """ 84 | Returns all terminal child leaves. If no children, returns self 85 | 86 | :return: A list of terminal child leaves if any. 
Otherwise, returns [self] 87 | """ 88 | 89 | if not self.children: 90 | return [self] 91 | 92 | child_leaves_nested = [child.return_all_leaves() for child in self.children] 93 | return [leaf for leaves in child_leaves_nested for leaf in leaves] 94 | 95 | @property 96 | def pca_eigenvalues(self): 97 | if not self.features: 98 | return [0] 99 | else: 100 | if self.pca is None: 101 | self.run_pca() 102 | 103 | return self.pca.explained_variance_ 104 | 105 | @property 106 | def pca_features(self): 107 | if not self.features: 108 | return [] 109 | else: 110 | if self.pca is None: 111 | self.run_pca() 112 | 113 | return self._pca_features 114 | 115 | @property 116 | def pca_corr(self): 117 | if not self.features: 118 | return [] 119 | else: 120 | if self.pca is None: 121 | self.run_pca() 122 | 123 | return self._pca_corr 124 | 125 | def __key(self): 126 | return (tuple(self.features), self.dataframe.shape) 127 | 128 | def __eq__(self, other): 129 | return self.__key() == other.__key() 130 | 131 | def __hash__(self): 132 | return hash(self.__key()) 133 | 134 | 135 | class VarClus(): 136 | """ 137 | A class that does oblique hierarchical decomposition of a feature space based on PCA. 138 | The general algorithm is 139 | 1. Conducts PCA on current feature space. If the max eigenvalue is smaller than threshold, 140 | stop decomposition 141 | 2. Calculates the first N PCA components and assign features to these components based on 142 | absolute correlation from high to low. These components are the initial centroids of 143 | these child clusters. 144 | 3. After initial assignment, the algorithm conducts an iterative assignment called Nearest 145 | Component Sorting (NCS). Basically, the centroid vectors are re-computed as the first 146 | components of the child clusters and the algorithm will re-assign each of the feature 147 | based on the same correlation rule. 148 | 4. After NCS, the algorithm tries to increase the total variance explained by the first 149 | PCA component of each child cluster by re-assigning features across clusters 150 | """ 151 | 152 | def __init__(self, n_split=2, max_eigenvalue=1, max_tries=3): 153 | """ 154 | 155 | :param n_split: Number of sub-clusters that every time a cluster is split into 156 | :param max_eigenvalue: Eigenvalue threshold below which the decomposition will be stopped 157 | :param max_tries: Number of max tries before the algorithm gives up 158 | """ 159 | 160 | self.n_split = n_split 161 | self.max_eigenvalue = max_eigenvalue 162 | self.max_tries = max_tries 163 | self.cluster = None 164 | 165 | @staticmethod 166 | def reassign_one_feature_pca(cluster_from, cluster_to, feature, other_clusters=None): 167 | """ 168 | Tries to re-assign a feature from a cluster to the other cluster to see if total 169 | explained variance of all clusters (represented by the first PCA component)is increased. 
170 | If increased, the re-assignment will stay
171 | 
172 | :param cluster_from: The cluster where the feature comes from
173 | :param cluster_to: The cluster where the feature will join
174 | :param feature: Feature to be tested
175 | :param other_clusters: Other clusters for calculating total explained variance
176 | :return: Original or new cluster_from and cluster_to, plus a flag indicating whether the feature was moved
177 | """
178 | 
179 | if not (feature in cluster_from.features):
180 | return cluster_from, cluster_to, False
181 | 
182 | # This shouldn't happen when calling decompose()
183 | if feature in cluster_to.features:
184 | print('feature {} is already in cluster_to'.format(feature))
185 | return cluster_from, cluster_to, False
186 | 
187 | print('assessing feature {}'.format(feature))
188 | 
189 | other_clusters = other_clusters or []
190 | cluster_from_new_df = cluster_from.dataframe.drop(feature, axis=1)
191 | cluster_to_new_df = cluster_to.dataframe.join(cluster_from.dataframe[feature])
192 | cluster_from_new = Cluster(dataframe=cluster_from_new_df,
193 | n_split=cluster_from.n_split,
194 | parents=cluster_from.parents,
195 | name=cluster_from.name)
196 | cluster_to_new = Cluster(dataframe=cluster_to_new_df,
197 | n_split=cluster_to.n_split,
198 | parents=cluster_to.parents,
199 | name=cluster_to.name)
200 | 
201 | # This shouldn't happen logically
202 | if len(cluster_from.features + cluster_to.features) != \
203 | len(cluster_from_new.features + cluster_to_new.features):
204 | missing_feature = set(cluster_from.features + cluster_to.features) - \
205 | set(cluster_from_new.features + cluster_to_new.features)
206 | print('feature missing....the missing feature is...{}'.format(missing_feature))
207 | 
208 | explained_variance_before_assignment = np.sum(
209 | [cluster.pca_eigenvalues[0] for cluster in [cluster_from, cluster_to] + other_clusters],
210 | )
211 | 
212 | explained_variance_after_assignment = np.sum(
213 | [cluster.pca_eigenvalues[0] for cluster in
214 | [cluster_from_new, cluster_to_new] + other_clusters],
215 | )
216 | 
217 | print('current EV is {0}, new EV is {1}'.format(explained_variance_before_assignment,
218 | explained_variance_after_assignment))
219 | 
220 | if explained_variance_after_assignment > explained_variance_before_assignment:
221 | return cluster_from_new, cluster_to_new, True
222 | else:
223 | return cluster_from, cluster_to, False
224 | 
225 | @staticmethod
226 | def reassign_features_pca(child_clusters, max_tries=3):
227 | """
228 | Iteratively assesses if a re-assignment of a feature is going to increase the total
229 | variance explained of the child clusters. 
The variance explained by a child cluster is 230 | the variance explained by the first PCA component 231 | 232 | :param child_clusters: A list of clusters 233 | :param max_tries: Number of max tries before the algorithm gives up 234 | :return: New or original list of clusters 235 | """ 236 | 237 | if len(child_clusters) < 2: 238 | return child_clusters 239 | 240 | n_tries = 0 241 | 242 | # Loop through all features for all cluster combinations 243 | for i in range(len(child_clusters)): 244 | 245 | if len(child_clusters[i].features) == 1: 246 | continue 247 | 248 | for feature in child_clusters[i].features: 249 | for j in range(len(child_clusters)): 250 | 251 | if i == j: 252 | continue 253 | 254 | other_clusters = \ 255 | list(set(child_clusters) - {child_clusters[i], child_clusters[j]}) 256 | 257 | child_clusters[i], child_clusters[j], change_flag = \ 258 | VarClus.reassign_one_feature_pca(child_clusters[i], 259 | child_clusters[j], 260 | feature, 261 | other_clusters) 262 | if change_flag: 263 | print('Feature {} was re-assigned'.format(feature)) 264 | print('{name_0} has {number_0} features and name_1 has {number_1} ' \ 265 | 'features'.format(name_0=child_clusters[i].name, 266 | number_0=len(child_clusters[i].features), 267 | name_1=child_clusters[j].name, 268 | number_1=len(child_clusters[j].features))) 269 | 270 | if not change_flag: 271 | n_tries += 1 272 | 273 | if max_tries and n_tries >= max_tries: 274 | print('Number of max tries has been reached. Returning current result...') 275 | return child_clusters 276 | 277 | return child_clusters 278 | 279 | @staticmethod 280 | def nearest_component_sorting_once(initial_child_clusters): 281 | """ 282 | Updates the centroids of the initial child clusters and re-assigns the features to the 283 | clusters with updated centroids based on absolute correlation 284 | 285 | :param initial_child_clusters: A list of initial child clusters 286 | :return: A new list of child clusters and boolean indicating if the clusters have been 287 | updated or not 288 | """ 289 | 290 | full_dataframe = pd.concat( 291 | [cluster.dataframe for cluster in initial_child_clusters], axis=1 292 | ) 293 | 294 | corr_table = pd.concat( 295 | [full_dataframe.corrwith(cluster.pca_features[0]) for cluster in 296 | initial_child_clusters], 297 | axis=1 298 | ) 299 | 300 | corr_sq_table = corr_table ** 2 301 | corr_max = corr_sq_table.max(axis=1) 302 | cluster_membership = corr_sq_table.apply(lambda x: x == corr_max) 303 | 304 | if (cluster_membership.sum() == 0).sum(): 305 | print('Features of this cluster are most correlated with first PCA component. Consider ' 306 | 'increasing max_eigenvalue. 
Randomly assigning features to child clusters...') 307 | 308 | i_range, j_range = cluster_membership.shape 309 | for i in range(i_range): 310 | for j in range(j_range): 311 | cluster_membership.iloc[i, j] = (i % j_range == j) 312 | 313 | new_child_clusters = [ 314 | Cluster(dataframe=full_dataframe, 315 | n_split=initial_child_clusters[0].n_split, 316 | feature_list=[feature for (feature, condition) 317 | in cluster_membership[membership].to_dict().items() 318 | if condition], 319 | parents=initial_child_clusters[0].parents, 320 | name='{0}-{1}'.format(initial_child_clusters[0].parents[0].name, str(i))) 321 | for i, membership in enumerate(cluster_membership) 322 | ] 323 | 324 | # Check if clusters are unchanged 325 | old_cluster_features = set([ 326 | tuple(cluster.features.sort() or cluster.features) for cluster in initial_child_clusters 327 | ]) 328 | 329 | new_cluster_features = set([ 330 | tuple(cluster.features.sort() or cluster.features) for cluster in new_child_clusters 331 | ]) 332 | 333 | return new_child_clusters, old_cluster_features != new_cluster_features 334 | 335 | @staticmethod 336 | def nearest_component_sorting(initial_child_clusters): 337 | """ 338 | Iteratively assigns features to the child clusters based on re-computed centroids of each 339 | child cluster 340 | 341 | :param initial_child_clusters: A list of initial child clusters 342 | :return: Updated list of child clusters 343 | """ 344 | 345 | n_tries = 0 346 | change_flag = True 347 | new_child_clusters = initial_child_clusters 348 | 349 | while change_flag: 350 | new_child_clusters, change_flag = \ 351 | VarClus.nearest_component_sorting_once(new_child_clusters) 352 | 353 | return new_child_clusters 354 | 355 | @staticmethod 356 | def one_step_decompose(cluster, max_tries=3): 357 | """ 358 | Algorithm that conducts one-time decomposition of the cluster. 359 | 360 | :param cluster: A cluster to be decomposed 361 | :param max_tries: Number of max tries during re-assigning phase before it gives up 362 | :return: A list of child clusters of this cluster after decomposition 363 | """ 364 | 365 | corr_table = pd.concat(cluster.pca_corr, axis=1) 366 | corr_sq_table = corr_table ** 2 367 | corr_max = corr_sq_table.max(axis=1) 368 | cluster_membership = corr_sq_table.apply(lambda x: x == corr_max) 369 | 370 | if (cluster_membership.sum() == 0).sum(): 371 | print('Features of this cluster are most correlated with first PCA component. Consider ' 372 | 'increasing max_eigenvalue. 
Randomly assigning features to child clusters...') 373 | 374 | i_range, j_range = cluster_membership.shape 375 | for i in range(i_range): 376 | for j in range(j_range): 377 | cluster_membership.iloc[i, j] = (i % j_range == j) 378 | 379 | child_clusters = [ 380 | Cluster(dataframe=cluster.dataframe, 381 | n_split=cluster.n_split, 382 | feature_list=[feature for (feature, condition) 383 | in cluster_membership[membership].to_dict().items() 384 | if condition], 385 | parents=[cluster], 386 | name='{0}-{1}'.format(cluster.name, str(i))) 387 | for i, membership in enumerate(cluster_membership) 388 | ] 389 | 390 | # Phase 1: nearest component sorting 391 | print('phase #1: NCS') 392 | child_clusters = \ 393 | VarClus.nearest_component_sorting(child_clusters) 394 | 395 | # Phase 2: search algorithm 396 | print('phase #2: Search') 397 | child_clusters = \ 398 | VarClus.reassign_features_pca(child_clusters, max_tries=max_tries) 399 | 400 | return child_clusters 401 | 402 | @staticmethod 403 | def _decompose(cluster, max_eigenvalue, max_tries): 404 | """ 405 | Main recursive function to decompose a feature space based on certain rules. 406 | 407 | :param cluster: An instance of Cluster class that represents a feature space 408 | :param max_eigenvalue: Eigenvalue threshold below which the decomposition will be stopped 409 | :param max_tries: Max number of tries when re-assigning features before it gives up 410 | :return: 411 | """ 412 | 413 | if len(cluster.features) >= cluster.n_split and \ 414 | len(cluster.features) > 1 and \ 415 | cluster.pca_eigenvalues[-1] >= max_eigenvalue: 416 | print('decomposing cluster {}'.format(cluster.name)) 417 | cluster.children = VarClus.one_step_decompose(cluster, max_tries=max_tries) 418 | 419 | for child_cluster in cluster.children: 420 | VarClus._decompose(child_cluster, 421 | max_eigenvalue, 422 | max_tries) 423 | 424 | def decompose(self, dataframe): 425 | """ 426 | Scales and decomposes a given dataframe in an oblique hierarchical way. 427 | 428 | :param dataframe: a pandas dataframe that contains the feature space 429 | """ 430 | scaled_dataframe = pd.DataFrame(scale(dataframe), columns=dataframe.columns) 431 | 432 | self.cluster = Cluster(scaled_dataframe, 433 | self.n_split, 434 | name='cluster-0') 435 | 436 | VarClus._decompose(self.cluster, 437 | self.max_eigenvalue, 438 | self.max_tries) 439 | 440 | return self.cluster 441 | 442 | @property 443 | def final_cluster_structure(self): 444 | """ 445 | Gets the final cluster structure after decomposition 446 | 447 | :return: 448 | """ 449 | 450 | if not self.cluster: 451 | print('Please decompose the feature space first. 
Empty ')
452 | return []
453 | 
454 | return self.cluster.return_all_leaves()
455 | 
456 | @staticmethod
457 | def _print_cluster_structure(root_cluster,model_variable, prefix='', h_space=5):
458 | """
459 | Prints the hierarchical structure below a given root cluster
460 | 
461 | :param root_cluster: An instance of Cluster class
462 | :param prefix: String to be prefixed for each print
463 | :param h_space: Horizontal spacing or indentation for each sub level
464 | :return: A printout that shows the decomposition of the feature space
465 | """
466 | global record_list
467 | 
468 | if root_cluster.children:
469 | for index, child_cluster in enumerate(root_cluster.children):
470 | # print(prefix + '|')
471 | try:
472 | record_list=record_list+prefix + '|'+'\n'
473 | except:
474 | record_list=prefix + '|'+'\n'
475 | # print(prefix + '|' + '-' * h_space + child_cluster.name)
476 | record_list=record_list+prefix + '|' + '-' * h_space + child_cluster.name+'\n'
477 | if index == len(root_cluster.children) - 1:
478 | new_prefix = prefix + ' ' * (h_space + 1)
479 | else:
480 | new_prefix = prefix + '|' + ' ' * h_space
481 | 
482 | VarClus._print_cluster_structure(child_cluster,model_variable, prefix=new_prefix, h_space=h_space)
483 | else:
484 | for feature in root_cluster.features:
485 | if feature in model_variable:
486 | # print(prefix + '|' + '*' * h_space + feature)
487 | try:
488 | record_list=record_list+prefix + '|' + '*' * h_space + feature+'\n'
489 | except:
490 | record_list = prefix + '|' + '*' * h_space + feature + '\n'
491 | else:
492 | # print(prefix + '|' + '-' * h_space + feature)
493 | try:
494 | record_list=record_list+prefix + '|' + '-' * h_space + feature+'\n'
495 | except:
496 | record_list = prefix + '|' + '-' * h_space + feature + '\n'
497 | return record_list
498 | def print_cluster_structure(self,model_variable, h_space=5):
499 | """
500 | Prints the hierarchical structure of the decomposition
501 | :param model_variable: Features to be marked with '*' in the printout
502 | :param h_space: Horizontal spacing or indentation for each sub level
503 | :return: A printout that shows the decomposition of the feature space
504 | """
505 | 
506 | # reset the module-level accumulator so repeated calls do not concatenate
507 | global record_list
508 | record_list = ''
509 | 
510 | 
511 | reu=VarClus._print_cluster_structure(root_cluster=self.cluster,model_variable=model_variable, h_space=h_space)
512 | return reu
513 | 
--------------------------------------------------------------------------------
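# A minimal usage sketch for the VarClus class above, run on synthetic data (the
# column names and parameter values here are illustrative):
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
demo_df = pd.DataFrame(rng.normal(size=(300, 6)),
                       columns=['x%d' % i for i in range(6)])

vc = VarClus(n_split=2, max_eigenvalue=1, max_tries=3)
vc.decompose(demo_df)                    # scales the data, then splits recursively
for leaf in vc.final_cluster_structure:  # terminal clusters after decomposition
    print(leaf.name, leaf.features)
print(vc.print_cluster_structure(model_variable=['x0']))  # '*' marks model variables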