├── .gitignore ├── .idea └── vcs.xml ├── README.md └── scripts ├── explain.png ├── stats.py └── woe.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | .gitignore 3 | .idea/ 4 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | data_analysis 2 | = 3 | 1.scripts/stats.py(数据分析脚本) 4 | - 5 | ####1) 使用方法
6 | python scripts/**stats.py**   输入dataframe所在csv   待分析特征变量   输出dataframe所在csv
7 | 8 | ####2) 使用例子
9 | python scripts/**stats.py**   'd:\age.csv'   'age'   'd:\output.csv'
10 | 11 | ####3) 参数解释
12 | 输入dataframe
13 | 14 | 15 | 16 | 17 | 18 | 19 |
| UserId | age |
|---|---|
| 027226838 | 22 |
| 017126415 | 34 |
| 047526334 | 56 |
| ... | ... |
20 | ![列名解释](scripts/explain.png "列名解释")
21 | 输出dataframe 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 |
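| age | cnt_rec | cnt_target | %target | %cnt_rec | %cnt_target | %cum_cnt_rec | %cum_cnt_target | cnt_nontarget | %cum_nontarget | %cum_target-%cum_nontarget |
|---|---|---|---|---|---|---|---|---|---|---|
| 18 | 110 | 9.0 | 8.18% | 0.36% | 0.53% | 0.36% | 0.53% | 101.0 | 0.35% | 0.18% |
| ... | | | | | | | | | | |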
37 | 38 | 39 | 2.scripts/woe.py(计算[woe](http://www.listendata.com/2015/03/weight-of-evidence-woe-and-information.html)和[iv](http://www.listendata.com/2015/03/weight-of-evidence-woe-and-information.html)脚本)
40 | - 41 | ####1)使用方法
42 | python scripts/**woe.py**   输入dataframe所在csv   待分析特征变量   分段表达式(用逗号连接) y变量
43 | 44 | ####2)使用例子
45 | python scripts/**woe.py**   "age.csv" "age" "20,30,45" "is_dft"
46 | 47 | ####3)参数解释
48 | 输入dataframe
49 | 50 | 51 | 52 | 53 | 54 |
| UserId | age | is_dft |
|---|---|---|
| 027226838 | 22 | 1 |
| 027242238 | 21 | 0 |
| ... | ... | ... |
55 | 56 | 输出结果
57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 |
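|   | class | good | bad | %good | %bad | all | woe | iv |
|---|---|---|---|---|---|---|---|---|
| 0 | (0,20.0] | 76 | 1519 | 4.76% | 95.24% | 1595 | -6.34584 | 0.048765 |
| 1 | (20,30] | 895 | 17129 | 4.97% | 95.03% | 18024 | -8.75549 | 0.561679 |
| 2 | (30,45] | 673 | 10021 | 6.29% | 93.71% | 10694 | 7.31007 | 0.372628 |
| 3 | (45...) | 75 | 869 | 7.94% | 92.06% | 944 | 2.57974 | 0.036832 |
| 4 | NA | 0 | 0 | nan% | nan% | 0 | NaN | 0.000000 |
|   |   | 1688 | 28819 | 5.53% | 94.47% | 30507 |   | 1.019905 |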
# --------------------------------------------------------------------------------
# /scripts/explain.png:
# --------------------------------------------------------------------------------
# https://raw.githubusercontent.com/jimmycheng603/data_analysis/5c78eef1501b3c41a98d9be6102886962ad916e9/scripts/explain.png
# --------------------------------------------------------------------------------
# /scripts/stats.py:
# --------------------------------------------------------------------------------
from sys import argv

import pandas as pd


def _percent(series):
    """Format a numeric Series as strings such as ``'12.34%'``."""
    return series.map('{:.2%}'.format)


def feature_stats(data, feature, target='is_dft'):
    """Build the per-value distribution table for one feature.

    Args:
        data: DataFrame holding at least the ``feature`` and ``target``
            columns.
        feature: name of the column to group by.
        target: name of the label column; rows where it equals 1 are counted
            as "target", all remaining rows of a group as "non-target".

    Returns:
        A DataFrame indexed by the distinct values of ``feature`` with the
        columns ``cnt_rec``, ``cnt_target``, ``%target``, ``%cnt_rec``,
        ``%cnt_target``, ``%cum_cnt_rec``, ``%cum_cnt_target``,
        ``cnt_nontarget``, ``%cnt_nontarget``, ``%cum_nontarget`` and
        ``%cum_target-%cum_nontarget``.  Percentage columns are strings.
    """
    cnt_rec = data.groupby(feature).size().rename('cnt_rec')

    # Feature values that never hit the target are absent from the filtered
    # group-by.  Re-index onto every feature value with 0 so that the counts
    # stay integers and no NaN leaks into the derived columns.
    cnt_target = (data.loc[data[target] == 1]
                  .groupby(feature).size()
                  .reindex(cnt_rec.index, fill_value=0)
                  .rename('cnt_target'))

    result = pd.concat([cnt_rec, cnt_target], axis=1)

    share_rec = result['cnt_rec'] / result['cnt_rec'].sum()
    share_target = result['cnt_target'] / result['cnt_target'].sum()

    result['%target'] = _percent(result['cnt_target'] / result['cnt_rec'])
    result['%cnt_rec'] = _percent(share_rec)
    result['%cnt_target'] = _percent(share_target)
    result['%cum_cnt_rec'] = _percent(share_rec.cumsum())
    result['%cum_cnt_target'] = _percent(share_target.cumsum())

    result['cnt_nontarget'] = result['cnt_rec'] - result['cnt_target']
    share_nontarget = (result['cnt_nontarget']
                       / result['cnt_nontarget'].sum())
    result['%cnt_nontarget'] = _percent(share_nontarget)
    result['%cum_nontarget'] = _percent(share_nontarget.cumsum())

    # The column is the gap between the two *cumulative* distributions, so
    # both operands must be cumulative sums, not the per-value shares.
    result['%cum_target-%cum_nontarget'] = _percent(
        share_target.cumsum() - share_nontarget.cumsum())
    return result


if __name__ == '__main__':
    # Usage: stats.py <input csv> <feature> <output csv> [target column]
    if len(argv) < 4:
        raise SystemExit(
            'usage: stats.py <input csv> <feature> <output csv> [target]')
    inputfile = argv[1]
    feature = argv[2]
    outputfile = argv[3]
    # The label column used to be hard-coded; it is now an optional 4th
    # argument with the previous value as its default.
    target = argv[4] if len(argv) > 4 else 'is_dft'

    data = pd.read_csv(inputfile, sep=',')
    feature_stats(data, feature, target).to_csv(outputfile, encoding='gb2312')

# --------------------------------------------------------------------------------
# /scripts/woe.py:
# --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

__author__ = 'chengjunjie'

from sys import argv

import math
import pandas as pd


def compute_woe(data, feature, sep, target):
    """Compute the WOE / IV table of one feature for the given cut points.

    Args:
        data: DataFrame holding at least the ``feature`` and ``target``
            columns.
        feature: name of the numeric column to bin.
        sep: comma separated, ascending cut points, e.g. ``"20,30,45"``.
        target: name of the label column.  Rows where it equals 1 are
            reported in the ``good`` column, rows where it equals 0 in the
            ``bad`` column.

    Returns:
        A DataFrame with the columns ``class``, ``good``, ``bad``, ``%good``,
        ``%bad``, ``all``, ``woe`` and ``iv``.  There is one row per bin, one
        row labelled ``NA`` for missing feature values, and a final totals
        row whose index is ``" "``.

    Raises:
        ValueError: if a cut point in ``sep`` is not a number.
    """
    raw_cuts = sep.split(',')
    cuts = [float(value) for value in raw_cuts]
    values = data[feature]

    # The first bin has no lower bound even though its label starts at 0.
    masks = [values <= cuts[0]]
    labels = ["(0," + str(cuts[0]) + "]"]
    # Middle bins are half-open on the left, exactly as their labels say, so
    # a value equal to a cut point lands in one and only one bin.
    for low_raw, high_raw, low, high in zip(raw_cuts, raw_cuts[1:],
                                            cuts, cuts[1:]):
        masks.append((values > low) & (values <= high))
        labels.append("(" + low_raw + "," + high_raw + "]")
    masks.append(values > cuts[-1])
    labels.append("(" + raw_cuts[-1] + "...)")
    masks.append(values.isnull())
    labels.append("NA")

    is_good = data[target] == 1
    is_bad = data[target] == 0
    total_good = int(is_good.sum())
    total_bad = int(is_bad.sum())

    good = [int((mask & is_good).sum()) for mask in masks]
    bad = [int((mask & is_bad).sum()) for mask in masks]
    size = [int(mask.sum()) for mask in masks]

    woe = []
    iv = []
    for n_good, n_bad in zip(good, bad):
        try:
            good_rate = float(n_good) / total_good
            bad_rate = float(n_bad) / total_bad
            # Natural logarithm of the ratio: math.log(x) takes the value
            # first and the optional base second.
            bin_woe = math.log(good_rate / bad_rate)
        except (ZeroDivisionError, ValueError):
            # An empty side (zero count) has no defined WOE; such a bin
            # contributes nothing to the information value.
            woe.append(float('nan'))
            iv.append(0.0)
        else:
            woe.append(bin_woe)
            iv.append((good_rate - bad_rate) * bin_woe)

    # The totals row is appended with the blank label/index " ".
    table = pd.DataFrame(
        {
            'class': labels + [""],
            'good': good + [total_good],
            'bad': bad + [total_bad],
            'all': size + [len(data)],
            'woe': woe + [""],
            'iv': iv + [sum(iv)],
        },
        index=list(range(len(masks))) + [" "],
    )
    table['%good'] = (table['good'] / table['all']).map('{:.2%}'.format)
    table['%bad'] = (table['bad'] / table['all']).map('{:.2%}'.format)

    # Fix the column order of the report.
    return table[['class', 'good', 'bad', '%good', '%bad', 'all', 'woe', 'iv']]


if __name__ == '__main__':
    # Usage: woe.py <input csv> <feature> <cut points> <target column>
    #   input csv   file holding the DataFrame
    #   feature     feature column to analyse
    #   cut points  comma separated bin boundaries
    #   target      y variable
    if len(argv) < 5:
        raise SystemExit(
            'usage: woe.py <input csv> <feature> <cut points> <target>')
    data = pd.read_csv(argv[1], sep=',')
    print(compute_woe(data, argv[2], argv[3], argv[4]))