├── .gitignore ├── .idea └── vcs.xml ├── README.md └── scripts ├── explain.png ├── stats.py └── woe.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | .gitignore 3 | .idea/ 4 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | data_analysis 2 | = 3 | 1.scripts/stats.py（数据分析脚本） 4 | - 5 | ####1) 使用方法
6 | python scripts/**stats.py** 输入dataframe所在csv 待分析特征变量 输出dataframe所在csv
7 | 8 | ####2) 使用例子
9 | python scripts/**stats.py** 'd:\age.csv' 'age' 'd:\output.csv'
10 | 11 | ####3) 参数解释
12 | 输入dataframe
13 | 14 | 15 | 16 | 17 | 18 | 19 |

	UserId	age
0	27226838	22
0	17126415	34
0	47526334	56
...

20 | ![列名解释](scripts/explain.png "列名解释")
21 | 输出dataframe 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 |

age	cnt_rec	cnt_target	%target	%cnt_rec	%cnt_target	%cum_cnt_rec	%cum_cnt_target	cnt_nontarget	%cnt_nontarget	%cum_nontarget	%cum_target-%cum_nontarget
18	110	9.0	8.18%	0.36%	0.53%	0.36%	0.53%	101.0	0.35%	0.35%	0.18%
...

37 | 38 | 39 | 2.scripts/woe.py（计算[woe](http://www.listendata.com/2015/03/weight-of-evidence-woe-and-information.html)和[iv](http://www.listendata.com/2015/03/weight-of-evidence-woe-and-information.html)脚本）
40 | - 41 | ####1)使用方法
42 | python scripts/**woe.py** 输入dataframe所在csv 待分析特征变量 分段表达式（用逗号连接） y变量
43 | 44 | ####2)使用例子
45 | python scripts/**woe.py** "age.csv" "age" "20,30,45" "is_dft"
46 | 47 | ####3)参数解释
48 | 输入dataframe
49 | 50 | 51 | 52 | 53 | 54 |

	UserId	age	is_dft
0	27226838	22	1
0	27242238	21	0
...

55 | 56 | 输出结果
57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 |

	class	good	bad	%good	%bad	all	woe	iv
0	(0,20.0]	76	1519	4.76%	95.24%	1595	-6.34584	0.048765
1	(20,30]	895	17129	4.97%	95.03%	18024	-8.75549	0.561679
2	(30,45]	673	10021	6.29%	93.71%	10694	7.31007	0.372628
3	(45...)	75	869	7.94%	92.06%	944	2.57974	0.036832
4	NA	0	0	nan%	nan%	0	NaN	0.000000
		1688	28819	5.53%	94.47%	30507		1.019905

# --------------------------------------------------------------------------------
# /scripts/explain.png:
# --------------------------------------------------------------------------------
# https://raw.githubusercontent.com/jimmycheng603/data_analysis/5c78eef1501b3c41a98d9be6102886962ad916e9/scripts/explain.png
# --------------------------------------------------------------------------------
# /scripts/stats.py:
# --------------------------------------------------------------------------------
# Per-value statistics of one feature against a binary target column.
#
# Usage: python stats.py <input.csv> <feature> <output.csv>
from sys import argv

import pandas as pd


def _as_percent(ratios):
    """Format a Series of ratios as strings such as ``'8.18%'``."""
    return ratios.map('{:.2%}'.format)


def feature_stats(data, feature, target='is_dft'):
    """Build the per-value statistics table for ``feature``.

    Args:
        data: DataFrame holding at least the ``feature`` and ``target``
            columns.
        feature: name of the column whose distinct values become the rows.
        target: name of the binary column; rows where it equals 1 are
            counted as "target", all other rows as "non-target".

    Returns:
        DataFrame indexed by the distinct values of ``feature`` with the
        columns cnt_rec, cnt_target, %target, %cnt_rec, %cnt_target,
        %cum_cnt_rec, %cum_cnt_target, cnt_nontarget, %cnt_nontarget,
        %cum_nontarget and %cum_target-%cum_nontarget. Percentage columns
        are strings formatted with two decimals.
    """
    cnt_rec = data.groupby(feature).size().rename("cnt_rec")
    cnt_target = data.loc[data[target] == 1, :].groupby(feature).size()

    df_result = pd.DataFrame(cnt_rec)
    # A feature value without any target row must count as 0, not NaN;
    # otherwise cnt_nontarget becomes NaN for that value and its records
    # silently disappear from the non-target totals.
    df_result['cnt_target'] = cnt_target.reindex(df_result.index).fillna(0)

    # cnt_target / cnt_rec
    df_result['%target'] = _as_percent(
        df_result['cnt_target'] / df_result['cnt_rec'])

    # cnt_rec / sum(cnt_rec)
    share_rec = df_result['cnt_rec'] / df_result['cnt_rec'].sum()
    df_result['%cnt_rec'] = _as_percent(share_rec)

    # cnt_target / sum(cnt_target)
    share_target = df_result['cnt_target'] / df_result['cnt_target'].sum()
    df_result['%cnt_target'] = _as_percent(share_target)

    cum_target = share_target.cumsum()
    df_result['%cum_cnt_rec'] = _as_percent(share_rec.cumsum())
    df_result['%cum_cnt_target'] = _as_percent(cum_target)

    df_result['cnt_nontarget'] = df_result['cnt_rec'] - df_result['cnt_target']
    share_nontarget = (df_result['cnt_nontarget']
                       / df_result['cnt_nontarget'].sum())
    cum_nontarget = share_nontarget.cumsum()
    df_result['%cnt_nontarget'] = _as_percent(share_nontarget)
    df_result['%cum_nontarget'] = _as_percent(cum_nontarget)

    # The column is named after the cumulative shares, so subtract the
    # cumulative series rather than the per-row shares.
    df_result['%cum_target-%cum_nontarget'] = _as_percent(
        cum_target - cum_nontarget)
    return df_result


if __name__ == '__main__':
    if len(argv) < 4:
        raise SystemExit(
            "usage: python stats.py <input.csv> <feature> <output.csv>")
    inputfile = argv[1]
    feature = argv[2]
    outputfile = argv[3]
    data = pd.read_csv(inputfile, sep=',')
    feature_stats(data, feature).to_csv(outputfile, encoding='gb2312')

# --------------------------------------------------------------------------------
# /scripts/woe.py:
# --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# Weight of evidence (WOE) and information value (IV) of a binned feature.
#
# Usage: python woe.py <input.csv> <feature> <cut1,cut2,...> <target>

__author__ = 'chengjunjie'

from sys import argv

import math
import pandas as pd


def woe_table(data, feature, sep_value, target):
    """Compute the WOE / IV table of ``feature`` binned at ``sep_value``.

    The bins are ``<= c1``, ``(c1, c2]``, ..., ``> cn`` plus one bin for
    missing values, so every row of ``data`` falls into exactly one bin.

    Args:
        data: DataFrame holding the ``feature`` and ``target`` columns.
        feature: name of the numeric column to bin.
        sep_value: non-empty sequence of cut points (strings or numbers);
            presumably given in ascending order -- the order is not checked.
        target: name of the binary column; rows with value 1 are reported
            in the ``good`` column and rows with value 0 in ``bad``.

    Returns:
        DataFrame with one row per bin (index 0..k) and a final totals row
        (index ``" "``); columns are class, good, bad, %good, %bad, all,
        woe and iv. ``woe`` is NaN and ``iv`` is 0 for a bin whose ratio
        cannot be computed (empty bin or empty class).

    Raises:
        ValueError: if a cut point cannot be converted to float.
    """
    cuts = [float(item) for item in sep_value]
    values = data[feature]

    # NOTE: the first label starts at 0 although the bin holds every
    # value <= the first cut point, including negative ones.
    masks = [values <= cuts[0]]
    labels = ["(0," + str(cuts[0]) + "]"]
    for index in range(1, len(cuts)):
        # Half-open on the left so the bins match their "(a,b]" labels and
        # a boundary value is counted once, never twice or not at all.
        masks.append((values > cuts[index - 1]) & (values <= cuts[index]))
        labels.append(
            "(" + str(sep_value[index - 1]) + "," + str(sep_value[index]) + "]")
    masks.append(values > cuts[-1])
    labels.append("(" + str(sep_value[-1]) + "...)")
    masks.append(values.isnull())
    labels.append("NA")

    is_good = data[target] == 1
    is_bad = data[target] == 0
    total_good = int(is_good.sum())
    total_bad = int(is_bad.sum())

    good, bad, size, woe, iv = [], [], [], [], []
    for mask in masks:
        n_good = int((mask & is_good).sum())
        n_bad = int((mask & is_bad).sum())
        try:
            dist_good = float(n_good) / float(total_good)
            dist_bad = float(n_bad) / float(total_bad)
            # Natural log of the ratio; math.log(math.e, x) would be 1/ln(x).
            bin_woe = math.log(dist_good / dist_bad)
            bin_iv = (dist_good - dist_bad) * bin_woe
        except (ZeroDivisionError, ValueError):
            # Empty bin or empty class: no defined WOE, contributes no IV.
            bin_woe = float('nan')
            bin_iv = 0.0
        good.append(n_good)
        bad.append(n_bad)
        size.append(int(mask.sum()))
        woe.append(bin_woe)
        iv.append(bin_iv)

    # Bin rows followed by the totals row, which is keyed by a blank label.
    df = pd.DataFrame(
        {
            "class": labels + [""],
            "good": good + [total_good],
            "bad": bad + [total_bad],
            "all": size + [len(data)],
            "woe": woe + [""],
            "iv": iv + [sum(iv)],
        },
        index=list(range(len(masks))) + [" "],
    )
    df["%good"] = (df['good'] / df['all']).map('{:.2%}'.format)
    df["%bad"] = (df['bad'] / df['all']).map('{:.2%}'.format)

    # Fix the column order of the report.
    return df[['class', 'good', 'bad', '%good', '%bad', 'all', 'woe', 'iv']]


if __name__ == '__main__':
    # inputfile: CSV file holding the input dataframe
    # feature:   feature column to analyse
    # sep:       comma-separated cut points
    # target:    y variable
    if len(argv) < 5:
        raise SystemExit(
            "usage: python woe.py <input.csv> <feature> <cut1,cut2,...> <target>")
    inputfile = argv[1]
    feature = argv[2]
    sep = argv[3]
    target = argv[4]

    data = pd.read_csv(inputfile, sep=',')
    print(woe_table(data, feature, sep.split(','), target))
# --------------------------------------------------------------------------------