├── .gitignore
├── .idea
└── vcs.xml
├── README.md
└── scripts
├── explain.png
├── stats.py
└── woe.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Created by .ignore support plugin (hsz.mobi)
2 | .gitignore
3 | .idea/
4 |
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | data_analysis
2 | =
3 | 1.scripts/stats.py(数据分析脚本)
4 | -
5 | #### 1) 使用方法
6 | python scripts/**stats.py** 输入dataframe所在csv 待分析特征变量 输出dataframe所在csv
7 |
8 | #### 2) 使用例子
9 | python scripts/**stats.py** 'd:\age.csv' 'age' 'd:\output.csv'
10 |
11 | #### 3) 参数解释
12 | 输入dataframe
13 |
14 | | UserId | age |
15 | 0 | 27226838 | 22 |
16 | 0 | 17126415 | 34 |
17 | 0 | 47526334 | 56 |
18 | ... |
19 |
20 | 
21 | 输出dataframe
22 |
23 |
24 | age | cnt_rec | cnt_target |
25 | %target | %cnt_rec | %cnt_target |
26 | %cum_cnt_rec | %cum_cnt_target | cnt_nontarget |
27 | %cum_nontarget | %cum_target-%cum_nontarget |
28 |
29 |
30 | 18 | 110 | 9.0 |
31 | 8.18% | 0.36% | 0.53% |
32 | 0.36% | 0.53% | 101.0 |
33 | 0.35% | 0.18% |
34 |
35 | ... |
36 |
37 |
38 |
39 | 2.scripts/woe.py(计算[woe](http://www.listendata.com/2015/03/weight-of-evidence-woe-and-information.html)和[iv](http://www.listendata.com/2015/03/weight-of-evidence-woe-and-information.html)脚本)
40 | -
41 | #### 1) 使用方法
42 | python scripts/**woe.py** 输入dataframe所在csv 待分析特征变量 分段表达式(用逗号连接) y变量
43 |
44 | #### 2) 使用例子
45 | python scripts/**woe.py** "age.csv" "age" "20,30,45" "is_dft"
46 |
47 | #### 3) 参数解释
48 | 输入dataframe
49 |
50 | | UserId | age | is_dft |
51 | 0 | 27226838 | 22 | 1 |
52 | 0 | 27242238 | 21 | 0 |
53 | ... |
54 |
55 |
56 | 输出结果
57 |
58 |
59 | | class | good | bad |
60 | %good | %bad | all |
61 | woe | iv |
62 |
63 |
64 | 0 | (0,20.0] | 76 | 1519 | 4.76% |
65 | 95.24% | 1595 | -6.34584 | 0.048765 |
66 |
67 |
68 | 1 | (20,30] | 895 | 17129 | 4.97% |
69 | 95.03% |
70 | 18024 | -8.75549 | 0.561679 |
71 |
72 | 2 | (30,45] | 673 | 10021 | 6.29% |
73 | 93.71% |
74 | 10694 | 7.31007 | 0.372628 |
75 |
76 |
77 | 3 | (45...) | 75 | 869 | 7.94% |
78 | 92.06% | 944 | 2.57974 | 0.036832 |
79 |
80 |
81 | 4 | NA | 0 | 0 | nan% |
82 | nan% | 0 |
83 | NaN | 0.000000 |
84 |
85 |
86 | | |
87 | 1688 | 28819 | 5.53% | 94.47% | 30507 |
88 | | 1.019905 |
89 |
--------------------------------------------------------------------------------
/scripts/explain.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jimmycheng603/data_analysis/5c78eef1501b3c41a98d9be6102886962ad916e9/scripts/explain.png
--------------------------------------------------------------------------------
/scripts/stats.py:
--------------------------------------------------------------------------------
"""Per-value target statistics for one feature column of a CSV file.

Usage:
    python scripts/stats.py <input_csv> <feature> <output_csv> [target]

``target`` is optional and defaults to ``is_dft``. Rows whose target column
equals 1 are counted as "target" rows; every other row is a "non-target" row.
"""
from sys import argv

import pandas as pd


def feature_stats(data, feature, target='is_dft'):
    """Return per-value counts and percentage columns for ``feature``.

    Args:
        data: DataFrame holding at least the ``feature`` and ``target`` columns.
        feature: name of the column whose distinct values become the rows.
        target: name of the flag column; rows where it equals 1 are "target".

    Returns:
        A DataFrame indexed by the distinct values of ``feature`` with the
        columns cnt_rec, cnt_target, %target, %cnt_rec, %cnt_target,
        %cum_cnt_rec, %cum_cnt_target, cnt_nontarget, %cnt_nontarget,
        %cum_nontarget and %cum_target-%cum_nontarget. Percentage columns are
        strings formatted with two decimals.
    """
    as_percent = '{:.2%}'.format

    cnt_rec = data.groupby(feature).size().rename("cnt_rec")

    # Values of the feature that never occur with target == 1 must count as 0
    # targets, not as missing; otherwise every derived column turns into NaN.
    target_rows = data.loc[data[target] == 1, :]
    cnt_target = (target_rows.groupby(feature).size()
                  .reindex(cnt_rec.index, fill_value=0)
                  .rename("cnt_target"))
    cnt_nontarget = cnt_rec - cnt_target

    share_rec = cnt_rec / cnt_rec.sum()
    share_target = cnt_target / cnt_target.sum()
    share_nontarget = cnt_nontarget / cnt_nontarget.sum()
    cum_target = share_target.cumsum()
    cum_nontarget = share_nontarget.cumsum()

    # Columns are added one by one so that the output order is fixed.
    result = pd.DataFrame(cnt_rec)
    result['cnt_target'] = cnt_target
    result['%target'] = (cnt_target / cnt_rec).map(as_percent)
    result['%cnt_rec'] = share_rec.map(as_percent)
    result['%cnt_target'] = share_target.map(as_percent)
    result['%cum_cnt_rec'] = share_rec.cumsum().map(as_percent)
    result['%cum_cnt_target'] = cum_target.map(as_percent)
    result['cnt_nontarget'] = cnt_nontarget
    result['%cnt_nontarget'] = share_nontarget.map(as_percent)
    result['%cum_nontarget'] = cum_nontarget.map(as_percent)
    # The column name promises the gap between the two *cumulative* shares,
    # so subtract the cumulative series rather than the per-row shares.
    result['%cum_target-%cum_nontarget'] = (
        cum_target - cum_nontarget).map(as_percent)
    return result


if __name__ == '__main__':
    if len(argv) < 4:
        raise SystemExit(
            "usage: python stats.py <input_csv> <feature> <output_csv> [target]")
    inputfile = argv[1]
    feature = argv[2]
    outputfile = argv[3]
    # Optional fourth argument; the default keeps the previous behaviour.
    target = argv[4] if len(argv) > 4 else 'is_dft'

    data = pd.read_csv(inputfile, sep=',')
    feature_stats(data, feature, target).to_csv(outputfile, encoding='gb2312')
58 |
--------------------------------------------------------------------------------
/scripts/woe.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""Weight of evidence (WOE) and information value (IV) for a binned feature.

Usage:
    python scripts/woe.py <input_csv> <feature> <cut_points> <target>

    input_csv   CSV file holding the input dataframe
    feature     name of the feature column to analyse
    cut_points  comma-separated bin boundaries, e.g. "20,30,45"
    target      name of the y column (compared against 0 and 1)
"""

__author__ = 'chengjunjie'

from sys import argv

import math
import pandas as pd

# Order in which the columns of the result table are printed.
COLUMN_ORDER = ['class', 'good', 'bad', '%good', '%bad', 'all', 'woe', 'iv']


def split_bins(data, feature, cut_points):
    """Partition ``data`` into bins of ``feature`` delimited by ``cut_points``.

    Args:
        data: input DataFrame.
        feature: name of the column to bin.
        cut_points: non-empty sequence of boundaries (strings or numbers),
            expected in ascending order.

    Returns:
        ``(labels, frames)``: two equally long lists. For n cut points there
        are n + 2 bins: ``<= first``, one ``(lower, upper]`` bin per adjacent
        pair, ``> last`` and finally the rows where the feature is null.

    Raises:
        ValueError: if ``cut_points`` is empty or a boundary is not numeric.
    """
    if not cut_points:
        raise ValueError("at least one cut point is required")

    values = data[feature]
    labels = []
    frames = []
    for pos, raw in enumerate(cut_points):
        upper = float(raw)
        if pos == 0:
            mask = values <= upper
            labels.append("(0," + str(upper) + "]")
        else:
            lower = float(cut_points[pos - 1])
            # Half-open on the left so that a value equal to a boundary lands
            # in exactly one bin, matching the "(lower,upper]" label.
            mask = (values > lower) & (values <= upper)
            labels.append("(" + str(cut_points[pos - 1]) + "," + str(raw) + "]")
        frames.append(data.loc[mask, :])

    last = cut_points[-1]
    frames.append(data.loc[values > float(last), :])
    labels.append("(" + str(last) + "...)")

    # Null feature values fail every comparison above; keep them in a bin of
    # their own.
    frames.append(data.loc[values.isnull(), :])
    labels.append("NA")
    return labels, frames


def woe_and_iv(cnt_1, cnt_0, total_1, total_0):
    """Return ``(woe, iv)`` for one bin.

    ``woe`` is ``ln((cnt_1 / total_1) / (cnt_0 / total_0))`` and ``iv`` is the
    difference of the two shares multiplied by ``woe``. When the ratio is
    undefined (an empty class or an empty total) the result is ``(nan, 0)``.
    """
    try:
        share_1 = float(cnt_1) / float(total_1)
        share_0 = float(cnt_0) / float(total_0)
        woe = math.log(share_1 / share_0)
    except (ZeroDivisionError, ValueError):
        # ZeroDivisionError: empty total or no target == 0 rows in the bin.
        # ValueError: log(0), i.e. no target == 1 rows in the bin.
        return float('nan'), 0
    return woe, (share_1 - share_0) * woe


def woe_table(data, feature, cut_points, target):
    """Build the WOE/IV table for ``feature`` binned at ``cut_points``.

    Args:
        data: input DataFrame.
        feature: name of the column to bin.
        cut_points: non-empty sequence of bin boundaries.
        target: name of the y column.

    Returns:
        A DataFrame with one row per bin (index 0..k) plus a summary row with
        index ``" "`` holding the overall counts and the summed IV. Columns
        follow ``COLUMN_ORDER``.
    """
    labels, frames = split_bins(data, feature, cut_points)

    total_0 = len(data[data[target] == 0])
    total_1 = len(data[data[target] == 1])

    cnt_0 = [len(part[part[target] == 0]) for part in frames]
    cnt_1 = [len(part[part[target] == 1]) for part in frames]
    sizes = [len(part) for part in frames]

    scores = [woe_and_iv(ones, zeros, total_1, total_0)
              for ones, zeros in zip(cnt_1, cnt_0)]
    woes = [score[0] for score in scores]
    ivs = [score[1] for score in scores]

    # NOTE: the labels follow the original script: "good" counts rows with
    # target == 1 and "bad" counts rows with target == 0.
    raw = {
        'class': labels + [""],
        'good': cnt_1 + [total_1],
        'bad': cnt_0 + [total_0],
        'all': sizes + [len(data)],
        'woe': woes + [""],
        'iv': ivs + [sum(ivs)],
    }
    table = pd.DataFrame(raw, index=list(range(len(frames))) + [" "])

    as_percent = '{:.2%}'.format
    table['%good'] = (table['good'] / table['all']).map(as_percent)
    table['%bad'] = (table['bad'] / table['all']).map(as_percent)

    # Fix the column order for display.
    return table[COLUMN_ORDER]


if __name__ == '__main__':
    if len(argv) < 5:
        raise SystemExit(
            "usage: python woe.py <input_csv> <feature> <cut_points> <target>")
    inputfile = argv[1]
    feature = argv[2]
    sep = argv[3]
    target = argv[4]

    data = pd.read_csv(inputfile, sep=',')
    print(woe_table(data, feature, sep.split(','), target))
103 |
--------------------------------------------------------------------------------