├── .idea
├── encodings.xml
└── vcs.xml
├── README.md
├── code
├── KMeans.py
├── bayes.py
├── k-means.py
├── randomData.py
├── test.py
├── test1.py
├── test2.py
└── xgb.py
└── data
├── a.csv
├── customer.txt
├── customer.xls
├── data.txt
├── test.csv
├── tests.csv
├── train.csv
└── trains.csv
/.idea/encodings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # k-means
2 | 使用k-means算法实现对用户金融数据的聚类分析
3 |
--------------------------------------------------------------------------------
/code/KMeans.py:
--------------------------------------------------------------------------------
'''
Pre-process the customer financial data, cluster it with k-means
(6 clusters) after PCA dimensionality reduction, and plot the result.

Reads  ../data/tests.csv (gbk encoded).
Writes ../data/res.csv (one cluster label per account) and kmeans.png.
'''
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.filterwarnings("ignore")
import numpy as np

# The 21 feature columns plus the account id.
# BUG FIX: the original list was missing a comma after 'demog_pr', which
# silently concatenated it with 'rfm1' into the single string 'demog_prrfm1'.
col_names = [
    'cat_input1',
    'cat_input2',
    'demog_age',
    'demog_ho',
    'demog_homeval',
    'demog_inc',
    'demog_pr',
    'rfm1',
    'rfm2',
    'rfm3',
    'rfm4',
    'rfm5',
    'rfm6',
    'rfm7',
    'rfm8',
    'rfm9',
    'rfm10',
    'rfm11',
    'rfm12',
    'demog_gent',
    'demog_denm',
    'account',
]


def _to_float(series, chars=('$', ',')):
    """Strip the given literal characters from a string column, cast to float.

    regex=False treats every character literally.  With the original
    regex=True default, '$' matches the end-of-string anchor (so the dollar
    sign is never removed) and '(' is a regex syntax error.
    """
    for ch in chars:
        series = series.str.replace(ch, '', regex=False)
    return series.astype(float)


data = pd.read_csv("../data/tests.csv", encoding="gbk")

res = pd.DataFrame()

# Drop the trailing (23rd) unnamed column of the export.
data.drop(data.columns[[22]], axis=1, inplace=True)

# Currency-formatted columns: strip '$' and thousands separators.
for col in ('demog_inc', 'demog_homeval', 'rfm2', 'rfm3', 'rfm4'):
    data[col] = _to_float(data[col])
# rfm1 may additionally carry accounting-style parentheses.
data['rfm1'] = _to_float(data['rfm1'], chars=('$', '(', ')', ','))

# BUG FIX: fill rfm3 AFTER the numeric conversion.  The original filled the
# raw string column with the integer 0 first; .str.replace then mapped those
# non-string cells back to NaN, undoing the fill.
data['rfm3'] = data['rfm3'].fillna(0)

# Encode the categorical columns as numeric weights.
input1_mapping = {'X':0.6, 'Y':0.3,'Z':0.1}
input2_mapping = {'A':0.5, 'B':0.3, 'C':0.15, 'D':0.05, 'E':0.0}
demog_ho_mapping = {'是':1, '否':0}

data['cat_input1'] = data['cat_input1'].map(input1_mapping)
data['cat_input2'] = data['cat_input2'].map(input2_mapping)
data['demog_ho'] = data['demog_ho'].map(demog_ho_mapping)
data['demog_age'] = data['demog_age'].fillna(0)

# Keep the account ids for the result file, then drop the columns that are
# not used as clustering features.
res['account'] = data['account']
data = data.drop(['account', 'demog_ho', 'rfm3'], axis=1)
print(np.isnan(data).any())

# .values replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
data = data.values

# Reduce the feature space to 6 principal components before clustering.
pca = PCA(n_components=6)
X = pca.fit_transform(data)
print(X)

# k-means with 6 clusters; each account gets one cluster label.
kms = KMeans(n_clusters=6)
Y = kms.fit_predict(X)

res['class'] = Y
res.to_csv("../data/res.csv")

# Project down to 2 dimensions again, purely for visualisation.
pca = PCA(n_components=2)
new_pca = pd.DataFrame(pca.fit_transform(data))

# One marker style per cluster label 0..5.
for label, style in enumerate(('r.', 'g.', 'b.', 'y.', 'c.', 'k.')):
    d = new_pca[Y == label]
    plt.plot(d[0], d[1], style)
plt.gcf().savefig('kmeans.png')
plt.show()
--------------------------------------------------------------------------------
/code/bayes.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sbbug/k-means/2d4615b3240c7f61db564337ed73952972dc7109/code/bayes.py
--------------------------------------------------------------------------------
/code/k-means.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sbbug/k-means/2d4615b3240c7f61db564337ed73952972dc7109/code/k-means.py
--------------------------------------------------------------------------------
/code/randomData.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Sat Feb 16 15:32:29 2019
4 |
5 | @author: sun
6 | """
7 |
8 | '''
9 | 生成随机数数据
10 | '''
11 | import random
12 |
def generateOneData():
    """Build one synthetic customer record as a 21-element list.

    Layout: [b_tgt, cat_input1, cat_input2, rfm1..rfm12, demog_age,
    demog_agent, demog_ho, demog_homeval, demog_inc, demog_pr].
    The sequence of random.* calls is fixed, so a seeded run is reproducible.
    """
    record = []

    # Binary target flag.
    record.append(random.randint(0, 1))

    # Categorical levels.
    record.append(['X', 'Y', 'Z'][random.randint(0, 2)])
    record.append(['A', 'B', 'C', 'D', 'E'][random.randint(0, 4)])

    # Monetary RFM features, rounded to two decimals.
    for lo, hi in ((0, 40), (0, 40), (0, 50), (10, 50)):
        record.append(round(random.uniform(lo, hi), 2))

    # Count-style RFM features (rfm5..rfm12).
    for upper in (15, 20, 10, 20, 50, 20, 15, 150):
        record.append(random.randint(0, upper))

    # Demographics.
    record.append(random.randint(20, 50))                  # age
    record.append(['男', '女'][random.randint(0, 1)])      # gender
    record.append(['是', '否'][random.randint(0, 1)])      # home owner
    record.append(round(random.uniform(10000, 30000), 2))  # home value
    record.append(round(random.uniform(10000, 20000), 2))  # income
    record.append(round(random.random(), 2))               # probability

    return record
69 |
70 |
if __name__ =="__main__":

    # Generate 1000 synthetic records.
    items = [generateOneData() for _ in range(1000)]

    # NOTE(review): hard-coded absolute Windows path kept for compatibility;
    # consider turning it into a command-line argument.
    # 'with' guarantees the file is closed even if a write fails (the
    # original relied on an explicit close() with no error handling).
    with open(r'C:\Users\sun\Desktop\finance\data.txt', 'w') as f:
        for item in items:
            # Original output format: comma-separated values with a
            # trailing comma before the newline.
            f.write(",".join(str(v) for v in item) + ",\n")
--------------------------------------------------------------------------------
/code/test.py:
--------------------------------------------------------------------------------
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Cluster the trip data (all columns from the third one on) into 3 groups.
df = pd.read_csv('trip.csv', header=0, encoding='utf-8')
# BUG FIX: DataFrame.ix was removed in pandas 1.0; .iloc is the positional
# equivalent for an all-integer slice.
df1 = df.iloc[:, 2:]
kmeans = KMeans(n_clusters=3, random_state=10).fit(df1)
df1['jllable'] = kmeans.labels_
# Cell count per cluster (np.size applied to each group).
df_count_type = df1.groupby('jllable').apply(np.size)

## per-cluster counts
df_count_type
## cluster centres
kmeans.cluster_centers_
## labelled frame, saved locally as new_df.csv
new_df = df1[:]
new_df
new_df.to_csv('new_df.csv')

## project the clustered features down to 2 dimensions for plotting
pca = PCA(n_components=2)
new_pca = pd.DataFrame(pca.fit_transform(new_df))

## visualisation: one marker style per cluster
d = new_pca[new_df['jllable'] == 0]
plt.plot(d[0], d[1], 'r.')
d = new_pca[new_df['jllable'] == 1]
plt.plot(d[0], d[1], 'go')
d = new_pca[new_df['jllable'] == 2]
plt.plot(d[0], d[1], 'b*')
plt.gcf().savefig('kmeans.png')
plt.show()
--------------------------------------------------------------------------------
/code/test1.py:
--------------------------------------------------------------------------------
"""Quick manual check: load the 3-column customer data set and print it.

The k-means/PCA steps below are kept commented out; uncomment them to run
the 2-cluster experiment on this data.
"""
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

data = pd.read_csv("../data/customer.txt",sep=',',header=None,names = ["A","B","C"])
print(data)
# x = data.as_matrix()
#
# # number of clusters to produce
# kms = KMeans(n_clusters=2)
# y = kms.fit_predict(x)
# print(y)
# print(kms.labels_)
# label=kms.labels_
#
# pca = PCA(n_components=2)
# new_pca = pd.DataFrame(pca.fit_transform(data))
#
# d = new_pca[y == 0]
# plt.plot(d[0], d[1], 'r.')
# d = new_pca[y == 1]
# plt.plot(d[0], d[1], 'go')
#
# plt.gcf().savefig('kmeans.png')
# plt.show()
--------------------------------------------------------------------------------
/code/test2.py:
--------------------------------------------------------------------------------
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Load the generated random data (no header row) and cluster it in 2 groups.
data = pd.read_csv("../data/data.txt",sep=',',header=None,)
print(data)
# BUG FIX: DataFrame.as_matrix() was removed in pandas 1.0; .values is the
# long-standing equivalent.
x = data.values

# Number of clusters to produce.
kms = KMeans(n_clusters=2)
y = kms.fit_predict(x)
print(y)
print(kms.labels_)
# label=kms.labels_
#
# pca = PCA(n_components=2)
# new_pca = pd.DataFrame(pca.fit_transform(data))
#
# d = new_pca[y == 0]
# plt.plot(d[0], d[1], 'r.')
# d = new_pca[y == 1]
# plt.plot(d[0], d[1], 'go')
#
# plt.gcf().savefig('kmeans.png')
# plt.show()
--------------------------------------------------------------------------------
/code/xgb.py:
--------------------------------------------------------------------------------
1 | import xgboost as xgb
2 | import numpy as np
3 | import pandas as pd
4 | import warnings
5 | from sklearn.model_selection import train_test_split
6 | from sklearn import metrics
7 | from sklearn.datasets import make_hastie_10_2
8 | from sklearn.ensemble import GradientBoostingClassifier
9 | from xgboost.sklearn import XGBClassifier
10 | warnings.filterwarnings("ignore")
11 |
12 |
13 | #获取处理好的训练数据
# Load and clean the training data.
def getTrain():
    """Load ../data/trains.csv and return the cleaned (train_x, train_y).

    train_y is the 'b_tgt' target column; train_x holds the numeric
    features after currency stripping and categorical encoding.
    """
    def strip_money(series, chars=('$', ',')):
        # regex=False treats every character literally.  With the original
        # regex=True default, '$' matches the end-of-string anchor (so the
        # dollar sign is never removed) and '(' is a regex syntax error.
        for ch in chars:
            series = series.str.replace(ch, '', regex=False)
        return series.astype(float)

    data = pd.read_csv("../data/trains.csv", encoding="gbk")
    # The account id is not a feature.
    data.drop(['account'], axis=1, inplace=True)

    # rfm3: strip the currency formatting first, then fill the gaps with
    # the column mean.  (The original's second '' check was dead code once
    # the column had been cast to float.)
    data['rfm3'] = strip_money(data['rfm3'])
    data['rfm3'] = data['rfm3'].fillna(data['rfm3'].mean())

    # Encode the categorical inputs as numeric weights.
    input1_mapping = {'X': 0.6, 'Y': 0.3, 'Z': 0.1}
    input2_mapping = {'A': 0.5, 'B': 0.3, 'C': 0.15, 'D': 0.05, 'E': 0.0}
    demog_ho_mapping = {'是': 1, '否': 0}

    data['cat_input1'] = data['cat_input1'].map(input1_mapping)
    data['cat_input2'] = data['cat_input2'].map(input2_mapping)

    data['demog_ho'] = data['demog_ho'].map(demog_ho_mapping)
    data['demog_age'] = data['demog_age'].fillna(0)

    # Currency-formatted columns: strip '$' and thousands separators.
    for col in ('demog_inc', 'demog_homeval', 'rfm2', 'rfm4'):
        data[col] = strip_money(data[col])
    # rfm1 may additionally carry accounting-style parentheses.
    data['rfm1'] = strip_money(data['rfm1'], chars=('$', '(', ')', ','))

    # demog_ho and rfm3 are not used as features.
    data.drop(['demog_ho', 'rfm3'], axis=1, inplace=True)
    train_y = data['b_tgt']
    data.drop(['b_tgt'], axis=1, inplace=True)
    train_x = data

    print(train_y)
    print(train_x)
    return train_x,train_y
71 |
72 | #获取处理好的测试数据
# Load and clean the test data.
def getTest():
    """Load ../data/tests.csv and return the cleaned (test_x, res).

    res is a DataFrame holding the 'account' ids so predictions can be
    joined back to customers; test_x holds the numeric features after
    currency stripping and categorical encoding.
    """
    def strip_money(series, chars=('$', ',')):
        # regex=False treats every character literally.  With the original
        # regex=True default, '$' matches the end-of-string anchor (so the
        # dollar sign is never removed) and '(' is a regex syntax error.
        for ch in chars:
            series = series.str.replace(ch, '', regex=False)
        return series.astype(float)

    data = pd.read_csv("../data/tests.csv", encoding="gbk")
    # Drop the trailing (23rd) unnamed column of the export.
    data.drop(data.columns[[22]], axis=1, inplace=True)
    # Keep the account ids; they are not a feature.
    res = pd.DataFrame()
    res['account'] = data['account']
    data.drop(['account'], axis=1, inplace=True)

    # rfm3: strip the currency formatting first, then fill the gaps with
    # the column mean.  (The original's second '' check was dead code once
    # the column had been cast to float.)
    data['rfm3'] = strip_money(data['rfm3'])
    data['rfm3'] = data['rfm3'].fillna(data['rfm3'].mean())

    # Encode the categorical inputs as numeric weights.
    input1_mapping = {'X': 0.6, 'Y': 0.3, 'Z': 0.1}
    input2_mapping = {'A': 0.5, 'B': 0.3, 'C': 0.15, 'D': 0.05, 'E': 0.0}
    demog_ho_mapping = {'是': 1, '否': 0}

    data['cat_input1'] = data['cat_input1'].map(input1_mapping)
    data['cat_input2'] = data['cat_input2'].map(input2_mapping)

    data['demog_ho'] = data['demog_ho'].map(demog_ho_mapping)
    data['demog_age'] = data['demog_age'].fillna(0)

    # Currency-formatted columns: strip '$' and thousands separators.
    for col in ('demog_inc', 'demog_homeval', 'rfm2', 'rfm4'):
        data[col] = strip_money(data[col])
    # rfm1 may additionally carry accounting-style parentheses.
    data['rfm1'] = strip_money(data['rfm1'], chars=('$', '(', ')', ','))

    # demog_ho and rfm3 are not used as features.
    data.drop(['demog_ho', 'rfm3'], axis=1, inplace=True)

    test_x = data
    print(test_x)

    return test_x,res
134 |
if __name__ == '__main__':

    '''
    Train an XGBoost classifier on the training data, then score the test
    data and write one probability score per account to ../data/score.csv.
    '''
    train_x,train_y = getTrain()
    test_x,res = getTest()


    # Build and configure the model.
    clf = XGBClassifier(silent=0 ,# 1 silences runtime messages; 0 prints them.
                        #nthread=4,# number of CPU threads (defaults to max)
                        learning_rate= 0.3, # shrinkage / learning rate
                        min_child_weight=1,
                        # Minimum sum of instance weights (second-order
                        # gradients) needed in a leaf.  For imbalanced 0-1
                        # data with h around 0.01, a value of 1 means a leaf
                        # needs roughly 100 samples.  Strongly affects the
                        # result: the smaller it is, the easier to overfit.
                        max_depth=6, # maximum tree depth; deeper trees overfit more easily
                        gamma=0, # minimum loss reduction required for a further split; larger is more conservative (typically 0.1-0.2)
                        subsample=1, # fraction of training rows sampled per tree
                        max_delta_step=0,# maximum delta step allowed for each tree's weight estimate
                        colsample_bytree=1, # fraction of columns sampled per tree
                        reg_lambda=1, # L2 regularisation weight; larger values make the model harder to overfit
                        #reg_alpha=0, # L1 regularisation weight
                        #scale_pos_weight=1, # values > 0 help convergence on imbalanced classes by balancing positive/negative weights
                        #objective= 'multi:softmax', # learning objective, for multi-class problems
                        #num_class=10, # number of classes, used together with multi:softmax
                        n_estimators=100, # number of trees
                        seed=1000 # random seed
                        #eval_metric= 'auc'
                        )
    clf.fit(train_x, train_y)
    # Predict hard labels for the test data.
    y_pre = clf.predict(test_x)
    print(y_pre)
    # Probability of the positive class, used as the customer score.
    y_pro = clf.predict_proba(test_x)[:, 1]

    res['score'] = y_pro

    res.to_csv("../data/score.csv")
--------------------------------------------------------------------------------
/data/a.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sbbug/k-means/2d4615b3240c7f61db564337ed73952972dc7109/data/a.csv
--------------------------------------------------------------------------------
/data/customer.txt:
--------------------------------------------------------------------------------
1 | 23,317,10
2 | 22,147,13
3 | 24,172,17
4 | 27,194,67
5 | 37,789,35
6 | 25,190,1
7 | 29,281,10
8 | 27,142,12
9 | 28,186,8
10 | 23,226,1
11 | 22,287,32
12 | 32,499,3
13 | 25,181,90
14 | 26,172,1
15 | 24,190,16
16 | 27,271,31
17 | 40,382,25
18 |
19 |
--------------------------------------------------------------------------------
/data/customer.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sbbug/k-means/2d4615b3240c7f61db564337ed73952972dc7109/data/customer.xls
--------------------------------------------------------------------------------
/data/data.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sbbug/k-means/2d4615b3240c7f61db564337ed73952972dc7109/data/data.txt
--------------------------------------------------------------------------------
/data/test.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sbbug/k-means/2d4615b3240c7f61db564337ed73952972dc7109/data/test.csv
--------------------------------------------------------------------------------
/data/train.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sbbug/k-means/2d4615b3240c7f61db564337ed73952972dc7109/data/train.csv
--------------------------------------------------------------------------------