# ==== code/KMeans.py ====
"""Preprocess the customer finance data and cluster it with k-means.

Reads ../data/tests.csv (GBK encoded), cleans the currency/categorical
columns, reduces the features to 6 principal components, clusters the
rows into 6 groups, writes one cluster label per account to
../data/res.csv and saves a 2-D PCA scatter plot to kmeans.png.
"""
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

warnings.filterwarnings("ignore")

# 22 column names.
# BUG FIX: the original list was missing the comma after 'demog_pr', so
# Python silently concatenated it with 'rfm1' into 'demog_prrfm1' and the
# list held 21 entries instead of 22.
col_names = [
    'cat_input1', 'cat_input2',
    'demog_age', 'demog_ho', 'demog_homeval', 'demog_inc', 'demog_pr',
    'rfm1', 'rfm2', 'rfm3', 'rfm4', 'rfm5', 'rfm6',
    'rfm7', 'rfm8', 'rfm9', 'rfm10', 'rfm11', 'rfm12',
    'demog_gent', 'demog_denm', 'account',
]


def _strip_currency(series):
    """Remove '$', ',', '(' and ')' from a string column and cast to float.

    regex=False is essential: '$' and '(' are regex metacharacters, so the
    regex-based replacement the original relied on was a no-op for '$'
    (end-of-string anchor) and an error for '('.
    """
    for token in ('$', ',', '(', ')'):
        series = series.str.replace(token, '', regex=False)
    return series.astype(float)


data = pd.read_csv("../data/tests.csv", encoding="gbk")
res = pd.DataFrame()

# Drop the trailing unnamed 23rd column produced by the export.
data.drop(data.columns[[22]], axis=1, inplace=True)

# rfm3: fill missing values with 0 (empty strings included).
clean_z = data['rfm3'].fillna(0)
clean_z[clean_z == ''] = 0
data['rfm3'] = clean_z

# Ordinal encodings for the categorical columns.
input1_mapping = {'X': 0.6, 'Y': 0.3, 'Z': 0.1}
input2_mapping = {'A': 0.5, 'B': 0.3, 'C': 0.15, 'D': 0.05, 'E': 0.0}
demog_ho_mapping = {'是': 1, '否': 0}  # presumably home ownership yes/no — TODO confirm

data['cat_input1'] = data['cat_input1'].map(input1_mapping)
data['cat_input2'] = data['cat_input2'].map(input2_mapping)
data['demog_ho'] = data['demog_ho'].map(demog_ho_mapping)
data['demog_age'] = data['demog_age'].where(data['demog_age'].notnull(), 0)

# Currency-formatted string columns -> float (the original repeated the
# same replace/astype chain per column; '(' / ')' only occur in rfm1 and
# stripping them elsewhere is a harmless no-op).
for col in ('demog_inc', 'demog_homeval', 'rfm1', 'rfm2', 'rfm3', 'rfm4'):
    data[col] = _strip_currency(data[col])

# Keep the account ids for the result file; they are not features.
res['account'] = data['account']
data = data.drop(['account'], axis=1)
data = data.drop(['demog_ho'], axis=1)
data = data.drop(['rfm3'], axis=1)
print(np.isnan(data).any())

# BUG FIX: DataFrame.as_matrix() was removed in pandas 1.0; .values is the
# equivalent that works on both old and new pandas.
data = data.values

pca = PCA(n_components=6)
new_pca = pd.DataFrame(pca.fit_transform(data))
X = new_pca.values
print(X)

# Cluster into 6 groups (the original comment said "two classes", but
# n_clusters has always been 6 here).
kms = KMeans(n_clusters=6)
Y = kms.fit_predict(X)  # one cluster label per row

res['class'] = Y
res.to_csv("../data/res.csv")

# Reduce to 2 dimensions purely for visualisation.
pca = PCA(n_components=2)
new_pca = pd.DataFrame(pca.fit_transform(data))

for label, style in enumerate(('r.', 'g.', 'b.', 'y.', 'c.', 'k.')):
    d = new_pca[Y == label]
    plt.plot(d[0], d[1], style)
plt.gcf().savefig('kmeans.png')
plt.show()
# ==== code/randomData.py ====
# -*- coding: utf-8 -*-
"""
Created on Sat Feb 16 15:32:29 2019

@author: sun

Generate random sample rows of the customer finance data set and write
them to data.txt as comma-separated lines.
"""
import random


def generateOneData():
    """Build and return one random record as a list of 21 fields.

    Field layout: b_tgt, cat_input1, cat_input2, rfm1..rfm12,
    demog_age, demog_agent, demog_ho, demog_homeval, demog_inc, demog_pr.
    """
    item = [random.randint(0, 1)]                          # b_tgt: binary target
    item.append(random.choice(['X', 'Y', 'Z']))            # cat_input1
    item.append(random.choice(['A', 'B', 'C', 'D', 'E']))  # cat_input2

    # rfm1..rfm4: continuous amounts rounded to 2 decimals
    item.append(round(random.uniform(0, 40), 2))
    item.append(round(random.uniform(0, 40), 2))
    item.append(round(random.uniform(0, 50), 2))
    item.append(round(random.uniform(10, 50), 2))

    # rfm5..rfm12: integer counts with the original upper bounds
    for upper in (15, 20, 10, 20, 50, 20, 15, 150):
        item.append(random.randint(0, upper))

    item.append(random.randint(20, 50))                    # demog_age
    item.append(random.choice(['男', '女']))               # demog_agent (gender)
    item.append(random.choice(['是', '否']))               # demog_ho (yes/no)
    item.append(round(random.uniform(10000, 30000), 2))    # demog_homeval
    item.append(round(random.uniform(10000, 20000), 2))    # demog_inc
    item.append(round(random.random(), 2))                 # demog_pr

    return item


if __name__ == "__main__":
    items = [generateOneData() for _ in range(1000)]

    # BUG FIX: the original appended ',' after every field, leaving a
    # trailing comma on each line; CSV readers then see a spurious empty
    # column (a NaN column that breaks downstream KMeans fitting).
    # Also use a context manager so the handle is always closed, and write
    # UTF-8 explicitly so the Chinese field values round-trip reliably.
    with open(r'C:\Users\sun\Desktop\finance\data.txt', 'w',
              encoding='utf-8') as f:
        for item in items:
            f.write(",".join(str(v) for v in item) + "\n")
# ==== code/test.py ====
"""Cluster trip.csv into 3 groups with k-means and plot a 2-D PCA view."""
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

df = pd.read_csv('trip.csv', header=0, encoding='utf-8')
# BUG FIX: DataFrame.ix was removed in pandas 1.0; iloc is the positional
# equivalent (all rows, drop the first two columns).
df1 = df.iloc[:, 2:]
kmeans = KMeans(n_clusters=3, random_state=10).fit(df1)
df1['jllable'] = kmeans.labels_
# number of rows in each cluster
df_count_type = df1.groupby('jllable').apply(np.size)
df_count_type
# cluster centres
kmeans.cluster_centers_
# persist the labelled frame as new_df.csv
new_df = df1[:]
new_df
new_df.to_csv('new_df.csv')

# Reduce the clustered features to 2 dimensions for plotting.
pca = PCA(n_components=2)
new_pca = pd.DataFrame(pca.fit_transform(new_df))

# One marker style per cluster label.
for label, style in zip(range(3), ('r.', 'go', 'b*')):
    d = new_pca[new_df['jllable'] == label]
    plt.plot(d[0], d[1], style)
plt.gcf().savefig('kmeans.png')
plt.show()


# ==== code/test1.py ====
"""Load the small customer sample (the clustering steps were left
commented out in the original experiment script and are kept that way)."""
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

data = pd.read_csv("../data/customer.txt", sep=',', header=None,
                   names=["A", "B", "C"])
print(data)
# x = data.as_matrix()
#
# kms = KMeans(n_clusters=2)   # number of clusters to fit
# y = kms.fit_predict(x)
# print(y)
# print(kms.labels_)
# label = kms.labels_
#
# pca = PCA(n_components=2)
# new_pca = pd.DataFrame(pca.fit_transform(data))
#
# d = new_pca[y == 0]
# plt.plot(d[0], d[1], 'r.')
# d = new_pca[y == 1]
# plt.plot(d[0], d[1], 'go')
#
# plt.gcf().savefig('kmeans.png')
# plt.show()
# ==== code/test2.py ====
"""Cluster the generated random data (data.txt) into 2 groups."""
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

data = pd.read_csv("../data/data.txt", sep=',', header=None,)
print(data)
# BUG FIX: as_matrix() was removed in pandas 1.0; .values is equivalent.
x = data.values

# number of clusters to fit
kms = KMeans(n_clusters=2)
y = kms.fit_predict(x)
print(y)
print(kms.labels_)
# label=kms.labels_
#
# pca = PCA(n_components=2)
# new_pca = pd.DataFrame(pca.fit_transform(data))
#
# d = new_pca[y == 0]
# plt.plot(d[0], d[1], 'r.')
# d = new_pca[y == 1]
# plt.plot(d[0], d[1], 'go')
#
# plt.gcf().savefig('kmeans.png')
# plt.show()


# ==== code/xgb.py ====
"""Train an XGBoost classifier on trains.csv and score tests.csv.

Writes one probability score per account to ../data/score.csv.
"""
import warnings

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.datasets import make_hastie_10_2
from sklearn.ensemble import GradientBoostingClassifier
from xgboost.sklearn import XGBClassifier

warnings.filterwarnings("ignore")


def _to_float(series, extra=()):
    """Strip '$', ',' (plus any *extra* tokens) from a string column; cast to float.

    regex=False is essential: '$' and '(' are regex metacharacters, so the
    regex-based replacement the original relied on was a no-op for '$'
    (end-of-string anchor) and an error for '('.
    """
    for token in ('$', ',') + tuple(extra):
        series = series.str.replace(token, '', regex=False)
    return series.astype(float)


def _preprocess(data):
    """Shared feature cleaning used by both the train and the test set.

    The original file duplicated this whole section in getTrain/getTest;
    keeping it in one place guarantees both sets are encoded identically.
    Mutates and returns *data*.
    """
    # rfm3: strip currency formatting, then fill missing values with the mean
    data['rfm3'] = _to_float(data['rfm3'])
    data['rfm3'] = data['rfm3'].fillna(data['rfm3'].mean())

    # ordinal encodings for the categorical columns
    data['cat_input1'] = data['cat_input1'].map({'X': 0.6, 'Y': 0.3, 'Z': 0.1})
    data['cat_input2'] = data['cat_input2'].map(
        {'A': 0.5, 'B': 0.3, 'C': 0.15, 'D': 0.05, 'E': 0.0})
    data['demog_ho'] = data['demog_ho'].map({'是': 1, '否': 0})
    data['demog_age'] = data['demog_age'].where(data['demog_age'].notnull(), 0)

    # currency columns -> float ('(' / ')' only ever appear in rfm1)
    data['demog_inc'] = _to_float(data['demog_inc'])
    data['demog_homeval'] = _to_float(data['demog_homeval'])
    data['rfm1'] = _to_float(data['rfm1'], extra=('(', ')'))
    data['rfm2'] = _to_float(data['rfm2'])
    data['rfm4'] = _to_float(data['rfm4'])

    # these two columns are dropped before modelling, as in the original
    data.drop(['demog_ho'], axis=1, inplace=True)
    data.drop(['rfm3'], axis=1, inplace=True)
    return data


def getTrain():
    """Read and clean the training data; return (train_x, train_y)."""
    data = pd.read_csv("../data/trains.csv", encoding="gbk")
    data.drop(['account'], axis=1, inplace=True)  # ids are not features
    data = _preprocess(data)

    train_y = data['b_tgt']
    data.drop(['b_tgt'], axis=1, inplace=True)
    train_x = data

    print(train_y)
    print(train_x)
    return train_x, train_y


def getTest():
    """Read and clean the test data; return (test_x, res).

    *res* holds the account ids so scores can be joined back later.
    """
    data = pd.read_csv("../data/tests.csv", encoding="gbk")
    data.drop(data.columns[[22]], axis=1, inplace=True)  # trailing unnamed col

    res = pd.DataFrame()
    res['account'] = data['account']
    data.drop(['account'], axis=1, inplace=True)

    test_x = _preprocess(data)
    print(test_x)
    return test_x, res


if __name__ == '__main__':

    train_x, train_y = getTrain()
    test_x, res = getTest()

    # NOTE(review): `silent` is deprecated in recent xgboost releases
    # (replaced by `verbosity`); kept for compatibility with the version
    # this script was written against.
    clf = XGBClassifier(
        silent=0,            # 0 = print messages while training
        learning_rate=0.3,
        min_child_weight=1,  # minimum sum of instance weight (hessian) per leaf
        max_depth=6,         # deeper trees overfit more easily
        gamma=0,             # minimum loss reduction required to split further
        subsample=1,         # row subsample ratio per tree
        max_delta_step=0,    # cap on each tree's weight estimate (0 = no cap)
        colsample_bytree=1,  # column subsample ratio per tree
        reg_lambda=1,        # L2 regularisation strength
        n_estimators=100,    # number of boosted trees
        seed=1000,           # random seed for reproducibility
    )
    clf.fit(train_x, train_y)

    # hard predictions (printed for inspection only)
    y_pre = clf.predict(test_x)
    print(y_pre)
    # probability of the positive class, used as the customer score
    y_pro = clf.predict_proba(test_x)[:, 1]

    res['score'] = y_pro
    res.to_csv("../data/score.csv")
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/sbbug/k-means/2d4615b3240c7f61db564337ed73952972dc7109/data/customer.xls -------------------------------------------------------------------------------- /data/data.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sbbug/k-means/2d4615b3240c7f61db564337ed73952972dc7109/data/data.txt -------------------------------------------------------------------------------- /data/test.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sbbug/k-means/2d4615b3240c7f61db564337ed73952972dc7109/data/test.csv -------------------------------------------------------------------------------- /data/train.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sbbug/k-means/2d4615b3240c7f61db564337ed73952972dc7109/data/train.csv --------------------------------------------------------------------------------