├── .DS_Store ├── .gitattributes ├── K-means.py ├── README.md └── img-storage ├── .DS_Store ├── C_score.png ├── FM_score.png ├── R_score.png ├── dis.png ├── k=2^3.png └── w^3.png /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HarveyLau/RFM-Clustering/b7fd1fda5ec7325df92cbb732bb359ebb2cb25fd/.DS_Store -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /K-means.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 使用K-Means算法聚类消费行为特征数据 Python 3 | 4 | import pandas as pd 5 | from sklearn.cluster import KMeans 6 | import time 7 | import multiprocessing 8 | 9 | # 参数初始化 10 | inputfile = '/Users/liuzhanheng/Desktop/030.xlsx' # 销量及其他属性数据 11 | outputfile = '/Users/liuzhanheng/Desktop/030_output.csv' # 保存结果的文件名 12 | iteration = 2000000000 # 聚类最大循环次数 13 | W = [0.221, 0.341, 0.439] 14 | H = [u'重要价值会员',u'潜力会员',u'重要深耕会员',u'新会员',u'重要唤回会员', u'一般维持会员', u'重要挽留会员',u'流失会员'] 15 | def comput(max,a,b,c): 16 | return (max-a)*W[0] + b*W[1] + c*W[2] 17 | def comput_score(max,min,a,flag): 18 | if flag: 19 | return (max - a) / (max - min) 20 | else: 21 | return (a - min) / (max - min) 22 | 23 | def read_data(input_file_): 24 | data = pd.read_excel(input_file_, index_col='ID') # 读取数据 25 | return data 26 | 27 | def clear_data(data_): 28 | data_r = 1.0 * (data_['R'].max() - data_['R']) / (data_['R'].max() - data_['R'].min()) 29 | data_f = 1.0 * (data_['F'] - data_['F'].min()) / (data_['F'].max() - data_['F'].min()) 30 | data_m = 1.0 * (data_['M'] - data_['M'].min()) / (data_['M'].max() - data_['M'].min()) 31 | data_zs = pd.concat([data_r, data_f, data_m], axis=1) 32 | r_max = data_['R'].max() 33 | f_max = data_['F'].max() 34 | m_max = data_['M'].max() 35 | r_mean = data_['R'][100:-100].mean() #剔除最高,前100 降低平均值 36 | f_mean = data_['F'][100:-1].mean() 37 | m_mean = data_['M'][100:-350].mean() #剔除最高,前350 降低平均值 38 | r_min = data_['R'].min() 39 | f_min = data_['F'].min() 40 | m_min = data_['M'].min() 41 | return data_zs, r_max,f_max,m_max, r_mean, f_mean, m_mean,r_min,f_min,m_min 42 | 43 | def k_means(data_zs): 44 | model = KMeans(n_clusters=8, n_jobs=8, max_iter=iteration) # 分为k类,并发数8 45 | model.fit(data_zs) # 开始聚类 46 | return model 47 | 48 | def clear_k_means(model): 49 | r1 = pd.Series(model.labels_).value_counts() # 统计各个类别的数目 50 | r2 = pd.DataFrame(model.cluster_centers_) # 找出聚类中心 51 | r3 = pd.Series(r2[0] * W[0] + r2[1] * W[1] + r2[2] * W[2]) 52 | r = pd.concat([r2, r1, r3], axis=1) # axis=1 横向连接(0是纵向),得到聚类中心对应的类别下的数目 53 | r.columns = [u'R_质心'] + [u'F_质心'] + [u'M_质心'] + [u'类别数目'] + [u'分数'] # 重命名表头 54 | r = r.sort_values(by=[u'分数']) 55 | s = pd.Series(pd.DataFrame(r).index) 56 | r4 = pd.Series([8, 7, 6, 5, 4, 3, 2, 1]) 57 | s1 = pd.concat([s,r4],axis=1) 58 | s1.columns = [u'聚类类别'] + [u'排名'] 59 | # rs = pd.concat([r, r4],axis=1) 60 | # rs.columns = [u'R_质心'] + [u'F_质心'] + [u'M_质心'] + [u'类别数目'] + [u'分数'] + [u'排名'] 61 | return r,s1 62 | 63 | def init_clear_data(data,model,s): 64 | rs = pd.concat([data, pd.Series(model.labels_, index=data.index)], axis=1) # 详细输出每个样本对应的类别 65 | rs.columns = list(data.columns) + [u'聚类类别'] # 重命名表头 66 | #rs.merge(s, on = u'聚类类别',how = 'left') 67 | #rs[u'排名'] = rs[u'聚类类别'].map(s[u'排名']) 68 | rs = pd.merge(rs,s,how='left') 69 | #rs = rs.drop(u'聚类类别',1) 70 | #rs.columns = list(data.columns) + [u'聚类类别1'] 71 | #loan_inner = pd.merge(loanstats, member_grade, how='inner') 72 | # rs.columns = list(data.columns) + [u'排名'] 73 | print(s) 74 | print(rs.tail) 75 | rs[u'会员分类'] = None 76 | rs[u'会员价值分数'] = None 77 | rs[u'波动'] = None 78 | rs.sort_values(by='R') 79 | return rs 80 | 81 | # 详细输出原始数据及其类别 82 | def clear_s_data(data, model, r_max, f_max, m_max, r_mean, f_mean, m_mean, r_min, f_min, m_min,r,s): 83 | rs = init_clear_data(data=data,model=model,s=s) 84 | for index in rs.index: 85 | rs.loc[index, u'会员价值分数'] = comput(r_max,rs.loc[index]['R'],rs.loc[index]['F'],rs.loc[index]['M']) 86 | for i in range(8): 87 | if (rs.loc[index, u'聚类类别'] == r.index[i]): 88 | #rs.loc[index,u'个体聚类分数'] = r.loc[r.index[i],u'分数'] 89 | r_score = comput_score(r_max, r_min, rs.loc[index]['R'], 1) 90 | f_score = comput_score(f_max, f_min, rs.loc[index]['F'], 0) 91 | m_score = comput_score(m_max, m_min, rs.loc[index]['M'], 0) 92 | score = comput(0,-r_score,f_score,m_score) 93 | if (score > r.loc[r.index[i],u'分数']): 94 | rs.loc[index, u'波动'] = 1 95 | else: 96 | rs.loc[index, u'波动'] = 2 97 | 98 | if rs.loc[index]['R'] < r_mean: 99 | if rs.loc[index]['F'] > f_mean: 100 | if rs.loc[index]['M'] > m_mean: 101 | rs.loc[index, u'会员分类'] = H[0] 102 | else: 103 | rs.loc[index, u'会员分类'] = H[1] 104 | else: 105 | if rs.loc[index]['M'] > m_mean: 106 | rs.loc[index, u'会员分类'] = H[2] 107 | else: 108 | rs.loc[index, u'会员分类'] = H[3] 109 | else: 110 | if rs.loc[index]['F'] > f_mean: 111 | if rs.loc[index]['M'] > m_mean: 112 | rs.loc[index, u'会员分类'] = H[4] 113 | else: 114 | rs.loc[index, u'会员分类'] = H[5] 115 | else: 116 | if rs.loc[index]['M'] > m_mean: 117 | rs.loc[index, u'会员分类'] = H[6] 118 | else: 119 | rs.loc[index, u'会员分类'] = H[7] 120 | return rs 121 | 122 | def write_csv(rs_): 123 | rs_.to_csv(outputfile, encoding='utf-8') # 保存结果 124 | 125 | def main(): 126 | data = read_data(input_file_=inputfile) 127 | data_zs, r_max, f_max, m_max, r_mean, f_mean, m_mean, r_min, f_min, m_min = clear_data(data_=data) 128 | model = k_means(data_zs=data_zs) 129 | r,s = clear_k_means(model=model) 130 | print(r) 131 | print(r_mean,f_mean,m_mean) 132 | rs = clear_s_data(data=data, model=model, r_max=r_max,f_max=f_max,m_max=m_max, r_mean=r_mean, f_mean=f_mean, m_mean=m_mean,r_min=r_min,f_min=f_min,m_min=m_min,r=r,s=s) 133 | write_csv(rs_=rs) 134 | print(rs) 135 | if __name__ == '__main__': 136 | start = time.clock() 137 | main() 138 | end = time.clock() 139 | print('time',end-start) 140 | 141 | 142 | 143 | 144 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # RFM-Clustering 2 | * 利用RFM模型建模,并通过聚类分析、分类,分别算出8中不同的价值会员 3 | 4 | 5 | ## RFM模型构建会员价值标签 6 | 7 | * R:最近一次消费(Recency) 8 | * F:消费频率(Frequency) 9 | * M:消费金额(Monetary) 10 | 11 | ## RFM的意义 12 | * 在CRM中,经常会用到RFM模型分析去衡量以为会员的价值,和给企业带来的利润能力。这个模型是通过会员最近一次购买的时间段间隔、购买总金额,购买频率这三个因素来描述这会会员的价值状况。 13 | 14 | ## 基于RFM的零售行业会员聚类分析 15 | 16 | ### 因子: 17 | * R:会员到门店消费的到目前为止的时间段,当R的值越大说明,会员上一次到门店的时间越大,则R越来大,与公司的价值是成负相关的。 18 | * F:会员的消费频率,次数越多,利益越大,与公司的价值是成正相关的。 19 | * M:会员消费的总金额,金额越大,利益越大,与公司的价值是成正相关的。 20 | 21 | ### 权重: 22 | * MBA百科库中:研究邀请了被研究的某电信企业的两位地区经理、两位市场营销人员和一位长期客户应用文献的标度含义对RFM各指标权重进行比较分析。在分别得到五位评价者的两两比较矩阵后,采取取平均的方法得到下表的评价矩阵。 23 | 24 | 25 | ### 专家评分矩阵表 26 | R F M 27 | R 1 0.71 0.46 28 | F 1.41 1 0.85 29 | M 2.18 1.18 1 30 | 31 | * 上表所示的两两比较矩阵的一致性比例C。 R < 0.1, 表明该判断矩阵的一致性可以接受。由上表得出RFM各指标相对权重为 32 | 33 | ![w](https://github.com/HarveyLau/RFM-Clustering/blob/master/img-storage/w%5E3.png) 34 | 35 | * 其中M的权重最大,即专家们认为客户交费金额的高低是影响顾客价值高低的最主要因素。 36 | 37 | ### 分类 38 | * 目标:使用K-means算法进行会员价值聚类,并加以RFM的指标,将具有相近终身价值的会员进行聚类。 39 | 40 | ### 步骤 41 | 1. 读取数据库中的数据(12个月),并清洗数据; 42 | 2. 将RFM中的三个指标,利用离差标准化将其数据标准化; 43 | 44 | ![FM](https://github.com/HarveyLau/RFM-Clustering/blob/master/img-storage/FM_score.png) 45 | 46 | ![R](https://github.com/HarveyLau/RFM-Clustering/blob/master/img-storage/R_score.png) 47 | 48 | 3. 应用AHP层次分析法来获取权重,并将各个指标加权;运用上述专家评定的评分矩阵: 49 | 4. 其中M的权重最大,即专家认为会员交易金额的高低是影响会员价值高低的最重要因素。 50 | 5. 根据CRM项目组的需求文案,确认聚类的类别的类别数量为K; 51 | 6. 将每类用户的RFM平均值和总的RFM平均值做比较,通过比较得到每类会员RFM的变动情况; 52 | 7. 分析会员的终身价值类别 53 | 54 | 指标 最小值 最大值 平均值 标准差 55 | 近度 2 128 60.07 20.191 56 | 频度 0 13 5.98 1.861 57 | 值度 54.43 1499.17 704.7467 216.22068 58 | 由于RFM三个指标的量纲不同,因此需要消除分布差异大的影响和量纲不同的影响。 59 | ### K-means数据聚类 60 | K-means算法是很典型的基于距离的聚类算法,采用距离作为相似性的评价指标,即认为两个对象的距离越近,其相似度就越大。该算法认为簇是由距离靠近的对象组成的,因此把得到紧凑且独立的簇作为最终目标。 61 | * 现在使用这三个因子作为本次建模的特征值(R、F、M),每个因子有两个变化,高与低,由此确认K的值: 62 | 63 | ![K的取值](https://github.com/HarveyLau/RFM-Clustering/blob/master/img-storage/k%3D2%5E3.png) 64 | 65 | * 应用于每类价值的会员: 66 | 67 | R F M Result 68 | 0 0 0 流失客户 69 | 0 0 1 一般维持客户 70 | 0 1 0 新客户 71 | 0 1 1 潜力客户 72 | 1 0 0 重要挽留客户 73 | 1 0 1 重要深耕客户 74 | 1 1 0 重要唤醒客户 75 | 1 1 1 重要价值客户 76 | 77 | * 算法的实现K-means in Python 78 | 在Python或Spark Milb包中,已经有对K-means、K-means++成熟的集成,详细的聚类算法讲解,我将放在文献和附录里面。而这里我们使用的距离公式采用默认欧的几何公式来推算: 79 | 80 | ![DIS](https://github.com/HarveyLau/RFM-Clustering/blob/master/img-storage/dis.png) 81 | 82 | ## 会员终身价值得分(特征结合) 83 | 1. AHP层次分析法 84 | 2. K-means(质心) 85 | 86 | * 总得分: 87 | 88 | ![C](https://github.com/HarveyLau/RFM-Clustering/blob/master/img-storage/C_score.png) 89 | 90 | * 其中C是每一类的质心,按照总得分来进行标签逻辑 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | -------------------------------------------------------------------------------- /img-storage/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HarveyLau/RFM-Clustering/b7fd1fda5ec7325df92cbb732bb359ebb2cb25fd/img-storage/.DS_Store -------------------------------------------------------------------------------- /img-storage/C_score.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HarveyLau/RFM-Clustering/b7fd1fda5ec7325df92cbb732bb359ebb2cb25fd/img-storage/C_score.png -------------------------------------------------------------------------------- /img-storage/FM_score.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HarveyLau/RFM-Clustering/b7fd1fda5ec7325df92cbb732bb359ebb2cb25fd/img-storage/FM_score.png -------------------------------------------------------------------------------- /img-storage/R_score.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HarveyLau/RFM-Clustering/b7fd1fda5ec7325df92cbb732bb359ebb2cb25fd/img-storage/R_score.png -------------------------------------------------------------------------------- /img-storage/dis.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HarveyLau/RFM-Clustering/b7fd1fda5ec7325df92cbb732bb359ebb2cb25fd/img-storage/dis.png -------------------------------------------------------------------------------- /img-storage/k=2^3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HarveyLau/RFM-Clustering/b7fd1fda5ec7325df92cbb732bb359ebb2cb25fd/img-storage/k=2^3.png -------------------------------------------------------------------------------- /img-storage/w^3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HarveyLau/RFM-Clustering/b7fd1fda5ec7325df92cbb732bb359ebb2cb25fd/img-storage/w^3.png --------------------------------------------------------------------------------