├── README.md
├── data
│   ├── DataModel.xlsx
│   ├── DataRaw.xlsx
│   └── 注册资本Ranks
├── data_explore
│   ├── __init__.py
│   └── correlationAnalysic.py
├── feature_proj
│   ├── __init__.py
│   ├── extractFeatures.py
│   └── featureSelector.py
├── pictures
│   ├── 决策树.png
│   ├── 手肘法.png
│   ├── 推荐架构.png
│   ├── 某赋通数学期望.png
│   ├── 特征相关性计算.png
│   ├── 特征累计重要性计算.png
│   ├── 特征重要性计算.png
│   ├── 算法选型.png
│   └── 聚类分群.png
├── recommender.py
├── requirements.txt
├── setting.py
├── user_point
│   ├── __init__.py
│   ├── evaluationFunction.py
│   └── giveScores.py
├── user_recom
│   ├── DecisionTree.py
│   ├── __init__.py
│   └── decesionTree.dot
└── utils
    ├── __init__.py
    ├── cutScopebusiness.py
    ├── fasttextClassfy.py
    └── joinExcelByIndex.py

/README.md:
--------------------------------------------------------------------------------
1 | # recommender
2 | 
3 | ### Recommendation model overview
4 | - Goal: there are three products, 某企通, 某赋通 and 某票; the recommender aims to raise the conversion rate of users from 某企通 and 某票 to 某赋通.
5 | - Data profile: user-data volume ratio 某企通 : 某赋通 : 某票 = 100 : 10 : 1.
6 | - Algorithm selection: the currently popular Item_CF and User_CF were tried first, but the recommendations they produced were unsatisfactory; the comparison is shown below:
7 | 
8 | - Recommendation model: an RFM model, a decision-tree model and the experience of the operations team are combined; the overall recommendation architecture is shown below:
9 | 
10 | 
11 | ### Expected value of purchasing 某赋通
12 | - Core idea: segment the 某企通 users with K-means, running the elbow method with the number of clusters set to 2, 3, 4 ... 10 in turn. For each clustering,
13 | compute the expected purchase rate of 某赋通 within every cluster; the goal is to find the cluster with the highest expectation and target it with profile-based recommendations.
14 | - Segmentation results: as the number of clusters increases, the expected purchase rate of 某赋通 behaves as shown below:
15 | 
16 | - Conclusion: the expected purchase rate of 某赋通 is low in every cluster, which also shows why collaborative filtering is unsuitable here: the data are dominated by non-buyers,
17 | so the class imbalance between buyers and non-buyers has to be addressed first.
18 | 
19 | ### Approach
20 | A deliberately weak classifier, a decision tree, is chosen and combined with the traditional RFM model to generate user recommendations.
21 | 
22 | ### Environment
23 | - Windows 10
24 | - Python 3.6.5
25 | 
26 | ### Dependencies
27 | ```
28 | pip install -r requirements.txt
29 | ```
30 | 
31 | ### Run
32 | ```
33 | python recommender.py
34 | ```
35 | 
36 | ### Modelling process
37 | - Feature engineering:
38 | > Data cleaning: clean text, numeric, date and missing data, with consistency checks and business-logic checks.<br>
39 | > Feature selection: correlation analysis and feature-importance ranking.
40 | 
41 | - User profiling:
42 | > Expiry-based recommendation: recommend renewal to users whose subscription is about to expire;<br>
43 | > Utility-based recommendation: invoice count, invoice amount and login frequency are the main indicators that make up utility;<br>
44 | > RFM (user-value) based recommendation: recommend, at the right moment, to users with high spending amount and high purchase frequency;<br>
45 | > Hybrid recommendation: in practice, few production systems rely on a single algorithm; the results of different algorithms are weighted and combined,
46 | or different algorithms are mixed at different stages of the pipeline, so that the result fits the business better.
47 | - Recommendation engine
48 | > Decision tree: node impurity is measured with the Gini index; cumulative feature importance was already computed during feature selection, so the tree is not pruned here.
49 | <br>
50 | > RFM model: after repeatedly tuning the weights, the operations team settled on the following scoring formula (`computeTotalScore`):
51 | ```python
52 | def computeTotalScore(scoreDic):
53 |     '''
54 |     Combine one user's per-feature ratings into an overall recommendation score.
55 |     :param scoreDic: per-feature ratings for one user; the expected keys are the header defined in setting.py:
56 |     index = ["times_wp","je_wp","deadline_wp","times_bqt","je_bqt","recently_bqt","userConsumeTotalTimes","userConsumeTotalAmount","deadline","loginFrequency",
57 |              "registeredCapital","industry","dateOfEstablishment","dayCountAvg","daySumAvg"]
58 |     :return: the overall score for the target product
59 |     '''
60 |     if len(scoreDic) < 1: raise Exception('scoreDic is none')
61 |     if scoreDic.get('dayLoginTimes') == None: scoreDic['dayLoginTimes'] = 0
62 |     score_common = scoreDic['registeredCapital'] * 10 + scoreDic['dayCountAvg'] * 30 + scoreDic['loginFrequency'] * 30 + \
63 |                    scoreDic['daySumAvg'] * 30 + scoreDic['dateOfEstablishment'] * 10 + scoreDic['industry'] * 10
64 |     if scoreDic['deadline'] == 5: bft = 200 + (scoreDic['userConsumeTotalAmount'] + scoreDic['userConsumeTotalTimes']) * 50
65 |     else: bft = 0
66 |     if scoreDic['recently_bqt'] == 5: bqt = 100 + (scoreDic['je_bqt'] + scoreDic['times_bqt']) * 50
67 |     else: bqt = 0
68 |     if scoreDic['deadline_wp'] == 5: wp = 100 + (scoreDic['je_wp'] + scoreDic['times_wp']) * 50
69 |     else: wp = 0
70 |     score = bft + bqt + wp + score_common
71 |     return score
72 | ```
73 | 
74 | ### Room for improvement
75 | 1. Step up the collection of user data;<br>
76 | 2. Use the feedback from the most recent recommendation rounds to refine the model;<br>
77 | 3. Build a real-time recommendation system; Mahout-based recommender code will be added later.<br>
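
Below is a minimal sketch of the per-cluster purchase-expectation step described in the "Expected value of purchasing 某赋通" section above. It is an illustration only, not the project's actual clustering script (that lives in `user_point/giveScores.py`); the function name `purchase_expectation_by_cluster`, its arguments and the assumption of a 0/1 purchase flag per user are hypothetical and introduced just for this example.

```python
import pandas as pd
from sklearn.cluster import KMeans

def purchase_expectation_by_cluster(features, bought, k_range=range(2, 11)):
    """For each k, cluster the 某企通 users with K-means and compute, inside every
    cluster, the mean of the 0/1 某赋通 purchase flag, i.e. the empirical expectation
    of buying 某赋通 in that cluster. `inertias` supports the elbow-method plot."""
    X = features.fillna(0).values          # numeric user-feature matrix (assumed already encoded)
    inertias, expectations = {}, {}
    for k in k_range:
        km = KMeans(n_clusters=k, random_state=0, n_init=10).fit(X)
        inertias[k] = km.inertia_          # within-cluster sum of squares, used for the elbow plot
        grouped = pd.DataFrame({"cluster": km.labels_, "bought": list(bought)})
        expectations[k] = grouped.groupby("cluster")["bought"].mean().to_dict()
    return inertias, expectations
```

Choosing k at the elbow of `inertias` and then picking the cluster with the largest value in `expectations[k]` gives the user group to target, which is the selection step the section above describes; as noted there, even that maximum stays low, which is what motivates the decision-tree plus RFM approach used in the rest of the project.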
78 | -------------------------------------------------------------------------------- /data/DataModel.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SimonWang00/recommender/02b26535b3cb754d006036e577586a270648c50d/data/DataModel.xlsx -------------------------------------------------------------------------------- /data/DataRaw.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SimonWang00/recommender/02b26535b3cb754d006036e577586a270648c50d/data/DataRaw.xlsx -------------------------------------------------------------------------------- /data_explore/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # __*__ coding: utf-8 __*__ 3 | 4 | ''' 5 | @Author: simonKing 6 | @License: (C) Copyright 2013-2019, Best Wonder Corporation Limited. 7 | @Os:Windows 10 x64 8 | @Contact: bw_wangxiaomeng@whty.com.cn 9 | @Software: PY PyCharm 10 | @File: __init__.py.py 11 | @Time: 2019/7/12 16:30 12 | @Desc: define your function 13 | ''' -------------------------------------------------------------------------------- /data_explore/correlationAnalysic.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # __*__ coding: utf-8 __*__ 3 | 4 | ''' 5 | @Author: simonKing 6 | @License: (C) Copyright 2013-2019, Best Wonder Corporation Limited. 7 | @Os:Windows 10 x64 8 | @Contact: bw_wangxiaomeng@whty.com.cn 9 | @Software: PY PyCharm 10 | @File: correlationAnalysic.py 11 | @Time: 2019/7/12 14:38 12 | @Desc: 关联性推荐 13 | ''' 14 | import pandas as pd 15 | import pylab as plt 16 | 17 | 18 | def corrPer(seris1,seris2,index = None): 19 | ''' 20 | 计算数列间的相关系数 21 | :param seris1:数列1 22 | :param seris2:数列2 23 | :param index:数列名称 24 | :return:相关系数,1相关,0不相关,-1负相关 25 | ''' 26 | # 利用Series将列表转换成新的、pandas可处理的数据 27 | s1 = pd.Series(seris1) 28 | s2 = pd.Series(seris2) 29 | # 计算皮尔逊相关系数,round(a, 4)是保留a的前四位小数 30 | corr_per = round(s1.corr(s2), 4) 31 | # print('corr_per :', corr_per) 32 | # 最后画一下两列表散点图,直观感受下,结合相关系数揣摩揣摩 33 | plt.scatter(seris1, seris2) 34 | if index: 35 | plt.xlabel(index[0]) 36 | plt.ylabel(index[1]) 37 | plt.title('corr_per :' + str(corr_per), fontproperties='SimHei') 38 | plt.show() 39 | return corr_per 40 | 41 | 42 | def graAnalysic(seris1, seris2): 43 | ''' 44 | 分析两个数列的相关性 45 | :param seris1:数列1 46 | :param seris2:数列2 47 | :return:-1负相关,0不相干,1正相关 48 | ''' 49 | x = pd.DataFrame(data=[seris1,seris2]) 50 | # 1、数据均值化处理 51 | x_mean = x.mean(axis=1) 52 | for i in range(x.index.size): 53 | x.iloc[i,:] = x.iloc[i,:]/x_mean[i] 54 | # 2、提取参考队列和比较队列 55 | ck=x.iloc[0,:] 56 | cp=x.iloc[1:,:] 57 | # 比较队列与参考队列相减 58 | t=pd.DataFrame() 59 | for j in range(cp.index.size): 60 | temp=pd.Series(cp.iloc[j,:] - ck) 61 | t=t.append(temp,ignore_index=True) 62 | #求最大差和最小差 63 | mmax=t.abs().max().max() 64 | mmin=t.abs().min().min() 65 | rho=0.5 66 | #3、求关联系数 67 | ksi=((mmin + rho*mmax)/(abs(t) + rho*mmax)) 68 | #4、求关联度 69 | r=ksi.sum(axis=1)/ksi.columns.size 70 | #5、关联度排序 71 | result=r.sort_values(ascending=False) 72 | plt.plot(seris1) 73 | plt.plot(seris2) 74 | plt.show() 75 | return result 76 | 77 | # if __name__ == '__main__': 78 | # pass 79 | # s1 = [0.4755,0.4299,0.6358,0.7527,0.4228,0.3358] 80 | # s2 = [0.6591,0.5739,0.5465,0.8993,0.6661,0.4037] 81 | # result = graAnalysic(s1,s2) -------------------------------------------------------------------------------- 
/feature_proj/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # __*__ coding: utf-8 __*__ 3 | 4 | ''' 5 | @Author: simonKing 6 | @License: (C) Copyright 2013-2019, Best Wonder Corporation Limited. 7 | @Os:Windows 10 x64 8 | @Contact: bw_wangxiaomeng@whty.com.cn 9 | @Software: PY PyCharm 10 | @File: __init__.py.py 11 | @Time: 2019/6/24 9:11 12 | @Desc: define your function 13 | ''' -------------------------------------------------------------------------------- /feature_proj/extractFeatures.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # __*__ coding: utf-8 __*__ 3 | 4 | ''' 5 | @Author: simonKing 6 | @License: (C) Copyright 2013-2019, Best Wonder Corporation Limited. 7 | @Os:Windows 10 x64 8 | @Contact: bw_wangxiaomeng@whty.com.cn 9 | @Software: PY PyCharm 10 | @File: extractFeatures.py 11 | @Time: 2019/6/24 13:57 12 | @Desc: define your function 13 | ''' 14 | 15 | import pandas as pd 16 | from feature_proj.featureSelector import FeatureSelector 17 | 18 | # 用户消费数据保密,不开源 19 | fname = '' 20 | train = pd.read_excel(fname) 21 | index = ["times-wp","je-wp","recently-wp","times-bqt","je-bqt","recently-bqt","times-bft","je-bft","recently-bft", 22 | "zczb_x","industry_x","ages_x","dayInvoiceNum_x","dayInvoiceJe_x","flag"] 23 | 24 | train = train.ix[:, index] 25 | 26 | train_labels = train['flag'] 27 | 28 | labels = [] 29 | for label in train_labels: 30 | if label <3:labels.append(1) 31 | else:labels.append(label) 32 | 33 | train = train.drop(columns = ['flag']) 34 | 35 | fs = FeatureSelector(data = train, labels = labels) 36 | 37 | # 皮尔逊相关系数 相关系数 0.8-1.0 极强相关 , 0.6-0.8 强相关 ,0.4-0.6 中等程度相关 ,0.2-0.4 弱相关 ,0.0-0.2 极弱相关或无相关 38 | fs.identify_collinear(correlation_threshold=0.9) 39 | correlated_features = fs.ops['collinear'] 40 | print(correlated_features[:]) 41 | # # 热力图 42 | fs.plot_collinear() 43 | fs.plot_collinear(plot_all=True) 44 | 45 | # 查看分类效果 46 | fs.identify_zero_importance(task = 'classification', eval_metric = 'auc', n_iterations = 10, early_stopping = True) 47 | print(fs.data_all.head(10)) 48 | zero_importance_features = fs.ops['zero_importance'] 49 | print(zero_importance_features[10:15]) 50 | # 变量重要性排名 51 | fs.plot_feature_importances(threshold = 0.9, plot_n = 14) 52 | 53 | # 累计重要性达到0.99,的特征排名 54 | fs.identify_low_importance(cumulative_importance = 0.99) 55 | low_importance_features = fs.ops['low_importance'] 56 | print(low_importance_features[:5]) 57 | 58 | # 特征删除 59 | # train_no_missing = fs.remove(methods = ['missing']) 60 | # # 删除多个 61 | # fs.identify_all(selection_params = {'missing_threshold': 0.6, 'correlation_threshold': 0.98,'task': 'classification', 62 | # 'eval_metric': 'auc','cumulative_importance': 0.99}) 63 | # # train_no_missing_zero = fs.remove(methods = ['missing', 'zero_importance']) 64 | # all_to_remove = fs.check_removal() 65 | # all_to_remove[10:25] 66 | # train_removed = fs.remove(methods = 'all') 67 | 68 | 69 | 70 | 71 | -------------------------------------------------------------------------------- /feature_proj/featureSelector.py: -------------------------------------------------------------------------------- 1 | # numpy and pandas for data manipulation 2 | import pandas as pd 3 | import numpy as np 4 | 5 | # model used for feature importances 6 | import lightgbm as lgb 7 | 8 | # utility for early stopping with a validation set 9 | from sklearn.model_selection import train_test_split 10 | 11 | # visualizations 12 | import 
matplotlib.pyplot as plt 13 | import seaborn as sns 14 | 15 | # memory management 16 | import gc 17 | 18 | # utilities 19 | from itertools import chain 20 | 21 | class FeatureSelector(): 22 | """ 23 | Class for performing feature selection for machine learning or data preprocessing. 24 | 25 | Implements five different methods to identify features for removal 26 | 27 | 1. Find columns with a missing percentage greater than a specified threshold 28 | 2. Find columns with a single unique value 29 | 3. Find collinear variables with a correlation greater than a specified correlation coefficient 30 | 4. Find features with 0.0 feature importance from a gradient boosting machine (gbm) 31 | 5. Find low importance features that do not contribute to a specified cumulative feature importance from the gbm 32 | 33 | Parameters 34 | -------- 35 | data : dataframe 36 | A dataset with observations in the rows and features in the columns 37 | 38 | labels : array or series, default = None 39 | Array of labels for training the machine learning model to find feature importances. These can be either binary labels 40 | (if task is 'classification') or continuous targets (if task is 'regression'). 41 | If no labels are provided, then the feature importance based methods are not available. 42 | 43 | Attributes 44 | -------- 45 | 46 | ops : dict 47 | Dictionary of operations run and features identified for removal 48 | 49 | missing_stats : dataframe 50 | The fraction of missing values for all features 51 | 52 | record_missing : dataframe 53 | The fraction of missing values for features with missing fraction above threshold 54 | 55 | unique_stats : dataframe 56 | Number of unique values for all features 57 | 58 | record_single_unique : dataframe 59 | Records the features that have a single unique value 60 | 61 | corr_matrix : dataframe 62 | All correlations between all features in the data 63 | 64 | record_collinear : dataframe 65 | Records the pairs of collinear variables with a correlation coefficient above the threshold 66 | 67 | feature_importances : dataframe 68 | All feature importances from the gradient boosting machine 69 | 70 | record_zero_importance : dataframe 71 | Records the zero importance features in the data according to the gbm 72 | 73 | record_low_importance : dataframe 74 | Records the lowest importance features not needed to reach the threshold of cumulative importance according to the gbm 75 | 76 | 77 | Notes 78 | -------- 79 | 80 | - All 5 operations can be run with the `identify_all` method. 81 | - If using feature importances, one-hot encoding is used for categorical variables which creates new columns 82 | 83 | """ 84 | 85 | def __init__(self, data, labels=None): 86 | 87 | # Dataset and optional training labels 88 | self.data = data 89 | self.labels = labels 90 | 91 | if labels is None: 92 | print('No labels provided. 
Feature importance based methods are not available.') 93 | 94 | self.base_features = list(data.columns) 95 | self.one_hot_features = None 96 | 97 | # Dataframes recording information about features to remove 98 | self.record_missing = None 99 | self.record_single_unique = None 100 | self.record_collinear = None 101 | self.record_zero_importance = None 102 | self.record_low_importance = None 103 | 104 | self.missing_stats = None 105 | self.unique_stats = None 106 | self.corr_matrix = None 107 | self.feature_importances = None 108 | 109 | # Dictionary to hold removal operations 110 | self.ops = {} 111 | 112 | self.one_hot_correlated = False 113 | 114 | def identify_missing(self, missing_threshold): 115 | """Find the features with a fraction of missing values above `missing_threshold`""" 116 | 117 | self.missing_threshold = missing_threshold 118 | 119 | # Calculate the fraction of missing in each column 120 | missing_series = self.data.isnull().sum() / self.data.shape[0] 121 | self.missing_stats = pd.DataFrame(missing_series).rename(columns = {'index': 'feature', 0: 'missing_fraction'}) 122 | 123 | # Sort with highest number of missing values on top 124 | self.missing_stats = self.missing_stats.sort_values('missing_fraction', ascending = False) 125 | 126 | # Find the columns with a missing percentage above the threshold 127 | record_missing = pd.DataFrame(missing_series[missing_series > missing_threshold]).reset_index().rename(columns = 128 | {'index': 'feature', 129 | 0: 'missing_fraction'}) 130 | 131 | to_drop = list(record_missing['feature']) 132 | 133 | self.record_missing = record_missing 134 | self.ops['missing'] = to_drop 135 | 136 | print('%d features with greater than %0.2f missing values.\n' % (len(self.ops['missing']), self.missing_threshold)) 137 | 138 | def identify_single_unique(self): 139 | """Finds features with only a single unique value. NaNs do not count as a unique value. """ 140 | 141 | # Calculate the unique counts in each column 142 | unique_counts = self.data.nunique() 143 | self.unique_stats = pd.DataFrame(unique_counts).rename(columns = {'index': 'feature', 0: 'nunique'}) 144 | self.unique_stats = self.unique_stats.sort_values('nunique', ascending = True) 145 | 146 | # Find the columns with only one unique count 147 | record_single_unique = pd.DataFrame(unique_counts[unique_counts == 1]).reset_index().rename(columns = {'index': 'feature', 148 | 0: 'nunique'}) 149 | 150 | to_drop = list(record_single_unique['feature']) 151 | 152 | self.record_single_unique = record_single_unique 153 | self.ops['single_unique'] = to_drop 154 | 155 | print('%d features with a single unique value.\n' % len(self.ops['single_unique'])) 156 | 157 | def identify_collinear(self, correlation_threshold, one_hot=False): 158 | """ 159 | Finds collinear features based on the correlation coefficient between features. 160 | For each pair of features with a correlation coefficient greather than `correlation_threshold`, 161 | only one of the pair is identified for removal. 
162 | 163 | Using code adapted from: https://chrisalbon.com/machine_learning/feature_selection/drop_highly_correlated_features/ 164 | 165 | Parameters 166 | -------- 167 | 168 | correlation_threshold : float between 0 and 1 169 | Value of the Pearson correlation cofficient for identifying correlation features 170 | 171 | one_hot : boolean, default = False 172 | Whether to one-hot encode the features before calculating the correlation coefficients 173 | 174 | """ 175 | 176 | self.correlation_threshold = correlation_threshold 177 | self.one_hot_correlated = one_hot 178 | 179 | # Calculate the correlations between every column 180 | if one_hot: 181 | 182 | # One hot encoding 183 | features = pd.get_dummies(self.data) 184 | self.one_hot_features = [column for column in features.columns if column not in self.base_features] 185 | 186 | # Add one hot encoded data to original data 187 | self.data_all = pd.concat([features[self.one_hot_features], self.data], axis = 1) 188 | 189 | corr_matrix = pd.get_dummies(features).corr() 190 | 191 | else: 192 | corr_matrix = self.data.corr() 193 | 194 | self.corr_matrix = corr_matrix 195 | 196 | # Extract the upper triangle of the correlation matrix 197 | upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k = 1).astype(np.bool)) 198 | 199 | # Select the features with correlations above the threshold 200 | # Need to use the absolute value 201 | to_drop = [column for column in upper.columns if any(upper[column].abs() > correlation_threshold)] 202 | 203 | # Dataframe to hold correlated pairs 204 | record_collinear = pd.DataFrame(columns = ['drop_feature', 'corr_feature', 'corr_value']) 205 | 206 | # Iterate through the columns to drop to record pairs of correlated features 207 | for column in to_drop: 208 | 209 | # Find the correlated features 210 | corr_features = list(upper.index[upper[column].abs() > correlation_threshold]) 211 | 212 | # Find the correlated values 213 | corr_values = list(upper[column][upper[column].abs() > correlation_threshold]) 214 | drop_features = [column for _ in range(len(corr_features))] 215 | 216 | # Record the information (need a temp df for now) 217 | temp_df = pd.DataFrame.from_dict({'drop_feature': drop_features, 218 | 'corr_feature': corr_features, 219 | 'corr_value': corr_values}) 220 | 221 | # Add to dataframe 222 | record_collinear = record_collinear.append(temp_df, ignore_index = True) 223 | 224 | self.record_collinear = record_collinear 225 | self.ops['collinear'] = to_drop 226 | 227 | print('%d features with a correlation magnitude greater than %0.2f.\n' % (len(self.ops['collinear']), self.correlation_threshold)) 228 | 229 | def identify_zero_importance(self, task, eval_metric=None, n_iterations=10, early_stopping = True): 230 | """ 231 | 232 | Identify the features with zero importance according to a gradient boosting machine. 233 | The gbm can be trained with early stopping using a validation set to prevent overfitting. 234 | The feature importances are averaged over `n_iterations` to reduce variance. 235 | 236 | Uses the LightGBM implementation (http://lightgbm.readthedocs.io/en/latest/index.html) 237 | 238 | Parameters 239 | -------- 240 | 241 | eval_metric : string 242 | Evaluation metric to use for the gradient boosting machine for early stopping. 
Must be 243 | provided if `early_stopping` is True 244 | 245 | task : string 246 | The machine learning task, either 'classification' or 'regression' 247 | 248 | n_iterations : int, default = 10 249 | Number of iterations to train the gradient boosting machine 250 | 251 | early_stopping : boolean, default = True 252 | Whether or not to use early stopping with a validation set when training 253 | 254 | 255 | Notes 256 | -------- 257 | 258 | - Features are one-hot encoded to handle the categorical variables before training. 259 | - The gbm is not optimized for any particular task and might need some hyperparameter tuning 260 | - Feature importances, including zero importance features, can change across runs 261 | 262 | """ 263 | 264 | if early_stopping and eval_metric is None: 265 | raise ValueError("""eval metric must be provided with early stopping. Examples include "auc" for classification or 266 | "l2" for regression.""") 267 | 268 | if self.labels is None: 269 | raise ValueError("No training labels provided.") 270 | 271 | # One hot encoding 272 | features = pd.get_dummies(self.data) 273 | self.one_hot_features = [column for column in features.columns if column not in self.base_features] 274 | 275 | # Add one hot encoded data to original data 276 | self.data_all = pd.concat([features[self.one_hot_features], self.data], axis = 1) 277 | 278 | # Extract feature names 279 | feature_names = list(features.columns) 280 | 281 | # Convert to np array 282 | features = np.array(features) 283 | labels = np.array(self.labels).reshape((-1, )) 284 | 285 | # Empty array for feature importances 286 | feature_importance_values = np.zeros(len(feature_names)) 287 | 288 | print('Training Gradient Boosting Model\n') 289 | 290 | # Iterate through each fold 291 | for _ in range(n_iterations): 292 | 293 | if task == 'classification': 294 | model = lgb.LGBMClassifier(n_estimators=1000, learning_rate = 0.05, verbose = -1) 295 | 296 | elif task == 'regression': 297 | model = lgb.LGBMRegressor(n_estimators=1000, learning_rate = 0.05, verbose = -1) 298 | 299 | else: 300 | raise ValueError('Task must be either "classification" or "regression"') 301 | 302 | # If training using early stopping need a validation set 303 | if early_stopping: 304 | 305 | train_features, valid_features, train_labels, valid_labels = train_test_split(features, labels, test_size = 0.15) 306 | 307 | # Train the model with early stopping 308 | model.fit(train_features, train_labels, eval_metric = eval_metric, 309 | eval_set = [(valid_features, valid_labels)], 310 | early_stopping_rounds = 100, verbose = -1) 311 | 312 | # Clean up memory 313 | gc.enable() 314 | del train_features, train_labels, valid_features, valid_labels 315 | gc.collect() 316 | 317 | else: 318 | model.fit(features, labels) 319 | 320 | # Record the feature importances 321 | feature_importance_values += model.feature_importances_ / n_iterations 322 | 323 | feature_importances = pd.DataFrame({'feature': feature_names, 'importance': feature_importance_values}) 324 | 325 | # Sort features according to importance 326 | feature_importances = feature_importances.sort_values('importance', ascending = False).reset_index(drop = True) 327 | 328 | # Normalize the feature importances to add up to one 329 | feature_importances['normalized_importance'] = feature_importances['importance'] / feature_importances['importance'].sum() 330 | feature_importances['cumulative_importance'] = np.cumsum(feature_importances['normalized_importance']) 331 | 332 | # Extract the features with zero importance 
333 | record_zero_importance = feature_importances[feature_importances['importance'] == 0.0] 334 | 335 | to_drop = list(record_zero_importance['feature']) 336 | 337 | self.feature_importances = feature_importances 338 | self.record_zero_importance = record_zero_importance 339 | self.ops['zero_importance'] = to_drop 340 | 341 | print('\n%d features with zero importance after one-hot encoding.\n' % len(self.ops['zero_importance'])) 342 | 343 | def identify_low_importance(self, cumulative_importance): 344 | """ 345 | Finds the lowest importance features not needed to account for `cumulative_importance` fraction 346 | of the total feature importance from the gradient boosting machine. As an example, if cumulative 347 | importance is set to 0.95, this will retain only the most important features needed to 348 | reach 95% of the total feature importance. The identified features are those not needed. 349 | 350 | Parameters 351 | -------- 352 | cumulative_importance : float between 0 and 1 353 | The fraction of cumulative importance to account for 354 | 355 | """ 356 | 357 | self.cumulative_importance = cumulative_importance 358 | 359 | # The feature importances need to be calculated before running 360 | if self.feature_importances is None: 361 | raise NotImplementedError("""Feature importances have not yet been determined. Call the `identify_zero_importance` method first.""") 362 | 363 | # Make sure most important features are on top 364 | self.feature_importances = self.feature_importances.sort_values('cumulative_importance') 365 | 366 | # Identify the features not needed to reach the cumulative_importance 367 | record_low_importance = self.feature_importances[self.feature_importances['cumulative_importance'] > cumulative_importance] 368 | 369 | to_drop = list(record_low_importance['feature']) 370 | 371 | self.record_low_importance = record_low_importance 372 | self.ops['low_importance'] = to_drop 373 | 374 | print('%d features required for cumulative importance of %0.2f after one hot encoding.' % (len(self.feature_importances) - 375 | len(self.record_low_importance), self.cumulative_importance)) 376 | print('%d features do not contribute to cumulative importance of %0.2f.\n' % (len(self.ops['low_importance']), 377 | self.cumulative_importance)) 378 | 379 | def identify_all(self, selection_params): 380 | """ 381 | Use all five of the methods to identify features to remove. 382 | 383 | Parameters 384 | -------- 385 | 386 | selection_params : dict 387 | Parameters to use in the five feature selection methhods. 388 | Params must contain the keys ['missing_threshold', 'correlation_threshold', 'eval_metric', 'task', 'cumulative_importance'] 389 | 390 | """ 391 | 392 | # Check for all required parameters 393 | for param in ['missing_threshold', 'correlation_threshold', 'eval_metric', 'task', 'cumulative_importance']: 394 | if param not in selection_params.keys(): 395 | raise ValueError('%s is a required parameter for this method.' 
% param) 396 | 397 | # Implement each of the five methods 398 | self.identify_missing(selection_params['missing_threshold']) 399 | self.identify_single_unique() 400 | self.identify_collinear(selection_params['correlation_threshold']) 401 | self.identify_zero_importance(task = selection_params['task'], eval_metric = selection_params['eval_metric']) 402 | self.identify_low_importance(selection_params['cumulative_importance']) 403 | 404 | # Find the number of features identified to drop 405 | self.all_identified = set(list(chain(*list(self.ops.values())))) 406 | self.n_identified = len(self.all_identified) 407 | 408 | print('%d total features out of %d identified for removal after one-hot encoding.\n' % (self.n_identified, 409 | self.data_all.shape[1])) 410 | 411 | def check_removal(self, keep_one_hot=True): 412 | 413 | """Check the identified features before removal. Returns a list of the unique features identified.""" 414 | 415 | self.all_identified = set(list(chain(*list(self.ops.values())))) 416 | print('Total of %d features identified for removal' % len(self.all_identified)) 417 | 418 | if not keep_one_hot: 419 | if self.one_hot_features is None: 420 | print('Data has not been one-hot encoded') 421 | else: 422 | one_hot_to_remove = [x for x in self.one_hot_features if x not in self.all_identified] 423 | print('%d additional one-hot features can be removed' % len(one_hot_to_remove)) 424 | 425 | return list(self.all_identified) 426 | 427 | 428 | def remove(self, methods, keep_one_hot = True): 429 | """ 430 | Remove the features from the data according to the specified methods. 431 | 432 | Parameters 433 | -------- 434 | methods : 'all' or list of methods 435 | If methods == 'all', any methods that have identified features will be used 436 | Otherwise, only the specified methods will be used. 437 | Can be one of ['missing', 'single_unique', 'collinear', 'zero_importance', 'low_importance'] 438 | keep_one_hot : boolean, default = True 439 | Whether or not to keep one-hot encoded features 440 | 441 | Return 442 | -------- 443 | data : dataframe 444 | Dataframe with identified features removed 445 | 446 | 447 | Notes 448 | -------- 449 | - If feature importances are used, the one-hot encoded columns will be added to the data (and then may be removed) 450 | - Check the features that will be removed before transforming data! 
451 | 452 | """ 453 | 454 | 455 | features_to_drop = [] 456 | 457 | if methods == 'all': 458 | 459 | # Need to use one-hot encoded data as well 460 | data = self.data_all 461 | 462 | print('{} methods have been run\n'.format(list(self.ops.keys()))) 463 | 464 | # Find the unique features to drop 465 | features_to_drop = set(list(chain(*list(self.ops.values())))) 466 | 467 | else: 468 | # Need to use one-hot encoded data as well 469 | if 'zero_importance' in methods or 'low_importance' in methods or self.one_hot_correlated: 470 | data = self.data_all 471 | 472 | else: 473 | data = self.data 474 | 475 | # Iterate through the specified methods 476 | for method in methods: 477 | 478 | # Check to make sure the method has been run 479 | if method not in self.ops.keys(): 480 | raise NotImplementedError('%s method has not been run' % method) 481 | 482 | # Append the features identified for removal 483 | else: 484 | features_to_drop.append(self.ops[method]) 485 | 486 | # Find the unique features to drop 487 | features_to_drop = set(list(chain(*features_to_drop))) 488 | 489 | features_to_drop = list(features_to_drop) 490 | 491 | if not keep_one_hot: 492 | 493 | if self.one_hot_features is None: 494 | print('Data has not been one-hot encoded') 495 | else: 496 | 497 | features_to_drop = list(set(features_to_drop) | set(self.one_hot_features)) 498 | 499 | # Remove the features and return the data 500 | data = data.drop(columns = features_to_drop) 501 | self.removed_features = features_to_drop 502 | 503 | if not keep_one_hot: 504 | print('Removed %d features including one-hot features.' % len(features_to_drop)) 505 | else: 506 | print('Removed %d features.' % len(features_to_drop)) 507 | 508 | return data 509 | 510 | def plot_missing(self): 511 | """Histogram of missing fraction in each feature""" 512 | if self.record_missing is None: 513 | raise NotImplementedError("Missing values have not been calculated. Run `identify_missing`") 514 | 515 | self.reset_plot() 516 | 517 | # Histogram of missing values 518 | plt.style.use('seaborn-white') 519 | plt.figure(figsize = (7, 5)) 520 | plt.hist(self.missing_stats['missing_fraction'], bins = np.linspace(0, 1, 11), edgecolor = 'k', color = 'red', linewidth = 1.5) 521 | plt.xticks(np.linspace(0, 1, 11)) 522 | plt.xlabel('Missing Fraction', size = 14); plt.ylabel('Count of Features', size = 14) 523 | plt.title("Fraction of Missing Values Histogram", size = 16) 524 | plt.show() 525 | 526 | 527 | def plot_unique(self): 528 | """Histogram of number of unique values in each feature""" 529 | if self.record_single_unique is None: 530 | raise NotImplementedError('Unique values have not been calculated. Run `identify_single_unique`') 531 | 532 | self.reset_plot() 533 | 534 | # Histogram of number of unique values 535 | self.unique_stats.plot.hist(edgecolor = 'k', figsize = (7, 5)) 536 | plt.ylabel('Frequency', size = 14); plt.xlabel('Unique Values', size = 14) 537 | plt.title('Number of Unique Values Histogram', size = 16) 538 | plt.show() 539 | 540 | 541 | def plot_collinear(self, plot_all = False): 542 | """ 543 | Heatmap of the correlation values. If plot_all = True plots all the correlations otherwise 544 | plots only those features that have a correlation above the threshold 545 | 546 | Notes 547 | -------- 548 | - Not all of the plotted correlations are above the threshold because this plots 549 | all the variables that have been idenfitied as having even one correlation above the threshold 550 | - The features on the x-axis are those that will be removed. 
The features on the y-axis 551 | are the correlated features with those on the x-axis 552 | 553 | Code adapted from https://seaborn.pydata.org/examples/many_pairwise_correlations.html 554 | """ 555 | 556 | if self.record_collinear is None: 557 | raise NotImplementedError('Collinear features have not been idenfitied. Run `identify_collinear`.') 558 | 559 | if plot_all: 560 | corr_matrix_plot = self.corr_matrix 561 | title = 'All Correlations' 562 | 563 | else: 564 | # Identify the correlations that were above the threshold 565 | # columns (x-axis) are features to drop and rows (y_axis) are correlated pairs 566 | corr_matrix_plot = self.corr_matrix.loc[list(set(self.record_collinear['corr_feature'])), 567 | list(set(self.record_collinear['drop_feature']))] 568 | 569 | title = "Correlations Above Threshold" 570 | 571 | 572 | f, ax = plt.subplots(figsize=(10, 8)) 573 | 574 | # Diverging colormap 575 | cmap = sns.diverging_palette(220, 10, as_cmap=True) 576 | 577 | # Draw the heatmap with a color bar 578 | sns.heatmap(corr_matrix_plot, cmap=cmap, center=0,linewidths=.25, cbar_kws={"shrink": 0.6},annot=True) 579 | 580 | # Set the ylabels 581 | ax.set_yticks([x + 0.5 for x in list(range(corr_matrix_plot.shape[0]))]) 582 | ax.set_yticklabels(list(corr_matrix_plot.index), size = int(160 / corr_matrix_plot.shape[0])) 583 | 584 | # Set the xlabels 585 | ax.set_xticks([x + 0.5 for x in list(range(corr_matrix_plot.shape[1]))]) 586 | ax.set_xticklabels(list(corr_matrix_plot.columns), size = int(160 / corr_matrix_plot.shape[1])) 587 | plt.title(title, size = 14) 588 | plt.show() 589 | 590 | def plot_feature_importances(self, plot_n = 15, threshold = None): 591 | """ 592 | Plots `plot_n` most important features and the cumulative importance of features. 593 | If `threshold` is provided, prints the number of features needed to reach `threshold` cumulative importance. 594 | 595 | Parameters 596 | -------- 597 | 598 | plot_n : int, default = 15 599 | Number of most important features to plot. Defaults to 15 or the maximum number of features whichever is smaller 600 | 601 | threshold : float, between 0 and 1 default = None 602 | Threshold for printing information about cumulative importances 603 | 604 | """ 605 | 606 | if self.record_zero_importance is None: 607 | raise NotImplementedError('Feature importances have not been determined. 
Run `idenfity_zero_importance`') 608 | 609 | # Need to adjust number of features if greater than the features in the data 610 | if plot_n > self.feature_importances.shape[0]: 611 | plot_n = self.feature_importances.shape[0] - 1 612 | 613 | self.reset_plot() 614 | 615 | # Make a horizontal bar chart of feature importances 616 | plt.figure(figsize = (10, 6)) 617 | ax = plt.subplot() 618 | 619 | # Need to reverse the index to plot most important on top 620 | # There might be a more efficient method to accomplish this 621 | ax.barh(list(reversed(list(self.feature_importances.index[:plot_n]))), 622 | self.feature_importances['normalized_importance'][:plot_n], 623 | align = 'center', edgecolor = 'k') 624 | 625 | # Set the yticks and labels 626 | ax.set_yticks(list(reversed(list(self.feature_importances.index[:plot_n])))) 627 | ax.set_yticklabels(self.feature_importances['feature'][:plot_n], size = 12) 628 | 629 | # Plot labeling 630 | plt.xlabel('Normalized Importance', size = 16) 631 | plt.title('Feature Importances', size = 18) 632 | plt.show() 633 | 634 | # Cumulative importance plot 635 | plt.figure(figsize = (6, 4)) 636 | plt.plot(list(range(1, len(self.feature_importances) + 1)), self.feature_importances['cumulative_importance'], 'r-') 637 | plt.xlabel('Number of Features', size = 14); plt.ylabel('Cumulative Importance', size = 14) 638 | plt.title('Cumulative Feature Importance', size = 16) 639 | 640 | if threshold: 641 | 642 | # Index of minimum number of features needed for cumulative importance threshold 643 | # np.where returns the index so need to add 1 to have correct number 644 | importance_index = np.min(np.where(self.feature_importances['cumulative_importance'] > threshold)) 645 | plt.vlines(x = importance_index + 1, ymin = 0, ymax = 1, linestyles='--', colors = 'blue') 646 | plt.show() 647 | 648 | print('%d features required for %0.2f of cumulative importance' % (importance_index + 1, threshold)) 649 | 650 | def reset_plot(self): 651 | plt.rcParams = plt.rcParamsDefault 652 | -------------------------------------------------------------------------------- /pictures/决策树.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SimonWang00/recommender/02b26535b3cb754d006036e577586a270648c50d/pictures/决策树.png -------------------------------------------------------------------------------- /pictures/手肘法.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SimonWang00/recommender/02b26535b3cb754d006036e577586a270648c50d/pictures/手肘法.png -------------------------------------------------------------------------------- /pictures/推荐架构.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SimonWang00/recommender/02b26535b3cb754d006036e577586a270648c50d/pictures/推荐架构.png -------------------------------------------------------------------------------- /pictures/某赋通数学期望.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SimonWang00/recommender/02b26535b3cb754d006036e577586a270648c50d/pictures/某赋通数学期望.png -------------------------------------------------------------------------------- /pictures/特征相关性计算.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SimonWang00/recommender/02b26535b3cb754d006036e577586a270648c50d/pictures/特征相关性计算.png 
-------------------------------------------------------------------------------- /pictures/特征累计重要性计算.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SimonWang00/recommender/02b26535b3cb754d006036e577586a270648c50d/pictures/特征累计重要性计算.png -------------------------------------------------------------------------------- /pictures/特征重要性计算.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SimonWang00/recommender/02b26535b3cb754d006036e577586a270648c50d/pictures/特征重要性计算.png -------------------------------------------------------------------------------- /pictures/算法选型.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SimonWang00/recommender/02b26535b3cb754d006036e577586a270648c50d/pictures/算法选型.png -------------------------------------------------------------------------------- /pictures/聚类分群.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SimonWang00/recommender/02b26535b3cb754d006036e577586a270648c50d/pictures/聚类分群.png -------------------------------------------------------------------------------- /recommender.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # __*__ coding: utf-8 __*__ 3 | 4 | ''' 5 | @Author: simonKing 6 | @License: (C) Copyright 2013-2019, Best Wonder Corporation Limited. 7 | @Os:Windows 10 x64 8 | @Contact: bw_wangxiaomeng@whty.com.cn 9 | @Software: PY PyCharm 10 | @File: recommender.py 11 | @Time: 2019/7/16 15:37 12 | @Desc: define your function 13 | ''' 14 | import time 15 | import pymysql 16 | import datetime 17 | import pandas as pd 18 | from user_point.evaluationFunction import evaluationTotalProduct,computeTotalScore 19 | from user_recom.DecisionTree import Dtcmodel 20 | from setting import * 21 | 22 | 23 | def loadData(sql): 24 | ''' 25 | 从mysql中导入数据, 26 | :param sql: 27 | :return:DataFrame 28 | ''' 29 | conn = pymysql.connect(host=server,port=port,user=user,password=password,db=dbName,charset='utf8') 30 | result = pd.read_sql_query(sql=sql,con=conn) 31 | return result 32 | 33 | def getColumns(): 34 | ''' 35 | 接入生产环境的用户数据 36 | :return: 37 | ''' 38 | sql = "SELECT COLUMN_NAME FROM information_schema.COLUMNS WHERE TABLE_SCHEMA = 'dcm' AND TABLE_NAME = 'basic_info_all' LIMIT 10000;" 39 | conn = pymysql.connect(host=server,port=port,user=user,password=password,db=dbName,charset='utf8').cursor() 40 | conn.execute(sql) 41 | columns = conn.fetchall() 42 | conn.close() 43 | columns = [row[0] for row in columns] 44 | return columns 45 | 46 | def bftDeadtimeCalcu(deadtime): 47 | ''' 48 | 计算到期的天数 49 | :param deadtime:到期时间 50 | :return:天 51 | ''' 52 | if deadtime == None:return -1 53 | recently = str(deadtime) 54 | if recently.strip() == '0' or recently.strip() == None: 55 | return -1 56 | try: 57 | d1 = datetime.datetime.strptime(recently, '%Y-%m-%d') 58 | except: 59 | d1 = datetime.datetime.strptime(recently, '%Y/%m/%d') 60 | nwt = datetime.datetime.now() 61 | days = (d1 - nwt).days 62 | if days <0: days=720 63 | return days 64 | 65 | def deadtimeCalcu(recently): 66 | ''' 67 | 根据最近一次购买时间,算出到期时间 68 | :param recently:最近一次购买日期 69 | :return:还有多久到期 70 | ''' 71 | if recently ==None:return -1 72 | recently = str(recently) 73 | if recently.strip() == '0' or recently.strip() == None:return -1 74 | try: 75 | d1 = 
datetime.datetime.strptime(recently, '%Y-%m-%d') 76 | except: 77 | d1 = datetime.datetime.strptime(recently, '%Y/%m/%d') 78 | delta = datetime.timedelta(days=365) 79 | deadtime = d1 + delta 80 | nwt = datetime.datetime.now() 81 | days = (deadtime - nwt).days 82 | if days <0: days=720 83 | return days 84 | 85 | def agesCalcu(age): 86 | ''' 87 | 计算到目前为止,成立的日期 88 | :param establish:成立日期 89 | :return:成立了多少年 90 | ''' 91 | if age ==None:return 3 92 | age = str(age) 93 | if age.strip() =="未知" or age == '0': 94 | return 3 95 | try: 96 | d1 = datetime.datetime.strptime(age, '%Y-%m-%d') 97 | except: 98 | d1 = datetime.datetime.strptime(age, '%Y/%m/%d') 99 | nwt = datetime.datetime.now() 100 | years = round((nwt - d1).days/365,2) 101 | return years 102 | 103 | def rankCalcu(zczb): 104 | ''' 105 | 计算注册资本排名 106 | :param zczb:输入注册资本 107 | :return:排名 108 | ''' 109 | if zczb ==0 or zczb =="未知" or zczb =="0" or zczb ==None:return '未知' 110 | fname = Rfname 111 | f = open(fname,'r').readlines() 112 | zczbArr = pd.Series(f) 113 | 114 | zczbArr = [int(eval(zb.strip())) for zb in zczbArr] 115 | try: 116 | rank = zczbArr.index(int(eval(zczb))) + 1 117 | except: 118 | rank = '未知' 119 | return rank 120 | 121 | def scoreArrCalcu(data): 122 | ''' 123 | 计算推荐得分 124 | :param data:用户评级 125 | :return:所有得分 126 | ''' 127 | scoreArr = [computeTotalScore(dict(zip(index,data.iloc[i]))) for i in range(len(data))] 128 | return scoreArr 129 | 130 | def scoreCalcu(row): 131 | ''' 132 | 计算单个用户的得分 133 | :param row: 用户等级 134 | :return: 得分 135 | ''' 136 | score = computeTotalScore(dict(zip(index,row))) 137 | return score 138 | 139 | 140 | def exeTiming(func): 141 | ''' 142 | 设置装饰器,定时执行 143 | :param func:方法 144 | :return: 145 | ''' 146 | def wrapper(): 147 | t1 = time.time() 148 | nwt = datetime.datetime.now().strftime('%d %H') 149 | # 每月1号 5点执行 150 | if nwt == exeTime: 151 | func() 152 | print('recommend sucess !') 153 | time.sleep(60*60*24) 154 | else: 155 | time.sleep(60*30) 156 | t2 = time.time() 157 | print(t2-t1) 158 | return wrapper 159 | 160 | def outputXls(data): 161 | ''' 162 | 输出结果,如果用户应该推荐,就写入excel中 163 | :param data:用户数据 164 | :return: 165 | ''' 166 | OutputFile = Output + 'recommendList' + datetime.datetime.now().strftime('%Y%m%d') + '.xls' 167 | df = pd.DataFrame(data=data) 168 | try: 169 | # 生产环境导表头 170 | columns = getColumns() 171 | except: 172 | # 测试环境用index 173 | columns = index 174 | columns.append("Rank") 175 | columns.append("score") 176 | df.columns = columns 177 | df.to_excel(OutputFile) 178 | 179 | 180 | @exeTiming 181 | def recommend(): 182 | ''' 183 | 推荐主方法 184 | :return:输出名单 185 | ''' 186 | # 测试数据预处理 187 | test_data = pd.read_excel(Tname) 188 | data = test_data.ix[:,index] 189 | # 缺失填充0 190 | data = data.fillna(0) 191 | ages = [agesCalcu(y) for y in data['dateOfEstablishment']] 192 | recently_wp = [bftDeadtimeCalcu(x) for x in data['deadline_wp']] 193 | recently_bft = [bftDeadtimeCalcu(x) for x in data['deadline']] 194 | recently_bqt = [deadtimeCalcu(x) for x in data['recently_bqt']] 195 | data['dateOfEstablishment'] = ages 196 | data['deadline_wp'] = recently_wp 197 | data['deadline'] = recently_bft 198 | data['recently_bqt'] = recently_bqt 199 | data_raw = [] 200 | # print('raw_data:',data) 201 | # print(len(data)) 202 | # # 给输入数据评级 203 | for i in range(len(data)): 204 | row = data.iloc[i] 205 | test_X = evaluationTotalProduct(row,index) 206 | data_raw.append(test_X) 207 | RecommendList = [] 208 | # # 加载模型,推荐预测 209 | y_predict = Dtcmodel(Mname,index,data_raw).tolist() 210 | for j,label in 
enumerate(y_predict): 211 | if label ==3: 212 | row = data_raw[j] 213 | score = scoreCalcu(row) 214 | info = test_data.iloc[j].tolist() 215 | rank = rankCalcu(test_data.iloc[j]['registeredCapital']) 216 | info.append(rank) 217 | info.append(score) 218 | RecommendList.append(info) 219 | print("推荐意愿打分:",score) 220 | # 输出结果 221 | if len(RecommendList) > 0: 222 | print("注意!生成了推荐名单") 223 | outputXls(RecommendList) 224 | else: 225 | print("没有产生推荐名单") 226 | 227 | if __name__ =='__main__': 228 | while True: 229 | recommend() -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pandas==0.23.0 2 | numpy==1.16.3 3 | lightgbm==2.1.1 4 | sklearn 5 | matplotlib==3.0.3 6 | seaborn==0.8.1 7 | openpyxl==2.5.3 8 | IPython 9 | pydotplus==2.0.2 10 | pymysql==0.8.1 11 | jieba==0.39 12 | -------------------------------------------------------------------------------- /setting.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # __*__ coding: utf-8 __*__ 3 | 4 | ''' 5 | @Author: simonKing 6 | @License: (C) Copyright 2013-2019, Best Wonder Corporation Limited. 7 | @Os:Windows 10 x64 8 | @Contact: bw_wangxiaomeng@whty.com.cn 9 | @Software: PY PyCharm 10 | @File: setting.py 11 | @Time: 2019/8/18 11:21 12 | @Desc: 项目配置文件 13 | ''' 14 | 15 | # mysql config 16 | server = '127.0.0.1' 17 | port = 3306 18 | user = 'root' 19 | password = '000000' 20 | dbName = 'dcm' 21 | 22 | # 准备待预测数据 23 | Tname = "./data/DataRaw.xlsx" 24 | # 准备模型数据 25 | Mname = "./data/DataModel.xlsx" 26 | # header 27 | index = ["times_wp","je_wp","deadline_wp","times_bqt","je_bqt","recently_bqt","userConsumeTotalTimes","userConsumeTotalAmount","deadline","loginFrequency", 28 | "registeredCapital","industry","dateOfEstablishment","dayCountAvg","daySumAvg"] 29 | 30 | # 排名数据 31 | Rfname = "./data/注册资本Ranks" 32 | 33 | # 输出名单 34 | Output = './data/' 35 | 36 | # 定时执行的时间设置,如每月1号,早上5点执行 37 | exeTime = '01 05' -------------------------------------------------------------------------------- /user_point/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # __*__ coding: utf-8 __*__ 3 | 4 | ''' 5 | @Author: simonKing 6 | @License: (C) Copyright 2013-2019, Best Wonder Corporation Limited. 7 | @Os:Windows 10 x64 8 | @Contact: bw_wangxiaomeng@whty.com.cn 9 | @Software: PY PyCharm 10 | @File: __init__.py.py 11 | @Time: 2019/6/24 9:12 12 | @Desc: define your function 13 | ''' -------------------------------------------------------------------------------- /user_point/evaluationFunction.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # __*__ coding: utf-8 __*__ 3 | 4 | ''' 5 | @Author: simonKing 6 | @License: (C) Copyright 2013-2019, Best Wonder Corporation Limited. 
7 | @Os:Windows 10 x64 8 | @Contact: bw_wangxiaomeng@whty.com.cn 9 | @Software: PY PyCharm 10 | @File: evaluationFunction.py 11 | @Time: 2019/7/11 10:21 12 | @Desc: define your function 13 | ''' 14 | ''' 15 | R,到期时间,得分越高,最高5分,最低1分 16 | 90天内购买;5 17 | 90-180天未购买;4 18 | 180-360天未购买;3 19 | 360-720;2 20 | 720以上;1 21 | 22 | 23 | F,交易频率越高,得分越高,最高5分,最低1分 24 | [1,24] ,平均1.1 25 | 1 -->1分; 26 | 2 -->2分; 27 | 3 -->3分; 28 | 4 -->4分; 29 | 5以上 -->5分; 30 | 31 | M,交易金额越高,得分越高,最高5分,最低1分,平均金额374 32 | 大于2000 --> 5; 33 | 900~2000 --> 4; 34 | 370~900 --> 3; 35 | 200~370 --> 2; 36 | 0~200 --> 1; 37 | 38 | 注册资本: 39 | 大于1000万 --> 5; 40 | 500-1000万 --> 4; 41 | 100-500万 --> 3; 42 | 50-100万 --> 2; 43 | 50万以内,未知 -->1; 44 | 45 | 成立日期: 46 | 两年内 --> 2; 47 | 两年后 --> 1; 48 | 49 | 行业分类: 50 | 餐饮住宿 --> 3; 51 | 制造业 --> 2; 52 | 其它 --> 1; 53 | 54 | 日均登录次数: 55 | 5次及以上 --> 5; 56 | 4次 --> 4; 57 | 3次 --> 3; 58 | 2次 --> 2; 59 | 0,1次 --> 1; 60 | 61 | 日均开票次数: 62 | 10次及以上 --> 3; 63 | 5-10次 --> 2; 64 | 0-5次 --> 1; 65 | 66 | 日均开票金额: 67 | 1000及以上 --> 3; 68 | 100-1000 --> 2; 69 | 100以内 --> 1; 70 | ''' 71 | 72 | 73 | import pandas as pd 74 | 75 | 76 | def loadExcel(fname,index): 77 | ''' 78 | 从Excel 中导入数据 79 | :param fname:excel文件名 80 | :param index:导入的列 81 | :return:矩阵 82 | ''' 83 | df = pd.read_excel(fname) 84 | return df.ix[:, index] 85 | 86 | def R_eval(days): 87 | ''' 88 | 最近一次购买,分析最近的购买需求度,越小越好 89 | :param days:距离2019年7月最近一次购买天数 90 | :return:1-5 91 | ''' 92 | if int(days) == -1:return 0 93 | if abs(days) <90:score = 5 94 | elif 90<= abs(days) < 180:score = 4 95 | elif 180<= abs(days) < 360:score = 3 96 | elif 360<= abs(days) < 720:score = 2 97 | elif abs(days) >= 720:score = 1 98 | else:raise Exception('check days type. ERR') 99 | return score 100 | 101 | def F_eval(times): 102 | ''' 103 | 购买频率打分,频率越高越忠诚,得分越高 104 | :param times:购买次数 105 | :return:1-5 106 | ''' 107 | if abs(times) > 4:score = 5 108 | elif 3< abs(times) <= 4:score = 4 109 | elif 2< abs(times) <= 3:score = 3 110 | elif 1< abs(times) <= 2:score = 2 111 | elif 0< abs(times) <= 1:score = 1 112 | elif times ==0:score=0 113 | else:raise Exception('check times type. ERR') 114 | return score 115 | 116 | def M_eval(je): 117 | ''' 118 | 消费金额打分,得分越高贡献度越高 119 | :param je:金额 120 | :return:1-5 121 | ''' 122 | if abs(je) >= 2000:score = 5 123 | elif 900<= abs(je) < 2000: score = 4 124 | elif 370<= abs(je) < 900:score = 3 125 | elif 200<= abs(je) < 370:score = 2 126 | elif 0< abs(je) < 200:score = 1 127 | elif abs(je) == 0: score = 0 128 | else:raise Exception('check je type. ERR') 129 | return score 130 | 131 | def ZB_eval(zczb): 132 | ''' 133 | 注册资本打分 134 | :param zczb:注册资本 135 | :return:1-5 136 | ''' 137 | if abs(zczb) >= 1000:score = 5 138 | elif 500<= abs(zczb) < 1000:score = 4 139 | elif 100<= abs(zczb) < 500:score = 3 140 | elif 50<= abs(zczb) < 100:score = 2 141 | elif 0< abs(zczb) < 50:score = 1 142 | else:score = 0 143 | return score 144 | 145 | def AGE_eval(age): 146 | ''' 147 | 公司成立日期打分,成立较短的打高分 148 | :param age:成立年限 149 | :return:打分2,1 150 | ''' 151 | if abs(age) > 2:score = 1 152 | elif 0< abs(age) <= 2:score = 2 153 | elif abs(age) == 0: score = 0 154 | else:raise Exception('check age type. 
ERR') 155 | return score 156 | 157 | def INDUSTRY_eval(indestry): 158 | ''' 159 | 行业分类打分,为了突出餐饮住宿、制造业,打分 160 | :param indestry:行业分类 161 | :return:行业分类打分3,2,1 162 | ''' 163 | if indestry == '餐饮住宿':score = 3 164 | elif indestry == '制造业':score = 2 165 | else:score = 1 166 | return score 167 | 168 | def LOGINS_eval(logins): 169 | ''' 170 | 日均登录次数打分 171 | :param logins:日均登录次数 172 | :return:打分1-5 173 | ''' 174 | if abs(logins) >= 5:score = 5 175 | elif 4<= abs(logins) < 5:score = 4 176 | elif 3<= abs(logins) < 4:score = 3 177 | elif 2<= abs(logins) < 3:score = 2 178 | elif 0< abs(logins) < 2:score = 1 179 | else:score = 0 180 | return score 181 | 182 | def INVOICES_eval(invoices): 183 | ''' 184 | 日均开票次数打分 185 | :param invoices:日均开票次数 186 | :return:打分3,2,1 187 | ''' 188 | if abs(invoices) >= 10:score = 3 189 | elif 5<= abs(invoices) < 10:score = 2 190 | elif 0< abs(invoices) < 5:score = 1 191 | else:score = 0 192 | return score 193 | 194 | def INVOICEJE_eval(invoiceJe): 195 | ''' 196 | 日均开票金额打分 197 | :param invoiceJe: 日均开票金额 198 | :return: 分数 3,2,1 199 | ''' 200 | if abs(invoiceJe) >= 1000:score = 3 201 | elif 100<= abs(invoiceJe) < 1000:score = 2 202 | elif 0< abs(invoiceJe) < 100:score = 1 203 | else:score = 0 204 | return score 205 | 206 | def computeScore(scoreDic): 207 | ''' 208 | 根据每个用户评级dic 算出对产品的打分 209 | :param scoreDic:每个用户对商品的评级 210 | :return:对产品的打分 211 | ''' 212 | if len(scoreDic) <1:raise Exception('scoreDic is none') 213 | if scoreDic.get('dayLoginTimes') == None:scoreDic['dayLoginTimes'] = 0 214 | score = scoreDic['recently'] * 1000 + scoreDic['je'] * 100 + scoreDic['times'] * 100 + \ 215 | scoreDic['zczb'] * 10 + scoreDic['dayInvoiceNum'] * 10 + scoreDic['dayLoginTimes'] * 10 + \ 216 | scoreDic['dayInvoiceJe'] * 10 + scoreDic['ages'] * 10 + scoreDic['industry'] * 1 217 | return score 218 | 219 | def computeTotalScore(scoreDic): 220 | ''' 221 | 根据用户对所有商品的打分 222 | :param scoreDic:每个用户对所有商品的评级 223 | :return:对产品的打分 224 | index = ["times-wp","je-wp","recently-wp","times-bqt","je-bqt","recently-bqt","times-bft","je-bft","recently-bft","dayLoginTimes", 225 | "zczb","industry","ages","dayInvoiceNum","dayInvoiceJe"] 226 | ''' 227 | if len(scoreDic) <1:raise Exception('scoreDic is none') 228 | if scoreDic.get('dayLoginTimes') == None:scoreDic['dayLoginTimes'] = 0 229 | score_common = scoreDic['registeredCapital'] * 10 + scoreDic['dayCountAvg'] * 30 + scoreDic['loginFrequency'] * 30 + \ 230 | scoreDic['daySumAvg'] * 30 + scoreDic['dateOfEstablishment'] * 10 + scoreDic['industry'] * 10 231 | if scoreDic['deadline']==5: bft = 200 + (scoreDic['userConsumeTotalAmount'] + scoreDic['userConsumeTotalTimes'])* 50 232 | else:bft = 0 233 | if scoreDic['recently_bqt']==5: bqt = 100 + (scoreDic['je_bqt'] + scoreDic['times_bqt'])* 50 234 | else:bqt = 0 235 | if scoreDic['deadline_wp']==5: wp = 100 + (scoreDic['je_wp'] + scoreDic['times_wp'])* 50 236 | else:wp = 0 237 | score = bft + bqt + wp + score_common 238 | return score 239 | 240 | 241 | def evaluationTotal(row,index): 242 | ''' 243 | 对输入的用户特征进行评价 244 | :param row:用户特征数据 245 | :param index:选择的列数 246 | :return:打分 247 | ''' 248 | scoreDic = {} 249 | if len(row) != len(index): 250 | raise Exception('table header is incorrect') 251 | for i in index: 252 | if i == 'registeredCapital': 253 | zczb = int(row[i]) 254 | score = ZB_eval(zczb) 255 | scoreDic['registeredCapital'] = score 256 | elif i =='industry': 257 | industry = row[i] 258 | score = INDUSTRY_eval(industry) 259 | scoreDic['industry'] = score 260 | elif i =='je': 261 | je = float(row[i]) 262 | 
score = M_eval(je) 263 | scoreDic['je'] = score 264 | elif i =='times': 265 | times = int(row[i]) 266 | score = F_eval(times) 267 | scoreDic['times'] = score 268 | elif i =='dayLoginTimes': 269 | dayLoginTimes = float(row[i]) 270 | score = LOGINS_eval(dayLoginTimes) 271 | scoreDic['dayLoginTimes'] = score 272 | elif i =='dayInvoiceNum': 273 | dayInvoiceNum = float(row[i]) 274 | score = INVOICES_eval(dayInvoiceNum) 275 | scoreDic['dayInvoiceNum'] = score 276 | elif i =='dayInvoiceJe': 277 | dayInvoiceJe = row[i] 278 | score = INVOICEJE_eval(dayInvoiceJe) 279 | scoreDic['dayInvoiceJe'] = score 280 | elif i =='ages': 281 | ages = float(row[i]) 282 | score = AGE_eval(ages) 283 | scoreDic['ages'] = score 284 | elif i =='recently': 285 | recently = float(row[i]) 286 | score = R_eval(recently) 287 | scoreDic['recently'] = score 288 | # print(scoreDic) 289 | SCORE = computeScore(scoreDic) 290 | return SCORE,scoreDic 291 | 292 | def evaluationTotalProduct(row,index): 293 | ''' 294 | 对输入的用户所有特征进行评价 295 | :param row:用户特征数据 296 | :param index:选择的列数 297 | :return:打分 298 | ''' 299 | scoreDic = {} 300 | scoreArr = [] 301 | if len(row) != len(index): 302 | raise Exception('table header is incorrect') 303 | for i in index: 304 | if i == 'registeredCapital': 305 | try: 306 | zczb = float(row[i]) 307 | except: 308 | zczb = 0 309 | score = ZB_eval(zczb) 310 | scoreDic['registeredCapital'] = score 311 | elif i == 'industry': 312 | industry = row[i] 313 | score = INDUSTRY_eval(industry) 314 | scoreDic['industry'] = score 315 | elif i == 'je_wp': 316 | je = float(row[i]) 317 | score = M_eval(je) 318 | scoreDic['je_wp'] = score 319 | elif i == 'userConsumeTotalAmount': 320 | je = float(row[i]) 321 | score = M_eval(je) 322 | scoreDic['userConsumeTotalAmount'] = score 323 | elif i == 'je_bqt': 324 | je = float(row[i]) 325 | score = M_eval(je) 326 | scoreDic['je_bqt'] = score 327 | elif i == 'times_wp': 328 | times = int(row[i]) 329 | score = F_eval(times) 330 | scoreDic['times_wp'] = score 331 | elif i == 'userConsumeTotalTimes': 332 | times = int(row[i]) 333 | score = F_eval(times) 334 | scoreDic['userConsumeTotalTimes'] = score 335 | elif i == 'times_bqt': 336 | times = int(row[i]) 337 | score = F_eval(times) 338 | scoreDic['times_bqt'] = score 339 | elif i == 'loginFrequency': 340 | dayLoginTimes = float(row[i]) 341 | score = LOGINS_eval(dayLoginTimes) 342 | scoreDic['loginFrequency'] = score 343 | elif i == 'dayCountAvg': 344 | dayInvoiceNum = float(row[i]) 345 | score = INVOICES_eval(dayInvoiceNum) 346 | scoreDic['dayCountAvg'] = score 347 | elif i == 'daySumAvg': 348 | dayInvoiceJe = row[i] 349 | score = INVOICEJE_eval(dayInvoiceJe) 350 | scoreDic['daySumAvg'] = score 351 | elif i == 'dateOfEstablishment': 352 | ages = float(row[i]) 353 | score = AGE_eval(ages) 354 | scoreDic['dateOfEstablishment'] = score 355 | elif i == 'deadline_wp': 356 | recently = float(row[i]) 357 | score = R_eval(recently) 358 | scoreDic['deadline_wp'] = score 359 | elif i == 'deadline': 360 | recently = float(row[i]) 361 | score = R_eval(recently) 362 | scoreDic['deadline'] = score 363 | elif i == 'recently_bqt': 364 | recently = float(row[i]) 365 | score = R_eval(recently) 366 | scoreDic['recently_bqt'] = score 367 | else:raise Exception('please check index is exist! 
err:',row) 368 | scoreArr.append(score) 369 | # print(scoreDic) 370 | return scoreArr 371 | 372 | # if __name__ == '__main__': 373 | # # 准备数据 374 | # fname = r'D:\baiwang\21.数据运营\recommender\user_point\评价\方案2\bft_origin2.xlsx' 375 | # # 百赋通 376 | # index = ['tax_id','zczb','industry','je','times','dayLoginTimes','dayInvoiceNum','dayInvoiceJe','ages','recently'] 377 | # # 百企通,旺票 378 | # # index = ['tax_id','zczb','industry','times','je','recently','ages','dayInvoiceNum','dayInvoiceJe'] 379 | # # index = ['tax_id','zczb','industry','times','je','recently','ages','dayInvoiceNum','dayInvoiceJe'] 380 | # data_origin = loadExcel(fname,index) 381 | # 382 | # # 写入Excel 383 | # writer = pd.ExcelWriter('./bftClassfy.xlsx') 384 | # scoreArr = [] 385 | # TaxidArr = [] 386 | # zczbArr = [] 387 | # industryArr = [] 388 | # timesArr = [] 389 | # dayLoginTimesArr = [] 390 | # jeArr = [] 391 | # recentlyArr = [] 392 | # agesArr = [] 393 | # dayInvoiceNumArr = [] 394 | # dayInvoiceJeArr = [] 395 | # # 逐行遍历 396 | # for i in range(len(data_origin)): 397 | # row = data_origin.iloc[i] 398 | # score,scoreDic = evaluationTotal(row, index) 399 | # TaxidArr.append(row['tax_id']) 400 | # scoreArr.append(score) 401 | # zczbArr.append(scoreDic.get('zczb')) 402 | # industryArr.append(scoreDic.get('industry')) 403 | # timesArr.append(scoreDic.get('times')) 404 | # jeArr.append(scoreDic.get('je')) 405 | # recentlyArr.append(scoreDic.get('recently')) 406 | # dayLoginTimesArr.append(scoreDic.get('dayLoginTimes')) 407 | # agesArr.append(scoreDic.get('ages')) 408 | # dayInvoiceNumArr.append(scoreDic.get('dayInvoiceNum')) 409 | # dayInvoiceJeArr.append(scoreDic.get('dayInvoiceJe')) 410 | # print(row['tax_id'],'score is:',score,'socreDict is:',scoreDic) 411 | # pd_taxid = pd.DataFrame({'tax_id': TaxidArr}) 412 | # pd_score = pd.DataFrame({'score': scoreArr}) 413 | # pd_zczb = pd.DataFrame({'zczb': zczbArr}) 414 | # pd_industry = pd.DataFrame({'industry': industryArr}) 415 | # pd_times = pd.DataFrame({'times': timesArr}) 416 | # pd_je = pd.DataFrame({'je': jeArr}) 417 | # pd_recently = pd.DataFrame({'recently': recentlyArr}) 418 | # pd_dayLoginTimes = pd.DataFrame({'dayLoginTimes': dayLoginTimesArr}) 419 | # pd_ages = pd.DataFrame({'ages': agesArr}) 420 | # pd_dayInvoiceNum = pd.DataFrame({'dayInvoiceNum': dayInvoiceNumArr}) 421 | # pd_dayInvoiceJe = pd.DataFrame({'dayInvoiceJe': dayInvoiceJeArr}) 422 | # pd_taxid.to_excel(writer, sheet_name='Sheet1', startcol=0, index=False) 423 | # pd_score.to_excel(writer, sheet_name='Sheet1', startcol=1, index=False) 424 | # pd_zczb.to_excel(writer, sheet_name='Sheet1', startcol=2, index=False) 425 | # pd_industry.to_excel(writer, sheet_name='Sheet1', startcol=3, index=False) 426 | # pd_times.to_excel(writer, sheet_name='Sheet1', startcol=4, index=False) 427 | # pd_je.to_excel(writer, sheet_name='Sheet1', startcol=5, index=False) 428 | # pd_recently.to_excel(writer, sheet_name='Sheet1', startcol=6, index=False) 429 | # pd_dayLoginTimes.to_excel(writer, sheet_name='Sheet1', startcol=10, index=False) 430 | # pd_ages.to_excel(writer, sheet_name='Sheet1', startcol=7, index=False) 431 | # pd_dayInvoiceNum.to_excel(writer, sheet_name='Sheet1', startcol=8, index=False) 432 | # pd_dayInvoiceJe.to_excel(writer, sheet_name='Sheet1', startcol=9, index=False) 433 | # # 不加会报错 434 | # writer.save() 435 | -------------------------------------------------------------------------------- /user_point/giveScores.py: -------------------------------------------------------------------------------- 1 | 
#!/usr/bin/python3 2 | # __*__ coding: utf-8 __*__ 3 | 4 | ''' 5 | @Author: simonKing 6 | @License: (C) Copyright 2013-2019, Best Wonder Corporation Limited. 7 | @Os:Windows 10 x64 8 | @Contact: bw_wangxiaomeng@whty.com.cn 9 | @Software: PY PyCharm 10 | @File: giveScores.py 11 | @Time: 2019/7/3 9:24 12 | @Desc: define your function 13 | ''' 14 | 15 | ''' 16 | ############################################################### 17 | * 算法设计目的:根据用户使用数据,对百旺和百赋通打分,0~5 18 | * 算法流程:1、数据清洗;2、数据编码码;3、K-means生成得分 19 | * 算法输入:用户在百赋通使用数据,用户在百旺的消费数据 20 | * 算法输出:分别算出每个用户对百赋通和百旺的打分 21 | ############################################################### 22 | 23 | R,到期时间,得分越高,最高5分,最低1分 24 | 90天内购买;5 25 | 90-180天未购买;4 26 | 180-360天未购买;3 27 | 360-720;2 28 | 720以上;1 29 | 30 | 31 | F,交易频率越高,得分越高,最高5分,最低1分 32 | [1,24] ,平均1.1 33 | 1 -->1分; 34 | 2 -->2分; 35 | 3 -->3分; 36 | 4 -->4分; 37 | 5以上 -->5分; 38 | 39 | M,交易金额越高,得分越高,最高5分,最低1分,平均金额374 40 | 大于2000 --> 5; 41 | 900~2000 --> 4; 42 | 370~900 --> 3; 43 | 200~370 --> 2; 44 | 0~200 --> 1; 45 | 46 | 注册资本: 47 | 大于1000万 --> 5; 48 | 500-1000万 --> 4; 49 | 100-500万 --> 3; 50 | 50-100万 --> 2; 51 | 50万以内,未知 -->1; 52 | 53 | 成立日期: 54 | 两年内 --> 2; 55 | 两年后 --> 1; 56 | 57 | 行业分类: 58 | 餐饮住宿 --> 3; 59 | 制造业 --> 2; 60 | 其它 --> 1; 61 | 62 | 日均登录次数: 63 | 5次及以上 --> 5; 64 | 4次 --> 4; 65 | 3次 --> 3; 66 | 2次 --> 2; 67 | 0,1次 --> 1; 68 | 69 | 日均登录次数: 70 | 10次及以上 --> 3; 71 | 5-10次 --> 2; 72 | 0-5次 --> 1; 73 | 74 | 日均开票金额: 75 | 1000及以上 --> 3; 76 | 100-1000 --> 2; 77 | 100以内 --> 1; 78 | 79 | ''' 80 | 81 | import numpy as np 82 | import openpyxl 83 | import pandas as pd 84 | from sklearn.cluster import KMeans,MiniBatchKMeans 85 | from sklearn.decomposition import PCA 86 | import matplotlib.pyplot as plt 87 | from mpl_toolkits.mplot3d.axes3d import Axes3D 88 | 89 | 90 | 91 | 92 | def loadData(fname,splitchar='\t'): 93 | ''' 94 | 导入数据 95 | :param fname:文件名 96 | :param splitchar:字符间的分割符合 97 | :return:输入向量 98 | ''' 99 | f = open(fname) 100 | X = [[float(v.split(splitchar)[0].strip()), float(v.split(splitchar)[1].strip()),float(v.split(splitchar)[2].strip())] for v in f] 101 | # X = [[float(v.split(splitchar)[0].strip()), float(v.split(splitchar)[1].strip()), float(v.split(splitchar)[2].strip()), float(v.split(splitchar)[3].strip()) 102 | # , float(v.split(splitchar)[4].strip()), float(v.split(splitchar)[5].strip()), float(v.split(splitchar)[6].strip()), float(v.split(splitchar)[7].strip())] for v in f] 103 | X = np.array(X) 104 | return X 105 | 106 | def loadExcel(fname,index): 107 | ''' 108 | 从Excel 中导入数据 109 | :param fname:excel文件名 110 | :param index:导入的列 111 | :return:矩阵 112 | ''' 113 | df = pd.read_excel(fname) 114 | X = df.ix[:, index] 115 | # print(X) 116 | # print(type(X)) 117 | Input_X = np.array(X) 118 | return Input_X 119 | 120 | def insertExcel(fname,col,data): 121 | wb = openpyxl.load_workbook(fname) 122 | ws = wb.worksheets[0] 123 | # ws.insert_cols(col) 124 | for index,row in enumerate(ws.rows): 125 | if index ==0: 126 | row[col+1].value = '评分' 127 | else: 128 | # ws.rows 比data多出了1行,header 129 | row[col+1].value = data.tolist()[index-1] 130 | wb.save('./new.xlsx') 131 | 132 | def Kderiv(Input,n=2): 133 | ''' 134 | 求离散数列的2阶导数 135 | :param Input:输入离散数列 136 | :param n:n阶导数 137 | :return:二阶导数 138 | ''' 139 | fun = np.poly1d(Input) 140 | fun_1 = np.poly1d.deriv(fun) 141 | fun_2 = np.poly1d.deriv(fun_1) 142 | return fun_2 143 | 144 | def EmCalcu(data,labels,k): 145 | ''' 146 | 计算群体购买百赋通的数学期望 147 | :param data:原始数据 148 | :param labels:标签 149 | :param k:K均值 150 | :return: 151 | ''' 152 | bestBuy = 
[] 153 | for i in range(k): 154 | buy = 0 155 | nbuy = 0 156 | for j in range(len(labels)): 157 | if i == labels[j]: 158 | if data[j][-1] > 0: 159 | buy = buy + 1 160 | elif data[j][-1] ==0: 161 | nbuy = nbuy + 1 162 | else: 163 | raise Exception('label not right') 164 | idx = labels.tolist().index(i) 165 | buyTimes = buy+ nbuy 166 | buyRate = buy/buyTimes 167 | print('K is %s, label is %s , buyRate is %s ,buyTime is %s'%(k,i,buyRate,buyTimes)) 168 | print('example point:',data[idx]) 169 | print("***"*30) 170 | 171 | 172 | def choiceK(data): 173 | ''' 174 | 手肘法:采用MiniBatch方式寻找最佳的K值 175 | :param data:输入的数据集 176 | :return:最佳的K值 177 | ''' 178 | data = [row for row in data if row[1] > 0] 179 | raw_data = [row[:6] for row in data] 180 | sse = [] 181 | for k in range(1, 11): 182 | # estimator = KMeans(n_clusters=k) 183 | estimator = MiniBatchKMeans(init='k-means++', n_clusters=k, batch_size=100000, n_init=10, max_no_improvement=10, verbose=0) 184 | estimator.fit(raw_data) 185 | labels = estimator.labels_ 186 | EmCalcu(data,labels,k) 187 | sse.append(estimator.inertia_) 188 | # data2 = [] 189 | # for d in data.tolist(): 190 | # if d not in data2: 191 | # data2.append(d) 192 | # data2 = np.array(data2) 193 | # 显示每个Kmeans聚类效果 194 | # label_pred = estimator.labels_ 195 | # centroids = estimator.cluster_centers_ 196 | # plotKmeans(data, k, centroids, label_pred).subplot(330+k) 197 | X = range(1, 11) 198 | plt.xlabel('K') 199 | plt.ylabel('SSE') 200 | plt.title("choice best k value") 201 | plt.plot(X, sse, 'o-') 202 | plt.savefig('./shouzhou.png') 203 | plt.show() 204 | return 205 | 206 | def KmeansModel(data,k): 207 | ''' 208 | # 构造聚类器 209 | :param data: 输入训练集 210 | :param k:聚类的K值 211 | :return:聚类模型 212 | ''' 213 | estimator = KMeans(n_clusters=k) 214 | estimator.fit(data) 215 | return estimator 216 | 217 | def KmeansPredict(clf,Input): 218 | ''' 219 | Kmeans进行预测 220 | :param clf:Kmeans分类器 221 | :param Input:输入待测数据 222 | :return:分类标签 223 | ''' 224 | labels = clf.predict(Input) 225 | return labels 226 | 227 | def plotKmeans(dataSet,k,centroids,label_pred): 228 | ''' 229 | 用于绘制Kmeans聚类效果 230 | :param dataSet:输入数据集 231 | :param k:聚类的K值 232 | :param centroids:聚类的质心 233 | :param label_pred:预测的标签 234 | :return: 235 | ''' 236 | mark = [ '^r', '+b', 'sg', 'x', 'gini = 0.158
samples = 85315
value = [77980, 6724, 611]
class = low>, fillcolor="#e58139e7"] ; 5 | 1 [label=gini = 0.087
samples = 81726
value = [77980, 3746, 0]
class = low>, fillcolor="#e58139f3"] ; 6 | 0 -> 1 [labeldistance=2.5, labelangle=45, headlabel="True"] ; 7 | 2 [label=gini = 0.0
samples = 66828
value = [66815, 13, 0]
class = low>, fillcolor="#e58139ff"] ; 8 | 1 -> 2 ; 9 | 3 [label=samples = 66670
value = [66670, 0, 0]
class = low>, fillcolor="#e58139ff"] ; 10 | 2 -> 3 ; 11 | 4 [label=gini = 0.151
samples = 158
value = [145, 13, 0]
class = low>, fillcolor="#e58139e8"] ; 12 | 2 -> 4 ; 13 | 5 [label=gini = 0.052
samples = 149
value = [145, 4, 0]
class = low>, fillcolor="#e58139f8"] ; 14 | 4 -> 5 ; 15 | 6 [label=samples = 140
value = [140, 0, 0]
class = low>, fillcolor="#e58139ff"] ; 16 | 5 -> 6 ; 17 | 7 [label=gini = 0.494
samples = 9
value = [5, 4, 0]
class = low>, fillcolor="#e5813933"] ; 18 | 5 -> 7 ; 19 | 8 [label=samples = 3
value = [3, 0, 0]
class = low>, fillcolor="#e58139ff"] ; 20 | 7 -> 8 ; 21 | 9 [label=gini = 0.444
samples = 6
value = [2, 4, 0]
class = medium>, fillcolor="#39e5817f"] ; 22 | 7 -> 9 ; 23 | 10 [label=samples = 4
value = [0, 4, 0]
class = medium>, fillcolor="#39e581ff"] ; 24 | 9 -> 10 ; 25 | 11 [label=samples = 2
value = [2, 0, 0]
class = low>, fillcolor="#e58139ff"] ; 26 | 9 -> 11 ; 27 | 12 [label=samples = 9
value = [0, 9, 0]
class = medium>, fillcolor="#39e581ff"] ; 28 | 4 -> 12 ; 29 | 13 [label=gini = 0.376
samples = 14898
value = [11165, 3733, 0]
class = low>, fillcolor="#e58139aa"] ; 30 | 1 -> 13 ; 31 | 14 [label=samples = 11034
value = [11034, 0, 0]
class = low>, fillcolor="#e58139ff"] ; 32 | 13 -> 14 ; 33 | 15 [label=gini = 0.066
samples = 3864
value = [131, 3733, 0]
class = medium>, fillcolor="#39e581f6"] ; 34 | 13 -> 15 ; 35 | 16 [label=gini = 0.006
samples = 3744
value = [11, 3733, 0]
class = medium>, fillcolor="#39e581fe"] ; 36 | 15 -> 16 ; 37 | 17 [label=gini = 0.099
samples = 210
value = [11, 199, 0]
class = medium>, fillcolor="#39e581f1"] ; 38 | 16 -> 17 ; 39 | 18 [label=samples = 11
value = [11, 0, 0]
class = low>, fillcolor="#e58139ff"] ; 40 | 17 -> 18 ; 41 | 19 [label=samples = 199
value = [0, 199, 0]
class = medium>, fillcolor="#39e581ff"] ; 42 | 17 -> 19 ; 43 | 20 [label=samples = 3534
value = [0, 3534, 0]
class = medium>, fillcolor="#39e581ff"] ; 44 | 16 -> 20 ; 45 | 21 [label=samples = 120
value = [120, 0, 0]
class = low>, fillcolor="#e58139ff"] ; 46 | 15 -> 21 ; 47 | 22 [label=gini = 0.283
samples = 3589
value = [0, 2978, 611]
class = medium>, fillcolor="#39e581cb"] ; 48 | 0 -> 22 [labeldistance=2.5, labelangle=-45, headlabel="False"] ; 49 | 23 [label=gini = 0.085
samples = 3115
value = [0, 2976, 139]
class = medium>, fillcolor="#39e581f3"] ; 50 | 22 -> 23 ; 51 | 24 [label=gini = 0.007
samples = 2475
value = [0, 2466, 9]
class = medium>, fillcolor="#39e581fe"] ; 52 | 23 -> 24 ; 53 | 25 [label=samples = 2454
value = [0, 2454, 0]
class = medium>, fillcolor="#39e581ff"] ; 54 | 24 -> 25 ; 55 | 26 [label=gini = 0.49
samples = 21
value = [0, 12, 9]
class = medium>, fillcolor="#39e58140"] ; 56 | 24 -> 26 ; 57 | 27 [label=samples = 12
value = [0, 12, 0]
class = medium>, fillcolor="#39e581ff"] ; 58 | 26 -> 27 ; 59 | 28 [label=samples = 9
value = [0, 0, 9]
class = high>, fillcolor="#8139e5ff"] ; 60 | 26 -> 28 ; 61 | 29 [label=gini = 0.324
samples = 640
value = [0, 510, 130]
class = medium>, fillcolor="#39e581be"] ; 62 | 23 -> 29 ; 63 | 30 [label=gini = 0.132
samples = 506
value = [0, 470, 36]
class = medium>, fillcolor="#39e581eb"] ; 64 | 29 -> 30 ; 65 | 31 [label=gini = 0.086
samples = 489
value = [0, 467, 22]
class = medium>, fillcolor="#39e581f3"] ; 66 | 30 -> 31 ; 67 | 32 [label=gini = 0.054
samples = 434
value = [0, 422, 12]
class = medium>, fillcolor="#39e581f8"] ; 68 | 31 -> 32 ; 69 | 33 [label=gini = 0.025
samples = 399
value = [0, 394, 5]
class = medium>, fillcolor="#39e581fc"] ; 70 | 32 -> 33 ; 71 | 34 [label=gini = 0.021
samples = 386
value = [0, 382, 4]
class = medium>, fillcolor="#39e581fc"] ; 72 | 33 -> 34 ; 73 | 35 [label=gini = 0.013
samples = 301
value = [0, 299, 2]
class = medium>, fillcolor="#39e581fd"] ; 74 | 34 -> 35 ; 75 | 36 [label=samples = 168
value = [0, 168, 0]
class = medium>, fillcolor="#39e581ff"] ; 76 | 35 -> 36 ; 77 | 37 [label=gini = 0.03
samples = 133
value = [0, 131, 2]
class = medium>, fillcolor="#39e581fb"] ; 78 | 35 -> 37 ; 79 | 38 [label=gini = 0.165
samples = 11
value = [0, 10, 1]
class = medium>, fillcolor="#39e581e6"] ; 80 | 37 -> 38 ; 81 | 39 [label=samples = 6
value = [0, 6, 0]
class = medium>, fillcolor="#39e581ff"] ; 82 | 38 -> 39 ; 83 | 40 [label=gini = 0.32
samples = 5
value = [0, 4, 1]
class = medium>, fillcolor="#39e581bf"] ; 84 | 38 -> 40 ; 85 | 41 [label=samples = 2
value = [0, 2, 0]
class = medium>, fillcolor="#39e581ff"] ; 86 | 40 -> 41 ; 87 | 42 [label=gini = 0.444
samples = 3
value = [0, 2, 1]
class = medium>, fillcolor="#39e5817f"] ; 88 | 40 -> 42 ; 89 | 43 [label=samples = 2
value = [0, 1, 1]
class = medium>, fillcolor="#39e58100"] ; 90 | 42 -> 43 ; 91 | 44 [label=samples = 1
value = [0, 1, 0]
class = medium>, fillcolor="#39e581ff"] ; 92 | 42 -> 44 ; 93 | 45 [label=gini = 0.016
samples = 122
value = [0, 121, 1]
class = medium>, fillcolor="#39e581fd"] ; 94 | 37 -> 45 ; 95 | 46 [label=samples = 35
value = [0, 35, 0]
class = medium>, fillcolor="#39e581ff"] ; 96 | 45 -> 46 ; 97 | 47 [label=gini = 0.023
samples = 87
value = [0, 86, 1]
class = medium>, fillcolor="#39e581fc"] ; 98 | 45 -> 47 ; 99 | 48 [label=samples = 22
value = [0, 22, 0]
class = medium>, fillcolor="#39e581ff"] ; 100 | 47 -> 48 ; 101 | 49 [label=gini = 0.03
samples = 65
value = [0, 64, 1]
class = medium>, fillcolor="#39e581fb"] ; 102 | 47 -> 49 ; 103 | 50 [label=gini = 0.033
samples = 60
value = [0, 59, 1]
class = medium>, fillcolor="#39e581fb"] ; 104 | 49 -> 50 ; 105 | 51 [label=samples = 59
value = [0, 58, 1]
class = medium>, fillcolor="#39e581fb"] ; 106 | 50 -> 51 ; 107 | 52 [label=samples = 1
value = [0, 1, 0]
class = medium>, fillcolor="#39e581ff"] ; 108 | 50 -> 52 ; 109 | 53 [label=samples = 5
value = [0, 5, 0]
class = medium>, fillcolor="#39e581ff"] ; 110 | 49 -> 53 ; 111 | 54 [label=gini = 0.046
samples = 85
value = [0, 83, 2]
class = medium>, fillcolor="#39e581f9"] ; 112 | 34 -> 54 ; 113 | 55 [label=gini = 0.059
samples = 66
value = [0, 64, 2]
class = medium>, fillcolor="#39e581f7"] ; 114 | 54 -> 55 ; 115 | 56 [label=gini = 0.1
samples = 19
value = [0, 18, 1]
class = medium>, fillcolor="#39e581f1"] ; 116 | 55 -> 56 ; 117 | 57 [label=samples = 6
value = [0, 6, 0]
class = medium>, fillcolor="#39e581ff"] ; 118 | 56 -> 57 ; 119 | 58 [label=samples = 13
value = [0, 12, 1]
class = medium>, fillcolor="#39e581ea"] ; 120 | 56 -> 58 ; 121 | 59 [label=gini = 0.042
samples = 47
value = [0, 46, 1]
class = medium>, fillcolor="#39e581f9"] ; 122 | 55 -> 59 ; 123 | 60 [label=samples = 27
value = [0, 26, 1]
class = medium>, fillcolor="#39e581f5"] ; 124 | 59 -> 60 ; 125 | 61 [label=samples = 20
value = [0, 20, 0]
class = medium>, fillcolor="#39e581ff"] ; 126 | 59 -> 61 ; 127 | 62 [label=samples = 19
value = [0, 19, 0]
class = medium>, fillcolor="#39e581ff"] ; 128 | 54 -> 62 ; 129 | 63 [label=gini = 0.142
samples = 13
value = [0, 12, 1]
class = medium>, fillcolor="#39e581ea"] ; 130 | 33 -> 63 ; 131 | 64 [label=samples = 4
value = [0, 4, 0]
class = medium>, fillcolor="#39e581ff"] ; 132 | 63 -> 64 ; 133 | 65 [label=gini = 0.198
samples = 9
value = [0, 8, 1]
class = medium>, fillcolor="#39e581df"] ; 134 | 63 -> 65 ; 135 | 66 [label=samples = 2
value = [0, 2, 0]
class = medium>, fillcolor="#39e581ff"] ; 136 | 65 -> 66 ; 137 | 67 [label=gini = 0.245
samples = 7
value = [0, 6, 1]
class = medium>, fillcolor="#39e581d4"] ; 138 | 65 -> 67 ; 139 | 68 [label=samples = 1
value = [0, 1, 0]
class = medium>, fillcolor="#39e581ff"] ; 140 | 67 -> 68 ; 141 | 69 [label=gini = 0.278
samples = 6
value = [0, 5, 1]
class = medium>, fillcolor="#39e581cc"] ; 142 | 67 -> 69 ; 143 | 70 [label=samples = 5
value = [0, 4, 1]
class = medium>, fillcolor="#39e581bf"] ; 144 | 69 -> 70 ; 145 | 71 [label=samples = 1
value = [0, 1, 0]
class = medium>, fillcolor="#39e581ff"] ; 146 | 69 -> 71 ; 147 | 72 [label=gini = 0.32
samples = 35
value = [0, 28, 7]
class = medium>, fillcolor="#39e581bf"] ; 148 | 32 -> 72 ; 149 | 73 [label=samples = 28
value = [0, 28, 0]
class = medium>, fillcolor="#39e581ff"] ; 150 | 72 -> 73 ; 151 | 74 [label=samples = 7
value = [0, 0, 7]
class = high>, fillcolor="#8139e5ff"] ; 152 | 72 -> 74 ; 153 | 75 [label=gini = 0.298
samples = 55
value = [0, 45, 10]
class = medium>, fillcolor="#39e581c6"] ; 154 | 31 -> 75 ; 155 | 76 [label=gini = 0.26
samples = 52
value = [0, 44, 8]
class = medium>, fillcolor="#39e581d1"] ; 156 | 75 -> 76 ; 157 | 77 [label=samples = 15
value = [0, 15, 0]
class = medium>, fillcolor="#39e581ff"] ; 158 | 76 -> 77 ; 159 | 78 [label=gini = 0.339
samples = 37
value = [0, 29, 8]
class = medium>, fillcolor="#39e581b9"] ; 160 | 76 -> 78 ; 161 | 79 [label=gini = 0.313
samples = 36
value = [0, 29, 7]
class = medium>, fillcolor="#39e581c1"] ; 162 | 78 -> 79 ; 163 | 80 [label=gini = 0.342
samples = 32
value = [0, 25, 7]
class = medium>, fillcolor="#39e581b8"] ; 164 | 79 -> 80 ; 165 | 81 [label=samples = 12
value = [0, 10, 2]
class = medium>, fillcolor="#39e581cc"] ; 166 | 80 -> 81 ; 167 | 82 [label=samples = 20
value = [0, 15, 5]
class = medium>, fillcolor="#39e581aa"] ; 168 | 80 -> 82 ; 169 | 83 [label=samples = 4
value = [0, 4, 0]
class = medium>, fillcolor="#39e581ff"] ; 170 | 79 -> 83 ; 171 | 84 [label=samples = 1
value = [0, 0, 1]
class = high>, fillcolor="#8139e5ff"] ; 172 | 78 -> 84 ; 173 | 85 [label=gini = 0.444
samples = 3
value = [0, 1, 2]
class = high>, fillcolor="#8139e57f"] ; 174 | 75 -> 85 ; 175 | 86 [label=samples = 2
value = [0, 1, 1]
class = medium>, fillcolor="#39e58100"] ; 176 | 85 -> 86 ; 177 | 87 [label=samples = 1
value = [0, 0, 1]
class = high>, fillcolor="#8139e5ff"] ; 178 | 85 -> 87 ; 179 | 88 [label=gini = 0.291
samples = 17
value = [0, 3, 14]
class = high>, fillcolor="#8139e5c8"] ; 180 | 30 -> 88 ; 181 | 89 [label=gini = 0.5
samples = 6
value = [0, 3, 3]
class = medium>, fillcolor="#39e58100"] ; 182 | 88 -> 89 ; 183 | 90 [label=gini = 0.48
samples = 5
value = [0, 3, 2]
class = medium>, fillcolor="#39e58155"] ; 184 | 89 -> 90 ; 185 | 91 [label=samples = 1
value = [0, 1, 0]
class = medium>, fillcolor="#39e581ff"] ; 186 | 90 -> 91 ; 187 | 92 [label=gini = 0.5
samples = 4
value = [0, 2, 2]
class = medium>, fillcolor="#39e58100"] ; 188 | 90 -> 92 ; 189 | 93 [label=gini = 0.444
samples = 3
value = [0, 1, 2]
class = high>, fillcolor="#8139e57f"] ; 190 | 92 -> 93 ; 191 | 94 [label=samples = 2
value = [0, 1, 1]
class = medium>, fillcolor="#39e58100"] ; 192 | 93 -> 94 ; 193 | 95 [label=samples = 1
value = [0, 0, 1]
class = high>, fillcolor="#8139e5ff"] ; 194 | 93 -> 95 ; 195 | 96 [label=samples = 1
value = [0, 1, 0]
class = medium>, fillcolor="#39e581ff"] ; 196 | 92 -> 96 ; 197 | 97 [label=samples = 1
value = [0, 0, 1]
class = high>, fillcolor="#8139e5ff"] ; 198 | 89 -> 97 ; 199 | 98 [label=samples = 11
value = [0, 0, 11]
class = high>, fillcolor="#8139e5ff"] ; 200 | 88 -> 98 ; 201 | 99 [label=gini = 0.419
samples = 134
value = [0, 40, 94]
class = high>, fillcolor="#8139e592"] ; 202 | 29 -> 99 ; 203 | 100 [label=gini = 0.399
samples = 40
value = [0, 29, 11]
class = medium>, fillcolor="#39e5819e"] ; 204 | 99 -> 100 ; 205 | 101 [label=gini = 0.238
samples = 29
value = [0, 25, 4]
class = medium>, fillcolor="#39e581d6"] ; 206 | 100 -> 101 ; 207 | 102 [label=gini = 0.191
samples = 28
value = [0, 25, 3]
class = medium>, fillcolor="#39e581e0"] ; 208 | 101 -> 102 ; 209 | 103 [label=gini = 0.091
samples = 21
value = [0, 20, 1]
class = medium>, fillcolor="#39e581f2"] ; 210 | 102 -> 103 ; 211 | 104 [label=samples = 19
value = [0, 19, 0]
class = medium>, fillcolor="#39e581ff"] ; 212 | 103 -> 104 ; 213 | 105 [label=gini = 0.5
samples = 2
value = [0, 1, 1]
class = medium>, fillcolor="#39e58100"] ; 214 | 103 -> 105 ; 215 | 106 [label=samples = 1
value = [0, 1, 0]
class = medium>, fillcolor="#39e581ff"] ; 216 | 105 -> 106 ; 217 | 107 [label=samples = 1
value = [0, 0, 1]
class = high>, fillcolor="#8139e5ff"] ; 218 | 105 -> 107 ; 219 | 108 [label=gini = 0.408
samples = 7
value = [0, 5, 2]
class = medium>, fillcolor="#39e58199"] ; 220 | 102 -> 108 ; 221 | 109 [label=samples = 5
value = [0, 5, 0]
class = medium>, fillcolor="#39e581ff"] ; 222 | 108 -> 109 ; 223 | 110 [label=samples = 2
value = [0, 0, 2]
class = high>, fillcolor="#8139e5ff"] ; 224 | 108 -> 110 ; 225 | 111 [label=samples = 1
value = [0, 0, 1]
class = high>, fillcolor="#8139e5ff"] ; 226 | 101 -> 111 ; 227 | 112 [label=gini = 0.463
samples = 11
value = [0, 4, 7]
class = high>, fillcolor="#8139e56d"] ; 228 | 100 -> 112 ; 229 | 113 [label=gini = 0.444
samples = 6
value = [0, 4, 2]
class = medium>, fillcolor="#39e5817f"] ; 230 | 112 -> 113 ; 231 | 114 [label=samples = 4
value = [0, 4, 0]
class = medium>, fillcolor="#39e581ff"] ; 232 | 113 -> 114 ; 233 | 115 [label=samples = 2
value = [0, 0, 2]
class = high>, fillcolor="#8139e5ff"] ; 234 | 113 -> 115 ; 235 | 116 [label=samples = 5
value = [0, 0, 5]
class = high>, fillcolor="#8139e5ff"] ; 236 | 112 -> 116 ; 237 | 117 [label=gini = 0.207
samples = 94
value = [0, 11, 83]
class = high>, fillcolor="#8139e5dd"] ; 238 | 99 -> 117 ; 239 | 118 [label=gini = 0.359
samples = 47
value = [0, 11, 36]
class = high>, fillcolor="#8139e5b1"] ; 240 | 117 -> 118 ; 241 | 119 [label=gini = 0.477
samples = 28
value = [0, 11, 17]
class = high>, fillcolor="#8139e55a"] ; 242 | 118 -> 119 ; 243 | 120 [label=gini = 0.43
samples = 16
value = [0, 11, 5]
class = medium>, fillcolor="#39e5818b"] ; 244 | 119 -> 120 ; 245 | 121 [label=gini = 0.165
samples = 11
value = [0, 10, 1]
class = medium>, fillcolor="#39e581e6"] ; 246 | 120 -> 121 ; 247 | 122 [label=gini = 0.32
samples = 5
value = [0, 4, 1]
class = medium>, fillcolor="#39e581bf"] ; 248 | 121 -> 122 ; 249 | 123 [label=samples = 3
value = [0, 3, 0]
class = medium>, fillcolor="#39e581ff"] ; 250 | 122 -> 123 ; 251 | 124 [label=gini = 0.5
samples = 2
value = [0, 1, 1]
class = medium>, fillcolor="#39e58100"] ; 252 | 122 -> 124 ; 253 | 125 [label=samples = 1
value = [0, 0, 1]
class = high>, fillcolor="#8139e5ff"] ; 254 | 124 -> 125 ; 255 | 126 [label=samples = 1
value = [0, 1, 0]
class = medium>, fillcolor="#39e581ff"] ; 256 | 124 -> 126 ; 257 | 127 [label=samples = 6
value = [0, 6, 0]
class = medium>, fillcolor="#39e581ff"] ; 258 | 121 -> 127 ; 259 | 128 [label=gini = 0.32
samples = 5
value = [0, 1, 4]
class = high>, fillcolor="#8139e5bf"] ; 260 | 120 -> 128 ; 261 | 129 [label=samples = 1
value = [0, 1, 0]
class = medium>, fillcolor="#39e581ff"] ; 262 | 128 -> 129 ; 263 | 130 [label=samples = 4
value = [0, 0, 4]
class = high>, fillcolor="#8139e5ff"] ; 264 | 128 -> 130 ; 265 | 131 [label=samples = 12
value = [0, 0, 12]
class = high>, fillcolor="#8139e5ff"] ; 266 | 119 -> 131 ; 267 | 132 [label=samples = 19
value = [0, 0, 19]
class = high>, fillcolor="#8139e5ff"] ; 268 | 118 -> 132 ; 269 | 133 [label=samples = 47
value = [0, 0, 47]
class = high>, fillcolor="#8139e5ff"] ; 270 | 117 -> 133 ; 271 | 134 [label=gini = 0.008
samples = 474
value = [0, 2, 472]
class = high>, fillcolor="#8139e5fe"] ; 272 | 22 -> 134 ; 273 | 135 [label=samples = 2
value = [0, 2, 0]
class = medium>, fillcolor="#39e581ff"] ; 274 | 134 -> 135 ; 275 | 136 [label=samples = 472
value = [0, 0, 472]
class = high>, fillcolor="#8139e5ff"] ; 276 | 134 -> 136 ; 277 | } -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # __*__ coding: utf-8 __*__ 3 | 4 | ''' 5 | @Author: simonKing 6 | @License: (C) Copyright 2013-2019, Best Wonder Corporation Limited. 7 | @Os:Windows 10 x64 8 | @Contact: bw_wangxiaomeng@whty.com.cn 9 | @Software: PY PyCharm 10 | @File: __init__.py.py 11 | @Time: 2019/6/25 10:37 12 | @Desc: define your function 13 | ''' -------------------------------------------------------------------------------- /utils/cutScopebusiness.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # __*__ coding: utf-8 __*__ 3 | 4 | ''' 5 | @Author: simonKing 6 | @License: (C) Copyright 2013-2019, Best Wonder Corporation Limited. 7 | @Os:Windows 10 x64 8 | @Contact: bw_wangxiaomeng@whty.com.cn 9 | @Software: PY PyCharm 10 | @File: cutScopebusiness.py 11 | @Time: 2019/6/25 10:56 12 | @Desc: define your function 13 | ''' 14 | 15 | ''' 16 | 行业分类参考:https://blog.csdn.net/chenhuamain/article/details/84579667 17 | 农林牧渔 1 18 | 制造业 2 19 | 卫生医疗 3 20 | 商务服务 4 21 | 居民服务 5 22 | 建筑产业 6 23 | 房地产业 7 24 | 教育培训 8 25 | 文体娱乐 9 26 | 电信通讯 10 27 | 科学技术 11 28 | 租赁服务 12 29 | 维修服务 13 30 | 设计服务 14 31 | 运输物流 15 32 | 采矿工业 16 33 | 金融服务 17 34 | 餐饮住宿 18 35 | ''' 36 | import pymysql 37 | 38 | class MysqlOperate: 39 | def __init__(self): 40 | self.db = pymysql.connect("192.168.5.135", "root", "000000", "platform") 41 | self.cursor = self.db.cursor() 42 | pass 43 | 44 | def read_data(self,sql): 45 | self.cursor.execute(sql) 46 | # datas = self.cursor.fetchmany(100) 47 | datas = self.cursor.fetchall() 48 | # print(datas) 49 | self.db.close() 50 | return datas 51 | 52 | def update_data(self,sql): 53 | self.cursor.execute(sql) 54 | self.db.commit() 55 | return 56 | -------------------------------------------------------------------------------- /utils/fasttextClassfy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # __*__ coding: utf-8 __*__ 3 | 4 | ''' 5 | @Author: simonKing 6 | @License: (C) Copyright 2013-2019, Best Wonder Corporation Limited. 
7 | @Os:Windows 10 x64 8 | @Contact: bw_wangxiaomeng@whty.com.cn 9 | @Software: PY PyCharm 10 | @File: fasttextClassfy.py 11 | @Time: 2019/6/25 10:47 12 | @Desc: define your function 13 | ''' 14 | import jieba 15 | import random 16 | import jieba.posseg as pseg 17 | from sklearn.svm import SVC 18 | from sklearn.naive_bayes import MultinomialNB 19 | from sklearn.neighbors import KNeighborsClassifier 20 | from utils.cutScopebusiness import MysqlOperate 21 | from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer 22 | 23 | 24 | # 朴素贝叶斯算法 25 | def nb_model(train, train_label, test, test_label): 26 | clf_model = MultinomialNB(alpha=0.01) 27 | clf_model.fit(train, train_label) 28 | predict_results = clf_model.predict(test) 29 | 30 | count = 0 31 | predict_list = predict_results.tolist() 32 | for i, pred in enumerate(predict_list): 33 | if (pred == test_label[i]): 34 | count += 1 35 | 36 | print("nb_model_precision_score: " + str(float(count) / len(predict_list))) 37 | 38 | # 创建朴素贝叶斯分类模型 39 | def nb_classfy_model(train, train_label,inputs): 40 | ''' 41 | :param train: 训练语料 42 | :param train_label: 训练标签 43 | :param inputs:待预测预料 44 | :return:预测结果 45 | ''' 46 | clf_model = MultinomialNB(alpha=0.01) 47 | clf_model.fit(train, train_label) 48 | predict_results = clf_model.predict(inputs) 49 | # print('贝叶斯预测结果:',predict_results) 50 | predict_results = predict_results[0].split('__label__')[1] 51 | return predict_results 52 | 53 | 54 | # K近邻算法 55 | def knn_model(train, train_label, test, test_label): 56 | knn_model = KNeighborsClassifier(n_neighbors=8) 57 | knn_model.fit(train, train_label) 58 | predict_results = knn_model.predict(test) 59 | 60 | count = 0 61 | predict_list = predict_results.tolist() 62 | for i, pred in enumerate(predict_list): 63 | if (pred == test_label[i]): 64 | count += 1 65 | 66 | print("knn_model_precision_score: " + str(float(count) / len(predict_list))) 67 | 68 | 69 | # 支持向量机算法 70 | def svm_model(train, train_label, test, test_label): 71 | svm_clf = SVC(kernel="linear", verbose=False) 72 | svm_clf.fit(train, train_label) 73 | predict_results = svm_clf.predict(test) 74 | 75 | count = 0 76 | predict_list = predict_results.tolist() 77 | for i, pred in enumerate(predict_list): 78 | if (pred == test_label[i]): 79 | count += 1 80 | print("svm_model_precision_score: " + str(float(count) / len(predict_list))) 81 | 82 | 83 | # 使用传统方法的文本分类 84 | def text_classification(inputs): 85 | count = 0 86 | test_text_list = [] 87 | train_text_list = [] 88 | test_label_list = [] 89 | train_label_list = [] 90 | total_text_list = [] 91 | total_label_list = [] 92 | # 待测 93 | inputs_text_list = [] 94 | inputs_text_list.append(inputs) 95 | 96 | print("start loading data...") 97 | finput = open("../data/data_train.txt", encoding='utf-8') 98 | for line in finput: 99 | count += 1 100 | text_array = line.split("\\t", 1) 101 | if (len(text_array) != 2): 102 | continue 103 | 104 | # 保存全部样本 105 | total_text_list.append(text_array[1]) 106 | total_label_list.append(text_array[0]) 107 | 108 | # 划分训练集和测试集 109 | probability = random.random() 110 | if (probability > 0.1): 111 | train_text_list.append(text_array[1]) 112 | train_label_list.append(text_array[0]) 113 | else: 114 | test_text_list.append(text_array[1]) 115 | test_label_list.append(text_array[0]) 116 | finput.close() 117 | print("load data is finished...") 118 | 119 | print("start building vector model...") 120 | # 构建词典 121 | vec_total = CountVectorizer() 122 | vec_total.fit_transform(total_text_list) 123 | 124 | # 基于构建的词典分别统计训练集/测试集词频, 
即每个词出现1次、2次、3次等 125 | vec_train = CountVectorizer(vocabulary=vec_total.vocabulary_) 126 | tf_train = vec_train.fit_transform(train_text_list) 127 | 128 | vec_test = CountVectorizer(vocabulary=vec_total.vocabulary_) 129 | tf_test = vec_test.fit_transform(test_text_list) 130 | 131 | vec_inputs = CountVectorizer(vocabulary=vec_total.vocabulary_) 132 | tf_inputs = vec_inputs.fit_transform(inputs_text_list) 133 | 134 | # 进一步计算词频-逆文档频率 135 | tfidftransformer = TfidfTransformer() 136 | tfidf_train = tfidftransformer.fit(tf_train).transform(tf_train) 137 | tfidf_test = tfidftransformer.fit(tf_test).transform(tf_test) 138 | tfidf_inputs = tfidftransformer.fit(tf_inputs).transform(tf_inputs) 139 | print("building vector model is finished...") 140 | 141 | # 朴素贝叶斯算法 142 | nb_model(tfidf_train, train_label_list, tfidf_test, test_label_list) 143 | predict_result = nb_classfy_model(tfidf_train, train_label_list,tfidf_inputs) 144 | # K近邻算法 145 | # knn_model(tfidf_train, train_label_list, tfidf_test, test_label_list) 146 | # 支持向量机算法 147 | # svm_model(tfidf_train, train_label_list, tfidf_test, test_label_list) 148 | print("building predict model is finished...") 149 | return predict_result 150 | 151 | 152 | industry_dict = ['农林牧渔','制造业','卫生医疗','商务服务','居民服务','建筑产业','房地产业','教育培训','文体娱乐', 153 | '电信通讯','科学技术','租赁服务','维修服务','设计服务','运输物流','采矿工业','金融服务','餐饮住宿'] 154 | 155 | 156 | if __name__ == '__main__': 157 | print("贝叶斯文本分类...") 158 | stopword = ['、', ';', ', ', '。', '(', ',', ')', '++', '**', '*','[ ',']','【','】',':','国务院','部门','国家' 159 | ,':','法律','法规','的','规定','决定','许可','批准','禁止不得','经营应当','审批经','审批',';无需市场主体','经营','机关后','自主', 160 | '选择','开展','活动','后方','相关','经','可','须','取得','无需市场主体',';','(',' )','有限公司'] 161 | sql = "select enterpriseName,scopeOfBusiness,uniformSocialCreditCode from basic_info" 162 | 163 | # sql = "select * from basic_info" 164 | datas = MysqlOperate().read_data(sql) 165 | for data in datas: 166 | # inputs = '科技企业孵化;投资管理;高新技术开发、技术咨询、技术服务;计算机技术培训;出租办公用房;设计、制作、代理、发布国内各类广告;会议会展服务;企业咨询服务(不含民间借贷中介及证券、期货、保险、金融投资信息咨询);企业管理服务;企业营销策划;教育咨询服务(依法须经批准的项目,经相关部门批准后方可开展经营活动)' 167 | inputs = data[1] 168 | inputs_array_list = [] 169 | names = [w for w,f in pseg.cut(data[0]) if f !='ns' ] 170 | for word in jieba.lcut(inputs) + names: 171 | if word not in stopword: 172 | if len(word) >1: 173 | inputs_array_list.append(word) 174 | inputs = ' '.join(inputs_array_list) 175 | print('企业名称:%s,输入语料:%s'%(data[0],inputs)) 176 | predict = int(text_classification(inputs)) -1 177 | labels = industry_dict[predict] 178 | print('labels:',labels) 179 | query = "update basic_info set industry = '%s' where uniformSocialCreditCode = '%s'"%(labels,data[2]) 180 | MysqlOperate().update_data(query) 181 | print("\n----------------------------------------------") -------------------------------------------------------------------------------- /utils/joinExcelByIndex.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # __*__ coding: utf-8 __*__ 3 | 4 | ''' 5 | @Author: simonKing 6 | @License: (C) Copyright 2013-2019, Best Wonder Corporation Limited. 
7 | @Os:Windows 10 x64 8 | @Contact: bw_wangxiaomeng@whty.com.cn 9 | @Software: PY PyCharm 10 | @File: joinExcelByIndex.py 11 | @Time: 2019/7/4 10:43 12 | @Desc: define your function 13 | ''' 14 | 15 | import pandas as pd 16 | from pandas import DataFrame 17 | 18 | 19 | def concat_excels(xlsx1,xlsx2,index): 20 | ''' 21 | 根据字段名合并表格 22 | :param xlsx1:输入excel表格1; 23 | :param xlsx2:输入excel表格2; 24 | :param index:合并的字段名 25 | :return:None,合并后的表格写入 ../user_point/tice20190717.xlsx 26 | ''' 27 | if '.xlsx' not in xlsx1: 28 | raise Exception('输入文件类型有误!') 29 | if '.xlsx' not in xlsx2: 30 | raise Exception('输入文件类型有误!') 31 | data1 = pd.read_excel(xlsx1, sheet_name='Sheet1', dtype={index: str}) 32 | df_obj1 = DataFrame(data1) 33 | data2 = pd.read_excel(xlsx2, sheet_name='Sheet1', dtype={index: str}) 34 | data2 = data2.drop_duplicates([index]) 35 | df_obj2 = DataFrame(data2) 36 | 37 | excel = pd.merge(df_obj1, df_obj2, on=index,how='outer') 38 | excel_list = [excel] 39 | total_excel = pd.concat(excel_list) 40 | # total_excel = excel_list.set_index('cate_tp').T.to_dict('list') 41 | total_excel.to_excel('../user_point/tice20190717.xlsx', index=False) 42 | return 43 | 44 | # if __name__ =='__main__': 45 | # pass 46 | # index = 'tax_id' 47 | # xs1 = r'D:\invoice1.xlsx' 48 | # xs2 = r'D:\invoice2.xlsx' 49 | # concat_excels(xs1,xs2,index) --------------------------------------------------------------------------------