├── README.md
├── data
│   ├── DataModel.xlsx
│   ├── DataRaw.xlsx
│   └── 注册资本Ranks
├── data_explore
│   ├── __init__.py
│   └── correlationAnalysic.py
├── feature_proj
│   ├── __init__.py
│   ├── extractFeatures.py
│   └── featureSelector.py
├── pictures
│   ├── 决策树.png
│   ├── 手肘法.png
│   ├── 推荐架构.png
│   ├── 某赋通数学期望.png
│   ├── 特征相关性计算.png
│   ├── 特征累计重要性计算.png
│   ├── 特征重要性计算.png
│   ├── 算法选型.png
│   └── 聚类分群.png
├── recommender.py
├── requirements.txt
├── setting.py
├── user_point
│   ├── __init__.py
│   ├── evaluationFunction.py
│   └── giveScores.py
├── user_recom
│   ├── DecisionTree.py
│   ├── __init__.py
│   └── decesionTree.dot
└── utils
    ├── __init__.py
    ├── cutScopebusiness.py
    ├── fasttextClassfy.py
    └── joinExcelByIndex.py
/README.md:
--------------------------------------------------------------------------------
# recommender

### Recommendation Model Overview
- Goal: there are three products, 某企通, 某赋通 and 某票. The aim is to use recommendations to raise the conversion rate of users from 某企通 and 某票 to the 某赋通 product.
- Data: the user bases are roughly 某企通 : 某赋通 : 某票 = 100 : 10 : 1.
- Algorithm selection: the currently popular Item_CF and User_CF were tried first, but the resulting recommendations were unsatisfactory; the selection process is shown below:

- Recommendation model: an RFM model, a decision-tree model and the domain knowledge of experienced operations staff were combined; the overall recommendation architecture is shown below:


### Expected Value of Purchasing 某赋通
- Core idea: segment the 某企通 users with K-means clustering, using the elbow method while sweeping the number of clusters from 2, 3, 4 ... up to 10. For each cluster count, compute the expected value of purchasing 某赋通 within every segment; the goal is to find the segment with the highest expectation and target it with profile-based recommendations.
- Segmentation results: as the number of segments grows, the expected value of purchasing 某赋通 evolves as shown below:
![某赋通数学期望](pictures/某赋通数学期望.png)
- Conclusion: the expected value of purchasing 某赋通 is low in every segment, which also confirms that collaborative filtering is a poor fit here: the data is dominated by users who have not purchased, so the imbalance between the purchasing and non-purchasing samples has to be addressed. A minimal clustering sketch follows.
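The clustering code itself is not included in this repository, so the following is only a rough sketch of the idea using scikit-learn's KMeans; the input file, the feature columns and the `bought_bft` purchase flag are illustrative assumptions rather than names taken from the data.

```python
# Hypothetical sketch: elbow-method K-means segmentation plus the per-segment
# expectation of buying 某赋通. Column names below are assumed for illustration only.
import pandas as pd
from sklearn.cluster import KMeans

df = pd.read_excel("./data/DataModel.xlsx")                 # assumed input file
features = df[["loginFrequency", "dayCountAvg", "daySumAvg"]].fillna(0)

for k in range(2, 11):                                      # sweep 2 ... 10 clusters
    km = KMeans(n_clusters=k, random_state=0).fit(features)
    df["cluster"] = km.labels_
    # expectation of purchase = mean of the 0/1 purchase flag within each cluster
    expectation = df.groupby("cluster")["bought_bft"].mean()
    print(k, "inertia:", round(km.inertia_, 2), "best segment expectation:", round(expectation.max(), 4))
```

Plotting `inertia` against `k` gives the elbow curve (手肘法.png), and the per-segment expectations correspond to 某赋通数学期望.png.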

### Approach
A deliberately weak classifier, a decision tree, is used together with the traditional RFM model to generate user recommendations; a rough sketch of the decision-tree step is given below.
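The production tree lives in user_recom/DecisionTree.py (not shown here). Purely as an illustration of the approach, a Gini-based tree over the rated features might be trained roughly like this; the `flag` label column follows extractFeatures.py, everything else is an assumption.

```python
# Rough sketch only: train an unpruned Gini decision tree on the rated features.
# The real model is built in user_recom/DecisionTree.py and loaded by recommender.py.
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

train = pd.read_excel("./data/DataModel.xlsx")     # Mname in setting.py; assumed to hold rated features plus a flag label
X = train.drop(columns=["flag"]).fillna(0)         # rated features
y = train["flag"]                                  # label; 3 means "recommend" in recommender.py

clf = DecisionTreeClassifier(criterion="gini")     # Gini impurity, no pruning
clf.fit(X, y)

# new_rows = ...                                   # rated feature rows with the same columns as X
# labels = clf.predict(new_rows)                   # users predicted as 3 go on the recommendation list
```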

### Environment
- Windows 10
- Python 3.6.5

### Dependencies
```
pip install -r requirements.txt
```

### Running the Program
```
python recommender.py
```

### Modeling Process
- Feature engineering:
> Data cleaning: clean text, numeric, datetime and missing data, with consistency checks and business-logic checks.
> Feature selection: correlation and importance calculation.

- User profiling:
> Expiry-based recommendation: recommend renewal to users whose subscription is about to expire;
> Utility-based recommendation: invoice count, invoice amount and login frequency are the key indicators that make up utility;
> RFM (user-value) recommendation: recommend at the right moment to users with high spend and high purchase frequency;
> Hybrid recommendation: in practice, very few production systems rely on a single algorithm. Results from different algorithms are combined with weights, or different algorithms are used at different stages, so that the mix better fits the business (a weighted-blend sketch is given after the RFM code below).
- Recommendation engine
> Decision tree: impurity is measured with the Gini index; cumulative feature importance was already computed earlier, so the tree is not pruned here.

> RFM model: after repeatedly tuning the weights, the operations team considered the following formula, computeTotalScore, to match expectations:
```python
def computeTotalScore(scoreDic):
    '''
    Compute a user's overall recommendation score from the per-feature ratings.
    :param scoreDic: the user's rating for every feature
    :return: total score for the product
    index = ["times_wp","je_wp","deadline_wp","times_bqt","je_bqt","recently_bqt","userConsumeTotalTimes","userConsumeTotalAmount",
             "deadline","loginFrequency","registeredCapital","industry","dateOfEstablishment","dayCountAvg","daySumAvg"]
    '''
    if len(scoreDic) < 1: raise Exception('scoreDic is none')
    if scoreDic.get('dayLoginTimes') is None: scoreDic['dayLoginTimes'] = 0
    score_common = scoreDic['registeredCapital'] * 10 + scoreDic['dayCountAvg'] * 30 + scoreDic['loginFrequency'] * 30 + \
                   scoreDic['daySumAvg'] * 30 + scoreDic['dateOfEstablishment'] * 10 + scoreDic['industry'] * 10
    if scoreDic['deadline'] == 5: bft = 200 + (scoreDic['userConsumeTotalAmount'] + scoreDic['userConsumeTotalTimes']) * 50
    else: bft = 0
    if scoreDic['recently_bqt'] == 5: bqt = 100 + (scoreDic['je_bqt'] + scoreDic['times_bqt']) * 50
    else: bqt = 0
    if scoreDic['deadline_wp'] == 5: wp = 100 + (scoreDic['je_wp'] + scoreDic['times_wp']) * 50
    else: wp = 0
    score = bft + bqt + wp + score_common
    return score
```
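As noted under "Hybrid recommendation" above, the individual signals are ultimately blended. Beyond computeTotalScore the repository does not spell out a blending formula, so the following is only an illustrative sketch of score-level weighting; the component names and weights are assumptions.

```python
# Hypothetical sketch of a weighted blend of several recommenders' scores.
# Component scores are assumed to be normalized to [0, 1]; the weights are illustrative.
def hybrid_score(expiry_score, utility_score, rfm_score,
                 w_expiry=0.4, w_utility=0.3, w_rfm=0.3):
    '''Blend the expiry-, utility- and RFM-based scores into one ranking score.'''
    return w_expiry * expiry_score + w_utility * utility_score + w_rfm * rfm_score

# users with the highest blended score would be recommended first, e.g.:
# ranked = sorted(users, key=lambda u: hybrid_score(u["expiry"], u["utility"], u["rfm"]), reverse=True)
```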

### Future Improvements
1. Collect user data more extensively;
2. Tune the model based on the feedback from the most recent recommendation rounds;
3. Build a real-time recommendation system; Mahout-based recommender code will be added later.

--------------------------------------------------------------------------------
/data/DataModel.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SimonWang00/recommender/02b26535b3cb754d006036e577586a270648c50d/data/DataModel.xlsx
--------------------------------------------------------------------------------
/data/DataRaw.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SimonWang00/recommender/02b26535b3cb754d006036e577586a270648c50d/data/DataRaw.xlsx
--------------------------------------------------------------------------------
/data_explore/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 | # __*__ coding: utf-8 __*__
3 |
4 | '''
5 | @Author: simonKing
6 | @License: (C) Copyright 2013-2019, Best Wonder Corporation Limited.
7 | @Os:Windows 10 x64
8 | @Contact: bw_wangxiaomeng@whty.com.cn
9 | @Software: PY PyCharm
10 | @File: __init__.py
11 | @Time: 2019/7/12 16:30
12 | @Desc: define your function
13 | '''
--------------------------------------------------------------------------------
/data_explore/correlationAnalysic.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 | # __*__ coding: utf-8 __*__
3 |
4 | '''
5 | @Author: simonKing
6 | @License: (C) Copyright 2013-2019, Best Wonder Corporation Limited.
7 | @Os:Windows 10 x64
8 | @Contact: bw_wangxiaomeng@whty.com.cn
9 | @Software: PY PyCharm
10 | @File: correlationAnalysic.py
11 | @Time: 2019/7/12 14:38
12 | @Desc: correlation analysis
13 | '''
14 | import pandas as pd
15 | import pylab as plt
16 |
17 |
18 | def corrPer(seris1,seris2,index = None):
19 |     '''
20 |     Compute the correlation coefficient between two series.
21 |     :param seris1: series 1
22 |     :param seris2: series 2
23 |     :param index: series names, used as axis labels
24 |     :return: correlation coefficient; 1 correlated, 0 uncorrelated, -1 negatively correlated
25 |     '''
26 |     # wrap the lists in pandas Series so pandas can work with them
27 |     s1 = pd.Series(seris1)
28 |     s2 = pd.Series(seris2)
29 |     # Pearson correlation coefficient; round(a, 4) keeps four decimal places
30 |     corr_per = round(s1.corr(s2), 4)
31 |     # print('corr_per :', corr_per)
32 |     # finally, draw a scatter plot of the two series to eyeball the relationship alongside the coefficient
33 |     plt.scatter(seris1, seris2)
34 |     if index:
35 |         plt.xlabel(index[0])
36 |         plt.ylabel(index[1])
37 |     plt.title('corr_per :' + str(corr_per), fontproperties='SimHei')
38 |     plt.show()
39 |     return corr_per
40 |
41 |
42 | def graAnalysic(seris1, seris2):
43 |     '''
44 |     Grey relational analysis of two series.
45 |     :param seris1: series 1
46 |     :param seris2: series 2
47 |     :return: relational grades, sorted in descending order
48 |     '''
49 |     x = pd.DataFrame(data=[seris1,seris2])
50 |     # 1. normalize each row by its mean
51 |     x_mean = x.mean(axis=1)
52 |     for i in range(x.index.size):
53 |         x.iloc[i,:] = x.iloc[i,:]/x_mean[i]
54 |     # 2. split into the reference series and the comparison series
55 |     ck=x.iloc[0,:]
56 |     cp=x.iloc[1:,:]
57 |     # subtract the reference series from each comparison series
58 |     t=pd.DataFrame()
59 |     for j in range(cp.index.size):
60 |         temp=pd.Series(cp.iloc[j,:] - ck)
61 |         t=t.append(temp,ignore_index=True)
62 |     # maximum and minimum absolute differences
63 |     mmax=t.abs().max().max()
64 |     mmin=t.abs().min().min()
65 |     rho=0.5
66 |     # 3. relational coefficients
67 |     ksi=((mmin + rho*mmax)/(abs(t) + rho*mmax))
68 |     # 4. relational grade
69 |     r=ksi.sum(axis=1)/ksi.columns.size
70 |     # 5. sort by relational grade
71 |     result=r.sort_values(ascending=False)
72 |     plt.plot(seris1)
73 |     plt.plot(seris2)
74 |     plt.show()
75 |     return result
76 |
77 | # if __name__ == '__main__':
78 | # pass
79 | # s1 = [0.4755,0.4299,0.6358,0.7527,0.4228,0.3358]
80 | # s2 = [0.6591,0.5739,0.5465,0.8993,0.6661,0.4037]
81 | # result = graAnalysic(s1,s2)
--------------------------------------------------------------------------------
/feature_proj/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 | # __*__ coding: utf-8 __*__
3 |
4 | '''
5 | @Author: simonKing
6 | @License: (C) Copyright 2013-2019, Best Wonder Corporation Limited.
7 | @Os:Windows 10 x64
8 | @Contact: bw_wangxiaomeng@whty.com.cn
9 | @Software: PY PyCharm
10 | @File: __init__.py
11 | @Time: 2019/6/24 9:11
12 | @Desc: define your function
13 | '''
--------------------------------------------------------------------------------
/feature_proj/extractFeatures.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 | # __*__ coding: utf-8 __*__
3 |
4 | '''
5 | @Author: simonKing
6 | @License: (C) Copyright 2013-2019, Best Wonder Corporation Limited.
7 | @Os:Windows 10 x64
8 | @Contact: bw_wangxiaomeng@whty.com.cn
9 | @Software: PY PyCharm
10 | @File: extractFeatures.py
11 | @Time: 2019/6/24 13:57
12 | @Desc: define your function
13 | '''
14 |
15 | import pandas as pd
16 | from feature_proj.featureSelector import FeatureSelector
17 |
18 | # user consumption data is confidential and is not open-sourced
19 | fname = ''
20 | train = pd.read_excel(fname)
21 | index = ["times-wp","je-wp","recently-wp","times-bqt","je-bqt","recently-bqt","times-bft","je-bft","recently-bft",
22 | "zczb_x","industry_x","ages_x","dayInvoiceNum_x","dayInvoiceJe_x","flag"]
23 |
24 | train = train.ix[:, index]
25 |
26 | train_labels = train['flag']
27 |
28 | labels = []
29 | for label in train_labels:
30 | if label <3:labels.append(1)
31 | else:labels.append(label)
32 |
33 | train = train.drop(columns = ['flag'])
34 |
35 | fs = FeatureSelector(data = train, labels = labels)
36 |
37 | # Pearson correlation coefficient: 0.8-1.0 very strong, 0.6-0.8 strong, 0.4-0.6 moderate, 0.2-0.4 weak, 0.0-0.2 very weak or none
38 | fs.identify_collinear(correlation_threshold=0.9)
39 | correlated_features = fs.ops['collinear']
40 | print(correlated_features[:])
41 | # # heatmap
42 | fs.plot_collinear()
43 | fs.plot_collinear(plot_all=True)
44 |
45 | # inspect classification performance
46 | fs.identify_zero_importance(task = 'classification', eval_metric = 'auc', n_iterations = 10, early_stopping = True)
47 | print(fs.data_all.head(10))
48 | zero_importance_features = fs.ops['zero_importance']
49 | print(zero_importance_features[10:15])
50 | # feature importance ranking
51 | fs.plot_feature_importances(threshold = 0.9, plot_n = 14)
52 |
53 | # features ranked until cumulative importance reaches 0.99
54 | fs.identify_low_importance(cumulative_importance = 0.99)
55 | low_importance_features = fs.ops['low_importance']
56 | print(low_importance_features[:5])
57 |
58 | # feature removal
59 | # train_no_missing = fs.remove(methods = ['missing'])
60 | # # remove features flagged by multiple methods
61 | # fs.identify_all(selection_params = {'missing_threshold': 0.6, 'correlation_threshold': 0.98,'task': 'classification',
62 | # 'eval_metric': 'auc','cumulative_importance': 0.99})
63 | # # train_no_missing_zero = fs.remove(methods = ['missing', 'zero_importance'])
64 | # all_to_remove = fs.check_removal()
65 | # all_to_remove[10:25]
66 | # train_removed = fs.remove(methods = 'all')
67 |
68 |
69 |
70 |
71 |
--------------------------------------------------------------------------------
/feature_proj/featureSelector.py:
--------------------------------------------------------------------------------
1 | # numpy and pandas for data manipulation
2 | import pandas as pd
3 | import numpy as np
4 |
5 | # model used for feature importances
6 | import lightgbm as lgb
7 |
8 | # utility for early stopping with a validation set
9 | from sklearn.model_selection import train_test_split
10 |
11 | # visualizations
12 | import matplotlib.pyplot as plt
13 | import seaborn as sns
14 |
15 | # memory management
16 | import gc
17 |
18 | # utilities
19 | from itertools import chain
20 |
21 | class FeatureSelector():
22 | """
23 | Class for performing feature selection for machine learning or data preprocessing.
24 |
25 | Implements five different methods to identify features for removal
26 |
27 | 1. Find columns with a missing percentage greater than a specified threshold
28 | 2. Find columns with a single unique value
29 | 3. Find collinear variables with a correlation greater than a specified correlation coefficient
30 | 4. Find features with 0.0 feature importance from a gradient boosting machine (gbm)
31 | 5. Find low importance features that do not contribute to a specified cumulative feature importance from the gbm
32 |
33 | Parameters
34 | --------
35 | data : dataframe
36 | A dataset with observations in the rows and features in the columns
37 |
38 | labels : array or series, default = None
39 | Array of labels for training the machine learning model to find feature importances. These can be either binary labels
40 | (if task is 'classification') or continuous targets (if task is 'regression').
41 | If no labels are provided, then the feature importance based methods are not available.
42 |
43 | Attributes
44 | --------
45 |
46 | ops : dict
47 | Dictionary of operations run and features identified for removal
48 |
49 | missing_stats : dataframe
50 | The fraction of missing values for all features
51 |
52 | record_missing : dataframe
53 | The fraction of missing values for features with missing fraction above threshold
54 |
55 | unique_stats : dataframe
56 | Number of unique values for all features
57 |
58 | record_single_unique : dataframe
59 | Records the features that have a single unique value
60 |
61 | corr_matrix : dataframe
62 | All correlations between all features in the data
63 |
64 | record_collinear : dataframe
65 | Records the pairs of collinear variables with a correlation coefficient above the threshold
66 |
67 | feature_importances : dataframe
68 | All feature importances from the gradient boosting machine
69 |
70 | record_zero_importance : dataframe
71 | Records the zero importance features in the data according to the gbm
72 |
73 | record_low_importance : dataframe
74 | Records the lowest importance features not needed to reach the threshold of cumulative importance according to the gbm
75 |
76 |
77 | Notes
78 | --------
79 |
80 | - All 5 operations can be run with the `identify_all` method.
81 | - If using feature importances, one-hot encoding is used for categorical variables which creates new columns
82 |
83 | """
84 |
85 | def __init__(self, data, labels=None):
86 |
87 | # Dataset and optional training labels
88 | self.data = data
89 | self.labels = labels
90 |
91 | if labels is None:
92 | print('No labels provided. Feature importance based methods are not available.')
93 |
94 | self.base_features = list(data.columns)
95 | self.one_hot_features = None
96 |
97 | # Dataframes recording information about features to remove
98 | self.record_missing = None
99 | self.record_single_unique = None
100 | self.record_collinear = None
101 | self.record_zero_importance = None
102 | self.record_low_importance = None
103 |
104 | self.missing_stats = None
105 | self.unique_stats = None
106 | self.corr_matrix = None
107 | self.feature_importances = None
108 |
109 | # Dictionary to hold removal operations
110 | self.ops = {}
111 |
112 | self.one_hot_correlated = False
113 |
114 | def identify_missing(self, missing_threshold):
115 | """Find the features with a fraction of missing values above `missing_threshold`"""
116 |
117 | self.missing_threshold = missing_threshold
118 |
119 | # Calculate the fraction of missing in each column
120 | missing_series = self.data.isnull().sum() / self.data.shape[0]
121 | self.missing_stats = pd.DataFrame(missing_series).rename(columns = {'index': 'feature', 0: 'missing_fraction'})
122 |
123 | # Sort with highest number of missing values on top
124 | self.missing_stats = self.missing_stats.sort_values('missing_fraction', ascending = False)
125 |
126 | # Find the columns with a missing percentage above the threshold
127 | record_missing = pd.DataFrame(missing_series[missing_series > missing_threshold]).reset_index().rename(columns =
128 | {'index': 'feature',
129 | 0: 'missing_fraction'})
130 |
131 | to_drop = list(record_missing['feature'])
132 |
133 | self.record_missing = record_missing
134 | self.ops['missing'] = to_drop
135 |
136 | print('%d features with greater than %0.2f missing values.\n' % (len(self.ops['missing']), self.missing_threshold))
137 |
138 | def identify_single_unique(self):
139 | """Finds features with only a single unique value. NaNs do not count as a unique value. """
140 |
141 | # Calculate the unique counts in each column
142 | unique_counts = self.data.nunique()
143 | self.unique_stats = pd.DataFrame(unique_counts).rename(columns = {'index': 'feature', 0: 'nunique'})
144 | self.unique_stats = self.unique_stats.sort_values('nunique', ascending = True)
145 |
146 | # Find the columns with only one unique count
147 | record_single_unique = pd.DataFrame(unique_counts[unique_counts == 1]).reset_index().rename(columns = {'index': 'feature',
148 | 0: 'nunique'})
149 |
150 | to_drop = list(record_single_unique['feature'])
151 |
152 | self.record_single_unique = record_single_unique
153 | self.ops['single_unique'] = to_drop
154 |
155 | print('%d features with a single unique value.\n' % len(self.ops['single_unique']))
156 |
157 | def identify_collinear(self, correlation_threshold, one_hot=False):
158 | """
159 | Finds collinear features based on the correlation coefficient between features.
160 |         For each pair of features with a correlation coefficient greater than `correlation_threshold`,
161 | only one of the pair is identified for removal.
162 |
163 | Using code adapted from: https://chrisalbon.com/machine_learning/feature_selection/drop_highly_correlated_features/
164 |
165 | Parameters
166 | --------
167 |
168 | correlation_threshold : float between 0 and 1
169 |             Value of the Pearson correlation coefficient for identifying correlated features
170 |
171 | one_hot : boolean, default = False
172 | Whether to one-hot encode the features before calculating the correlation coefficients
173 |
174 | """
175 |
176 | self.correlation_threshold = correlation_threshold
177 | self.one_hot_correlated = one_hot
178 |
179 | # Calculate the correlations between every column
180 | if one_hot:
181 |
182 | # One hot encoding
183 | features = pd.get_dummies(self.data)
184 | self.one_hot_features = [column for column in features.columns if column not in self.base_features]
185 |
186 | # Add one hot encoded data to original data
187 | self.data_all = pd.concat([features[self.one_hot_features], self.data], axis = 1)
188 |
189 | corr_matrix = pd.get_dummies(features).corr()
190 |
191 | else:
192 | corr_matrix = self.data.corr()
193 |
194 | self.corr_matrix = corr_matrix
195 |
196 | # Extract the upper triangle of the correlation matrix
197 | upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k = 1).astype(np.bool))
198 |
199 | # Select the features with correlations above the threshold
200 | # Need to use the absolute value
201 | to_drop = [column for column in upper.columns if any(upper[column].abs() > correlation_threshold)]
202 |
203 | # Dataframe to hold correlated pairs
204 | record_collinear = pd.DataFrame(columns = ['drop_feature', 'corr_feature', 'corr_value'])
205 |
206 | # Iterate through the columns to drop to record pairs of correlated features
207 | for column in to_drop:
208 |
209 | # Find the correlated features
210 | corr_features = list(upper.index[upper[column].abs() > correlation_threshold])
211 |
212 | # Find the correlated values
213 | corr_values = list(upper[column][upper[column].abs() > correlation_threshold])
214 | drop_features = [column for _ in range(len(corr_features))]
215 |
216 | # Record the information (need a temp df for now)
217 | temp_df = pd.DataFrame.from_dict({'drop_feature': drop_features,
218 | 'corr_feature': corr_features,
219 | 'corr_value': corr_values})
220 |
221 | # Add to dataframe
222 | record_collinear = record_collinear.append(temp_df, ignore_index = True)
223 |
224 | self.record_collinear = record_collinear
225 | self.ops['collinear'] = to_drop
226 |
227 | print('%d features with a correlation magnitude greater than %0.2f.\n' % (len(self.ops['collinear']), self.correlation_threshold))
228 |
229 | def identify_zero_importance(self, task, eval_metric=None, n_iterations=10, early_stopping = True):
230 | """
231 |
232 | Identify the features with zero importance according to a gradient boosting machine.
233 | The gbm can be trained with early stopping using a validation set to prevent overfitting.
234 | The feature importances are averaged over `n_iterations` to reduce variance.
235 |
236 | Uses the LightGBM implementation (http://lightgbm.readthedocs.io/en/latest/index.html)
237 |
238 | Parameters
239 | --------
240 |
241 | eval_metric : string
242 | Evaluation metric to use for the gradient boosting machine for early stopping. Must be
243 | provided if `early_stopping` is True
244 |
245 | task : string
246 | The machine learning task, either 'classification' or 'regression'
247 |
248 | n_iterations : int, default = 10
249 | Number of iterations to train the gradient boosting machine
250 |
251 | early_stopping : boolean, default = True
252 | Whether or not to use early stopping with a validation set when training
253 |
254 |
255 | Notes
256 | --------
257 |
258 | - Features are one-hot encoded to handle the categorical variables before training.
259 | - The gbm is not optimized for any particular task and might need some hyperparameter tuning
260 | - Feature importances, including zero importance features, can change across runs
261 |
262 | """
263 |
264 | if early_stopping and eval_metric is None:
265 | raise ValueError("""eval metric must be provided with early stopping. Examples include "auc" for classification or
266 | "l2" for regression.""")
267 |
268 | if self.labels is None:
269 | raise ValueError("No training labels provided.")
270 |
271 | # One hot encoding
272 | features = pd.get_dummies(self.data)
273 | self.one_hot_features = [column for column in features.columns if column not in self.base_features]
274 |
275 | # Add one hot encoded data to original data
276 | self.data_all = pd.concat([features[self.one_hot_features], self.data], axis = 1)
277 |
278 | # Extract feature names
279 | feature_names = list(features.columns)
280 |
281 | # Convert to np array
282 | features = np.array(features)
283 | labels = np.array(self.labels).reshape((-1, ))
284 |
285 | # Empty array for feature importances
286 | feature_importance_values = np.zeros(len(feature_names))
287 |
288 | print('Training Gradient Boosting Model\n')
289 |
290 | # Iterate through each fold
291 | for _ in range(n_iterations):
292 |
293 | if task == 'classification':
294 | model = lgb.LGBMClassifier(n_estimators=1000, learning_rate = 0.05, verbose = -1)
295 |
296 | elif task == 'regression':
297 | model = lgb.LGBMRegressor(n_estimators=1000, learning_rate = 0.05, verbose = -1)
298 |
299 | else:
300 | raise ValueError('Task must be either "classification" or "regression"')
301 |
302 | # If training using early stopping need a validation set
303 | if early_stopping:
304 |
305 | train_features, valid_features, train_labels, valid_labels = train_test_split(features, labels, test_size = 0.15)
306 |
307 | # Train the model with early stopping
308 | model.fit(train_features, train_labels, eval_metric = eval_metric,
309 | eval_set = [(valid_features, valid_labels)],
310 | early_stopping_rounds = 100, verbose = -1)
311 |
312 | # Clean up memory
313 | gc.enable()
314 | del train_features, train_labels, valid_features, valid_labels
315 | gc.collect()
316 |
317 | else:
318 | model.fit(features, labels)
319 |
320 | # Record the feature importances
321 | feature_importance_values += model.feature_importances_ / n_iterations
322 |
323 | feature_importances = pd.DataFrame({'feature': feature_names, 'importance': feature_importance_values})
324 |
325 | # Sort features according to importance
326 | feature_importances = feature_importances.sort_values('importance', ascending = False).reset_index(drop = True)
327 |
328 | # Normalize the feature importances to add up to one
329 | feature_importances['normalized_importance'] = feature_importances['importance'] / feature_importances['importance'].sum()
330 | feature_importances['cumulative_importance'] = np.cumsum(feature_importances['normalized_importance'])
331 |
332 | # Extract the features with zero importance
333 | record_zero_importance = feature_importances[feature_importances['importance'] == 0.0]
334 |
335 | to_drop = list(record_zero_importance['feature'])
336 |
337 | self.feature_importances = feature_importances
338 | self.record_zero_importance = record_zero_importance
339 | self.ops['zero_importance'] = to_drop
340 |
341 | print('\n%d features with zero importance after one-hot encoding.\n' % len(self.ops['zero_importance']))
342 |
343 | def identify_low_importance(self, cumulative_importance):
344 | """
345 | Finds the lowest importance features not needed to account for `cumulative_importance` fraction
346 | of the total feature importance from the gradient boosting machine. As an example, if cumulative
347 | importance is set to 0.95, this will retain only the most important features needed to
348 | reach 95% of the total feature importance. The identified features are those not needed.
349 |
350 | Parameters
351 | --------
352 | cumulative_importance : float between 0 and 1
353 | The fraction of cumulative importance to account for
354 |
355 | """
356 |
357 | self.cumulative_importance = cumulative_importance
358 |
359 | # The feature importances need to be calculated before running
360 | if self.feature_importances is None:
361 | raise NotImplementedError("""Feature importances have not yet been determined. Call the `identify_zero_importance` method first.""")
362 |
363 | # Make sure most important features are on top
364 | self.feature_importances = self.feature_importances.sort_values('cumulative_importance')
365 |
366 | # Identify the features not needed to reach the cumulative_importance
367 | record_low_importance = self.feature_importances[self.feature_importances['cumulative_importance'] > cumulative_importance]
368 |
369 | to_drop = list(record_low_importance['feature'])
370 |
371 | self.record_low_importance = record_low_importance
372 | self.ops['low_importance'] = to_drop
373 |
374 | print('%d features required for cumulative importance of %0.2f after one hot encoding.' % (len(self.feature_importances) -
375 | len(self.record_low_importance), self.cumulative_importance))
376 | print('%d features do not contribute to cumulative importance of %0.2f.\n' % (len(self.ops['low_importance']),
377 | self.cumulative_importance))
378 |
379 | def identify_all(self, selection_params):
380 | """
381 | Use all five of the methods to identify features to remove.
382 |
383 | Parameters
384 | --------
385 |
386 | selection_params : dict
387 |             Parameters to use in the five feature selection methods.
388 | Params must contain the keys ['missing_threshold', 'correlation_threshold', 'eval_metric', 'task', 'cumulative_importance']
389 |
390 | """
391 |
392 | # Check for all required parameters
393 | for param in ['missing_threshold', 'correlation_threshold', 'eval_metric', 'task', 'cumulative_importance']:
394 | if param not in selection_params.keys():
395 | raise ValueError('%s is a required parameter for this method.' % param)
396 |
397 | # Implement each of the five methods
398 | self.identify_missing(selection_params['missing_threshold'])
399 | self.identify_single_unique()
400 | self.identify_collinear(selection_params['correlation_threshold'])
401 | self.identify_zero_importance(task = selection_params['task'], eval_metric = selection_params['eval_metric'])
402 | self.identify_low_importance(selection_params['cumulative_importance'])
403 |
404 | # Find the number of features identified to drop
405 | self.all_identified = set(list(chain(*list(self.ops.values()))))
406 | self.n_identified = len(self.all_identified)
407 |
408 | print('%d total features out of %d identified for removal after one-hot encoding.\n' % (self.n_identified,
409 | self.data_all.shape[1]))
410 |
411 | def check_removal(self, keep_one_hot=True):
412 |
413 | """Check the identified features before removal. Returns a list of the unique features identified."""
414 |
415 | self.all_identified = set(list(chain(*list(self.ops.values()))))
416 | print('Total of %d features identified for removal' % len(self.all_identified))
417 |
418 | if not keep_one_hot:
419 | if self.one_hot_features is None:
420 | print('Data has not been one-hot encoded')
421 | else:
422 | one_hot_to_remove = [x for x in self.one_hot_features if x not in self.all_identified]
423 | print('%d additional one-hot features can be removed' % len(one_hot_to_remove))
424 |
425 | return list(self.all_identified)
426 |
427 |
428 | def remove(self, methods, keep_one_hot = True):
429 | """
430 | Remove the features from the data according to the specified methods.
431 |
432 | Parameters
433 | --------
434 | methods : 'all' or list of methods
435 | If methods == 'all', any methods that have identified features will be used
436 | Otherwise, only the specified methods will be used.
437 | Can be one of ['missing', 'single_unique', 'collinear', 'zero_importance', 'low_importance']
438 | keep_one_hot : boolean, default = True
439 | Whether or not to keep one-hot encoded features
440 |
441 | Return
442 | --------
443 | data : dataframe
444 | Dataframe with identified features removed
445 |
446 |
447 | Notes
448 | --------
449 | - If feature importances are used, the one-hot encoded columns will be added to the data (and then may be removed)
450 | - Check the features that will be removed before transforming data!
451 |
452 | """
453 |
454 |
455 | features_to_drop = []
456 |
457 | if methods == 'all':
458 |
459 | # Need to use one-hot encoded data as well
460 | data = self.data_all
461 |
462 | print('{} methods have been run\n'.format(list(self.ops.keys())))
463 |
464 | # Find the unique features to drop
465 | features_to_drop = set(list(chain(*list(self.ops.values()))))
466 |
467 | else:
468 | # Need to use one-hot encoded data as well
469 | if 'zero_importance' in methods or 'low_importance' in methods or self.one_hot_correlated:
470 | data = self.data_all
471 |
472 | else:
473 | data = self.data
474 |
475 | # Iterate through the specified methods
476 | for method in methods:
477 |
478 | # Check to make sure the method has been run
479 | if method not in self.ops.keys():
480 | raise NotImplementedError('%s method has not been run' % method)
481 |
482 | # Append the features identified for removal
483 | else:
484 | features_to_drop.append(self.ops[method])
485 |
486 | # Find the unique features to drop
487 | features_to_drop = set(list(chain(*features_to_drop)))
488 |
489 | features_to_drop = list(features_to_drop)
490 |
491 | if not keep_one_hot:
492 |
493 | if self.one_hot_features is None:
494 | print('Data has not been one-hot encoded')
495 | else:
496 |
497 | features_to_drop = list(set(features_to_drop) | set(self.one_hot_features))
498 |
499 | # Remove the features and return the data
500 | data = data.drop(columns = features_to_drop)
501 | self.removed_features = features_to_drop
502 |
503 | if not keep_one_hot:
504 | print('Removed %d features including one-hot features.' % len(features_to_drop))
505 | else:
506 | print('Removed %d features.' % len(features_to_drop))
507 |
508 | return data
509 |
510 | def plot_missing(self):
511 | """Histogram of missing fraction in each feature"""
512 | if self.record_missing is None:
513 | raise NotImplementedError("Missing values have not been calculated. Run `identify_missing`")
514 |
515 | self.reset_plot()
516 |
517 | # Histogram of missing values
518 | plt.style.use('seaborn-white')
519 | plt.figure(figsize = (7, 5))
520 | plt.hist(self.missing_stats['missing_fraction'], bins = np.linspace(0, 1, 11), edgecolor = 'k', color = 'red', linewidth = 1.5)
521 | plt.xticks(np.linspace(0, 1, 11))
522 | plt.xlabel('Missing Fraction', size = 14); plt.ylabel('Count of Features', size = 14)
523 | plt.title("Fraction of Missing Values Histogram", size = 16)
524 | plt.show()
525 |
526 |
527 | def plot_unique(self):
528 | """Histogram of number of unique values in each feature"""
529 | if self.record_single_unique is None:
530 | raise NotImplementedError('Unique values have not been calculated. Run `identify_single_unique`')
531 |
532 | self.reset_plot()
533 |
534 | # Histogram of number of unique values
535 | self.unique_stats.plot.hist(edgecolor = 'k', figsize = (7, 5))
536 | plt.ylabel('Frequency', size = 14); plt.xlabel('Unique Values', size = 14)
537 | plt.title('Number of Unique Values Histogram', size = 16)
538 | plt.show()
539 |
540 |
541 | def plot_collinear(self, plot_all = False):
542 | """
543 | Heatmap of the correlation values. If plot_all = True plots all the correlations otherwise
544 | plots only those features that have a correlation above the threshold
545 |
546 | Notes
547 | --------
548 | - Not all of the plotted correlations are above the threshold because this plots
549 |         all the variables that have been identified as having even one correlation above the threshold
550 | - The features on the x-axis are those that will be removed. The features on the y-axis
551 | are the correlated features with those on the x-axis
552 |
553 | Code adapted from https://seaborn.pydata.org/examples/many_pairwise_correlations.html
554 | """
555 |
556 | if self.record_collinear is None:
557 |             raise NotImplementedError('Collinear features have not been identified. Run `identify_collinear`.')
558 |
559 | if plot_all:
560 | corr_matrix_plot = self.corr_matrix
561 | title = 'All Correlations'
562 |
563 | else:
564 | # Identify the correlations that were above the threshold
565 | # columns (x-axis) are features to drop and rows (y_axis) are correlated pairs
566 | corr_matrix_plot = self.corr_matrix.loc[list(set(self.record_collinear['corr_feature'])),
567 | list(set(self.record_collinear['drop_feature']))]
568 |
569 | title = "Correlations Above Threshold"
570 |
571 |
572 | f, ax = plt.subplots(figsize=(10, 8))
573 |
574 | # Diverging colormap
575 | cmap = sns.diverging_palette(220, 10, as_cmap=True)
576 |
577 | # Draw the heatmap with a color bar
578 | sns.heatmap(corr_matrix_plot, cmap=cmap, center=0,linewidths=.25, cbar_kws={"shrink": 0.6},annot=True)
579 |
580 | # Set the ylabels
581 | ax.set_yticks([x + 0.5 for x in list(range(corr_matrix_plot.shape[0]))])
582 | ax.set_yticklabels(list(corr_matrix_plot.index), size = int(160 / corr_matrix_plot.shape[0]))
583 |
584 | # Set the xlabels
585 | ax.set_xticks([x + 0.5 for x in list(range(corr_matrix_plot.shape[1]))])
586 | ax.set_xticklabels(list(corr_matrix_plot.columns), size = int(160 / corr_matrix_plot.shape[1]))
587 | plt.title(title, size = 14)
588 | plt.show()
589 |
590 | def plot_feature_importances(self, plot_n = 15, threshold = None):
591 | """
592 | Plots `plot_n` most important features and the cumulative importance of features.
593 | If `threshold` is provided, prints the number of features needed to reach `threshold` cumulative importance.
594 |
595 | Parameters
596 | --------
597 |
598 | plot_n : int, default = 15
599 | Number of most important features to plot. Defaults to 15 or the maximum number of features whichever is smaller
600 |
601 | threshold : float, between 0 and 1 default = None
602 | Threshold for printing information about cumulative importances
603 |
604 | """
605 |
606 | if self.record_zero_importance is None:
607 |             raise NotImplementedError('Feature importances have not been determined. Run `identify_zero_importance`')
608 |
609 | # Need to adjust number of features if greater than the features in the data
610 | if plot_n > self.feature_importances.shape[0]:
611 | plot_n = self.feature_importances.shape[0] - 1
612 |
613 | self.reset_plot()
614 |
615 | # Make a horizontal bar chart of feature importances
616 | plt.figure(figsize = (10, 6))
617 | ax = plt.subplot()
618 |
619 | # Need to reverse the index to plot most important on top
620 | # There might be a more efficient method to accomplish this
621 | ax.barh(list(reversed(list(self.feature_importances.index[:plot_n]))),
622 | self.feature_importances['normalized_importance'][:plot_n],
623 | align = 'center', edgecolor = 'k')
624 |
625 | # Set the yticks and labels
626 | ax.set_yticks(list(reversed(list(self.feature_importances.index[:plot_n]))))
627 | ax.set_yticklabels(self.feature_importances['feature'][:plot_n], size = 12)
628 |
629 | # Plot labeling
630 | plt.xlabel('Normalized Importance', size = 16)
631 | plt.title('Feature Importances', size = 18)
632 | plt.show()
633 |
634 | # Cumulative importance plot
635 | plt.figure(figsize = (6, 4))
636 | plt.plot(list(range(1, len(self.feature_importances) + 1)), self.feature_importances['cumulative_importance'], 'r-')
637 | plt.xlabel('Number of Features', size = 14); plt.ylabel('Cumulative Importance', size = 14)
638 | plt.title('Cumulative Feature Importance', size = 16)
639 |
640 | if threshold:
641 |
642 | # Index of minimum number of features needed for cumulative importance threshold
643 | # np.where returns the index so need to add 1 to have correct number
644 | importance_index = np.min(np.where(self.feature_importances['cumulative_importance'] > threshold))
645 | plt.vlines(x = importance_index + 1, ymin = 0, ymax = 1, linestyles='--', colors = 'blue')
646 | plt.show()
647 |
648 | print('%d features required for %0.2f of cumulative importance' % (importance_index + 1, threshold))
649 |
650 | def reset_plot(self):
651 | plt.rcParams = plt.rcParamsDefault
652 |
--------------------------------------------------------------------------------
/pictures/决策树.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SimonWang00/recommender/02b26535b3cb754d006036e577586a270648c50d/pictures/决策树.png
--------------------------------------------------------------------------------
/pictures/手肘法.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SimonWang00/recommender/02b26535b3cb754d006036e577586a270648c50d/pictures/手肘法.png
--------------------------------------------------------------------------------
/pictures/推荐架构.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SimonWang00/recommender/02b26535b3cb754d006036e577586a270648c50d/pictures/推荐架构.png
--------------------------------------------------------------------------------
/pictures/某赋通数学期望.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SimonWang00/recommender/02b26535b3cb754d006036e577586a270648c50d/pictures/某赋通数学期望.png
--------------------------------------------------------------------------------
/pictures/特征相关性计算.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SimonWang00/recommender/02b26535b3cb754d006036e577586a270648c50d/pictures/特征相关性计算.png
--------------------------------------------------------------------------------
/pictures/特征累计重要性计算.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SimonWang00/recommender/02b26535b3cb754d006036e577586a270648c50d/pictures/特征累计重要性计算.png
--------------------------------------------------------------------------------
/pictures/特征重要性计算.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SimonWang00/recommender/02b26535b3cb754d006036e577586a270648c50d/pictures/特征重要性计算.png
--------------------------------------------------------------------------------
/pictures/算法选型.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SimonWang00/recommender/02b26535b3cb754d006036e577586a270648c50d/pictures/算法选型.png
--------------------------------------------------------------------------------
/pictures/聚类分群.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SimonWang00/recommender/02b26535b3cb754d006036e577586a270648c50d/pictures/聚类分群.png
--------------------------------------------------------------------------------
/recommender.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 | # __*__ coding: utf-8 __*__
3 |
4 | '''
5 | @Author: simonKing
6 | @License: (C) Copyright 2013-2019, Best Wonder Corporation Limited.
7 | @Os:Windows 10 x64
8 | @Contact: bw_wangxiaomeng@whty.com.cn
9 | @Software: PY PyCharm
10 | @File: recommender.py
11 | @Time: 2019/7/16 15:37
12 | @Desc: define your function
13 | '''
14 | import time
15 | import pymysql
16 | import datetime
17 | import pandas as pd
18 | from user_point.evaluationFunction import evaluationTotalProduct,computeTotalScore
19 | from user_recom.DecisionTree import Dtcmodel
20 | from setting import *
21 |
22 |
23 | def loadData(sql):
24 |     '''
25 |     Load data from MySQL.
26 |     :param sql: query to execute
27 |     :return: DataFrame
28 |     '''
29 | conn = pymysql.connect(host=server,port=port,user=user,password=password,db=dbName,charset='utf8')
30 | result = pd.read_sql_query(sql=sql,con=conn)
31 | return result
32 |
33 | def getColumns():
34 |     '''
35 |     Fetch the column names of the production user table.
36 |     :return: list of column names
37 |     '''
38 |     sql = "SELECT COLUMN_NAME FROM information_schema.COLUMNS WHERE TABLE_SCHEMA = 'dcm' AND TABLE_NAME = 'basic_info_all' LIMIT 10000;"
39 |     cursor = pymysql.connect(host=server,port=port,user=user,password=password,db=dbName,charset='utf8').cursor()
40 |     cursor.execute(sql)
41 |     columns = cursor.fetchall()
42 |     cursor.close()
43 |     columns = [row[0] for row in columns]
44 |     return columns
45 |
46 | def bftDeadtimeCalcu(deadtime):
47 |     '''
48 |     Compute the number of days until expiry.
49 |     :param deadtime: expiry date
50 |     :return: days
51 |     '''
52 | if deadtime == None:return -1
53 | recently = str(deadtime)
54 | if recently.strip() == '0' or recently.strip() == None:
55 | return -1
56 | try:
57 | d1 = datetime.datetime.strptime(recently, '%Y-%m-%d')
58 | except:
59 | d1 = datetime.datetime.strptime(recently, '%Y/%m/%d')
60 | nwt = datetime.datetime.now()
61 | days = (d1 - nwt).days
62 | if days <0: days=720
63 | return days
64 |
65 | def deadtimeCalcu(recently):
66 |     '''
67 |     Derive the days until expiry from the most recent purchase date (a one-year term is assumed below).
68 |     :param recently: date of the most recent purchase
69 |     :return: days until expiry
70 |     '''
71 | if recently ==None:return -1
72 | recently = str(recently)
73 | if recently.strip() == '0' or recently.strip() == None:return -1
74 | try:
75 | d1 = datetime.datetime.strptime(recently, '%Y-%m-%d')
76 | except:
77 | d1 = datetime.datetime.strptime(recently, '%Y/%m/%d')
78 | delta = datetime.timedelta(days=365)
79 | deadtime = d1 + delta
80 | nwt = datetime.datetime.now()
81 | days = (deadtime - nwt).days
82 | if days <0: days=720
83 | return days
84 |
85 | def agesCalcu(age):
86 |     '''
87 |     Compute how long the company has existed so far.
88 |     :param age: date of establishment
89 |     :return: company age in years
90 |     '''
91 | if age ==None:return 3
92 | age = str(age)
93 | if age.strip() =="未知" or age == '0':
94 | return 3
95 | try:
96 | d1 = datetime.datetime.strptime(age, '%Y-%m-%d')
97 | except:
98 | d1 = datetime.datetime.strptime(age, '%Y/%m/%d')
99 | nwt = datetime.datetime.now()
100 | years = round((nwt - d1).days/365,2)
101 | return years
102 |
103 | def rankCalcu(zczb):
104 |     '''
105 |     Compute the registered-capital rank.
106 |     :param zczb: registered capital
107 |     :return: rank
108 |     '''
109 | if zczb ==0 or zczb =="未知" or zczb =="0" or zczb ==None:return '未知'
110 | fname = Rfname
111 | f = open(fname,'r').readlines()
112 | zczbArr = pd.Series(f)
113 |
114 | zczbArr = [int(eval(zb.strip())) for zb in zczbArr]
115 | try:
116 | rank = zczbArr.index(int(eval(zczb))) + 1
117 | except:
118 | rank = '未知'
119 | return rank
120 |
121 | def scoreArrCalcu(data):
122 |     '''
123 |     Compute recommendation scores for all users.
124 |     :param data: user rating rows
125 |     :return: list of scores
126 |     '''
127 | scoreArr = [computeTotalScore(dict(zip(index,data.iloc[i]))) for i in range(len(data))]
128 | return scoreArr
129 |
130 | def scoreCalcu(row):
131 |     '''
132 |     Compute the score of a single user.
133 |     :param row: the user's ratings
134 |     :return: score
135 |     '''
136 | score = computeTotalScore(dict(zip(index,row)))
137 | return score
138 |
139 |
140 | def exeTiming(func):
141 |     '''
142 |     Decorator that runs the wrapped function on a schedule.
143 |     :param func: function to wrap
144 |     :return: wrapper
145 |     '''
146 | def wrapper():
147 | t1 = time.time()
148 | nwt = datetime.datetime.now().strftime('%d %H')
149 |         # run on day 1 of each month at 05:00
150 |         if nwt == exeTime:
151 |             func()
152 |             print('recommend success !')
153 | time.sleep(60*60*24)
154 | else:
155 | time.sleep(60*30)
156 | t2 = time.time()
157 | print(t2-t1)
158 | return wrapper
159 |
160 | def outputXls(data):
161 |     '''
162 |     Write the output: users that should receive a recommendation are written to an Excel file.
163 |     :param data: user data
164 |     :return:
165 |     '''
166 | OutputFile = Output + 'recommendList' + datetime.datetime.now().strftime('%Y%m%d') + '.xls'
167 | df = pd.DataFrame(data=data)
168 |     try:
169 |         # production: read the table header from the database
170 |         columns = getColumns()
171 |     except:
172 |         # test environment: fall back to index from setting.py
173 |         columns = index
174 | columns.append("Rank")
175 | columns.append("score")
176 | df.columns = columns
177 | df.to_excel(OutputFile)
178 |
179 |
180 | @exeTiming
181 | def recommend():
182 |     '''
183 |     Main recommendation routine.
184 |     :return: writes the recommendation list
185 |     '''
186 |     # preprocess the data to be scored
187 |     test_data = pd.read_excel(Tname)
188 |     data = test_data.ix[:,index]
189 |     # fill missing values with 0
190 |     data = data.fillna(0)
191 | ages = [agesCalcu(y) for y in data['dateOfEstablishment']]
192 | recently_wp = [bftDeadtimeCalcu(x) for x in data['deadline_wp']]
193 | recently_bft = [bftDeadtimeCalcu(x) for x in data['deadline']]
194 | recently_bqt = [deadtimeCalcu(x) for x in data['recently_bqt']]
195 | data['dateOfEstablishment'] = ages
196 | data['deadline_wp'] = recently_wp
197 | data['deadline'] = recently_bft
198 | data['recently_bqt'] = recently_bqt
199 | data_raw = []
200 | # print('raw_data:',data)
201 | # print(len(data))
202 |     # # rate the input rows
203 | for i in range(len(data)):
204 | row = data.iloc[i]
205 | test_X = evaluationTotalProduct(row,index)
206 | data_raw.append(test_X)
207 | RecommendList = []
208 |     # # load the model and run the recommendation prediction
209 | y_predict = Dtcmodel(Mname,index,data_raw).tolist()
210 | for j,label in enumerate(y_predict):
211 | if label ==3:
212 | row = data_raw[j]
213 | score = scoreCalcu(row)
214 | info = test_data.iloc[j].tolist()
215 | rank = rankCalcu(test_data.iloc[j]['registeredCapital'])
216 | info.append(rank)
217 | info.append(score)
218 | RecommendList.append(info)
219 |             print("recommendation willingness score:", score)
220 |     # write the output
221 |     if len(RecommendList) > 0:
222 |         print("Note: a recommendation list was generated")
223 |         outputXls(RecommendList)
224 |     else:
225 |         print("no recommendation list was generated")
226 |
227 | if __name__ =='__main__':
228 | while True:
229 | recommend()
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | pandas==0.23.0
2 | numpy==1.16.3
3 | lightgbm==2.1.1
4 | sklearn
5 | matplotlib==3.0.3
6 | seaborn==0.8.1
7 | openpyxl==2.5.3
8 | IPython
9 | pydotplus==2.0.2
10 | pymysql==0.8.1
11 | jieba==0.39
12 |
--------------------------------------------------------------------------------
/setting.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 | # __*__ coding: utf-8 __*__
3 |
4 | '''
5 | @Author: simonKing
6 | @License: (C) Copyright 2013-2019, Best Wonder Corporation Limited.
7 | @Os:Windows 10 x64
8 | @Contact: bw_wangxiaomeng@whty.com.cn
9 | @Software: PY PyCharm
10 | @File: setting.py
11 | @Time: 2019/8/18 11:21
12 | @Desc: 项目配置文件
13 | '''
14 |
15 | # mysql config
16 | server = '127.0.0.1'
17 | port = 3306
18 | user = 'root'
19 | password = '000000'
20 | dbName = 'dcm'
21 |
22 | # data to be scored (prediction input)
23 | Tname = "./data/DataRaw.xlsx"
24 | # model training data
25 | Mname = "./data/DataModel.xlsx"
26 | # header
27 | index = ["times_wp","je_wp","deadline_wp","times_bqt","je_bqt","recently_bqt","userConsumeTotalTimes","userConsumeTotalAmount","deadline","loginFrequency",
28 |          "registeredCapital","industry","dateOfEstablishment","dayCountAvg","daySumAvg"]
29 |
30 | # registered-capital ranking data
31 | Rfname = "./data/注册资本Ranks"
32 |
33 | # output directory for the recommendation list
34 | Output = './data/'
35 |
36 | # schedule for the timed run, e.g. day 1 of every month at 05:00
37 | exeTime = '01 05'
--------------------------------------------------------------------------------
/user_point/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 | # __*__ coding: utf-8 __*__
3 |
4 | '''
5 | @Author: simonKing
6 | @License: (C) Copyright 2013-2019, Best Wonder Corporation Limited.
7 | @Os:Windows 10 x64
8 | @Contact: bw_wangxiaomeng@whty.com.cn
9 | @Software: PY PyCharm
10 | @File: __init__.py
11 | @Time: 2019/6/24 9:12
12 | @Desc: define your function
13 | '''
--------------------------------------------------------------------------------
/user_point/evaluationFunction.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 | # __*__ coding: utf-8 __*__
3 |
4 | '''
5 | @Author: simonKing
6 | @License: (C) Copyright 2013-2019, Best Wonder Corporation Limited.
7 | @Os:Windows 10 x64
8 | @Contact: bw_wangxiaomeng@whty.com.cn
9 | @Software: PY PyCharm
10 | @File: evaluationFunction.py
11 | @Time: 2019/7/11 10:21
12 | @Desc: define your function
13 | '''
14 | '''
15 | R, recency / time to expiry: the sooner, the higher the score, max 5, min 1
16 | purchased within the last 90 days; 5
17 | no purchase for 90-180 days; 4
18 | no purchase for 180-360 days; 3
19 | 360-720 days; 2
20 | more than 720 days; 1
21 |
22 |
23 | F, purchase frequency: the higher, the higher the score, max 5, min 1
24 | range [1,24], average 1.1
25 | 1 --> 1 point;
26 | 2 --> 2 points;
27 | 3 --> 3 points;
28 | 4 --> 4 points;
29 | 5 or more --> 5 points;
30 |
31 | M, spend: the higher the amount, the higher the score, max 5, min 1, average amount 374
32 | above 2000 --> 5;
33 | 900~2000 --> 4;
34 | 370~900 --> 3;
35 | 200~370 --> 2;
36 | 0~200 --> 1;
37 |
38 | registered capital:
39 | above 10 million --> 5;
40 | 5-10 million --> 4;
41 | 1-5 million --> 3;
42 | 0.5-1 million --> 2;
43 | below 0.5 million, or unknown --> 1;
44 |
45 | date of establishment:
46 | within the last two years --> 2;
47 | more than two years ago --> 1;
48 |
49 | industry:
50 | catering and hotels --> 3;
51 | manufacturing --> 2;
52 | other --> 1;
53 |
54 | average daily logins:
55 | 5 or more --> 5;
56 | 4 --> 4;
57 | 3 --> 3;
58 | 2 --> 2;
59 | 0 or 1 --> 1;
60 |
61 | average daily invoice count:
62 | 10 or more --> 3;
63 | 5-10 --> 2;
64 | 0-5 --> 1;
65 |
66 | average daily invoice amount:
67 | 1000 or more --> 3;
68 | 100-1000 --> 2;
69 | below 100 --> 1;
70 | '''
71 |
72 |
73 | import pandas as pd
74 |
75 |
76 | def loadExcel(fname,index):
77 |     '''
78 |     Load data from an Excel file.
79 |     :param fname: Excel file name
80 |     :param index: columns to load
81 |     :return: DataFrame
82 |     '''
83 | df = pd.read_excel(fname)
84 | return df.ix[:, index]
85 |
86 | def R_eval(days):
87 |     '''
88 |     Recency score: days since the most recent purchase, the smaller the better.
89 |     :param days: days since the most recent purchase (relative to July 2019)
90 |     :return: 0-5
91 |     '''
92 | if int(days) == -1:return 0
93 | if abs(days) <90:score = 5
94 | elif 90<= abs(days) < 180:score = 4
95 | elif 180<= abs(days) < 360:score = 3
96 | elif 360<= abs(days) < 720:score = 2
97 | elif abs(days) >= 720:score = 1
98 | else:raise Exception('check days type. ERR')
99 | return score
100 |
101 | def F_eval(times):
102 |     '''
103 |     Frequency score: the more purchases, the more loyal, the higher the score.
104 |     :param times: number of purchases
105 |     :return: 0-5
106 |     '''
107 | if abs(times) > 4:score = 5
108 | elif 3< abs(times) <= 4:score = 4
109 | elif 2< abs(times) <= 3:score = 3
110 | elif 1< abs(times) <= 2:score = 2
111 | elif 0< abs(times) <= 1:score = 1
112 | elif times ==0:score=0
113 | else:raise Exception('check times type. ERR')
114 | return score
115 |
116 | def M_eval(je):
117 |     '''
118 |     Monetary score: the higher the spend, the higher the contribution.
119 |     :param je: amount spent
120 |     :return: 0-5
121 |     '''
122 | if abs(je) >= 2000:score = 5
123 | elif 900<= abs(je) < 2000: score = 4
124 | elif 370<= abs(je) < 900:score = 3
125 | elif 200<= abs(je) < 370:score = 2
126 | elif 0< abs(je) < 200:score = 1
127 | elif abs(je) == 0: score = 0
128 | else:raise Exception('check je type. ERR')
129 | return score
130 |
131 | def ZB_eval(zczb):
132 |     '''
133 |     Registered-capital score.
134 |     :param zczb: registered capital, in units of 10,000 yuan
135 |     :return: 0-5
136 |     '''
137 | if abs(zczb) >= 1000:score = 5
138 | elif 500<= abs(zczb) < 1000:score = 4
139 | elif 100<= abs(zczb) < 500:score = 3
140 | elif 50<= abs(zczb) < 100:score = 2
141 | elif 0< abs(zczb) < 50:score = 1
142 | else:score = 0
143 | return score
144 |
145 | def AGE_eval(age):
146 |     '''
147 |     Company-age score: recently established companies score higher.
148 |     :param age: company age in years
149 |     :return: 2, 1 or 0
150 |     '''
151 | if abs(age) > 2:score = 1
152 | elif 0< abs(age) <= 2:score = 2
153 | elif abs(age) == 0: score = 0
154 | else:raise Exception('check age type. ERR')
155 | return score
156 |
157 | def INDUSTRY_eval(indestry):
158 |     '''
159 |     Industry score, giving extra weight to catering/hotels and manufacturing.
160 |     :param indestry: industry category
161 |     :return: 3, 2 or 1
162 |     '''
163 | if indestry == '餐饮住宿':score = 3
164 | elif indestry == '制造业':score = 2
165 | else:score = 1
166 | return score
167 |
168 | def LOGINS_eval(logins):
169 |     '''
170 |     Average-daily-logins score.
171 |     :param logins: average daily logins
172 |     :return: 0-5
173 |     '''
174 | if abs(logins) >= 5:score = 5
175 | elif 4<= abs(logins) < 5:score = 4
176 | elif 3<= abs(logins) < 4:score = 3
177 | elif 2<= abs(logins) < 3:score = 2
178 | elif 0< abs(logins) < 2:score = 1
179 | else:score = 0
180 | return score
181 |
182 | def INVOICES_eval(invoices):
183 |     '''
184 |     Average-daily-invoice-count score.
185 |     :param invoices: average number of invoices issued per day
186 |     :return: 3, 2, 1 or 0
187 |     '''
188 | if abs(invoices) >= 10:score = 3
189 | elif 5<= abs(invoices) < 10:score = 2
190 | elif 0< abs(invoices) < 5:score = 1
191 | else:score = 0
192 | return score
193 |
194 | def INVOICEJE_eval(invoiceJe):
195 |     '''
196 |     Average-daily-invoice-amount score.
197 |     :param invoiceJe: average daily invoice amount
198 |     :return: 3, 2, 1 or 0
199 |     '''
200 | if abs(invoiceJe) >= 1000:score = 3
201 | elif 100<= abs(invoiceJe) < 1000:score = 2
202 | elif 0< abs(invoiceJe) < 100:score = 1
203 | else:score = 0
204 | return score
205 |
206 | def computeScore(scoreDic):
207 |     '''
208 |     Compute the user's product score from the per-feature rating dict.
209 |     :param scoreDic: the user's ratings for the product's features
210 |     :return: score for the product
211 |     '''
212 | if len(scoreDic) <1:raise Exception('scoreDic is none')
213 | if scoreDic.get('dayLoginTimes') == None:scoreDic['dayLoginTimes'] = 0
214 | score = scoreDic['recently'] * 1000 + scoreDic['je'] * 100 + scoreDic['times'] * 100 + \
215 | scoreDic['zczb'] * 10 + scoreDic['dayInvoiceNum'] * 10 + scoreDic['dayLoginTimes'] * 10 + \
216 | scoreDic['dayInvoiceJe'] * 10 + scoreDic['ages'] * 10 + scoreDic['industry'] * 1
217 | return score
218 |
219 | def computeTotalScore(scoreDic):
220 |     '''
221 |     Compute a user's overall recommendation score from the ratings across all products.
222 |     :param scoreDic: the user's rating for every feature
223 |     :return: total score for the product
224 |     index = ["times_wp","je_wp","deadline_wp","times_bqt","je_bqt","recently_bqt","userConsumeTotalTimes","userConsumeTotalAmount",
225 |              "deadline","loginFrequency","registeredCapital","industry","dateOfEstablishment","dayCountAvg","daySumAvg"]
226 |     '''
227 | if len(scoreDic) <1:raise Exception('scoreDic is none')
228 | if scoreDic.get('dayLoginTimes') == None:scoreDic['dayLoginTimes'] = 0
229 | score_common = scoreDic['registeredCapital'] * 10 + scoreDic['dayCountAvg'] * 30 + scoreDic['loginFrequency'] * 30 + \
230 | scoreDic['daySumAvg'] * 30 + scoreDic['dateOfEstablishment'] * 10 + scoreDic['industry'] * 10
231 | if scoreDic['deadline']==5: bft = 200 + (scoreDic['userConsumeTotalAmount'] + scoreDic['userConsumeTotalTimes'])* 50
232 | else:bft = 0
233 | if scoreDic['recently_bqt']==5: bqt = 100 + (scoreDic['je_bqt'] + scoreDic['times_bqt'])* 50
234 | else:bqt = 0
235 | if scoreDic['deadline_wp']==5: wp = 100 + (scoreDic['je_wp'] + scoreDic['times_wp'])* 50
236 | else:wp = 0
237 | score = bft + bqt + wp + score_common
238 | return score
239 |
240 |
241 | def evaluationTotal(row,index):
242 |     '''
243 |     Rate the input user features.
244 |     :param row: user feature data
245 |     :param index: selected columns
246 |     :return: total score and the rating dict
247 |     '''
248 | scoreDic = {}
249 | if len(row) != len(index):
250 | raise Exception('table header is incorrect')
251 | for i in index:
252 | if i == 'registeredCapital':
253 | zczb = int(row[i])
254 | score = ZB_eval(zczb)
255 | scoreDic['registeredCapital'] = score
256 | elif i =='industry':
257 | industry = row[i]
258 | score = INDUSTRY_eval(industry)
259 | scoreDic['industry'] = score
260 | elif i =='je':
261 | je = float(row[i])
262 | score = M_eval(je)
263 | scoreDic['je'] = score
264 | elif i =='times':
265 | times = int(row[i])
266 | score = F_eval(times)
267 | scoreDic['times'] = score
268 | elif i =='dayLoginTimes':
269 | dayLoginTimes = float(row[i])
270 | score = LOGINS_eval(dayLoginTimes)
271 | scoreDic['dayLoginTimes'] = score
272 | elif i =='dayInvoiceNum':
273 | dayInvoiceNum = float(row[i])
274 | score = INVOICES_eval(dayInvoiceNum)
275 | scoreDic['dayInvoiceNum'] = score
276 | elif i =='dayInvoiceJe':
277 | dayInvoiceJe = row[i]
278 | score = INVOICEJE_eval(dayInvoiceJe)
279 | scoreDic['dayInvoiceJe'] = score
280 | elif i =='ages':
281 | ages = float(row[i])
282 | score = AGE_eval(ages)
283 | scoreDic['ages'] = score
284 | elif i =='recently':
285 | recently = float(row[i])
286 | score = R_eval(recently)
287 | scoreDic['recently'] = score
288 | # print(scoreDic)
289 | SCORE = computeScore(scoreDic)
290 | return SCORE,scoreDic
291 |
292 | def evaluationTotalProduct(row,index):
293 |     '''
294 |     Rate all features of the input user.
295 |     :param row: user feature data
296 |     :param index: selected columns
297 |     :return: list of ratings, ordered like index
298 |     '''
299 | scoreDic = {}
300 | scoreArr = []
301 | if len(row) != len(index):
302 | raise Exception('table header is incorrect')
303 | for i in index:
304 | if i == 'registeredCapital':
305 | try:
306 | zczb = float(row[i])
307 | except:
308 | zczb = 0
309 | score = ZB_eval(zczb)
310 | scoreDic['registeredCapital'] = score
311 | elif i == 'industry':
312 | industry = row[i]
313 | score = INDUSTRY_eval(industry)
314 | scoreDic['industry'] = score
315 | elif i == 'je_wp':
316 | je = float(row[i])
317 | score = M_eval(je)
318 | scoreDic['je_wp'] = score
319 | elif i == 'userConsumeTotalAmount':
320 | je = float(row[i])
321 | score = M_eval(je)
322 | scoreDic['userConsumeTotalAmount'] = score
323 | elif i == 'je_bqt':
324 | je = float(row[i])
325 | score = M_eval(je)
326 | scoreDic['je_bqt'] = score
327 | elif i == 'times_wp':
328 | times = int(row[i])
329 | score = F_eval(times)
330 | scoreDic['times_wp'] = score
331 | elif i == 'userConsumeTotalTimes':
332 | times = int(row[i])
333 | score = F_eval(times)
334 | scoreDic['userConsumeTotalTimes'] = score
335 | elif i == 'times_bqt':
336 | times = int(row[i])
337 | score = F_eval(times)
338 | scoreDic['times_bqt'] = score
339 | elif i == 'loginFrequency':
340 | dayLoginTimes = float(row[i])
341 | score = LOGINS_eval(dayLoginTimes)
342 | scoreDic['loginFrequency'] = score
343 | elif i == 'dayCountAvg':
344 | dayInvoiceNum = float(row[i])
345 | score = INVOICES_eval(dayInvoiceNum)
346 | scoreDic['dayCountAvg'] = score
347 | elif i == 'daySumAvg':
348 | dayInvoiceJe = row[i]
349 | score = INVOICEJE_eval(dayInvoiceJe)
350 | scoreDic['daySumAvg'] = score
351 | elif i == 'dateOfEstablishment':
352 | ages = float(row[i])
353 | score = AGE_eval(ages)
354 | scoreDic['dateOfEstablishment'] = score
355 | elif i == 'deadline_wp':
356 | recently = float(row[i])
357 | score = R_eval(recently)
358 | scoreDic['deadline_wp'] = score
359 | elif i == 'deadline':
360 | recently = float(row[i])
361 | score = R_eval(recently)
362 | scoreDic['deadline'] = score
363 | elif i == 'recently_bqt':
364 | recently = float(row[i])
365 | score = R_eval(recently)
366 | scoreDic['recently_bqt'] = score
367 | else: raise Exception('unknown index field, please check the table header. err:', row)
368 | scoreArr.append(score)
369 | # print(scoreDic)
370 | return scoreArr
371 |
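# A minimal usage sketch (kept as a comment, like the example below): feed the
# per-feature scores back into computeTotalScore. The file name is hypothetical;
# the column list matches the keys handled in evaluationTotalProduct above.
#
# index = ['registeredCapital', 'industry', 'je_wp', 'userConsumeTotalAmount', 'je_bqt',
#          'times_wp', 'userConsumeTotalTimes', 'times_bqt', 'loginFrequency', 'dayCountAvg',
#          'daySumAvg', 'dateOfEstablishment', 'deadline_wp', 'deadline', 'recently_bqt']
# data = loadExcel('DataModel.xlsx', index)                 # hypothetical file name
# for n in range(len(data)):
#     row = dict(zip(index, data[n]))
#     scores = evaluationTotalProduct(row, index)           # per-feature scores, in index order
#     total = computeTotalScore(dict(zip(index, scores)))   # weighted RFM total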
372 | # if __name__ == '__main__':
373 | # # Prepare the data
374 | # fname = r'D:\baiwang\21.数据运营\recommender\user_point\评价\方案2\bft_origin2.xlsx'
375 | # # 百赋通
376 | # index = ['tax_id','zczb','industry','je','times','dayLoginTimes','dayInvoiceNum','dayInvoiceJe','ages','recently']
377 | # # 百企通,旺票
378 | # # index = ['tax_id','zczb','industry','times','je','recently','ages','dayInvoiceNum','dayInvoiceJe']
379 | # # index = ['tax_id','zczb','industry','times','je','recently','ages','dayInvoiceNum','dayInvoiceJe']
380 | # data_origin = loadExcel(fname,index)
381 | #
382 | # # Write the results to Excel
383 | # writer = pd.ExcelWriter('./bftClassfy.xlsx')
384 | # scoreArr = []
385 | # TaxidArr = []
386 | # zczbArr = []
387 | # industryArr = []
388 | # timesArr = []
389 | # dayLoginTimesArr = []
390 | # jeArr = []
391 | # recentlyArr = []
392 | # agesArr = []
393 | # dayInvoiceNumArr = []
394 | # dayInvoiceJeArr = []
395 | # # Iterate row by row
396 | # for i in range(len(data_origin)):
397 | # row = data_origin.iloc[i]
398 | # score,scoreDic = evaluationTotal(row, index)
399 | # TaxidArr.append(row['tax_id'])
400 | # scoreArr.append(score)
401 | # zczbArr.append(scoreDic.get('zczb'))
402 | # industryArr.append(scoreDic.get('industry'))
403 | # timesArr.append(scoreDic.get('times'))
404 | # jeArr.append(scoreDic.get('je'))
405 | # recentlyArr.append(scoreDic.get('recently'))
406 | # dayLoginTimesArr.append(scoreDic.get('dayLoginTimes'))
407 | # agesArr.append(scoreDic.get('ages'))
408 | # dayInvoiceNumArr.append(scoreDic.get('dayInvoiceNum'))
409 | # dayInvoiceJeArr.append(scoreDic.get('dayInvoiceJe'))
410 | # print(row['tax_id'],'score is:',score,'scoreDict is:',scoreDic)
411 | # pd_taxid = pd.DataFrame({'tax_id': TaxidArr})
412 | # pd_score = pd.DataFrame({'score': scoreArr})
413 | # pd_zczb = pd.DataFrame({'zczb': zczbArr})
414 | # pd_industry = pd.DataFrame({'industry': industryArr})
415 | # pd_times = pd.DataFrame({'times': timesArr})
416 | # pd_je = pd.DataFrame({'je': jeArr})
417 | # pd_recently = pd.DataFrame({'recently': recentlyArr})
418 | # pd_dayLoginTimes = pd.DataFrame({'dayLoginTimes': dayLoginTimesArr})
419 | # pd_ages = pd.DataFrame({'ages': agesArr})
420 | # pd_dayInvoiceNum = pd.DataFrame({'dayInvoiceNum': dayInvoiceNumArr})
421 | # pd_dayInvoiceJe = pd.DataFrame({'dayInvoiceJe': dayInvoiceJeArr})
422 | # pd_taxid.to_excel(writer, sheet_name='Sheet1', startcol=0, index=False)
423 | # pd_score.to_excel(writer, sheet_name='Sheet1', startcol=1, index=False)
424 | # pd_zczb.to_excel(writer, sheet_name='Sheet1', startcol=2, index=False)
425 | # pd_industry.to_excel(writer, sheet_name='Sheet1', startcol=3, index=False)
426 | # pd_times.to_excel(writer, sheet_name='Sheet1', startcol=4, index=False)
427 | # pd_je.to_excel(writer, sheet_name='Sheet1', startcol=5, index=False)
428 | # pd_recently.to_excel(writer, sheet_name='Sheet1', startcol=6, index=False)
429 | # pd_dayLoginTimes.to_excel(writer, sheet_name='Sheet1', startcol=10, index=False)
430 | # pd_ages.to_excel(writer, sheet_name='Sheet1', startcol=7, index=False)
431 | # pd_dayInvoiceNum.to_excel(writer, sheet_name='Sheet1', startcol=8, index=False)
432 | # pd_dayInvoiceJe.to_excel(writer, sheet_name='Sheet1', startcol=9, index=False)
433 | # # Required; without it, saving raises an error
434 | # writer.save()
435 |
--------------------------------------------------------------------------------
/user_point/giveScores.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 | # __*__ coding: utf-8 __*__
3 |
4 | '''
5 | @Author: simonKing
6 | @License: (C) Copyright 2013-2019, Best Wonder Corporation Limited.
7 | @Os:Windows 10 x64
8 | @Contact: bw_wangxiaomeng@whty.com.cn
9 | @Software: PY PyCharm
10 | @File: giveScores.py
11 | @Time: 2019/7/3 9:24
12 | @Desc: define your function
13 | '''
14 |
15 | '''
16 | ###############################################################
17 | * Purpose: score each user for 百旺 and 百赋通 on a 0~5 scale, based on usage data
18 | * Pipeline: 1. data cleaning; 2. data encoding; 3. K-means to generate the scores
19 | * Input: the user's usage data on 百赋通 and consumption data on 百旺
20 | * Output: a score per user for 百赋通 and for 百旺
21 | ###############################################################
22 |
23 | R, recency / time to expiry: the more recent, the higher the score (max 5, min 1)
24 | purchased within 90 days: 5
25 | no purchase for 90-180 days: 4
26 | no purchase for 180-360 days: 3
27 | 360-720 days: 2
28 | more than 720 days: 1
29 |
30 |
31 | F, transaction frequency: the higher, the higher the score (max 5, min 1)
32 | range [1, 24], mean 1.1
33 | 1 --> 1
34 | 2 --> 2
35 | 3 --> 3
36 | 4 --> 4
37 | 5 or more --> 5
38 |
39 | M, transaction amount: the higher, the higher the score (max 5, min 1); mean amount 374
40 | above 2000 --> 5
41 | 900~2000 --> 4
42 | 370~900 --> 3
43 | 200~370 --> 2
44 | 0~200 --> 1
45 |
46 | Registered capital:
47 | above 10,000,000 --> 5
48 | 5,000,000-10,000,000 --> 4
49 | 1,000,000-5,000,000 --> 3
50 | 500,000-1,000,000 --> 2
51 | below 500,000 or unknown --> 1
52 |
53 | Date of establishment:
54 | within the last two years --> 2
55 | more than two years ago --> 1
56 |
57 | Industry:
58 | catering and lodging --> 3
59 | manufacturing --> 2
60 | other --> 1
61 |
62 | Average daily logins:
63 | 5 or more --> 5
64 | 4 --> 4
65 | 3 --> 3
66 | 2 --> 2
67 | 0 or 1 --> 1
68 |
69 | Average daily invoice count:
70 | 10 or more --> 3
71 | 5-10 --> 2
72 | 0-5 --> 1
73 |
74 | Average daily invoice amount:
75 | 1000 or more --> 3
76 | 100-1000 --> 2
77 | below 100 --> 1
78 |
79 | '''
80 |
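# A minimal sketch of how the banded rules above could be written as scoring
# functions. The project's own evaluators (R_eval, M_eval, ... in
# user_point/evaluationFunction.py) may differ in names and boundary handling;
# this is only an illustration of the mapping.
#
# def R_eval_sketch(days):
#     '''R band: days since the last purchase (or to expiry) -> score 1~5.'''
#     if days <= 90: return 5
#     if days <= 180: return 4
#     if days <= 360: return 3
#     if days <= 720: return 2
#     return 1
#
# def M_eval_sketch(amount):
#     '''M band: transaction amount -> score 1~5.'''
#     if amount > 2000: return 5
#     if amount > 900: return 4
#     if amount > 370: return 3
#     if amount > 200: return 2
#     return 1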
81 | import numpy as np
82 | import openpyxl
83 | import pandas as pd
84 | from sklearn.cluster import KMeans,MiniBatchKMeans
85 | from sklearn.decomposition import PCA
86 | import matplotlib.pyplot as plt
87 | from mpl_toolkits.mplot3d.axes3d import Axes3D
88 |
89 |
90 |
91 |
92 | def loadData(fname,splitchar='\t'):
93 | '''
94 | Load data from a delimited text file
95 | :param fname: file name
96 | :param splitchar: field delimiter
97 | :return: input matrix (numpy array)
98 | '''
99 | f = open(fname)
100 | X = [[float(v.split(splitchar)[0].strip()), float(v.split(splitchar)[1].strip()),float(v.split(splitchar)[2].strip())] for v in f]
101 | # X = [[float(v.split(splitchar)[0].strip()), float(v.split(splitchar)[1].strip()), float(v.split(splitchar)[2].strip()), float(v.split(splitchar)[3].strip())
102 | # , float(v.split(splitchar)[4].strip()), float(v.split(splitchar)[5].strip()), float(v.split(splitchar)[6].strip()), float(v.split(splitchar)[7].strip())] for v in f]
103 | X = np.array(X)
104 | return X
105 |
106 | def loadExcel(fname,index):
107 | '''
108 | Load data from an Excel file
109 | :param fname: excel file name
110 | :param index: the columns to load
111 | :return: matrix (numpy array)
112 | '''
113 | df = pd.read_excel(fname)
114 | X = df.loc[:, index]  # .loc instead of the deprecated DataFrame.ix
115 | # print(X)
116 | # print(type(X))
117 | Input_X = np.array(X)
118 | return Input_X
119 |
120 | def insertExcel(fname,col,data):
121 | wb = openpyxl.load_workbook(fname)
122 | ws = wb.worksheets[0]
123 | # ws.insert_cols(col)
124 | for index,row in enumerate(ws.rows):
125 | if index ==0:
126 | row[col+1].value = '评分'
127 | else:
128 | # ws.rows has one more row than data (the header row)
129 | row[col+1].value = data.tolist()[index-1]
130 | wb.save('./new.xlsx')
131 |
132 | def Kderiv(Input,n=2):
133 | '''
134 | Second-derivative helper: np.poly1d treats Input as polynomial coefficients
135 | :param Input: coefficient sequence of the polynomial
136 | :param n: derivative order (currently unused; the 2nd derivative is always returned)
137 | :return: the 2nd-derivative polynomial
138 | '''
139 | fun = np.poly1d(Input)
140 | fun_1 = np.poly1d.deriv(fun)
141 | fun_2 = np.poly1d.deriv(fun_1)
142 | return fun_2
143 |
144 | def EmCalcu(data,labels,k):
145 | '''
146 | Compute, per cluster, the expectation (share) of users who bought 百赋通
147 | :param data: original data, with the purchase indicator in the last column
148 | :param labels: cluster labels
149 | :param k: number of clusters
150 | :return:
151 | '''
152 | bestBuy = []
153 | for i in range(k):
154 | buy = 0
155 | nbuy = 0
156 | for j in range(len(labels)):
157 | if i == labels[j]:
158 | if data[j][-1] > 0:
159 | buy = buy + 1
160 | elif data[j][-1] ==0:
161 | nbuy = nbuy + 1
162 | else:
163 | raise Exception('label not right')
164 | idx = labels.tolist().index(i)
165 | buyTimes = buy+ nbuy
166 | buyRate = buy/buyTimes
167 | print('K is %s, label is %s , buyRate is %s ,buyTime is %s'%(k,i,buyRate,buyTimes))
168 | print('example point:',data[idx])
169 | print("***"*30)
170 |
171 |
172 | def choiceK(data):
173 | '''
174 | Elbow method: use MiniBatchKMeans to look for the best K value
175 | :param data: input data set
176 | :return: None (plots and saves the SSE curve; the best K is read off the elbow)
177 | '''
178 | data = [row for row in data if row[1] > 0]
179 | raw_data = [row[:6] for row in data]
180 | sse = []
181 | for k in range(1, 11):
182 | # estimator = KMeans(n_clusters=k)
183 | estimator = MiniBatchKMeans(init='k-means++', n_clusters=k, batch_size=100000, n_init=10, max_no_improvement=10, verbose=0)
184 | estimator.fit(raw_data)
185 | labels = estimator.labels_
186 | EmCalcu(data,labels,k)
187 | sse.append(estimator.inertia_)
188 | # data2 = []
189 | # for d in data.tolist():
190 | # if d not in data2:
191 | # data2.append(d)
192 | # data2 = np.array(data2)
193 | # plot the clustering result for each K
194 | # label_pred = estimator.labels_
195 | # centroids = estimator.cluster_centers_
196 | # plotKmeans(data, k, centroids, label_pred).subplot(330+k)
197 | X = range(1, 11)
198 | plt.xlabel('K')
199 | plt.ylabel('SSE')
200 | plt.title("choose the best K value")
201 | plt.plot(X, sse, 'o-')
202 | plt.savefig('./shouzhou.png')
203 | plt.show()
204 | return
205 |
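# A minimal usage sketch (hypothetical file name and column list; the last column
# is assumed to be the 百赋通 purchase flag, and column 1 the field filtered on above):
#
# cols = ['registeredCapital', 'loginFrequency', 'dayCountAvg', 'daySumAvg',
#         'dateOfEstablishment', 'industry', 'isBuy']
# X = loadExcel('DataModel.xlsx', cols)
# choiceK(X)          # plots/saves the SSE curve; pick K at the elbow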
206 | def KmeansModel(data,k):
207 | '''
208 | Build and fit the K-means clusterer
209 | :param data: training data
210 | :param k: number of clusters
211 | :return: the fitted estimator
212 | '''
213 | estimator = KMeans(n_clusters=k)
214 | estimator.fit(data)
215 | return estimator
216 |
217 | def KmeansPredict(clf,Input):
218 | '''
219 | Assign cluster labels with a fitted K-means model
220 | :param clf: the fitted K-means estimator
221 | :param Input: data to be assigned
222 | :return: cluster labels
223 | '''
224 | labels = clf.predict(Input)
225 | return labels
226 |
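# A minimal sketch tying the two helpers to EmCalcu (X as in the sketch above,
# with the purchase flag in the last column; k is e.g. the value read off the elbow):
#
# k = 4
# features = [row[:6] for row in X]
# clf = KmeansModel(features, k)
# labels = KmeansPredict(clf, features)
# EmCalcu(X, labels, k)        # purchase expectation per cluster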
227 | def plotKmeans(dataSet,k,centroids,label_pred):
228 | '''
229 | Plot the K-means clustering result
230 | :param dataSet: input data set
231 | :param k: number of clusters
232 | :param centroids: cluster centroids
233 | :param label_pred: predicted labels
234 | :return:
235 | '''
236 | mark = [ '^r', '+b', 'sg', 'x',
--------------------------------------------------------------------------------
/user_recom/decesionTree.dot:
--------------------------------------------------------------------------------
4 | 0 [label=gini = 0.158
samples = 85315
value = [77980, 6724, 611]
class = low>, fillcolor="#e58139e7"] ;
5 | 1 [label=gini = 0.087
samples = 81726
value = [77980, 3746, 0]
class = low>, fillcolor="#e58139f3"] ;
6 | 0 -> 1 [labeldistance=2.5, labelangle=45, headlabel="True"] ;
7 | 2 [label=gini = 0.0
samples = 66828
value = [66815, 13, 0]
class = low>, fillcolor="#e58139ff"] ;
8 | 1 -> 2 ;
9 | 3 [label=samples = 66670
value = [66670, 0, 0]
class = low>, fillcolor="#e58139ff"] ;
10 | 2 -> 3 ;
11 | 4 [label=gini = 0.151
samples = 158
value = [145, 13, 0]
class = low>, fillcolor="#e58139e8"] ;
12 | 2 -> 4 ;
13 | 5 [label=gini = 0.052
samples = 149
value = [145, 4, 0]
class = low>, fillcolor="#e58139f8"] ;
14 | 4 -> 5 ;
15 | 6 [label=samples = 140
value = [140, 0, 0]
class = low>, fillcolor="#e58139ff"] ;
16 | 5 -> 6 ;
17 | 7 [label=gini = 0.494
samples = 9
value = [5, 4, 0]
class = low>, fillcolor="#e5813933"] ;
18 | 5 -> 7 ;
19 | 8 [label=samples = 3
value = [3, 0, 0]
class = low>, fillcolor="#e58139ff"] ;
20 | 7 -> 8 ;
21 | 9 [label=gini = 0.444
samples = 6
value = [2, 4, 0]
class = medium>, fillcolor="#39e5817f"] ;
22 | 7 -> 9 ;
23 | 10 [label=samples = 4
value = [0, 4, 0]
class = medium>, fillcolor="#39e581ff"] ;
24 | 9 -> 10 ;
25 | 11 [label=samples = 2
value = [2, 0, 0]
class = low>, fillcolor="#e58139ff"] ;
26 | 9 -> 11 ;
27 | 12 [label=samples = 9
value = [0, 9, 0]
class = medium>, fillcolor="#39e581ff"] ;
28 | 4 -> 12 ;
29 | 13 [label=gini = 0.376
samples = 14898
value = [11165, 3733, 0]
class = low>, fillcolor="#e58139aa"] ;
30 | 1 -> 13 ;
31 | 14 [label=samples = 11034
value = [11034, 0, 0]
class = low>, fillcolor="#e58139ff"] ;
32 | 13 -> 14 ;
33 | 15 [label=gini = 0.066
samples = 3864
value = [131, 3733, 0]
class = medium>, fillcolor="#39e581f6"] ;
34 | 13 -> 15 ;
35 | 16 [label=gini = 0.006
samples = 3744
value = [11, 3733, 0]
class = medium>, fillcolor="#39e581fe"] ;
36 | 15 -> 16 ;
37 | 17 [label=gini = 0.099
samples = 210
value = [11, 199, 0]
class = medium>, fillcolor="#39e581f1"] ;
38 | 16 -> 17 ;
39 | 18 [label=samples = 11
value = [11, 0, 0]
class = low>, fillcolor="#e58139ff"] ;
40 | 17 -> 18 ;
41 | 19 [label=samples = 199
value = [0, 199, 0]
class = medium>, fillcolor="#39e581ff"] ;
42 | 17 -> 19 ;
43 | 20 [label=samples = 3534
value = [0, 3534, 0]
class = medium>, fillcolor="#39e581ff"] ;
44 | 16 -> 20 ;
45 | 21 [label=samples = 120
value = [120, 0, 0]
class = low>, fillcolor="#e58139ff"] ;
46 | 15 -> 21 ;
47 | 22 [label=gini = 0.283
samples = 3589
value = [0, 2978, 611]
class = medium>, fillcolor="#39e581cb"] ;
48 | 0 -> 22 [labeldistance=2.5, labelangle=-45, headlabel="False"] ;
49 | 23 [label=gini = 0.085
samples = 3115
value = [0, 2976, 139]
class = medium>, fillcolor="#39e581f3"] ;
50 | 22 -> 23 ;
51 | 24 [label=gini = 0.007
samples = 2475
value = [0, 2466, 9]
class = medium>, fillcolor="#39e581fe"] ;
52 | 23 -> 24 ;
53 | 25 [label=samples = 2454
value = [0, 2454, 0]
class = medium>, fillcolor="#39e581ff"] ;
54 | 24 -> 25 ;
55 | 26 [label=gini = 0.49
samples = 21
value = [0, 12, 9]
class = medium>, fillcolor="#39e58140"] ;
56 | 24 -> 26 ;
57 | 27 [label=samples = 12
value = [0, 12, 0]
class = medium>, fillcolor="#39e581ff"] ;
58 | 26 -> 27 ;
59 | 28 [label=samples = 9
value = [0, 0, 9]
class = high>, fillcolor="#8139e5ff"] ;
60 | 26 -> 28 ;
61 | 29 [label=gini = 0.324
samples = 640
value = [0, 510, 130]
class = medium>, fillcolor="#39e581be"] ;
62 | 23 -> 29 ;
63 | 30 [label=gini = 0.132
samples = 506
value = [0, 470, 36]
class = medium>, fillcolor="#39e581eb"] ;
64 | 29 -> 30 ;
65 | 31 [label=gini = 0.086
samples = 489
value = [0, 467, 22]
class = medium>, fillcolor="#39e581f3"] ;
66 | 30 -> 31 ;
67 | 32 [label=gini = 0.054
samples = 434
value = [0, 422, 12]
class = medium>, fillcolor="#39e581f8"] ;
68 | 31 -> 32 ;
69 | 33 [label=gini = 0.025
samples = 399
value = [0, 394, 5]
class = medium>, fillcolor="#39e581fc"] ;
70 | 32 -> 33 ;
71 | 34 [label=gini = 0.021
samples = 386
value = [0, 382, 4]
class = medium>, fillcolor="#39e581fc"] ;
72 | 33 -> 34 ;
73 | 35 [label=gini = 0.013
samples = 301
value = [0, 299, 2]
class = medium>, fillcolor="#39e581fd"] ;
74 | 34 -> 35 ;
75 | 36 [label=samples = 168
value = [0, 168, 0]
class = medium>, fillcolor="#39e581ff"] ;
76 | 35 -> 36 ;
77 | 37 [label=gini = 0.03
samples = 133
value = [0, 131, 2]
class = medium>, fillcolor="#39e581fb"] ;
78 | 35 -> 37 ;
79 | 38 [label=gini = 0.165
samples = 11
value = [0, 10, 1]
class = medium>, fillcolor="#39e581e6"] ;
80 | 37 -> 38 ;
81 | 39 [label=samples = 6
value = [0, 6, 0]
class = medium>, fillcolor="#39e581ff"] ;
82 | 38 -> 39 ;
83 | 40 [label=gini = 0.32
samples = 5
value = [0, 4, 1]
class = medium>, fillcolor="#39e581bf"] ;
84 | 38 -> 40 ;
85 | 41 [label=samples = 2
value = [0, 2, 0]
class = medium>, fillcolor="#39e581ff"] ;
86 | 40 -> 41 ;
87 | 42 [label=gini = 0.444
samples = 3
value = [0, 2, 1]
class = medium>, fillcolor="#39e5817f"] ;
88 | 40 -> 42 ;
89 | 43 [label=samples = 2
value = [0, 1, 1]
class = medium>, fillcolor="#39e58100"] ;
90 | 42 -> 43 ;
91 | 44 [label=samples = 1
value = [0, 1, 0]
class = medium>, fillcolor="#39e581ff"] ;
92 | 42 -> 44 ;
93 | 45 [label=gini = 0.016
samples = 122
value = [0, 121, 1]
class = medium>, fillcolor="#39e581fd"] ;
94 | 37 -> 45 ;
95 | 46 [label=samples = 35
value = [0, 35, 0]
class = medium>, fillcolor="#39e581ff"] ;
96 | 45 -> 46 ;
97 | 47 [label=gini = 0.023
samples = 87
value = [0, 86, 1]
class = medium>, fillcolor="#39e581fc"] ;
98 | 45 -> 47 ;
99 | 48 [label=samples = 22
value = [0, 22, 0]
class = medium>, fillcolor="#39e581ff"] ;
100 | 47 -> 48 ;
101 | 49 [label=gini = 0.03
samples = 65
value = [0, 64, 1]
class = medium>, fillcolor="#39e581fb"] ;
102 | 47 -> 49 ;
103 | 50 [label=gini = 0.033
samples = 60
value = [0, 59, 1]
class = medium>, fillcolor="#39e581fb"] ;
104 | 49 -> 50 ;
105 | 51 [label=samples = 59
value = [0, 58, 1]
class = medium>, fillcolor="#39e581fb"] ;
106 | 50 -> 51 ;
107 | 52 [label=samples = 1
value = [0, 1, 0]
class = medium>, fillcolor="#39e581ff"] ;
108 | 50 -> 52 ;
109 | 53 [label=samples = 5
value = [0, 5, 0]
class = medium>, fillcolor="#39e581ff"] ;
110 | 49 -> 53 ;
111 | 54 [label=gini = 0.046
samples = 85
value = [0, 83, 2]
class = medium>, fillcolor="#39e581f9"] ;
112 | 34 -> 54 ;
113 | 55 [label=gini = 0.059
samples = 66
value = [0, 64, 2]
class = medium>, fillcolor="#39e581f7"] ;
114 | 54 -> 55 ;
115 | 56 [label=gini = 0.1
samples = 19
value = [0, 18, 1]
class = medium>, fillcolor="#39e581f1"] ;
116 | 55 -> 56 ;
117 | 57 [label=samples = 6
value = [0, 6, 0]
class = medium>, fillcolor="#39e581ff"] ;
118 | 56 -> 57 ;
119 | 58 [label=samples = 13
value = [0, 12, 1]
class = medium>, fillcolor="#39e581ea"] ;
120 | 56 -> 58 ;
121 | 59 [label=gini = 0.042
samples = 47
value = [0, 46, 1]
class = medium>, fillcolor="#39e581f9"] ;
122 | 55 -> 59 ;
123 | 60 [label=samples = 27
value = [0, 26, 1]
class = medium>, fillcolor="#39e581f5"] ;
124 | 59 -> 60 ;
125 | 61 [label=samples = 20
value = [0, 20, 0]
class = medium>, fillcolor="#39e581ff"] ;
126 | 59 -> 61 ;
127 | 62 [label=samples = 19
value = [0, 19, 0]
class = medium>, fillcolor="#39e581ff"] ;
128 | 54 -> 62 ;
129 | 63 [label=gini = 0.142
samples = 13
value = [0, 12, 1]
class = medium>, fillcolor="#39e581ea"] ;
130 | 33 -> 63 ;
131 | 64 [label=samples = 4
value = [0, 4, 0]
class = medium>, fillcolor="#39e581ff"] ;
132 | 63 -> 64 ;
133 | 65 [label=gini = 0.198
samples = 9
value = [0, 8, 1]
class = medium>, fillcolor="#39e581df"] ;
134 | 63 -> 65 ;
135 | 66 [label=samples = 2
value = [0, 2, 0]
class = medium>, fillcolor="#39e581ff"] ;
136 | 65 -> 66 ;
137 | 67 [label=gini = 0.245
samples = 7
value = [0, 6, 1]
class = medium>, fillcolor="#39e581d4"] ;
138 | 65 -> 67 ;
139 | 68 [label=samples = 1
value = [0, 1, 0]
class = medium>, fillcolor="#39e581ff"] ;
140 | 67 -> 68 ;
141 | 69 [label=gini = 0.278
samples = 6
value = [0, 5, 1]
class = medium>, fillcolor="#39e581cc"] ;
142 | 67 -> 69 ;
143 | 70 [label=samples = 5
value = [0, 4, 1]
class = medium>, fillcolor="#39e581bf"] ;
144 | 69 -> 70 ;
145 | 71 [label=samples = 1
value = [0, 1, 0]
class = medium>, fillcolor="#39e581ff"] ;
146 | 69 -> 71 ;
147 | 72 [label=gini = 0.32
samples = 35
value = [0, 28, 7]
class = medium>, fillcolor="#39e581bf"] ;
148 | 32 -> 72 ;
149 | 73 [label=samples = 28
value = [0, 28, 0]
class = medium>, fillcolor="#39e581ff"] ;
150 | 72 -> 73 ;
151 | 74 [label=samples = 7
value = [0, 0, 7]
class = high>, fillcolor="#8139e5ff"] ;
152 | 72 -> 74 ;
153 | 75 [label=gini = 0.298
samples = 55
value = [0, 45, 10]
class = medium>, fillcolor="#39e581c6"] ;
154 | 31 -> 75 ;
155 | 76 [label=gini = 0.26
samples = 52
value = [0, 44, 8]
class = medium>, fillcolor="#39e581d1"] ;
156 | 75 -> 76 ;
157 | 77 [label=samples = 15
value = [0, 15, 0]
class = medium>, fillcolor="#39e581ff"] ;
158 | 76 -> 77 ;
159 | 78 [label=gini = 0.339
samples = 37
value = [0, 29, 8]
class = medium>, fillcolor="#39e581b9"] ;
160 | 76 -> 78 ;
161 | 79 [label=gini = 0.313
samples = 36
value = [0, 29, 7]
class = medium>, fillcolor="#39e581c1"] ;
162 | 78 -> 79 ;
163 | 80 [label=gini = 0.342
samples = 32
value = [0, 25, 7]
class = medium>, fillcolor="#39e581b8"] ;
164 | 79 -> 80 ;
165 | 81 [label=samples = 12
value = [0, 10, 2]
class = medium>, fillcolor="#39e581cc"] ;
166 | 80 -> 81 ;
167 | 82 [label=samples = 20
value = [0, 15, 5]
class = medium>, fillcolor="#39e581aa"] ;
168 | 80 -> 82 ;
169 | 83 [label=samples = 4
value = [0, 4, 0]
class = medium>, fillcolor="#39e581ff"] ;
170 | 79 -> 83 ;
171 | 84 [label=samples = 1
value = [0, 0, 1]
class = high>, fillcolor="#8139e5ff"] ;
172 | 78 -> 84 ;
173 | 85 [label=gini = 0.444
samples = 3
value = [0, 1, 2]
class = high>, fillcolor="#8139e57f"] ;
174 | 75 -> 85 ;
175 | 86 [label=samples = 2
value = [0, 1, 1]
class = medium>, fillcolor="#39e58100"] ;
176 | 85 -> 86 ;
177 | 87 [label=samples = 1
value = [0, 0, 1]
class = high>, fillcolor="#8139e5ff"] ;
178 | 85 -> 87 ;
179 | 88 [label=gini = 0.291
samples = 17
value = [0, 3, 14]
class = high>, fillcolor="#8139e5c8"] ;
180 | 30 -> 88 ;
181 | 89 [label=gini = 0.5
samples = 6
value = [0, 3, 3]
class = medium>, fillcolor="#39e58100"] ;
182 | 88 -> 89 ;
183 | 90 [label=gini = 0.48
samples = 5
value = [0, 3, 2]
class = medium>, fillcolor="#39e58155"] ;
184 | 89 -> 90 ;
185 | 91 [label=samples = 1
value = [0, 1, 0]
class = medium>, fillcolor="#39e581ff"] ;
186 | 90 -> 91 ;
187 | 92 [label=gini = 0.5
samples = 4
value = [0, 2, 2]
class = medium>, fillcolor="#39e58100"] ;
188 | 90 -> 92 ;
189 | 93 [label=gini = 0.444
samples = 3
value = [0, 1, 2]
class = high>, fillcolor="#8139e57f"] ;
190 | 92 -> 93 ;
191 | 94 [label=samples = 2
value = [0, 1, 1]
class = medium>, fillcolor="#39e58100"] ;
192 | 93 -> 94 ;
193 | 95 [label=samples = 1
value = [0, 0, 1]
class = high>, fillcolor="#8139e5ff"] ;
194 | 93 -> 95 ;
195 | 96 [label=samples = 1
value = [0, 1, 0]
class = medium>, fillcolor="#39e581ff"] ;
196 | 92 -> 96 ;
197 | 97 [label=samples = 1
value = [0, 0, 1]
class = high>, fillcolor="#8139e5ff"] ;
198 | 89 -> 97 ;
199 | 98 [label=samples = 11
value = [0, 0, 11]
class = high>, fillcolor="#8139e5ff"] ;
200 | 88 -> 98 ;
201 | 99 [label=gini = 0.419
samples = 134
value = [0, 40, 94]
class = high>, fillcolor="#8139e592"] ;
202 | 29 -> 99 ;
203 | 100 [label=gini = 0.399
samples = 40
value = [0, 29, 11]
class = medium>, fillcolor="#39e5819e"] ;
204 | 99 -> 100 ;
205 | 101 [label=gini = 0.238
samples = 29
value = [0, 25, 4]
class = medium>, fillcolor="#39e581d6"] ;
206 | 100 -> 101 ;
207 | 102 [label=gini = 0.191
samples = 28
value = [0, 25, 3]
class = medium>, fillcolor="#39e581e0"] ;
208 | 101 -> 102 ;
209 | 103 [label=gini = 0.091
samples = 21
value = [0, 20, 1]
class = medium>, fillcolor="#39e581f2"] ;
210 | 102 -> 103 ;
211 | 104 [label=samples = 19
value = [0, 19, 0]
class = medium>, fillcolor="#39e581ff"] ;
212 | 103 -> 104 ;
213 | 105 [label=gini = 0.5
samples = 2
value = [0, 1, 1]
class = medium>, fillcolor="#39e58100"] ;
214 | 103 -> 105 ;
215 | 106 [label=samples = 1
value = [0, 1, 0]
class = medium>, fillcolor="#39e581ff"] ;
216 | 105 -> 106 ;
217 | 107 [label=samples = 1
value = [0, 0, 1]
class = high>, fillcolor="#8139e5ff"] ;
218 | 105 -> 107 ;
219 | 108 [label=gini = 0.408
samples = 7
value = [0, 5, 2]
class = medium>, fillcolor="#39e58199"] ;
220 | 102 -> 108 ;
221 | 109 [label=samples = 5
value = [0, 5, 0]
class = medium>, fillcolor="#39e581ff"] ;
222 | 108 -> 109 ;
223 | 110 [label=samples = 2
value = [0, 0, 2]
class = high>, fillcolor="#8139e5ff"] ;
224 | 108 -> 110 ;
225 | 111 [label=samples = 1
value = [0, 0, 1]
class = high>, fillcolor="#8139e5ff"] ;
226 | 101 -> 111 ;
227 | 112 [label=gini = 0.463
samples = 11
value = [0, 4, 7]
class = high>, fillcolor="#8139e56d"] ;
228 | 100 -> 112 ;
229 | 113 [label=gini = 0.444
samples = 6
value = [0, 4, 2]
class = medium>, fillcolor="#39e5817f"] ;
230 | 112 -> 113 ;
231 | 114 [label=samples = 4
value = [0, 4, 0]
class = medium>, fillcolor="#39e581ff"] ;
232 | 113 -> 114 ;
233 | 115 [label=samples = 2
value = [0, 0, 2]
class = high>, fillcolor="#8139e5ff"] ;
234 | 113 -> 115 ;
235 | 116 [label=samples = 5
value = [0, 0, 5]
class = high>, fillcolor="#8139e5ff"] ;
236 | 112 -> 116 ;
237 | 117 [label=gini = 0.207
samples = 94
value = [0, 11, 83]
class = high>, fillcolor="#8139e5dd"] ;
238 | 99 -> 117 ;
239 | 118 [label=gini = 0.359
samples = 47
value = [0, 11, 36]
class = high>, fillcolor="#8139e5b1"] ;
240 | 117 -> 118 ;
241 | 119 [label=gini = 0.477
samples = 28
value = [0, 11, 17]
class = high>, fillcolor="#8139e55a"] ;
242 | 118 -> 119 ;
243 | 120 [label=gini = 0.43
samples = 16
value = [0, 11, 5]
class = medium>, fillcolor="#39e5818b"] ;
244 | 119 -> 120 ;
245 | 121 [label=gini = 0.165
samples = 11
value = [0, 10, 1]
class = medium>, fillcolor="#39e581e6"] ;
246 | 120 -> 121 ;
247 | 122 [label=gini = 0.32
samples = 5
value = [0, 4, 1]
class = medium>, fillcolor="#39e581bf"] ;
248 | 121 -> 122 ;
249 | 123 [label=samples = 3
value = [0, 3, 0]
class = medium>, fillcolor="#39e581ff"] ;
250 | 122 -> 123 ;
251 | 124 [label=gini = 0.5
samples = 2
value = [0, 1, 1]
class = medium>, fillcolor="#39e58100"] ;
252 | 122 -> 124 ;
253 | 125 [label=samples = 1
value = [0, 0, 1]
class = high>, fillcolor="#8139e5ff"] ;
254 | 124 -> 125 ;
255 | 126 [label=samples = 1
value = [0, 1, 0]
class = medium>, fillcolor="#39e581ff"] ;
256 | 124 -> 126 ;
257 | 127 [label=samples = 6
value = [0, 6, 0]
class = medium>, fillcolor="#39e581ff"] ;
258 | 121 -> 127 ;
259 | 128 [label=gini = 0.32
samples = 5
value = [0, 1, 4]
class = high>, fillcolor="#8139e5bf"] ;
260 | 120 -> 128 ;
261 | 129 [label=samples = 1
value = [0, 1, 0]
class = medium>, fillcolor="#39e581ff"] ;
262 | 128 -> 129 ;
263 | 130 [label=samples = 4
value = [0, 0, 4]
class = high>, fillcolor="#8139e5ff"] ;
264 | 128 -> 130 ;
265 | 131 [label=samples = 12
value = [0, 0, 12]
class = high>, fillcolor="#8139e5ff"] ;
266 | 119 -> 131 ;
267 | 132 [label=samples = 19
value = [0, 0, 19]
class = high>, fillcolor="#8139e5ff"] ;
268 | 118 -> 132 ;
269 | 133 [label=samples = 47
value = [0, 0, 47]
class = high>, fillcolor="#8139e5ff"] ;
270 | 117 -> 133 ;
271 | 134 [label=gini = 0.008
samples = 474
value = [0, 2, 472]
class = high>, fillcolor="#8139e5fe"] ;
272 | 22 -> 134 ;
273 | 135 [label=samples = 2
value = [0, 2, 0]
class = medium>, fillcolor="#39e581ff"] ;
274 | 134 -> 135 ;
275 | 136 [label=samples = 472
value = [0, 0, 472]
class = high>, fillcolor="#8139e5ff"] ;
276 | 134 -> 136 ;
277 | }
--------------------------------------------------------------------------------
/utils/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 | # __*__ coding: utf-8 __*__
3 |
4 | '''
5 | @Author: simonKing
6 | @License: (C) Copyright 2013-2019, Best Wonder Corporation Limited.
7 | @Os:Windows 10 x64
8 | @Contact: bw_wangxiaomeng@whty.com.cn
9 | @Software: PY PyCharm
10 | @File: __init__.py
11 | @Time: 2019/6/25 10:37
12 | @Desc: define your function
13 | '''
--------------------------------------------------------------------------------
/utils/cutScopebusiness.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 | # __*__ coding: utf-8 __*__
3 |
4 | '''
5 | @Author: simonKing
6 | @License: (C) Copyright 2013-2019, Best Wonder Corporation Limited.
7 | @Os:Windows 10 x64
8 | @Contact: bw_wangxiaomeng@whty.com.cn
9 | @Software: PY PyCharm
10 | @File: cutScopebusiness.py
11 | @Time: 2019/6/25 10:56
12 | @Desc: define your function
13 | '''
14 |
15 | '''
16 | Industry taxonomy reference: https://blog.csdn.net/chenhuamain/article/details/84579667
17 | 农林牧渔 1
18 | 制造业 2
19 | 卫生医疗 3
20 | 商务服务 4
21 | 居民服务 5
22 | 建筑产业 6
23 | 房地产业 7
24 | 教育培训 8
25 | 文体娱乐 9
26 | 电信通讯 10
27 | 科学技术 11
28 | 租赁服务 12
29 | 维修服务 13
30 | 设计服务 14
31 | 运输物流 15
32 | 采矿工业 16
33 | 金融服务 17
34 | 餐饮住宿 18
35 | '''
36 | import pymysql
37 |
38 | class MysqlOperate:
39 | def __init__(self):
40 | self.db = pymysql.connect("192.168.5.135", "root", "000000", "platform")
41 | self.cursor = self.db.cursor()
42 | pass
43 |
44 | def read_data(self,sql):
45 | self.cursor.execute(sql)
46 | # datas = self.cursor.fetchmany(100)
47 | datas = self.cursor.fetchall()
48 | # print(datas)
49 | self.db.close()
50 | return datas
51 |
52 | def update_data(self,sql):
53 | self.cursor.execute(sql)
54 | self.db.commit()
55 | return
56 |
--------------------------------------------------------------------------------
/utils/fasttextClassfy.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 | # __*__ coding: utf-8 __*__
3 |
4 | '''
5 | @Author: simonKing
6 | @License: (C) Copyright 2013-2019, Best Wonder Corporation Limited.
7 | @Os:Windows 10 x64
8 | @Contact: bw_wangxiaomeng@whty.com.cn
9 | @Software: PY PyCharm
10 | @File: fasttextClassfy.py
11 | @Time: 2019/6/25 10:47
12 | @Desc: define your function
13 | '''
14 | import jieba
15 | import random
16 | import jieba.posseg as pseg
17 | from sklearn.svm import SVC
18 | from sklearn.naive_bayes import MultinomialNB
19 | from sklearn.neighbors import KNeighborsClassifier
20 | from utils.cutScopebusiness import MysqlOperate
21 | from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
22 |
23 |
24 | # Naive Bayes
25 | def nb_model(train, train_label, test, test_label):
26 | clf_model = MultinomialNB(alpha=0.01)
27 | clf_model.fit(train, train_label)
28 | predict_results = clf_model.predict(test)
29 |
30 | count = 0
31 | predict_list = predict_results.tolist()
32 | for i, pred in enumerate(predict_list):
33 | if (pred == test_label[i]):
34 | count += 1
35 |
36 | print("nb_model_precision_score: " + str(float(count) / len(predict_list)))
37 |
38 | # Build a Naive Bayes classification model
39 | def nb_classfy_model(train, train_label,inputs):
40 | '''
41 | :param train: training corpus (vectorized)
42 | :param train_label: training labels
43 | :param inputs: vectorized text to be classified
44 | :return: predicted label
45 | '''
46 | clf_model = MultinomialNB(alpha=0.01)
47 | clf_model.fit(train, train_label)
48 | predict_results = clf_model.predict(inputs)
49 | # print('Naive Bayes prediction:', predict_results)
50 | predict_results = predict_results[0].split('__label__')[1]
51 | return predict_results
52 |
53 |
54 | # K-nearest neighbors
55 | def knn_model(train, train_label, test, test_label):
56 | knn_model = KNeighborsClassifier(n_neighbors=8)
57 | knn_model.fit(train, train_label)
58 | predict_results = knn_model.predict(test)
59 |
60 | count = 0
61 | predict_list = predict_results.tolist()
62 | for i, pred in enumerate(predict_list):
63 | if (pred == test_label[i]):
64 | count += 1
65 |
66 | print("knn_model_precision_score: " + str(float(count) / len(predict_list)))
67 |
68 |
69 | # Support vector machine
70 | def svm_model(train, train_label, test, test_label):
71 | svm_clf = SVC(kernel="linear", verbose=False)
72 | svm_clf.fit(train, train_label)
73 | predict_results = svm_clf.predict(test)
74 |
75 | count = 0
76 | predict_list = predict_results.tolist()
77 | for i, pred in enumerate(predict_list):
78 | if (pred == test_label[i]):
79 | count += 1
80 | print("svm_model_precision_score: " + str(float(count) / len(predict_list)))
81 |
82 |
83 | # Text classification with traditional ML methods
84 | def text_classification(inputs):
85 | count = 0
86 | test_text_list = []
87 | train_text_list = []
88 | test_label_list = []
89 | train_label_list = []
90 | total_text_list = []
91 | total_label_list = []
92 | # text to be classified
93 | inputs_text_list = []
94 | inputs_text_list.append(inputs)
95 |
96 | print("start loading data...")
97 | finput = open("../data/data_train.txt", encoding='utf-8')
98 | for line in finput:
99 | count += 1
100 | text_array = line.split("\\t", 1)  # splits on the literal backslash-t pair; use "\t" if the file is tab-separated
101 | if (len(text_array) != 2):
102 | continue
103 |
104 | # keep all samples
105 | total_text_list.append(text_array[1])
106 | total_label_list.append(text_array[0])
107 |
108 | # split into training and test sets
109 | probability = random.random()
110 | if (probability > 0.1):
111 | train_text_list.append(text_array[1])
112 | train_label_list.append(text_array[0])
113 | else:
114 | test_text_list.append(text_array[1])
115 | test_label_list.append(text_array[0])
116 | finput.close()
117 | print("load data is finished...")
118 |
119 | print("start building vector model...")
120 | # build the vocabulary
121 | vec_total = CountVectorizer()
122 | vec_total.fit_transform(total_text_list)
123 |
124 | # count term frequencies for train/test against the shared vocabulary (raw occurrence counts)
125 | vec_train = CountVectorizer(vocabulary=vec_total.vocabulary_)
126 | tf_train = vec_train.fit_transform(train_text_list)
127 |
128 | vec_test = CountVectorizer(vocabulary=vec_total.vocabulary_)
129 | tf_test = vec_test.fit_transform(test_text_list)
130 |
131 | vec_inputs = CountVectorizer(vocabulary=vec_total.vocabulary_)
132 | tf_inputs = vec_inputs.fit_transform(inputs_text_list)
133 |
134 | # compute TF-IDF from the raw term counts
135 | tfidftransformer = TfidfTransformer()
136 | tfidf_train = tfidftransformer.fit(tf_train).transform(tf_train)  # fit the IDF weights on the training set only
137 | tfidf_test = tfidftransformer.transform(tf_test)
138 | tfidf_inputs = tfidftransformer.transform(tf_inputs)
139 | print("building vector model is finished...")
140 |
141 | # Naive Bayes
142 | nb_model(tfidf_train, train_label_list, tfidf_test, test_label_list)
143 | predict_result = nb_classfy_model(tfidf_train, train_label_list,tfidf_inputs)
144 | # K-nearest neighbors
145 | # knn_model(tfidf_train, train_label_list, tfidf_test, test_label_list)
146 | # Support vector machine
147 | # svm_model(tfidf_train, train_label_list, tfidf_test, test_label_list)
148 | print("building predict model is finished...")
149 | return predict_result
150 |
151 |
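# A more compact sketch of the same TF-IDF step using TfidfVectorizer (an
# alternative to the CountVectorizer + TfidfTransformer pair above, not a
# drop-in change to the function as written):
#
# from sklearn.feature_extraction.text import TfidfVectorizer
# vec = TfidfVectorizer(vocabulary=vec_total.vocabulary_)
# tfidf_train = vec.fit_transform(train_text_list)
# tfidf_test = vec.transform(test_text_list)
# tfidf_inputs = vec.transform(inputs_text_list)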
152 | industry_dict = ['农林牧渔','制造业','卫生医疗','商务服务','居民服务','建筑产业','房地产业','教育培训','文体娱乐',
153 | '电信通讯','科学技术','租赁服务','维修服务','设计服务','运输物流','采矿工业','金融服务','餐饮住宿']
154 |
155 |
156 | if __name__ == '__main__':
157 | print("贝叶斯文本分类...")
158 | stopword = ['、', ';', ', ', '。', '(', ',', ')', '++', '**', '*','[ ',']','【','】',':','国务院','部门','国家'
159 | ,':','法律','法规','的','规定','决定','许可','批准','禁止不得','经营应当','审批经','审批',';无需市场主体','经营','机关后','自主',
160 | '选择','开展','活动','后方','相关','经','可','须','取得','无需市场主体',';','(',' )','有限公司']
161 | sql = "select enterpriseName,scopeOfBusiness,uniformSocialCreditCode from basic_info"
162 |
163 | # sql = "select * from basic_info"
164 | datas = MysqlOperate().read_data(sql)
165 | for data in datas:
166 | # inputs = '科技企业孵化;投资管理;高新技术开发、技术咨询、技术服务;计算机技术培训;出租办公用房;设计、制作、代理、发布国内各类广告;会议会展服务;企业咨询服务(不含民间借贷中介及证券、期货、保险、金融投资信息咨询);企业管理服务;企业营销策划;教育咨询服务(依法须经批准的项目,经相关部门批准后方可开展经营活动)'
167 | inputs = data[1]
168 | inputs_array_list = []
169 | names = [w for w,f in pseg.cut(data[0]) if f !='ns' ]
170 | for word in jieba.lcut(inputs) + names:
171 | if word not in stopword:
172 | if len(word) >1:
173 | inputs_array_list.append(word)
174 | inputs = ' '.join(inputs_array_list)
175 | print('企业名称:%s,输入语料:%s'%(data[0],inputs))
176 | predict = int(text_classification(inputs)) -1
177 | labels = industry_dict[predict]
178 | print('labels:',labels)
179 | query = "update basic_info set industry = '%s' where uniformSocialCreditCode = '%s'"%(labels,data[2])
180 | MysqlOperate().update_data(query)
181 | print("\n----------------------------------------------")
--------------------------------------------------------------------------------
/utils/joinExcelByIndex.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 | # __*__ coding: utf-8 __*__
3 |
4 | '''
5 | @Author: simonKing
6 | @License: (C) Copyright 2013-2019, Best Wonder Corporation Limited.
7 | @Os:Windows 10 x64
8 | @Contact: bw_wangxiaomeng@whty.com.cn
9 | @Software: PY PyCharm
10 | @File: joinExcelByIndex.py
11 | @Time: 2019/7/4 10:43
12 | @Desc: define your function
13 | '''
14 |
15 | import pandas as pd
16 | from pandas import DataFrame
17 |
18 |
19 | def concat_excels(xlsx1,xlsx2,index):
20 | '''
21 | Merge two Excel sheets on a shared key column
22 | :param xlsx1: input excel file 1
23 | :param xlsx2: input excel file 2
24 | :param index: the column name to merge on
25 | :return: writes the merged sheet to disk
26 | '''
27 | if '.xlsx' not in xlsx1:
28 | raise Exception('输入文件类型有误!')
29 | if '.xlsx' not in xlsx2:
30 | raise Exception('输入文件类型有误!')
31 | data1 = pd.read_excel(xlsx1, sheet_name='Sheet1', dtype={index: str})
32 | df_obj1 = DataFrame(data1)
33 | data2 = pd.read_excel(xlsx2, sheet_name='Sheet1', dtype={index: str})
34 | data2 = data2.drop_duplicates([index])
35 | df_obj2 = DataFrame(data2)
36 |
37 | excel = pd.merge(df_obj1, df_obj2, on=index,how='outer')
38 | excel_list = [excel]
39 | total_excel = pd.concat(excel_list)
40 | # total_excel = excel_list.set_index('cate_tp').T.to_dict('list')
41 | total_excel.to_excel('../user_point/tice20190717.xlsx', index=False)
42 | return
43 |
44 | # if __name__ =='__main__':
45 | # pass
46 | # index = 'tax_id'
47 | # xs1 = r'D:\invoice1.xlsx'
48 | # xs2 = r'D:\invoice2.xlsx'
49 | # concat_excels(xs1,xs2,index)
--------------------------------------------------------------------------------