├── .DS_Store
├── .gitignore
├── README.md
├── 推荐系统算法实践—源码下载
│ ├── .DS_Store
│ ├── 第11章DNN
│ │ └── DNN.py
│ ├── 第12章Wide & Deep模型
│ │ ├── WideAndDeep.py
│ │ ├── adult.data
│ │ ├── adult.names
│ │ └── adult.test
│ ├── 第13章DeepFM模型
│ │ └── DeepFM.py
│ ├── 第14章YouTube
│ │ └── YouTubeNet.py
│ ├── 第15章基于电商平台的商品召回
│ │ └── myGoodsRecall.scala
│ ├── 第16章基于逻辑回归的音乐评分预测
│ │ └── LR.scala
│ ├── 第17章Kaggle竞赛之Outbrain点击率预估
│ │ ├── EnsembleTree.scala
│ │ ├── FFM.py
│ │ └── XGB.py
│ ├── 第18章基于深度学习的电商商品点击率预估
│ │ └── DeepInterestNetwork.py
│ ├── 第19章Notebook实践
│ │ ├── Debug_CF.scala
│ │ ├── Debug_FM.py
│ │ ├── Debug_Sk_LR.py
│ │ ├── Debug_Spark_LR.scala
│ │ └── Debug_TF_LR.py
│ ├── 第4章协同过滤
│ │ ├── .DS_Store
│ │ ├── I2iTest.scala
│ │ ├── ItemSimilarity.scala
│ │ └── ml-latest-small
│ │ │ ├── README.txt
│ │ │ ├── links.csv
│ │ │ ├── movies.csv
│ │ │ ├── ratings.csv
│ │ │ └── tags.csv
│ ├── 第5章Word2vec
│ │ ├── Word2vec.py
│ │ └── Word2vec.scala
│ ├── 第6章逻辑回归
│ │ ├── LR.py
│ │ ├── LogisticRegression.py
│ │ └── LogisticRegression.scala
│ ├── 第7章FM
│ │ ├── FM.py
│ │ └── FM_Sk.py
│ ├── 第8章决策树
│ │ ├── DecisionTrees.scala
│ │ ├── Tree.py
│ │ └── sample_libsvm_data.txt
│ └── 第9章集成学习
│ │ ├── GbdtLr.scala
│ │ └── gcForest.py
└── 推荐系统算法实践—补充部分
  ├── .DS_Store
  ├── 第12章节
  │ ├── adult.data
  │ ├── adult.names
  │ └── adult.test
  ├── 第14章节
  │ ├── 001
  │ └── others.py
  ├── 第5章节
  │ ├── other.py
  │ ├── sequence_sample.csv
  │ └── windows_skip_sample.csv
  └── 第6-11和13章节
    ├── .DS_Store
    ├── csv-00000
    ├── data
    │ ├── 00000
    │ └── 00001
    ├── sample_libsvm_data.txt
    ├── sklearn_others.py
    ├── spark_others.scala
    └── tf_others.py
/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WallaceLiu/recommendation_algorithm/36cdd077446ab7a2831c168f7419b058bd2fcbb0/.DS_Store
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.class
2 | *.log
3 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # recommendation_algorithm
2 | 推荐系统算法实践
3 |
--------------------------------------------------------------------------------
/推荐系统算法实践—源码下载/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WallaceLiu/recommendation_algorithm/36cdd077446ab7a2831c168f7419b058bd2fcbb0/推荐系统算法实践—源码下载/.DS_Store
--------------------------------------------------------------------------------
/推荐系统算法实践—源码下载/第11章DNN/DNN.py:
--------------------------------------------------------------------------------
1 |
2 | # coding: utf-8
3 |
4 | # ## 1)环境准备
5 |
6 | # In[1]:
7 |
8 | import numpy as np
9 | import tensorflow as tf
10 | import pandas as pd
11 | import random
12 | import math
13 | import re
14 |
15 | from sklearn import preprocessing
16 | from os import path, listdir
17 | from sklearn.datasets import load_svmlight_files
18 | from sklearn.model_selection import train_test_split
19 | from sklearn import metrics
20 | from tensorflow.contrib import layers
21 |
22 | from sklearn import metrics
23 |
24 | import time
25 | import datetime
26 |
27 | import os
28 | os.environ["CUDA_VISIBLE_DEVICES"]="0"
29 |
30 | print(tf.__version__)
31 | print(tf.__path__)
32 |
33 |
34 | # ## 2)数据准备Dataset格式
35 |
36 | # 数据处理
37 | def process_data(data_type, my_path, feature_size, batch_size=32, num_epochs=1):
38 | filenames = get_file_list(my_path)
39 | next_element = read_my_file_format(data_type, filenames, feature_size, batch_size, num_epochs)
40 | return next_element
41 |
42 | # 创建session,指定GPU或者CPU使用率
43 | def get_session(gpu_fraction=0.1):
44 | gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=gpu_fraction,
45 | allow_growth=True)
46 | # server = tf.train.Server.create_local_server()
47 | return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
48 |
49 |
50 | # In[3]:
51 |
52 | # 测试数据
53 | filenames = '/data/all-csv'
54 | feature_size = 530
55 | batch_size = 3
56 | num_epochs = 1
57 | data_type = 'csv'
58 | next_element = process_data(data_type, filenames, feature_size, batch_size, num_epochs)
59 | print(next_element['dense_vector'])
60 | print(next_element['labels'])
61 |
62 | gpu_fraction = 0.2
63 | my_device='/gpu:0'
64 | init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer(), tf.tables_initializer())
65 | with tf.device(my_device):
66 | sess = get_session(gpu_fraction)
67 | sess.run(init_op)
68 | dense_vector, labels = sess.run([next_element['dense_vector'],next_element['labels']])
69 | print(dense_vector)
70 | print(labels)
71 |
72 |
73 | # ## 3)DNN模型
74 |
75 | # In[4]:
76 |
77 | class DNN(object):
78 | """ 初始化成员变量 """
79 | def __init__(self,
80 | feature_size,
81 | loss_fuc,
82 | train_optimizer,
83 | learning_rate,
84 | reg_type,
85 | reg_param,
86 | dnn_layer,
87 | dnn_active_fuc,
88 | is_dropout_dnn,
89 | dropout_dnn,
90 | is_batch_norm):
91 | # 特征向量长度
92 | self.feature_size = feature_size
93 | # 损失函数
94 | self.loss_fuc = loss_fuc
95 | # 优化方法
96 | self.train_optimizer = train_optimizer
97 | # 学习率
98 | self.learning_rate = learning_rate
99 | # 正则类型
100 | self.reg_type = reg_type
101 | # 正则因子
102 | self.reg_param = reg_param
103 | # dnn_layer
104 | self.dnn_layer = dnn_layer
105 | self.dnn_active_fuc = dnn_active_fuc
106 | # dropout_dnn
107 | self.is_dropout_dnn = is_dropout_dnn
108 | self.dropout_dnn = dropout_dnn
109 | # is_batch_norm
110 | self.is_batch_norm = is_batch_norm
111 |
112 | # global_step
113 | self.global_step = tf.Variable(0, dtype=tf.int32, trainable=False, name='global_step')
114 |
115 | """ dnn全连接层计算 """
116 | def _udf_full_connect(self, inputs, input_size, output_size, activation='relu'):
117 | # 生成或者获取weights和biases
118 | weights = tf.get_variable("weights",
119 | [input_size, output_size],
120 | initializer=tf.glorot_normal_initializer(),
121 | trainable=True)
122 | biases = tf.get_variable("biases",
123 | [output_size],
124 | initializer=tf.glorot_normal_initializer(),
125 | trainable=True)
126 | # 全连接计算
127 | layer = tf.matmul(inputs, weights) + biases
128 | # 激活函数
129 | if activation == 'relu':
130 | layer = tf.nn.relu(layer)
131 | elif activation == 'tanh':
132 | layer = tf.nn.tanh(layer)
133 | return layer
134 |
135 | def train(self, batch_data, is_train):
136 | """ 1 定义输入数据 """
137 | print("1 定义输入数据")
138 | with tf.name_scope('input_data'):
139 | # 标签:[batch_size, 1]
140 | labels = batch_data['labels']
141 | # 用户特征向量:[batch_size, feature_size]
142 | dense_vector = tf.reshape(batch_data['dense_vector'], shape=[-1, self.feature_size]) # None * feature_size * 1
143 | print("%s: %s" % ("dense_vector", dense_vector))
144 | print("%s: %s" % ("labels", labels))
145 |
146 | """ 2 定义网络输出 """
147 | print("2 DNN网络输出" )
148 | with tf.name_scope("DNN_Comput_Score"):
149 | # 第一层计算
150 | with tf.variable_scope("deep_layer1", reuse=tf.AUTO_REUSE):
151 | input_size = self.feature_size
152 | output_size = self.dnn_layer[0]
153 | deep_inputs = dense_vector # None * F
154 | print("%s: %s" % ("deep_layer1, deep_inputs", deep_inputs))
155 | # 输入dropout
156 | if is_train and self.is_dropout_dnn:
157 | deep_inputs = tf.nn.dropout(deep_inputs, self.dropout_dnn[0])
158 | # 全连接计算
159 | deep_outputs = self._udf_full_connect(deep_inputs, input_size, output_size, self.dnn_active_fuc[0])
160 | # batch_norm
161 | if self.is_batch_norm:
162 | deep_outputs = tf.layers.batch_normalization(deep_outputs, axis=-1, training=is_train)
163 | # 输出dropout
164 | if is_train and self.is_dropout_dnn:
165 | deep_outputs = tf.nn.dropout(deep_outputs, self.dropout_dnn[1])
166 | print("%s: %s" % ("deep_layer1, deep_outputs", deep_outputs))
167 | # 中间层计算
168 | for i in range(len(self.dnn_layer) - 1):
169 | with tf.variable_scope("deep_layer%d"%(i+2), reuse=tf.AUTO_REUSE):
170 | # 全连接计算
171 | deep_outputs = self._udf_full_connect(deep_outputs, self.dnn_layer[i], self.dnn_layer[i+1], self.dnn_active_fuc[i+1])
172 | # batch_norm
173 | if self.is_batch_norm:
174 | deep_outputs = tf.layers.batch_normalization(deep_outputs, axis=-1, training=is_train)
175 | # 输出dropout
176 | if is_train and self.is_dropout_dnn:
177 | deep_outputs = tf.nn.dropout(deep_outputs, self.dropout_dnn[i+2])
178 | print("%s, %s: %s" % ("deep_layer%d"%(i+2), "deep_outputs", deep_outputs))
179 | # 输出层计算
180 | with tf.variable_scope("deep_layer%d"%(len(self.dnn_layer)+1), reuse=tf.AUTO_REUSE):
181 | deep_outputs = self._udf_full_connect(deep_outputs, self.dnn_layer[-1], 1, self.dnn_active_fuc[-1])
182 | print("%s, %s: %s" % ("deep_layer%d"%(len(self.dnn_layer)+1), "deep_outputs", deep_outputs))
183 | # 正则化,默认L2
184 | dnn_regularization = 0.0
185 | for j in range(len(self.dnn_layer)+1):
186 | with tf.variable_scope("deep_layer%d"%(j+1), reuse=True):
187 | weights = tf.get_variable("weights")
188 | if self.reg_type == 'l1_reg':
189 | dnn_regularization = dnn_regularization + tf.reduce_sum(tf.abs(weights))
190 | elif self.reg_type == 'l2_reg':
191 | dnn_regularization = dnn_regularization + tf.nn.l2_loss(weights)
192 | else:
193 | dnn_regularization = dnn_regularization + tf.nn.l2_loss(weights)
194 | # Deep输出
195 | Y_Out=deep_outputs
196 | print("%s: %s" % ("Y_Out", Y_Out))
197 | # ---------- score ----------
198 | score=tf.nn.sigmoid(Y_Out,name='score')
199 | score=tf.reshape(score, shape=[-1, 1])
200 | print("%s: %s" % ("score", score))
201 |
202 | """ 3 定义损失函数和AUC指标 """
203 | print("3 定义损失函数和AUC指标" )
204 | with tf.name_scope("loss"):
205 | # loss:Squared_error,Cross_entropy ,FTLR
206 | regularization = self.reg_param * dnn_regularization
207 | if self.loss_fuc == 'Squared_error':
208 | loss = tf.reduce_mean(tf.reduce_sum(tf.square(labels - score), reduction_indices=[1])) + regularization
209 | elif self.loss_fuc == 'Cross_entropy':
210 | loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=tf.reshape(Y_Out, [-1]), labels=tf.reshape(labels, [-1]))) + regularization
211 | elif self.loss_fuc == 'FTLR':
212 | loss = tf.reduce_mean(tf.reduce_sum(tf.square(labels - score), reduction_indices=[1])) + regularization
213 | # AUC
214 | auc = tf.metrics.auc(labels, score)
215 | print("%s: %s" % ("labels", labels))
216 |
217 | """ 4 设定optimizer """
218 | print("4 设定optimizer" )
219 | with tf.name_scope("optimizer"):
220 | with tf.variable_scope("optimizer", reuse=tf.AUTO_REUSE):
221 | #------bulid optimizer------
222 | if self.train_optimizer == 'Adam':
223 | optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-8)
224 | elif self.train_optimizer == 'Adagrad':
225 | optimizer = tf.train.AdagradOptimizer(learning_rate=self.learning_rate, initial_accumulator_value=1e-8)
226 | elif self.train_optimizer == 'Momentum':
227 | optimizer = tf.train.MomentumOptimizer(learning_rate=self.learning_rate, momentum=0.95)
228 | elif self.train_optimizer == 'ftrl':
229 | optimizer = tf.train.FtrlOptimizer(self.learning_rate)
230 | train_step = optimizer.minimize(loss, global_step=self.global_step)
231 |
232 | """5 设定summary,以便在Tensorboard里进行可视化 """
233 | print("5 设定summary" )
234 | with tf.name_scope("summaries"):
235 | tf.summary.scalar("loss", loss)
236 | tf.summary.scalar("accumulate_auc", auc[0])
237 | for j in range(len(self.dnn_layer)+1):
238 | with tf.variable_scope("deep_layer%d"%(j+1), reuse=True):
239 | weights = tf.get_variable("weights")
240 | tf.summary.histogram("w%d"%(j+1), weights)
241 | # 好几个summary,所以这里要merge_all
242 | summary_op = tf.summary.merge_all()
243 |
244 | """6 返回结果 """
245 | return Y_Out, score, regularization, loss, auc, train_step, labels, score, summary_op
246 |
247 |
248 | # ## 4)模型训练测试
249 |
250 | # In[5]:
251 |
252 | # 数据参数
253 | print("0 数据准备和参数设置" )
254 | filenames = '/data/csv-all'
255 | data_type='csv'
256 | feature_size = 530
257 | batch_size = 60000
258 | num_epochs = 200
259 | next_element = process_data(data_type, filenames, feature_size, batch_size, num_epochs)
260 | print("%s: %s" % ("next_element", next_element))
261 |
262 | # 模型参数
263 | loss_fuc = 'Squared_error'
264 | train_optimizer = 'Adam'
265 | learning_rate = 0.01
266 | reg_type = 'l2_reg'
267 | reg_param = 0.0
268 |
269 | dnn_layer=[100,50]
270 | dnn_active_fuc=['relu','relu','output']
271 | dropout_fm=[1,1]
272 | is_dropout_dnn=True
273 | dropout_dnn=[0.7,0.7,0.7]
274 | is_batch_norm=True
275 |
276 | log_path='/data/log/DNN_Squared_error_L2_0_20180816_01'
277 |
278 | # 开始训练
279 | bea_model = DNN(feature_size,
280 | loss_fuc,
281 | train_optimizer,
282 | learning_rate,
283 | reg_type,
284 | reg_param,
285 | dnn_layer,
286 | dnn_active_fuc,
287 | is_dropout_dnn,
288 | dropout_dnn,
289 | is_batch_norm)
290 | Y_Out, score, regularization, loss, auc, train_step, labels, score, summary_op = bea_model.train(next_element, is_train=True)
291 |
292 | init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer(), tf.tables_initializer())
293 | gpu_fraction = 0.5
294 | my_device='/gpu:0'
295 | with tf.device(my_device):
296 | sess = get_session(gpu_fraction)
297 | sess.run(init_op)
298 | batch_cnt = 0
299 | #选定可视化存储目录
300 | writer = tf.summary.FileWriter(log_path, sess.graph)
301 | print("6 迭代过程" )
302 | try:
303 | while True:
304 | batch_cnt = batch_cnt + 1
305 | a, b, c, summary = sess.run([loss, auc, train_step, summary_op])
306 | if batch_cnt % 100 == 0 or batch_cnt <= 10:
307 | y, p = sess.run([labels, score])
308 | if y.sum() > 0.0:
309 | batch_auc=metrics.roc_auc_score(y, p)
310 | else:
311 | batch_auc=0.0
312 | print("batch: {} loss: {:.4f} accumulate_auc: {:.4f} batch_auc: {:.4f}".format(batch_cnt, a, b[0], batch_auc))
313 | writer.add_summary(summary, batch_cnt)
314 | except tf.errors.OutOfRangeError:
315 | print("Train end of dataset")
316 |
--------------------------------------------------------------------------------
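Note: DNN.py (like the other TensorFlow scripts below) calls two helpers, get_file_list() and read_my_file_format(), that are not defined in this file; they are presumably provided in the 推荐系统算法实践—补充部分 (e.g. tf_others.py). A minimal sketch of what the 'csv' case might look like, assuming each row is "label,f1,...,f_feature_size" with purely numeric fields (an assumption, not the book's exact implementation):

import tensorflow as tf
from os import path, listdir

def get_file_list(my_path):
    # Return all files under a directory, or the path itself if it points to a single file.
    if path.isdir(my_path):
        return [path.join(my_path, f) for f in listdir(my_path)]
    return [my_path]

def read_my_file_format(data_type, filenames, feature_size, batch_size=32, num_epochs=1):
    # Only the 'csv' branch is sketched; each line: "label,f1,...,f_feature_size".
    def parse_csv(line):
        record_defaults = [[0.0]] * (feature_size + 1)
        fields = tf.decode_csv(line, record_defaults=record_defaults)
        labels = tf.reshape(fields[0], [1])    # -> [batch_size, 1] after batching
        dense_vector = tf.stack(fields[1:])    # -> [batch_size, feature_size] after batching
        return {'dense_vector': dense_vector, 'labels': labels}
    dataset = (tf.data.TextLineDataset(filenames)
               .map(parse_csv, num_parallel_calls=4)
               .repeat(num_epochs)
               .batch(batch_size))
    return dataset.make_one_shot_iterator().get_next()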
/推荐系统算法实践—源码下载/第12章Wide & Deep模型/WideAndDeep.py:
--------------------------------------------------------------------------------
1 |
2 | # coding: utf-8
3 |
4 | # ## 1)环境准备
5 |
6 | # In[8]:
7 |
8 | import numpy as np
9 | import tensorflow as tf
10 | import pandas as pd
11 | import random
12 | import math
13 | import re
14 |
15 | from sklearn import preprocessing
16 | from os import path, listdir
17 | from sklearn.datasets import load_svmlight_files
18 | from sklearn.model_selection import train_test_split
19 | from sklearn import metrics
20 | from tensorflow.contrib import layers
21 |
22 | from sklearn import metrics
23 |
24 | import time
25 | import datetime
26 |
27 | import os
28 | os.environ["CUDA_VISIBLE_DEVICES"]="0"
29 |
30 | import tensorflow as tf
31 |
32 | print(tf.__version__)
33 | print(tf.__path__)
34 |
35 |
36 | # ## 2)数据准备
37 |
38 | # In[9]:
39 |
40 | # 定义输入样本格式
41 | _CSV_COLUMNS = [
42 | 'age', 'workclass', 'fnlwgt', 'education', 'education_num',
43 | 'marital_status', 'occupation', 'relationship', 'race', 'gender',
44 | 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
45 | 'income_bracket'
46 | ]
47 | _CSV_COLUMN_DEFAULTS = [[0], [''], [0], [''], [0], [''], [''], [''], [''], [''],
48 | [0], [0], [0], [''], ['']]
49 | _NUM_EXAMPLES = {
50 | 'train': 32561,
51 | 'validation': 16281,
52 | }
53 |
54 | """Builds a set of wide and deep feature columns."""
55 | def build_model_columns():
56 | # 1. 特征处理,包括:连续特征、离散特征、转换特征、交叉特征等
57 |
58 | # 连续特征 (其中在Wide和Deep组件都会用到)
59 | age = tf.feature_column.numeric_column('age')
60 | education_num = tf.feature_column.numeric_column('education_num')
61 | capital_gain = tf.feature_column.numeric_column('capital_gain')
62 | capital_loss = tf.feature_column.numeric_column('capital_loss')
63 | hours_per_week = tf.feature_column.numeric_column('hours_per_week')
64 |
65 | # 离散特征
66 | education = tf.feature_column.categorical_column_with_vocabulary_list(
67 | 'education', [
68 | 'Bachelors', 'HS-grad', '11th', 'Masters', '9th', 'Some-college',
69 | 'Assoc-acdm', 'Assoc-voc', '7th-8th', 'Doctorate', 'Prof-school',
70 | '5th-6th', '10th', '1st-4th', 'Preschool', '12th'])
71 |
72 | marital_status = tf.feature_column.categorical_column_with_vocabulary_list(
73 | 'marital_status', [
74 | 'Married-civ-spouse', 'Divorced', 'Married-spouse-absent',
75 | 'Never-married', 'Separated', 'Married-AF-spouse', 'Widowed'])
76 |
77 | relationship = tf.feature_column.categorical_column_with_vocabulary_list(
78 | 'relationship', [
79 | 'Husband', 'Not-in-family', 'Wife', 'Own-child', 'Unmarried',
80 | 'Other-relative'])
81 |
82 | workclass = tf.feature_column.categorical_column_with_vocabulary_list(
83 | 'workclass', [
84 | 'Self-emp-not-inc', 'Private', 'State-gov', 'Federal-gov',
85 | 'Local-gov', '?', 'Self-emp-inc', 'Without-pay', 'Never-worked'])
86 |
87 | # 离散hash bucket特征
88 | occupation = tf.feature_column.categorical_column_with_hash_bucket(
89 | 'occupation', hash_bucket_size=1000
90 | )
91 |
92 | # 特征Transformations
93 | age_buckets = tf.feature_column.bucketized_column(
94 | age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65]
95 | )
96 |
97 | # 2. 设定Wide层特征
98 | """
99 | Wide部分使用了规范化后的连续特征、离散特征、交叉特征
100 | """
101 | # 基本特征列
102 | base_columns = [
103 | # 全是离散特征
104 | education, marital_status, relationship, workclass, occupation,
105 | age_buckets,
106 | ]
107 |
108 | # 交叉特征列
109 | crossed_columns = [
110 | tf.feature_column.crossed_column(
111 | ['education', 'occupation'], hash_bucket_size=1000),
112 | tf.feature_column.crossed_column(
113 | [age_buckets, 'education', 'occupation'], hash_bucket_size=1000
114 | )
115 | ]
116 |
117 | # wide特征列
118 | wide_columns = base_columns + crossed_columns
119 |
120 | # 3. 设定Deep层特征
121 | """
122 | Deep层主要针对离散特征进行处理,其中处理方式有:
123 | 1. Sparse Features -> Embedding vector -> 串联(连续特征),其中Embedding Values随机初始化。
124 | 2. 另外一种处理离散特征的方法是:one-hot和multi-hot representation. 此方法适用于低维度特征,其中embedding是通用的做法
125 | 其中:采用embedding_column(embedding)和indicator_column(multi-hot)API
126 | """
127 | # deep特征列
128 | deep_columns = [
129 | age,
130 | education_num,
131 | capital_gain,
132 | capital_loss,
133 | hours_per_week,
134 | tf.feature_column.indicator_column(workclass),
135 | tf.feature_column.indicator_column(education),
136 | tf.feature_column.indicator_column(marital_status),
137 | tf.feature_column.indicator_column(relationship),
138 |
139 | # embedding特征
140 | tf.feature_column.embedding_column(occupation, dimension=8)
141 | ]
142 | return wide_columns, deep_columns
143 |
144 | # Estimator Input
145 | # 定义输入
146 | def input_fn(data_file, num_epochs, shuffle, batch_size):
147 | """为Estimator创建一个input function"""
148 | assert tf.gfile.Exists(data_file), "{0} not found.".format(data_file)
149 | def parse_csv(line):
150 | print("Parsing", data_file)
151 | # tf.decode_csv会把csv文件转换成Tensor。其中record_defaults用于指明每一列的缺失值用什么填充。
152 | columns = tf.decode_csv(line, record_defaults=_CSV_COLUMN_DEFAULTS)
153 | features = dict(zip(_CSV_COLUMNS, columns))
154 | labels = features.pop('income_bracket')
155 | # tf.equal(x, y) 返回一个bool类型Tensor, 表示x == y, element-wise
156 | return features, tf.equal(labels, '>50K')
157 | dataset = tf.data.TextLineDataset(data_file).map(parse_csv, num_parallel_calls=5)
158 | dataset = dataset.repeat(num_epochs)
159 | dataset = dataset.batch(batch_size)
160 | iterator = dataset.make_one_shot_iterator()
161 | batch_features, batch_labels = iterator.get_next()
162 | return batch_features, batch_labels
163 |
164 |
165 | # ## 3)模型准备
166 |
167 | # In[10]:
168 |
169 | # Wide & Deep Model
170 | def build_estimator(model_dir, model_type):
171 | """Build an estimator appropriate for the given model type."""
172 | wide_columns, deep_columns = build_model_columns()
173 | hidden_units = [100, 50]
174 |
175 | # Create a tf.estimator.RunConfig to ensure the model is run on CPU, which
176 | # trains faster than GPU for this model.
177 | run_config = tf.estimator.RunConfig().replace(
178 | session_config=tf.ConfigProto(device_count={'GPU': 0}))
179 |
180 | if model_type == 'wide':
181 | return tf.estimator.LinearClassifier(
182 | model_dir=model_dir,
183 | feature_columns=wide_columns,
184 | config=run_config)
185 | elif model_type == 'deep':
186 | return tf.estimator.DNNClassifier(
187 | model_dir=model_dir,
188 | feature_columns=deep_columns,
189 | hidden_units=hidden_units,
190 | config=run_config)
191 | else:
192 | return tf.estimator.DNNLinearCombinedClassifier(
193 | model_dir=model_dir,
194 | linear_feature_columns=wide_columns,
195 | dnn_feature_columns=deep_columns,
196 | dnn_hidden_units=hidden_units,
197 | config=run_config)
198 |
199 | # 模型路径
200 | model_type = 'widedeep'
201 | model_dir = '/data/model/wide_deep'
202 |
203 | # Wide & Deep 联合模型
204 | model = build_estimator(model_dir, model_type)
205 |
206 |
207 | # ## 4)模型训练
208 |
209 | # In[11]:
210 |
211 | # 训练参数
212 | train_epochs = 10
213 | batch_size = 5000
214 | train_file = '/data/adult.data'
215 | test_file = '/data/adult.test'
216 |
217 | # 6. 开始训练
218 | for n in range(train_epochs):
219 | # 模型训练
220 | model.train(input_fn=lambda: input_fn(train_file, train_epochs, True, batch_size))
221 | # 模型评估
222 | results = model.evaluate(input_fn=lambda: input_fn(test_file, 1, False, batch_size))
223 | # 打印评估结果
224 | print("Results at epoch {0}".format((n+1) * train_epochs))
225 | print('-'*30)
226 | for key in sorted(results):
227 | print("{0:20}: {1:.4f}".format(key, results[key]))
228 |
--------------------------------------------------------------------------------
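Note: a small usage sketch (not part of the book's script) for scoring records after training, reusing model, input_fn, test_file and batch_size defined in WideAndDeep.py above. Canned estimators such as tf.estimator.DNNLinearCombinedClassifier yield a dict per prediction that includes a 'probabilities' entry.

predictions = model.predict(input_fn=lambda: input_fn(test_file, 1, False, batch_size))
for i, pred in enumerate(predictions):
    if i >= 5:
        break
    # index 1 corresponds to the positive class, i.e. income_bracket == '>50K'
    print("P(income > 50K) = {0:.4f}".format(pred['probabilities'][1]))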
/推荐系统算法实践—源码下载/第12章Wide & Deep模型/adult.names:
--------------------------------------------------------------------------------
1 | | This data was extracted from the census bureau database found at
2 | | http://www.census.gov/ftp/pub/DES/www/welcome.html
3 | | Donor: Ronny Kohavi and Barry Becker,
4 | | Data Mining and Visualization
5 | | Silicon Graphics.
6 | | e-mail: ronnyk@sgi.com for questions.
7 | | Split into train-test using MLC++ GenCVFiles (2/3, 1/3 random).
8 | | 48842 instances, mix of continuous and discrete (train=32561, test=16281)
9 | | 45222 if instances with unknown values are removed (train=30162, test=15060)
10 | | Duplicate or conflicting instances : 6
11 | | Class probabilities for adult.all file
12 | | Probability for the label '>50K' : 23.93% / 24.78% (without unknowns)
13 | | Probability for the label '<=50K' : 76.07% / 75.22% (without unknowns)
14 | |
15 | | Extraction was done by Barry Becker from the 1994 Census database. A set of
16 | | reasonably clean records was extracted using the following conditions:
17 | | ((AAGE>16) && (AGI>100) && (AFNLWGT>1)&& (HRSWK>0))
18 | |
19 | | Prediction task is to determine whether a person makes over 50K
20 | | a year.
21 | |
22 | | First cited in:
23 | | @inproceedings{kohavi-nbtree,
24 | | author={Ron Kohavi},
25 | | title={Scaling Up the Accuracy of Naive-Bayes Classifiers: a
26 | | Decision-Tree Hybrid},
27 | | booktitle={Proceedings of the Second International Conference on
28 | | Knowledge Discovery and Data Mining},
29 | | year = 1996,
30 | | pages={to appear}}
31 | |
32 | | Error Accuracy reported as follows, after removal of unknowns from
33 | | train/test sets):
34 | | C4.5 : 84.46+-0.30
35 | | Naive-Bayes: 83.88+-0.30
36 | | NBTree : 85.90+-0.28
37 | |
38 | |
39 | | Following algorithms were later run with the following error rates,
40 | | all after removal of unknowns and using the original train/test split.
41 | | All these numbers are straight runs using MLC++ with default values.
42 | |
43 | | Algorithm Error
44 | | -- ---------------- -----
45 | | 1 C4.5 15.54
46 | | 2 C4.5-auto 14.46
47 | | 3 C4.5 rules 14.94
48 | | 4 Voted ID3 (0.6) 15.64
49 | | 5 Voted ID3 (0.8) 16.47
50 | | 6 T2 16.84
51 | | 7 1R 19.54
52 | | 8 NBTree 14.10
53 | | 9 CN2 16.00
54 | | 10 HOODG 14.82
55 | | 11 FSS Naive Bayes 14.05
56 | | 12 IDTM (Decision table) 14.46
57 | | 13 Naive-Bayes 16.12
58 | | 14 Nearest-neighbor (1) 21.42
59 | | 15 Nearest-neighbor (3) 20.35
60 | | 16 OC1 15.04
61 | | 17 Pebls Crashed. Unknown why (bounds WERE increased)
62 | |
63 | | Conversion of original data as follows:
64 | | 1. Discretized agrossincome into two ranges with threshold 50,000.
65 | | 2. Convert U.S. to US to avoid periods.
66 | | 3. Convert Unknown to "?"
67 | | 4. Run MLC++ GenCVFiles to generate data,test.
68 | |
69 | | Description of fnlwgt (final weight)
70 | |
71 | | The weights on the CPS files are controlled to independent estimates of the
72 | | civilian noninstitutional population of the US. These are prepared monthly
73 | | for us by Population Division here at the Census Bureau. We use 3 sets of
74 | | controls.
75 | | These are:
76 | | 1. A single cell estimate of the population 16+ for each state.
77 | | 2. Controls for Hispanic Origin by age and sex.
78 | | 3. Controls by Race, age and sex.
79 | |
80 | | We use all three sets of controls in our weighting program and "rake" through
81 | | them 6 times so that by the end we come back to all the controls we used.
82 | |
83 | | The term estimate refers to population totals derived from CPS by creating
84 | | "weighted tallies" of any specified socio-economic characteristics of the
85 | | population.
86 | |
87 | | People with similar demographic characteristics should have
88 | | similar weights. There is one important caveat to remember
89 | | about this statement. That is that since the CPS sample is
90 | | actually a collection of 51 state samples, each with its own
91 | | probability of selection, the statement only applies within
92 | | state.
93 |
94 |
95 | >50K, <=50K.
96 |
97 | age: continuous.
98 | workclass: Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked.
99 | fnlwgt: continuous.
100 | education: Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool.
101 | education-num: continuous.
102 | marital-status: Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse.
103 | occupation: Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces.
104 | relationship: Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried.
105 | race: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black.
106 | sex: Female, Male.
107 | capital-gain: continuous.
108 | capital-loss: continuous.
109 | hours-per-week: continuous.
110 | native-country: United-States, Cambodia, England, Puerto-Rico, Canada, Germany, Outlying-US(Guam-USVI-etc), India, Japan, Greece, South, China, Cuba, Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador, Taiwan, Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, Yugoslavia, El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands.
111 |
--------------------------------------------------------------------------------
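Note: the column list documented above is mirrored by _CSV_COLUMNS in WideAndDeep.py. For quick inspection outside TensorFlow, a small pandas sketch (assuming adult.data sits in the working directory):

import pandas as pd

columns = ['age', 'workclass', 'fnlwgt', 'education', 'education_num',
           'marital_status', 'occupation', 'relationship', 'race', 'gender',
           'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
           'income_bracket']
adult = pd.read_csv('adult.data', names=columns, skipinitialspace=True)
print(adult.shape)                                        # expected: (32561, 15)
print(adult['income_bracket'].value_counts(normalize=True))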
/推荐系统算法实践—源码下载/第14章YouTube/YouTubeNet.py:
--------------------------------------------------------------------------------
1 |
2 | # coding: utf-8
3 |
4 | # ## 0)环境准备
5 |
6 | # In[8]:
7 |
8 | import numpy as np
9 | import tensorflow as tf
10 | import pandas as pd
11 | import random
12 | import math
13 | import re
14 |
15 | from sklearn import preprocessing
16 | from os import path, listdir
17 | from sklearn.datasets import load_svmlight_files
18 | from sklearn.model_selection import train_test_split
19 | from sklearn import metrics
20 | from tensorflow.contrib import layers
21 |
22 | from sklearn import metrics
23 |
24 | import time
25 | import datetime
26 |
27 | import os
28 | os.environ["CUDA_VISIBLE_DEVICES"]="0"
29 |
30 | print(tf.__version__)
31 | print(tf.__path__)
32 |
33 |
34 | # ## 1)数据准备Dataset格式
35 |
36 | # In[14]:
37 |
38 | # 每一行解析 sequence格式
39 | # 351702070890229|0,0,0,0,0,0,0,0,0,0,0,0,0,0|1,1173,0,0,0|18578
40 |
41 | # 数据处理
42 | def process_data(data_type, my_path, feature_size, batch_size=32, num_epochs=1):
43 | filenames = get_file_list(my_path)
44 | next_element = read_my_file_format(data_type, filenames, feature_size, batch_size, num_epochs)
45 | return next_element
46 |
47 | # 创建session,指定GPU或者CPU使用率
48 | def get_session(gpu_fraction=0.1):
49 | gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=gpu_fraction,
50 | allow_growth=True)
51 | # server = tf.train.Server.create_local_server()
52 | return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
53 |
54 |
55 | # In[15]:
56 |
57 | # 测试数据
58 | filenames = '/data/sequence_normalize/001'
59 | item_size = 5
60 | batch_size = 3
61 | num_epochs = 1
62 | data_type = 'sequence'
63 | next_element = process_data(data_type, filenames, item_size, batch_size, num_epochs)
64 | # print next_element['label']
65 | # print next_element['hist_click']
66 | # print next_element['normalized_continuous_features']
67 |
68 | gpu_fraction = 0.2
69 | my_device='/gpu:0'
70 | init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer(), tf.tables_initializer())
71 | with tf.device(my_device):
72 | sess = get_session(gpu_fraction)
73 | sess.run(init_op)
74 | label, item, other = sess.run([next_element['label'],next_element['hist_click'],next_element['normalized_continuous_features']])
75 | print(label)
76 | print(item)
77 | print(other)
78 |
79 |
80 | # ## 2)定义YouTubeNet模型
81 |
82 | # In[26]:
83 |
84 | class YouTubeNet(object):
85 | """ 初始化成员变量 """
86 | def __init__(self,
87 | item_count,
88 | embedding_size,
89 | num_sampled,
90 | learning_rate,
91 | hist_click_length,
92 | normalized_continuous_features_length,
93 | log_path):
94 | # 资源池大小
95 | self.item_count = item_count
96 | # embedding大小
97 | self.embedding_size = embedding_size
98 | # NCE采样数量
99 | self.num_sampled = num_sampled
100 | # 学习率
101 | self.learning_rate = learning_rate
102 | # 用户行为序列特征长度
103 | self.hist_click_length = hist_click_length
104 | # 用户其它特征长度
105 | self.normalized_continuous_features_length = normalized_continuous_features_length
106 | # log_path
107 | self.log_path = log_path
108 |
109 | def train(self, batch_data):
110 | """ 1 定义输入数据 """
111 | print("1 定义输入数据" )
112 | with tf.name_scope('input_data'):
113 | # 用户其它特征向量:[batch_size, normalized_continuous_features_length]
114 | normalized_continuous_features = batch_data['normalized_continuous_features']
115 | # 用户行为序列特征向量:[batch_size, hist_click_length]
116 | hist_click = batch_data['hist_click']
117 | # 用户标签:[batch_size, 1]
118 | label = batch_data['label']
119 | # 计算item序列中非0元素的比例
120 | batch_item_ratio = tf.reduce_mean(tf.reduce_mean(tf.to_float(tf.abs(hist_click) > 0),1),0)
121 | print("%s: %s" % ("normalized_continuous_features", normalized_continuous_features))
122 | print("%s: %s" % ("hist_click", hist_click))
123 | print("%s: %s" % ("label", label))
124 |
125 | """ 2 Embedding初始化 """
126 | # 初始化物品embedding向量V:[item_count, embedding_size]
127 | print("2 Embedding初始化" )
128 | with tf.name_scope('embedding'):
129 | with tf.variable_scope("embedding", reuse=tf.AUTO_REUSE):
130 | self.weights = tf.Variable(tf.truncated_normal([self.item_count, self.embedding_size],
131 | stddev=1.0 / math.sqrt(self.embedding_size)))
132 | self.biases = tf.Variable(tf.zeros([self.item_count]))
133 | print("%s: %s" % ("weights", self.weights))
134 | print("%s: %s" % ("biases", self.biases))
135 |
136 | """ 3 对用户行为序列进行embedding_lookup查找,得到用户的行为embed向量 """
137 | print("3 对用户item序列进行embedding_lookup查找" )
138 | with tf.name_scope("embedding_lookup"):
139 | # weights:[item_count, embedding_size]
140 | # hist_click:[batch_size, hist_click_length]
141 | # embed:[batch_size, hist_click_length, embedding_size]
142 | inputs = tf.nn.embedding_lookup(self.weights, hist_click)
143 | print("%s: %s" % ("inputs", inputs))
144 |
145 | """ 4 pooling操作,根据用户行为embed向量,进行求和或者平均操作 """
146 | print("4 对用户序列进行pooling操作" )
147 | with tf.name_scope('pooling_layer'):
148 | pooling_embed = tf.reduce_sum(inputs, axis=1)
149 | print("%s: %s" % ("pooling_embed", pooling_embed))
150 |
151 | """ 5 用户特征向量拼接 """
152 | print("5 用户特征向量拼接")
153 | with tf.name_scope("all_concat"):
154 | all_concat = tf.concat([pooling_embed, normalized_continuous_features], 1)
155 | print("%s: %s" % ("all_concat", all_concat))
156 |
157 | """ 6 多层感知器神经网络计算,最终得到用户的embedding向量U:[batch_size, embedding_size] """
158 | # 省略,可以参照第13章或者第12章。
159 |
160 | """ 7 Softmax计算,用户的embedding向量U 乘以物品的embedding向量V,然后通过Softmax计算结果,其中Loss采用NCE负采样方法 """
161 | print("7 最后一层Softmax计算")
162 | with tf.name_scope('Softmax_Classifer'):
163 | with tf.variable_scope("softmax_classifer", reuse=tf.AUTO_REUSE):
164 | pass  # 省略,可以参照https://github.com/ogerhsou/Youtube-Recommendation-Tensorflow/blob/master/youtube_recommendation.py。
165 |
166 | """8 设定summary,以便在Tensorboard里进行可视化 """
167 | print("8 设定summary" )
168 | with tf.name_scope("summaries"):
169 | tf.summary.scalar("loss", loss)
170 | tf.summary.histogram("weightsweight", self.weights)
171 | # 好几个summary,所以这里要merge_all
172 | summary_op = tf.summary.merge_all()
173 |
174 | """9 返回结果 """
175 | return out, loss, batch_item_ratio, label, summary_op, train_step
176 |
177 |
178 | # ## 3)模型训练测试
179 |
180 | # In[ ]:
181 |
182 | # 数据参数
183 | print("0 数据准备和参数设置" )
184 | batch_size=2000
185 | item_size = 30
186 | num_epochs=1
187 | filenames = '/data/001'
188 | data_type = 'sequence'
189 | next_element = process_data(data_type, filenames, item_size, batch_size, num_epochs)
190 | print("%s: %s" % ("next_element", next_element))
191 |
192 | # 模型参数
193 | item_count = 99974
194 | embedding_size = 32
195 | num_sampled = 32
196 | learning_rate = 0.01
197 | hist_click_length = item_size * 3
198 | f_size = hist_click_length + 2
199 | normalized_continuous_features_length = f_size - hist_click_length - 1
200 | log_path='/data/log/youtubenet_20180810_001'
201 |
202 | # 开始训练
203 | bea_model = YouTubeNet(item_count, embedding_size, num_sampled, learning_rate, hist_click_length, normalized_continuous_features_length, log_path)
204 | out, loss, batch_item_ratio, label, summary_op, train_step = bea_model.train(next_element)
205 |
206 | init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer(), tf.tables_initializer())
207 | gpu_fraction = 0.5
208 | my_device='/cpu:0'
209 | with tf.device(my_device):
210 | sess = get_session(gpu_fraction)
211 | sess.run(init_op)
212 | batch_cnt = 0
213 | #选定可视化存储目录
214 | writer = tf.summary.FileWriter(log_path, sess.graph)
215 | print("9 迭代过程" )
216 | try:
217 | while True:
218 | batch_cnt = batch_cnt + 1
219 | a, b, c, d, summary, _ = sess.run([out, loss, batch_item_ratio, label, summary_op, train_step])
220 | if batch_cnt % 400 == 0 or batch_cnt <= 10:
221 | print("batch: {} loss: {} item_ratio: {}".format(batch_cnt, b, c))
222 | writer.add_summary(summary, batch_cnt)
223 | except tf.errors.OutOfRangeError:
224 | print("Train end of dataset")
225 |
--------------------------------------------------------------------------------
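Note: the Softmax/NCE step elided in section 7 above is the core of the YouTube candidate-generation model: the user vector U produced by the MLP is matched against every item embedding V, and training replaces the full softmax with a sampled loss. A self-contained sketch using tf.nn.sampled_softmax_loss, as an assumption about the omitted step rather than the book's exact code:

import math
import tensorflow as tf

item_count, embedding_size, num_sampled = 99974, 32, 32

weights = tf.Variable(tf.truncated_normal([item_count, embedding_size],
                                          stddev=1.0 / math.sqrt(embedding_size)))
biases = tf.Variable(tf.zeros([item_count]))
user_vector = tf.placeholder(tf.float32, [None, embedding_size])  # MLP output U (section 6)
label = tf.placeholder(tf.int64, [None, 1])                       # id of the clicked item

# Sampled-softmax loss: only num_sampled negative items are scored per training step.
loss = tf.reduce_mean(tf.nn.sampled_softmax_loss(
    weights=weights, biases=biases, labels=label, inputs=user_vector,
    num_sampled=num_sampled, num_classes=item_count))
# Full softmax over the item pool, used when generating candidates at serving time.
out = tf.nn.softmax(tf.matmul(user_vector, weights, transpose_b=True) + biases)
train_step = tf.train.AdamOptimizer(0.01).minimize(loss)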
/推荐系统算法实践—源码下载/第15章基于电商平台的商品召回/myGoodsRecall.scala:
--------------------------------------------------------------------------------
1 | package book_code
2 |
3 | import scala.collection.mutable.ArrayBuffer
4 |
5 | class myGoodsRecall extends Serializable {
6 |
7 | /**
8 | *
9 | * 根据用户请求,返回召回列表
10 | *
11 | */
12 |
13 | def recall(request: Request, extendMap: Map[String, String]): Response = {
14 |
15 | // 1 获取参数
16 | val recallNum = extendMap.getOrElse("recallNum", "500").toInt
17 | val recallByKeyNum = extendMap.getOrElse("recallByKeyNum", "20").toInt
18 | val userActTopK = extendMap.getOrElse("userActTopK", "20").toInt
19 |
20 | // 2.1 获取用户数据,取用户TopK个浏览商品,这里一般通过其他接口,取相应的用户数据,在代码中不展开,这里采用一个数组来做实例讲解
21 | val userGoldsdArray = Array("101", "108", "109", "105")
22 | // 2.2 获取用户数据,取用户类别兴趣数据,这里一般通过其他接口,取相应的用户数据,在代码中不展开,这里采用一个数组来做实例讲解
23 | val userCategoryArray = Array("1", "2", "11")
24 |
25 | // 3.1 goldCF召回查询
26 | val userGoldCfRecallArray = userGoldsdArray.map { itemKey: String =>
27 | // 通过key查询,得到列表,这里一般通过其他接口,取相应的数据,在本代码中不展开,这里采用一个Map
28 | // 需要解析召回内容,并且取top,用数据格式返回
29 | val itemByOneKeyArray = Map[String, Array[Item]]().getOrElse(itemKey, Array[Item]()).slice(0, recallByKeyNum)
30 | itemByOneKeyArray
31 | }
32 | // 3.2 汇总并去重
33 | val userGoldCfRecallDistinctTmp = userGoldCfRecallArray.flatMap(f => f)
34 | val userGoldCfRecallDistinct = ArrayBuffer[Item]()
35 | for (i <- 0 to userGoldCfRecallDistinctTmp.size - 1) {
36 | val item = userGoldCfRecallDistinctTmp(i)
37 | if (!userGoldCfRecallDistinct.map(f => f.itemKey).contains(item.itemKey)) {
38 | userGoldCfRecallDistinct += item
39 | }
40 | }
41 |
42 | // 4 相似内容召回查询
43 | val userGoldSimilarContentArray = userGoldsdArray.map { itemKey: String =>
44 | // 通过key查询,得到列表,这里一般通过其他接口,取相应的数据,在本代码中不展开,这里采用一个Map来做实例讲解
45 | // 需要解析召回内容,并且取top,用数据格式返回
46 | val itemByOneKeyArray = Map[String, Array[Item]]().getOrElse(itemKey, Array[Item]()).slice(0, recallByKeyNum)
47 | itemByOneKeyArray
48 | }
49 | // 4.2 汇总并去重
50 | val userGoldSimilarContentRecallDistinctTmp = userGoldSimilarContentArray.flatMap(f => f)
51 | val userGoldSimilarContentRecallDistinct = ArrayBuffer[Item]()
52 | for (i <- 0 to userGoldSimilarContentRecallDistinctTmp.size - 1) {
53 | val item = userGoldSimilarContentRecallDistinctTmp(i)
54 | if (!userGoldSimilarContentRecallDistinct.map(f => f.itemKey).contains(item.itemKey)) {
55 | userGoldSimilarContentRecallDistinct += item
56 | }
57 | }
58 |
59 | // 5 用户类别兴趣召回查询
60 | val userGoldSimilarCategoryArray = userCategoryArray.map { category: String =>
61 | // 通过key查询,得到列表,这里一般通过其他接口,取相应的数据,在本代码中不展开,这里采用一个Map来做实例讲解
62 | // 需要解析召回内容,并且取top,用数据格式返回
63 | val itemByOneKeyArray = Map[String, Array[Item]]().getOrElse(category, Array[Item]()).slice(0, recallByKeyNum)
64 | itemByOneKeyArray
65 | }
66 | // 5.2 汇总并去重
67 | val userGoldSimilarCategoryRecallDistinctTmp = userGoldSimilarCategoryArray.flatMap(f => f)
68 | val userGoldSimilarCategoryRecallDistinct = ArrayBuffer[Item]()
69 | for (i <- 0 to userGoldSimilarCategoryRecallDistinctTmp.size - 1) {
70 | val item = userGoldSimilarCategoryRecallDistinctTmp(i)
71 | if (!userGoldSimilarCategoryRecallDistinct.map(f => f.itemKey).contains(item.itemKey)) {
72 | userGoldSimilarCategoryRecallDistinct += item
73 | }
74 | }
75 |
76 | // 6 依此类推,查询其它召回数据,这里就不展开了
77 |
78 | // 7 多个召回数据合并,排序,并且取TopK
79 | // 7.1 CF
80 | // 取每个召回的参数权重,这里用个Map来做实例讲解
81 | val weightCF = Map[String, Double]().getOrElse("CF", 1.0)
82 | // 取物品,以及对应的分值
83 | val recallCF = userGoldCfRecallDistinct.toArray.map(x => (x.itemKey, x.score * weightCF))
84 | // 7.2 Content
85 | // 取每个召回的参数权重,这里用个Map来做实例讲解
86 | val weightContent = Map[String, Double]().getOrElse("Content", 1.0)
87 | // 取物品,以及对应的分值
88 | val recallContent = userGoldSimilarContentRecallDistinct.toArray.map(x => (x.itemKey, x.score * weightContent))
89 | // 7.3 Category
90 | // 取每个召回的参数权重,这里用个Map来做实例讲解
91 | val weightCategory = Map[String, Double]().getOrElse("Category", 1.0)
92 | // 取物品,以及对应的分值
93 | val recallCategory = userGoldSimilarCategoryRecallDistinct.toArray.map(x => (x.itemKey, x.score * weightCategory))
94 |
95 | // 7.4 合并,并且返回TopK,排序按照分值降序排
96 | val recallMerge = (recallCF ++ recallContent ++ recallCategory).
97 | sortBy(f => -1 * f._2).
98 | slice(0, recallNum).map {
99 | case (itemKey: String, score: Double) =>
100 | new Item(itemKey).setScore(score)
101 | }
102 |
103 | // 8 返回结果
104 | val recallStatus = if (recallMerge.size > 0) "True" else "False"
105 | val response = new Response(request.getSessionID).
106 | setStatus(recallStatus).
107 | setItemArray(recallMerge)
108 | response
109 | }
110 |
111 | }
--------------------------------------------------------------------------------
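Note: the merge step above (sections 7.1-7.4) weights each recall source, concatenates the lists, sorts by score and cuts to recallNum. The same data flow in a few lines of Python, purely for illustration (item keys and weights are made up):

def merge_recalls(recall_lists, weights, top_k=500):
    """recall_lists: {source_name: [(item_key, score), ...]}; weights: {source_name: weight}."""
    merged = []
    for name, items in recall_lists.items():
        w = weights.get(name, 1.0)
        seen = set()
        for item_key, score in items:
            if item_key in seen:              # de-duplicate within one recall source
                continue
            seen.add(item_key)
            merged.append((item_key, score * w))
    merged.sort(key=lambda kv: -kv[1])        # weighted score, descending
    return merged[:top_k]

print(merge_recalls({"CF": [("101", 0.9), ("108", 0.7), ("101", 0.5)],
                     "Content": [("101", 0.8)]},
                    {"CF": 1.0, "Content": 0.5}, top_k=3))
# [('101', 0.9), ('108', 0.7), ('101', 0.4)]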
/推荐系统算法实践—源码下载/第16章基于逻辑回归的音乐评分预测/LR.scala:
--------------------------------------------------------------------------------
1 | package book_code
2 |
3 | import org.apache.spark.mllib.classification.{ LogisticRegressionModel, LogisticRegressionWithLBFGS }
4 | import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
5 | import org.apache.spark.mllib.linalg.Vectors
6 | import org.apache.spark.mllib.regression.LabeledPoint
7 |
8 | import org.apache.spark.sql.types._
9 | import org.apache.spark.sql.functions._
10 | import org.apache.spark.sql._
11 | import org.apache.spark.sql.SparkSession
12 |
13 | import java.io.{ ObjectInputStream, ObjectOutputStream }
14 | import java.net.URI
15 | import java.sql.Connection
16 | import org.apache.hadoop.conf.Configuration
17 | import org.apache.hadoop.fs.{ FileSystem, Path }
18 |
19 | object LR {
20 |
21 | def main(args: Array[String]): Unit = {
22 |
23 | val spark = SparkSession
24 | .builder
25 | .appName("LR")
26 | .config("spark.hadoop.validateOutputSpecs", "false")
27 | .enableHiveSupport()
28 | .getOrCreate()
29 | import spark.implicits._
30 |
31 | // 1.1 初始化参数
32 | val dataPath = "hdfs://1.1.1.1:9000/LR_Data/sample_original_1/all.csv"
33 | val minFeature = 10
34 | val defaultValue = 0.0
35 | val modelSavePath = ""
36 |
37 | val iter = 100
38 | val reg_param = 0.0
39 | val elastic_net_param = 0.0
40 |
41 | // 2.2 取样本数据
42 | val dataRead = spark.read.options(Map(("delimiter", "|"), ("header", "false"))).csv(dataPath)
43 | val col = dataRead.columns
44 | val readSampleData = dataRead.withColumnRenamed(col(0), "label").
45 | withColumnRenamed(col(1), "feature").
46 | withColumnRenamed(col(2), "item")
47 | readSampleData.cache()
48 |
49 | //2.3 建立标签ID的索引以及数据处理方法
50 | val dataProcessObj = new DataProcess()
51 |
52 | // 2.4 生成样本
53 | val (training, test) = sampleDataProcess(spark, readSampleData, dataProcessObj)
54 | training.cache()
55 | training.count()
56 | test.cache()
57 | test.count()
58 |
59 | //3.1 建立逻辑回归模型
60 | val lr = new LogisticRegressionWithLBFGS().setNumClasses(2)
61 | lr.optimizer.setNumIterations(iter)
62 | lr.optimizer.setRegParam(reg_param)
63 | val lrModel = lr.run(training.rdd)
64 |
65 | //3.2 计算模型指标
66 | lrModel.clearThreshold()
67 | val scoreAndLabels = test.rdd.map { point =>
68 | val score = lrModel.predict(point.features)
69 | (score, point.label)
70 | }
71 | val metrics = new BinaryClassificationMetrics(scoreAndLabels)
72 | val auc = metrics.areaUnderROC()
73 | val aupr = metrics.areaUnderPR()
74 | println(s"AUC: ${auc}")
75 | println(s"AUPR: ${aupr}")
76 |
77 | // 4.1 封装模型
78 | val mllibLR = new LrModel(lrModel, defaultValue, dataProcessObj)
79 | // 4.2 保存模型
80 | modelSave(mllibLR, modelSavePath)
81 |
82 | }
83 |
84 | /**
85 | * 保存序列化的模型
86 | */
87 | def modelSave(
88 | model: LrModel,
89 | path: String): Unit = {
90 | }
91 |
92 | def sampleDataProcess(spark: SparkSession, readSampleData: DataFrame, dataProcessObj: DataProcess): (Dataset[LabeledPoint], Dataset[LabeledPoint]) = {
93 | (training, test) // 样本解析与切分逻辑省略,返回(训练集, 测试集)
94 | }
95 |
96 | }
--------------------------------------------------------------------------------
/推荐系统算法实践—源码下载/第17章Kaggle竞赛之Outbrain点击率预估/EnsembleTree.scala:
--------------------------------------------------------------------------------
1 | package book_code
2 |
3 | import org.apache.hadoop.conf.Configuration
4 | import org.apache.hadoop.fs.{ FileSystem, Path }
5 | import org.apache.spark.mllib.classification.{ LogisticRegressionModel, LogisticRegressionWithLBFGS }
6 | import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
7 | import org.apache.spark.mllib.linalg.Vectors
8 | import org.apache.spark.ml.linalg.{ Vector => mlVector }
9 | import org.apache.spark.mllib.linalg.Vector
10 | import org.apache.spark.mllib.regression.LabeledPoint
11 | import org.apache.spark.mllib.tree.GradientBoostedTrees
12 | import org.apache.spark.mllib.tree.RandomForest
13 | import org.apache.spark.mllib.tree.configuration.BoostingStrategy
14 | import org.apache.spark.mllib.tree.configuration.FeatureType._
15 | import org.apache.spark.mllib.tree.model.{ GradientBoostedTreesModel, Node, RandomForestModel }
16 | import org.apache.spark.mllib.tree.configuration.Algo.{ Algo, Regression }
17 | import org.apache.spark.mllib.tree.configuration.QuantileStrategy._
18 | import org.apache.spark.mllib.tree.configuration.Strategy
19 | import org.apache.spark.mllib.tree.impurity.Variance
20 | import org.apache.spark.rdd.RDD
21 | import org.apache.spark.sql._
22 | import scala.collection.mutable.ArrayBuffer
23 | import org.apache.spark.sql.types._
24 | import org.apache.spark.sql.functions._
25 | import org.apache.spark.sql._
26 | import org.apache.spark.sql.SparkSession
27 | import java.io.{ ObjectInputStream, ObjectOutputStream }
28 | import java.net.URI
29 | import java.sql.Connection
30 | import org.apache.hadoop.conf.Configuration
31 | import org.apache.hadoop.fs.{ FileSystem, Path }
32 |
33 | object EnsembleTree {
34 |
35 | def main(args: Array[String]): Unit = {
36 |
37 | val spark = SparkSession.builder().
38 | master("local").
39 | appName("EnsembleTree").
40 | getOrCreate()
41 |
42 | import spark.implicits._
43 |
44 | // 1.1 初始化参数
45 | val dataPath = "hdfs://1.1.1:9000/data/Outbrain/all.csv"
46 | val minFeature = 10
47 | val defaultValue = 0.0
48 | val modelSavePath = ""
49 | var iteratTree = 10
50 | var iteratDepth = 10
51 | var maxAuc = 0.0
52 | var maxDepth = 10
53 | var numTrees = 10
54 | var minInstancesPerNode = 2
55 | var iter = 100
56 | var reg_param = 0.0
57 | var elastic_net_param = 0.0
58 |
59 | // 2.1 取样本数据
60 | val dataRead = spark.read.options(Map(("delimiter", "|"), ("header", "false"))).csv(dataPath)
61 | val col = dataRead.columns
62 | val readSampleData = dataRead.withColumnRenamed(col(0), "label").
63 | withColumnRenamed(col(1), "feature").
64 | withColumnRenamed(col(2), "item")
65 | readSampleData.cache()
66 |
67 | //2.2 建立数据处理方法
68 | val dataProcessObj1 = new DataProcess()
69 | val dataProcessObj2 = new DataProcess()
70 | val dataProcessObjAll = new DataProcess()
71 |
72 | //2 训练样本准备,准备2份
73 | val (training1, test1) = sampleDataProcess(spark, readSampleData, dataProcessObj1)
74 | training1.cache()
75 | training1.count()
76 | test1.cache()
77 | test1.count()
78 |
79 | val (training2, test2) = sampleDataProcess(spark, readSampleData, dataProcessObj2)
80 | training2.cache()
81 | training2.count()
82 | test2.cache()
83 | test2.count()
84 |
85 | //3.1 Gbdt1模型训练
86 | val boostingStrategy = BoostingStrategy.defaultParams("Regression")
87 | boostingStrategy.treeStrategy.categoricalFeaturesInfo = Map[Int, Int]()
88 | boostingStrategy.treeStrategy.minInstancesPerNode = minInstancesPerNode
89 | boostingStrategy.numIterations = numTrees
90 | boostingStrategy.treeStrategy.maxDepth = maxDepth
91 | val gbdtMode1 = GradientBoostedTrees.train(training1.rdd, boostingStrategy)
92 |
93 | //3.2 Gbdt2模型训练
94 | val gbdtMode2 = GradientBoostedTrees.train(training2.rdd, boostingStrategy)
95 |
96 |
97 | //4 解析样本,通过2个树模型映射到最终的LR输入向量
98 | val gbdtMode1_BC = spark.sparkContext.broadcast(gbdtMode1)
99 | val gbdtMode2_BC = spark.sparkContext.broadcast(gbdtMode2)
100 |
101 | val mergeSampleData = readSampleData.map { row =>
102 | val click = row(0).toString().toInt
103 | val detail = row(1).toString()
104 | val itemid = row(2).toString()
105 | val label = if (click > 0) 1.0 else 0.0
106 |
107 | //第1个GBDT映射
108 | val (tree1Size, tree1NodeFeature) = gettreeNode(gbdtMode1_BC.value, tree1Feature)
109 |
110 | //第2个GBDT映射
111 | val (tree2Size, tree2NodeFeature) = gettreeNode(gbdtMode2_BC.value, tree2Feature)
112 |
113 | //所有样本归一化
114 | val allFeature = allMap
115 | val allSize = dataProcessObjAll.numFeatures
116 |
117 | //合并
118 | val mergeFeature = (tree1NodeFeature ++
119 | (tree2NodeFeature.map(f => (f._1 + tree1Size.toInt, f._2))) ++
120 | (tree3NodeFeature.map(f => (f._1 + tree1Size.toInt + tree2Size.toInt, f._2))) ++
121 | (allFeature.map(f => (f._1 + tree1Size.toInt + tree2Size.toInt + tree3Size.toInt, f._2)))).sortBy(f => f._1)
122 | val mergeSize = tree1Size + tree2Size + tree3Size + allSize
123 | val point = LabeledPoint(label, Vectors.sparse(mergeSize.toInt, mergeFeature.map(_._1), mergeFeature.map(_._2)))
124 | point
125 | }
126 |
127 | //5 lr模型训练
128 | val Splits = mergeSampleData.randomSplit(Array(0.7, 0.3))
129 | val Training = Splits(0)
130 | val Test = Splits(1)
131 | Training.cache()
132 | Test.cache()
133 | Training.count()
134 | Test.count()
135 |
136 | val lr = new LogisticRegressionWithLBFGS().setNumClasses(2)
137 | lr.optimizer.setNumIterations(iter)
138 | lr.optimizer.setRegParam(reg_param)
139 | val lrModel = lr.run(Training.rdd)
140 |
141 | //6 计算模型指标
142 | lrModel.clearThreshold()
143 | val scoreAndLabels = Test.rdd.map { point =>
144 | val score = lrModel.predict(point.features)
145 | (score, point.label)
146 | }
147 | val metrics = new BinaryClassificationMetrics(scoreAndLabels)
148 | val auc = metrics.areaUnderROC()
149 | val aupr = metrics.areaUnderPR()
150 | println(s"AUC: ${auc}")
151 | println(s"AUPR: ${aupr}")
152 |
153 | // 7.1 封装模型
154 | val mllibEST = new EnsembleTreeModel()
155 | // 7.2 保存模型
156 | modelSave(mllibEST, modelSavePath)
157 |
158 | }
159 |
160 | }
161 |
162 |
--------------------------------------------------------------------------------
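Note: EnsembleTree.scala follows the GBDT + LR recipe: each sample is mapped to the leaf nodes it reaches in the trained boosted trees, the leaf indices become sparse one-hot features, and a logistic regression is trained on top. The helpers it relies on (sampleDataProcess, gettreeNode, modelSave) are left out above; the same idea as a self-contained scikit-learn sketch on synthetic data, not the book's Spark implementation:

from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

X, y = make_classification(n_samples=5000, n_features=20, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# 1 Train the GBDT.
gbdt = GradientBoostingClassifier(n_estimators=10, max_depth=3, random_state=42)
gbdt.fit(X_train, y_train)

# 2 Map every sample to the leaf it falls into in each tree, then one-hot encode the indices.
enc = OneHotEncoder(handle_unknown='ignore')
train_leaves = gbdt.apply(X_train)[:, :, 0]
test_leaves = gbdt.apply(X_test)[:, :, 0]
enc.fit(train_leaves)

# 3 Train LR on the encoded leaf features and evaluate AUC.
lr = LogisticRegression(max_iter=200)
lr.fit(enc.transform(train_leaves), y_train)
auc = roc_auc_score(y_test, lr.predict_proba(enc.transform(test_leaves))[:, 1])
print("AUC: {0:.4f}".format(auc))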
/推荐系统算法实践—源码下载/第17章Kaggle竞赛之Outbrain点击率预估/FFM.py:
--------------------------------------------------------------------------------
1 |
2 | # coding: utf-8
3 |
4 | # In[ ]:
5 |
6 |
7 | import xlearn as xl
8 |
9 | # 1 模型建立
10 | # model = create_linear() # Create linear model
11 | # model = create_fm() # Create factorization machines
12 | # model = create_ffm() # Create field-aware factorization machines.
13 | ffm_model = xl.create_ffm() # 建立field-aware factorization machine模型
14 | ffm_model.setTrain("./small_train.txt") # 设置训练数据
15 | ffm_model.setValidate("./small_test.txt") # 设置测试数据
16 |
17 | # 2 模型参数:
18 | # task: {'binary', # 二元分类
19 | # 'reg'} # 回归
20 | # metric: {'acc', 'prec', 'recall', 'f1', 'auc', # 分类指标
21 | # 'mae', 'mape', 'rmse', 'rmsd'} # 回归指标
22 | # lr: float value # 学习率
23 | # lambda: float value #正则因子
24 | # 其它超参因子参照API说明:https://xlearn-doc-cn.readthedocs.io/en/latest/all_api/index.html
25 | param = {'task':'binary', 'lr':0.1, 'lambda':0.001, 'metric':'auc'}
26 |
27 | # 3 训练模型
28 | # The trained model will be stored in ffm_model.out
29 | ffm_model.fit(param, './ffm_model.out')
30 |
31 | # 4 测试
32 | ffm_model.setTest("./small_test.txt") # Test data
33 | ffm_model.setSigmoid() # Convert output to 0-1
34 |
35 | # 预测结果
36 | # The output result will be stored in output.txt
37 | ffm_model.predict("./ffm_model.out", "./output.txt")
38 |
39 |
--------------------------------------------------------------------------------
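Note: xLearn's FFM expects its input files in the libffm text format, one "label field:feature:value ..." record per line. A tiny sketch that writes two such rows (the ids and values are made up) so the script above has something to read:

rows = [
    "1 0:12:1.0 1:47:1.0 2:151:0.5",   # label field:feature:value ...
    "0 0:3:1.0 1:98:1.0 2:160:0.2",
]
with open("./small_train.txt", "w") as f:
    f.write("\n".join(rows) + "\n")
with open("./small_test.txt", "w") as f:
    f.write("\n".join(rows) + "\n")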
/推荐系统算法实践—源码下载/第17章Kaggle竞赛之Outbrain点击率预估/XGB.py:
--------------------------------------------------------------------------------
1 |
2 | # coding: utf-8
3 |
4 | # In[ ]:
5 |
6 |
7 | import xgboost as xgb
8 | import numpy as np
9 | import random
10 | import math
11 | import os
12 | import sys
13 | from sklearn import metrics
14 |
15 | # 1 数据准备
16 | dtrain = xgb.DMatrix(train_data_path, feature_names = features)
17 | dtest = xgb.DMatrix(test_data_path, feature_names = features)
18 | dvalid = xgb.DMatrix(valid_data_path, feature_names = features)
19 |
20 | # 2 参数准备
21 | param = {'booster': booster,
22 | 'eval_metric':eval_metric,
23 | 'max_depth':max_depth,
24 | 'gamma': gamma,
25 | 'min_child_weight':min_child_weight,
26 | 'eta':eta,
27 | 'objective':objective,
28 | 'subsample': subsample,
29 | 'colsample_bytree': colsample_bytree}
30 |
31 | # 3 模型训练
32 | bst = xgb.train(param, dtrain, round, evals=[(dtrain,'train'), (dtest,'test')])
33 |
34 | # 4 模型测试
35 | preds = bst.predict(dtest)
36 | auc = metrics.roc_auc_score(labels, preds)
37 | precision = metrics.average_precision_score(labels, preds)
38 | mae = metrics.mean_absolute_error(labels, preds)
39 | rmse = math.sqrt(metrics.mean_squared_error(labels, preds))
40 |
41 | # 5 模型保存
42 | bst.save_model(local_path_bin)
43 | bst.dump_model(local_path)
44 |
45 |
--------------------------------------------------------------------------------
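Note: XGB.py above keeps the paths, feature names and hyper-parameters as placeholders. A minimal runnable variant with assumed values (libsvm-format input files and common binary-classification settings) could look like:

import math
import xgboost as xgb
from sklearn import metrics

dtrain = xgb.DMatrix('./train.libsvm')   # assumed libsvm-format files
dtest = xgb.DMatrix('./test.libsvm')

param = {'booster': 'gbtree', 'objective': 'binary:logistic', 'eval_metric': 'auc',
         'max_depth': 6, 'min_child_weight': 1, 'gamma': 0.1,
         'eta': 0.1, 'subsample': 0.8, 'colsample_bytree': 0.8}

bst = xgb.train(param, dtrain, num_boost_round=100,
                evals=[(dtrain, 'train'), (dtest, 'test')])

labels = dtest.get_label()
preds = bst.predict(dtest)
print('AUC: {0:.4f}'.format(metrics.roc_auc_score(labels, preds)))
print('RMSE: {0:.4f}'.format(math.sqrt(metrics.mean_squared_error(labels, preds))))

bst.save_model('./xgb_model.bin')        # binary model
bst.dump_model('./xgb_model.txt')        # human-readable dump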
/推荐系统算法实践—源码下载/第18章基于深度学习的电商商品点击率预估/DeepInterestNetwork.py:
--------------------------------------------------------------------------------
1 |
2 | # coding: utf-8
3 |
4 | # ## 1)环境准备
5 |
6 | # In[28]:
7 |
8 |
9 | import numpy as np
10 | import tensorflow as tf
11 | import pandas as pd
12 | import random
13 | import math
14 | import re
15 |
16 | from sklearn import preprocessing
17 | from os import path, listdir
18 | from sklearn.datasets import load_svmlight_files
19 | from sklearn.model_selection import train_test_split
20 | from sklearn import metrics
21 | from tensorflow.contrib import layers
22 |
23 | from sklearn import metrics
24 |
25 | import time
26 | import datetime
27 |
28 | import os
29 |
30 | print(tf.__version__)
31 | print(tf.__path__)
32 |
33 |
34 | # ## 2)数据准备
35 |
36 | # In[23]:
37 |
38 |
39 | # 获取商品和类目的embedding数据
40 | def get_embedding():
41 | # 类目embedding数据
42 | # 商品embedding数据
43 | return {"category_list": category_list, "golds_list": golds_list}
44 |
45 | # 读取用户行为数据,格式:点击|浏览序列|点击序列|购买序列|类目兴趣序列|用户画像特征
46 | # 1000000123|0,0,0,0,0,0,0,0,0,0,0|0,0,0,0,0,0,0,0,0||0,0,0,0,0,0,0,0,0,0|1,1173,0,0,0
47 | def decode_sequence(line, gold_size, category_size, profile_size):
48 | # 数据解析
49 | return {"label": label, "goods": goods, "category": category, "profile": profile}
50 |
51 | # 数据处理
52 | def process_data(my_path, gold_size, category_size, other_size, batch_size=32, num_epochs=1):
53 | filenames = get_file_list(my_path)
54 | next_element = read_my_file_format(filenames, gold_size, category_size, other_size, batch_size, num_epochs)
55 | return next_element
56 |
57 |
58 | # In[29]:
59 |
60 |
61 | # 测试数据
62 | filenames = 'D:\\Data\\GoldsData\\User\\user_data.csv'
63 | batch_size = 2
64 | num_epochs = 1
65 | gold_size = 10
66 | category_size = 8
67 | other_size = 12
68 | next_element = process_data(filenames, gold_size, category_size, other_size, batch_size, num_epochs)
69 | print(next_element['label'])
70 | print(next_element['goods'])
71 | print(next_element['category'])
72 | print(next_element['profile'])
73 |
74 | my_device='/cpu:0'
75 | init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer(), tf.tables_initializer())
76 | with tf.device(my_device):
77 | sess = tf.Session()
78 | sess.run(init_op)
79 | label, goods, category, other = sess.run([next_element['label'],next_element['goods'],next_element['category'],next_element['profile']])
80 | print(label)
81 | print(goods)
82 | print(category)
83 | print(other)
84 |
85 |
86 |
87 | # In[30]:
88 |
89 |
90 | # embedding数据
91 | embedding = get_embedding()
92 | print(embedding['category_list'].shape)
93 | print(embedding['golds_list'].shape)
94 | print(embedding['golds_list'])
95 | print(embedding['golds_list'])
96 |
97 |
98 | # ## 3)定义DeepInterestNetwork模型
99 |
100 | # In[31]:
101 |
102 |
103 | class DeepInterestNetwork(object):
104 | """ 一、初始化成员变量 """
105 | def __init__(self,
106 | goods_size,
107 | goods_embedding_size,
108 | category_embedding_size,
109 | num_sampled,
110 | learning_rate,
111 | attention_size,
112 | goods_input_length,
113 | category_input_length,
114 | profile_input_length,
115 | log_path):
116 | # 商品池大小
117 | self.goods_size = goods_size
118 | # 商品embedding大小
119 | self.goods_embedding_size = goods_embedding_size
120 | self.category_embedding_size = category_embedding_size
121 | # NCE采样数量
122 | self.num_sampled = num_sampled
123 | # 学习率
124 | self.learning_rate = learning_rate
125 | # attention层大小
126 | self.attention_size = attention_size
127 | # 用户购买序列特征长度
128 | self.goods_input_length = goods_input_length
129 | # 用户类目兴趣特征长度
130 | self.category_input_length = category_input_length
131 | # 用户画像特征长度
132 | self.profile_input_length = profile_input_length
133 | # log_path
134 | self.log_path = log_path
135 |
136 | """ 二、计算网络最后一层的输出 """
137 | def _comput_lay_out(self, batch_data):
138 | """ 1 定义输入数据 """
139 | print("1 定义输入数据" )
140 | with tf.name_scope('input_data'):
141 | # 用户画像特征向量:[batch_size, profile_input_length]
142 | input_profile = batch_data['profile']
143 | # 用户类目特征向量:[batch_size, category_input_length]
144 | input_category = batch_data['category']
145 | # 用户购买序列特征向量:[batch_size, goods_input_length]
146 | input_goods = batch_data['goods']
147 | print("%s: %s" % ("input_profile", input_profile))
148 | print("%s: %s" % ("input_goods", input_goods))
149 | print("%s: %s" % ("input_category", input_category))
150 |
151 | # 计算gold序列中非0元素的比例
152 | batch_goods_ratio = tf.reduce_mean(tf.reduce_mean(tf.to_float(tf.abs(input_goods) > 0),1),0)
153 |
154 | """ 2 对用户行为序列进行embedding_lookup查找,得到用户的行为embed向量 """
155 | # 省略,可以参照第14章。
156 |
157 | """ 3 attention机制,根据用户行为embed向量,通过多层感知神经网络,最后通过Saftmax得到alpha权重向量 """
158 | print("3 对用户序列进行attention层计算" )
159 | with tf.name_scope('attention_layer'):
160 | with tf.variable_scope("attention_layer", reuse=tf.AUTO_REUSE):
161 | # 全连接层计算
162 | # inputs shape: [batch_size, goods_input_length, embedding_size]
163 | # h: [batch_size, goods_input_length, embedding_size]
164 | h = layers.fully_connected(inputs_goods_emb, self.attention_size, activation_fn=tf.nn.tanh)
165 | print("%s: %s" % ("h", h))
166 |
167 | # 输出层计算
168 | # u_context: importance vector
169 | u_context = tf.Variable(tf.truncated_normal([self.attention_size]))
170 | hu_sum = tf.reduce_sum(tf.multiply(h, u_context), axis=2, keep_dims=True)
171 | print("%s: %s" % ("hu_sum", hu_sum))
172 | # 防止 exp 溢出
173 | hu_max = tf.reduce_max(hu_sum, axis=1, keep_dims=True)
174 | print("%s: %s" % ("hu_max", hu_max))
175 | hu_normal = hu_sum - hu_max
176 | print("%s: %s" % ("hu_normal", hu_normal))
177 |
178 | # Softmax计算
179 | # hu_sum: [batch_size, goods_input_length, 1]
180 | exp = tf.exp(hu_normal)
181 | exp_adapt = exp
182 | print("%s: %s" % ("exp_adapt", exp_adapt))
183 |
184 | exp_adapt_sum = tf.reduce_sum(exp_adapt, axis=1, keep_dims=True)
185 | print("%s: %s" % ("exp_adapt_sum", exp_adapt_sum))
186 | alpha = tf.div(exp_adapt, exp_adapt_sum)
187 | print("%s: %s" % ("alpha", alpha))
188 |
189 | # attention计算,[batch_size, embedding_size]
190 | atten_embed = tf.reduce_sum(tf.multiply(inputs_goods_emb, alpha), axis=1)
191 | print("%s: %s" % ("atten_embed", atten_embed))
192 |
193 | """ 4 用户特征向量拼接 """
194 | # 省略,可以参照第14章。
195 |
196 | """ 5 多层感知器神经网络计算,最终得到用户的embedding向量U:[batch_size, embedding_size] """
197 | # 省略,可以参照第14章。
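        # -- Hedged sketch of the elided steps 4-5 (the book refers to Chapter 14 for the original) --
        # Concatenate the attention output, the averaged category embedding and the profile
        # features, then run them through a small MLP; the 128-unit hidden layer is an
        # illustrative choice, and the last layer is sized goods_embedding_size so it can be
        # matched against the goods embedding matrix.
        with tf.name_scope('user_vector'):
            category_emb_mean = tf.reduce_mean(inputs_category_emb, axis=1)
            concat_features = tf.concat([atten_embed, category_emb_mean, tf.to_float(input_profile)], axis=1)
            with tf.variable_scope("fc_layer", reuse=tf.AUTO_REUSE):
                hidden = layers.fully_connected(concat_features, 128, activation_fn=tf.nn.relu)
                layer_out = layers.fully_connected(hidden, self.goods_embedding_size, activation_fn=None)
        return layer_out, batch_goods_ratio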
198 |
199 |
200 | """ 三、网络训练 """
201 | def train(self, batch_data, goods_embedding, category_embedding):
202 | """ 1 Embedding初始化 """
203 | with tf.name_scope('embedding'):
204 | self.goods_embedding = tf.convert_to_tensor(goods_embedding, dtype=tf.float32)
205 | self.category_embedding = tf.convert_to_tensor(category_embedding, dtype=tf.float32)
206 | print("%s: %s" % ("goods_embedding", self.goods_embedding))
207 | print("%s: %s" % ("category_embedding", self.category_embedding))
208 | with tf.variable_scope("embedding", reuse=tf.AUTO_REUSE):
209 | self.nce_biases = tf.get_variable(name='nce_biases', shape=[self.goods_size], initializer=tf.constant_initializer(0.0))
210 | print("%s: %s" % ("nce_biases", self.nce_biases))
211 |
212 | """ 2 计算深度神经网络的最后一层输出 """
213 | layer_out, batch_goods_ratio = self._comput_lay_out(batch_data)
214 | # 用户标签:[batch_size, 1]
215 | input_label = batch_data['label']
216 | print("%s: %s" % ("input_label", input_label))
217 |
218 | """ 3 Softmax计算,用户的embedding向量U乘以商品的embedding向量V,然后通过Softmax计算结果,其中Loss采用NCE负采样方法 """
219 | print("3 最后一层Softmax计算")
220 | # 省略,可以参照第14章。
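        # -- Hedged sketch of the elided step 3 (the book refers to Chapter 14 for the original) --
        # The logits are U * V^T over the whole goods pool; training uses the NCE sampled
        # loss, and the Adam optimizer here is an illustrative choice.
        with tf.name_scope('nce_loss'):
            out = tf.matmul(layer_out, tf.transpose(self.goods_embedding)) + self.nce_biases
            loss = tf.reduce_mean(tf.nn.nce_loss(
                weights=self.goods_embedding,
                biases=self.nce_biases,
                labels=tf.cast(input_label, tf.int64),
                inputs=layer_out,
                num_sampled=self.num_sampled,
                num_classes=self.goods_size))
        with tf.name_scope('optimizer'):
            train_step = tf.train.AdamOptimizer(self.learning_rate).minimize(loss)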
221 |
222 | """4 设定summary,以便在Tensorboard里进行可视化 """
223 | print("4 设定summary" )
224 | with tf.name_scope("summaries"):
225 | tf.summary.scalar("loss", loss)
226 | tf.summary.histogram("nce_biases", self.nce_biases)
227 | # 好几个summary,所以这里要merge_all
228 | summary_op = tf.summary.merge_all()
229 |
230 | """5 返回结果 """
231 | return out, loss, batch_goods_ratio, input_label, summary_op, train_step
232 |
233 | """ 四、预测计算 """
234 | def predict(self, batch_data):
235 | """ 1 计算深度神经网络的最后一层输出 """
236 | layer_out, _ = self._comput_lay_out(batch_data)
237 |
238 | """ 2 计算Softmax的预测结果 """
239 | predict_score = tf.nn.softmax(tf.matmul(layer_out, tf.transpose(self.goods_embedding)) + self.nce_biases, dim=1)
240 | # 结果返回
241 | return predict_score
242 |
243 |
244 | # ## 4)模型训练测试
245 |
246 | # In[32]:
247 |
248 |
249 | # 数据参数
250 | print("0 数据准备和参数设置" )
251 | filenames = 'D:\\Data\\GoldsData\\User\\user_data.csv'
252 | batch_size = 2000
253 | num_epochs = 1000
254 | gold_size = 10
255 | category_size = 8
256 | profile_size = 12
257 | next_element = process_data(filenames, gold_size, category_size, profile_size, batch_size, num_epochs)
258 | print("%s: %s" % ("next_element", next_element))
259 |
260 | # 模型参数
261 | goods_size = 40742
262 | goods_embedding_size = 100
263 | category_embedding_size = 10
264 | num_sampled = 32
265 | learning_rate = 0.01
266 | attention_size = 60
267 | goods_input_length = gold_size * 3
268 | category_input_length = category_size
269 | profile_input_length = profile_size
270 | log_path='D:\\Data\\log\\20180915'
271 |
272 | # embedding参数
273 | embedding = get_embedding()
274 | goods_embedding = embedding['golds_list']
275 | category_embedding = embedding['category_list']
276 | print("%s: %s" % ("goods_embedding.shape", goods_embedding.shape))
277 | print("%s: %s" % ("category_embedding.shape", category_embedding.shape))
278 |
279 | # 开始训练
280 | golds_rec_model = DeepInterestNetwork(goods_size,
281 | goods_embedding_size,
282 | category_embedding_size,
283 | num_sampled,
284 | learning_rate,
285 | attention_size,
286 | goods_input_length,
287 | category_input_length,
288 | profile_input_length,
289 | log_path)
290 | out, loss, batch_goods_ratio, input_label, summary_op, train_step = golds_rec_model.train(next_element, goods_embedding, category_embedding)
291 |
292 | init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer(), tf.tables_initializer())
293 | my_device='/cpu:0'
294 | with tf.device(my_device):
295 | sess = tf.Session()
296 | sess.run(init_op)
297 | batch_cnt = 0
298 | #选定可视化存储目录
299 | writer = tf.summary.FileWriter(log_path, sess.graph)
300 | print("5 迭代过程" )
301 | try:
302 | while True:
303 | batch_cnt = batch_cnt + 1
304 | a, b, c, d, summary, _ = sess.run([out, loss, batch_goods_ratio, input_label, summary_op, train_step])
305 | if batch_cnt % 200 == 0 or batch_cnt <= 10:
306 | print("batch: {} loss: {} gold_ratio: {}".format(batch_cnt, b, c))
307 | writer.add_summary(summary, batch_cnt)
308 | except tf.errors.OutOfRangeError:
309 | print("Train end of dataset")
310 |
311 |
--------------------------------------------------------------------------------
/推荐系统算法实践—源码下载/第19章Notebook实践/Debug_CF.scala:
--------------------------------------------------------------------------------
1 | import scala.math._
2 | import org.apache.spark.sql.SparkSession
3 | import org.apache.spark.sql.Dataset
4 | import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
5 | import org.apache.spark.sql.functions._
6 | import org.apache.spark.sql._
7 | import scala.collection.mutable.WrappedArray
8 | import scala.collection.JavaConverters._
9 | import scala.collection.mutable.ArrayBuffer
19 |
20 | import spark.implicits._
21 | /**
22 | * *********************************
23 | * 1 数据准备
24 | * 数据来源:
25 | * MovieLens 【数据地址:https://grouplens.org/datasets/movielens/】(1M、10M、20M 共三个数据集)
26 | * *********************************
27 | */
28 | // 1.1读取item配置表
29 | val item_conf_path = "hdfs://192.168.1.100:9000/Recommended_Algorithm_Action/I2I/movies.csv"
30 | val item_conf_df = spark.read.options(Map(("delimiter", ","), ("header", "true"))).csv(item_conf_path)
31 | item_conf_df.show(5,false)
32 | val item_id2title_map = item_conf_df.select("movieId", "title").collect().map(row => (row(0).toString(), row(1).toString())).toMap
33 | val item_id2genres_map = item_conf_df.select("movieId", "genres").collect().map(row => (row(0).toString(), row(1).toString())).toMap
34 |
35 | // 1.2读取用户行为数据
36 | val user_rating_path = "hdfs://192.168.1.100:9000/user/Recommended_Algorithm_Action/I2I/ratings.csv"
37 | val user_rating_df = spark.read.options(Map(("delimiter", ","), ("header", "true"))).csv(user_rating_path)
38 | user_rating_df.dtypes
39 | val user_ds = user_rating_df.map {
40 | case Row(userId: String, movieId: String, rating: String, timestamp: String) =>
41 | ItemPref(userId, movieId, rating.toDouble)
42 | }
43 | user_ds.show(5, false)
44 | user_ds.cache()
45 | user_ds.count()
46 |
47 | // 1 (用户:物品) => (用户:(物品集合))
48 | val user_ds1 = user_ds.groupBy("userid").agg(collect_set("itemid")).withColumnRenamed("collect_set(itemid)", "itemid_set")
49 | user_ds1.show(2, false)
50 |
51 | // 2 物品:物品,上三角数据
52 | val user_ds2 = user_ds1.flatMap { row =>
53 | val itemlist = row.getAs[scala.collection.mutable.WrappedArray[String]](1).toArray.sorted
54 | val result = new ArrayBuffer[(String, String, Double)]()
55 | for (i <- 0 to itemlist.length - 2) {
56 | for (j <- i + 1 to itemlist.length - 1) {
57 | result += ((itemlist(i), itemlist(j), 1.0))
58 | }
59 | }
60 | result
61 | }.withColumnRenamed("_1", "itemidI").withColumnRenamed("_2", "itemidJ").withColumnRenamed("_3", "score")
62 | user_ds2.show(5, false)
63 |
64 | // 3 计算物品与物品,上三角,同现频次
65 | val user_ds3 = user_ds2.groupBy("itemidI", "itemidJ").agg(sum("score").as("sumIJ"))
66 | user_ds3.show(5, false)
67 |
68 |
69 | // 4 计算物品总共出现的频次
70 | val user_ds0 = user_ds.withColumn("score", lit(1)).groupBy("itemid").agg(sum("score").as("score"))
71 | user_ds0.show(5, false)
72 |
73 | // 5 计算同现相似度
74 | val user_ds4 = user_ds3.join(user_ds0.withColumnRenamed("itemid", "itemidJ").withColumnRenamed("score", "sumJ").select("itemidJ", "sumJ"), "itemidJ")
75 | user_ds4.show(5, false)
76 |
77 | val user_ds5 = user_ds4.join(user_ds0.withColumnRenamed("itemid", "itemidI").withColumnRenamed("score", "sumI").select("itemidI", "sumI"), "itemidI")
78 | user_ds5.show(5, false)
79 |
80 | // 根据公式N(i)∩N(j)/sqrt(N(i)*N(j)) 计算
81 | val user_ds6 = user_ds5.withColumn("result", col("sumIJ") / sqrt(col("sumI") * col("sumJ")))
82 | user_ds6.show(5, false)
83 |
84 | // 6 上、下三角合并
85 | println(s"user_ds6.count(): ${user_ds6.count()}")
86 | val user_ds8 = user_ds6.select("itemidI", "itemidJ", "result").union(user_ds6.select($"itemidJ".as("itemidI"), $"itemidI".as("itemidJ"), $"result"))
87 | println(s"user_ds8.count(): ${user_ds8.count()}")
88 | user_ds8.show(5, false)
89 |
90 | // 7 结果返回
91 | val out = user_ds8.select("itemidI", "itemidJ", "result").map { row =>
92 | val itemidI = row.getString(0)
93 | val itemidJ = row.getString(1)
94 | val similar = row.getDouble(2)
95 | ItemSimi(itemidI, itemidJ, similar)
96 | }
97 | out.show(5, false)
98 |
99 | // 结果增加配置信息
100 | val item_id2title_map_BC = spark.sparkContext.broadcast(item_id2title_map)
101 | val item_id2genres_map_BC = spark.sparkContext.broadcast(item_id2genres_map)
102 |
103 | val items_similar_cooccurrence = out.map {
104 | case ItemSimi(itemidI: String, itemidJ: String, similar: Double) =>
105 | val i_title = item_id2title_map_BC.value.getOrElse(itemidI, "")
106 | val j_title = item_id2title_map_BC.value.getOrElse(itemidJ, "")
107 | val i_genres = item_id2genres_map_BC.value.getOrElse(itemidI, "")
108 | val j_genres = item_id2genres_map_BC.value.getOrElse(itemidJ, "")
109 | (itemidI, itemidJ, similar, i_title, j_title, i_genres, j_genres)
110 | }.withColumnRenamed("_1", "itemidI").
111 | withColumnRenamed("_2", "itemidJ").
112 | withColumnRenamed("_3", "similar").
113 | withColumnRenamed("_4", "i_title").
114 | withColumnRenamed("_5", "j_title").
115 | withColumnRenamed("_6", "i_genres").
116 | withColumnRenamed("_7", "j_genres")
117 | items_similar_cooccurrence.columns
118 | items_similar_cooccurrence.cache()
119 | items_similar_cooccurrence.count
120 |
121 | // 查询结果信息,查询各种Case
122 | items_similar_cooccurrence.
123 | orderBy($"itemidI".asc, $"similar".desc).
124 | select("i_title", "j_title", "i_genres", "j_genres", "similar").
125 | show(20)
126 |
127 | // 3.1 同现相似度推荐
128 | val cooccurrence = items_similar_cooccurrence.select("itemidI", "itemidJ", "similar").map {
129 | case Row(itemidI: String, itemidJ: String, similar: Double) =>
130 | ItemSimi(itemidI, itemidJ, similar)
131 | }
132 | cooccurrence.show(5)
133 |
134 | // 1 数据准备
135 | val items_similar_ds1 = cooccurrence
136 | val user_prefer_ds1 = user_ds
137 |
138 | // 2 根据用户的item召回相似物品
139 | val user_prefer_ds2 = items_similar_ds1.join(user_prefer_ds1, $"itemidI" === $"itemid", "inner")
140 | user_prefer_ds2.show(5)
141 |
142 | // 3 计算召回的用户物品得分
143 | val user_prefer_ds3 = user_prefer_ds2.withColumn("score", col("pref") * col("similar")).select("userid", "itemidJ", "score")
144 | user_prefer_ds3.show(5)
145 |
146 | // 4 得分汇总
147 | val user_prefer_ds4 = user_prefer_ds3.groupBy("userid", "itemidJ").agg(sum("score").as("score")).withColumnRenamed("itemidJ", "itemid")
148 | user_prefer_ds4.show(5)
149 |
150 | // 5 用户得分排序结果,去除用户已评分物品
151 | val user_prefer_ds5 = user_prefer_ds4.join(user_prefer_ds1, Seq("userid", "itemid"), "left").where("pref is null")
152 | user_prefer_ds5.show(5)
153 |
154 | // 6 结果返回
155 | val out1 = user_prefer_ds5.select("userid", "itemid", "score").map { row =>
156 | val userid = row.getString(0)
157 | val itemid = row.getString(1)
158 | val pref = row.getDouble(2)
159 | UserRecomm(userid, itemid, pref)
160 | }
161 |
162 | // 结果增加配置信息
163 | val user_predictr_cooccurrence = out1.map {
164 | case UserRecomm(userid: String, itemid: String, pref: Double) =>
165 | val title = item_id2title_map_BC.value.getOrElse(itemid, "")
166 | val genres = item_id2genres_map_BC.value.getOrElse(itemid, "")
167 | (userid, itemid, title, genres, pref)
168 | }.withColumnRenamed("_1", "userid").
169 | withColumnRenamed("_2", "itemid").
170 | withColumnRenamed("_3", "title").
171 | withColumnRenamed("_4", "genres").
172 | withColumnRenamed("_5", "pref")
173 | user_predictr_cooccurrence.columns
174 | user_predictr_cooccurrence.cache()
175 | user_predictr_cooccurrence.count()
176 |
177 | // 查询结果信息,查询各种Case
178 | user_predictr_cooccurrence.orderBy($"userid".asc, $"pref".desc).show(20)
179 |
180 |
181 |
182 |
183 |
--------------------------------------------------------------------------------
/推荐系统算法实践—源码下载/第19章Notebook实践/Debug_FM.py:
--------------------------------------------------------------------------------
1 |
2 | # coding: utf-8
3 |
4 | # ## 1)环境准备
5 |
6 | # In[1]:
7 |
8 | import numpy as np
9 | import tensorflow as tf
10 | import pandas as pd
11 | import random
12 | import math
13 | import re
14 |
15 | from sklearn import preprocessing
16 | from os import path, listdir
17 | from sklearn.datasets import load_svmlight_files
18 | from sklearn.model_selection import train_test_split
19 | from sklearn import metrics
20 | from tensorflow.contrib import layers
21 |
22 | import time
23 | import datetime
24 |
25 | import os
26 | os.environ["CUDA_VISIBLE_DEVICES"]="0"
27 |
28 | print tf.__version__
29 | print tf.__path__
30 |
31 |
32 | # ## 2)数据准备Dataset格式
33 |
34 | # In[2]:
35 |
36 | # Per-line parsing: each input line is a label followed by CSV feature values, for example:
37 | # 0.0,0.6666666666666666,0.5,0.0,0.0,0.0,0.0,0.7272727272727273,0.42857142857142855
38 |
39 | # 数据处理
40 | def process_data(data_type, my_path, feature_size, batch_size=32, num_epochs=1):
41 | filenames = get_file_list(my_path)
42 | next_element = read_my_file_format(data_type, filenames, feature_size, batch_size, num_epochs)
43 | return next_element
44 |
45 | # 创建session,指定GPU或者CPU使用率
46 | def get_session(gpu_fraction=0.1):
47 | gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=gpu_fraction,
48 | allow_growth=True)
49 | # server = tf.train.Server.create_local_server()
50 | return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
51 |
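# get_file_list() and read_my_file_format() used by process_data() above are elided in
# this notebook. A minimal sketch under TF 1.x, reusing the imports at the top of the
# file and assuming libsvm lines of the form "label idx:val idx:val ..." with 0-based,
# in-range indices (subtract 1 from feat_ids if your files are 1-based):
def get_file_list(my_path):
    # accept either a directory of data files or a single file
    if path.isdir(my_path):
        return [path.join(my_path, f) for f in listdir(my_path)
                if path.isfile(path.join(my_path, f))]
    return [my_path]

def decode_libsvm(line, feature_size):
    # split "label idx:val idx:val ..." and scatter the values into a dense vector
    columns = tf.string_split([line], ' ')
    labels = tf.reshape(tf.string_to_number(columns.values[0], out_type=tf.float32), [-1])
    splits = tf.string_split(columns.values[1:], ':')
    id_vals = tf.reshape(splits.values, splits.dense_shape)
    feat_ids, feat_vals = tf.split(id_vals, num_or_size_splits=2, axis=1)
    feat_ids = tf.string_to_number(feat_ids, out_type=tf.int32)
    feat_vals = tf.string_to_number(feat_vals, out_type=tf.float32)
    dense_vector = tf.scatter_nd(feat_ids, tf.reshape(feat_vals, [-1]), [feature_size])
    return {'labels': labels, 'dense_vector': dense_vector}

def read_my_file_format(data_type, filenames, feature_size, batch_size=32, num_epochs=1):
    # data_type is kept for interface compatibility; this sketch only handles 'libsvm'
    dataset = tf.data.TextLineDataset(filenames)
    dataset = dataset.map(lambda line: decode_libsvm(line, feature_size))
    dataset = dataset.repeat(num_epochs).batch(batch_size).prefetch(1)
    return dataset.make_one_shot_iterator().get_next()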
52 |
53 | # ## 3)Debug代码
54 |
55 | # In[3]:
56 |
57 | """ 0 测试数据 """
58 | filenames = '/data/data0/001'
59 | feature_size = 530
60 | fm_v_size = 10
61 | batch_size = 3
62 | num_epochs = 1
63 | data_type = 'libsvm'
64 | batch_data = process_data(data_type, filenames, feature_size, batch_size, num_epochs)
65 | print("%s: %s" % ("batch_data", batch_data))
66 |
67 |
68 | # In[9]:
69 |
70 | """ 1 定义输入数据 """
71 | # 标签:[batch_size, 1]
72 | labels = batch_data['labels']
73 | # 用户特征向量:[batch_size, feature_size]
74 | dense_vector = tf.reshape(batch_data['dense_vector'], shape=[-1, feature_size, 1]) # None * feature_size * 1
75 | print("%s: %s" % ("dense_vector", dense_vector))
76 | print("%s: %s" % ("labels", labels))
77 |
78 |
79 | # In[10]:
80 |
81 | """ 2 定义网络输出 """
82 | # FM参数,生成或者获取W V
83 | with tf.variable_scope("lr_layer", reuse=tf.AUTO_REUSE):
84 | FM_W = tf.get_variable(name='fm_w', shape=[feature_size, 1], initializer=tf.glorot_normal_initializer())
85 | FM_V = tf.get_variable(name='fm_v', shape=[feature_size, fm_v_size], initializer=tf.glorot_normal_initializer())
86 | FM_B = tf.Variable(tf.constant(0.0), dtype=tf.float32 ,name="fm_bias") # W0
87 | print("%s: %s" % ("FM_W", FM_W))
88 | print("%s: %s" % ("FM_V", FM_V))
89 | print("%s: %s" % ("FM_B", FM_B))
90 |
91 |
92 | # In[11]:
93 |
94 | # ---------- w * x ----------
95 | Y_first = tf.reduce_sum(tf.multiply(FM_W, dense_vector), 2) # None * F
96 | print("%s: %s" % ("Y_first", Y_first))
97 |
98 |
99 | # In[12]:
100 |
101 | # ---------- Vij * Vij* Xij ---------------
102 | embeddings = tf.multiply(FM_V, dense_vector) # None * V * X
103 | print("%s: %s" % ("embeddings", embeddings))
104 | # sum_square part
105 | summed_features_emb = tf.reduce_sum(embeddings, 1) # sum(v*x)
106 | summed_features_emb_square = tf.square(summed_features_emb) # (sum(v*x))^2
107 |
108 | # square_sum part
109 | squared_features_emb = tf.square(embeddings) # (v*x)^2
110 | squared_sum_features_emb = tf.reduce_sum(squared_features_emb, 1) # sum((v*x)^2)
111 |
112 | # second order
113 | Y_second = 0.5 * tf.subtract(summed_features_emb_square, squared_sum_features_emb) # 0.5*((sum(v*x))^2 - sum((v*x)^2))
114 | print("%s: %s" % ("Y_second", Y_second))
115 |
116 |
117 | # In[15]:
118 |
119 | # out = W * X + Vij * Vij* Xij
120 | FM_out_lay1 = tf.concat([Y_first, Y_second], axis=1)
121 | print("%s: %s" % ("FM_out_lay1", FM_out_lay1))
122 |
123 | Y_Out = tf.reduce_sum(FM_out_lay1, 1)
124 | print("%s: %s" % ("Y_Out", Y_Out))
125 |
126 |
127 | # In[16]:
128 |
129 | # out = out + bias
130 | y_d = tf.reshape(Y_Out,shape=[-1])
131 | Y_bias = FM_B * tf.ones_like(y_d, dtype=tf.float32) # Y_bias
132 | Y_Out = tf.add(Y_Out, Y_bias, name='Y_Out')
133 | print("%s: %s" % ("Y_bias", Y_bias))
134 | print("%s: %s" % ("Y_Out", Y_Out))
135 |
136 |
137 | # In[17]:
138 |
139 | # ---------- score ----------
140 | score=tf.nn.sigmoid(Y_Out,name='score')
141 | score=tf.reshape(score, shape=[-1, 1])
142 | print("%s: %s" % ("score", score))
143 |
144 |
145 | # In[18]:
146 |
147 | """ 3 定义损失函数和AUC指标 """
148 | reg_type = 'l2_reg'
149 | loss_fuc = 'Cross_entropy'
150 | reg_param = 0.01
151 | learning_rate = 0.01
152 | print("%s: %s" % ("reg_type", reg_type))
153 | print("%s: %s" % ("loss_fuc", loss_fuc))
154 | print("%s: %s" % ("reg_param", reg_param))
155 | print("%s: %s" % ("learning_rate", learning_rate))
156 |
157 |
158 | # In[19]:
159 |
160 | # loss:Squared_error,Cross_entropy ,FTLR
161 | if reg_type == 'l1_reg':
162 | regularization = reg_param * tf.reduce_sum(tf.abs(FM_W))
163 | elif reg_type == 'l2_reg':
164 | regularization = reg_param * tf.nn.l2_loss(FM_W)
165 | else:
166 | regularization = reg_param * tf.nn.l2_loss(FM_W)
167 |
168 | if loss_fuc == 'Squared_error':
169 | loss = tf.reduce_mean(tf.reduce_sum(tf.square(labels - score), reduction_indices=[1])) + regularization
170 | elif loss_fuc == 'Cross_entropy':
171 | loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=tf.reshape(Y_Out, [-1]), labels=tf.reshape(labels, [-1]))) + regularization
172 | elif loss_fuc == 'FTLR':
173 | loss = tf.reduce_mean(tf.reduce_sum(tf.square(labels - score), reduction_indices=[1])) + regularization
174 |
175 |
176 | # In[20]:
177 |
178 | # AUC
179 | auc = tf.metrics.auc(labels, score)
180 | print("%s: %s" % ("labels", labels))
181 | print("%s: %s" % ("score", score))
182 |
183 |
184 | # In[21]:
185 |
186 | # w为0的比例,w的平均值
187 | w_zero_ratio = tf.reduce_mean(tf.to_float(tf.abs(FM_W) <= 1.0e-5))
188 | w_avg = tf.reduce_mean(FM_W)
189 | v_zero_ratio = tf.reduce_mean(tf.to_float(tf.abs(FM_V) <= 1.0e-5))
190 | v_avg = tf.reduce_mean(FM_V)
191 |
192 |
193 | # In[22]:
194 |
195 | """ 4 设定optimizer """
196 | global_step = tf.Variable(0, dtype=tf.int32, trainable=False, name='global_step')
197 | with tf.variable_scope("optimizer", reuse=tf.AUTO_REUSE):
198 | optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-8)
199 | train_step = optimizer.minimize(loss, global_step=global_step)
200 |
201 |
202 | # In[23]:
203 |
204 | """ 分步调试,对上面各个步骤中的变量值进行打印和查看,以方便定位问题 """
205 | init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer(), tf.tables_initializer())
206 | with tf.device('/cpu:0'):
207 | gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.2, allow_growth=True)
208 | sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
209 | sess.run(init_op)
210 | a, b = sess.run([Y_Out, score])
211 | print a
212 | print b
213 |
--------------------------------------------------------------------------------
/推荐系统算法实践—源码下载/第19章Notebook实践/Debug_Sk_LR.py:
--------------------------------------------------------------------------------
1 |
2 | # coding: utf-8
3 |
4 | # ## 1)环境准备
5 |
6 | # In[25]:
7 | from sklearn.linear_model import LogisticRegression
8 | from sklearn import metrics
9 | from os import path, listdir
10 | from sklearn.datasets import load_svmlight_files
11 | from sklearn.model_selection import train_test_split
12 | from sklearn.externals import joblib
13 | from sklearn import preprocessing
14 | import numpy as np
15 | import pandas as pd
16 | import random
17 | import platform
18 | print("Python Version: %s"%(platform.python_version()))
19 |
20 | # In[26]:
21 | """
22 | Helper for turning libSVM data into samples; it can return batch data as well as the full X/Y sets.
23 | Steps:
24 | 1) Read the libSVM-format data.
25 | 2) Normalize the features.
26 | 3) Split into training and test sets.
27 | 4) Generate the batch data.
28 | """
29 |
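# The process_data() helper used below is elided in this notebook. A minimal sketch
# following the four steps above, reusing the imports at the top of the file and
# assuming every regular file under data_path is a libSVM file; the book's original
# may differ in detail.
def process_data(data_path, feature_size, test_rat=0.4, random_seed=0,
                 train_batch_size=20000, test_batch_size=20000):
    # 1) read every libSVM file under data_path
    files = [path.join(data_path, f) for f in listdir(data_path)
             if path.isfile(path.join(data_path, f))]
    parts = load_svmlight_files(files, n_features=feature_size)
    X = np.vstack([parts[i].toarray() for i in range(0, len(parts), 2)])
    Y = np.concatenate([parts[i] for i in range(1, len(parts), 2)])
    # 2) min-max normalization of the features
    X = preprocessing.MinMaxScaler().fit_transform(X)
    # 3) split into training and test sets
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=test_rat, random_state=random_seed)
    X_train, X_test = pd.DataFrame(X_train), pd.DataFrame(X_test)
    Y_train, Y_test = pd.DataFrame(Y_train), pd.DataFrame(Y_test)
    # 4) slice the training/test sets into fixed-size (X, Y) batches
    train_batch = [(X_train[i:i + train_batch_size], Y_train[i:i + train_batch_size])
                   for i in range(0, len(X_train), train_batch_size)]
    test_batch = [(X_test[i:i + test_batch_size], Y_test[i:i + test_batch_size])
                  for i in range(0, len(X_test), test_batch_size)]
    return {'train_batch': train_batch, 'test_batch': test_batch,
            'X_train': X_train, 'Y_train': Y_train,
            'X_test': X_test, 'Y_test': Y_test}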
30 | # In[27]:
31 | # 数据测试
32 | data_path = '/data/data01/'
33 | test_rat=0.4
34 | random_seed=0
35 | train_batch_size=20000
36 | test_batch_size=20000
37 | feature_size=530
38 |
39 | # 获取样本数据
40 | data = process_data(data_path, feature_size, test_rat, random_seed, train_batch_size, test_batch_size)
41 |
42 | train_batch = data['train_batch']
43 | test_batch = data['test_batch']
44 | X_train = data['X_train']
45 | Y_train = data['Y_train']
46 | X_test = data['X_test']
47 | Y_test = data['Y_test']
48 |
49 | # 查看样本数据大小
50 | print("X_train.shape: ")
51 | print(X_train.shape)
52 | print("Y_train.shape: ")
53 | print(Y_train.shape)
54 | print("X_test.shape: ")
55 | print(X_test.shape)
56 | print("Y_test.shape: ")
57 | print(Y_test.shape)
58 |
59 | # In[30]:
60 | # 3.1 建立逻辑回归模型,并且设定参数
61 | lr_model= LogisticRegression(penalty='l2', C=1000, solver='lbfgs', max_iter=500)
62 |
63 | # 3.2 训练逻辑回归模型
64 | lr_model.fit(X_train,Y_train.values.ravel())
65 |
66 | # In[31]:
67 | # 3.3 采用测试集验证模型离线指标
68 | # 训练集AUC
69 | probs_train= lr_model.predict_proba(X_train)
70 | AUC1 = metrics.roc_auc_score(Y_train, probs_train[:,1])
71 | print("Train Auc: %s"%(AUC1))
72 |
73 | # 测试集AUC
74 | probs_test= lr_model.predict_proba(X_test)
75 | predict_test = lr_model.predict(X_test)
76 | AUC2 = metrics.roc_auc_score(Y_test, probs_test[:,1])
77 | print("Test Auc: %s"%(AUC2))
78 |
79 | # 准确率
80 | accuracy = metrics.accuracy_score(Y_test, predict_test)
81 | print("Test Accuracy: %s"%(accuracy))
82 |
83 | # 召回率
84 | recall = metrics.recall_score(Y_test, predict_test)
85 | print("Test Recall: %s"%(recall))
86 |
87 | # F1值
88 | f1 = metrics.f1_score(Y_test, predict_test)
89 | print("Test F1: %s"%(f1))
90 |
91 | # In[42]:
92 | # 3.4 打印模型参数
93 | w=lr_model.coef_
94 | print("参数大小:")
95 | print(w.shape)
96 | print("参数前10个:")
97 | print(lr_model.coef_[:,0:10])
98 | print("截距:")
99 | print(lr_model.intercept_)
100 | print("稀疏化特征比率:%.2f%%" %(np.mean(lr_model.coef_.ravel()==0)*100))
101 | print("sigmoid函数转化的值,即:概率p")
102 | print(lr_model.predict_proba(X_test[0:5]))
103 |
104 | # In[43]:
105 | # 3.5 模型保存
106 | joblib.dump(lr_model,"logistic_lr.model")
107 | #模型加载
108 | load_lr = joblib.load("logistic_lr.model")
109 | print(load_lr.predict_proba(X_test[0:5]))
110 |
--------------------------------------------------------------------------------
/推荐系统算法实践—源码下载/第19章Notebook实践/Debug_Spark_LR.scala:
--------------------------------------------------------------------------------
1 | import org.apache.spark.ml.classification.{ BinaryLogisticRegressionSummary, LogisticRegression, LogisticRegressionModel }
2 | import org.apache.spark.ml.evaluation.{ MulticlassClassificationEvaluator, BinaryClassificationEvaluator }
3 | import org.apache.spark.ml.linalg.{ Vector, Vectors }
4 | import org.apache.spark.sql.types._
5 | import org.apache.spark.sql.functions._; import org.apache.spark.sql._
6 | import org.apache.spark.sql.SparkSession
7 | import org.apache.spark.ml.feature._
8 |
9 | /**
10 | * 读取libSVM格式的文件,生成训练样本和测试样本
11 | * 1)读取文件
12 | * 2)生成标签索引
13 | * 3)样本处理
14 | * 4)样本划分
15 | */
16 | // Body elided in this notebook; the signature below is reconstructed from the call site that follows.
17 | def readLibSvmSampleData(spark: SparkSession, dataPath: String): (DataFrame, DataFrame) = { ??? }
18 |
19 | //1 参数准备
20 | val dataPath = "hdfs://192.168.1.100:9000/Recommended_Algorithm_Action/data01/"
21 | val iter = 500
22 | val reg_param = 0.0
23 | val elastic_net_param = 0.0
24 |
25 | //2 训练样本准备
26 | val (training, test) = readLibSvmSampleData(spark, dataPath)
27 | training.cache()
28 | test.cache()
29 | println(s"training.count(): ${training.count()}")
30 | println(s"test.count(): ${test.count()}")
31 | training.show
32 |
33 | //3 建立逻辑回归模型
34 | val lr = new LogisticRegression().
35 | setMaxIter(iter).
36 | setRegParam(reg_param).
37 | setElasticNetParam(elastic_net_param)
38 |
39 | //4 根据训练样本进行模型训练
40 | val lrModel = lr.fit(training)
41 |
42 | //5 打印模型信息
43 | println(s"Coefficients Top 10: ${lrModel.coefficients.toArray.slice(0, 10).mkString(" ")}")
44 | println(s"Intercept: ${lrModel.intercept}")
45 |
46 | //6 对模型进行测试
47 | val test_predict = lrModel.transform(test)
48 | test_predict.show
49 | test_predict.select("features", "label", "probability", "prediction").take(5).foreach {
50 | case Row(features: Vector, label: Double, prob: Vector, prediction: Double) =>
51 | println(s"($features, $label) -> prob=$prob, prediction=$prediction")
52 | }
53 |
54 |
55 | //10 模型摘要
56 | val trainingSummary = lrModel.summary
57 |
58 | //11 每次迭代目标值
59 | val objectiveHistory = trainingSummary.objectiveHistory
60 | println("objectiveHistory:")
61 | objectiveHistory.foreach(loss => println(loss))
62 |
63 | //12 计算模型指标数据
64 | val binarySummary = trainingSummary.asInstanceOf[BinaryLogisticRegressionSummary]
65 |
66 | //13 模型摘要AUC指标
67 | val roc = binarySummary.roc
68 | roc.show()
69 | val AUC = binarySummary.areaUnderROC
70 | println(s"areaUnderROC: ${binarySummary.areaUnderROC}")
71 |
72 | //14 测试集AUC指标
73 | val evaluator = new BinaryClassificationEvaluator().
74 | setLabelCol("label").
75 | setRawPredictionCol("probability").
76 | setMetricName("areaUnderROC")
77 | val testAUC = evaluator.evaluate(test_predict)
78 | println("Test AUC = " + testAUC)
79 |
80 | //15 设置模型阈值
81 | // 不同的阈值,计算不同的F1,然后通过最大的F1找出并重设模型的最佳阈值
82 | val fMeasure = binarySummary.fMeasureByThreshold
83 | fMeasure.show
84 | // 获得最大的F1值
85 | val maxFMeasure = fMeasure.select(max("F-Measure")).head().getDouble(0)
86 | // 找出最大F1值对应的阈值(最佳阈值)
87 | val bestThreshold = fMeasure.where($"F-Measure" === maxFMeasure).select("threshold").head().getDouble(0)
88 | // 将模型的Threshold设置为选择出来的最佳分类阈值
89 | lrModel.setThreshold(bestThreshold)
90 |
91 |
92 | //16 模型保存与加载
93 | // 保存
94 | lrModel.save("hdfs://192.168.1.100:9000/mlv2/lrmodel")
95 | // 加载
96 | val load_lrModel = LogisticRegressionModel.load("hdfs://192.168.1.100:9000/mlv2/lrmodel")
97 | // 加载测试
98 | val load_predict = load_lrModel.transform(test)
99 | load_predict.select("features", "label", "probability", "prediction").take(5).foreach {
100 | case Row(features: Vector, label: Double, prob: Vector, prediction: Double) =>
101 | println(s"($features, $label) -> prob=$prob, prediction=$prediction")
102 | }
103 |
--------------------------------------------------------------------------------
/推荐系统算法实践—源码下载/第19章Notebook实践/Debug_TF_LR.py:
--------------------------------------------------------------------------------
1 |
2 | # coding: utf-8
3 |
4 | # ## 1)环境准备
5 |
6 | # In[1]:
7 | import numpy as np
8 | import tensorflow as tf
9 | import pandas as pd
10 | import random
11 | import math
12 | import re
13 |
14 | from sklearn import preprocessing
15 | from os import path, listdir
16 | from sklearn.datasets import load_svmlight_files
17 | from sklearn.model_selection import train_test_split
18 | from sklearn import metrics
19 | from tensorflow.contrib import layers
20 |
21 | from sklearn import metrics
22 |
23 | import time
24 | import datetime
25 |
26 | import os
27 | os.environ["CUDA_VISIBLE_DEVICES"]="0"
28 |
29 | print tf.__version__
30 | print tf.__path__
31 |
32 | # In[6]:
33 | """
34 | Parse one CSV-formatted sample line and return its labels and dense_vector.
35 | Example input line: 0.0,0.6666666666666666,0.5,0.0,0.0,0.0,0.0,0.7272727272727273,0.42857142857142855
36 | Arguments:
37 |     line: the string to parse
38 |     feature_size: the length of the feature vector
39 | Returns:
40 |     a dict of the form {'labels': labels, 'dense_vector': dense_vector}
41 |     labels: the sample's labels
42 |     dense_vector: the sample's feature vector
43 | """
44 |
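# The parser documented above is elided in this notebook. A minimal sketch under
# TF 1.x, assuming the first CSV column is the label and the remaining feature_size
# columns are the dense features; the book's original may differ in detail.
def decode_csv(line, feature_size):
    # one default per column: label + feature_size features, all float
    record_defaults = [[0.0]] * (feature_size + 1)
    columns = tf.decode_csv(line, record_defaults=record_defaults)
    labels = tf.reshape(columns[0], [-1])     # shape [1]
    dense_vector = tf.stack(columns[1:])      # shape [feature_size]
    return {'labels': labels, 'dense_vector': dense_vector}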
45 | # In[8]:
46 | # 测试数据
47 | filenames = '/data/all-csv'
48 | feature_size = 530
49 | batch_size = 3
50 | num_epochs = 1
51 | data_type = 'csv'
52 | next_element = process_data(data_type, filenames, feature_size, batch_size, num_epochs)
53 | print next_element['dense_vector']
54 | print next_element['labels']
55 |
56 | gpu_fraction = 0.2
57 | my_device='/gpu:0'
58 | init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer(), tf.tables_initializer())
59 | with tf.device(my_device):
60 | sess = get_session(gpu_fraction)
61 | sess.run(init_op)
62 | dense_vector, labels = sess.run([next_element['dense_vector'],next_element['labels']])
63 | print dense_vector
64 | print labels
65 |
66 | # In[7]:
67 | #基于逻辑回归的网络结构在TensorFlow中实现逻辑回归模型。其中“LR模型”代码省略,具体内容可以参考6.2.3节中的相关代码。
68 | class LR(object):
69 | """ 初始化成员变量 """
70 | def __init__(self, feature_size, loss_fuc, train_optimizer, learning_rate, reg_type, reg_param):
71 | # 特征向量长度
72 | self.feature_size = feature_size
73 | # 损失函数
74 | self.loss_fuc = loss_fuc
75 | # 优化方法
76 | self.train_optimizer = train_optimizer
77 | # 学习率
78 | self.learning_rate = learning_rate
79 | # 正则类型
80 | self.reg_type = reg_type
81 | # 正则因子
82 | self.reg_param = reg_param
83 |         # global_step
84 | self.global_step = tf.Variable(0, dtype=tf.int32, trainable=False, name='global_step')
85 |
86 | def train(self, batch_data):
87 | """ 1 定义输入数据 """
88 | with tf.name_scope('input_data'):
89 |
90 |
91 | # In[9]:
92 | # 数据准备
93 | filenames = '/data/csv-all'
94 | data_type='csv'
95 | feature_size = 530
96 | batch_size = 60000
97 | num_epochs = 200
98 | next_element = process_data(data_type, filenames, feature_size, batch_size, num_epochs)
99 |
100 | # 模型参数
101 | loss_fuc = 'Squared_error'
102 | train_optimizer = 'Adam'
103 | learning_rate = 0.01
104 | reg_type = 'l2_reg'
105 | reg_param = 0.0
106 | log_path='/data/log/Squared_error_lr_L2_0_20180816_01'
107 |
108 | # 开始训练
109 | bea_model = LR(feature_size, loss_fuc, train_optimizer, learning_rate, reg_type, reg_param)
110 | Y_Out, score, regularization, loss, auc, train_step, w_zero_ratio, w_avg, labels, score, summary_op = bea_model.train(next_element)
111 |
112 | init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer(), tf.tables_initializer())
113 | gpu_fraction = 0.4
114 | my_device='/gpu:0'
115 | with tf.device(my_device):
116 | sess = get_session(gpu_fraction)
117 | sess.run(init_op)
118 | batch_cnt = 0
119 | #选定可视化存储目录
120 | writer = tf.summary.FileWriter(log_path, sess.graph)
121 | try:
122 | while True:
123 | batch_cnt = batch_cnt + 1
124 | a, b, c, d, e, summary = sess.run([loss, auc, w_zero_ratio, w_avg, train_step, summary_op])
125 | if batch_cnt % 50 == 0 or batch_cnt <= 10:
126 | y, p = sess.run([labels, score])
127 | if y.sum() > 0.0:
128 | batch_auc=metrics.roc_auc_score(y, p)
129 | else:
130 | batch_auc=0.0
131 | print("batch: {} loss: {:.4f} accumulate_auc: {:.4f} batch_auc: {:.4f} w_zero_ratio: {:.4f} w_avg: {:.4f}".format(batch_cnt, a, b[0], batch_auc, c, d))
132 | writer.add_summary(summary, batch_cnt)
133 | except tf.errors.OutOfRangeError:
134 | print("3、Train end of dataset")
135 |
136 |
--------------------------------------------------------------------------------
/推荐系统算法实践—源码下载/第4章协同过滤/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WallaceLiu/recommendation_algorithm/36cdd077446ab7a2831c168f7419b058bd2fcbb0/推荐系统算法实践—源码下载/第4章协同过滤/.DS_Store
--------------------------------------------------------------------------------
/推荐系统算法实践—源码下载/第4章协同过滤/I2iTest.scala:
--------------------------------------------------------------------------------
1 | package book_code
2 |
3 | import scala.math._
4 | import org.apache.spark.sql.SparkSession
5 |
6 | import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
7 | import org.apache.spark.sql.Encoder
8 | import org.apache.spark.sql.types._
9 | import org.apache.spark.sql.functions._
10 | import org.apache.spark.sql._
11 | import scala.collection.mutable.WrappedArray
12 | import scala.collection.JavaConverters._
13 | import scala.collection.mutable.ArrayBuffer
14 |
15 | object I2iTest {
16 |
17 | def main(args: Array[String]): Unit = {
18 |
19 | val spark = SparkSession
20 | .builder
21 | .appName("I2iTest")
22 | .enableHiveSupport()
23 | .getOrCreate()
24 |
25 | import spark.implicits._
26 |
27 | /**
28 | * *********************************
29 | * 1 数据准备
30 | * 数据来源:
31 | * MovieLens 【数据地址:https://grouplens.org/datasets/movielens/】(1M、10M、20M 共三个数据集)
32 | * *********************************
33 | */
34 |
35 | // 1.1读取item配置表
36 | val item_conf_path = "hdfs://1.1.1.1:9000/I2I/movies.csv"
37 | val item_conf_df = spark.read.options(Map(("delimiter", ","), ("header", "true"))).csv(item_conf_path)
38 | val item_id2title_map = item_conf_df.select("movieId", "title").collect().map(row => (row(0).toString(), row(1).toString())).toMap
39 | val item_id2genres_map = item_conf_df.select("movieId", "genres").collect().map(row => (row(0).toString(), row(1).toString())).toMap
40 |
41 | // 1.2读取用户行为数据
42 | val user_rating_path = "hdfs://1.1.1.1:9000/I2I/ratings.csv"
43 | val user_rating_df = spark.read.options(Map(("delimiter", ","), ("header", "true"))).csv(user_rating_path)
44 |
45 | user_rating_df.dtypes
46 | val user_ds = user_rating_df.map {
47 | case Row(userId: String, movieId: String, rating: String, timestamp: String) =>
48 | ItemPref(userId, movieId, rating.toDouble)
49 | }
50 | println("user_ds.show(10)")
51 | user_ds.show(10)
52 | user_ds.cache()
53 | user_ds.count()
54 |
55 | /**
56 | * *********************************
57 | * 2 相似度计算
58 | * *********************************
59 | */
60 | val item_id2title_map_BC = spark.sparkContext.broadcast(item_id2title_map)
61 | val item_id2genres_map_BC = spark.sparkContext.broadcast(item_id2genres_map)
62 |
63 | // 2.1 同现相似度
64 | val items_similar_cooccurrence = ItemSimilarity.CooccurrenceSimilarity(user_ds).map {
65 | case ItemSimi(itemidI: String, itemidJ: String, similar: Double) =>
66 | val i_title = item_id2title_map_BC.value.getOrElse(itemidI, "")
67 | val j_title = item_id2title_map_BC.value.getOrElse(itemidJ, "")
68 | val i_genres = item_id2genres_map_BC.value.getOrElse(itemidI, "")
69 | val j_genres = item_id2genres_map_BC.value.getOrElse(itemidJ, "")
70 | (itemidI, itemidJ, similar, i_title, j_title, i_genres, j_genres)
71 | }.withColumnRenamed("_1", "itemidI").
72 | withColumnRenamed("_2", "itemidJ").
73 | withColumnRenamed("_3", "similar").
74 | withColumnRenamed("_4", "i_title").
75 | withColumnRenamed("_5", "j_title").
76 | withColumnRenamed("_6", "i_genres").
77 | withColumnRenamed("_7", "j_genres")
78 | items_similar_cooccurrence.columns
79 | // Print the results
80 | items_similar_cooccurrence.cache()
81 | items_similar_cooccurrence.count
82 | println("items_similar_cooccurrence.show(20)")
83 | items_similar_cooccurrence.
84 | orderBy($"itemidI".asc, $"similar".desc).
85 | select("i_title", "j_title", "i_genres", "j_genres", "similar").
86 | show(20)
87 |
88 | // 2.2 余弦相似度
89 | val items_similar_cosine = ItemSimilarity.CosineSimilarity(user_ds).map {
90 | case ItemSimi(itemidI: String, itemidJ: String, similar: Double) =>
91 | val i_title = item_id2title_map_BC.value.getOrElse(itemidI, "")
92 | val j_title = item_id2title_map_BC.value.getOrElse(itemidJ, "")
93 | val i_genres = item_id2genres_map_BC.value.getOrElse(itemidI, "")
94 | val j_genres = item_id2genres_map_BC.value.getOrElse(itemidJ, "")
95 | (itemidI, itemidJ, similar, i_title, j_title, i_genres, j_genres)
96 | }.withColumnRenamed("_1", "itemidI").
97 | withColumnRenamed("_2", "itemidJ").
98 | withColumnRenamed("_3", "similar").
99 | withColumnRenamed("_4", "i_title").
100 | withColumnRenamed("_5", "j_title").
101 | withColumnRenamed("_6", "i_genres").
102 | withColumnRenamed("_7", "j_genres")
103 | items_similar_cosine.columns
104 | // Print the results
105 | items_similar_cosine.cache()
106 | items_similar_cosine.count
107 | println("items_similar_cosine.show(20)")
108 | items_similar_cosine.
109 | orderBy($"itemidI".asc, $"similar".desc).
110 | select("i_title", "j_title", "i_genres", "j_genres", "similar").
111 | show(20)
112 |
113 | // 2.3 欧氏距离相似度
114 | val items_similar_euclidean = ItemSimilarity.EuclideanDistanceSimilarity(user_ds).map {
115 | case ItemSimi(itemidI: String, itemidJ: String, similar: Double) =>
116 | val i_title = item_id2title_map_BC.value.getOrElse(itemidI, "")
117 | val j_title = item_id2title_map_BC.value.getOrElse(itemidJ, "")
118 | val i_genres = item_id2genres_map_BC.value.getOrElse(itemidI, "")
119 | val j_genres = item_id2genres_map_BC.value.getOrElse(itemidJ, "")
120 | (itemidI, itemidJ, similar, i_title, j_title, i_genres, j_genres)
121 | }.withColumnRenamed("_1", "itemidI").
122 | withColumnRenamed("_2", "itemidJ").
123 | withColumnRenamed("_3", "similar").
124 | withColumnRenamed("_4", "i_title").
125 | withColumnRenamed("_5", "j_title").
126 | withColumnRenamed("_6", "i_genres").
127 | withColumnRenamed("_7", "j_genres")
128 | items_similar_euclidean.columns
129 | // Print the results
130 | items_similar_euclidean.cache()
131 | items_similar_euclidean.count
132 | println("items_similar_euclidean.show(20)")
133 | items_similar_euclidean.
134 | orderBy($"itemidI".asc, $"similar".desc).
135 | select("i_title", "j_title", "i_genres", "j_genres", "similar").
136 | show(20)
137 |
138 | /**
139 | * *********************************
140 | * 3 推荐计算
141 | * *********************************
142 | */
143 |
144 | // 推荐结果计算
145 | // 3.1 同现相似度推荐
146 | val cooccurrence = items_similar_cooccurrence.select("itemidI", "itemidJ", "similar").map {
147 | case Row(itemidI: String, itemidJ: String, similar: Double) =>
148 | ItemSimi(itemidI, itemidJ, similar)
149 | }
150 | val user_predictr_cooccurrence = ItemSimilarity.Recommend(cooccurrence, user_ds).map {
151 | case UserRecomm(userid: String, itemid: String, pref: Double) =>
152 | val title = item_id2title_map_BC.value.getOrElse(itemid, "")
153 | val genres = item_id2genres_map_BC.value.getOrElse(itemid, "")
154 | (userid, itemid, title, genres, pref)
155 | }.withColumnRenamed("_1", "userid").
156 | withColumnRenamed("_2", "itemid").
157 | withColumnRenamed("_3", "title").
158 | withColumnRenamed("_4", "genres").
159 | withColumnRenamed("_5", "pref")
160 | user_predictr_cooccurrence.columns
161 | user_predictr_cooccurrence.cache()
162 | user_predictr_cooccurrence.count()
163 | println("user_predictr_cooccurrence.show(20)")
164 | user_predictr_cooccurrence.orderBy($"userid".asc, $"pref".desc).show(20)
165 |
166 | // 3.2 余弦相似度推荐
167 | val cosine = items_similar_cosine.select("itemidI", "itemidJ", "similar").map {
168 | case Row(itemidI: String, itemidJ: String, similar: Double) =>
169 | ItemSimi(itemidI, itemidJ, similar)
170 | }
171 | val user_predictr_cosine = ItemSimilarity.Recommend(cosine, user_ds).map {
172 | case UserRecomm(userid: String, itemid: String, pref: Double) =>
173 | val title = item_id2title_map_BC.value.getOrElse(itemid, "")
174 | val genres = item_id2genres_map_BC.value.getOrElse(itemid, "")
175 | (userid, itemid, title, genres, pref)
176 | }.withColumnRenamed("_1", "userid").
177 | withColumnRenamed("_2", "itemid").
178 | withColumnRenamed("_3", "title").
179 | withColumnRenamed("_4", "genres").
180 | withColumnRenamed("_5", "pref")
181 | user_predictr_cosine.columns
182 | user_predictr_cosine.cache()
183 | user_predictr_cosine.count()
184 | println("user_predictr_cosine.show(20)")
185 | user_predictr_cosine.orderBy($"userid".asc, $"pref".desc).show(20)
186 |
187 | // 3.3 欧氏距离相似度推荐
188 | val euclidean = items_similar_euclidean.select("itemidI", "itemidJ", "similar").map {
189 | case Row(itemidI: String, itemidJ: String, similar: Double) =>
190 | ItemSimi(itemidI, itemidJ, similar)
191 | }
192 | val user_predictr_euclidean = ItemSimilarity.Recommend(euclidean, user_ds).map {
193 | case UserRecomm(userid: String, itemid: String, pref: Double) =>
194 | val title = item_id2title_map_BC.value.getOrElse(itemid, "")
195 | val genres = item_id2genres_map_BC.value.getOrElse(itemid, "")
196 | (userid, itemid, title, genres, pref)
197 | }.withColumnRenamed("_1", "userid").
198 | withColumnRenamed("_2", "itemid").
199 | withColumnRenamed("_3", "title").
200 | withColumnRenamed("_4", "genres").
201 | withColumnRenamed("_5", "pref")
202 | user_predictr_euclidean.columns
203 | user_predictr_euclidean.cache()
204 | user_predictr_euclidean.count()
205 | println("user_predictr_euclidean.show(20)")
206 | user_predictr_euclidean.orderBy($"userid".asc, $"itemid".desc).show(20)
207 |
208 | // 推荐结果保存
209 | val table_date = 20181025
210 | val recommend_table = "table_i2i_recommend_result"
211 | user_predictr_cooccurrence.createOrReplaceTempView("df_to_hive_table")
212 | val insertSql1 = s"insert overwrite table ${recommend_table} partition(ds=${table_date}) select userid, itemid, pref from df_to_hive_table"
213 | println(insertSql1)
214 | // spark.sql(insertSql1)
215 |
216 | }
217 |
218 | }
--------------------------------------------------------------------------------
/推荐系统算法实践—源码下载/第4章协同过滤/ItemSimilarity.scala:
--------------------------------------------------------------------------------
1 | package book_code
2 |
3 | import scala.math._
4 | import org.apache.spark.sql.SparkSession
5 | import org.apache.spark.sql.Dataset
6 | import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
7 | import org.apache.spark.sql.functions._
8 | import org.apache.spark.sql._
9 | import scala.collection.mutable.WrappedArray
10 | import scala.collection.JavaConverters._
11 | import scala.collection.mutable.ArrayBuffer
12 |
13 | object ItemSimilarity extends Serializable {
14 |
15 | import org.apache.spark.sql.functions._
16 |
17 | /**
18 | * 关联规则计算.
19 | * 支持度(Support):在所有项集中{X, Y}出现的可能性,即项集中同时含有X和Y的概率,P(X U Y)/P(I),I是总事务集
20 | * 置信度(Confidence):在先决条件X发生的条件下,关联结果Y发生的概率,P(X U Y)/P(X)
21 | * 提升度(lift):在含有X的条件下同时含有Y的可能性与没有X这个条件下项集中含有Y的可能性之比,confidence(X => Y)/P(Y)
22 | * @param user_rdd 用户评分
23 | * @param RDD[ItemAssociation] 返回物品相似度
24 | *
25 | */
26 | def AssociationRules(user_ds: Dataset[ItemPref]): Dataset[ItemAssociation] = {
27 | import user_ds.sparkSession.implicits._
28 | // 1 (用户:物品) => (用户:(物品集合))
29 | val user_ds1 = user_ds.groupBy("userid").agg(collect_set("itemid")).withColumnRenamed("collect_set(itemid)", "itemid_set")
30 |
31 | // 2 物品:物品,上三角数据
32 | val user_ds2 = user_ds1.flatMap { row =>
33 | val itemlist = row.getAs[WrappedArray[String]](1).toArray.sorted
34 | val result = new ArrayBuffer[(String, String, Double)]()
35 | for (i <- 0 to itemlist.length - 2) {
36 | for (j <- i + 1 to itemlist.length - 1) {
37 | result += ((itemlist(i), itemlist(j), 1.0))
38 | }
39 | }
40 | result
41 | }.withColumnRenamed("_1", "itemidI").withColumnRenamed("_2", "itemidJ").withColumnRenamed("_3", "score")
42 |
43 | // 3 计算物品与物品,上三角,同现频次
44 | val user_ds3 = user_ds2.groupBy("itemidI", "itemidJ").agg(sum("score").as("sumIJ"))
45 |
46 | //4 计算物品总共出现的频次
47 | val user_ds0 = user_ds.withColumn("score", lit(1)).groupBy("itemid").agg(sum("score").as("score"))
48 | val user_all = user_ds1.count
49 |
50 | //5 计算支持度(Support)
51 | val user_ds4 = user_ds3.select("itemidI", "itemidJ", "sumIJ").
52 | union(user_ds3.select($"itemidJ".as("itemidI"), $"itemidI".as("itemidJ"), $"sumIJ")).
53 | withColumn("support", $"sumIJ" / user_all.toDouble)
54 |
55 | // user_ds4.orderBy($"support".desc).show
56 |
57 | //6 置信度(Confidence)
58 | val user_ds5 = user_ds4.
59 | join(user_ds0.withColumnRenamed("itemid", "itemidI").withColumnRenamed("score", "sumI"), "itemidI").
60 | withColumn("confidence", $"sumIJ" / $"sumI")
61 |
62 | // user_ds5.orderBy($"confidence".desc).show
63 |
64 | //7 提升度(lift)
65 | val user_ds6 = user_ds5.
66 | join(user_ds0.withColumnRenamed("itemid", "itemidJ").withColumnRenamed("score", "sumJ"), "itemidJ").
67 | withColumn("lift", $"confidence" / ($"sumJ" / user_all.toDouble))
68 |
69 | // user_ds6.orderBy($"lift".desc).show
70 |
71 | // 计算同现相似度
72 | val user_ds8 = user_ds6.withColumn("similar", col("sumIJ") / sqrt(col("sumI") * col("sumJ")))
73 | // user_ds8.orderBy($"similar".desc).show
74 |
75 | // 8 结果返回
76 | val out = user_ds8.select("itemidI", "itemidJ", "support", "confidence", "lift", "similar").map { row =>
77 | val itemidI = row.getString(0)
78 | val itemidJ = row.getString(1)
79 | val support = row.getDouble(2)
80 | val confidence = row.getDouble(3)
81 | val lift = row.getDouble(4)
82 | val similar = row.getDouble(5)
83 | ItemAssociation(itemidI, itemidJ, support, confidence, lift, similar)
84 | }
85 | out
86 | }
87 |
88 | /**
89 | * 余弦相似度矩阵计算.
90 | * T(x,y) = ∑x(i)y(i) / sqrt(∑(x(i)*x(i))) * sqrt(∑(y(i)*y(i)))
91 | * @param user_rdd 用户评分
92 | * @param RDD[ItemSimi] 返回物品相似度
93 | *
94 | */
95 | def CosineSimilarity(user_ds: Dataset[ItemPref]): Dataset[ItemSimi] = {
96 | import user_ds.sparkSession.implicits._
97 |
98 | // 1 数据做准备
99 | val user_ds1 = user_ds.
100 | withColumn("iv", concat_ws(":", $"itemid", $"pref")).
101 | groupBy("userid").agg(collect_set("iv")).
102 | withColumnRenamed("collect_set(iv)", "itemid_set").
103 | select("userid", "itemid_set")
104 |
105 | // 2 物品:物品,上三角数据
106 | val user_ds2 = user_ds1.flatMap { row =>
107 | val itemlist = row.getAs[scala.collection.mutable.WrappedArray[String]](1).toArray.sorted
108 | val result = new ArrayBuffer[(String, String, Double, Double)]()
109 | for (i <- 0 to itemlist.length - 2) {
110 | for (j <- i + 1 to itemlist.length - 1) {
111 | result += ((itemlist(i).split(":")(0), itemlist(j).split(":")(0), itemlist(i).split(":")(1).toDouble, itemlist(j).split(":")(1).toDouble))
112 | }
113 | }
114 | result
115 | }.withColumnRenamed("_1", "itemidI").withColumnRenamed("_2", "itemidJ").withColumnRenamed("_3", "scoreI").withColumnRenamed("_4", "scoreJ")
116 |
117 | // 3 按照公式计算sim
118 | // x*y = ∑x(i)y(i)
119 | // |x|^2 = ∑(x(i)*x(i))
120 | // |y|^2 = ∑(y(i)*y(i))
121 | // result = x*y / sqrt(|x|^2) * sqrt(|y|^2)
122 | val user_ds3 = user_ds2.
123 | withColumn("cnt", lit(1)).
124 | groupBy("itemidI", "itemidJ").
125 | agg(sum(($"scoreI" * $"scoreJ")).as("sum_xy"),
126 | sum(($"scoreI" * $"scoreI")).as("sum_x"),
127 | sum(($"scoreJ" * $"scoreJ")).as("sum_y")).
128 | withColumn("result", $"sum_xy" / (sqrt($"sum_x") * sqrt($"sum_y")))
129 |
130 | // 4 上、下三角合并
131 | val user_ds8 = user_ds3.select("itemidI", "itemidJ", "result").
132 | union(user_ds3.select($"itemidJ".as("itemidI"), $"itemidI".as("itemidJ"), $"result"))
133 |
134 | // 5 结果返回
135 | val out = user_ds8.select("itemidI", "itemidJ", "result").map { row =>
136 | val itemidI = row.getString(0)
137 | val itemidJ = row.getString(1)
138 | val similar = row.getDouble(2)
139 | ItemSimi(itemidI, itemidJ, similar)
140 | }
141 | out
142 | }
143 |
144 | /**
145 | * 欧氏距离相似度矩阵计算.
146 | * d(x, y) = sqrt(∑((x(i)-y(i)) * (x(i)-y(i))))
147 | * sim(x, y) = n / (1 + d(x, y))
148 | * @param user_rdd 用户评分
149 | * @param RDD[ItemSimi] 返回物品相似度
150 | *
151 | */
152 | def EuclideanDistanceSimilarity(user_ds: Dataset[ItemPref]): Dataset[ItemSimi] = {
153 | import user_ds.sparkSession.implicits._
154 |
155 | // 1 数据做准备
156 | val user_ds1 = user_ds.
157 | withColumn("iv", concat_ws(":", $"itemid", $"pref")).
158 | groupBy("userid").agg(collect_set("iv")).
159 | withColumnRenamed("collect_set(iv)", "itemid_set").
160 | select("userid", "itemid_set")
161 |
162 | // 2 物品:物品,上三角数据
163 | val user_ds2 = user_ds1.flatMap { row =>
164 | val itemlist = row.getAs[scala.collection.mutable.WrappedArray[String]](1).toArray.sorted
165 | val result = new ArrayBuffer[(String, String, Double, Double)]()
166 | for (i <- 0 to itemlist.length - 2) {
167 | for (j <- i + 1 to itemlist.length - 1) {
168 | result += ((itemlist(i).split(":")(0), itemlist(j).split(":")(0), itemlist(i).split(":")(1).toDouble, itemlist(j).split(":")(1).toDouble))
169 | }
170 | }
171 | result
172 | }.withColumnRenamed("_1", "itemidI").withColumnRenamed("_2", "itemidJ").withColumnRenamed("_3", "scoreI").withColumnRenamed("_4", "scoreJ")
173 |
174 | // 3 按照公式计算sim
175 | // dist = sqrt(∑((x(i)-y(i)) * (x(i)-y(i))))
176 | // cntSum = sum(1)
177 | // result = cntSum / (1 + dist)
178 | val user_ds3 = user_ds2.
179 | withColumn("cnt", lit(1)).
180 | groupBy("itemidI", "itemidJ").
181 | agg(sqrt(sum(($"scoreI" - $"scoreJ") * ($"scoreI" - $"scoreJ"))).as("dist"), sum($"cnt").as("cntSum")).
182 | withColumn("result", $"cntSum" / (lit(1.0) + $"dist"))
183 |
184 | // 4 上、下三角合并
185 | val user_ds8 = user_ds3.select("itemidI", "itemidJ", "result").union(user_ds3.select($"itemidJ".as("itemidI"), $"itemidI".as("itemidJ"), $"result"))
186 |
187 | // 5 结果返回
188 | val out = user_ds8.select("itemidI", "itemidJ", "result").map { row =>
189 | val itemidI = row.getString(0)
190 | val itemidJ = row.getString(1)
191 | val similar = row.getDouble(2)
192 | ItemSimi(itemidI, itemidJ, similar)
193 | }
194 | out
195 | }
196 |
197 | /**
198 | * 同现相似度矩阵计算.
199 | * w(i,j) = N(i)∩N(j)/sqrt(N(i)*N(j))
200 | * @param user_rdd 用户评分
201 | * @param RDD[ItemSimi] 返回物品相似度
202 | *
203 | */
204 | def CooccurrenceSimilarity(user_ds: Dataset[ItemPref]): Dataset[ItemSimi] = {
205 | import user_ds.sparkSession.implicits._
206 |
207 | // 1 (用户:物品) => (用户:(物品集合))
208 | val user_ds1 = user_ds.groupBy("userid").agg(collect_set("itemid")).withColumnRenamed("collect_set(itemid)", "itemid_set")
209 |
210 | // 2 物品:物品,上三角数据
211 | val user_ds2 = user_ds1.flatMap { row =>
212 | val itemlist = row.getAs[scala.collection.mutable.WrappedArray[String]](1).toArray.sorted
213 | val result = new ArrayBuffer[(String, String, Double)]()
214 | for (i <- 0 to itemlist.length - 2) {
215 | for (j <- i + 1 to itemlist.length - 1) {
216 | result += ((itemlist(i), itemlist(j), 1.0))
217 | }
218 | }
219 | result
220 | }.withColumnRenamed("_1", "itemidI").withColumnRenamed("_2", "itemidJ").withColumnRenamed("_3", "score")
221 |
222 | // 3 计算物品与物品,上三角,同现频次
223 | val user_ds3 = user_ds2.groupBy("itemidI", "itemidJ").agg(sum("score").as("sumIJ"))
224 |
225 | // 4 计算物品总共出现的频次
226 | val user_ds0 = user_ds.withColumn("score", lit(1)).groupBy("itemid").agg(sum("score").as("score"))
227 |
228 | // 5 计算同现相似度
229 | val user_ds4 = user_ds3.join(user_ds0.withColumnRenamed("itemid", "itemidJ").withColumnRenamed("score", "sumJ").select("itemidJ", "sumJ"), "itemidJ")
230 |
231 | val user_ds5 = user_ds4.join(user_ds0.withColumnRenamed("itemid", "itemidI").withColumnRenamed("score", "sumI").select("itemidI", "sumI"), "itemidI")
232 |
233 | // 根据公式N(i)∩N(j)/sqrt(N(i)*N(j)) 计算
234 | val user_ds6 = user_ds5.withColumn("result", col("sumIJ") / sqrt(col("sumI") * col("sumJ")))
235 |
236 | // 6 上、下三角合并
237 | println(s"user_ds6.count(): ${user_ds6.count()}")
238 | val user_ds8 = user_ds6.select("itemidI", "itemidJ", "result").union(user_ds6.select($"itemidJ".as("itemidI"), $"itemidI".as("itemidJ"), $"result"))
239 | println(s"user_ds8.count(): ${user_ds8.count()}")
240 |
241 | // 7 结果返回
242 | val out = user_ds8.select("itemidI", "itemidJ", "result").map { row =>
243 | val itemidI = row.getString(0)
244 | val itemidJ = row.getString(1)
245 | val similar = row.getDouble(2)
246 | ItemSimi(itemidI, itemidJ, similar)
247 | }
248 | out
249 | }
250 |
251 | /**
252 | * 计算推荐结果.
253 | * @param items_similar 物品相似矩阵
254 | * @param user_prefer 用户评分表
255 | * @param RDD[UserRecomm] 返回用户推荐结果
256 | *
257 | */
258 | def Recommend(items_similar: Dataset[ItemSimi],
259 | user_prefer: Dataset[ItemPref]): Dataset[UserRecomm] = {
260 | import user_prefer.sparkSession.implicits._
261 |
262 | // 1 数据准备
263 | val items_similar_ds1 = items_similar
264 | val user_prefer_ds1 = user_prefer
265 | // 2 根据用户的item召回相似物品
266 | val user_prefer_ds2 = items_similar_ds1.join(user_prefer_ds1, $"itemidI" === $"itemid", "inner")
267 | // user_prefer_ds2.show()
268 | // 3 计算召回的用户物品得分
269 | val user_prefer_ds3 = user_prefer_ds2.withColumn("score", col("pref") * col("similar")).select("userid", "itemidJ", "score")
270 | // user_prefer_ds3.show()
271 | // 4 得分汇总
272 | val user_prefer_ds4 = user_prefer_ds3.groupBy("userid", "itemidJ").agg(sum("score").as("score")).withColumnRenamed("itemidJ", "itemid")
273 | // user_prefer_ds4.show()
274 | // 5 用户得分排序结果,去除用户已评分物品
275 | val user_prefer_ds5 = user_prefer_ds4.join(user_prefer_ds1, Seq("userid", "itemid"), "left").where("pref is null")
276 | // user_prefer_ds5.show()
277 | // 6 结果返回
278 | val out1 = user_prefer_ds5.select("userid", "itemid", "score").map { row =>
279 | val userid = row.getString(0)
280 | val itemid = row.getString(1)
281 | val pref = row.getDouble(2)
282 | UserRecomm(userid, itemid, pref)
283 | }
284 | // out1.orderBy($"userid", $"pref".desc).show
285 | out1
286 | }
287 |
288 | }
289 |
290 |
291 |
--------------------------------------------------------------------------------
/推荐系统算法实践—源码下载/第4章协同过滤/ml-latest-small/README.txt:
--------------------------------------------------------------------------------
1 | Summary
2 | =======
3 |
4 | This dataset (ml-latest-small) describes 5-star rating and free-text tagging activity from [MovieLens](http://movielens.org), a movie recommendation service. It contains 100836 ratings and 3683 tag applications across 9742 movies. These data were created by 610 users between March 29, 1996 and September 24, 2018. This dataset was generated on September 26, 2018.
5 |
6 | Users were selected at random for inclusion. All selected users had rated at least 20 movies. No demographic information is included. Each user is represented by an id, and no other information is provided.
7 |
8 | The data are contained in the files `links.csv`, `movies.csv`, `ratings.csv` and `tags.csv`. More details about the contents and use of all these files follows.
9 |
10 | This is a *development* dataset. As such, it may change over time and is not an appropriate dataset for shared research results. See available *benchmark* datasets if that is your intent.
11 |
12 | This and other GroupLens data sets are publicly available for download at .
13 |
14 |
15 | Usage License
16 | =============
17 |
18 | Neither the University of Minnesota nor any of the researchers involved can guarantee the correctness of the data, its suitability for any particular purpose, or the validity of results based on the use of the data set. The data set may be used for any research purposes under the following conditions:
19 |
20 | * The user may not state or imply any endorsement from the University of Minnesota or the GroupLens Research Group.
21 | * The user must acknowledge the use of the data set in publications resulting from the use of the data set (see below for citation information).
22 | * The user may redistribute the data set, including transformations, so long as it is distributed under these same license conditions.
23 | * The user may not use this information for any commercial or revenue-bearing purposes without first obtaining permission from a faculty member of the GroupLens Research Project at the University of Minnesota.
24 | * The executable software scripts are provided "as is" without warranty of any kind, either expressed or implied, including, but not limited to, the implied warranties of merchantability and fitness for a particular purpose. The entire risk as to the quality and performance of them is with you. Should the program prove defective, you assume the cost of all necessary servicing, repair or correction.
25 |
26 | In no event shall the University of Minnesota, its affiliates or employees be liable to you for any damages arising out of the use or inability to use these programs (including but not limited to loss of data or data being rendered inaccurate).
27 |
28 | If you have any further questions or comments, please email
29 |
30 |
31 | Citation
32 | ========
33 |
34 | To acknowledge use of the dataset in publications, please cite the following paper:
35 |
36 | > F. Maxwell Harper and Joseph A. Konstan. 2015. The MovieLens Datasets: History and Context. ACM Transactions on Interactive Intelligent Systems (TiiS) 5, 4: 19:1–19:19.
37 |
38 |
39 | Further Information About GroupLens
40 | ===================================
41 |
42 | GroupLens is a research group in the Department of Computer Science and Engineering at the University of Minnesota. Since its inception in 1992, GroupLens's research projects have explored a variety of fields including:
43 |
44 | * recommender systems
45 | * online communities
46 | * mobile and ubiquitious technologies
47 | * digital libraries
48 | * local geographic information systems
49 |
50 | GroupLens Research operates a movie recommender based on collaborative filtering, MovieLens, which is the source of these data. We encourage you to visit <http://movielens.org> to try it out! If you have exciting ideas for experimental work to conduct on MovieLens, send us an email at <grouplens-info@umn.edu> - we are always interested in working with external collaborators.
51 |
52 |
53 | Content and Use of Files
54 | ========================
55 |
56 | Formatting and Encoding
57 | -----------------------
58 |
59 | The dataset files are written as [comma-separated values](http://en.wikipedia.org/wiki/Comma-separated_values) files with a single header row. Columns that contain commas (`,`) are escaped using double-quotes (`"`). These files are encoded as UTF-8. If accented characters in movie titles or tag values (e.g. Misérables, Les (1995)) display incorrectly, make sure that any program reading the data, such as a text editor, terminal, or script, is configured for UTF-8.
60 |
61 |
62 | User Ids
63 | --------
64 |
65 | MovieLens users were selected at random for inclusion. Their ids have been anonymized. User ids are consistent between `ratings.csv` and `tags.csv` (i.e., the same id refers to the same user across the two files).
66 |
67 |
68 | Movie Ids
69 | ---------
70 |
71 | Only movies with at least one rating or tag are included in the dataset. These movie ids are consistent with those used on the MovieLens web site (e.g., id `1` corresponds to the URL <https://movielens.org/movies/1>). Movie ids are consistent between `ratings.csv`, `tags.csv`, `movies.csv`, and `links.csv` (i.e., the same id refers to the same movie across these four data files).
72 |
73 |
74 | Ratings Data File Structure (ratings.csv)
75 | -----------------------------------------
76 |
77 | All ratings are contained in the file `ratings.csv`. Each line of this file after the header row represents one rating of one movie by one user, and has the following format:
78 |
79 | userId,movieId,rating,timestamp
80 |
81 | The lines within this file are ordered first by userId, then, within user, by movieId.
82 |
83 | Ratings are made on a 5-star scale, with half-star increments (0.5 stars - 5.0 stars).
84 |
85 | Timestamps represent seconds since midnight Coordinated Universal Time (UTC) of January 1, 1970.
86 |
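
A quick way to sanity-check the ratings format described above is to load it with pandas; a minimal sketch (the relative path `ml-latest-small/` is an assumption about where the folder was unpacked):

```python
import pandas as pd

# Columns are userId,movieId,rating,timestamp as documented above.
ratings = pd.read_csv("ml-latest-small/ratings.csv")
# Convert the Unix timestamps (seconds since 1970-01-01 UTC) to datetimes.
ratings["datetime"] = pd.to_datetime(ratings["timestamp"], unit="s")
print(ratings.head())
print(ratings["rating"].describe())   # values fall on the 0.5-5.0 half-star scale
```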
87 |
88 | Tags Data File Structure (tags.csv)
89 | -----------------------------------
90 |
91 | All tags are contained in the file `tags.csv`. Each line of this file after the header row represents one tag applied to one movie by one user, and has the following format:
92 |
93 | userId,movieId,tag,timestamp
94 |
95 | The lines within this file are ordered first by userId, then, within user, by movieId.
96 |
97 | Tags are user-generated metadata about movies. Each tag is typically a single word or short phrase. The meaning, value, and purpose of a particular tag is determined by each user.
98 |
99 | Timestamps represent seconds since midnight Coordinated Universal Time (UTC) of January 1, 1970.
100 |
101 |
102 | Movies Data File Structure (movies.csv)
103 | ---------------------------------------
104 |
105 | Movie information is contained in the file `movies.csv`. Each line of this file after the header row represents one movie, and has the following format:
106 |
107 | movieId,title,genres
108 |
109 | Movie titles are entered manually or imported from <https://www.themoviedb.org/>, and include the year of release in parentheses. Errors and inconsistencies may exist in these titles.
110 |
111 | Genres are a pipe-separated list (a parsing sketch follows the list), and are selected from the following:
112 |
113 | * Action
114 | * Adventure
115 | * Animation
116 | * Children's
117 | * Comedy
118 | * Crime
119 | * Documentary
120 | * Drama
121 | * Fantasy
122 | * Film-Noir
123 | * Horror
124 | * Musical
125 | * Mystery
126 | * Romance
127 | * Sci-Fi
128 | * Thriller
129 | * War
130 | * Western
131 | * (no genres listed)
132 |
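
Since the genre field packs several values into one pipe-separated string, it usually needs a parsing step before modeling; a small pandas sketch (same assumed local path as above):

```python
import pandas as pd

movies = pd.read_csv("ml-latest-small/movies.csv")   # movieId,title,genres
# Expand "Adventure|Animation|Children" style strings into one indicator column per genre.
genre_dummies = movies["genres"].str.get_dummies(sep="|")
movies = pd.concat([movies, genre_dummies], axis=1)
print(genre_dummies.sum().sort_values(ascending=False).head())   # most frequent genres
```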
133 |
134 | Links Data File Structure (links.csv)
135 | ---------------------------------------
136 |
137 | Identifiers that can be used to link to other sources of movie data are contained in the file `links.csv`. Each line of this file after the header row represents one movie, and has the following format:
138 |
139 | movieId,imdbId,tmdbId
140 |
141 | movieId is an identifier for movies used by <https://movielens.org>. E.g., the movie Toy Story has the link <https://movielens.org/movies/1>.
142 |
143 | imdbId is an identifier for movies used by <http://www.imdb.com>. E.g., the movie Toy Story has the link <http://www.imdb.com/title/tt0114709/>.
144 |
145 | tmdbId is an identifier for movies used by <https://www.themoviedb.org>. E.g., the movie Toy Story has the link <https://www.themoviedb.org/movie/862>.
146 |
147 | Use of the resources listed above is subject to the terms of each provider.
148 |
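
A short pandas sketch of how `links.csv` ties the three id spaces together (assumptions: the same local path as above, and that `imdbId` is read as a zero-padded string so the IMDb URL pattern shown above can be rebuilt):

```python
import pandas as pd

movies = pd.read_csv("ml-latest-small/movies.csv")
links = pd.read_csv("ml-latest-small/links.csv", dtype={"imdbId": str})   # keep leading zeros
merged = movies.merge(links, on="movieId")
# Rebuild IMDb URLs of the form http://www.imdb.com/title/tt0114709/
merged["imdb_url"] = "http://www.imdb.com/title/tt" + merged["imdbId"] + "/"
print(merged[["movieId", "title", "imdbId", "tmdbId", "imdb_url"]].head())
```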
149 |
150 | Cross-Validation
151 | ----------------
152 |
153 | Prior versions of the MovieLens dataset included either pre-computed cross-folds or scripts to perform this computation. We no longer bundle either of these features with the dataset, since most modern toolkits provide this as a built-in feature. If you wish to learn about standard approaches to cross-fold computation in the context of recommender systems evaluation, see [LensKit](http://lenskit.org) for tools, documentation, and open-source code examples.
154 |
--------------------------------------------------------------------------------
/推荐系统算法实践—源码下载/第4章协同过滤/ml-latest-small/movies.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WallaceLiu/recommendation_algorithm/36cdd077446ab7a2831c168f7419b058bd2fcbb0/推荐系统算法实践—源码下载/第4章协同过滤/ml-latest-small/movies.csv
--------------------------------------------------------------------------------
/推荐系统算法实践—源码下载/第5章Word2vec/Word2vec.py:
--------------------------------------------------------------------------------
1 |
2 | # coding: utf-8
3 |
4 | # ## 1)环境准备
5 |
6 | # In[1]:
7 |
8 | import tensorflow as tf
9 | import numpy as np
10 | import pandas as pd
11 | import random
12 | import math
13 | import re
14 | from os import path, listdir
15 | import os
16 | os.environ["CUDA_VISIBLE_DEVICES"]="0"
17 |
18 | print tf.__version__
19 | print tf.__path__
20 |
21 |
22 | # ## 2)数据准备Dataset格式
23 |
24 | # In[2]:
25 |
26 | # 每一行解析,解析标签csv格式
27 | # 5805 17357
28 | # 数据处理
29 | def process_data(my_path, batch_size=32, num_epochs=1):
30 | filenames = get_file_list(my_path)
31 | next_element = read_my_file_format(filenames, batch_size, num_epochs)
32 | return next_element
33 | # 创建session,指定GPU或者CPU使用率
34 | def get_session(gpu_fraction=0.1):
35 | gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=gpu_fraction,
36 | allow_growth=True)
37 | return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
38 |
39 |
40 | # ## 3)Skip-gram模型
41 |
42 | # In[3]:
43 |
44 | class SkipGram(object):
45 | """ 初始化成员变量 """
46 | def __init__(self, vocab_size, embed_size, num_sampled, train_optimizer, learning_rate):
47 | # 字典长度
48 | self.vocab_size = vocab_size
49 | # 词向量长度
50 | self.embed_size = embed_size
51 | # 负采样数量
52 | self.num_sampled = num_sampled
53 | # 优化方法
54 | self.train_optimizer = train_optimizer
55 | # 学习率
56 | self.learning_rate = learning_rate
57 |         # global_step
58 | self.global_step = tf.Variable(0, dtype=tf.int32, trainable=False, name='global_step')
59 |
60 | def train(self, batch_data):
61 | """ 1 定义输入数据 """
62 | with tf.name_scope('input_data'):
63 | # center_words
64 | center_words = tf.reshape(batch_data['center_words'], shape=[-1])
65 | # target_words
66 | target_words = tf.reshape(batch_data['target_words'], shape=[-1,1])
67 | print("%s: %s" % ("center_words", center_words))
68 | print("%s: %s" % ("target_words", target_words))
69 |
70 | """ 2 定义网络输出 """
71 | with tf.name_scope("Comput_Score"):
72 | # 词向量矩阵
73 | with tf.variable_scope("embed", reuse=tf.AUTO_REUSE):
74 | self.embedding_dict = tf.get_variable(name='embed', shape=[self.vocab_size, self.embed_size], initializer=tf.glorot_uniform_initializer())
75 | print("%s: %s" % ("embedding_dict", self.embedding_dict))
76 |
77 | # 模型内部参数矩阵
78 | with tf.variable_scope("nce", reuse=tf.AUTO_REUSE):
79 | self.nce_weight = tf.get_variable(name='nce_weight', shape=[self.vocab_size, self.embed_size], initializer=tf.glorot_normal_initializer())
80 | self.nce_biases = tf.get_variable(name='nce_biases', shape=[1], initializer=tf.constant_initializer(0.0))
81 | print("%s: %s" % ("nce_weight", self.nce_weight))
82 | print("%s: %s" % ("nce_biases", self.nce_biases))
83 |
84 | # 将输入序列向量化
85 | # 其实就是一个简单的查表
86 | embed = tf.nn.embedding_lookup(self.embedding_dict, center_words, name='embed')
87 | print("%s: %s" % ("embed", embed))
88 |
89 | # 得到NCE损失(负采样得到的损失)
90 | loss = tf.reduce_mean(
91 | tf.nn.nce_loss(
92 | weights = self.nce_weight, # 权重
93 | biases = self.nce_biases, # 偏差
94 | labels = target_words, # 输入的标签
95 | inputs = embed, # 输入向量
96 | num_sampled = self.num_sampled, # 负采样的个数
97 | num_classes = self.vocab_size # 字典数目
98 | )
99 | )
100 | print("%s: %s" % ("loss", loss))
101 |
102 | """ 3 设定optimizer """
103 | with tf.name_scope("optimizer"):
104 | with tf.variable_scope("optimizer", reuse=tf.AUTO_REUSE):
105 |                 #------build optimizer------
106 |                 if self.train_optimizer == 'Adam':
107 |                     optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-8)
108 |                 elif self.train_optimizer == 'Adagrad':
109 |                     optimizer = tf.train.AdagradOptimizer(learning_rate=self.learning_rate, initial_accumulator_value=1e-8)
110 | train_step = optimizer.minimize(loss, global_step=self.global_step)
111 |
112 | """4 设定summary,以便在Tensorboard里进行可视化 """
113 | with tf.name_scope("summaries"):
114 | tf.summary.scalar("loss", loss)
115 | tf.summary.histogram("embedding_dict", self.embedding_dict)
116 | # 好几个summary,所以这里要merge_all
117 | summary_op = tf.summary.merge_all()
118 |
119 | """5 返回结果 """
120 | return train_step, loss, summary_op
121 |
122 |
123 | # ## 4)模型训练测试
124 |
125 | # In[4]:
126 |
127 | # 测试数据
128 | filenames = "/data/windows_skip_sample.csv"
129 | batch_size = 100000
130 | num_epochs = 200
131 | next_element = process_data(filenames, batch_size, num_epochs)
132 |
133 | # 模型参数
134 | vocab_size = 6834
135 | embed_size = 30
136 | num_sampled = 50
137 | train_optimizer = 'Adam'
138 | learning_rate = 0.01
139 | log_path='/data/log/20180915'
140 |
141 | # 开始训练
142 | bea_model = SkipGram(vocab_size, embed_size, num_sampled, train_optimizer, learning_rate)
143 | train_step, loss, summary_op = bea_model.train(next_element)
144 |
145 | init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer(), tf.tables_initializer())
146 | gpu_fraction = 0.4
147 | my_device='/gpu:0'
148 | with tf.device(my_device):
149 | sess = get_session(gpu_fraction)
150 | sess.run(init_op)
151 | batch_cnt = 0
152 | #选定可视化存储目录
153 | writer = tf.summary.FileWriter(log_path, sess.graph)
154 | try:
155 | while True:
156 | batch_cnt = batch_cnt + 1
157 | a, b, summary = sess.run([train_step, loss, summary_op])
158 | if batch_cnt % 1000 == 0 or batch_cnt <= 10:
159 | print("batch: {} loss: {:.4f}".format(batch_cnt, b))
160 | writer.add_summary(summary, batch_cnt)
161 | except tf.errors.OutOfRangeError:
162 | print("Train end of dataset")
163 |
164 |
--------------------------------------------------------------------------------
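`Word2vec.py` above calls `get_file_list` and `read_my_file_format` without showing them. A minimal sketch of what such helpers might look like, assuming each input line holds a space-separated `center_word target_word` id pair as in the `5805 17357` sample comment (the real helpers may differ):

```python
import tensorflow as tf
from os import path, listdir

def get_file_list(my_path):
    # Return all regular files under a directory, or the path itself if it is a single file.
    if path.isdir(my_path):
        return [path.join(my_path, f) for f in listdir(my_path)
                if path.isfile(path.join(my_path, f))]
    return [my_path]

def read_my_file_format(filenames, batch_size=32, num_epochs=1):
    # Parse "center_word target_word" integer ids into a batched Dataset iterator.
    def parse_line(line):
        ids = tf.string_to_number(tf.string_split([line], ' ').values, out_type=tf.int32)
        ids = tf.cast(ids, tf.int64)   # tf.nn.nce_loss expects int64 labels
        return {'center_words': ids[0], 'target_words': ids[1]}

    dataset = (tf.data.TextLineDataset(filenames)
               .map(parse_line)
               .repeat(num_epochs)
               .batch(batch_size))
    return dataset.make_one_shot_iterator().get_next()
```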
/推荐系统算法实践—源码下载/第5章Word2vec/Word2vec.scala:
--------------------------------------------------------------------------------
1 | package book_code
2 |
3 | import org.apache.spark.sql.{ SparkSession, _ }
4 | import org.apache.spark.sql._
5 | import org.apache.spark.sql.functions._
6 | import org.apache.spark.mllib.feature.Word2Vec
7 | import java.util.Date
8 | import java.text.SimpleDateFormat
9 |
10 | object Word2vec {
11 |
12 | /**
13 | * word2vec实现:
14 | *
15 | * 1)读取训练样本
16 | * 2)w2v模型训练
17 | * 3)提取词向量,并且计算相似词
18 | *
19 | * @author sunbow
20 | */
21 |
22 | def main(args: Array[String]): Unit = {
23 |
24 | /**
25 | * #############################################################
26 | *
27 | * Step 1:初始化
28 | *
29 | * ##############################################################
30 | */
31 |
32 | val spark = SparkSession
33 | .builder
34 | .appName("Word2vec")
35 | .config("spark.hadoop.validateOutputSpecs", "false")
36 | .enableHiveSupport()
37 | .getOrCreate()
38 |
39 | import spark.implicits._
40 | val data_path = args(0)
41 | val conf_path = args(1)
42 | val defaultFS = args(2)
43 | val NumIterations = args(3).toInt
44 | val MaxSentenceLength = args(4).toInt
45 | val MinCount = args(5).toInt
46 | val VectorSize = args(6).toInt
47 | val WindowSize = args(7).toInt
48 | val simil_size = args(8).toInt
49 |
50 | /**
51 | * #############################################################
52 | *
53 | * Step 2:数据准备
54 | *
55 | * ##############################################################
56 | */
57 | // 2.1读取item配置表
58 | val id_conf_df = spark.read.options(Map(("delimiter", "|"), ("header", "false"))).csv(conf_path)
59 | val id2title_map = id_conf_df.collect().map(row => (row(0).toString(), row(1).toString())).toMap
60 |
61 | // 2.2读取样本数据
62 | val sequence_sample = spark.read.text(data_path).map {
63 | case Row(id_list: String) =>
64 | val seq = id_list.split(" ").toSeq
65 | seq
66 | }
67 | sequence_sample.repartition(500).cache()
68 | sequence_sample.count()
69 | println("sequence_sample.show()")
70 | sequence_sample.show()
71 |
72 | /**
73 | * #############################################################
74 | *
75 | * Step 3:Word2Vec
76 | *
77 | * ##############################################################
78 | */
79 | // 训练模型
80 | val word2Vec = new Word2Vec().
81 | setNumIterations(NumIterations).
82 | setMaxSentenceLength(MaxSentenceLength).
83 | setMinCount(MinCount).
84 | setVectorSize(VectorSize).
85 | setWindowSize(WindowSize)
86 | val model = word2Vec.fit(sequence_sample.rdd)
87 |
88 | // 模型保存
89 | val now = new Date()
90 | val dateFormat1 = new SimpleDateFormat("yyyyMMddHHmmss")
91 | val time_stamp = dateFormat1.format(now)
92 | val model_path = s"${defaultFS}/Word2vec/model/${time_stamp}"
93 | println(model_path)
94 | model.save(spark.sparkContext, model_path)
95 |
96 | /**
97 | * #############################################################
98 | *
99 | * Step 4:词向量结果保存
100 | *
101 | * ##############################################################
102 | */
103 | val modelBC = spark.sparkContext.broadcast(model)
104 | val id2title_map_BC = spark.sparkContext.broadcast(id2title_map)
105 | // 词,向量,相似词
106 | val word2vector_rdd = spark.sparkContext.parallelize(model.getVectors.toSeq).map {
107 | case (word: String, vec: Array[Float]) =>
108 | // 根据word查找相似word
109 | val simil_word = modelBC.value.findSynonyms(word, simil_size)
110 | val simil_word_str = simil_word.map(f => s"${f._1}:${f._2.formatted("%.4f")}").mkString(",")
111 | val title = id2title_map_BC.value.getOrElse(word, "")
112 | val simil_title = simil_word.map(f => id2title_map_BC.value.getOrElse(f._1, "")).mkString(",")
113 | // 向量
114 | val vec_str = vec.mkString(",")
115 |             (word, vec_str, simil_word_str, title, simil_title)
116 | }
117 |
118 | println("word2vector_rdd.toDF().show(30)")
119 | word2vector_rdd.toDF().withColumnRenamed("_4", "word").withColumnRenamed("_5", "simil_word").select("word", "simil_word").show(20)
120 |
121 | // 结果保存
122 | val save_path = s"${defaultFS}/Word2vec/model_result/${time_stamp}"
123 | word2vector_rdd.map(f => s"${f._1}|${f._2}|${f._3}|${f._4}|${f._5}").saveAsTextFile(save_path)
124 |
125 | }
126 |
127 | }
--------------------------------------------------------------------------------
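The model saved by `Word2vec.scala` above can be reloaded later for ad-hoc similarity queries with the same MLlib API; a brief sketch (the HDFS path and the item id are placeholders, and `spark` is the SparkSession from the listing):

```scala
import org.apache.spark.mllib.feature.Word2VecModel

// Reload a model written by model.save(...) above and query its nearest neighbours.
val loadedModel = Word2VecModel.load(spark.sparkContext, "hdfs://.../Word2vec/model/20180915000000")
val neighbours: Array[(String, Double)] = loadedModel.findSynonyms("5805", 10)
neighbours.foreach { case (itemId, cosine) =>
  println(s"$itemId -> ${cosine.formatted("%.4f")}")
}
```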
/推荐系统算法实践—源码下载/第6章逻辑回归/LR.py:
--------------------------------------------------------------------------------
1 |
2 | # coding: utf-8
3 |
4 | # ## 1)环境准备
5 |
6 | # In[1]:
7 |
8 | import numpy as np
9 | import tensorflow as tf
10 | import pandas as pd
11 | import random
12 | import math
13 | import re
14 |
15 | from sklearn import preprocessing
16 | from os import path, listdir
17 | from sklearn.datasets import load_svmlight_files
18 | from sklearn.model_selection import train_test_split
19 | from sklearn import metrics
20 | from tensorflow.contrib import layers
21 |
22 | from sklearn import metrics
23 |
24 | import time
25 | import datetime
26 |
27 | import os
28 | os.environ["CUDA_VISIBLE_DEVICES"]="0"
29 |
30 | print tf.__version__
31 | print tf.__path__
32 |
33 |
34 | # ## 2)数据准备Dataset格式
35 |
36 | # In[6]:
37 |
38 | """
39 | 解析CSV格式,对输入的每一行样本,进行格式解析,返回labels和dense_vector格式数据
40 | 例如输入csv格式字符串: 0.0,0.6666666666666666,0.5,0.0,0.0,0.0,0.0,0.7272727272727273,0.42857142857142855
41 | """
42 | # 数据处理
43 | def process_data(data_type, my_path, feature_size, batch_size=32, num_epochs=1):
44 | filenames = get_file_list(my_path)
45 | next_element = read_my_file_format(data_type, filenames, feature_size, batch_size, num_epochs)
46 | return next_element
47 |
48 | # 创建session,指定GPU或者CPU使用率
49 | def get_session(gpu_fraction=0.1):
50 | gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=gpu_fraction,
51 | allow_growth=True)
52 | return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
53 |
54 |
55 | # In[8]:
56 |
57 | # 测试数据
58 | filenames = '/data/all-csv'
59 | feature_size = 530
60 | batch_size = 3
61 | num_epochs = 1
62 | data_type = 'csv'
63 | next_element = process_data(data_type, filenames, feature_size, batch_size, num_epochs)
64 | print next_element['dense_vector']
65 | print next_element['labels']
66 |
67 | gpu_fraction = 0.2
68 | my_device='/gpu:0'
69 | init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer(), tf.tables_initializer())
70 | with tf.device(my_device):
71 | sess = get_session(gpu_fraction)
72 | sess.run(init_op)
73 | dense_vector, labels = sess.run([next_element['dense_vector'],next_element['labels']])
74 | print dense_vector
75 | print labels
76 |
77 |
78 | # ## 3)LR模型
79 |
80 | # In[7]:
81 |
82 | class LR(object):
83 | """ 初始化成员变量 """
84 | def __init__(self, feature_size, loss_fuc, train_optimizer, learning_rate, reg_type, reg_param):
85 | # 特征向量长度
86 | self.feature_size = feature_size
87 | # 损失函数
88 | self.loss_fuc = loss_fuc
89 | # 优化方法
90 | self.train_optimizer = train_optimizer
91 | # 学习率
92 | self.learning_rate = learning_rate
93 | # 正则类型
94 | self.reg_type = reg_type
95 | # 正则因子
96 | self.reg_param = reg_param
97 |         # global_step
98 | self.global_step = tf.Variable(0, dtype=tf.int32, trainable=False, name='global_step')
99 |
100 | def train(self, batch_data):
101 | """ 1 定义输入数据 """
102 | with tf.name_scope('input_data'):
103 | # 标签:[batch_size, 1]
104 | labels = batch_data['labels']
105 | # 用户特征向量:[batch_size, feature_size]
106 |             dense_vector = tf.reshape(batch_data['dense_vector'], shape=[-1, self.feature_size, 1]) # None * feature_size * 1
107 | print("%s: %s" % ("dense_vector", dense_vector))
108 | print("%s: %s" % ("labels", labels))
109 |
110 | """ 2 定义网络输出 """
111 | with tf.name_scope("LR_Comput_Score"):
112 | # LR参数,生成或者获取w b
113 | with tf.variable_scope("lr_layer", reuse=tf.AUTO_REUSE):
114 | self.w = tf.get_variable(name='w', shape=[self.feature_size, 1], initializer=tf.glorot_normal_initializer())
115 | self.b = tf.get_variable(name='bias', shape=[1], initializer=tf.constant_initializer(0.0))
116 | print("%s: %s" % ("w", self.w))
117 | print("%s: %s" % ("b", self.b))
118 |
119 | # ---------- w * x + b----------
120 | Y_first = tf.reduce_sum(tf.multiply(self.w, dense_vector), 2) # None * F
121 | print("%s: %s" % ("Y_first", Y_first))
122 | # ---------- sum(w * x) + b----------
123 | Y_Out = tf.reduce_sum(Y_first, 1)
124 | Y_bias = self.b * tf.ones_like(Y_Out, dtype=tf.float32) # None * 1
125 | print("%s: %s" % ("Y_bias", Y_bias))
126 | Y_Out = tf.add(Y_Out, Y_bias, name='Y_Out')
127 | print("%s: %s" % ("Y_Out", Y_Out))
128 | # ---------- score ----------
129 | score=tf.nn.sigmoid(Y_Out,name='score')
130 | score=tf.reshape(score, shape=[-1, 1])
131 | print("%s: %s" % ("score", score))
132 |
133 | """ 3 定义损失函数和AUC指标 """
134 | with tf.name_scope("loss"):
135 |             # loss: Squared_error, Cross_entropy, FTLR
136 |             if self.reg_type == 'l1_reg':
137 |                 regularization = self.reg_param * tf.reduce_sum(tf.abs(self.w))
138 |                 # tf.contrib.layers.l1_regularizer(self.reg_param)(self.w)
139 |             elif self.reg_type == 'l2_reg':
140 |                 regularization = self.reg_param * tf.nn.l2_loss(self.w)
141 |             else:
142 |                 regularization = self.reg_param * tf.nn.l2_loss(self.w)
143 | 
144 |             if self.loss_fuc == 'Squared_error':
145 |                 loss = tf.reduce_mean(tf.reduce_sum(tf.square(labels - score), reduction_indices=[1])) + regularization
146 |             elif self.loss_fuc == 'Cross_entropy':
147 |                 loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=tf.reshape(Y_Out, [-1]), labels=tf.reshape(labels, [-1]))) + regularization
148 |             elif self.loss_fuc == 'FTLR':
149 | loss = tf.reduce_mean(tf.reduce_sum(tf.square(labels - score), reduction_indices=[1])) + regularization
150 | # AUC
151 | auc = tf.metrics.auc(labels, score)
152 | print("%s: %s" % ("labels", labels))
153 | # w为0的比例,w的平均值
154 | w_zero_ratio = tf.reduce_mean(tf.to_float(tf.abs(self.w) <= 1.0e-5))
155 | w_avg = tf.reduce_mean(self.w)
156 |
157 | """ 4 设定optimizer """
158 | with tf.name_scope("optimizer"):
159 | with tf.variable_scope("optimizer", reuse=tf.AUTO_REUSE):
160 |                 #------build optimizer------
161 |                 if self.train_optimizer == 'Adam':
162 |                     optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-8)
163 |                 elif self.train_optimizer == 'Adagrad':
164 |                     optimizer = tf.train.AdagradOptimizer(learning_rate=self.learning_rate, initial_accumulator_value=1e-8)
165 |                 elif self.train_optimizer == 'Momentum':
166 |                     optimizer = tf.train.MomentumOptimizer(learning_rate=self.learning_rate, momentum=0.95)
167 |                 elif self.train_optimizer == 'ftrl':
168 |                     optimizer = tf.train.FtrlOptimizer(self.learning_rate)
169 | train_step = optimizer.minimize(loss, global_step=self.global_step)
170 |
171 | """5 设定summary,以便在Tensorboard里进行可视化 """
172 | with tf.name_scope("summaries"):
173 | tf.summary.scalar("loss", loss)
174 | tf.summary.scalar("accumulate_auc", auc[0])
175 | tf.summary.scalar("w_avg", w_avg)
176 | tf.summary.scalar("w_zero_ratio", w_zero_ratio)
177 | tf.summary.histogram("w", self.w)
178 | # 好几个summary,所以这里要merge_all
179 | summary_op = tf.summary.merge_all()
180 |
181 | """6 返回结果 """
182 | return Y_Out, score, regularization, loss, auc, train_step, w_zero_ratio, w_avg, labels, score, summary_op
183 |
184 |
185 | # ## 4)模型训练测试
186 |
187 | # In[9]:
188 |
189 | # 数据准备
190 | filenames = '/data/csv-all'
191 | data_type='csv'
192 | feature_size = 530
193 | batch_size = 60000
194 | num_epochs = 200
195 | next_element = process_data(data_type, filenames, feature_size, batch_size, num_epochs)
196 |
197 | # 模型参数
198 | loss_fuc = 'Squared_error'
199 | train_optimizer = 'Adam'
200 | learning_rate = 0.01
201 | reg_type = 'l2_reg'
202 | reg_param = 0.0
203 | log_path='/data/log/Squared_error_lr_L2_0_20180816_01'
204 |
205 | # 开始训练
206 | bea_model = LR(feature_size, loss_fuc, train_optimizer, learning_rate, reg_type, reg_param)
207 | Y_Out, score, regularization, loss, auc, train_step, w_zero_ratio, w_avg, labels, score, summary_op = bea_model.train(next_element)
208 |
209 | init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer(), tf.tables_initializer())
210 | gpu_fraction = 0.4
211 | my_device='/gpu:1'
212 | with tf.device(my_device):
213 | sess = get_session(gpu_fraction)
214 | sess.run(init_op)
215 | batch_cnt = 0
216 | #选定可视化存储目录
217 | writer = tf.summary.FileWriter(log_path, sess.graph)
218 | try:
219 | while True:
220 | batch_cnt = batch_cnt + 1
221 | a, b, c, d, e, summary = sess.run([loss, auc, w_zero_ratio, w_avg, train_step, summary_op])
222 | if batch_cnt % 50 == 0 or batch_cnt <= 10:
223 | y, p = sess.run([labels, score])
224 | if y.sum() > 0.0:
225 | batch_auc=metrics.roc_auc_score(y, p)
226 | else:
227 | batch_auc=0.0
228 | print("batch: {} loss: {:.4f} accumulate_auc: {:.4f} batch_auc: {:.4f} w_zero_ratio: {:.4f} w_avg: {:.4f}".format(batch_cnt, a, b[0], batch_auc, c, d))
229 | writer.add_summary(summary, batch_cnt)
230 | except tf.errors.OutOfRangeError:
231 | print("3、Train end of dataset")
232 |
--------------------------------------------------------------------------------
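As in the other TensorFlow listings, `get_file_list` and `read_my_file_format` are not shown in `LR.py`. A minimal sketch for the `csv` case, assuming every line is `label,f1,...,f530` with purely numeric fields as in the docstring example (the real helpers may differ):

```python
import tensorflow as tf
from os import path, listdir

def get_file_list(my_path):
    # Treat every regular file under the directory as one CSV shard.
    return [path.join(my_path, f) for f in listdir(my_path)
            if path.isfile(path.join(my_path, f))]

def read_my_file_format(data_type, filenames, feature_size, batch_size=32, num_epochs=1):
    # Only the 'csv' branch is sketched here; data_type is kept for signature compatibility.
    def parse_csv(line):
        defaults = [[0.0]] * (feature_size + 1)          # label plus feature_size floats
        fields = tf.decode_csv(line, record_defaults=defaults)
        labels = tf.reshape(fields[0], [1])              # -> [batch_size, 1] after batching
        dense_vector = tf.stack(fields[1:])              # -> [batch_size, feature_size] after batching
        return {'labels': labels, 'dense_vector': dense_vector}

    dataset = (tf.data.TextLineDataset(filenames)
               .map(parse_csv)
               .repeat(num_epochs)
               .batch(batch_size))
    return dataset.make_one_shot_iterator().get_next()
```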
/推荐系统算法实践—源码下载/第6章逻辑回归/LogisticRegression.py:
--------------------------------------------------------------------------------
1 |
2 | # coding: utf-8
3 |
4 | # ## 1)环境设定
5 |
6 | # In[25]:
7 |
8 | from sklearn.linear_model import LogisticRegression
9 | from sklearn import metrics
10 | from os import path, listdir
11 | from sklearn.datasets import load_svmlight_files
12 | from sklearn.model_selection import train_test_split
13 | from sklearn.externals import joblib
14 | from sklearn import preprocessing
15 | import numpy as np
16 | import pandas as pd
17 | import random
18 | import platform
19 | print("Python Version: %s"%(platform.python_version()))
20 |
21 |
22 | # ## 2)数据准备
23 |
24 | # In[26]:
25 |
26 | """
27 | 处理libSVM数据方法,生成样本,支持Batch格式返回,也支持X/Y返回
28 | """
29 | def process_data(data_path, feature_size, test_rat=0.3, random_seed=0, train_batch_size=5000, test_batch_size=5000):
30 | # 读取文件
31 | # batch生成
32 | return {"train_batch": train_batch, "test_batch": test_batch, "X_train": X_train, "X_test": X_test, "Y_train": Y_train, "Y_test": Y_test}
33 |
34 | # In[27]:
35 |
36 | # 数据测试
37 | data_path = '/data/data01/'
38 | test_rat=0.4
39 | random_seed=0
40 | train_batch_size=20000
41 | test_batch_size=20000
42 | feature_size=530
43 |
44 | # 获取样本数据
45 | data = process_data(data_path, feature_size, test_rat, random_seed, train_batch_size, test_batch_size)
46 |
47 | train_batch = data['train_batch']
48 | test_batch = data['test_batch']
49 | X_train = data['X_train']
50 | Y_train = data['Y_train']
51 | X_test = data['X_test']
52 | Y_test = data['Y_test']
53 |
54 | # 查看样本数据大小
55 | print("X_train.shape: ")
56 | print(X_train.shape)
57 | print("Y_train.shape: ")
58 | print(Y_train.shape)
59 | print("X_test.shape: ")
60 | print(X_test.shape)
61 | print("Y_test.shape: ")
62 | print(Y_test.shape)
63 |
64 |
65 | # ## 3)LR模型
66 |
67 | # In[30]:
68 |
69 | # 3.1 建立逻辑回归模型,并且设定参数
70 | lr_model= LogisticRegression(penalty='l2', C=1000, solver='lbfgs', max_iter=500)
71 |
72 | # 3.2 训练逻辑回归模型
73 | lr_model.fit(X_train,Y_train.values.ravel())
74 |
75 |
76 | # In[31]:
77 |
78 | # 3.3 采用测试集验证模型离线指标
79 | # 训练集AUC
80 | probs_train= lr_model.predict_proba(X_train)
81 | AUC1 = metrics.roc_auc_score(Y_train, probs_train[:,1])
82 | print("Train Auc: %s"%(AUC1))
83 |
84 | # 测试集AUC
85 | probs_test= lr_model.predict_proba(X_test)
86 | predict_test = lr_model.predict(X_test)
87 | AUC2 = metrics.roc_auc_score(Y_test, probs_test[:,1])
88 | print("Test Auc: %s"%(AUC2))
89 |
90 | # 准确率
91 | accuracy = metrics.accuracy_score(Y_test, predict_test)
92 | print("Test Accuracy: %s"%(accuracy))
93 |
94 | # 召回率
95 | recall = metrics.recall_score(Y_test, predict_test)
96 | print("Test Recall: %s"%(recall))
97 |
98 | # F1值
99 | f1 = metrics.f1_score(Y_test, predict_test)
100 | print("Test F1: %s"%(f1))
101 |
102 |
103 | # In[42]:
104 |
105 | # 3.4 打印模型参数
106 | w=lr_model.coef_
107 | print("参数大小:")
108 | print(w.shape)
109 | print("参数前10个:")
110 | print(lr_model.coef_[:,0:10])
111 | print("截距:")
112 | print(lr_model.intercept_)
113 | print("稀疏化特征比率:%.2f%%" %(np.mean(lr_model.coef_.ravel()==0)*100))
114 | print("sigmoid函数转化的值,即:概率p")
115 | print(lr_model.predict_proba(X_test[0:5]))
116 |
117 |
118 | # In[43]:
119 |
120 | # 3.5 模型保存
121 | joblib.dump(lr_model,"logistic_lr.model")
122 | #模型加载
123 | load_lr = joblib.load("logistic_lr.model")
124 | print(load_lr.predict_proba(X_test[0:5]))
125 |
126 |
127 | # In[ ]:
128 |
129 | # 3.1 建立逻辑回归模型,并且设定参数
130 | lr_model= LogisticRegression(penalty='l2', C=1000, solver='lbfgs', max_iter=500)
131 |
132 | # 3.2 训练逻辑回归模型
133 | lr_model.fit(X_train,Y_train)
134 |
135 | # 3.3 采用测试集验证模型离线指标
136 | # 训练集AUC
137 | probs_train= lr_model.predict_proba(X_train)
138 | AUC1 = metrics.roc_auc_score(Y_train, probs_train[:,1])
139 | print("Train Auc: %s"%(AUC1))
140 |
141 | # 测试集AUC
142 | probs_test= lr_model.predict_proba(X_test)
143 | predict_test = lr_model.predict(X_test)
144 | AUC2 = metrics.roc_auc_score(Y_test, probs_test[:,1])
145 | print("Test Auc: %s"%(AUC2))
146 |
147 | # 准确率
148 | accuracy = metrics.accuracy_score(Y_test, predict_test)
149 | print("Test Accuracy: %s"%(accuracy))
150 |
151 | # 召回率
152 | recall = metrics.recall_score(Y_test, predict_test)
153 | print("Test Recall: %s"%(recall))
154 |
155 | # F1值
156 | f1 = metrics.f1_score(Y_test, predict_test)
157 | print("Test F1: %s"%(f1))
158 |
159 | # 3.4 打印模型参数
160 | print("参数:",lr_model.coef_)
161 | print("截距:",lr_model.intercept_)
162 | print("稀疏化特征比率:%.2f%%" %(np.mean(lr_model.coef_.ravel()==0)*100))
163 | print("=========sigmoid函数转化的值,即:概率p=========")
164 | print(lr_model.predict_proba(X_test[0:5])) #sigmoid函数转化的值,即:概率p
165 |
166 | # 3.5 模型保存
167 | joblib.dump(lr_model,"logistic_lr.model")
168 | #模型加载
169 | load_lr = joblib.load("logistic_lr.model")
170 | print(load_lr.predict_proba(X_test[0:5]))
171 |
172 | # In[30]:
173 |
174 | # 3.1 建立逻辑回归模型,并且设定参数
175 | lr_model= LogisticRegression(penalty='l2', C=1000, solver='lbfgs', max_iter=500)
176 |
177 | # 3.2 训练逻辑回归模型
178 | lr_model.fit(X_train,Y_train)
179 |
180 |
181 | # In[46]:
182 |
183 | # 3.3 采用测试集验证模型离线指标
184 | # 训练集AUC
185 | probs_train= lr_model.predict_proba(X_train)
186 | AUC1 = metrics.roc_auc_score(Y_train, probs_train[:,1])
187 | print("Train Auc: %s"%(AUC1))
188 |
189 | # 测试集AUC
190 | probs_test= lr_model.predict_proba(X_test)
191 | predict_test = lr_model.predict(X_test)
192 | AUC2 = metrics.roc_auc_score(Y_test, probs_test[:,1])
193 | print("Test Auc: %s"%(AUC2))
194 |
195 | # 准确率
196 | accuracy = metrics.accuracy_score(Y_test, predict_test)
197 | print("Test Accuracy: %s"%(accuracy))
198 |
199 | # 召回率
200 | recall = metrics.recall_score(Y_test, predict_test)
201 | print("Test Recall: %s"%(recall))
202 |
203 | # F1值
204 | f1 = metrics.f1_score(Y_test, predict_test)
205 | print("Test F1: %s"%(f1))
206 |
207 |
208 | # In[49]:
209 |
210 | # 3.4 打印模型参数
211 | print("参数:",lr_model.coef_)
212 | print("截距:",lr_model.intercept_)
213 | print("稀疏化特征比率:%.2f%%" %(np.mean(lr_model.coef_.ravel()==0)*100))
214 | print("=========sigmoid函数转化的值,即:概率p=========")
215 | print(lr_model.predict_proba(X_test[0:5])) #sigmoid函数转化的值,即:概率p
216 |
217 |
218 | # In[53]:
219 |
220 | # 3.5 模型保存
221 | joblib.dump(lr_model,"logistic_lr.model")
222 | #模型加载
223 | load_lr = joblib.load("logistic_lr.model")
224 | print(load_lr.predict_proba(X_test[0:5]))
225 |
--------------------------------------------------------------------------------
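The body of `process_data` in `LogisticRegression.py` (and the identical placeholder in `Tree.py` and `FM_Sk.py`) keeps only its "read files / generate batches" comments. A rough sketch of one way to fill it in with the scikit-learn utilities already imported there (the directory layout of libSVM part files is an assumption):

```python
from os import path, listdir
import numpy as np
import pandas as pd
import scipy.sparse as sp
from sklearn import preprocessing
from sklearn.datasets import load_svmlight_files
from sklearn.model_selection import train_test_split

def process_data(data_path, feature_size, test_rat=0.3, random_seed=0,
                 train_batch_size=5000, test_batch_size=5000):
    # Read every libSVM part file under data_path into one feature matrix.
    files = [path.join(data_path, f) for f in listdir(data_path)
             if path.isfile(path.join(data_path, f))]
    parts = load_svmlight_files(files, n_features=feature_size)
    X = sp.vstack(parts[0::2])               # even entries: feature matrices
    Y = np.concatenate(parts[1::2])          # odd entries: label vectors
    # Scale features to [0, 1], then split into train and test sets.
    X = preprocessing.MinMaxScaler().fit_transform(X.toarray())
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=test_rat, random_state=random_seed)
    Y_train, Y_test = pd.DataFrame(Y_train), pd.DataFrame(Y_test)
    # Simple batch lists of (X, Y) chunks of the requested sizes.
    def make_batches(X_, Y_, size):
        return [(X_[i:i + size], Y_[i:i + size]) for i in range(0, X_.shape[0], size)]
    return {"train_batch": make_batches(X_train, Y_train, train_batch_size),
            "test_batch": make_batches(X_test, Y_test, test_batch_size),
            "X_train": X_train, "X_test": X_test,
            "Y_train": Y_train, "Y_test": Y_test}
```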
/推荐系统算法实践—源码下载/第6章逻辑回归/LogisticRegression.scala:
--------------------------------------------------------------------------------
1 | package book_code
2 |
3 | import org.apache.spark.ml.classification.{ BinaryLogisticRegressionSummary, LogisticRegression, LogisticRegressionModel }
4 | import org.apache.spark.ml.evaluation.{ MulticlassClassificationEvaluator, BinaryClassificationEvaluator }
5 | import org.apache.spark.ml.linalg.{ Vector, Vectors }
6 | import org.apache.spark.sql.types._
7 | import org.apache.spark.sql.functions._
8 | import org.apache.spark.sql._
9 | import org.apache.spark.sql.SparkSession
10 | import org.apache.spark.ml.feature._
11 | import java.util.Date
12 | import java.text.SimpleDateFormat
13 |
14 | object LogisticRegression {
15 |
16 | def main(args: Array[String]): Unit = {
17 |
18 | val spark = SparkSession.
19 | builder().
20 | appName("LogisticRegression").
21 | enableHiveSupport().
22 | getOrCreate()
23 |
24 | import spark.implicits._
25 |
26 | //1 参数准备
27 | val dataPath = "hdfs://1.1.1.1:9000/user/data01/"
28 | val iter = 500
29 | val reg_param = 0.0
30 | val elastic_net_param = 0.0
31 |
32 | //2 训练样本准备
33 | val (training, test) = readLibSvmSampleData(spark, dataPath)
34 | training.cache()
35 | test.cache()
36 | println(s"training.count(): ${training.count()}")
37 | println(s"test.count(): ${test.count()}")
38 | println("training.show")
39 | training.show
40 |
41 | //3 建立逻辑回归模型
42 | val lr = new LogisticRegression().
43 | setMaxIter(iter).
44 | setRegParam(reg_param).
45 | setElasticNetParam(elastic_net_param)
46 |
47 | //4 根据训练样本进行模型训练
48 | val lrModel = lr.fit(training)
49 |
50 | //5 打印模型信息
51 | println(s"Coefficients Top 10: ${lrModel.coefficients.toArray.slice(0, 10).mkString(" ")}")
52 | println(s"Intercept: ${lrModel.intercept}")
53 |
54 | //6 建立多元回归模型
55 | val mlr = new LogisticRegression().
56 | setMaxIter(500).
57 | setRegParam(0.0).
58 | setElasticNetParam(0.0).
59 | setFamily("multinomial")
60 |
61 | //7 根据训练样本进行模型训练
62 | val mlrModel = mlr.fit(training)
63 |
64 | //8 打印模型信息
65 | println(s"Multinomial coefficients: ${mlrModel.coefficientMatrix}")
66 | println(s"Multinomial intercepts: ${mlrModel.interceptVector}")
67 |
68 | //9 对模型进行测试
69 | val test_predict = lrModel.transform(test)
70 | test_predict.show
71 | test_predict.select("features", "label", "probability", "prediction").take(5).foreach {
72 | case Row(features: Vector, label: Double, prob: Vector, prediction: Double) =>
73 | println(s"($features, $label) -> prob=$prob, prediction=$prediction")
74 | }
75 |
76 | //10 模型摘要
77 | val trainingSummary = lrModel.summary
78 |
79 | //11 每次迭代目标值
80 | val objectiveHistory = trainingSummary.objectiveHistory
81 | println("objectiveHistory:")
82 | objectiveHistory.foreach(loss => println(loss))
83 |
84 | //12 计算模型指标数据
85 | val binarySummary = trainingSummary.asInstanceOf[BinaryLogisticRegressionSummary]
86 |
87 | //13 模型摘要AUC指标
88 | val roc = binarySummary.roc
89 | println("roc.show()")
90 | roc.show()
91 | val AUC = binarySummary.areaUnderROC
92 | println(s"areaUnderROC: ${binarySummary.areaUnderROC}")
93 |
94 | //14 测试集AUC指标
95 | val evaluator = new BinaryClassificationEvaluator().
96 | setLabelCol("label").
97 | setRawPredictionCol("probability").
98 | setMetricName("areaUnderROC")
99 | val testAUC = evaluator.evaluate(test_predict)
100 | println("Test AUC = " + testAUC)
101 |
102 | //15 设置模型阈值
103 | // 不同的阈值,计算不同的F1,然后通过最大的F1找出并重设模型的最佳阈值。
104 | val fMeasure = binarySummary.fMeasureByThreshold
105 | fMeasure.show
106 | // 获得最大的F1值
107 | val maxFMeasure = fMeasure.select(max("F-Measure")).head().getDouble(0)
108 | // 找出最大F1值对应的阈值(最佳阈值)
109 | val bestThreshold = fMeasure.where($"F-Measure" === maxFMeasure).select("threshold").head().getDouble(0)
110 | // 并将模型的Threshold设置为选择出来的最佳分类阈值
111 | lrModel.setThreshold(bestThreshold)
112 |
113 | //16 模型保存与加载
114 | // 保存
115 | val now = new Date()
116 | val dateFormat1 = new SimpleDateFormat("yyyyMMddHHmmss")
117 | val time_stamp = dateFormat1.format(now)
118 |
119 | lrModel.save(s"hdfs://1.1.1.1:9000/lrmodel/${time_stamp}")
120 | // 加载
121 | val load_lrModel = LogisticRegressionModel.load(s"hdfs://1.1.1.1:9000/lrmodel/${time_stamp}")
122 | // 加载测试
123 | val load_predict = load_lrModel.transform(test)
124 | println("加载测试")
125 | load_predict.select("features", "label", "probability", "prediction").take(5).foreach {
126 | case Row(features: Vector, label: Double, prob: Vector, prediction: Double) =>
127 | println(s"($features, $label) -> prob=$prob, prediction=$prediction")
128 | }
129 |
130 | }
131 |
132 | /**
133 | * 读取libSVM格式的文件,生成训练样本和测试样本。
134 | */
135 | def readLibSvmSampleData(
136 | @transient spark: org.apache.spark.sql.SparkSession,
137 | dataPath: String): (Dataset[LabeledPoint], Dataset[LabeledPoint]) = {
138 | import spark.implicits._
139 | // 2.1 读取样本
140 |
141 | // 2.3 划分样本
142 |
143 | (training, test)
144 | }
145 |
146 | }
--------------------------------------------------------------------------------
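`readLibSvmSampleData` above keeps only its "read samples / split samples" placeholder comments. A possible body, assuming plain text files with standard 1-based `label index:value` libSVM lines and a fixed feature dimension of 530 as in the Python listings:

```scala
import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.Dataset

def readLibSvmSampleData(
  @transient spark: org.apache.spark.sql.SparkSession,
  dataPath: String): (Dataset[LabeledPoint], Dataset[LabeledPoint]) = {
  import spark.implicits._
  val featureSize = 530                                   // assumed fixed feature dimension
  val sample = spark.read.textFile(dataPath)
    .filter(_.trim.nonEmpty)
    .map { line =>
      val fields = line.trim.split(" ")
      val label = fields.head.toDouble
      val pairs = fields.tail.map { kv =>
        val Array(i, v) = kv.split(":")
        (i.toInt - 1, v.toDouble)                         // libSVM indices are 1-based
      }
      LabeledPoint(label, Vectors.sparse(featureSize, pairs))
    }
  // 70/30 split into training and test sets.
  val Array(training, test) = sample.randomSplit(Array(0.7, 0.3), seed = 0L)
  (training, test)
}
```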
/推荐系统算法实践—源码下载/第7章FM/FM.py:
--------------------------------------------------------------------------------
1 |
2 | # coding: utf-8
3 |
4 | # ## 0)环境准备
5 |
6 | # In[1]:
7 |
8 | import numpy as np
9 | import tensorflow as tf
10 | import pandas as pd
11 | import random
12 | import math
13 | import re
14 |
15 | from sklearn import preprocessing
16 | from os import path, listdir
17 | from sklearn.datasets import load_svmlight_files
18 | from sklearn.model_selection import train_test_split
19 | from sklearn import metrics
20 | from tensorflow.contrib import layers
21 |
22 | import time
23 | import datetime
24 |
25 | import os
26 | os.environ["CUDA_VISIBLE_DEVICES"]="0"
27 |
28 | print tf.__version__
29 | print tf.__path__
30 |
31 |
32 | # ## 1)数据准备Dataset格式
33 |
34 | # In[2]:
35 |
36 | # 每一行解析,解析标签csv格式
37 | # 0.0,0.6666666666666666,0.5,0.0,0.0,0.0,0.0,0.7272727272727273,0.42857142857142855
38 | # 数据处理
39 | def process_data(data_type, my_path, feature_size, batch_size=32, num_epochs=1):
40 | filenames = get_file_list(my_path)
41 | next_element = read_my_file_format(data_type, filenames, feature_size, batch_size, num_epochs)
42 | return next_element
43 |
44 | # 创建session,指定GPU或者CPU使用率
45 | def get_session(gpu_fraction=0.1):
46 | gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=gpu_fraction,
47 | allow_growth=True)
48 | # server = tf.train.Server.create_local_server()
49 | return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
50 |
51 |
52 | # ## 2)FM模型
53 |
54 | # In[3]:
55 |
56 | class FM(object):
57 | """ 初始化成员变量 """
58 | def __init__(self, feature_size, fm_v_size, loss_fuc, train_optimizer, learning_rate, reg_type, reg_param):
59 | # 特征向量长度
60 | self.feature_size = feature_size
61 | # fm_v_size向量长度
62 | self.fm_v_size = fm_v_size
63 | # 损失函数
64 | self.loss_fuc = loss_fuc
65 | # 优化方法
66 | self.train_optimizer = train_optimizer
67 | # 学习率
68 | self.learning_rate = learning_rate
69 | # 正则类型
70 | self.reg_type = reg_type
71 | # 正则因子
72 | self.reg_param = reg_param
73 |         # global_step
74 | self.global_step = tf.Variable(0, dtype=tf.int32, trainable=False, name='global_step')
75 |
76 | def train(self, batch_data):
77 | """ 1 定义输入数据 """
78 | with tf.name_scope('input_data'):
79 | # 标签:[batch_size, 1]
80 | labels = batch_data['labels']
81 | # 用户特征向量:[batch_size, feature_size]
82 |             dense_vector = tf.reshape(batch_data['dense_vector'], shape=[-1, self.feature_size, 1]) # None * feature_size * 1
83 | print("%s: %s" % ("dense_vector", dense_vector))
84 | print("%s: %s" % ("labels", labels))
85 |
86 | """ 2 定义网络输出 """
87 | with tf.name_scope("FM_Comput_Score"):
88 | # FM参数,生成或者获取W V
89 | with tf.variable_scope("fm_layer", reuse=tf.AUTO_REUSE):
90 | self.FM_W = tf.get_variable(name='fm_w', shape=[self.feature_size, 1], initializer=tf.glorot_normal_initializer())
91 | self.FM_V = tf.get_variable(name='fm_v', shape=[self.feature_size, self.fm_v_size], initializer=tf.glorot_normal_initializer())
92 | self.FM_B = tf.Variable(tf.constant(0.0), dtype=tf.float32 ,name="fm_bias") # W0
93 | print("%s: %s" % ("FM_W", self.FM_W))
94 | print("%s: %s" % ("FM_V", self.FM_V))
95 | print("%s: %s" % ("FM_B", self.FM_B))
96 |
97 | # ---------- w * x----------
98 | Y_first = tf.reduce_sum(tf.multiply(self.FM_W, dense_vector), 2) # None * F
99 | print("%s: %s" % ("Y_first", Y_first))
100 |
101 | # ---------- Vij * Vij* Xij ---------------
102 | embeddings = tf.multiply(self.FM_V, dense_vector) # None * V * X
103 | # sum_square part
104 | summed_features_emb = tf.reduce_sum(embeddings, 1) # sum(v*x)
105 | summed_features_emb_square = tf.square(summed_features_emb) # (sum(v*x))^2
106 |
107 | # square_sum part
108 | squared_features_emb = tf.square(embeddings) # (v*x)^2
109 | squared_sum_features_emb = tf.reduce_sum(squared_features_emb, 1) # sum((v*x)^2)
110 |
111 | # second order
112 | Y_second = 0.5 * tf.subtract(summed_features_emb_square, squared_sum_features_emb) # 0.5*((sum(v*x))^2 - sum((v*x)^2))
113 | print("%s: %s" % ("Y_second", Y_second))
114 |
115 | # out = W * X + Vij * Vij* Xij
116 | FM_out_lay1 = tf.concat([Y_first, Y_second], axis=1)
117 | Y_Out = tf.reduce_sum(FM_out_lay1, 1)
118 | # out = out + bias
119 | y_d = tf.reshape(Y_Out,shape=[-1])
120 | Y_bias = self.FM_B * tf.ones_like(y_d, dtype=tf.float32) # Y_bias
121 | Y_Out = tf.add(Y_Out, Y_bias, name='Y_Out')
122 | print("%s: %s" % ("Y_bias", Y_bias))
123 | print("%s: %s" % ("Y_Out", Y_Out))
124 | # ---------- score ----------
125 | score=tf.nn.sigmoid(Y_Out,name='score')
126 | score=tf.reshape(score, shape=[-1, 1])
127 | print("%s: %s" % ("score", score))
128 |
129 | """ 3 定义损失函数和AUC指标 """
130 | with tf.name_scope("loss"):
131 |             # loss: Squared_error, Cross_entropy, FTLR
132 |             if self.reg_type == 'l1_reg':
133 |                 regularization = tf.contrib.layers.l1_regularizer(self.reg_param)(self.FM_W)
134 |             elif self.reg_type == 'l2_reg':
135 |                 regularization = self.reg_param * tf.nn.l2_loss(self.FM_W)
136 |             else:
137 |                 regularization = self.reg_param * tf.nn.l2_loss(self.FM_W)
138 | 
139 |             if self.loss_fuc == 'Squared_error':
140 |                 loss = tf.reduce_mean(tf.reduce_sum(tf.square(labels - score), reduction_indices=[1])) + regularization
141 |             elif self.loss_fuc == 'Cross_entropy':
142 |                 loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=tf.reshape(Y_Out, [-1]), labels=tf.reshape(labels, [-1]))) + regularization
143 |             elif self.loss_fuc == 'FTLR':
144 | loss = tf.reduce_mean(tf.reduce_sum(tf.square(labels - score), reduction_indices=[1])) + regularization
145 | # AUC
146 | auc = tf.metrics.auc(labels, score)
147 | print("%s: %s" % ("labels", labels))
148 | # w为0的比例,w的平均值
149 | w_zero_ratio = tf.reduce_mean(tf.to_float(tf.abs(self.FM_W) <= 1.0e-5))
150 | w_avg = tf.reduce_mean(self.FM_W)
151 | v_zero_ratio = tf.reduce_mean(tf.to_float(tf.abs(self.FM_V) <= 1.0e-5))
152 | v_avg = tf.reduce_mean(self.FM_V)
153 |
154 | """ 4 设定optimizer """
155 | with tf.name_scope("optimizer"):
156 |             #------build optimizer------
157 |             with tf.variable_scope("Optimizer", reuse=tf.AUTO_REUSE):
158 |                 if self.train_optimizer == 'Adam':
159 |                     optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-8)
160 |                 elif self.train_optimizer == 'Adagrad':
161 |                     optimizer = tf.train.AdagradOptimizer(learning_rate=self.learning_rate, initial_accumulator_value=1e-8)
162 |                 elif self.train_optimizer == 'Momentum':
163 |                     optimizer = tf.train.MomentumOptimizer(learning_rate=self.learning_rate, momentum=0.95)
164 |                 elif self.train_optimizer == 'ftrl':
165 |                     optimizer = tf.train.FtrlOptimizer(self.learning_rate)
166 | train_step = optimizer.minimize(loss, global_step=self.global_step)
167 |
168 | """5 设定summary,以便在Tensorboard里进行可视化 """
169 | with tf.name_scope("summaries"):
170 | tf.summary.scalar("loss", loss)
171 | tf.summary.scalar("accumulate_auc", auc[0])
172 | tf.summary.scalar("w_avg", w_avg)
173 | tf.summary.scalar("w_zero_ratio", w_zero_ratio)
174 | tf.summary.scalar("v_avg", v_avg)
175 | tf.summary.scalar("v_zero_ratio", v_zero_ratio)
176 | tf.summary.histogram("FM_W", self.FM_W)
177 | tf.summary.histogram("FM_V", self.FM_V)
178 | # 好几个summary,所以这里要merge_all
179 | summary_op = tf.summary.merge_all()
180 |
181 | """6 返回结果 """
182 | return Y_Out, score, regularization, loss, auc, train_step, w_zero_ratio, w_avg, v_zero_ratio, v_avg, labels, score, summary_op
183 |
184 |
185 | # ## 3)模型训练测试
186 |
187 | # In[4]:
188 |
189 | # 测试数据
190 | filenames = '/data/csv-all'
191 | data_type='csv'
192 | feature_size = 530
193 | batch_size = 6000
194 | num_epochs = 200
195 | next_element = process_data(data_type, filenames, feature_size, batch_size, num_epochs)
196 |
197 | # 模型参数
198 | feature_size = 530
199 | fm_v_size = 20
200 | loss_fuc = 'Cross_entropy'
201 | train_optimizer = 'Adam'
202 | learning_rate = 0.01
203 | reg_type = 'l2_reg'
204 | reg_param = 0.000
205 | log_path='/data/log/FM_Cross_entropy_L2_0_20180816_01'
206 |
207 | # 开始训练
208 | bea_model = FM(feature_size, fm_v_size, loss_fuc, train_optimizer, learning_rate, reg_type, reg_param)
209 | Y_Out, score, regularization, loss, auc, train_step, w_zero_ratio, w_avg, v_zero_ratio, v_avg, labels, score, summary_op = bea_model.train(next_element)
210 |
211 | init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer(), tf.tables_initializer())
212 | gpu_fraction = 0.6
213 | my_device='/gpu:0'
214 | with tf.device(my_device):
215 | sess = get_session(gpu_fraction)
216 | sess.run(init_op)
217 | batch_cnt = 0
218 | #选定可视化存储目录
219 | writer = tf.summary.FileWriter(log_path, sess.graph)
220 | try:
221 | while True:
222 | batch_cnt = batch_cnt + 1
223 | a, b, c, d, e, summary = sess.run([loss, auc, w_zero_ratio, w_avg, train_step, summary_op])
224 | if batch_cnt % 50 == 0 or batch_cnt <= 10:
225 | y, p = sess.run([labels, score])
226 | if y.sum() > 0.0:
227 | batch_auc=metrics.roc_auc_score(y, p)
228 | else:
229 | batch_auc=0.0
230 | print("batch: {} loss: {:.4f} accumulate_auc: {:.4f} batch_auc: {:.4f} w_zero_ratio: {:.4f} w_avg: {:.4f}".format(batch_cnt, a, b[0], batch_auc, c, d))
231 | writer.add_summary(summary, batch_cnt)
232 | except tf.errors.OutOfRangeError:
233 | print("3、Train end of dataset")
234 |
--------------------------------------------------------------------------------
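The second-order term in `FM.py` above relies on the identity sum_{i<j} <v_i, v_j> x_i x_j = 0.5 * sum_f ((sum_i v_if x_i)^2 - sum_i (v_if x_i)^2), which turns an O(n^2 k) pairwise sum into the O(nk) "sum_square - square_sum" computation in the code. A tiny NumPy check with arbitrary toy values:

```python
import numpy as np

rng = np.random.RandomState(0)
n_features, k = 5, 3                       # toy sizes
x = rng.rand(n_features)                   # feature values
V = rng.rand(n_features, k)                # factor matrix, one k-dim row per feature

# Brute force: explicit sum over feature pairs.
brute = sum(np.dot(V[i], V[j]) * x[i] * x[j]
            for i in range(n_features) for j in range(i + 1, n_features))

# Vectorized form, mirroring the sum_square / square_sum trick in FM.py.
vx = V * x[:, None]                        # v_if * x_i
fast = 0.5 * np.sum(np.sum(vx, axis=0) ** 2 - np.sum(vx ** 2, axis=0))

print(brute, fast)                         # the two numbers should agree
```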
/推荐系统算法实践—源码下载/第7章FM/FM_Sk.py:
--------------------------------------------------------------------------------
1 |
2 | # coding: utf-8
3 |
4 | # ## 0)环境设定
5 |
6 | # In[1]:
7 |
8 | from sklearn import metrics
9 | from os import path, listdir
10 | from sklearn.datasets import load_svmlight_files
11 | from sklearn.model_selection import train_test_split
12 | from sklearn.externals import joblib
13 | from sklearn import preprocessing
14 | from sklearn import metrics
15 | from fastFM import als
16 | import numpy as np
17 | import pandas as pd
18 | import random
19 |
20 |
21 | # ## 1)数据准备
22 |
23 | # In[2]:
24 |
25 | # 数据处理,读取libSVM格式数据,并且将数据归一化,样本划分,并且根据batch参数生成batch
26 | def process_data(data_path, feature_size, test_rat=0.3, random_seed=0, train_batch_size=5000, test_batch_size=5000):
27 | # 读取文件
28 | # batch生成
29 | return {"train_batch": train_batch, "test_batch": test_batch, "X_train": X_train, "X_test": X_test, "Y_train": Y_train, "Y_test": Y_test}
30 |
31 | # In[3]:
32 |
33 | data_path = '/data/data01/'
34 | test_rat=0.4
35 | random_seed=0
36 | train_batch_size=20000
37 | test_batch_size=20000
38 | feature_size=530
39 |
40 | # 获取样本数据
41 | data = process_data(data_path, feature_size, test_rat, random_seed, train_batch_size, test_batch_size)
42 |
43 | train_batch = data['train_batch']
44 | test_batch = data['test_batch']
45 |
46 | X_train = data['X_train']
47 | Y_train = data['Y_train']
48 | X_test = data['X_test']
49 | Y_test = data['Y_test']
50 |
51 | print X_train.shape
52 | print Y_train.shape
53 | print X_test.shape
54 | print Y_test.shape
55 |
56 |
57 | # In[6]:
58 |
59 | print Y_train
60 |
61 |
62 | # ## 3)FM模型
63 |
64 | # In[4]:
65 |
66 | # 3.1 建立FM模型,并且设定参数
67 | fm_model = als.FMRegression(n_iter=1000, init_stdev=0.1, rank=10, random_state=0, l2_reg_w=0.0, l2_reg_V=0.0, l2_reg=0)
68 |
69 | # 3.2 训练FM模型
70 | fm_model.fit(X_train,Y_train)
71 |
72 | # 3.3 采用测试集验证模型离线指标
73 | # 训练集AUC
74 | scores_train = fm_model.predict(X_train)  # als.FMRegression has no predict_proba; use its continuous scores
75 | AUC1 = metrics.roc_auc_score(Y_train, scores_train)
76 | print("Train Auc: %s"%(AUC1))
77 |
78 | # 测试集AUC
79 | scores_test = fm_model.predict(X_test)
80 | predict_test = (scores_test > 0.5).astype(int)  # threshold the regression scores for the class metrics below
81 | AUC2 = metrics.roc_auc_score(Y_test, scores_test)
82 | print("Test Auc: %s"%(AUC2))
83 |
84 | # 准确率
85 | accuracy = metrics.accuracy_score(Y_test, predict_test)
86 | print("Test Accuracy: %s"%(accuracy))
87 |
88 | # 召回率
89 | recall = metrics.recall_score(Y_test, predict_test)
90 | print("Test Recall: %s"%(recall))
91 |
92 | # F1值
93 | f1 = metrics.f1_score(Y_test, predict_test)
94 | print("Test F1: %s"%(f1))
95 |
96 | # 3.5 模型保存
97 | joblib.dump(fm_model,"FM.model")
98 | #模型加载
99 | print("模型加载")
100 | load_lr = joblib.load("FM.model")
101 | print(load_lr.predict(X_test[0:5]))
102 |
103 |
104 | # In[ ]:
105 |
106 | # 3.4 打印模型参数
107 | print("w0 (bias):", fm_model.w0_)
108 | print("w (linear weights) top 10:", fm_model.w_[0:10])
109 | print("Sparsity of w: %.2f%%" %(np.mean(fm_model.w_==0)*100))
110 | print("V (pairwise factor matrix) shape:", fm_model.V_.shape)
111 | print(fm_model.predict(X_test[0:5]))  # continuous FM scores for the first 5 test samples
112 |
113 | # 3.5 模型保存
114 | joblib.dump(fm_model,"FM.model")
115 | #模型加载
116 | load_lr = joblib.load("FM.model")
117 | print(load_lr.predict_proba(X_test[0:5]))
118 |
119 |
--------------------------------------------------------------------------------
/推荐系统算法实践—源码下载/第8章决策树/DecisionTrees.scala:
--------------------------------------------------------------------------------
1 | package book_code
2 |
3 | import org.apache.spark.ml.feature._
4 | import org.apache.spark.ml.Pipeline
5 | import org.apache.spark.ml.classification.{ RandomForestClassificationModel, RandomForestClassifier }
6 | import org.apache.spark.ml.classification.{ DecisionTreeClassifier, DecisionTreeClassificationModel }
7 | import org.apache.spark.ml.classification.{ GBTClassificationModel, GBTClassifier }
8 | import org.apache.spark.ml.evaluation.{ MulticlassClassificationEvaluator, BinaryClassificationEvaluator }
9 | import org.apache.spark.ml.{ Pipeline, PipelineModel }
10 | import org.apache.spark.ml.param.ParamMap
11 | import org.apache.spark.ml.linalg.{ Vector, Vectors }
12 | import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
13 | import org.apache.spark.sql.Encoder
14 | import org.apache.spark.sql.types._
15 | import org.apache.spark.sql.functions._
16 | import org.apache.spark.sql._
17 | import org.apache.spark.sql.SparkSession
18 | import java.util.Date
19 | import java.text.SimpleDateFormat
20 |
21 | object DecisionTrees {
22 |
23 | def main(args: Array[String]): Unit = {
24 |
25 | val spark = SparkSession.builder().
26 | master("local").
27 | appName("decision_trees").
28 | getOrCreate()
29 |
30 | import spark.implicits._
31 |
32 | //1 参数准备
33 | val dataPath = "hdfs://1.1.1.1:9000/user/data01/"
34 |
35 | //2 训练样本准备
36 | val (training, test) = readLibSvmSampleData(spark, dataPath)
37 | training.cache()
38 | test.cache()
39 | println(s"training.count(): ${training.count()}")
40 | println(s"test.count(): ${test.count()}")
41 | println("training.show")
42 | training.show
43 |
44 | val data = training.unionAll(test)
45 |
46 | //2 标签进行索引编号
47 | val labelIndexer = new StringIndexer().
48 | setInputCol("label").
49 | setOutputCol("indexedLabel").
50 | fit(data)
51 | // 对离散特征进行标记索引,以用来确定哪些特征是离散特征
52 | // 如果一个特征的值超过4个以上,该特征视为连续特征,否则将会标记得离散特征并进行索引编号
53 | val featureIndexer = new VectorIndexer().
54 | setInputCol("features").
55 | setOutputCol("indexedFeatures").
56 | setMaxCategories(4).
57 | fit(data)
58 |
59 | //3 样本划分
60 | val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3))
61 |
62 | //4 训练决策树模型
63 | val dt = new DecisionTreeClassifier().
64 | setLabelCol("indexedLabel").
65 | setFeaturesCol("indexedFeatures")
66 |
67 | //4 训练随机森林模型
68 | val rf = new RandomForestClassifier()
69 | .setLabelCol("indexedLabel")
70 | .setFeaturesCol("indexedFeatures")
71 | .setNumTrees(10)
72 | .setMaxDepth(15)
73 |
74 | //4 训练GBDT模型
75 | val gbt = new GBTClassifier()
76 | .setLabelCol("indexedLabel")
77 | .setFeaturesCol("indexedFeatures")
78 | .setMaxIter(10)
79 | .setMaxDepth(15)
80 |
81 | //5 将索引的标签转回原始标签
82 | val labelConverter = new IndexToString().
83 | setInputCol("prediction").
84 | setOutputCol("predictedLabel").
85 | setLabels(labelIndexer.labels)
86 |
87 | //6 构建Pipeline
88 | val pipeline1 = new Pipeline().
89 | setStages(Array(labelIndexer, featureIndexer, dt, labelConverter))
90 | val pipeline2 = new Pipeline().
91 | setStages(Array(labelIndexer, featureIndexer, rf, labelConverter))
92 | val pipeline3 = new Pipeline().
93 | setStages(Array(labelIndexer, featureIndexer, gbt, labelConverter))
94 |
95 | //7 Pipeline开始训练
96 | val model1 = pipeline1.fit(trainingData)
97 | val model2 = pipeline2.fit(trainingData)
98 | val model3 = pipeline3.fit(trainingData)
99 |
100 | //8 模型测试
101 | val predictions = model3.transform(testData)
102 | println("predictions.show")
103 | predictions.select("predictedLabel", "label", "features").show(10)
104 |
105 | //9 分类指标
106 | // 正确率
107 | val evaluator1 = new MulticlassClassificationEvaluator().
108 | setLabelCol("indexedLabel").
109 | setPredictionCol("prediction").
110 | setMetricName("accuracy")
111 | val accuracy = evaluator1.evaluate(predictions)
112 | println("Test Error = " + (1.0 - accuracy))
113 | // f1
114 | val evaluator2 = new MulticlassClassificationEvaluator().
115 | setLabelCol("indexedLabel").
116 | setPredictionCol("prediction").
117 | setMetricName("f1")
118 | val f1 = evaluator2.evaluate(predictions)
119 | println("f1 = " + f1)
120 | // Precision
121 | val evaluator3 = new MulticlassClassificationEvaluator().
122 | setLabelCol("indexedLabel").
123 | setPredictionCol("prediction").
124 | setMetricName("weightedPrecision")
125 | val Precision = evaluator3.evaluate(predictions)
126 | println("Precision = " + Precision)
127 | // Recall
128 | val evaluator4 = new MulticlassClassificationEvaluator().
129 | setLabelCol("indexedLabel").
130 | setPredictionCol("prediction").
131 | setMetricName("weightedRecall")
132 | val Recall = evaluator4.evaluate(predictions)
133 | println("Recall = " + Recall)
134 |
135 | // AUC
136 | val evaluator5 = new BinaryClassificationEvaluator().
137 | setLabelCol("indexedLabel").
138 | setRawPredictionCol("prediction").
139 | setMetricName("areaUnderROC")
140 | val AUC = evaluator5.evaluate(predictions)
141 | println("Test AUC = " + AUC)
142 |
143 | // aupr
144 | val evaluator6 = new BinaryClassificationEvaluator().
145 | setLabelCol("indexedLabel").
146 | setRawPredictionCol("prediction").
147 | setMetricName("areaUnderPR")
148 | val aupr = evaluator6.evaluate(predictions)
149 | println("Test aupr = " + aupr)
150 |
151 | //10 决策树打印
152 | val treeModel = model1.stages(2).asInstanceOf[DecisionTreeClassificationModel]
153 | println("Learned classification tree model:\n" + treeModel.toDebugString)
154 |
155 | //11 模型保存与加载
156 | val now = new Date()
157 | val dateFormat1 = new SimpleDateFormat("yyyyMMddHHmmss")
158 | val time_stamp = dateFormat1.format(now)
159 |     model1.save(s"hdfs://1.1.1.1:9000/dtmodel/${time_stamp}")
160 | val load_treeModel = PipelineModel.load(s"hdfs://1.1.1.1:9000/dtmodel/${time_stamp}")
161 |
162 | }
163 |
164 | /**
165 | * 读取libSVM格式的文件,生成训练样本和测试样本。
166 | */
167 | def readLibSvmSampleData(
168 | @transient spark: org.apache.spark.sql.SparkSession,
169 | dataPath: String): (Dataset[LabeledPoint], Dataset[LabeledPoint]) = {
170 | import spark.implicits._
171 | // 2.1 读取样本
172 | // 2.3 划分样本
173 | (training, test)
174 | }
175 |
176 | }
177 |
178 |
--------------------------------------------------------------------------------
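After fitting, the tree ensembles inside the pipelines above can be inspected directly; a brief sketch that pulls the fitted random forest out of `pipeline2`/`model2` (stage index 2 matches the `Array(labelIndexer, featureIndexer, rf, labelConverter)` ordering) and prints its feature importances:

```scala
import org.apache.spark.ml.classification.RandomForestClassificationModel

val rfModel = model2.stages(2).asInstanceOf[RandomForestClassificationModel]
println(s"Number of trees: ${rfModel.trees.length}")
// featureImportances is a Vector aligned with the indexedFeatures column.
val topFeatures = rfModel.featureImportances.toArray.zipWithIndex
  .sortBy(-_._1)
  .take(10)
topFeatures.foreach { case (importance, idx) =>
  println(s"feature $idx -> ${importance.formatted("%.4f")}")
}
```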
/推荐系统算法实践—源码下载/第8章决策树/Tree.py:
--------------------------------------------------------------------------------
1 |
2 | # coding: utf-8
3 |
4 | # ## 1)环境设定
5 |
6 | # In[1]:
7 |
8 | from sklearn.linear_model import LogisticRegression
9 | from sklearn import metrics
10 | from os import path, listdir
11 | from sklearn.datasets import load_svmlight_files
12 | from sklearn.model_selection import train_test_split
13 | from sklearn.externals import joblib
14 | from sklearn import preprocessing
15 | import numpy as np
16 | import pandas as pd
17 | import random
18 |
19 | from sklearn import metrics
20 | from sklearn.svm import SVC
21 | from sklearn.neural_network import MLPClassifier
22 | from sklearn.neighbors import KNeighborsClassifier
23 | from sklearn.tree import DecisionTreeClassifier
24 | from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
25 | from sklearn.naive_bayes import GaussianNB
26 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
27 | from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
28 |
29 |
30 | # ## 2)数据准备
31 |
32 | # In[2]:
33 |
34 | # 数据处理,读取libSVM格式数据,并且将数据归一化,样本划分,并且根据batch参数生成batch
35 | def process_data(data_path, feature_size, test_rat=0.3, random_seed=0, train_batch_size=5000, test_batch_size=5000):
36 | # 读取文件
37 | # batch生成
38 | return {"train_batch": train_batch, "test_batch": test_batch, "X_train": X_train, "X_test": X_test, "Y_train": Y_train, "Y_test": Y_test}
39 |
40 | # In[3]:
41 |
42 | data_path = '/data/data01/'
43 | test_rat=0.4
44 | random_seed=0
45 | train_batch_size=20000
46 | test_batch_size=20000
47 | feature_size=530
48 |
49 | # 获取样本数据
50 | data = process_data(data_path, feature_size, test_rat, random_seed, train_batch_size, test_batch_size)
51 |
52 | train_batch = data['train_batch']
53 | test_batch = data['test_batch']
54 |
55 | X_train = data['X_train']
56 | Y_train = data['Y_train']
57 | X_test = data['X_test']
58 | Y_test = data['Y_test']
59 |
60 | print X_train.shape
61 | print Y_train.shape
62 | print X_test.shape
63 | print Y_test.shape
64 |
65 |
66 | # ## 3)Tree模型
67 |
68 | # In[4]:
69 |
70 | # 3.1 随机森林模型,并且设定参数
71 | rf_model= RandomForestClassifier(
72 | n_estimators=30,
73 | criterion='gini',
74 | max_depth=20,
75 | min_samples_leaf=200)
76 |
77 | # 3.1 GBDT模型,并且设定参数
78 | gbdt_model= GradientBoostingClassifier(
79 | n_estimators=30,
80 | criterion='friedman_mse',
81 | max_depth=20,
82 | min_samples_leaf=200)
83 |
84 | # 3.2 训练模型
85 | rf_model.fit(X_train,Y_train.values.ravel())
86 | gbdt_model.fit(X_train,Y_train.values.ravel())
87 |
88 |
89 | # In[5]:
90 |
91 | # 3.4 Evaluate offline metrics on the test set
92 | # Training-set AUC
93 | probs_train= rf_model.predict_proba(X_train)
94 | AUC1 = metrics.roc_auc_score(Y_train, probs_train[:,1])
95 | print("RF Train Auc: %s"%(AUC1))
96 |
97 | # Test-set AUC
98 | probs_test= rf_model.predict_proba(X_test)
99 | predict_test = rf_model.predict(X_test)
100 | AUC2 = metrics.roc_auc_score(Y_test, probs_test[:,1])
101 | print("RF Test Auc: %s"%(AUC2))
102 |
103 | # Training-set AUC
104 | probs_train2= gbdt_model.predict_proba(X_train)
105 | AUC3 = metrics.roc_auc_score(Y_train, probs_train2[:,1])
106 | print("Gbdt Train Auc: %s"%(AUC3))
107 |
108 | # Test-set AUC
109 | probs_test2= gbdt_model.predict_proba(X_test)
110 | AUC4 = metrics.roc_auc_score(Y_test, probs_test2[:,1])
111 | print("Gbdt Test Auc: %s"%(AUC4))
112 |
113 |
114 | # In[6]:
115 |
116 | # Accuracy
117 | accuracy = metrics.accuracy_score(Y_test, predict_test)
118 | print("Test Accuracy: %s"%(accuracy))
119 |
120 | # Recall
121 | recall = metrics.recall_score(Y_test, predict_test)
122 | print("Test Recall: %s"%(recall))
123 |
124 | # F1 score
125 | f1 = metrics.f1_score(Y_test, predict_test)
126 | print("Test F1: %s"%(f1))
127 |
128 |
129 | # In[7]:
130 |
131 | # 3.1 Random forest model with adjusted parameters
132 | rf_model= RandomForestClassifier(
133 | n_estimators=50,
134 | criterion='gini',
135 | max_depth=30,
136 | min_samples_leaf=100)
137 |
138 | # 3.2 GBDT model with adjusted parameters
139 | gbdt_model= GradientBoostingClassifier(
140 | n_estimators=50,
141 | criterion='friedman_mse',
142 | max_depth=30,
143 | min_samples_leaf=100)
144 |
145 | # 3.3 Train the models
146 | rf_model.fit(X_train,Y_train.values.ravel())
147 | gbdt_model.fit(X_train,Y_train.values.ravel())
148 |
149 | # 3.4 Evaluate offline metrics on the test set
150 | # RF training-set AUC
151 | probs_train= rf_model.predict_proba(X_train)
152 | AUC1 = metrics.roc_auc_score(Y_train, probs_train[:,1])
153 | print("RF Train Auc: %s"%(AUC1))
154 |
155 | # RF test-set AUC
156 | probs_test= rf_model.predict_proba(X_test)
157 | predict_test = rf_model.predict(X_test)
158 | AUC2 = metrics.roc_auc_score(Y_test, probs_test[:,1])
159 | print("RF Test Auc: %s"%(AUC2))
160 |
161 | # GBDT training-set AUC
162 | probs_train2= gbdt_model.predict_proba(X_train)
163 | AUC3 = metrics.roc_auc_score(Y_train, probs_train2[:,1])
164 | print("Gbdt Train Auc: %s"%(AUC3))
165 |
166 | # GBDT test-set AUC
167 | probs_test2= gbdt_model.predict_proba(X_test)
168 | AUC4 = metrics.roc_auc_score(Y_test, probs_test2[:,1])
169 | print("Gbdt Test Auc: %s"%(AUC4))
170 |
171 | # Accuracy
172 | accuracy = metrics.accuracy_score(Y_test, predict_test)
173 | print("Test Accuracy: %s"%(accuracy))
174 |
175 | # Recall
176 | recall = metrics.recall_score(Y_test, predict_test)
177 | print("Test Recall: %s"%(recall))
178 |
179 | # F1 score
180 | f1 = metrics.f1_score(Y_test, predict_test)
181 | print("Test F1: %s"%(f1))
182 |
183 | # 3.5 Save the models
184 | joblib.dump(rf_model,"rf_model.model")
185 | joblib.dump(gbdt_model,"gbdt_model.model")
186 | # Load the models
187 | load_rf = joblib.load("rf_model.model")
188 | load_gbdt = joblib.load("gbdt_model.model")
189 | print(load_rf.predict_proba(X_test[0:5]))
190 | print(load_gbdt.predict_proba(X_test[0:5]))
191 |
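# Editor's note (not in the original source): `sklearn.externals.joblib`, used above, was removed
# in scikit-learn 0.23+. Under a recent scikit-learn the equivalent save/load would be roughly:
#
#     import joblib
#     joblib.dump(rf_model, "rf_model.model")
#     load_rf = joblib.load("rf_model.model")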
--------------------------------------------------------------------------------
/推荐系统算法实践—源码下载/第9章集成学习/GbdtLr.scala:
--------------------------------------------------------------------------------
1 | package book_code
2 |
3 | import org.apache.hadoop.conf.Configuration
4 | import org.apache.hadoop.fs.{ FileSystem, Path }
5 | import org.apache.spark.mllib.classification.{ LogisticRegressionModel, LogisticRegressionWithLBFGS }
6 | import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
7 | import org.apache.spark.mllib.linalg.Vectors
8 | import org.apache.spark.ml.linalg.{ Vector => mlVector }
9 | import org.apache.spark.mllib.linalg.Vector
10 | import org.apache.spark.mllib.regression.LabeledPoint
11 | import org.apache.spark.mllib.tree.GradientBoostedTrees
12 | import org.apache.spark.mllib.tree.configuration.BoostingStrategy
13 | import org.apache.spark.mllib.tree.configuration.FeatureType._
14 | import org.apache.spark.mllib.tree.model.{ GradientBoostedTreesModel, Node }
15 | import org.apache.spark.rdd.RDD
16 | import org.apache.spark.sql._
17 | import scala.collection.mutable.ArrayBuffer
18 |
19 | object GbdtLr {
20 |
21 | def main(args: Array[String]): Unit = {
22 |
23 | val spark = SparkSession.builder().
24 | master("local").
25 | appName("GbdtLr").
26 | getOrCreate()
27 |
28 | import spark.implicits._
29 |
30 |     //1 Parameter setup
31 | val iteratTree = 10
32 | val iteratDepth = 10
33 | val maxAuc = 0.0
34 | val maxDepth = 15
35 | val numTrees = 10
36 | val minInstancesPerNode = 2
37 |
38 |     //2 Prepare the training samples
39 | val dataPath = "hdfs://1.1.1.1:9000/user/data01/"
40 |
41 |     // Read and split the libSVM samples
42 | val (trainingData, testData) = readLibSvmSampleData(spark, dataPath)
43 | trainingData.cache()
44 | testData.cache()
45 | println(s"trainingData.count(): ${trainingData.count()}")
46 | println(s"testData.count(): ${testData.count()}")
47 | println("trainingData.show")
48 | trainingData.show
49 |     val data = trainingData.union(testData)
50 |
51 |     //3 Train the GBDT model
52 | val boostingStrategy = BoostingStrategy.defaultParams("Regression")
53 | boostingStrategy.treeStrategy.categoricalFeaturesInfo = Map[Int, Int]()
54 | boostingStrategy.treeStrategy.minInstancesPerNode = minInstancesPerNode
55 | boostingStrategy.numIterations = numTrees
56 | boostingStrategy.treeStrategy.maxDepth = maxDepth
57 | val gbdtModel = GradientBoostedTrees.train(trainingData.rdd, boostingStrategy)
58 |
59 |     //4 Parse the GBDT model: collect the leaf nodes of every tree
60 | val treeLeafMap = getTreeLeafMap(gbdtModel)
61 |
62 |     //5 Transform the samples into GBDT leaf-node index features
63 | val lrSampleLablePoint = lrSample(data.rdd, treeLeafMap, gbdtModel)
64 | val lrSplits = lrSampleLablePoint.randomSplit(Array(0.7, 0.3))
65 | val (lrTrainingData, lrTestData) = (lrSplits(0), lrSplits(1))
66 | lrTrainingData.cache()
67 | lrTrainingData.count()
68 | lrTestData.cache()
69 | lrTestData.count()
70 |
71 |     //6 Train the LR model
72 | val lr = new LogisticRegressionWithLBFGS().setNumClasses(2)
73 | lr.optimizer.setNumIterations(100)
74 | lr.optimizer.setRegParam(0.0)
75 | val lrModel = lr.run(lrTrainingData)
76 |
77 |     //7 Compute model metrics
78 | lrModel.clearThreshold()
79 | val scoreAndLabels = lrTestData.map { point =>
80 | val score = lrModel.predict(point.features)
81 | (score, point.label)
82 | }
83 | val metrics = new BinaryClassificationMetrics(scoreAndLabels)
84 | val auc = metrics.areaUnderROC()
85 | val aupr = metrics.areaUnderPR()
86 | println(s"AUC: ${auc}")
87 | println(s"AUPR: ${aupr}")
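    // Editor's note: a minimal save/load sketch, not in the original source; the HDFS path below
    // is hypothetical. It mirrors the model save/load step used in the decision-tree chapter.
    val lrModelPath = "hdfs://1.1.1.1:9000/gbdtlr/lrmodel"
    lrModel.save(spark.sparkContext, lrModelPath)
    val loadedLrModel = LogisticRegressionModel.load(spark.sparkContext, lrModelPath)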
88 |
89 | }
90 |
91 |   /**
92 |    * Generate the GBDT+LR training samples from the GBDT model.
93 |    */
94 |   def lrSample(sampleLablePoint: RDD[LabeledPoint], lrFeatureMap: Map[String, Int], gbdtModel: GradientBoostedTreesModel): RDD[LabeledPoint] = {
95 |     lrSamplLablePoint // body elided here; see the full implementation in 推荐系统算法实践—补充部分/第6-11和13章节/spark_others.scala
96 |   }
97 |
98 |   /**
99 |    * Parse the GBDT model and index its leaf nodes.
100 |    */
101 |   def getTreeLeafMap(gbdtModel: GradientBoostedTreesModel): Map[String, Int] = {
102 |     lrFeatureMap // body elided here; see the full implementation in 推荐系统算法实践—补充部分/第6-11和13章节/spark_others.scala
103 |   }
104 |
105 | /**
106 |    * Read libSVM-format files and generate training and test samples.
107 | */
108 | def readLibSvmSampleData(
109 | @transient spark: org.apache.spark.sql.SparkSession,
110 | dataPath: String): (Dataset[LabeledPoint], Dataset[LabeledPoint]) = {
111 | import spark.implicits._
112 |     // 2.1 Read the samples
113 |     // 2.3 Split the samples (body elided here; see the full implementation in 推荐系统算法实践—补充部分/第6-11和13章节/spark_others.scala)
114 | (training, test)
115 | }
116 |
117 | }
118 |
119 |
--------------------------------------------------------------------------------
/推荐系统算法实践—源码下载/第9章集成学习/gcForest.py:
--------------------------------------------------------------------------------
1 |
2 | # coding: utf-8
3 |
4 | # ## 1) Environment setup
5 |
6 | # In[31]:
7 |
8 | import argparse
9 | import numpy as np
10 | import sys
11 | from keras.datasets import mnist
12 | import pickle
13 | from sklearn.ensemble import RandomForestClassifier
14 | from sklearn.metrics import accuracy_score
15 | sys.path.insert(0, "lib")
16 |
17 | from gcforest.gcforest import GCForest
18 | from gcforest.utils.config_utils import load_json
19 |
20 | from sklearn.linear_model import LogisticRegression
21 | from sklearn import metrics
22 | from os import path, listdir
23 | from sklearn.datasets import load_svmlight_files
24 | from sklearn.model_selection import train_test_split
25 | from sklearn.externals import joblib
26 | from sklearn import preprocessing
27 | import numpy as np
28 | import pandas as pd
29 | import random
30 |
31 | from sklearn import metrics
32 | from sklearn.svm import SVC
33 |
34 |
35 | # ## 2) Data preparation
36 |
37 | # In[32]:
38 |
39 | # Data processing: read libSVM-format data, normalize it, split it into train/test samples, and build batches according to the batch-size parameters
40 | def process_data(data_path, feature_size, test_rat=0.3, random_seed=0, train_batch_size=5000, test_batch_size=5000):
41 |     # Read the files (body elided here; the full implementation is in 推荐系统算法实践—补充部分/第6-11和13章节/sklearn_others.py)
42 |     # Build the batches
43 | return {"train_batch": train_batch, "test_batch": test_batch, "X_train": X_train, "X_test": X_test, "Y_train": Y_train, "Y_test": Y_test}
44 |
45 | # In[40]:
46 |
47 | data_path = '/data/data01/'
48 | test_rat=0.4
49 | random_seed=0
50 | train_batch_size=20000
51 | test_batch_size=20000
52 | feature_size=530
53 |
54 | # Load the sample data
55 | data = process_data(data_path, feature_size, test_rat, random_seed, train_batch_size, test_batch_size)
56 |
57 | train_batch = data['train_batch']
58 | test_batch = data['test_batch']
59 |
60 | X_train = np.array(data['X_train'])
61 | y_train = np.array(data['Y_train']).reshape(-1,)
62 | X_test = np.array(data['X_test'])
63 | y_test = np.array(data['Y_test']).reshape(-1,)
64 |
65 | print X_train.shape
66 | print y_train.shape
67 | print X_test.shape
68 | print y_test.shape
69 |
70 |
71 | # In[41]:
72 |
73 | print X_train[0:2]
74 | print y_train[0:2]
75 |
76 | # (a, b), (c, d) = mnist.load_data()
77 | # print a.shape
78 | # print b.shape
79 | # print c.shape
80 | # print d.shape
81 | # print a[0:2]
82 | # print b[0:2]
83 |
84 |
85 | # ## 3) gcForest model
86 |
87 | # In[49]:
88 |
89 | # Model parameters
90 | def get_toy_config():
91 | config = {}
92 | ca_config = {}
93 | ca_config["random_state"] = 0
94 | ca_config["max_layers"] = 100
95 | ca_config["early_stopping_rounds"] = 3
96 | ca_config["n_classes"] = 2
97 | ca_config["estimators"] = []
98 | ca_config["estimators"].append({"n_folds": 2, "type": "RandomForestClassifier", "n_estimators": 10, "max_depth": None, "n_jobs": -1})
99 | ca_config["estimators"].append({"n_folds": 2, "type": "ExtraTreesClassifier", "n_estimators": 10, "max_depth": None, "n_jobs": -1})
100 | ca_config["estimators"].append({"n_folds": 2, "type": "LogisticRegression"})
101 | config["cascade"] = ca_config
102 | return config
103 |
104 |
105 | # In[54]:
106 |
107 | # Model configuration
108 | config = get_toy_config()
109 |
110 | # Model initialization
111 | gc = GCForest(config)
112 |
113 | # Model training
114 | X_train_enc = gc.fit_transform(X_train, y_train)
115 |
116 | # Model prediction
117 | y_pred = gc.predict(X_test)
118 | acc = accuracy_score(y_test, y_pred)
119 | print("Test Accuracy of GcForest = {:.2f} %".format(acc * 100))
120 |
121 | # Classification with xgboost/RF on the gcForest-encoded features
122 | X_test_enc = gc.transform(X_test)
123 | X_train_enc = X_train_enc.reshape((X_train_enc.shape[0], -1))
124 | X_test_enc = X_test_enc.reshape((X_test_enc.shape[0], -1))
125 | X_train_origin = X_train.reshape((X_train.shape[0], -1))
126 | X_test_origin = X_test.reshape((X_test.shape[0], -1))
127 | X_train_enc = np.hstack((X_train_origin, X_train_enc))
128 | X_test_enc = np.hstack((X_test_origin, X_test_enc))
129 | print("X_train_enc.shape={}, X_test_enc.shape={}".format(X_train_enc.shape, X_test_enc.shape))
130 | clf = RandomForestClassifier(n_estimators=50, max_depth=None, n_jobs=-1)
131 | clf.fit(X_train_enc, y_train)
132 | y_pred = clf.predict(X_test_enc)
133 | acc = accuracy_score(y_test, y_pred)
134 | print("Test Accuracy of Other classifier using gcforest's X_encode = {:.2f} %".format(acc * 100))
135 |
136 |
137 | # In[55]:
138 |
139 | # Compute the AUC metrics
140 | probs_train= clf.predict_proba(X_train_enc)
141 | AUC1 = metrics.roc_auc_score(y_train, probs_train[:,1])
142 | print("Train Auc: %s"%(AUC1))
143 |
144 | probs_test= clf.predict_proba(X_test_enc)
145 | AUC2 = metrics.roc_auc_score(y_test, probs_test[:,1])
146 | print("Test Auc: %s"%(AUC2))
147 |
148 | # # dump
149 | # with open("test.pkl", "wb") as f:
150 | # pickle.dump(gc, f, pickle.HIGHEST_PROTOCOL)
151 | # # load
152 | # with open("test.pkl", "rb") as f:
153 | # gc = pickle.load(f)
154 | # y_pred = gc.predict(X_test)
155 | # acc = accuracy_score(y_test, y_pred)
156 | # print("Test Accuracy of GcForest (save and load) = {:.2f} %".format(acc * 100))
157 |
158 |
159 | # In[ ]:
160 |
161 | # dump
162 | with open("test.pkl", "wb") as f:
163 | pickle.dump(gc, f, pickle.HIGHEST_PROTOCOL)
164 | # load
165 | with open("test.pkl", "rb") as f:
166 | gc = pickle.load(f)
167 | y_pred = gc.predict(X_test)
168 | acc = accuracy_score(y_test, y_pred)
169 | print("Test Accuracy of GcForest (save and load) = {:.2f} %".format(acc * 100))
170 |
171 |
--------------------------------------------------------------------------------
/推荐系统算法实践—补充部分/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WallaceLiu/recommendation_algorithm/36cdd077446ab7a2831c168f7419b058bd2fcbb0/推荐系统算法实践—补充部分/.DS_Store
--------------------------------------------------------------------------------
/推荐系统算法实践—补充部分/第12章节/adult.names:
--------------------------------------------------------------------------------
1 | | This data was extracted from the census bureau database found at
2 | | http://www.census.gov/ftp/pub/DES/www/welcome.html
3 | | Donor: Ronny Kohavi and Barry Becker,
4 | | Data Mining and Visualization
5 | | Silicon Graphics.
6 | | e-mail: ronnyk@sgi.com for questions.
7 | | Split into train-test using MLC++ GenCVFiles (2/3, 1/3 random).
8 | | 48842 instances, mix of continuous and discrete (train=32561, test=16281)
9 | | 45222 if instances with unknown values are removed (train=30162, test=15060)
10 | | Duplicate or conflicting instances : 6
11 | | Class probabilities for adult.all file
12 | | Probability for the label '>50K' : 23.93% / 24.78% (without unknowns)
13 | | Probability for the label '<=50K' : 76.07% / 75.22% (without unknowns)
14 | |
15 | | Extraction was done by Barry Becker from the 1994 Census database. A set of
16 | | reasonably clean records was extracted using the following conditions:
17 | | ((AAGE>16) && (AGI>100) && (AFNLWGT>1)&& (HRSWK>0))
18 | |
19 | | Prediction task is to determine whether a person makes over 50K
20 | | a year.
21 | |
22 | | First cited in:
23 | | @inproceedings{kohavi-nbtree,
24 | | author={Ron Kohavi},
25 | | title={Scaling Up the Accuracy of Naive-Bayes Classifiers: a
26 | | Decision-Tree Hybrid},
27 | | booktitle={Proceedings of the Second International Conference on
28 | | Knowledge Discovery and Data Mining},
29 | | year = 1996,
30 | | pages={to appear}}
31 | |
32 | | Error Accuracy reported as follows, after removal of unknowns from
33 | | train/test sets):
34 | | C4.5 : 84.46+-0.30
35 | | Naive-Bayes: 83.88+-0.30
36 | | NBTree : 85.90+-0.28
37 | |
38 | |
39 | | Following algorithms were later run with the following error rates,
40 | | all after removal of unknowns and using the original train/test split.
41 | | All these numbers are straight runs using MLC++ with default values.
42 | |
43 | | Algorithm Error
44 | | -- ---------------- -----
45 | | 1 C4.5 15.54
46 | | 2 C4.5-auto 14.46
47 | | 3 C4.5 rules 14.94
48 | | 4 Voted ID3 (0.6) 15.64
49 | | 5 Voted ID3 (0.8) 16.47
50 | | 6 T2 16.84
51 | | 7 1R 19.54
52 | | 8 NBTree 14.10
53 | | 9 CN2 16.00
54 | | 10 HOODG 14.82
55 | | 11 FSS Naive Bayes 14.05
56 | | 12 IDTM (Decision table) 14.46
57 | | 13 Naive-Bayes 16.12
58 | | 14 Nearest-neighbor (1) 21.42
59 | | 15 Nearest-neighbor (3) 20.35
60 | | 16 OC1 15.04
61 | | 17 Pebls Crashed. Unknown why (bounds WERE increased)
62 | |
63 | | Conversion of original data as follows:
64 | | 1. Discretized agrossincome into two ranges with threshold 50,000.
65 | | 2. Convert U.S. to US to avoid periods.
66 | | 3. Convert Unknown to "?"
67 | | 4. Run MLC++ GenCVFiles to generate data,test.
68 | |
69 | | Description of fnlwgt (final weight)
70 | |
71 | | The weights on the CPS files are controlled to independent estimates of the
72 | | civilian noninstitutional population of the US. These are prepared monthly
73 | | for us by Population Division here at the Census Bureau. We use 3 sets of
74 | | controls.
75 | | These are:
76 | | 1. A single cell estimate of the population 16+ for each state.
77 | | 2. Controls for Hispanic Origin by age and sex.
78 | | 3. Controls by Race, age and sex.
79 | |
80 | | We use all three sets of controls in our weighting program and "rake" through
81 | | them 6 times so that by the end we come back to all the controls we used.
82 | |
83 | | The term estimate refers to population totals derived from CPS by creating
84 | | "weighted tallies" of any specified socio-economic characteristics of the
85 | | population.
86 | |
87 | | People with similar demographic characteristics should have
88 | | similar weights. There is one important caveat to remember
89 | | about this statement. That is that since the CPS sample is
90 | | actually a collection of 51 state samples, each with its own
91 | | probability of selection, the statement only applies within
92 | | state.
93 |
94 |
95 | >50K, <=50K.
96 |
97 | age: continuous.
98 | workclass: Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked.
99 | fnlwgt: continuous.
100 | education: Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool.
101 | education-num: continuous.
102 | marital-status: Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse.
103 | occupation: Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces.
104 | relationship: Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried.
105 | race: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black.
106 | sex: Female, Male.
107 | capital-gain: continuous.
108 | capital-loss: continuous.
109 | hours-per-week: continuous.
110 | native-country: United-States, Cambodia, England, Puerto-Rico, Canada, Germany, Outlying-US(Guam-USVI-etc), India, Japan, Greece, South, China, Cuba, Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador, Taiwan, Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, Yugoslavia, El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands.
111 |
--------------------------------------------------------------------------------
/推荐系统算法实践—补充部分/第14章节/others.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import tensorflow as tf
3 | import pandas as pd
4 | import random
5 | import math
6 | import re
7 | from os import path, listdir
8 | from tensorflow.contrib import layers
9 | from sklearn import metrics
10 | import time
11 | import datetime
12 |
13 | # ## 1) Data preparation in Dataset format
14 | # Per-line parsing; line format: 0229|0,0,0,0,0,0,0,0,0,0,0,0,0,0|1,1173,0,0,0|18578
15 | def decode_sequence(line, continuous_size, item_size):
16 | columns = tf.string_split([line], '|')
17 | normalized_continuous_features = tf.string_to_number(tf.string_split([columns.values[1]], ',').values[0:continuous_size], out_type=tf.int32, name = "normalized_continuous_features")
18 | hist_click = tf.string_to_number(tf.string_split([columns.values[2]], ',').values[0:item_size], out_type=tf.int32, name = "hist_click")
19 | label = tf.reshape(tf.string_to_number(columns.values[3], out_type=tf.float32, name = "label"), [-1])
20 | return {"label": label, "hist_click": hist_click, "normalized_continuous_features": normalized_continuous_features}
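# Editor's note: for the sample line above ("0229|0,0,0,0,0,0,0,0,0,0,0,0,0,0|1,1173,0,0,0|18578"),
# decode_sequence ignores the first field ("0229"), reads the second field into
# normalized_continuous_features, the third field into hist_click ([1, 1173, 0, 0, 0]),
# and the fourth field into the label ([18578.0]).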
21 |
22 | # File reading, using the Dataset API
23 | def read_my_file_format(data_type, filenames, continuous_size, item_size, batch_size, num_epochs=1):
24 |     # Read the files
25 | print filenames
26 | dataset = tf.data.TextLineDataset(filenames).map(lambda x: decode_sequence(x, continuous_size, item_size)).prefetch(batch_size).cache()
27 | dataset = dataset.repeat(num_epochs)
28 | dataset = dataset.batch(batch_size) # Batch size to use
29 | iterator = dataset.make_one_shot_iterator()
30 | next_element = iterator.get_next()
31 | return next_element
32 |
33 | # Build the file list
34 | def get_file_list(my_path):
35 | files = []
36 | if path.isdir(my_path):
37 | [files.append(path.join(my_path, p)) for p in listdir(my_path) if path.isfile(path.join(my_path, p))]
38 | else:
39 | files.append(my_path)
40 | return files
41 |
42 | # Data processing
43 | def process_data(data_type, my_path, continuous_size, item_size, batch_size=32, num_epochs=1):
44 | filenames = get_file_list(my_path)
45 | next_element = read_my_file_format(data_type, filenames, continuous_size, item_size, batch_size, num_epochs)
46 | return next_element
47 |
48 | # In[15]:
49 |
50 | # Test data
51 | filenames = '/data/001'
52 | continuous_size = 16
53 | item_size = 5
54 | batch_size = 3
55 | num_epochs = 1
56 | data_type = 'sequence'
57 | next_element = process_data(data_type, filenames, continuous_size, item_size, batch_size, num_epochs)
58 |
59 | # ## 2) Defining the YouTubeNet model (excerpt code; the self.* references belong to the surrounding model class, cf. 第14章YouTube/YouTubeNet.py)
60 | """ 6 Multi-layer perceptron, producing the user embedding vector U: [batch_size, embedding_size] """
61 | print("6 Multi-layer perceptron computation")
62 | with tf.name_scope('MLP'):
63 | with tf.variable_scope("MLP", reuse=tf.AUTO_REUSE):
64 |         # Layer 1: (embedding_size + normalized_continuous_features_length) x embedding_size
65 |         # Layer 2: embedding_size x embedding_size
66 | weights = {
67 | 'h1': tf.Variable(tf.random_normal([self.embedding_size + self.normalized_continuous_features_length, self.embedding_size])),
68 | 'h2': tf.Variable(tf.random_normal([self.embedding_size, self.embedding_size]))
69 | }
70 | biases = {
71 | 'b1': tf.Variable(tf.random_normal([self.embedding_size])),
72 | 'out': tf.Variable(tf.random_normal([self.embedding_size]))
73 | }
74 | print("%s: %s" % ("weights", weights))
75 | print("%s: %s" % ("biases", biases))
76 | layer_1 = tf.add(tf.matmul(all_concat, weights['h1']), biases['b1'])
77 | layer_1 = tf.nn.relu(layer_1)
78 | print("%s: %s" % ("layer_1", layer_1))
79 | layer_2 = tf.add(tf.matmul(layer_1, weights['h2']), biases['out'])
80 | print("%s: %s" % ("layer_2", layer_2))
81 | layer_out = tf.nn.relu(layer_2)
82 | print("%s: %s" % ("layer_out", layer_out))
83 |
84 | """ 7 Softmax layer: multiply the user embedding U by the item embedding vectors V and apply Softmax; the loss uses NCE negative sampling """
85 | print("7 Final Softmax layer")
86 | with tf.name_scope('Softmax_Classifer'):
87 | with tf.variable_scope("softmax_classifer", reuse=tf.AUTO_REUSE):
88 | # NCE LOSS
89 | loss = tf.reduce_mean(
90 | tf.nn.nce_loss(
91 | weights=self.weights,
92 | biases=self.biases,
93 | labels=label,
94 | inputs=layer_out,
95 | num_sampled=self.num_sampled,
96 | num_classes=self.item_count
97 | )
98 | )
99 | print("%s: %s" % ("loss", loss))
100 |             # Loss optimizer
101 | train_step = tf.train.AdamOptimizer(learning_rate=self.learning_rate, beta1=0.9, beta2=0.999,epsilon=1e-8).minimize(loss)
102 |             # Softmax predictions
103 | out = tf.nn.softmax(tf.matmul(layer_out, tf.transpose(self.weights)) + self.biases, dim=1)
104 | print("%s: %s" % ("out", out))
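# Editor's note: a minimal retrieval sketch, not in the original source. In the model class this
# excerpt comes from, `out` holds the softmax scores over all items for each user in the batch,
# so the top-N candidate items per user can be read off with tf.nn.top_k:
top_n = 10  # illustrative value
top_item_scores, top_item_ids = tf.nn.top_k(out, k=top_n)
print("%s: %s" % ("top_item_ids", top_item_ids))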
--------------------------------------------------------------------------------
/推荐系统算法实践—补充部分/第5章节/other.py:
--------------------------------------------------------------------------------
1 | # ## 2) Data preparation in Dataset format
2 | # Per-line parsing; each line is a space-separated (center word, target word) pair, e.g.:
3 | # 5805 17357
4 | def decode_csv(line):
5 |     # Split by space and take the center word and the target word
6 | columns = tf.string_split([line], ' ')
7 | center_words = tf.reshape(tf.string_to_number(columns.values[0], out_type=tf.int32),[-1])
8 | target_words = tf.reshape(tf.string_to_number(columns.values[1], out_type=tf.int32),[-1])
9 | return {'center_words': center_words, 'target_words': target_words}
10 | # File reading, using the Dataset API
11 | def read_my_file_format(filenames, batch_size, num_epochs=1):
12 |     # Read the files
13 | dataset = tf.data.TextLineDataset(filenames).map(lambda x: decode_csv(x)).prefetch(batch_size).cache()
14 | dataset = dataset.repeat(num_epochs)
15 | dataset = dataset.batch(batch_size)
16 | iterator = dataset.make_one_shot_iterator()
17 | next_element = iterator.get_next()
18 | return next_element
19 | # 文件列表
20 | def get_file_list(my_path):
21 | files = []
22 | if path.isdir(my_path):
23 | [files.append(path.join(my_path, p)) for p in listdir(my_path) if path.isfile(path.join(my_path, p))]
24 | else:
25 | files.append(my_path)
26 | return files
27 | # Data processing
28 | def process_data(my_path, batch_size=32, num_epochs=1):
29 | filenames = get_file_list(my_path)
30 | next_element = read_my_file_format(filenames, batch_size, num_epochs)
31 | return next_element
32 |
33 | filenames = "./windows_skip_sample.csv"
34 | batch_size = 1000
35 | num_epochs = 200
36 | next_element = process_data(filenames, batch_size, num_epochs)
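# Editor's note: a minimal, hypothetical skip-gram training sketch showing how these batches are
# typically consumed (TF 1.x assumed; vocab_size, embed_size and num_sampled are illustrative
# values, not from the original source; the book's full model is in 第5章Word2vec/Word2vec.py):
vocab_size, embed_size, num_sampled = 20000, 128, 64
embeddings = tf.Variable(tf.random_uniform([vocab_size, embed_size], -1.0, 1.0))
nce_weights = tf.Variable(tf.truncated_normal([vocab_size, embed_size], stddev=0.1))
nce_biases = tf.Variable(tf.zeros([vocab_size]))
center = tf.reshape(next_element['center_words'], [-1])                        # [batch_size]
target = tf.cast(tf.reshape(next_element['target_words'], [-1, 1]), tf.int64)  # [batch_size, 1]
embed = tf.nn.embedding_lookup(embeddings, center)                             # [batch_size, embed_size]
loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weights, biases=nce_biases,
                                     labels=target, inputs=embed,
                                     num_sampled=num_sampled, num_classes=vocab_size))
train_step = tf.train.GradientDescentOptimizer(1.0).minimize(loss)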
--------------------------------------------------------------------------------
/推荐系统算法实践—补充部分/第6-11和13章节/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WallaceLiu/recommendation_algorithm/36cdd077446ab7a2831c168f7419b058bd2fcbb0/推荐系统算法实践—补充部分/第6-11和13章节/.DS_Store
--------------------------------------------------------------------------------
/推荐系统算法实践—补充部分/第6-11和13章节/data/00000:
--------------------------------------------------------------------------------
1 | 0.0 0:5.0 1:2.0 6:8.0 7:5.0 8:22.0 9:3.0 10:2.000020024E9 11:15.0 12:12.0 13:8.0 14:156011.0 15:156.0 16:1.560110001E9 17:6.0 18:1363200.0 19:921600.0 20:1388.0 21:14246.0 22:850.0 23:12378.0 24:35.0 29:1.0 30:2.0 114:1.0 120:1.0 125:2.0 126:1.0 129:1.0 132:2.0 134:2.0 136:2.0 138:4.0 147:1.0 149:1.0 158:1.0 159:1.0 166:1.0 168:1.0 176:2.0 181:1.0 186:1.0 211:1.0 212:1.0 216:1.0 221:1.0 223:1.0 227:1.0 232:1.0 233:1.0 237:1.0 241:2.0 244:1.0 246:1.0 247:1.0 250:3.0 251:1.0 253:1.0 254:1.0 258:3.0 259:626.0 260:614.0 261:626.0 262:595.0 263:637.0 264:662.0 265:4.0 482:5.5 499:1.0
2 | 0.0 114:1.0 120:1.0 125:1.0 126:1.0 132:1.0 134:1.0 136:5.0 138:5.0 158:2.0 159:1.0 161:1.0 169:1.0 174:1.0 176:2.0 181:1.0 201:1.0 205:2.0 211:1.0 212:1.0 216:1.0 220:1.0 227:1.0 228:1.0 233:1.0 240:1.0 250:2.0 253:1.0 254:1.0 259:671.0 260:654.0 261:671.0 262:652.0 263:684.0 264:695.0 271:0.54714 274:0.57133 276:0.00557 278:0.03361 281:0.5464 282:0.0 321:0.54714 349:0.57133 365:0.00557 376:0.03361 392:0.52395 393:1.8E-4 395:0.01914 396:0.09437 397:0.5464 402:0.00487 404:0.0 487:1.0
3 | 0.0 0:5.0 1:1.0 6:11.0 7:8.0 8:21.0 9:5.0 10:2.000020024E9 11:16.0 12:24.0 13:4.0 14:156063.0 15:156.0 16:1.560630001E9 17:6.0 18:2265600.0 19:2073600.0 20:2835.0 21:14470.0 22:1609.0 23:12198.0 24:31.0 30:3.0 35:1.0 36:1.0 46:1.0 79:1.0 87:1.0 114:1.0 120:1.0 125:1.0 126:1.0 132:1.0 134:1.0 136:1.0 138:9.0 147:1.0 151:3.0 152:2.0 155:1.0 166:1.0 168:1.0 169:1.0 172:1.0 176:2.0 181:2.0 205:1.0 208:1.0 211:1.0 212:1.0 216:2.0 219:1.0 220:1.0 221:1.0 223:1.0 227:1.0 228:1.0 241:1.0 245:1.0 250:2.0 253:1.0 254:1.0 258:6.0 259:675.0 260:683.0 261:675.0 262:648.0 263:684.0 264:686.0 265:4.0 267:0.30439 273:0.04025 274:0.33068 275:0.14469 276:0.04278 281:0.84341 284:0.15797 285:0.02921 293:0.30439 344:0.04025 349:0.33068 351:7.8E-4 354:0.04441 358:0.14469 365:0.04278 392:0.84341 393:0.04843 395:0.0 396:0.53333 397:0.08287 401:0.4149 417:0.05761 419:0.15797 420:0.14199 421:0.09726 423:0.01132 424:0.21702 425:0.27164 426:0.72403 432:0.02921 483:156.0 484:156063.0 485:1.560630001E9 490:1.0
4 | 0.0 135:1.0 136:2.0 138:5.0 144:1.0 147:1.0 149:1.0 151:5.0 152:3.0 154:1.0 156:1.0 176:1.0 181:1.0 208:1.0 211:4.0 212:4.0 215:1.0 227:1.0 229:1.0 250:3.0 253:1.0 254:1.0 257:1.0 267:0.37017 268:0.04224 274:0.05251 275:0.21933 276:0.0 278:0.02015 281:1.0 293:0.37017 296:0.11551 298:0.04224 349:0.05251 358:0.15732 361:0.00786 364:0.0 366:0.08654 367:0.04391 376:0.02015 392:0.66854 394:1.0 397:1.0 486:1.0
5 | 0.0 0:7.0 1:2.0 6:5.0 7:5.0 8:18.0 9:2.0 10:2.000020024E9 11:8.0 12:8.0 13:4.0 14:156044.0 15:156.0 16:1.560440007E9 17:3.0 18:1190400.0 19:518400.0 20:1369.0 21:7063.0 22:599.0 23:5445.0 24:68.0 35:2.0 36:1.0 46:1.0 79:1.0 87:1.0 89:1.0 92:1.0 96:1.0 102:1.0 114:1.0 120:1.0 125:2.0 126:2.0 136:3.0 138:4.0 149:1.0 176:1.0 201:1.0 205:1.0 220:1.0 221:1.0 223:1.0 227:1.0 228:1.0 233:1.0 237:1.0 250:1.0 253:1.0 258:6.0 259:687.0 260:678.0 261:687.0 262:672.0 263:691.0 264:707.0 265:5.0 267:0.49999 269:0.04716 280:0.13575 281:0.53648 284:0.10593 293:0.49999 309:0.04716 387:0.13575 392:0.53648 396:0.13134 419:0.10593 421:0.07479 482:5.8 483:156.0 484:156044.0 485:1.560440007E9 507:1.0
6 | 0.0 0:2.0 1:2.0 4:1381.57 6:8.0 7:5.0 8:22.0 9:2.0 10:2.000020024E9 11:8.0 12:8.0 13:4.0 14:156033.0 15:156.0 16:1.560330002E9 17:3.0 18:1190400.0 19:921600.0 20:1389.0 21:6881.0 22:500.0 23:5038.0 24:14.0 25:5.0 29:0.0 31:1381.57 136:1.0 138:2.0 221:1.0 223:1.0 250:2.0 251:1.0 253:1.0 258:5.0 259:629.0 260:617.0 261:629.0 262:599.0 263:640.0 264:664.0 265:1.0 267:0.65312 268:0.4772 275:0.0 281:0.59912 293:0.65312 298:0.4772 361:0.0 392:0.59912 482:5.0 489:1.0
7 | 0.0 0:4.0 1:1.0 6:8.0 7:5.0 8:18.0 9:2.0 10:2.000020024E9 11:8.0 12:8.0 13:4.0 14:156033.0 15:156.0 16:1.560330001E9 17:3.0 18:1190400.0 19:921600.0 20:1368.0 21:7064.0 22:699.0 23:5296.0 24:29.0 30:1.0 35:2.0 36:1.0 41:1.0 46:1.0 71:1.0 76:1.0 125:2.0 126:2.0 136:4.0 138:4.0 142:1.0 149:1.0 151:2.0 152:1.0 155:1.0 205:1.0 216:3.0 219:1.0 220:1.0 227:1.0 228:1.0 250:1.0 253:1.0 258:4.0 259:645.0 260:647.0 261:646.0 262:594.0 263:663.0 264:683.0 265:3.0 274:0.8074 281:1.0 282:0.09744 284:0.35719 349:0.53074 351:0.8074 391:5.4E-4 392:0.81208 393:0.54078 394:0.00257 396:0.51813 397:1.0 405:0.09744 415:0.33152 418:0.35719 482:5.25 483:156.0 484:156033.0 485:1.560330001E9 487:1.0
8 | 0.0 10:2.000020024E9 114:1.0 120:1.0 125:1.0 126:1.0 136:2.0 138:6.0 151:1.0 157:1.0 176:4.0 181:2.0 184:1.0 201:1.0 211:1.0 212:1.0 216:2.0 221:2.0 223:1.0 224:1.0 233:5.0 236:2.0 237:1.0 240:2.0 250:1.0 253:1.0 259:666.0 260:648.0 261:666.0 262:652.0 263:679.0 264:687.0 267:0.77939 268:0.22744 269:0.06879 271:0.06818 272:0.32865 273:0.12341 274:0.25096 276:0.05982 281:0.51142 282:0.0 284:1.0 285:0.89039 289:0.06818 293:0.77939 298:0.22744 308:0.2133 309:0.06879 324:0.06818 335:0.32865 344:0.12341 349:0.25096 351:0.03988 364:0.05982 392:0.51142 396:0.26317 400:0.10105 404:0.0 418:1.0 432:0.89039 450:0.06818 483:156.0 484:156037.0 485:1.560370002E9 487:1.0
9 | 0.0 0:4.0 1:2.0 4:6174.0 6:8.0 7:5.0 8:22.0 9:2.0 10:2.000020024E9 11:8.0 12:12.0 13:4.0 14:156013.0 15:156.0 16:1.56013001E9 17:3.0 18:1300000.0 19:921600.0 20:1399.0 21:6885.0 22:569.0 23:5155.0 24:27.0 25:7.0 29:0.0 30:1.0 31:6174.0 35:1.0 36:1.0 42:1.0 136:2.0 138:2.0 145:1.0 147:1.0 149:1.0 151:1.0 155:1.0 216:1.0 219:1.0 221:2.0 223:2.0 250:1.0 253:1.0 258:3.0 259:690.0 260:681.0 261:690.0 262:672.0 263:698.0 264:710.0 265:3.0 267:0.01275 268:1.0E-5 269:0.09674 273:0.10097 274:0.07664 276:0.00689 278:0.04202 281:6.3E-4 282:0.0 284:0.14291 293:0.01275 298:1.0E-5 308:0.09425 309:0.00198 344:0.10097 345:0.00794 349:0.05015 351:0.17359 364:0.00689 376:0.04202 392:0.54585 394:6.3E-4 396:0.1807 402:0.0 418:0.14291 482:5.0 483:156.0 484:156013.0 485:1.560130006E9 492:1.0
10 | 0.0 259:650.0 260:632.0 261:650.0 262:630.0 263:661.0 264:680.0 267:0.0 281:0.14551 293:0.0 392:0.0 394:0.0 397:0.16667 500:1.0
11 |
--------------------------------------------------------------------------------
/推荐系统算法实践—补充部分/第6-11和13章节/data/00001:
--------------------------------------------------------------------------------
1 | 0.0 10:2.000020028E9 29:0.0 136:1.0 138:7.0 140:2.0 144:1.0 147:1.0 149:2.0 176:8.0 181:4.0 184:1.0 186:1.0 189:1.0 201:1.0 211:1.0 212:1.0 215:1.0 216:2.0 221:2.0 223:1.0 224:1.0 233:1.0 237:1.0 241:1.0 243:1.0 250:3.0 254:1.0 255:1.0 256:1.0 259:692.0 260:675.0 261:692.0 262:682.0 263:700.0 264:711.0 267:0.52416 268:0.06194 269:0.35611 271:0.85714 273:0.45089 274:0.01163 275:0.85714 276:0.00545 279:0.0026 281:0.95455 282:0.27735 289:0.02589 293:0.52416 298:0.06194 309:0.35611 321:0.8 322:0.85714 324:0.02589 344:0.45089 349:0.01163 358:0.0 359:0.85714 361:0.03223 365:0.00545 383:0.0026 392:0.73692 394:0.07727 396:0.08167 398:0.95455 403:0.27735 450:0.02589 516:1.0
2 | 0.0 6:11.0 7:9.0 8:24.0 9:11.0 10:2.000020024E9 11:19.0 12:18.0 13:4.0 17:8.0 18:1593600.0 19:2073600.0 20:3416.0 21:59662.0 22:4850.0 23:55736.0 35:3.0 71:1.0 74:1.0 89:1.0 92:1.0 96:1.0 102:1.0 106:1.0 110:1.0 114:1.0 120:1.0 125:6.0 127:4.0 128:1.0 129:1.0 130:1.0 131:1.0 132:1.0 134:1.0 136:3.0 138:8.0 140:2.0 145:1.0 147:1.0 149:1.0 151:2.0 153:1.0 154:1.0 155:1.0 158:2.0 161:1.0 163:1.0 164:1.0 169:3.0 171:1.0 172:1.0 174:1.0 176:8.0 181:3.0 184:2.0 186:1.0 189:1.0 197:1.0 205:1.0 208:1.0 209:1.0 211:1.0 212:1.0 216:2.0 217:3.0 219:1.0 221:5.0 223:3.0 224:2.0 233:3.0 237:1.0 240:2.0 241:4.0 244:1.0 247:2.0 249:1.0 250:4.0 251:1.0 253:1.0 254:2.0 259:703.0 260:685.0 261:703.0 262:687.0 263:707.0 264:734.0 267:0.16675 268:0.0 269:0.10147 271:0.17491 272:0.10146 273:0.00631 274:0.22913 275:0.0 277:0.02697 280:0.04946 281:0.17956 282:0.0014 284:0.03471 289:0.17491 293:0.09711 296:0.04707 298:0.0 309:0.10147 324:0.17491 335:0.0 337:0.01117 344:0.00631 349:0.03624 351:0.2042 354:0.37031 358:0.0 369:0.02697 388:0.0308 389:0.04946 391:0.0 392:0.00499 394:0.06772 395:0.00109 396:0.0 398:0.0 403:0.0014 418:0.0 419:0.03471 420:0.0325 450:0.17491 483:156.0 484:156013.0 485:1.56013001E9 491:1.0
3 | 0.0 114:1.0 120:1.0 125:1.0 126:1.0 136:1.0 138:5.0 140:1.0 151:3.0 154:3.0 158:4.0 159:1.0 161:3.0 176:2.0 181:2.0 205:1.0 211:1.0 212:1.0 220:1.0 221:1.0 223:1.0 227:1.0 228:1.0 233:1.0 237:1.0 241:1.0 246:1.0 250:1.0 253:1.0 259:703.0 260:739.0 261:703.0 262:693.0 263:703.0 264:679.0 267:0.56983 268:0.0399 269:0.30573 273:0.01961 275:0.0 276:1.0 278:0.2631 280:1.0 281:0.96242 284:0.0 293:0.56983 298:0.0399 309:0.30573 344:0.01961 346:0.04637 347:0.28233 358:0.0 364:0.37661 367:1.0 376:0.28109 387:1.0 391:0.00943 392:0.96242 395:0.0 416:0.0 483:156.0 484:156037.0 485:1.560370002E9 487:1.0
4 | 0.0 0:3.0 1:1.0 6:11.0 7:8.0 8:21.0 9:8.0 10:2.000020024E9 12:22.0 13:4.0 14:156044.0 15:156.0 16:1.560440001E9 18:1900000.0 19:4096000.0 20:2774.0 22:3388.0 24:21.0 258:4.0 259:620.0 260:612.0 261:621.0 262:582.0 263:635.0 264:657.0 265:2.0 274:0.01803 275:0.05543 281:0.5 282:0.0 349:0.01803 358:0.05543 392:0.5 394:0.0 402:0.0 483:156.0 484:156044.0 485:1.560440001E9 490:1.0
5 | 0.0 35:1.0 71:1.0 74:1.0 114:2.0 120:2.0 125:4.0 127:3.0 128:1.0 132:1.0 134:1.0 136:2.0 138:9.0 140:1.0 145:1.0 151:2.0 154:1.0 155:1.0 166:3.0 168:3.0 169:1.0 174:1.0 176:6.0 181:4.0 186:1.0 196:1.0 208:1.0 211:1.0 212:1.0 215:1.0 216:2.0 219:1.0 221:1.0 223:1.0 233:3.0 237:1.0 240:2.0 250:3.0 253:1.0 254:1.0 256:1.0 259:624.0 260:612.0 261:624.0 262:593.0 263:635.0 264:660.0 486:1.0
6 | 0.0 6:11.0 7:9.0 8:26.0 9:11.0 10:2.000020024E9 11:19.0 12:22.0 13:8.0 17:8.0 18:1900800.0 19:2242080.0 20:3286.0 21:59401.0 22:4899.0 23:55243.0 259:691.0 260:684.0 261:690.0 262:679.0 263:690.0 264:710.0 482:5.8 488:1.0
7 | 0.0 29:1.0 273:0.14085 281:0.56153 282:1.0 285:0.9156 344:0.14085 391:0.06023 392:0.56153 393:0.08851 398:0.04462 403:1.0 432:0.9156 483:156.0 484:156022.0 485:1.560220002E9 496:1.0
8 | 0.0 121:1.0 122:1.0 125:1.0 129:1.0 138:3.0 166:1.0 168:1.0 176:1.0 181:1.0 211:1.0 212:1.0 241:1.0 245:1.0 250:1.0 254:1.0 267:0.0 272:0.0 273:0.06312 274:0.12618 275:0.03488 276:0.51499 281:0.70897 284:0.24869 293:0.00134 295:0.0 335:0.0 344:0.06312 349:0.10628 351:0.12618 358:0.03488 367:0.51499 391:0.20173 392:0.70897 393:0.19528 395:0.0 396:0.00644 416:0.24869 417:0.17722 419:0.10241 421:0.01178 424:2.1E-4 427:0.0063 428:0.03733 430:0.02265 510:1.0
9 | 0.0 6:3.0 7:3.0 8:15.0 10:2.000020024E9 11:4.0 12:5.0 13:1.0 17:1.0 18:1008000.0 19:384000.0 20:645.0 21:2979.0 22:499.0 23:1727.0 29:1.0 125:1.0 129:1.0 136:1.0 138:2.0 166:1.0 168:1.0 176:2.0 181:1.0 201:1.0 211:1.0 214:1.0 216:2.0 241:1.0 247:1.0 250:1.0 253:1.0 259:632.0 260:620.0 261:633.0 262:595.0 263:640.0 264:678.0 275:0.02185 281:1.0 358:0.02185 392:1.0 396:0.01678 482:4.0 499:1.0
10 | 0.0 4:1024.0 6:8.0 7:5.0 8:22.0 9:2.0 10:2.000020024E9 11:8.0 12:8.0 13:4.0 17:3.0 18:1190400.0 19:921600.0 20:1389.0 21:6881.0 22:500.0 23:5038.0 25:4.0 29:0.0 31:1024.0 35:1.0 36:1.0 46:1.0 121:1.0 122:1.0 136:2.0 138:2.0 147:1.0 250:1.0 253:1.0 259:651.0 260:638.0 261:651.0 262:612.0 263:666.0 264:692.0 275:0.01446 276:0.0492 278:0.25841 281:0.61349 282:0.0 284:0.53165 358:0.01446 364:0.0492 375:0.25841 391:0.0 392:0.61349 396:0.23096 404:0.0 416:0.53165 421:0.08423 426:0.09111 427:0.0 482:5.0 483:156.0 484:156043.0 485:1.560430001E9 486:1.0
11 |
--------------------------------------------------------------------------------
/推荐系统算法实践—补充部分/第6-11和13章节/sklearn_others.py:
--------------------------------------------------------------------------------
1 | from sklearn.linear_model import LogisticRegression
2 | from sklearn import metrics
3 | from os import path, listdir
4 | from sklearn.datasets import load_svmlight_files
5 | from sklearn.model_selection import train_test_split
6 | from sklearn.externals import joblib
7 | from sklearn import preprocessing
8 | import numpy as np
9 | import pandas as pd
10 | import random
11 | import platform; print("Python Version: %s"%(platform.python_version()))
12 |
13 |
14 | # ## 2) Data preparation
15 | def process_data(data_path, feature_size, test_rat=0.3, random_seed=0, train_batch_size=5000, test_batch_size=5000):
16 |     # Read the files
17 | filenames = get_file_list(data_path)
18 | data = load_svmlight_files(filenames, n_features=feature_size, dtype=np.float32)
19 |     # Merge all files
20 | merger_x = data[0].toarray()
21 | merger_y = data[1].reshape(-1, 1)
22 | for i in range(2,len(data)):
23 | if i % 2 == 0:
24 | x = data[i].toarray()
25 | merger_x=np.vstack((merger_x, x))
26 | else:
27 | y = data[i].reshape(-1, 1)
28 | merger_y=np.vstack((merger_y, y))
29 |
30 |     # Build the x and y DataFrames
31 | feature_col = range(1,(feature_size + 1))
32 | x_frame = pd.DataFrame(merger_x ,columns=feature_col)
33 | y_frame = pd.DataFrame(merger_y)
34 |     # Normalize the data
35 | minmax_scala=preprocessing.MinMaxScaler(feature_range=(0,1))
36 | scalafeature=minmax_scala.fit_transform(x_frame)
37 | scalafeature_frame = pd.DataFrame(scalafeature ,columns=x_frame.columns)
38 |     # Generate the training and test samples
39 | X_train, X_test, Y_train, Y_test = train_test_split(scalafeature_frame, y_frame, test_size=test_rat, random_state=random_seed)
40 |     # Build the batches
41 | all_train = pd.concat([Y_train, X_train], axis=1)
42 | all_test = pd.concat([Y_test, X_test], axis=1)
43 | xy_train = np.array(all_train).reshape(-1, feature_size + 1)
44 | xy_test = np.array(all_test).reshape(-1, feature_size + 1)
45 | train_batch = split_batch(xy_train, train_batch_size)
46 | test_batch = split_batch(xy_test, test_batch_size)
47 | return {"train_batch": train_batch, "test_batch": test_batch, "X_train": X_train, "X_test": X_test, "Y_train": Y_train, "Y_test": Y_test}
48 |
49 | # Split the data into batches of size batch_size and return the list of batches
50 | def split_batch(xy_data, batch_size=5000):
51 |     # Compute the number of batches
52 | all_len=xy_data.shape[0]
53 | n=int(round(float(all_len)/batch_size))
54 | if n == 0:
55 | n = 1
56 | data_batch=[]
57 |     # Generate each batch
58 | for i in range(n):
59 | k1=i*batch_size
60 | if i < n-1:
61 | k2=(i+1)*batch_size
62 | elif i == (n-1) and (i+1)*batch_size <= all_len:
63 | k2=all_len
64 | else:
65 | k2=(i+1)*batch_size
66 | batch=xy_data[k1:k2,:]
67 | data_batch.append(batch)
68 | return data_batch
69 |
70 | # Given a file or directory path, return the list of file paths
71 | def get_file_list(my_path):
72 | files = []
73 | if path.isdir(my_path):
74 | [files.append(path.join(my_path, p)) for p in listdir(my_path) if path.isfile(path.join(my_path, p))]
75 | else:
76 | files.append(my_path)
77 | return files
78 |
79 |
80 | # Quick test of the data pipeline
81 | data_path = '/data'
82 | test_rat=0.4
83 | random_seed=0
84 | train_batch_size=2000
85 | test_batch_size=2000
86 | feature_size=530
87 |
88 | # Load the sample data
89 | data = process_data(data_path, feature_size, test_rat, random_seed, train_batch_size, test_batch_size)
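# Editor's note: a minimal usage sketch, not in the original source. Each returned batch is a
# numpy array whose column 0 is the label and whose remaining columns are the features, matching
# the pd.concat([Y, X]) order used in process_data above:
for batch in data['train_batch']:
    y_batch = batch[:, 0]
    x_batch = batch[:, 1:]
    print(x_batch.shape, y_batch.shape)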
--------------------------------------------------------------------------------
/推荐系统算法实践—补充部分/第6-11和13章节/spark_others.scala:
--------------------------------------------------------------------------------
1 | /**
2 |  * Read libSVM-format files and generate training and test samples.
3 |  * 1) Read the files
4 |  * 2) Build the feature index
5 |  * 3) Parse the samples
6 |  * 4) Split the samples
7 |  */
8 | def readLibSvmSampleData(
9 | @transient spark: org.apache.spark.sql.SparkSession,
10 | dataPath: String): (Dataset[LabeledPoint], Dataset[LabeledPoint]) = {
11 | import spark.implicits._
12 |   // 2.1 Read the samples
13 | val dataRead = spark.read.options(Map(("delimiter", "|"), ("header", "false"))).csv(dataPath)
14 |   // 2.2 Collect all feature ids in the samples and build an index map
15 | val featureMap = dataRead.map {
16 | case Row(libSvmFeatrue: String) =>
17 | val items = libSvmFeatrue.split(' ')
18 | val features = items.filter(_.nonEmpty).
19 | filter(f => f.split(':').size == 2).
20 | map { item =>
21 | val indexAndValue = item.split(':')
22 | indexAndValue(0)
23 | }
24 | features
25 | }.flatMap(x => x).distinct().collect().sorted.zipWithIndex.toMap
26 | val numFeatures = featureMap.size
27 |   // 2.3 Parse and standardize the samples
28 | val readSampleData = dataRead.map {
29 | case Row(libSvmFeatrue: String) =>
30 | val items = libSvmFeatrue.split(' ')
31 | val click = items(0).toString().toDouble
32 | val features = items.filter(_.nonEmpty).
33 | filter(f => f.split(':').size == 2).
34 | map { item =>
35 | val indexAndValue = item.split(':')
36 | val id = featureMap.getOrElse(indexAndValue(0), -1)
37 | val value = indexAndValue(1).toDouble
38 | (id, value)
39 |         }.filter(f => f._1 >= 0).sortBy(f => f._1) // keep feature index 0; drop only ids missing from featureMap (-1)
40 | val label = if (click > 0) 1.0 else 0.0
41 | LabeledPoint(label, Vectors.sparse(numFeatures, features.map(_._1), features.map(_._2)))
42 | }
43 |   // 2.4 Split the samples
44 | val splits = readSampleData.randomSplit(Array(0.6, 0.4))
45 | val training = splits(0)
46 | val test = splits(1)
47 | (training, test)
48 | }
49 |
50 | /**
51 |  * Transform the samples with the GBDT model to generate new samples.
52 |  * Each sample is passed through every tree; the leaf node it lands in becomes a new feature.
53 |  * @param sampleLablePoint training samples, as RDD[LabeledPoint].
54 |  * @param lrFeatureMap leaf-node index map of the GBDT model.
55 |  * @param gbdtModel the GBDT model
56 |  * @return RDD[LabeledPoint]
57 |  */
58 | def lrSample(
59 | sampleLablePoint: RDD[LabeledPoint],
60 | lrFeatureMap: Map[String, Int],
61 | gbdtModel: GradientBoostedTreesModel): RDD[LabeledPoint] = {
62 | val treeNumber = gbdtModel.trees.length
63 | val lrFeatureNum = lrFeatureMap.size
64 | val lrSampleParsed = sampleLablePoint.map { point =>
65 | val label = point.label
66 | val features = point.features
67 | val lrFeatures = ArrayBuffer[Int]()
68 | val lrValues = ArrayBuffer[Double]()
69 | val treeNumber = gbdtModel.trees.size
70 | for (treeIndex <- 0 to (treeNumber - 1)) {
71 | var node = gbdtModel.trees(treeIndex).topNode
72 | while (!node.isLeaf) {
73 | if (node.split.get.featureType == Continuous) {
74 | if (features(node.split.get.feature) <= node.split.get.threshold)
75 | node = node.leftNode.get
76 | else
77 | node = node.rightNode.get
78 | } else {
79 | if (node.split.get.categories.contains(features(node.split.get.feature)))
80 | node = node.leftNode.get
81 | else
82 | node = node.rightNode.get
83 | }
84 | }
85 | val key = treeIndex.toString + '_' + node.id
86 |
87 | lrFeatures += lrFeatureMap(key)
88 | lrValues += 1
89 | }
90 | (label, lrFeatures.sorted.toArray, lrValues.toArray)
91 | }
92 | val lrSamplLablePoint = lrSampleParsed.map {
93 | case (label, lrFeatures, lrValues) =>
94 | LabeledPoint(label, Vectors.sparse(lrFeatureNum, lrFeatures, lrValues))
95 | }
96 | (lrSamplLablePoint)
97 | }
98 |
99 | /**
100 |  * Parse the leaf nodes of the GBDT model.
101 |  * @param gbdtModel the GBDT model.
102 |  * @return Map[String, Int] of all trees' leaf nodes and their indices, keyed as (treeId_leafNodeId, index)
103 |  */
104 | def getTreeLeafMap(gbdtModel: GradientBoostedTreesModel): Map[String, Int] = {
105 | val lrFeatureMap = scala.collection.mutable.Map[String, Int]()
106 | var featureId = 0
107 | val treeNumber = gbdtModel.trees.size
108 | for (treeIndex <- 0 to (treeNumber - 1)) {
109 | val treeNodeQueue = collection.mutable.Queue[Node]()
110 | val rootNode = gbdtModel.trees(treeIndex).topNode
111 | treeNodeQueue.enqueue(rootNode)
112 | while (!treeNodeQueue.isEmpty) {
113 | val resNode = treeNodeQueue.dequeue()
114 | if (resNode.isLeaf) {
115 | val key = treeIndex.toString + '_' + resNode.id.toString()
116 | lrFeatureMap(key) = featureId
117 | featureId = featureId + 1
118 | }
119 | if (resNode.leftNode.isDefined)
120 | treeNodeQueue.enqueue(resNode.leftNode.get)
121 | if (resNode.rightNode.isDefined)
122 | treeNodeQueue.enqueue(resNode.rightNode.get)
123 | }
124 | }
125 | (lrFeatureMap.toMap)
126 | }
127 |
128 |
--------------------------------------------------------------------------------
/推荐系统算法实践—补充部分/第6-11和13章节/tf_others.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import tensorflow as tf
3 | import pandas as pd
4 | import random
5 | import math
6 | import re
7 |
8 | from os import path, listdir
9 | from sklearn import metrics
10 | from tensorflow.contrib import layers
11 |
12 | import time
13 | import datetime
14 |
15 | # ## 2) Data preparation in Dataset format
16 |
17 | # In[6]:
18 |
19 | """
20 | Parse a CSV-format line: for each input sample line, parse it and return labels and dense_vector data.
21 | Example input CSV string: 0.0,0.6666666666666666,0.5,0.0,0.0,0.0,0.0,0.7272727272727273,0.42857142857142855
22 | Arguments:
23 |    line: the string to parse;
24 |    feature_size: the feature length;
25 | Returns:
26 |    a dict of the form {'labels': labels, 'dense_vector': dense_vector}
27 |    labels: the sample's labels;
28 |    dense_vector: the sample's feature vector;
29 | """
30 | def decode_csv(line, feature_size):
31 |     # Split by ',' and take the label and the features
32 | columns = tf.string_split([line], ',')
33 | labels = tf.reshape(tf.string_to_number(columns.values[0], out_type=tf.float32),[-1])
34 | dense_vector = tf.reshape(tf.string_to_number(columns.values[1:feature_size + 1], out_type=tf.float32),[feature_size])
35 | return {'labels': labels, 'dense_vector': dense_vector}
36 |
37 | """
38 | Read files using the Dataset API.
39 | Arguments:
40 |    data_type: the file format;
41 |    filenames: the file paths;
42 |    batch_size: the batch size;
43 |    feature_size: the feature length;
44 |    num_epochs: how many times to repeat the samples;
45 | Returns:
46 |    the next-element op of the Dataset iterator
47 | """
48 | def read_my_file_format(data_type, filenames, feature_size, batch_size, num_epochs=1):
49 |     # Read the files
50 | print filenames
51 | dataset = tf.data.TextLineDataset(filenames).map(lambda x: decode_csv(x, feature_size)).prefetch(batch_size).cache()
52 | dataset = dataset.repeat(num_epochs)
53 | dataset = dataset.batch(batch_size) # Batch size to use
54 | iterator = dataset.make_one_shot_iterator()
55 | next_element = iterator.get_next()
56 | return next_element
57 |
58 | # Build the file list
59 | def get_file_list(my_path):
60 | files = []
61 | if path.isdir(my_path):
62 | [files.append(path.join(my_path, p)) for p in listdir(my_path) if path.isfile(path.join(my_path, p))]
63 | else:
64 | files.append(my_path)
65 | return files
66 |
67 | # Data processing
68 | def process_data(data_type, my_path, feature_size, batch_size=32, num_epochs=1):
69 | filenames = get_file_list(my_path)
70 | next_element = read_my_file_format(data_type, filenames, feature_size, batch_size, num_epochs)
71 | return next_element
72 |
73 | # Test data
74 | filenames = '/data/csv-00000'
75 | feature_size = 530
76 | batch_size = 3
77 | num_epochs = 1
78 | data_type = 'csv'
79 | next_element = process_data(data_type, filenames, feature_size, batch_size, num_epochs)
80 | print next_element['dense_vector']
81 | print next_element['labels']
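# Editor's note: a minimal usage sketch, not in the original source (TF 1.x Session API assumed),
# showing how to pull one batch of concrete values from the iterator defined above:
with tf.Session() as sess:
    batch = sess.run(next_element)
    print(batch['labels'].shape)        # (batch_size, 1)
    print(batch['dense_vector'].shape)  # (batch_size, feature_size)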
82 |
--------------------------------------------------------------------------------