├── .DS_Store ├── .gitignore ├── README.md ├── 推荐系统算法实践—源码下载 ├── .DS_Store ├── 第11章DNN │ └── DNN.py ├── 第12章Wide & Deep模型 │ ├── WideAndDeep.py │ ├── adult.data │ ├── adult.names │ └── adult.test ├── 第13章DeepFM模型 │ └── DeepFM.py ├── 第14章YouTube │ └── YouTubeNet.py ├── 第15章基于电商平台的商品召回 │ └── myGoodsRecall.scala ├── 第16章基于逻辑回归的音乐评分预测 │ └── LR.scala ├── 第17章Kaggle竞赛之Outbrain点击率预估 │ ├── EnsembleTree.scala │ ├── FFM.py │ └── XGB.py ├── 第18章基于深度学习的电商商品点击率预估 │ └── DeepInterestNetwork.py ├── 第19章Notebook实践 │ ├── Debug_CF.scala │ ├── Debug_FM.py │ ├── Debug_Sk_LR.py │ ├── Debug_Spark_LR.scala │ └── Debug_TF_LR.py ├── 第4章协同过滤 │ ├── .DS_Store │ ├── I2iTest.scala │ ├── ItemSimilarity.scala │ └── ml-latest-small │ │ ├── README.txt │ │ ├── links.csv │ │ ├── movies.csv │ │ ├── ratings.csv │ │ └── tags.csv ├── 第5章Word2vec │ ├── Word2vec.py │ └── Word2vec.scala ├── 第6章逻辑回归 │ ├── LR.py │ ├── LogisticRegression.py │ └── LogisticRegression.scala ├── 第7章FM │ ├── FM.py │ └── FM_Sk.py ├── 第8章决策树 │ ├── DecisionTrees.scala │ ├── Tree.py │ └── sample_libsvm_data.txt └── 第9章集成学习 │ ├── GbdtLr.scala │ └── gcForest.py └── 推荐系统算法实践—补充部分 ├── .DS_Store ├── 第12章节 ├── adult.data ├── adult.names └── adult.test ├── 第14章节 ├── 001 └── others.py ├── 第5章节 ├── other.py ├── sequence_sample.csv └── windows_skip_sample.csv └── 第6-11和13章节 ├── .DS_Store ├── csv-00000 ├── data ├── 00000 └── 00001 ├── sample_libsvm_data.txt ├── sklearn_others.py ├── spark_others.scala └── tf_others.py /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WallaceLiu/recommendation_algorithm/36cdd077446ab7a2831c168f7419b058bd2fcbb0/.DS_Store -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | *.log 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # recommendation_algorithm 2 | 推荐系统算法实践 3 | -------------------------------------------------------------------------------- /推荐系统算法实践—源码下载/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WallaceLiu/recommendation_algorithm/36cdd077446ab7a2831c168f7419b058bd2fcbb0/推荐系统算法实践—源码下载/.DS_Store -------------------------------------------------------------------------------- /推荐系统算法实践—源码下载/第11章DNN/DNN.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # ## 1)环境准备 5 | 6 | # In[1]: 7 | 8 | import numpy as np 9 | import tensorflow as tf 10 | import pandas as pd 11 | import random 12 | import math 13 | import re 14 | 15 | from sklearn import preprocessing 16 | from os import path, listdir 17 | from sklearn.datasets import load_svmlight_files 18 | from sklearn.model_selection import train_test_split 19 | from sklearn import metrics 20 | from tensorflow.contrib import layers 21 | 22 | from sklearn import metrics 23 | 24 | import time 25 | import datetime 26 | 27 | import os 28 | os.environ["CUDA_VISIBLE_DEVICES"]="0" 29 | 30 | print tf.__version__ 31 | print tf.__path__ 32 | 33 | 34 | # ## 2)数据准备Dataset格式 35 | 36 | # 数据处理 37 | def process_data(data_type, my_path, feature_size, batch_size=32, num_epochs=1): 38 | filenames = get_file_list(my_path) 39 | next_element = read_my_file_format(data_type, filenames, feature_size, batch_size, num_epochs) 40 | 
return next_element 41 | 42 | # 创建session,指定GPU或者CPU使用率 43 | def get_session(gpu_fraction=0.1): 44 | gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=gpu_fraction, 45 | allow_growth=True) 46 | # server = tf.train.Server.create_local_server() 47 | return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) 48 | 49 | 50 | # In[3]: 51 | 52 | # 测试数据 53 | filenames = '/data/all-csv' 54 | feature_size = 530 55 | batch_size = 3 56 | num_epochs = 1 57 | data_type = 'csv' 58 | next_element = process_data(data_type, filenames, feature_size, batch_size, num_epochs) 59 | print next_element['dense_vector'] 60 | print next_element['labels'] 61 | 62 | gpu_fraction = 0.2 63 | my_device='/gpu:0' 64 | init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer(), tf.tables_initializer()) 65 | with tf.device(my_device): 66 | sess = get_session(gpu_fraction) 67 | sess.run(init_op) 68 | dense_vector, labels = sess.run([next_element['dense_vector'],next_element['labels']]) 69 | print dense_vector 70 | print labels 71 | 72 | 73 | # ## 3)DNN模型 74 | 75 | # In[4]: 76 | 77 | class DNN(object): 78 | """ 初始化成员变量 """ 79 | def __init__(self, 80 | feature_size, 81 | loss_fuc, 82 | train_optimizer, 83 | learning_rate, 84 | reg_type, 85 | reg_param, 86 | dnn_layer, 87 | dnn_active_fuc, 88 | is_dropout_dnn, 89 | dropout_dnn, 90 | is_batch_norm): 91 | # 特征向量长度 92 | self.feature_size = feature_size 93 | # 损失函数 94 | self.loss_fuc = loss_fuc 95 | # 优化方法 96 | self.train_optimizer = train_optimizer 97 | # 学习率 98 | self.learning_rate = learning_rate 99 | # 正则类型 100 | self.reg_type = reg_type 101 | # 正则因子 102 | self.reg_param = reg_param 103 | # dnn_layer 104 | self.dnn_layer = dnn_layer 105 | self.dnn_active_fuc = dnn_active_fuc 106 | # dropout_dnn 107 | self.is_dropout_dnn = is_dropout_dnn 108 | self.dropout_dnn = dropout_dnn 109 | # is_batch_norm 110 | self.is_batch_norm = is_batch_norm 111 | 112 | # aglobal_step 113 | self.global_step = tf.Variable(0, dtype=tf.int32, trainable=False, name='global_step') 114 | 115 | """ dnn全连接层计算 """ 116 | def _udf_full_connect(self, inputs, input_size, output_size, activation='relu'): 117 | # 生成或者攻取weights和biases 118 | weights = tf.get_variable("weights", 119 | [input_size, output_size], 120 | initializer=tf.glorot_normal_initializer(), 121 | trainable=True) 122 | biases = tf.get_variable("biases", 123 | [output_size], 124 | initializer=tf.glorot_normal_initializer(), 125 | trainable=True) 126 | # 全连接计算 127 | layer = tf.matmul(inputs, weights) + biases 128 | # 激活函数 129 | if activation == 'relu': 130 | layer = tf.nn.relu(layer) 131 | elif activation == 'tanh': 132 | layer = tf.nn.tanh(layer) 133 | return layer 134 | 135 | def train(self, batch_data, is_train): 136 | """ 1 定义输入数据 """ 137 | print("1 定义输入数据") 138 | with tf.name_scope('input_data'): 139 | # 标签:[batch_size, 1] 140 | labels = batch_data['labels'] 141 | # 用户特征向量:[batch_size, feature_size] 142 | dense_vector = tf.reshape(batch_data['dense_vector'], shape=[-1, feature_size]) # None * feature_size * 1 143 | print("%s: %s" % ("dense_vector", dense_vector)) 144 | print("%s: %s" % ("labels", labels)) 145 | 146 | """ 2 定义网络输出 """ 147 | print("2 DNN网络输出" ) 148 | with tf.name_scope("DNN_Comput_Score"): 149 | # 第一层计算 150 | with tf.variable_scope("deep_layer1", reuse=tf.AUTO_REUSE): 151 | input_size = self.feature_size 152 | output_size = self.dnn_layer[0] 153 | deep_inputs = dense_vector # None * F 154 | print("%s: %s" % ("deep_layer1, deep_inputs", deep_inputs)) 155 | # 输入dropout 156 | if is_train and 
self.is_dropout_dnn: 157 | deep_inputs = tf.nn.dropout(deep_inputs, self.dropout_dnn[0]) 158 | # 全连接计算 159 | deep_outputs = self._udf_full_connect(deep_inputs, input_size, output_size, self.dnn_active_fuc[0]) 160 | # batch_norm 161 | if self.is_batch_norm: 162 | deep_outputs = tf.layers.batch_normalization(deep_outputs, axis=-1, training=is_train) 163 | # 输出dropout 164 | if is_train and self.is_dropout_dnn: 165 | deep_outputs = tf.nn.dropout(deep_outputs, dropout_dnn[1]) 166 | print("%s: %s" % ("deep_layer1, deep_outputs", deep_outputs)) 167 | # 中间层计算 168 | for i in range(len(self.dnn_layer) - 1): 169 | with tf.variable_scope("deep_layer%d"%(i+2), reuse=tf.AUTO_REUSE): 170 | # 全连接计算 171 | deep_outputs = self._udf_full_connect(deep_outputs, self.dnn_layer[i], self.dnn_layer[i+1], self.dnn_active_fuc[i+1]) 172 | # batch_norm 173 | if self.is_batch_norm: 174 | deep_outputs = tf.layers.batch_normalization(deep_outputs, axis=-1, training=is_train) 175 | # 输出dropout 176 | if is_train and self.is_dropout_dnn: 177 | deep_outputs = tf.nn.dropout(deep_outputs, self.dropout_dnn[i+2]) 178 | print("%s, %s: %s" % ("deep_layer%d"%(i+2), "deep_outputs", deep_outputs)) 179 | # 输出层计算 180 | with tf.variable_scope("deep_layer%d"%(len(dnn_layer)+1), reuse=tf.AUTO_REUSE): 181 | deep_outputs = self._udf_full_connect(deep_outputs, self.dnn_layer[-1], 1, self.dnn_active_fuc[-1]) 182 | print("%s, %s: %s" % ("deep_layer%d"%(len(dnn_layer)+1), "deep_outputs", deep_outputs)) 183 | # 正则化,默认L2 184 | dnn_regularization = 0.0 185 | for j in range(len(self.dnn_layer)+1): 186 | with tf.variable_scope("deep_layer%d"%(j+1), reuse=True): 187 | weights = tf.get_variable("weights") 188 | if reg_type == 'l1_reg': 189 | dnn_regularization = dnn_regularization + tf.reduce_sum(tf.abs(weights)) 190 | elif reg_type == 'l2_reg': 191 | dnn_regularization = dnn_regularization + tf.nn.l2_loss(weights) 192 | else: 193 | dnn_regularization = dnn_regularization + tf.nn.l2_loss(weights) 194 | # Deep输出 195 | Y_Out=deep_outputs 196 | print("%s: %s" % ("Y_Out", Y_Out)) 197 | # ---------- score ---------- 198 | score=tf.nn.sigmoid(Y_Out,name='score') 199 | score=tf.reshape(score, shape=[-1, 1]) 200 | print("%s: %s" % ("score", score)) 201 | 202 | """ 3 定义损失函数和AUC指标 """ 203 | print("3 定义损失函数和AUC指标" ) 204 | with tf.name_scope("loss"): 205 | # loss:Squared_error,Cross_entropy ,FTLR 206 | regularization = self.reg_param * dnn_regularization 207 | if loss_fuc == 'Squared_error': 208 | loss = tf.reduce_mean(tf.reduce_sum(tf.square(labels - score), reduction_indices=[1])) + regularization 209 | elif loss_fuc == 'Cross_entropy': 210 | loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=tf.reshape(Y_Out, [-1]), labels=tf.reshape(labels, [-1]))) + regularization 211 | elif loss_fuc == 'FTLR': 212 | loss = tf.reduce_mean(tf.reduce_sum(tf.square(labels - score), reduction_indices=[1])) + regularization 213 | # AUC 214 | auc = tf.metrics.auc(labels, score) 215 | print("%s: %s" % ("labels", labels)) 216 | 217 | """ 4 设定optimizer """ 218 | print("4 设定optimizer" ) 219 | with tf.name_scope("optimizer"): 220 | with tf.variable_scope("optimizer", reuse=tf.AUTO_REUSE): 221 | #------bulid optimizer------ 222 | if train_optimizer == 'Adam': 223 | optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-8) 224 | elif train_optimizer == 'Adagrad': 225 | optimizer = tf.train.AdagradOptimizer(learning_rate=learning_rate, initial_accumulator_value=1e-8) 226 | elif train_optimizer == 'Momentum': 227 | optimizer = 
tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum=0.95) 228 | elif train_optimizer == 'ftrl': 229 | optimizer = tf.train.FtrlOptimizer(learning_rate) 230 | train_step = optimizer.minimize(loss, global_step=self.global_step) 231 | 232 | """5 设定summary,以便在Tensorboard里进行可视化 """ 233 | print("5 设定summary" ) 234 | with tf.name_scope("summaries"): 235 | tf.summary.scalar("loss", loss) 236 | tf.summary.scalar("accumulate_auc", auc[0]) 237 | for j in range(len(self.dnn_layer)+1): 238 | with tf.variable_scope("deep_layer%d"%(j+1), reuse=True): 239 | weights = tf.get_variable("weights") 240 | tf.summary.histogram("w%d"%(j+1), weights) 241 | # 好几个summary,所以这里要merge_all 242 | summary_op = tf.summary.merge_all() 243 | 244 | """6 返回结果 """ 245 | return Y_Out, score, regularization, loss, auc, train_step, labels, score, summary_op 246 | 247 | 248 | # ## 4)模型训练测试 249 | 250 | # In[5]: 251 | 252 | # 数据参数 253 | print("0 数据准备和参数设置" ) 254 | filenames = '/data/csv-all' 255 | data_type='csv' 256 | feature_size = 530 257 | batch_size = 60000 258 | num_epochs = 200 259 | next_element = process_data(data_type, filenames, feature_size, batch_size, num_epochs) 260 | print("%s: %s" % ("next_element", next_element)) 261 | 262 | # 模型参数 263 | loss_fuc = 'Squared_error' 264 | train_optimizer = 'Adam' 265 | learning_rate = 0.01 266 | reg_type = 'l2_reg' 267 | reg_param = 0.0 268 | 269 | dnn_layer=[100,50] 270 | dnn_active_fuc=['relu','relu','output'] 271 | dropout_fm=[1,1] 272 | is_dropout_dnn=True 273 | dropout_dnn=[0.7,0.7,0.7] 274 | is_batch_norm=True 275 | 276 | log_path='/data/log/DNN_Squared_error_L2_0_20180816_01' 277 | 278 | # 开始训练 279 | bea_model = DNN(feature_size, 280 | loss_fuc, 281 | train_optimizer, 282 | learning_rate, 283 | reg_type, 284 | reg_param, 285 | dnn_layer, 286 | dnn_active_fuc, 287 | is_dropout_dnn, 288 | dropout_dnn, 289 | is_batch_norm) 290 | Y_Out, score, regularization, loss, auc, train_step, labels, score, summary_op = bea_model.train(next_element, is_train=True) 291 | 292 | init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer(), tf.tables_initializer()) 293 | gpu_fraction = 0.5 294 | my_device='/gpu:0' 295 | with tf.device(my_device): 296 | sess = get_session(gpu_fraction) 297 | sess.run(init_op) 298 | batch_cnt = 0 299 | #选定可视化存储目录 300 | writer = tf.summary.FileWriter(log_path, sess.graph) 301 | print("6 迭代过程" ) 302 | try: 303 | while True: 304 | batch_cnt = batch_cnt + 1 305 | a, b, c, summary = sess.run([loss, auc, train_step, summary_op]) 306 | if batch_cnt % 100 == 0 or batch_cnt <= 10: 307 | y, p = sess.run([labels, score]) 308 | if y.sum() > 0.0: 309 | batch_auc=metrics.roc_auc_score(y, p) 310 | else: 311 | batch_auc=0.0 312 | print("batch: {} loss: {:.4f} accumulate_auc: {:.4f} batch_auc: {:.4f}".format(batch_cnt, a, b[0], batch_auc)) 313 | writer.add_summary(summary, batch_cnt) 314 | except tf.errors.OutOfRangeError: 315 | print("Train end of dataset") 316 | -------------------------------------------------------------------------------- /推荐系统算法实践—源码下载/第12章Wide & Deep模型/WideAndDeep.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # ## 1)环境准备 5 | 6 | # In[8]: 7 | 8 | import numpy as np 9 | import tensorflow as tf 10 | import pandas as pd 11 | import random 12 | import math 13 | import re 14 | 15 | from sklearn import preprocessing 16 | from os import path, listdir 17 | from sklearn.datasets import load_svmlight_files 18 | from sklearn.model_selection import 
train_test_split 19 | from sklearn import metrics 20 | from tensorflow.contrib import layers 21 | 22 | from sklearn import metrics 23 | 24 | import time 25 | import datetime 26 | 27 | import os 28 | os.environ["CUDA_VISIBLE_DEVICES"]="0" 29 | 30 | import tensorflow as tf 31 | 32 | print tf.__version__ 33 | print tf.__path__ 34 | 35 | 36 | # ## 2)数据准备 37 | 38 | # In[9]: 39 | 40 | # 定义输入样本格式 41 | _CSV_COLUMNS = [ 42 | 'age', 'workclass', 'fnlwgt', 'education', 'education_num', 43 | 'marital_status', 'occupation', 'relationship', 'race', 'gender', 44 | 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 45 | 'income_bracket' 46 | ] 47 | _CSV_COLUMN_DEFAULTS = [[0], [''], [0], [''], [0], [''], [''], [''], [''], [''], 48 | [0], [0], [0], [''], ['']] 49 | _NUM_EXAMPLES = { 50 | 'train': 32561, 51 | 'validation': 16281, 52 | } 53 | 54 | """Builds a set of wide and deep feature columns.""" 55 | def build_model_columns(): 56 | # 1. 特征处理,包括:连续特征、离散特征、转换特征、交叉特征等 57 | 58 | # 连续特征 (其中在Wide和Deep组件都会用到) 59 | age = tf.feature_column.numeric_column('age') 60 | education_num = tf.feature_column.numeric_column('education_num') 61 | capital_gain = tf.feature_column.numeric_column('capital_gain') 62 | capital_loss = tf.feature_column.numeric_column('capital_loss') 63 | hours_per_week = tf.feature_column.numeric_column('hours_per_week') 64 | 65 | # 离散特征 66 | education = tf.feature_column.categorical_column_with_vocabulary_list( 67 | 'education', [ 68 | 'Bachelors', 'HS-grad', '11th', 'Masters', '9th', 'Some-college', 69 | 'Assoc-acdm', 'Assoc-voc', '7th-8th', 'Doctorate', 'Prof-school', 70 | '5th-6th', '10th', '1st-4th', 'Preschool', '12th']) 71 | 72 | marital_status = tf.feature_column.categorical_column_with_vocabulary_list( 73 | 'marital_status', [ 74 | 'Married-civ-spouse', 'Divorced', 'Married-spouse-absent', 75 | 'Never-married', 'Separated', 'Married-AF-spouse', 'Widowed']) 76 | 77 | relationship = tf.feature_column.categorical_column_with_vocabulary_list( 78 | 'relationship', [ 79 | 'Husband', 'Not-in-family', 'Wife', 'Own-child', 'Unmarried', 80 | 'Other-relative']) 81 | 82 | workclass = tf.feature_column.categorical_column_with_vocabulary_list( 83 | 'workclass', [ 84 | 'Self-emp-not-inc', 'Private', 'State-gov', 'Federal-gov', 85 | 'Local-gov', '?', 'Self-emp-inc', 'Without-pay', 'Never-worked']) 86 | 87 | # 离散hash bucket特征 88 | occupation = tf.feature_column.categorical_column_with_hash_bucket( 89 | 'occupation', hash_bucket_size=1000 90 | ) 91 | 92 | # 特征Transformations 93 | age_buckets = tf.feature_column.bucketized_column( 94 | age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65] 95 | ) 96 | 97 | # 2. 设定Wide层特征 98 | """ 99 | Wide部分使用了规范化后的连续特征、离散特征、交叉特征 100 | """ 101 | # 基本特征列 102 | base_columns = [ 103 | # 全是离散特征 104 | education, marital_status, relationship, workclass, occupation, 105 | age_buckets, 106 | ] 107 | 108 | # 交叉特征列 109 | crossed_columns = [ 110 | tf.feature_column.crossed_column( 111 | ['education', 'occupation'], hash_bucket_size=1000), 112 | tf.feature_column.crossed_column( 113 | [age_buckets, 'education', 'occupation'], hash_bucket_size=1000 114 | ) 115 | ] 116 | 117 | # wide特征列 118 | wide_columns = base_columns + crossed_columns 119 | 120 | # 3. 设定Deep层特征 121 | """ 122 | Deep层主要针对离散特征进行处理,其中处理方式有: 123 | 1. Sparse Features -> Embedding vector -> 串联(连续特征),其中Embedding Values随机初始化。 124 | 2. 另外一种处理离散特征的方法是:one-hot和multi-hot representation. 
此方法适用于低维度特征,其中embedding是通用的做法 125 | 其中:采用embedding_column(embedding)和indicator_column(multi-hot)API 126 | """ 127 | # deep特征列 128 | deep_columns = [ 129 | age, 130 | education_num, 131 | capital_gain, 132 | capital_loss, 133 | hours_per_week, 134 | tf.feature_column.indicator_column(workclass), 135 | tf.feature_column.indicator_column(education), 136 | tf.feature_column.indicator_column(marital_status), 137 | tf.feature_column.indicator_column(relationship), 138 | 139 | # embedding特征 140 | tf.feature_column.embedding_column(occupation, dimension=8) 141 | ] 142 | return wide_columns, deep_columns 143 | 144 | # Estimator Input 145 | # 定义输入 146 | def input_fn(data_file, num_epochs, shuffle, batch_size): 147 | """为Estimator创建一个input function""" 148 | assert tf.gfile.Exists(data_file), "{0} not found.".format(data_file) 149 | def parse_csv(line): 150 | print("Parsing", data_file) 151 | # tf.decode_csv会把csv文件转换成Tensor。其中record_defaults用于指明每一列的缺失值用什么填充。 152 | columns = tf.decode_csv(line, record_defaults=_CSV_COLUMN_DEFAULTS) 153 | features = dict(zip(_CSV_COLUMNS, columns)) 154 | labels = features.pop('income_bracket') 155 | # tf.equal(x, y) 返回一个bool类型Tensor, 表示x == y, element-wise 156 | return features, tf.equal(labels, '>50K') 157 | dataset = tf.data.TextLineDataset(data_file).map(parse_csv, num_parallel_calls=5) 158 | dataset = dataset.repeat(num_epochs) 159 | dataset = dataset.batch(batch_size) 160 | iterator = dataset.make_one_shot_iterator() 161 | batch_features, batch_labels = iterator.get_next() 162 | return batch_features, batch_labels 163 | 164 | 165 | # ## 3)模型准备 166 | 167 | # In[10]: 168 | 169 | # Wide & Deep Model 170 | def build_estimator(model_dir, model_type): 171 | """Build an estimator appropriate for the given model type.""" 172 | wide_columns, deep_columns = build_model_columns() 173 | hidden_units = [100, 50] 174 | 175 | # Create a tf.estimator.RunConfig to ensure the model is run on CPU, which 176 | # trains faster than GPU for this model. 177 | run_config = tf.estimator.RunConfig().replace( 178 | session_config=tf.ConfigProto(device_count={'GPU': 0})) 179 | 180 | if model_type == 'wide': 181 | return tf.estimator.LinearClassifier( 182 | model_dir=model_dir, 183 | feature_columns=wide_columns, 184 | config=run_config) 185 | elif model_type == 'deep': 186 | return tf.estimator.DNNClassifier( 187 | model_dir=model_dir, 188 | feature_columns=deep_columns, 189 | hidden_units=hidden_units, 190 | config=run_config) 191 | else: 192 | return tf.estimator.DNNLinearCombinedClassifier( 193 | model_dir=model_dir, 194 | linear_feature_columns=wide_columns, 195 | dnn_feature_columns=deep_columns, 196 | dnn_hidden_units=hidden_units, 197 | config=run_config) 198 | 199 | # 模型路径 200 | model_type = 'widedeep' 201 | model_dir = '/data/model/wide_deep' 202 | 203 | # Wide & Deep 联合模型 204 | model = build_estimator(model_dir, model_type) 205 | 206 | 207 | # ## 4)模型训练 208 | 209 | # In[11]: 210 | 211 | # 训练参数 212 | train_epochs = 10 213 | batch_size = 5000 214 | train_file = '/data/adult.data' 215 | test_file = '/data/adult.test' 216 | 217 | # 6. 
开始训练 218 | for n in range(train_epochs): 219 | # 模型训练 220 | model.train(input_fn=lambda: input_fn(train_file, train_epochs, True, batch_size)) 221 | # 模型评估 222 | results = model.evaluate(input_fn=lambda: input_fn(test_file, 1, False, batch_size)) 223 | # 打印评估结果 224 | print("Results at epoch {0}".format((n+1) * train_epochs)) 225 | print('-'*30) 226 | for key in sorted(results): 227 | print("{0:20}: {1:.4f}".format(key, results[key])) 228 | -------------------------------------------------------------------------------- /推荐系统算法实践—源码下载/第12章Wide & Deep模型/adult.names: -------------------------------------------------------------------------------- 1 | | This data was extracted from the census bureau database found at 2 | | http://www.census.gov/ftp/pub/DES/www/welcome.html 3 | | Donor: Ronny Kohavi and Barry Becker, 4 | | Data Mining and Visualization 5 | | Silicon Graphics. 6 | | e-mail: ronnyk@sgi.com for questions. 7 | | Split into train-test using MLC++ GenCVFiles (2/3, 1/3 random). 8 | | 48842 instances, mix of continuous and discrete (train=32561, test=16281) 9 | | 45222 if instances with unknown values are removed (train=30162, test=15060) 10 | | Duplicate or conflicting instances : 6 11 | | Class probabilities for adult.all file 12 | | Probability for the label '>50K' : 23.93% / 24.78% (without unknowns) 13 | | Probability for the label '<=50K' : 76.07% / 75.22% (without unknowns) 14 | | 15 | | Extraction was done by Barry Becker from the 1994 Census database. A set of 16 | | reasonably clean records was extracted using the following conditions: 17 | | ((AAGE>16) && (AGI>100) && (AFNLWGT>1)&& (HRSWK>0)) 18 | | 19 | | Prediction task is to determine whether a person makes over 50K 20 | | a year. 21 | | 22 | | First cited in: 23 | | @inproceedings{kohavi-nbtree, 24 | | author={Ron Kohavi}, 25 | | title={Scaling Up the Accuracy of Naive-Bayes Classifiers: a 26 | | Decision-Tree Hybrid}, 27 | | booktitle={Proceedings of the Second International Conference on 28 | | Knowledge Discovery and Data Mining}, 29 | | year = 1996, 30 | | pages={to appear}} 31 | | 32 | | Error Accuracy reported as follows, after removal of unknowns from 33 | | train/test sets): 34 | | C4.5 : 84.46+-0.30 35 | | Naive-Bayes: 83.88+-0.30 36 | | NBTree : 85.90+-0.28 37 | | 38 | | 39 | | Following algorithms were later run with the following error rates, 40 | | all after removal of unknowns and using the original train/test split. 41 | | All these numbers are straight runs using MLC++ with default values. 42 | | 43 | | Algorithm Error 44 | | -- ---------------- ----- 45 | | 1 C4.5 15.54 46 | | 2 C4.5-auto 14.46 47 | | 3 C4.5 rules 14.94 48 | | 4 Voted ID3 (0.6) 15.64 49 | | 5 Voted ID3 (0.8) 16.47 50 | | 6 T2 16.84 51 | | 7 1R 19.54 52 | | 8 NBTree 14.10 53 | | 9 CN2 16.00 54 | | 10 HOODG 14.82 55 | | 11 FSS Naive Bayes 14.05 56 | | 12 IDTM (Decision table) 14.46 57 | | 13 Naive-Bayes 16.12 58 | | 14 Nearest-neighbor (1) 21.42 59 | | 15 Nearest-neighbor (3) 20.35 60 | | 16 OC1 15.04 61 | | 17 Pebls Crashed. Unknown why (bounds WERE increased) 62 | | 63 | | Conversion of original data as follows: 64 | | 1. Discretized agrossincome into two ranges with threshold 50,000. 65 | | 2. Convert U.S. to US to avoid periods. 66 | | 3. Convert Unknown to "?" 67 | | 4. Run MLC++ GenCVFiles to generate data,test. 68 | | 69 | | Description of fnlwgt (final weight) 70 | | 71 | | The weights on the CPS files are controlled to independent estimates of the 72 | | civilian noninstitutional population of the US. 
These are prepared monthly 73 | | for us by Population Division here at the Census Bureau. We use 3 sets of 74 | | controls. 75 | | These are: 76 | | 1. A single cell estimate of the population 16+ for each state. 77 | | 2. Controls for Hispanic Origin by age and sex. 78 | | 3. Controls by Race, age and sex. 79 | | 80 | | We use all three sets of controls in our weighting program and "rake" through 81 | | them 6 times so that by the end we come back to all the controls we used. 82 | | 83 | | The term estimate refers to population totals derived from CPS by creating 84 | | "weighted tallies" of any specified socio-economic characteristics of the 85 | | population. 86 | | 87 | | People with similar demographic characteristics should have 88 | | similar weights. There is one important caveat to remember 89 | | about this statement. That is that since the CPS sample is 90 | | actually a collection of 51 state samples, each with its own 91 | | probability of selection, the statement only applies within 92 | | state. 93 | 94 | 95 | >50K, <=50K. 96 | 97 | age: continuous. 98 | workclass: Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked. 99 | fnlwgt: continuous. 100 | education: Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool. 101 | education-num: continuous. 102 | marital-status: Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse. 103 | occupation: Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces. 104 | relationship: Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried. 105 | race: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black. 106 | sex: Female, Male. 107 | capital-gain: continuous. 108 | capital-loss: continuous. 109 | hours-per-week: continuous. 110 | native-country: United-States, Cambodia, England, Puerto-Rico, Canada, Germany, Outlying-US(Guam-USVI-etc), India, Japan, Greece, South, China, Cuba, Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador, Taiwan, Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, Yugoslavia, El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands. 
111 | -------------------------------------------------------------------------------- /推荐系统算法实践—源码下载/第14章YouTube/YouTubeNet.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # ## 0)环境准备 5 | 6 | # In[8]: 7 | 8 | import numpy as np 9 | import tensorflow as tf 10 | import pandas as pd 11 | import random 12 | import math 13 | import re 14 | 15 | from sklearn import preprocessing 16 | from os import path, listdir 17 | from sklearn.datasets import load_svmlight_files 18 | from sklearn.model_selection import train_test_split 19 | from sklearn import metrics 20 | from tensorflow.contrib import layers 21 | 22 | from sklearn import metrics 23 | 24 | import time 25 | import datetime 26 | 27 | import os 28 | os.environ["CUDA_VISIBLE_DEVICES"]="0" 29 | 30 | print tf.__version__ 31 | print tf.__path__ 32 | 33 | 34 | # ## 1)数据准备Dataset格式 35 | 36 | # In[14]: 37 | 38 | # 每一行解析 sequence格式 39 | # 351702070890229|0,0,0,0,0,0,0,0,0,0,0,0,0,0|1,1173,0,0,0|18578 40 | 41 | # 数据处理 42 | def process_data(data_type, my_path, feature_size, batch_size=32, num_epochs=1): 43 | filenames = get_file_list(my_path) 44 | next_element = read_my_file_format(data_type, filenames, feature_size, batch_size, num_epochs) 45 | return next_element 46 | 47 | # 创建session,指定GPU或者CPU使用率 48 | def get_session(gpu_fraction=0.1): 49 | gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=gpu_fraction, 50 | allow_growth=True) 51 | # server = tf.train.Server.create_local_server() 52 | return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) 53 | 54 | 55 | # In[15]: 56 | 57 | # 测试数据 58 | filenames = '/data/sequence_normalize/001' 59 | item_size = 5 60 | batch_size = 3 61 | num_epochs = 1 62 | data_type = 'sequence' 63 | next_element = process_data(data_type, filenames, item_size, batch_size, num_epochs) 64 | # print next_element['label'] 65 | # print next_element['hist_click'] 66 | # print next_element['normalized_continuous_features'] 67 | 68 | gpu_fraction = 0.2 69 | my_device='/gpu:0' 70 | init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer(), tf.tables_initializer()) 71 | with tf.device(my_device): 72 | sess = get_session(gpu_fraction) 73 | sess.run(init_op) 74 | label, item, other = sess.run([next_element['label'],next_element['hist_click'],next_element['normalized_continuous_features']]) 75 | print label 76 | print item 77 | print other 78 | 79 | 80 | # ## 2)定义YouTubeNet模型 81 | 82 | # In[26]: 83 | 84 | class YouTubeNet(object): 85 | """ 初始化成员变量 """ 86 | def __init__(self, 87 | item_count, 88 | embedding_size, 89 | num_sampled, 90 | learning_rate, 91 | hist_click_length, 92 | normalized_continuous_features_length, 93 | log_path): 94 | # 资源池大小 95 | self.item_count = item_count 96 | # embedding大小 97 | self.embedding_size = embedding_size 98 | # NCE采样数量 99 | self.num_sampled = num_sampled 100 | # 学习率 101 | self.learning_rate = learning_rate 102 | # 用户行为序列特征长度 103 | self.hist_click_length = hist_click_length 104 | # 用户其它特征长度 105 | self.normalized_continuous_features_length = normalized_continuous_features_length 106 | # log_path 107 | self.log_path = log_path 108 | 109 | def train(self, batch_data): 110 | """ 1 定义输入数据 """ 111 | print("1 定义输入数据" ) 112 | with tf.name_scope('input_data'): 113 | # 用户其它特征向量:[batch_size, normalized_continuous_features_length] 114 | normalized_continuous_features = batch_data['normalized_continuous_features'] 115 | # 用户行为序列特征向量:[batch_size, hist_click_length] 116 | hist_click = batch_data['hist_click'] 
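# ----(补充示意, 非原书代码)----
# hist_click 以 0 作为 padding。这里给出一个假设性的 mask 张量 hist_mask,
# 供后面第 4 步 pooling 时对有效位置求和/平均作参考;hist_mask 这个名字是
# 本示意自行引入的, 原代码并未定义或使用它。
# hist_mask: [batch_size, hist_click_length, 1]
hist_mask = tf.expand_dims(tf.to_float(tf.abs(hist_click) > 0), -1)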
117 | # 用户标签:[batch_size, 1] 118 | label = batch_data['label'] 119 | # 计算item序列中0的比例 120 | batch_item_ratio = tf.reduce_mean(tf.reduce_mean(tf.to_float(tf.abs(hist_click) > 0),1),0) 121 | print("%s: %s" % ("normalized_continuous_features", normalized_continuous_features)) 122 | print("%s: %s" % ("hist_click", hist_click)) 123 | print("%s: %s" % ("label", label)) 124 | 125 | """ 2 Embedding初始化 """ 126 | # 初始化物品embedding向量V:[item_count, embedding_size] 127 | print("2 Embedding初始化" ) 128 | with tf.name_scope('embedding'): 129 | with tf.variable_scope("embedding", reuse=tf.AUTO_REUSE): 130 | self.weights = tf.Variable(tf.truncated_normal([self.item_count, self.embedding_size], 131 | stddev=1.0 / math.sqrt(self.embedding_size))) 132 | self.biases = tf.Variable(tf.zeros([self.item_count])) 133 | print("%s: %s" % ("weights", self.weights)) 134 | print("%s: %s" % ("biases", self.biases)) 135 | 136 | """ 3 对用户行为序列进行embedding_lookup查找,得到用户的行为embed向量 """ 137 | print("3 对用户item序列进行embedding_lookup查找" ) 138 | with tf.name_scope("embedding_lookup"): 139 | # weights:[item_count, embedding_size] 140 | # hist_click:[batch_size, hist_click_length] 141 | # embed:[batch_size, hist_click_length, embedding_size] 142 | inputs = tf.nn.embedding_lookup(self.weights, hist_click) 143 | print("%s: %s" % ("inputs", inputs)) 144 | 145 | """ 4 pooling操作,根据用户行为embed向量,进行求和或者平均操作 """ 146 | print("4 对用户序列进行pooling操作" ) 147 | with tf.name_scope('pooling_layer'): 148 | pooling_embed = tf.reduce_sum(inputs, axis=1) 149 | print("%s: %s" % ("pooling_embed", pooling_embed)) 150 | 151 | """ 5 用户特征向量拼接 """ 152 | print("5 用户特征向量拼接") 153 | with tf.name_scope("all_concat"): 154 | all_concat = tf.concat([pooling_embed, normalized_continuous_features], 1) 155 | print("%s: %s" % ("all_concat", all_concat)) 156 | 157 | """ 6 多层感知器神经网络计算,最终得到用户的embedding向量U:[batch_size, embedding_size] """ 158 | # 省略,可以参照第13章或者第12章。 159 | 160 | """ 7 Softmax计算,用户的embedding向量U 乘以物品的embedding向量V,然后通过Softmax计算结果,其中Loss采用NCE负采样方法 """ 161 | print("7 最后一层Softmax计算") 162 | with tf.name_scope('Softmax_Classifer'): 163 | with tf.variable_scope("softmax_classifer", reuse=tf.AUTO_REUSE): 164 | # 省略,可以参照https://github.com/ogerhsou/Youtube-Recommendation-Tensorflow/blob/master/youtube_recommendation.py。 165 | 166 | """8 设定summary,以便在Tensorboard里进行可视化 """ 167 | print("8 设定summary" ) 168 | with tf.name_scope("summaries"): 169 | tf.summary.scalar("loss", loss) 170 | tf.summary.histogram("weightsweight", self.weights) 171 | # 好几个summary,所以这里要merge_all 172 | summary_op = tf.summary.merge_all() 173 | 174 | """9 返回结果 """ 175 | return out, loss, batch_item_ratio, label, summary_op, train_step 176 | 177 | 178 | # ## 3)模型训练测试 179 | 180 | # In[ ]: 181 | 182 | # 数据参数 183 | print("0 数据准备和参数设置" ) 184 | batch_size=2000 185 | item_size = 30 186 | num_epochs=1 187 | filenames = '/data/001' 188 | data_type = 'sequence' 189 | next_element = process_data(data_type, filenames, item_size, batch_size, num_epochs) 190 | print("%s: %s" % ("next_element", next_element)) 191 | 192 | # 模型参数 193 | item_count = 99974 194 | embedding_size = 32 195 | num_sampled = 32 196 | learning_rate = 0.01 197 | hist_click_length = item_size * 3 198 | f_size = hist_click_length + 2 199 | normalized_continuous_features_length = f_size - hist_click_length - 1 200 | log_path='/data/log/youtubenet_20180810_001' 201 | 202 | # 开始训练 203 | bea_model = YouTubeNet(item_count, embedding_size, num_sampled, learning_rate, hist_click_length, normalized_continuous_features_length, log_path) 204 | out, loss, batch_item_ratio, 
label, summary_op, train_step = bea_model.train(next_element) 205 | 206 | init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer(), tf.tables_initializer()) 207 | gpu_fraction = 0.5 208 | my_device='/cpu:0' 209 | with tf.device(my_device): 210 | sess = get_session(gpu_fraction) 211 | sess.run(init_op) 212 | batch_cnt = 0 213 | #选定可视化存储目录 214 | writer = tf.summary.FileWriter(log_path, sess.graph) 215 | print("9 迭代过程" ) 216 | try: 217 | while True: 218 | batch_cnt = batch_cnt + 1 219 | a, b, c, d, summary, _ = sess.run([out, loss, batch_item_ratio, label, summary_op, train_step]) 220 | if batch_cnt % 400 == 0 or batch_cnt <= 10: 221 | print("batch: {} loss: {} item_ratio: {}".format(batch_cnt, b, c)) 222 | writer.add_summary(summary, batch_cnt) 223 | except tf.errors.OutOfRangeError: 224 | print("Train end of dataset") 225 | -------------------------------------------------------------------------------- /推荐系统算法实践—源码下载/第15章基于电商平台的商品召回/myGoodsRecall.scala: -------------------------------------------------------------------------------- 1 | package book_code 2 | 3 | import scala.collection.mutable.ArrayBuffer 4 | 5 | class myGoodsRecall extends Serializable { 6 | 7 | /** 8 | * 9 | * 根据用户请求,返回召回列表 10 | * 11 | */ 12 | 13 | def recall(request: Request, extendMap: Map[String, String]): Response = { 14 | 15 | // 1 获取参数 16 | val recallNum = extendMap.getOrElse("recallNum", "500").toInt 17 | val recallByKeyNum = extendMap.getOrElse("recallByKeyNum", "20").toInt 18 | val userActTopK = extendMap.getOrElse("userActTopK", "20").toInt 19 | 20 | // 2.1 获取用户数据,取用户TopK个浏览商品,这里一般通过其他接口,取相应的用户数据,在代码中不展开,这里采用一个数组来做实例讲解 21 | val userGoldsdArray = Array("101", "108", "109", "105") 22 | // 2.2 获取用户数据,取用户类别兴趣数据,这里一般通过其他接口,取相应的用户数据,在代码中不展开,这里采用一个数组来做实例讲解 23 | val userCategoryArray = Array("1", "2", "11") 24 | 25 | // 3.1 goldCF召回查询 26 | val userGoldCfRecallArray = userGoldsdArray.map { itemKey: String => 27 | // 通过key查询,得到列表,这里一般通过其他接口,取相应的数据,在本代码中不展开,这里采用一个Map 28 | // 需要解析召回内容,并且取top,用数据格式返回 29 | val itemByOneKeyArray = Map[String, Array[Item]]().getOrElse(itemKey, Array[Item]()).slice(0, recallByKeyNum) 30 | itemByOneKeyArray 31 | } 32 | // 3.2 汇总并去重 33 | val userGoldCfRecallDistinctTmp = userGoldCfRecallArray.flatMap(f => f) 34 | val userGoldCfRecallDistinct = ArrayBuffer[Item]() 35 | for (i <- 0 to userGoldCfRecallDistinctTmp.size - 1) { 36 | val item = userGoldCfRecallDistinctTmp(i) 37 | if (!userGoldCfRecallDistinct.map(f => f.itemKey).contains(item.itemKey)) { 38 | userGoldCfRecallDistinct += item 39 | } 40 | } 41 | 42 | // 4 相似内容召回查询 43 | val userGoldSimilarContentArray = userGoldsdArray.map { itemKey: String => 44 | // 通过key查询,得到列表,这里一般通过其他接口,取相应的数据,在本代码中不展开,这里采用一个Map来做实例讲解 45 | // 需要解析召回内容,并且取top,用数据格式返回 46 | val itemByOneKeyArray = Map[String, Array[Item]]().getOrElse(itemKey, Array[Item]()).slice(0, recallByKeyNum) 47 | itemByOneKeyArray 48 | } 49 | // 4.2 汇总并去重 50 | val userGoldSimilarContentRecallDistinctTmp = userGoldSimilarContentArray.flatMap(f => f) 51 | val userGoldSimilarContentRecallDistinct = ArrayBuffer[Item]() 52 | for (i <- 0 to userGoldSimilarContentRecallDistinctTmp.size - 1) { 53 | val item = userGoldSimilarContentRecallDistinctTmp(i) 54 | if (!userGoldSimilarContentRecallDistinctTmp.map(f => f.itemKey).contains(item.itemKey)) { 55 | userGoldSimilarContentRecallDistinct += item 56 | } 57 | } 58 | 59 | // 5 用户类别兴趣召回查询 60 | val userGoldSimilarCategoryArray = userCategoryArray.map { category: String => 61 | // 
通过key查询,得到列表,这里一般通过其他接口,取相应的数据,在本代码中不展开,这里采用一个Map来做实例讲解 62 | // 需要解析召回内容,并且取top,用数据格式返回 63 | val itemByOneKeyArray = Map[String, Array[Item]]().getOrElse(category, Array[Item]()).slice(0, recallByKeyNum) 64 | itemByOneKeyArray 65 | } 66 | // 5.2 汇总并去重 67 | val userGoldSimilarCategoryRecallDistinctTmp = userGoldSimilarCategoryArray.flatMap(f => f) 68 | val userGoldSimilarCategoryRecallDistinct = ArrayBuffer[Item]() 69 | for (i <- 0 to userGoldSimilarCategoryRecallDistinctTmp.size - 1) { 70 | val item = userGoldSimilarCategoryRecallDistinctTmp(i) 71 | if (!userGoldSimilarCategoryRecallDistinctTmp.map(f => f.itemKey).contains(item.itemKey)) { 72 | userGoldSimilarCategoryRecallDistinct += item 73 | } 74 | } 75 | 76 | // 6 依此类推,查询其它召回数据,这里主不展开了 77 | 78 | // 7 多个召回数据合并,排序,并且取TopK 79 | // 7.1 CF 80 | // 取每个召回的参数权重,这里用个Map来做实例讲解 81 | val weightCF = Map[String, Double]().getOrElse("CF", 1.0) 82 | // 取物品,以及对应的分值 83 | val recallCF = userGoldCfRecallDistinct.toArray.map(x => (x.itemKey, x.score * weightCF)) 84 | // 7.2 Content 85 | // 取每个召回的参数权重,这里用个Map来做实例讲解 86 | val weightContent = Map[String, Double]().getOrElse("Content", 1.0) 87 | // 取物品,以及对应的分值 88 | val recallContent = userGoldSimilarContentRecallDistinct.toArray.map(x => (x.itemKey, x.score * weightContent)) 89 | // 7.3 Category 90 | // 取每个召回的参数权重,这里用个Map来做实例讲解 91 | val weightCategory = Map[String, Double]().getOrElse("Category", 1.0) 92 | // 取物品,以及对应的分值 93 | val recallCategory = userGoldSimilarCategoryRecallDistinct.toArray.map(x => (x.itemKey, x.score * weightCategory)) 94 | 95 | // 7.4 合并,并且返回ToK,排序按照分值降序排 96 | val recallMerge = (recallCF ++ recallContent ++ recallCategory). 97 | sortBy(f => -1 * f._2). 98 | slice(0, recallNum).map { 99 | case (itemKey: String, score: Double) => 100 | new Item(itemKey).setScore(score) 101 | } 102 | 103 | // 8 返回结果 104 | val recallStatus = if (recallMerge.size > 0) "True" else "False" 105 | val response = new Response(request.getSessionID). 106 | setStatus(recallStatus). 
107 | setItemArray(recallMerge) 108 | response 109 | } 110 | 111 | } -------------------------------------------------------------------------------- /推荐系统算法实践—源码下载/第16章基于逻辑回归的音乐评分预测/LR.scala: -------------------------------------------------------------------------------- 1 | package book_code 2 | 3 | import org.apache.spark.mllib.classification.{ LogisticRegressionModel, LogisticRegressionWithLBFGS } 4 | import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics 5 | import org.apache.spark.mllib.linalg.Vectors 6 | import org.apache.spark.mllib.regression.LabeledPoint 7 | 8 | import org.apache.spark.sql.types._ 9 | import org.apache.spark.sql.functions._ 10 | import org.apache.spark.sql._ 11 | import org.apache.spark.sql.SparkSession 12 | 13 | import java.io.{ ObjectInputStream, ObjectOutputStream } 14 | import java.net.URI 15 | import java.sql.Connection 16 | import org.apache.hadoop.conf.Configuration 17 | import org.apache.hadoop.fs.{ FileSystem, Path } 18 | 19 | object LR { 20 | 21 | def main(args: Array[String]): Unit = { 22 | 23 | val spark = SparkSession 24 | .builder 25 | .appName("LR") 26 | .config("spark.hadoop.validateOutputSpecs", "false") 27 | .enableHiveSupport() 28 | .getOrCreate() 29 | import spark.implicits._ 30 | 31 | // 1.1 初始化参数 32 | val dataPath = "hdfs://1.1.1.1:9000/LR_Data/sample_original_1/all.csv" 33 | val minFeature = 10 34 | val defaultValue = 0.0 35 | val modelSavePath = "" 36 | 37 | val iter = 100 38 | val reg_param = 0.0 39 | val elastic_net_param = 0.0 40 | 41 | // 2.2 取样本数据 42 | val dataRead = spark.read.options(Map(("delimiter", "|"), ("header", "false"))).csv(dataPath) 43 | val col = dataRead.columns 44 | val readSampleData = dataRead.withColumnRenamed(col(0), "label"). 45 | withColumnRenamed(col(1), "feature"). 
46 | withColumnRenamed(col(2), "item") 47 | readSampleData.cache() 48 | 49 | //2.3 建立标签ID的索引以及数据处理方法 50 | val dataProcessObj = new DataProcess() 51 | 52 | // 2.4 生成样本 53 | val (training, test) = sampleDataProcess(spark, readSampleData, dataProcessObj) 54 | training.cache() 55 | training.count() 56 | test.cache() 57 | test.count() 58 | 59 | //3.1 建立逻辑回归模型 60 | val lr = new LogisticRegressionWithLBFGS().setNumClasses(2) 61 | lr.optimizer.setNumIterations(iter) 62 | lr.optimizer.setRegParam(reg_param) 63 | val lrModel = lr.run(training.rdd) 64 | 65 | //3.2 计算模型指标 66 | lrModel.clearThreshold() 67 | val scoreAndLabels = test.rdd.map { point => 68 | val score = lrModel.predict(point.features) 69 | (score, point.label) 70 | } 71 | val metrics = new BinaryClassificationMetrics(scoreAndLabels) 72 | val auc = metrics.areaUnderROC() 73 | val aupr = metrics.areaUnderPR() 74 | println(s"AUC: ${auc}") 75 | println(s"AUPR: ${aupr}") 76 | 77 | // 4.1 封装模型 78 | val mllibLR = new LrModel(lrModel, defaultValue, dataProcessObj) 79 | // 4.2 保存模型 80 | modelSave(mllibLR, modelSavePath) 81 | 82 | } 83 | 84 | /** 85 | * 保存序列化的模型 86 | */ 87 | def modelSave( 88 | model: LrModel, 89 | path: String): Unit = { 90 | } 91 | 92 | def sampleDataProcess(): (Dataset[LabeledPoint], Dataset[LabeledPoint]) = { 93 | (training, test) 94 | } 95 | 96 | } -------------------------------------------------------------------------------- /推荐系统算法实践—源码下载/第17章Kaggle竞赛之Outbrain点击率预估/EnsembleTree.scala: -------------------------------------------------------------------------------- 1 | package book_code 2 | 3 | import org.apache.hadoop.conf.Configuration 4 | import org.apache.hadoop.fs.{ FileSystem, Path } 5 | import org.apache.spark.mllib.classification.{ LogisticRegressionModel, LogisticRegressionWithLBFGS } 6 | import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics 7 | import org.apache.spark.mllib.linalg.Vectors 8 | import org.apache.spark.ml.linalg.{ Vector => mlVector } 9 | import org.apache.spark.mllib.linalg.Vector 10 | import org.apache.spark.mllib.regression.LabeledPoint 11 | import org.apache.spark.mllib.tree.GradientBoostedTrees 12 | import org.apache.spark.mllib.tree.RandomForest 13 | import org.apache.spark.mllib.tree.configuration.BoostingStrategy 14 | import org.apache.spark.mllib.tree.configuration.FeatureType._ 15 | import org.apache.spark.mllib.tree.model.{ GradientBoostedTreesModel, Node, RandomForestModel } 16 | import org.apache.spark.mllib.tree.configuration.Algo.{ Algo, Regression } 17 | import org.apache.spark.mllib.tree.configuration.QuantileStrategy._ 18 | import org.apache.spark.mllib.tree.configuration.Strategy 19 | import org.apache.spark.mllib.tree.impurity.Variance 20 | import org.apache.spark.rdd.RDD 21 | import org.apache.spark.sql._ 22 | import scala.collection.mutable.ArrayBuffer 23 | import org.apache.spark.sql.types._ 24 | import org.apache.spark.sql.functions._ 25 | import org.apache.spark.sql._ 26 | import org.apache.spark.sql.SparkSession 27 | import java.io.{ ObjectInputStream, ObjectOutputStream } 28 | import java.net.URI 29 | import java.sql.Connection 30 | import org.apache.hadoop.conf.Configuration 31 | import org.apache.hadoop.fs.{ FileSystem, Path } 32 | 33 | object EnsembleTree { 34 | 35 | def main(args: Array[String]): Unit = { 36 | 37 | val spark = SparkSession.builder(). 38 | master("local"). 39 | appName("EnsembleTree"). 
40 | getOrCreate() 41 | 42 | import spark.implicits._ 43 | 44 | // 1.1 初始化参数 45 | val dataPath = "hdfs://1.1.1:9000/data/Outbrain/all.csv" 46 | val minFeature = 10 47 | val defaultValue = 0.0 48 | val modelSavePath = "" 49 | var iteratTree = 10 50 | var iteratDepth = 10 51 | var maxAuc = 0.0 52 | var maxDepth = 10 53 | var numTrees = 10 54 | var minInstancesPerNode = 2 55 | var iter = 100 56 | var reg_param = 0.0 57 | var elastic_net_param = 0.0 58 | 59 | // 2.1 取样本数据 60 | val dataRead = spark.read.options(Map(("delimiter", "|"), ("header", "false"))).csv(dataPath) 61 | val col = dataRead.columns 62 | val readSampleData = dataRead.withColumnRenamed(col(0), "label"). 63 | withColumnRenamed(col(1), "feature"). 64 | withColumnRenamed(col(2), "item") 65 | readSampleData.cache() 66 | 67 | //2.2 建立数据处理方法 68 | val dataProcessObj1 = new DataProcess() 69 | val dataProcessObj2 = new DataProcess() 70 | val dataProcessObjAll = new DataProcess() 71 | 72 | //2 训练样本准备,准备2份 73 | val (training1, test1) = sampleDataProcess(spark, readSampleData, dataProcessObj1) 74 | training1.cache() 75 | training1.count() 76 | test1.cache() 77 | test1.count() 78 | 79 | val (training2, test2) = sampleDataProcess(spark, readSampleData, dataProcessObj2) 80 | training2.cache() 81 | training2.count() 82 | test2.cache() 83 | test2.count() 84 | 85 | //3.1 Gbdt1模型训练 86 | val boostingStrategy = BoostingStrategy.defaultParams("Regression") 87 | boostingStrategy.treeStrategy.categoricalFeaturesInfo = Map[Int, Int]() 88 | boostingStrategy.treeStrategy.minInstancesPerNode = minInstancesPerNode 89 | boostingStrategy.numIterations = numTrees 90 | boostingStrategy.treeStrategy.maxDepth = maxDepth 91 | val gbdtMode1 = GradientBoostedTrees.train(training1.rdd, boostingStrategy) 92 | 93 | //3.2 Gbdt2模型训练 94 | val gbdtMode2 = GradientBoostedTrees.train(training2.rdd, boostingStrategy) 95 | 96 | 97 | //4 解析样本,通过2个树模型映射到最终的LR输入向量 98 | val gbdtMode1_BC = spark.sparkContext.broadcast(gbdtMode1) 99 | val gbdtMode2_BC = spark.sparkContext.broadcast(gbdtMode2) 100 | 101 | val mergeSampleData = readSampleData.map { row => 102 | val click = row(0).toString().toInt 103 | val detail = row(1).toString() 104 | val itemid = row(2).toString() 105 | val label = if (click > 0) 1.0 else 0.0 106 | 107 | //第1个GBDT映射 108 | val (tree1Size, tree1NodeFeature) = gettreeNode(gbdtMode1_BC.value, tree1Feature) 109 | 110 | //第2个GBDT映射 111 | val (tree2Size, tree2NodeFeature) = gettreeNode(gbdtMode2_BC.value, tree2Feature) 112 | 113 | //所有样本归一化 114 | val allFeature = allMap 115 | val allSize = dataProcessObjAll.numFeatures 116 | 117 | //合并 118 | val mergeFeature = (tree1NodeFeature ++ 119 | (tree2NodeFeature.map(f => (f._1 + tree1Size.toInt, f._2))) ++ 120 | (tree3NodeFeature.map(f => (f._1 + tree1Size.toInt + tree2Size.toInt, f._2))) ++ 121 | (allFeature.map(f => (f._1 + tree1Size.toInt + tree2Size.toInt + tree3Size.toInt, f._2)))).sortBy(f => f._1) 122 | val mergeSize = tree1Size + tree2Size + tree3Size + allSize 123 | val point = LabeledPoint(label, Vectors.sparse(mergeSize.toInt, mergeFeature.map(_._1), mergeFeature.map(_._2))) 124 | point 125 | } 126 | 127 | //5 lr模型训练 128 | val Splits = mergeSampleData.randomSplit(Array(0.7, 0.3)) 129 | val Training = Splits(0) 130 | val Test = Splits(1) 131 | Training.cache() 132 | Test.cache() 133 | Training.count() 134 | Test.count() 135 | 136 | val lr = new LogisticRegressionWithLBFGS().setNumClasses(2) 137 | lr.optimizer.setNumIterations(iter) 138 | lr.optimizer.setRegParam(reg_param) 139 | val lrModel = lr.run(Training.rdd) 
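// ----(补充示意, 非原书代码)----
// 上面第 4 步用到的 gettreeNode(...) 在原文中未给出实现, 这里补充一个假设性的
// 最小示意 gettreeNodeSketch: 把单条样本的特征向量映射为每棵 GBDT 树落入的
// 叶子节点, 再做 one-hot 稀疏编码, 供后续 LR 使用; 方法名与返回格式均为假设。
def gettreeNodeSketch(model: GradientBoostedTreesModel, features: Vector): (Int, Array[(Int, Double)]) = {
  // 每棵树 one-hot 片段的宽度, 取节点编号的一个粗略上界 2^(maxDepth+1)
  val nodesPerTree = math.pow(2, maxDepth + 1).toInt
  val activeNodes = model.trees.zipWithIndex.map { case (tree, i) =>
    // 从根节点沿分裂条件下行, 直到叶子节点
    var node = tree.topNode
    while (!node.isLeaf) {
      val split = node.split.get
      val goLeft =
        if (split.featureType == Continuous) features(split.feature) <= split.threshold
        else split.categories.contains(features(split.feature))
      node = if (goLeft) node.leftNode.get else node.rightNode.get
    }
    // 全局 one-hot 下标 = 树的偏移 + 该树内的叶子节点 id, 取值置为 1.0
    (i * nodesPerTree + node.id, 1.0)
  }
  (model.trees.length * nodesPerTree, activeNodes)
}
// 用法(假设): val (treeSize, nodeFeature) = gettreeNodeSketch(gbdtMode1_BC.value, sampleFeatureVector)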
140 | 141 | //6 计算模型指标 142 | lrModel.clearThreshold() 143 | val scoreAndLabels = Test.rdd.map { point => 144 | val score = lrModel.predict(point.features) 145 | (score, point.label) 146 | } 147 | val metrics = new BinaryClassificationMetrics(scoreAndLabels) 148 | val auc = metrics.areaUnderROC() 149 | val aupr = metrics.areaUnderPR() 150 | println(s"AUC: ${auc}") 151 | println(s"AUPR: ${aupr}") 152 | 153 | // 7.1 封装模型 154 | val mllibEST = new EnsembleTreeModel() 155 | // 7.2 保存模型 156 | modelSave(mllibEST, modelSavePath) 157 | 158 | } 159 | 160 | } 161 | 162 | -------------------------------------------------------------------------------- /推荐系统算法实践—源码下载/第17章Kaggle竞赛之Outbrain点击率预估/FFM.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[ ]: 5 | 6 | 7 | import xlearn as xl 8 | 9 | # 1 模型建立 10 | # model = create_linear() # Create linear model 11 | # model = create_fm() # Create factorization machines 12 | # model = create_ffm() # Create field-aware factorizarion machines. 13 | ffm_model = xl.create_ffm() # 建立field-aware factorization machine模型 14 | ffm_model.setTrain("./small_train.txt") # 设置训练数据 15 | ffm_model.setValidate("./small_test.txt") # 设置测试数据 16 | 17 | # 2 模型参数: 18 | # task: {'binary', # 二元分类 19 | # 'reg'} # 回归 20 | # metric: {'acc', 'prec', 'recall', 'f1', 'auc', # 分类指标 21 | # 'mae', 'mape', 'rmse', 'rmsd'} # 回归指标 22 | # lr: float value # 学习率 23 | # lambda: float value #正则因子 24 | # 其它超参因子参照API说明:https://xlearn-doc-cn.readthedocs.io/en/latest/all_api/index.html 25 | param = {'task':'binary', 'lr':0.1, 'lambda':0.001, 'metric':'auc'} 26 | 27 | # 3 训练模型 28 | # The trained model will be stored in ffm_model.out 29 | ffm_model.fit(param, './ffm_model.out') 30 | 31 | # 4 测试 32 | ffm_model.setTest("./small_test.txt") # Test data 33 | ffm_model.setSigmoid() # Convert output to 0-1 34 | 35 | # 预测结果 36 | # The output result will be stored in output.txt 37 | ffm_model.predict("./ffm_model.out", "./output.txt") 38 | 39 | -------------------------------------------------------------------------------- /推荐系统算法实践—源码下载/第17章Kaggle竞赛之Outbrain点击率预估/XGB.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[ ]: 5 | 6 | 7 | import xgboost as xgb 8 | import numpy as np 9 | import random 10 | import math 11 | import os 12 | import sys 13 | from sklearn import metrics 14 | 15 | # 1 数据准备 16 | dtrain = xgb.DMatrix(train_data_path, feature_names = features) 17 | dtest = xgb.DMatrix(test_data_path, feature_names = features) 18 | dvalid = xgb.DMatrix(valid_data_path, feature_names = features) 19 | 20 | # 2 参数准备 21 | param = {'booster': booster, 22 | 'eval_metric':eval_metric, 23 | 'max_depth':max_depth, 24 | 'gamma': gamma, 25 | 'min_child_weight':min_child_weight, 26 | 'eta':eta, 27 | 'objective':objective, 28 | 'subsample': subsample, 29 | 'colsample_bytree': colsample_bytree} 30 | 31 | # 3 模型训练 32 | bst = xgb.train(param, dtrain, round, evals=[(dtrain,'train'), (dtest,'test')]) 33 | 34 | # 4 模型测试 35 | preds = bst.predict(dtest) 36 | auc = metrics.roc_auc_score(labels, preds) 37 | precision = metrics.average_precision_score(labels, preds) 38 | mae = metrics.mean_absolute_error(labels, preds) 39 | rmse = math.sqrt(metrics.mean_squared_error(labels, preds)) 40 | 41 | # 5 模型保存 42 | bst.save_model(local_path_bin) 43 | bst.dump_model(local_path) 44 | 45 | -------------------------------------------------------------------------------- 
/推荐系统算法实践—源码下载/第18章基于深度学习的电商商品点击率预估/DeepInterestNetwork.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # ## 1)环境准备 5 | 6 | # In[28]: 7 | 8 | 9 | import numpy as np 10 | import tensorflow as tf 11 | import pandas as pd 12 | import random 13 | import math 14 | import re 15 | 16 | from sklearn import preprocessing 17 | from os import path, listdir 18 | from sklearn.datasets import load_svmlight_files 19 | from sklearn.model_selection import train_test_split 20 | from sklearn import metrics 21 | from tensorflow.contrib import layers 22 | 23 | from sklearn import metrics 24 | 25 | import time 26 | import datetime 27 | 28 | import os 29 | 30 | print(tf.__version__) 31 | print(tf.__path__) 32 | 33 | 34 | # ## 2)数据准备 35 | 36 | # In[23]: 37 | 38 | 39 | # 获取商品和类目的embedding数据 40 | def get_embedding(): 41 | # 类目embedding数据 42 | # 商品embedding数据 43 | return {"category_list": category_list, "golds_list": golds_list} 44 | 45 | # 读取用户行为数据,格式:点击|浏览序列|点击序列|购买序列|类目兴趣序列|用户画像特征 46 | # 1000000123|0,0,0,0,0,0,0,0,0,0,0|0,0,0,0,0,0,0,0,0||0,0,0,0,0,0,0,0,0,0|1,1173,0,0,0 47 | def decode_sequence(line, gold_size, category_size, profile_size): 48 | # 数据解析 49 | return {"label": label, "goods": goods, "category": category, "profile": profile} 50 | 51 | # 数据处理 52 | def process_data(my_path, gold_size, category_size, other_size, batch_size=32, num_epochs=1): 53 | filenames = get_file_list(my_path) 54 | next_element = read_my_file_format(filenames, gold_size, category_size, other_size, batch_size, num_epochs) 55 | return next_element 56 | 57 | 58 | # In[29]: 59 | 60 | 61 | # 测试数据 62 | filenames = 'D:\\Data\\GoldsData\\User\\user_data.csv' 63 | batch_size = 2 64 | num_epochs = 1 65 | gold_size = 10 66 | category_size = 8 67 | other_size = 12 68 | next_element = process_data(filenames, gold_size, category_size, other_size, batch_size, num_epochs) 69 | print(next_element['label']) 70 | print(next_element['goods']) 71 | print(next_element['category']) 72 | print(next_element['profile']) 73 | 74 | my_device='/cpu:0' 75 | init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer(), tf.tables_initializer()) 76 | with tf.device(my_device): 77 | sess = tf.Session() 78 | sess.run(init_op) 79 | label, goods, category, other = sess.run([next_element['label'],next_element['goods'],next_element['category'],next_element['profile']]) 80 | print(label) 81 | print(goods) 82 | print(category) 83 | print(other) 84 | 85 | 86 | 87 | # In[30]: 88 | 89 | 90 | # embedding数据 91 | embedding = get_embedding() 92 | print(embedding['category_list'].shape) 93 | print(embedding['golds_list'].shape) 94 | print(embedding['golds_list']) 95 | print(embedding['golds_list']) 96 | 97 | 98 | # ## 3)定义DeepInterestNetwork模型 99 | 100 | # In[31]: 101 | 102 | 103 | class DeepInterestNetwork(object): 104 | """ 一、初始化成员变量 """ 105 | def __init__(self, 106 | goods_size, 107 | goods_embedding_size, 108 | category_embedding_size, 109 | num_sampled, 110 | learning_rate, 111 | attention_size, 112 | goods_input_length, 113 | category_input_length, 114 | profile_input_length, 115 | log_path): 116 | # 商品池大小 117 | self.goods_size = goods_size 118 | # 商品embedding大小 119 | self.goods_embedding_size = goods_embedding_size 120 | self.category_embedding_size = category_embedding_size 121 | # NCE采样数量 122 | self.num_sampled = num_sampled 123 | # 学习率 124 | self.learning_rate = learning_rate 125 | # attention层大小 126 | self.attention_size = attention_size 127 | # 用户购买序列特征长度 128 | 
self.goods_input_length = goods_input_length 129 | # 用户类目兴趣特征长度 130 | self.category_input_length = category_input_length 131 | # 用户画像特征长度 132 | self.profile_input_length = profile_input_length 133 | # log_path 134 | self.log_path = log_path 135 | 136 | """ 二、计算网络最后一层的输出 """ 137 | def _comput_lay_out(self, batch_data): 138 | """ 1 定义输入数据 """ 139 | print("1 定义输入数据" ) 140 | with tf.name_scope('input_data'): 141 | # 用户画像特征向量:[batch_size, profile_input_length] 142 | input_profile = batch_data['profile'] 143 | # 用户类目特征向量:[batch_size, category_input_length] 144 | input_category = batch_data['category'] 145 | # 用户购买序列特征向量:[batch_size, goods_input_length] 146 | input_goods = batch_data['goods'] 147 | print("%s: %s" % ("input_profile", input_profile)) 148 | print("%s: %s" % ("input_goods", input_goods)) 149 | print("%s: %s" % ("input_category", input_category)) 150 | 151 | # 计算gold序列中0的比例 152 | batch_goods_ratio = tf.reduce_mean(tf.reduce_mean(tf.to_float(tf.abs(input_goods) > 0),1),0) 153 | 154 | """ 2 对用户行为序列进行embedding_lookup查找,得到用户的行为embed向量 """ 155 | # 省略,可以参照第14章。 156 | 157 | """ 3 attention机制,根据用户行为embed向量,通过多层感知神经网络,最后通过Saftmax得到alpha权重向量 """ 158 | print("3 对用户序列进行attention层计算" ) 159 | with tf.name_scope('attention_layer'): 160 | with tf.variable_scope("attention_layer", reuse=tf.AUTO_REUSE): 161 | # 全连接层计算 162 | # inputs shape: [batch_size, goods_input_length, embedding_size] 163 | # h: [batch_size, goods_input_length, embedding_size] 164 | h = layers.fully_connected(inputs_goods_emb, self.attention_size, activation_fn=tf.nn.tanh) 165 | print("%s: %s" % ("h", h)) 166 | 167 | # 输出层计算 168 | # u_context: importance vector 169 | u_context = tf.Variable(tf.truncated_normal([self.attention_size])) 170 | hu_sum = tf.reduce_sum(tf.multiply(h, u_context), axis=2, keep_dims=True) 171 | print("%s: %s" % ("hu_sum", hu_sum)) 172 | # 防止 exp 溢出 173 | hu_max = tf.reduce_max(hu_sum, axis=1, keep_dims=True) 174 | print("%s: %s" % ("hu_max", hu_max)) 175 | hu_normal = hu_sum - hu_max 176 | print("%s: %s" % ("hu_normal", hu_normal)) 177 | 178 | # Softmax计算 179 | # hu_sum: [batch_size, goods_input_length, 1] 180 | exp = tf.exp(hu_normal) 181 | exp_adapt = exp 182 | print("%s: %s" % ("exp_adapt", exp_adapt)) 183 | 184 | exp_adapt_sum = tf.reduce_sum(exp_adapt, axis=1, keep_dims=True) 185 | print("%s: %s" % ("exp_adapt_sum", exp_adapt_sum)) 186 | alpha = tf.div(exp_adapt, exp_adapt_sum) 187 | print("%s: %s" % ("alpha", alpha)) 188 | 189 | # attention计算,[batch_size, embedding_size] 190 | atten_embed = tf.reduce_sum(tf.multiply(inputs_goods_emb, alpha), axis=1) 191 | print("%s: %s" % ("atten_embed", atten_embed)) 192 | 193 | """ 4 用户特征向量拼接 """ 194 | # 省略,可以参照第14章。 195 | 196 | """ 5 多层感知器神经网络计算,最终得到用户的embedding向量U:[batch_size, embedding_size] """ 197 | # 省略,可以参照第14章。 198 | 199 | 200 | """ 三、网络训练 """ 201 | def train(self, batch_data, goods_embedding, category_embedding): 202 | """ 1 Embedding初始化 """ 203 | with tf.name_scope('embedding'): 204 | self.goods_embedding = tf.convert_to_tensor(goods_embedding, dtype=tf.float32) 205 | self.category_embedding = tf.convert_to_tensor(category_embedding, dtype=tf.float32) 206 | print("%s: %s" % ("goods_embedding", self.goods_embedding)) 207 | print("%s: %s" % ("category_embedding", self.category_embedding)) 208 | with tf.variable_scope("embedding", reuse=tf.AUTO_REUSE): 209 | self.nce_biases = tf.get_variable(name='nce_biases', shape=[self.goods_size], initializer=tf.constant_initializer(0.0)) 210 | print("%s: %s" % ("nce_biases", self.nce_biases)) 211 | 212 | """ 2 
计算深度神经网络的最后一层输出 """ 213 | layer_out, batch_goods_ratio = self._comput_lay_out(batch_data) 214 | # 用户标签:[batch_size, 1] 215 | input_label = batch_data['label'] 216 | print("%s: %s" % ("input_label", input_label)) 217 | 218 | """ 3 Softmax计算,用户的embedding向量U乘以商品的embedding向量V,然后通过Softmax计算结果,其中Loss采用NCE负采样方法 """ 219 | print("3 最后一层Softmax计算") 220 | # 省略,可以参照第14章。 221 | 222 | """4 设定summary,以便在Tensorboard里进行可视化 """ 223 | print("4 设定summary" ) 224 | with tf.name_scope("summaries"): 225 | tf.summary.scalar("loss", loss) 226 | tf.summary.histogram("nce_biases", self.nce_biases) 227 | # 好几个summary,所以这里要merge_all 228 | summary_op = tf.summary.merge_all() 229 | 230 | """5 返回结果 """ 231 | return out, loss, batch_goods_ratio, input_label, summary_op, train_step 232 | 233 | """ 四、预测计算 """ 234 | def predict(self, batch_data): 235 | """ 1 计算深度神经网络的最后一层输出 """ 236 | layer_out, _ = self._comput_lay_out(batch_data) 237 | 238 | """ 2 计算Softmax的预测结果 """ 239 | predict_score = tf.nn.softmax(tf.matmul(layer_out, tf.transpose(self.goods_embedding)) + self.nce_biases, dim=1) 240 | # 结果返回 241 | return predict_score 242 | 243 | 244 | # ## 4)模型训练测试 245 | 246 | # In[32]: 247 | 248 | 249 | # 数据参数 250 | print("0 数据准备和参数设置" ) 251 | filenames = 'D:\\Data\\GoldsData\\User\\user_data.csv' 252 | batch_size = 2000 253 | num_epochs = 1000 254 | gold_size = 10 255 | category_size = 8 256 | profile_size = 12 257 | next_element = process_data(filenames, gold_size, category_size, other_size, batch_size, num_epochs) 258 | print("%s: %s" % ("next_element", next_element)) 259 | 260 | # 模型参数 261 | goods_size = 40742 262 | goods_embedding_size = 100 263 | category_embedding_size = 10 264 | num_sampled = 32 265 | learning_rate = 0.01 266 | attention_size = 60 267 | goods_input_length = gold_size * 3 268 | category_input_length = category_size 269 | profile_input_length = profile_size 270 | log_path='D:\\Data\\log\\20180915' 271 | 272 | # embedding参数 273 | embedding = get_embedding() 274 | goods_embedding = embedding['golds_list'] 275 | category_embedding = embedding['category_list'] 276 | print("%s: %s" % ("goods_embedding.shape", goods_embedding.shape)) 277 | print("%s: %s" % ("category_embedding.shape", category_embedding.shape)) 278 | 279 | # 开始训练 280 | golds_rec_model = DeepInterestNetwork(goods_size, 281 | goods_embedding_size, 282 | category_embedding_size, 283 | num_sampled, 284 | learning_rate, 285 | attention_size, 286 | goods_input_length, 287 | category_input_length, 288 | profile_input_length, 289 | log_path) 290 | out, loss, batch_goods_ratio, input_label, summary_op, train_step = golds_rec_model.train(next_element, goods_embedding, category_embedding) 291 | 292 | init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer(), tf.tables_initializer()) 293 | my_device='/cpu:0' 294 | with tf.device(my_device): 295 | sess = tf.Session() 296 | sess.run(init_op) 297 | batch_cnt = 0 298 | #选定可视化存储目录 299 | writer = tf.summary.FileWriter(log_path, sess.graph) 300 | print("5 迭代过程" ) 301 | try: 302 | while True: 303 | batch_cnt = batch_cnt + 1 304 | a, b, c, d, summary, _ = sess.run([out, loss, batch_goods_ratio, input_label, summary_op, train_step]) 305 | if batch_cnt % 200 == 0 or batch_cnt <= 10: 306 | print("batch: {} loss: {} gold_ratio: {}".format(batch_cnt, b, c)) 307 | writer.add_summary(summary, batch_cnt) 308 | except tf.errors.OutOfRangeError: 309 | print("Train end of dataset") 310 | 311 | -------------------------------------------------------------------------------- 
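The DeepInterestNetwork script above only sketches its input pipeline: the bodies of get_embedding, decode_sequence and process_data are omitted, and the comment gives the record format 点击|浏览序列|点击序列|购买序列|类目兴趣序列|用户画像特征. For reference, the following is a minimal TF 1.x sketch of what a decode_sequence-style parser could look like for that format. It assumes the three goods sub-sequences each have length gold_size (so goods has length gold_size * 3, matching goods_input_length), that empty sub-fields are zero-padded, and that ids fit in integer types; the helper _to_fixed_ints is invented for illustration, and this is not the book's actual Chapter 14 implementation.

# Hypothetical sketch of the omitted decode_sequence (TF 1.x string ops); field
# order and fixed lengths are assumptions taken from the format comment above.
def decode_sequence(line, gold_size, category_size, profile_size):
    # split on '|' into 6 fields: label | 浏览序列 | 点击序列 | 购买序列 | 类目兴趣序列 | 用户画像特征
    fields = tf.string_split([line], delimiter='|', skip_empty=False).values
    label = tf.string_to_number(fields[0], out_type=tf.int64)

    def _to_fixed_ints(field, size):
        # split on ',' -> cast to int -> truncate / zero-pad to a fixed length `size`
        ids = tf.string_to_number(
            tf.string_split([field], delimiter=',').values, out_type=tf.int32)
        return tf.reshape(tf.pad(ids, [[0, size]])[:size], [size])

    goods = tf.concat([_to_fixed_ints(fields[1], gold_size),   # 浏览序列
                       _to_fixed_ints(fields[2], gold_size),   # 点击序列
                       _to_fixed_ints(fields[3], gold_size)],  # 购买序列
                      axis=0)                                  # length = gold_size * 3
    category = _to_fixed_ints(fields[4], category_size)        # 类目兴趣序列
    profile = _to_fixed_ints(fields[5], profile_size)          # 用户画像特征
    return {"label": label, "goods": goods, "category": category, "profile": profile}

Mapped over a tf.data.TextLineDataset and batched, such a parser would yield the label/goods/category/profile tensors that process_data returns as next_element above.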
/推荐系统算法实践—源码下载/第19章Notebook实践/Debug_CF.scala: -------------------------------------------------------------------------------- 1 | import scala.math._ 2 | import org.apache.spark.sql.SparkSession 3 | import org.apache.spark.sql.Dataset 4 | import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder 5 | import org.apache.spark.sql.functions._ 6 | import org.apache.spark.sql._ 7 | import scala.collection.mutable.WrappedArray 8 | import scala.collection.JavaConverters._ 9 | import scala.collection.mutable.ArrayBuffer 10 | import scala.math._ 11 | import org.apache.spark.sql.SparkSession 12 | import org.apache.spark.sql.Dataset 13 | import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder 14 | import org.apache.spark.sql.functions._ 15 | import org.apache.spark.sql._ 16 | import scala.collection.mutable.WrappedArray 17 | import scala.collection.JavaConverters._ 18 | import scala.collection.mutable.ArrayBuffer 19 | 20 | import spark.implicits._ 21 | /** 22 | * ********************************* 23 | * 1 数据准备 24 | * 数据来源: 25 | * MovieLens 【数据地址:https://grouplens.org/datasets/movielens/】(1M、10M、20M 共三个数据集) 26 | * ********************************* 27 | */ 28 | // 1.1读取item配置表 29 | val item_conf_path = "hdfs://192.168.1.100:9000/Recommended_Algorithm_Action/I2I/movies.csv" 30 | val item_conf_df = spark.read.options(Map(("delimiter", ","), ("header", "true"))).csv(item_conf_path) 31 | item_conf_df.show(5,false) 32 | val item_id2title_map = item_conf_df.select("movieId", "title").collect().map(row => (row(0).toString(), row(1).toString())).toMap 33 | val item_id2genres_map = item_conf_df.select("movieId", "genres").collect().map(row => (row(0).toString(), row(1).toString())).toMap 34 | 35 | // 1.2读取用户行为数据 36 | val user_rating_path = "hdfs://192.168.1.100:9000/user/Recommended_Algorithm_Action/I2I/ratings.csv" 37 | val user_rating_df = spark.read.options(Map(("delimiter", ","), ("header", "true"))).csv(user_rating_path) 38 | user_rating_df.dtypes 39 | val user_ds = user_rating_df.map { 40 | case Row(userId: String, movieId: String, rating: String, timestamp: String) => 41 | ItemPref(userId, movieId, rating.toDouble) 42 | } 43 | user_ds.show(5, false) 44 | user_ds.cache() 45 | user_ds.count() 46 | 47 | // 1 (用户:物品) => (用户:(物品集合)) 48 | val user_ds1 = user_ds.groupBy("userid").agg(collect_set("itemid")).withColumnRenamed ("collect_set(itemid)", "itemid_set") 49 | user_ds1.show(2, false) 50 | 51 | // 2 物品:物品,上三角数据 52 | val user_ds2 = user_ds1.flatMap { row => 53 | val itemlist = row.getAs[scala.collection.mutable.WrappedArray[String]](1).toArray. sorted 54 | val result = new ArrayBuffer[(String, String, Double)]() 55 | for (i <- 0 to itemlist.length - 2) { 56 | for (j <- i + 1 to itemlist.length - 1) { 57 | result += ((itemlist(i), itemlist(j), 1.0)) 58 | } 59 | } 60 | result 61 | }.withColumnRenamed("_1", "itemidI").withColumnRenamed("_2", "itemidJ").withColumnRenamed("_3", "score") 62 | user_ds2.show(5, false) 63 | 64 | // 3 计算物品与物品,上三角,同现频次 65 | val user_ds3 = user_ds2.groupBy("itemidI", "itemidJ").agg(sum("score").as("sumIJ")) 66 | user_ds3. 
67 | show(5, false) 68 | 69 | // 4 计算物品总共出现的频次 70 | val user_ds0 = user_ds.withColumn("score", lit(1)).groupBy("itemid").agg(sum("score").as("score")) 71 | user_ds0.show(5, false) 72 | 73 | // 5 计算同现相似度 74 | val user_ds4 = user_ds3.join(user_ds0.withColumnRenamed("itemid", "itemidJ").withColumnRenamed("score", "sumJ").select("itemidJ", "sumJ"), "itemidJ") 75 | user_ds4.show(5, false) 76 | 77 | val user_ds5 = user_ds4.join(user_ds0.withColumnRenamed("itemid", "itemidI").withColumnRenamed("score", "sumI").select("itemidI", "sumI"), "itemidI") 78 | user_ds5.show(5, false) 79 | 80 | // 根据公式N(i)∩N(j)/sqrt(N(i)*N(j)) 计算 81 | val user_ds6 = user_ds5.withColumn("result", col("sumIJ") / sqrt(col("sumI") * col("sumJ"))) 82 | user_ds6.show(5, false) 83 | 84 | // 6 上、下三角合并 85 | println(s"user_ds6.count(): ${user_ds6.count()}") 86 | val user_ds8 = user_ds6.select("itemidI", "itemidJ", "result").union(user_ds6.select($"itemidJ".as("itemidI"), $"itemidI".as("itemidJ"), $"result")) 87 | println(s"user_ds8.count(): ${user_ds8.count()}") 88 | user_ds8.show(5, false) 89 | 90 | // 7 结果返回 91 | val out = user_ds8.select("itemidI", "itemidJ", "result").map { row => 92 | val itemidI = row.getString(0) 93 | val itemidJ = row.getString(1) 94 | val similar = row.getDouble(2) 95 | ItemSimi(itemidI, itemidJ, similar) 96 | } 97 | out.show(5, false) 98 | 99 | // 结果增加配置信息 100 | val item_id2title_map_BC = spark.sparkContext.broadcast(item_id2title_map) 101 | val item_id2genres_map_BC = spark.sparkContext.broadcast(item_id2genres_map) 102 | 103 | val items_similar_cooccurrence = out.map { 104 | case ItemSimi(itemidI: String, itemidJ: String, similar: Double) => 105 | val i_title = item_id2title_map_BC.value.getOrElse(itemidI, "") 106 | val j_title = item_id2title_map_BC.value.getOrElse(itemidJ, "") 107 | val i_genres = item_id2genres_map_BC.value.getOrElse(itemidI, "") 108 | val j_genres = item_id2genres_map_BC.value.getOrElse(itemidJ, "") 109 | (itemidI, itemidJ, similar, i_title, j_title, i_genres, j_genres) 110 | }.withColumnRenamed("_1", "itemidI"). 111 | withColumnRenamed("_2", "itemidJ"). 112 | withColumnRenamed("_3", "similar"). 113 | withColumnRenamed("_4", "i_title"). 114 | withColumnRenamed("_5", "j_title"). 115 | withColumnRenamed("_6", "i_genres"). 116 | withColumnRenamed("_7", "j_genres") 117 | items_similar_cooccurrence.columns 118 | items_similar_cooccurrence.cache() 119 | items_similar_cooccurrence.count 120 | 121 | // 查询结果信息,查询各种Case 122 | items_similar_cooccurrence. 123 | orderBy($"itemidI".asc, $"similar".desc). 124 | select("i_title", "j_title", "i_genres", "j_genres", "similar"). 
125 | show(20) 126 | 127 | // 3.1 同现相似度推荐 128 | val cooccurrence = items_similar_cooccurrence.select("itemidI", "itemidJ", "similar").map { 129 | case Row(itemidI: String, itemidJ: String, similar: Double) => 130 | ItemSimi(itemidI, itemidJ, similar) 131 | } 132 | cooccurrence.show(5) 133 | 134 | // 1 数据准备 135 | val items_similar_ds1 = cooccurrence 136 | val user_prefer_ds1 = user_ds 137 | 138 | // 2 根据用户的item召回相似物品 139 | val user_prefer_ds2 = items_similar_ds1.join(user_prefer_ds1, $"itemidI" === $"itemid", "inner") 140 | user_prefer_ds2.show(5) 141 | 142 | // 3 计算召回的用户物品得分 143 | val user_prefer_ds3 = user_prefer_ds2.withColumn("score", col("pref") * col("similar")).select("userid", "itemidJ", "score") 144 | user_prefer_ds3.show(5) 145 | 146 | // 4 得分汇总 147 | val user_prefer_ds4 = user_prefer_ds3.groupBy("userid", "itemidJ").agg(sum("score").as("score")).withColumnRenamed("itemidJ", "itemid") 148 | user_prefer_ds4.show(5) 149 | 150 | // 5 用户得分排序结果,去除用户已评分物品 151 | val user_prefer_ds5 = user_prefer_ds4.join(user_prefer_ds1, Seq("userid", "itemid"), "left").where("pref is null") 152 | user_prefer_ds5.show(5) 153 | 154 | // 6 结果返回 155 | val out1 = user_prefer_ds5.select("userid", "itemid", "score").map { row => 156 | val userid = row.getString(0) 157 | val itemid = row.getString(1) 158 | val pref = row.getDouble(2) 159 | UserRecomm(userid, itemid, pref) 160 | } 161 | 162 | // 结果增加配置信息 163 | val user_predictr_cooccurrence = out1.map { 164 | case UserRecomm(userid: String, itemid: String, pref: Double) => 165 | val title = item_id2title_map_BC.value.getOrElse(itemid, "") 166 | val genres = item_id2genres_map_BC.value.getOrElse(itemid, "") 167 | (userid, itemid, title, genres, pref) 168 | }.withColumnRenamed("_1", "userid"). 169 | withColumnRenamed("_2", "itemid"). 170 | withColumnRenamed("_3", "title"). 171 | withColumnRenamed("_4", "genres"). 
172 | withColumnRenamed("_5", "pref") 173 | user_predictr_cooccurrence.columns 174 | user_predictr_cooccurrence.cache() 175 | user_predictr_cooccurrence.count() 176 | 177 | // 查询结果信息,查询各种Case 178 | user_predictr_cooccurrence.orderBy($"userid".asc, $"pref".desc).show(20) 179 | 180 | 181 | 182 | 183 | -------------------------------------------------------------------------------- /推荐系统算法实践—源码下载/第19章Notebook实践/Debug_FM.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # ## 1)环境准备 5 | 6 | # In[1]: 7 | 8 | import numpy as np 9 | import tensorflow as tf 10 | import pandas as pd 11 | import random 12 | import math 13 | import re 14 | 15 | from sklearn import preprocessing 16 | from os import path, listdir 17 | from sklearn.datasets import load_svmlight_files 18 | from sklearn.model_selection import train_test_split 19 | from sklearn import metrics 20 | from tensorflow.contrib import layers 21 | 22 | import time 23 | import datetime 24 | 25 | import os 26 | os.environ["CUDA_VISIBLE_DEVICES"]="0" 27 | 28 | print tf.__version__ 29 | print tf.__path__ 30 | 31 | 32 | # ## 2)数据准备Dataset格式 33 | 34 | # In[2]: 35 | 36 | # 每一行解析,解析标签csv格式 37 | # 0.0,0.6666666666666666,0.5,0.0,0.0,0.0,0.0,0.7272727272727273,0.42857142857142855 38 | 39 | # 数据处理 40 | def process_data(data_type, my_path, feature_size, batch_size=32, num_epochs=1): 41 | filenames = get_file_list(my_path) 42 | next_element = read_my_file_format(data_type, filenames, feature_size, batch_size, num_epochs) 43 | return next_element 44 | 45 | # 创建session,指定GPU或者CPU使用率 46 | def get_session(gpu_fraction=0.1): 47 | gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=gpu_fraction, 48 | allow_growth=True) 49 | # server = tf.train.Server.create_local_server() 50 | return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) 51 | 52 | 53 | # ## 3)Debug代码 54 | 55 | # In[3]: 56 | 57 | """ 0 测试数据 """ 58 | filenames = '/data/data0/001' 59 | feature_size = 530 60 | fm_v_size = 10 61 | batch_size = 3 62 | num_epochs = 1 63 | data_type = 'libsvm' 64 | batch_data = process_data(data_type, filenames, feature_size, batch_size, num_epochs) 65 | print("%s: %s" % ("batch_data", batch_data)) 66 | 67 | 68 | # In[9]: 69 | 70 | """ 1 定义输入数据 """ 71 | # 标签:[batch_size, 1] 72 | labels = batch_data['labels'] 73 | # 用户特征向量:[batch_size, feature_size] 74 | dense_vector = tf.reshape(batch_data['dense_vector'], shape=[-1, feature_size, 1]) # None * feature_size * 1 75 | print("%s: %s" % ("dense_vector", dense_vector)) 76 | print("%s: %s" % ("labels", labels)) 77 | 78 | 79 | # In[10]: 80 | 81 | """ 2 定义网络输出 """ 82 | # FM参数,生成或者获取W V 83 | with tf.variable_scope("lr_layer", reuse=tf.AUTO_REUSE): 84 | FM_W = tf.get_variable(name='fm_w', shape=[feature_size, 1], initializer=tf.glorot_normal_initializer()) 85 | FM_V = tf.get_variable(name='fm_v', shape=[feature_size, fm_v_size], initializer=tf.glorot_normal_initializer()) 86 | FM_B = tf.Variable(tf.constant(0.0), dtype=tf.float32 ,name="fm_bias") # W0 87 | print("%s: %s" % ("FM_W", FM_W)) 88 | print("%s: %s" % ("FM_V", FM_V)) 89 | print("%s: %s" % ("FM_B", FM_B)) 90 | 91 | 92 | # In[11]: 93 | 94 | # ---------- w * x ---------- 95 | Y_first = tf.reduce_sum(tf.multiply(FM_W, dense_vector), 2) # None * F 96 | print("%s: %s" % ("Y_first", Y_first)) 97 | 98 | 99 | # In[12]: 100 | 101 | # ---------- Vij * Vij* Xij --------------- 102 | embeddings = tf.multiply(FM_V, dense_vector) # None * V * X 103 | print("%s: %s" % ("embeddings", embeddings)) 104 | # sum_square 
part 105 | summed_features_emb = tf.reduce_sum(embeddings, 1) # sum(v*x) 106 | summed_features_emb_square = tf.square(summed_features_emb) # (sum(v*x))^2 107 | 108 | # square_sum part 109 | squared_features_emb = tf.square(embeddings) # (v*x)^2 110 | squared_sum_features_emb = tf.reduce_sum(squared_features_emb, 1) # sum((v*x)^2) 111 | 112 | # second order 113 | Y_second = 0.5 * tf.subtract(summed_features_emb_square, squared_sum_features_emb) # 0.5*((sum(v*x))^2 - sum((v*x)^2)) 114 | print("%s: %s" % ("Y_second", Y_second)) 115 | 116 | 117 | # In[15]: 118 | 119 | # out = W * X + Vij * Vij* Xij 120 | FM_out_lay1 = tf.concat([Y_first, Y_second], axis=1) 121 | print("%s: %s" % ("FM_out_lay1", FM_out_lay1)) 122 | 123 | Y_Out = tf.reduce_sum(FM_out_lay1, 1) 124 | print("%s: %s" % ("Y_Out", Y_Out)) 125 | 126 | 127 | # In[16]: 128 | 129 | # out = out + bias 130 | y_d = tf.reshape(Y_Out,shape=[-1]) 131 | Y_bias = FM_B * tf.ones_like(y_d, dtype=tf.float32) # Y_bias 132 | Y_Out = tf.add(Y_Out, Y_bias, name='Y_Out') 133 | print("%s: %s" % ("Y_bias", Y_bias)) 134 | print("%s: %s" % ("Y_Out", Y_Out)) 135 | 136 | 137 | # In[17]: 138 | 139 | # ---------- score ---------- 140 | score=tf.nn.sigmoid(Y_Out,name='score') 141 | score=tf.reshape(score, shape=[-1, 1]) 142 | print("%s: %s" % ("score", score)) 143 | 144 | 145 | # In[18]: 146 | 147 | """ 3 定义损失函数和AUC指标 """ 148 | reg_type = 'l2_reg' 149 | loss_fuc = 'Cross_entropy' 150 | reg_param = 0.01 151 | learning_rate = 0.01 152 | print("%s: %s" % ("reg_type", reg_type)) 153 | print("%s: %s" % ("loss_fuc", loss_fuc)) 154 | print("%s: %s" % ("reg_param", reg_param)) 155 | print("%s: %s" % ("learning_rate", learning_rate)) 156 | 157 | 158 | # In[19]: 159 | 160 | # loss:Squared_error,Cross_entropy ,FTLR 161 | if reg_type == 'l1_reg': 162 | regularization = reg_param * tf.reduce_sum(tf.abs(FM_W)) 163 | elif reg_type == 'l2_reg': 164 | regularization = reg_param * tf.nn.l2_loss(FM_W) 165 | else: 166 | regularization = reg_param * tf.nn.l2_loss(FM_W) 167 | 168 | if loss_fuc == 'Squared_error': 169 | loss = tf.reduce_mean(tf.reduce_sum(tf.square(labels - score), reduction_indices=[1])) + regularization 170 | elif loss_fuc == 'Cross_entropy': 171 | loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=tf.reshape(Y_Out, [-1]), labels=tf.reshape(labels, [-1]))) + regularization 172 | elif loss_fuc == 'FTLR': 173 | loss = tf.reduce_mean(tf.reduce_sum(tf.square(labels - score), reduction_indices=[1])) + regularization 174 | 175 | 176 | # In[20]: 177 | 178 | # AUC 179 | auc = tf.metrics.auc(labels, score) 180 | print("%s: %s" % ("labels", labels)) 181 | print("%s: %s" % ("score", score)) 182 | 183 | 184 | # In[21]: 185 | 186 | # w为0的比例,w的平均值 187 | w_zero_ratio = tf.reduce_mean(tf.to_float(tf.abs(FM_W) <= 1.0e-5)) 188 | w_avg = tf.reduce_mean(FM_W) 189 | v_zero_ratio = tf.reduce_mean(tf.to_float(tf.abs(FM_V) <= 1.0e-5)) 190 | v_avg = tf.reduce_mean(FM_V) 191 | 192 | 193 | # In[22]: 194 | 195 | """ 4 设定optimizer """ 196 | global_step = tf.Variable(0, dtype=tf.int32, trainable=False, name='global_step') 197 | with tf.variable_scope("optimizer", reuse=tf.AUTO_REUSE): 198 | optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-8) 199 | train_step = optimizer.minimize(loss, global_step=global_step) 200 | 201 | 202 | # In[23]: 203 | 204 | """ 分步调试,对上面各个步骤中的变量值进行打印和查看,以方便定位问题 """ 205 | init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer(), tf.tables_initializer()) 206 | with 
tf.device('/cpu:0'): 207 | gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.2, allow_growth=True) 208 | sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) 209 | sess.run(init_op) 210 | a, b = sess.run([Y_Out, score]) 211 | print a 212 | print b 213 | -------------------------------------------------------------------------------- /推荐系统算法实践—源码下载/第19章Notebook实践/Debug_Sk_LR.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # ## 1)环境准备 5 | 6 | In [25]: 7 | from sklearn.linear_model import LogisticRegression 8 | from sklearn import metrics 9 | from os import path, listdir 10 | from sklearn.datasets import load_svmlight_files 11 | from sklearn.model_selection import train_test_split 12 | from sklearn.externals import joblib 13 | from sklearn import preprocessing 14 | import numpy as np 15 | import pandas as pd 16 | import random 17 | import platform 18 | print("Python Version: %s"%(platform.python_version())) 19 | 20 | In [26]: 21 | """ 22 | 处理libSVM数据方法,生成样本,支持Batch格式返回,也支持X/Y返回 23 | 步骤: 24 | 1)读取libSVM格式数据。 25 | 2)数据归一化处理。 26 | 3)划分训练集和测试集。 27 | 4)生成Batch数据。 28 | """ 29 | 30 | In [27]: 31 | # 数据测试 32 | data_path = '/data/data01/' 33 | test_rat=0.4 34 | random_seed=0 35 | train_batch_size=20000 36 | test_batch_size=20000 37 | feature_size=530 38 | 39 | # 获取样本数据 40 | data = process_data(data_path, feature_size, test_rat, random_seed, train_batch_size, test_batch_size) 41 | 42 | train_batch = data['train_batch'] 43 | test_batch = data['test_batch'] 44 | X_train = data['X_train'] 45 | Y_train = data['Y_train'] 46 | X_test = data['X_test'] 47 | Y_test = data['Y_test'] 48 | 49 | # 查看样本数据大小 50 | print("X_train.shape: ") 51 | print(X_train.shape) 52 | print("Y_train.shape: ") 53 | print(Y_train.shape) 54 | print("X_test.shape: ") 55 | print(X_test.shape) 56 | print("Y_test.shape: ") 57 | print(Y_test.shape) 58 | 59 | In [30]: 60 | # 3.1 建立逻辑回归模型,并且设定参数 61 | lr_model= LogisticRegression(penalty='l2', C=1000, solver='lbfgs', max_iter=500) 62 | 63 | # 3.2 训练逻辑回归模型 64 | lr_model.fit(X_train,Y_train.values.ravel()) 65 | 66 | In [31]: 67 | # 3.3 采用测试集验证模型离线指标 68 | # 训练集AUC 69 | probs_train= lr_model.predict_proba(X_train) 70 | AUC1 = metrics.roc_auc_score(Y_train, probs_train[:,1]) 71 | print("Train Auc: %s"%(AUC1)) 72 | 73 | # 测试集AUC 74 | probs_test= lr_model.predict_proba(X_test) 75 | predict_test = lr_model.predict(X_test) 76 | AUC2 = metrics.roc_auc_score(Y_test, probs_test[:,1]) 77 | print("Test Auc: %s"%(AUC2)) 78 | 79 | # 准确率 80 | accuracy = metrics.accuracy_score(Y_test, predict_test) 81 | print("Test Accuracy: %s"%(accuracy)) 82 | 83 | # 召回率 84 | recall = metrics.recall_score(Y_test, predict_test) 85 | print("Test Recall: %s"%(recall)) 86 | 87 | # F1值 88 | f1 = metrics.f1_score(Y_test, predict_test) 89 | print("Test F1: %s"%(f1)) 90 | 91 | In [42]: 92 | # 3.4 打印模型参数 93 | w=lr_model.coef_ 94 | print("参数大小:") 95 | print(w.shape) 96 | print("参数前10个:") 97 | print(lr_model.coef_[:,0:10]) 98 | print("截距:") 99 | print(lr_model.intercept_) 100 | print("稀疏化特征比率:%.2f%%" %(np.mean(lr_model.coef_.ravel()==0)*100)) 101 | print("sigmoid函数转化的值,即:概率p") 102 | print(lr_model.predict_proba(X_test[0:5])) 103 | 104 | In [43]: 105 | # 3.5 模型保存 106 | joblib.dump(lr_model,"logistic_lr.model") 107 | #模型加载 108 | load_lr = joblib.load("logistic_lr.model") 109 | print(load_lr.predict_proba(X_test[0:5])) 110 | --------------------------------------------------------------------------------
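The In [26] cell above documents the process_data helper (read libSVM data, normalize, split into train/test, build batches) but omits its body. Below is a minimal sklearn-based sketch of such a helper, written to match the call in In [27] and the dictionary keys used afterwards (X_train, Y_train, X_test, Y_test, train_batch, test_batch). The directory scan, MinMaxScaler normalization and the simple slice-based batching are assumptions for illustration, not necessarily the book's exact implementation.

# Hypothetical sketch of the omitted process_data helper described in In [26].
from os import path, listdir
import numpy as np
import pandas as pd
from sklearn.datasets import load_svmlight_files
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

def process_data(data_path, feature_size, test_rat=0.4, random_seed=0,
                 train_batch_size=20000, test_batch_size=20000):
    # 1) read every libSVM file under data_path
    files = [path.join(data_path, f) for f in sorted(listdir(data_path)) if not f.startswith('.')]
    parts = load_svmlight_files(files, n_features=feature_size)
    X = np.vstack([x.toarray() for x in parts[0::2]])   # even entries: feature matrices
    Y = np.concatenate(parts[1::2])                     # odd entries: label vectors
    # 2) min-max normalization
    X = preprocessing.MinMaxScaler().fit_transform(X)
    # 3) train/test split
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=test_rat, random_state=random_seed)
    X_train, X_test = pd.DataFrame(X_train), pd.DataFrame(X_test)
    Y_train, Y_test = pd.DataFrame(Y_train), pd.DataFrame(Y_test)
    # 4) build simple (X, Y) batches by row slicing
    def make_batches(X, Y, size):
        return [(X[i:i + size], Y[i:i + size]) for i in range(0, len(X), size)]
    return {'X_train': X_train, 'Y_train': Y_train,
            'X_test': X_test, 'Y_test': Y_test,
            'train_batch': make_batches(X_train, Y_train, train_batch_size),
            'test_batch': make_batches(X_test, Y_test, test_batch_size)}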
/推荐系统算法实践—源码下载/第19章Notebook实践/Debug_Spark_LR.scala: -------------------------------------------------------------------------------- 1 | import org.apache.spark.ml.classification.{ BinaryLogisticRegressionSummary, LogisticRegression, LogisticRegressionModel } 2 | import org.apache.spark.ml.evaluation.{ MulticlassClassificationEvaluator, BinaryClassificationEvaluator } 3 | import org.apache.spark.ml.linalg.{ Vector, Vectors } 4 | import org.apache.spark.sql.types._ 5 | import org.apache.spark.sql.functions._; import org.apache.spark.sql._ 6 | import org.apache.spark.sql.SparkSession 7 | import org.apache.spark.ml.feature._ 8 | 9 | /** 10 | * 读取libSVM格式的文件,生成训练样本和测试样本 11 | * 1)读取文件 12 | * 2)生成标签索引 13 | * 3)样本处理 14 | * 4)样本划分 15 | */ 16 | def readLibSvmSampleData(spark: SparkSession, dataPath: String) = { // 函数体省略,返回(训练样本, 测试样本) 17 | } 18 | 19 | //1 参数准备 20 | val dataPath = "hdfs://192.168.1.100:9000/Recommended_Algorithm_Action/data01/" 21 | val iter = 500 22 | val reg_param = 0.0 23 | val elastic_net_param = 0.0 24 | 25 | //2 训练样本准备 26 | val (training, test) = readLibSvmSampleData(spark, dataPath) 27 | training.cache() 28 | test.cache() 29 | println(s"training.count(): ${training.count()}") 30 | println(s"test.count(): ${test.count()}") 31 | training.show 32 | 33 | //3 建立逻辑回归模型 34 | val lr = new LogisticRegression(). 35 | setMaxIter(iter). 36 | setRegParam(reg_param). 37 | setElasticNetParam(elastic_net_param) 38 | 39 | //4 根据训练样本进行模型训练 40 | val lrModel = lr.fit(training) 41 | 42 | //5 打印模型信息 43 | println(s"Coefficients Top 10: ${lrModel.coefficients.toArray.slice(0, 10).mkString(" ")}") 44 | println(s"Intercept: ${lrModel.intercept}") 45 | 46 | //6 对模型进行测试 47 | val test_predict = lrModel.transform(test) 48 | test_predict.show 49 | test_predict.select("features", "label", "probability", "prediction").take(5).foreach { 50 | case Row(features: Vector, label: Double, prob: Vector, prediction: Double) => 51 | println(s"($features, $label) -> prob=$prob, prediction=$prediction") 52 | } 53 | 54 | 55 | //10 模型摘要 56 | val trainingSummary = lrModel.summary 57 | 58 | //11 每次迭代目标值 59 | val objectiveHistory = trainingSummary.objectiveHistory 60 | println("objectiveHistory:") 61 | objectiveHistory.foreach(loss => println(loss)) 62 | 63 | //12 计算模型指标数据 64 | val binarySummary = trainingSummary.asInstanceOf[BinaryLogisticRegressionSummary] 65 | 66 | //13 模型摘要AUC指标 67 | val roc = binarySummary.roc 68 | roc.show() 69 | val AUC = binarySummary.areaUnderROC 70 | println(s"areaUnderROC: ${binarySummary.areaUnderROC}") 71 | 72 | //14 测试集AUC指标 73 | val evaluator = new BinaryClassificationEvaluator(). 74 | setLabelCol("label"). 75 | setRawPredictionCol("probability"). 
76 | setMetricName("areaUnderROC") 77 | val testAUC = evaluator.evaluate(test_predict) 78 | println("Test AUC = " + testAUC) 79 | 80 | //15 设置模型阈值 81 | // 不同的阈值,计算不同的F1,然后通过最大的F1找出并重设模型的最佳阈值 82 | val fMeasure = binarySummary.fMeasureByThreshold 83 | fMeasure.show 84 | // 获得最大的F1值 85 | val maxFMeasure = fMeasure.select(max("F-Measure")).head().getDouble(0) 86 | // 找出最大F1值对应的阈值(最佳阈值) 87 | val bestThreshold = fMeasure.where($"F-Measure" === maxFMeasure).select("threshold").head().getDouble(0) 88 | // 将模型的Threshold设置为选择出来的最佳分类阈值 89 | lrModel.setThreshold(bestThreshold) 90 | 91 | 92 | //16 模型保存与加载 93 | // 保存 94 | lrModel.save("hdfs://192.168.1.100:9000/mlv2/lrmodel") 95 | // 加载 96 | val load_lrModel = LogisticRegressionModel.load("hdfs://192.168.1.100:9000/mlv2/lrmodel") 97 | // 加载测试 98 | val load_predict = load_lrModel.transform(test) 99 | load_predict.select("features", "label", "probability", "prediction").take(5).foreach { 100 | case Row(features: Vector, label: Double, prob: Vector, prediction: Double) => 101 | println(s"($features, $label) -> prob=$prob, prediction=$prediction") 102 | } 103 | -------------------------------------------------------------------------------- /推荐系统算法实践—源码下载/第19章Notebook实践/Debug_TF_LR.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # ## 1)环境准备 5 | 6 | In [1]: 7 | import numpy as np 8 | import tensorflow as tf 9 | import pandas as pd 10 | import random 11 | import math 12 | import re 13 | 14 | from sklearn import preprocessing 15 | from os import path, listdir 16 | from sklearn.datasets import load_svmlight_files 17 | from sklearn.model_selection import train_test_split 18 | from sklearn import metrics 19 | from tensorflow.contrib import layers 20 | 21 | from sklearn import metrics 22 | 23 | import time 24 | import datetime 25 | 26 | import os 27 | os.environ["CUDA_VISIBLE_DEVICES"]="0" 28 | 29 | print tf.__version__ 30 | print tf.__path__ 31 | 32 | In [6]: 33 | """ 34 | 解析CSV格式,对输入的每一行样本进行格式解析,返回labels和dense_vector格式数据 35 | 例如输入CSV格式字符串: 0.0,0.6666666666666666,0.5,0.0,0.0,0.0,0.0,0.7272727272727273,0.42857142857142855 36 | 函数参数: 37 | line:需要解析的字符串 38 | feature_size:特征长度 39 | 函数返回: 40 | 返回字典,格式:{'labels': labels, 'dense_vector': dense_vector} 41 | labels:样本的labels 42 | dense_vector:样本的特征向量 43 | """ 44 | 45 | In [8]: 46 | # 测试数据 47 | filenames = '/data/all-csv' 48 | feature_size = 530 49 | batch_size = 3 50 | num_epochs = 1 51 | data_type = 'csv' 52 | next_element = process_data(data_type, filenames, feature_size, batch_size, num_epochs) 53 | print next_element['dense_vector'] 54 | print next_element['labels'] 55 | 56 | gpu_fraction = 0.2 57 | my_device='/gpu:0' 58 | init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer(), tf.tables_initializer()) 59 | with tf.device(my_device): 60 | sess = get_session(gpu_fraction) 61 | sess.run(init_op) 62 | dense_vector, labels = sess.run([next_element['dense_vector'],next_element['labels']]) 63 | print dense_vector 64 | print labels 65 | 66 | In [7]: 67 | #基于逻辑回归的网络结构在TensorFlow中实现逻辑回归模型。其中“LR模型”代码省略,具体内容可以参考6.2.3节中的相关代码。 68 | class LR(object): 69 | """ 初始化成员变量 """ 70 | def __init__(self, feature_size, loss_fuc, train_optimizer, learning_rate, reg_type, reg_param): 71 | # 特征向量长度 72 | self.feature_size = feature_size 73 | # 损失函数 74 | self.loss_fuc = loss_fuc 75 | # 优化方法 76 | self.train_optimizer = train_optimizer 77 | # 学习率 78 | self.learning_rate = learning_rate 79 | # 正则类型 80 | self.reg_type = reg_type 81 | # 正则因子 82 | 
self.reg_param = reg_param 83 | # aglobal_step 84 | self.global_step = tf.Variable(0, dtype=tf.int32, trainable=False, name='global_step') 85 | 86 | def train(self, batch_data): 87 | """ 1 定义输入数据 """ 88 | with tf.name_scope('input_data'): 89 | 90 | 91 | In [9]: 92 | # 数据准备 93 | filenames = '/data/csv-all' 94 | data_type='csv' 95 | feature_size = 530 96 | batch_size = 60000 97 | num_epochs = 200 98 | next_element = process_data(data_type, filenames, feature_size, batch_size, num_epochs) 99 | 100 | # 模型参数 101 | loss_fuc = 'Squared_error' 102 | train_optimizer = 'Adam' 103 | learning_rate = 0.01 104 | reg_type = 'l2_reg' 105 | reg_param = 0.0 106 | log_path='/data/log/Squared_error_lr_L2_0_20180816_01' 107 | 108 | # 开始训练 109 | bea_model = LR(feature_size, loss_fuc, train_optimizer, learning_rate, reg_type, reg_param) 110 | Y_Out, score, regularization, loss, auc, train_step, w_zero_ratio, w_avg, labels, score, summary_op = bea_model.train(next_element) 111 | 112 | init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer(), tf.tables_initializer()) 113 | gpu_fraction = 0.4 114 | my_device='/gpu:0' 115 | with tf.device(my_device): 116 | sess = get_session(gpu_fraction) 117 | sess.run(init_op) 118 | batch_cnt = 0 119 | #选定可视化存储目录 120 | writer = tf.summary.FileWriter(log_path, sess.graph) 121 | try: 122 | while True: 123 | batch_cnt = batch_cnt + 1 124 | a, b, c, d, e, summary = sess.run([loss, auc, w_zero_ratio, w_avg, train_step, summary_op]) 125 | if batch_cnt % 50 == 0 or batch_cnt <= 10: 126 | y, p = sess.run([labels, score]) 127 | if y.sum() > 0.0: 128 | batch_auc=metrics.roc_auc_score(y, p) 129 | else: 130 | batch_auc=0.0 131 | print("batch: {} loss: {:.4f} accumulate_auc: {:.4f} batch_auc: {:.4f} w_zero_ratio: {:.4f} w_avg: {:.4f}".format(batch_cnt, a, b[0], batch_auc, c, d)) 132 | writer.add_summary(summary, batch_cnt) 133 | except tf.errors.OutOfRangeError: 134 | print("3、Train end of dataset") 135 | 136 | -------------------------------------------------------------------------------- /推荐系统算法实践—源码下载/第4章协同过滤/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WallaceLiu/recommendation_algorithm/36cdd077446ab7a2831c168f7419b058bd2fcbb0/推荐系统算法实践—源码下载/第4章协同过滤/.DS_Store -------------------------------------------------------------------------------- /推荐系统算法实践—源码下载/第4章协同过滤/I2iTest.scala: -------------------------------------------------------------------------------- 1 | package book_code 2 | 3 | import scala.math._ 4 | import org.apache.spark.sql.SparkSession 5 | 6 | import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder 7 | import org.apache.spark.sql.Encoder 8 | import org.apache.spark.sql.types._ 9 | import org.apache.spark.sql.functions._ 10 | import org.apache.spark.sql._ 11 | import scala.collection.mutable.WrappedArray 12 | import scala.collection.JavaConverters._ 13 | import scala.collection.mutable.ArrayBuffer 14 | 15 | object I2iTest { 16 | 17 | def main(args: Array[String]): Unit = { 18 | 19 | val spark = SparkSession 20 | .builder 21 | .appName("I2iTest") 22 | .enableHiveSupport() 23 | .getOrCreate() 24 | 25 | import spark.implicits._ 26 | 27 | /** 28 | * ********************************* 29 | * 1 数据准备 30 | * 数据来源: 31 | * MovieLens 【数据地址:https://grouplens.org/datasets/movielens/】(1M、10M、20M 共三个数据集) 32 | * ********************************* 33 | */ 34 | 35 | // 1.1读取item配置表 36 | val item_conf_path = "hdfs://1.1.1.1:9000/I2I/movies.csv" 37 | val item_conf_df = 
spark.read.options(Map(("delimiter", ","), ("header", "true"))).csv(item_conf_path) 38 | val item_id2title_map = item_conf_df.select("movieId", "title").collect().map(row => (row(0).toString(), row(1).toString())).toMap 39 | val item_id2genres_map = item_conf_df.select("movieId", "genres").collect().map(row => (row(0).toString(), row(1).toString())).toMap 40 | 41 | // 1.2读取用户行为数据 42 | val user_rating_path = "hdfs://1.1.1.1:9000/I2I/ratings.csv" 43 | val user_rating_df = spark.read.options(Map(("delimiter", ","), ("header", "true"))).csv(user_rating_path) 44 | 45 | user_rating_df.dtypes 46 | val user_ds = user_rating_df.map { 47 | case Row(userId: String, movieId: String, rating: String, timestamp: String) => 48 | ItemPref(userId, movieId, rating.toDouble) 49 | } 50 | println("user_ds.show(10)") 51 | user_ds.show(10) 52 | user_ds.cache() 53 | user_ds.count() 54 | 55 | /** 56 | * ********************************* 57 | * 2 相似度计算 58 | * ********************************* 59 | */ 60 | val item_id2title_map_BC = spark.sparkContext.broadcast(item_id2title_map) 61 | val item_id2genres_map_BC = spark.sparkContext.broadcast(item_id2genres_map) 62 | 63 | // 2.1 同现相似度 64 | val items_similar_cooccurrence = ItemSimilarity.CooccurrenceSimilarity(user_ds).map { 65 | case ItemSimi(itemidI: String, itemidJ: String, similar: Double) => 66 | val i_title = item_id2title_map_BC.value.getOrElse(itemidI, "") 67 | val j_title = item_id2title_map_BC.value.getOrElse(itemidJ, "") 68 | val i_genres = item_id2genres_map_BC.value.getOrElse(itemidI, "") 69 | val j_genres = item_id2genres_map_BC.value.getOrElse(itemidJ, "") 70 | (itemidI, itemidJ, similar, i_title, j_title, i_genres, j_genres) 71 | }.withColumnRenamed("_1", "itemidI"). 72 | withColumnRenamed("_2", "itemidJ"). 73 | withColumnRenamed("_3", "similar"). 74 | withColumnRenamed("_4", "i_title"). 75 | withColumnRenamed("_5", "j_title"). 76 | withColumnRenamed("_6", "i_genres"). 77 | withColumnRenamed("_7", "j_genres") 78 | items_similar_cooccurrence.columns 79 | // 结果打打印 80 | items_similar_cooccurrence.cache() 81 | items_similar_cooccurrence.count 82 | println("items_similar_cooccurrence.show(20)") 83 | items_similar_cooccurrence. 84 | orderBy($"itemidI".asc, $"similar".desc). 85 | select("i_title", "j_title", "i_genres", "j_genres", "similar"). 86 | show(20) 87 | 88 | // 2.2 余弦相似度 89 | val items_similar_cosine = ItemSimilarity.CosineSimilarity(user_ds).map { 90 | case ItemSimi(itemidI: String, itemidJ: String, similar: Double) => 91 | val i_title = item_id2title_map_BC.value.getOrElse(itemidI, "") 92 | val j_title = item_id2title_map_BC.value.getOrElse(itemidJ, "") 93 | val i_genres = item_id2genres_map_BC.value.getOrElse(itemidI, "") 94 | val j_genres = item_id2genres_map_BC.value.getOrElse(itemidJ, "") 95 | (itemidI, itemidJ, similar, i_title, j_title, i_genres, j_genres) 96 | }.withColumnRenamed("_1", "itemidI"). 97 | withColumnRenamed("_2", "itemidJ"). 98 | withColumnRenamed("_3", "similar"). 99 | withColumnRenamed("_4", "i_title"). 100 | withColumnRenamed("_5", "j_title"). 101 | withColumnRenamed("_6", "i_genres"). 102 | withColumnRenamed("_7", "j_genres") 103 | items_similar_cosine.columns 104 | // 结果打打印 105 | items_similar_cosine.cache() 106 | items_similar_cosine.count 107 | println("items_similar_cosine.show(20)") 108 | items_similar_cosine. 109 | orderBy($"itemidI".asc, $"similar".desc). 110 | select("i_title", "j_title", "i_genres", "j_genres", "similar"). 
111 | show(20) 112 | 113 | // 2.3 欧氏距离相似度 114 | val items_similar_euclidean = ItemSimilarity.EuclideanDistanceSimilarity(user_ds).map { 115 | case ItemSimi(itemidI: String, itemidJ: String, similar: Double) => 116 | val i_title = item_id2title_map_BC.value.getOrElse(itemidI, "") 117 | val j_title = item_id2title_map_BC.value.getOrElse(itemidJ, "") 118 | val i_genres = item_id2genres_map_BC.value.getOrElse(itemidI, "") 119 | val j_genres = item_id2genres_map_BC.value.getOrElse(itemidJ, "") 120 | (itemidI, itemidJ, similar, i_title, j_title, i_genres, j_genres) 121 | }.withColumnRenamed("_1", "itemidI"). 122 | withColumnRenamed("_2", "itemidJ"). 123 | withColumnRenamed("_3", "similar"). 124 | withColumnRenamed("_4", "i_title"). 125 | withColumnRenamed("_5", "j_title"). 126 | withColumnRenamed("_6", "i_genres"). 127 | withColumnRenamed("_7", "j_genres") 128 | items_similar_euclidean.columns 129 | // 结果打打印 130 | items_similar_euclidean.cache() 131 | items_similar_euclidean.count 132 | println("items_similar_euclidean.show(20)") 133 | items_similar_euclidean. 134 | orderBy($"itemidI".asc, $"similar".desc). 135 | select("i_title", "j_title", "i_genres", "j_genres", "similar"). 136 | show(20) 137 | 138 | /** 139 | * ********************************* 140 | * 3 推荐计算 141 | * ********************************* 142 | */ 143 | 144 | // 推荐结果计算 145 | // 3.1 同现相似度推荐 146 | val cooccurrence = items_similar_cooccurrence.select("itemidI", "itemidJ", "similar").map { 147 | case Row(itemidI: String, itemidJ: String, similar: Double) => 148 | ItemSimi(itemidI, itemidJ, similar) 149 | } 150 | val user_predictr_cooccurrence = ItemSimilarity.Recommend(cooccurrence, user_ds).map { 151 | case UserRecomm(userid: String, itemid: String, pref: Double) => 152 | val title = item_id2title_map_BC.value.getOrElse(itemid, "") 153 | val genres = item_id2genres_map_BC.value.getOrElse(itemid, "") 154 | (userid, itemid, title, genres, pref) 155 | }.withColumnRenamed("_1", "userid"). 156 | withColumnRenamed("_2", "itemid"). 157 | withColumnRenamed("_3", "title"). 158 | withColumnRenamed("_4", "genres"). 159 | withColumnRenamed("_5", "pref") 160 | user_predictr_cooccurrence.columns 161 | user_predictr_cooccurrence.cache() 162 | user_predictr_cooccurrence.count() 163 | println("user_predictr_cooccurrence.show(20)") 164 | user_predictr_cooccurrence.orderBy($"userid".asc, $"pref".desc).show(20) 165 | 166 | // 3.2 余弦相似度推荐 167 | val cosine = items_similar_cosine.select("itemidI", "itemidJ", "similar").map { 168 | case Row(itemidI: String, itemidJ: String, similar: Double) => 169 | ItemSimi(itemidI, itemidJ, similar) 170 | } 171 | val user_predictr_cosine = ItemSimilarity.Recommend(cosine, user_ds).map { 172 | case UserRecomm(userid: String, itemid: String, pref: Double) => 173 | val title = item_id2title_map_BC.value.getOrElse(itemid, "") 174 | val genres = item_id2genres_map_BC.value.getOrElse(itemid, "") 175 | (userid, itemid, title, genres, pref) 176 | }.withColumnRenamed("_1", "userid"). 177 | withColumnRenamed("_2", "itemid"). 178 | withColumnRenamed("_3", "title"). 179 | withColumnRenamed("_4", "genres"). 
180 | withColumnRenamed("_5", "pref") 181 | user_predictr_cosine.columns 182 | user_predictr_cosine.cache() 183 | user_predictr_cosine.count() 184 | println("user_predictr_cosine.show(20)") 185 | user_predictr_cosine.orderBy($"userid".asc, $"pref".desc).show(20) 186 | 187 | // 3.3 欧氏距离相似度推荐 188 | val euclidean = items_similar_euclidean.select("itemidI", "itemidJ", "similar").map { 189 | case Row(itemidI: String, itemidJ: String, similar: Double) => 190 | ItemSimi(itemidI, itemidJ, similar) 191 | } 192 | val user_predictr_euclidean = ItemSimilarity.Recommend(euclidean, user_ds).map { 193 | case UserRecomm(userid: String, itemid: String, pref: Double) => 194 | val title = item_id2title_map_BC.value.getOrElse(itemid, "") 195 | val genres = item_id2genres_map_BC.value.getOrElse(itemid, "") 196 | (userid, itemid, title, genres, pref) 197 | }.withColumnRenamed("_1", "userid"). 198 | withColumnRenamed("_2", "itemid"). 199 | withColumnRenamed("_3", "title"). 200 | withColumnRenamed("_4", "genres"). 201 | withColumnRenamed("_5", "pref") 202 | user_predictr_euclidean.columns 203 | user_predictr_euclidean.cache() 204 | user_predictr_euclidean.count() 205 | println("user_predictr_euclidean.show(20)") 206 | user_predictr_euclidean.orderBy($"userid".asc, $"itemid".desc).show(20) 207 | 208 | // 推荐结果保存 209 | val table_date = 20181025 210 | val recommend_table = "table_i2i_recommend_result" 211 | user_predictr_cooccurrence.createOrReplaceTempView("df_to_hive_table") 212 | val insertSql1 = s"insert overwrite table ${recommend_table} partition(ds=${table_date}) select userid, itemid, pref from df_to_hive_table" 213 | println(insertSql1) 214 | // spark.sql(insertSql1) 215 | 216 | } 217 | 218 | } -------------------------------------------------------------------------------- /推荐系统算法实践—源码下载/第4章协同过滤/ItemSimilarity.scala: -------------------------------------------------------------------------------- 1 | package book_code 2 | 3 | import scala.math._ 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.Dataset 6 | import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder 7 | import org.apache.spark.sql.functions._ 8 | import org.apache.spark.sql._ 9 | import scala.collection.mutable.WrappedArray 10 | import scala.collection.JavaConverters._ 11 | import scala.collection.mutable.ArrayBuffer 12 | 13 | object ItemSimilarity extends Serializable { 14 | 15 | import org.apache.spark.sql.functions._ 16 | 17 | /** 18 | * 关联规则计算. 
19 | * 支持度(Support):在所有项集中{X, Y}出现的可能性,即项集中同时含有X和Y的概率,P(X U Y)/P(I),I是总事务集 20 | * 置信度(Confidence):在先决条件X发生的条件下,关联结果Y发生的概率,P(X U Y)/P(X) 21 | * 提升度(lift):在含有X的条件下同时含有Y的可能性与没有X这个条件下项集中含有Y的可能性之比,confidence(X => Y)/P(Y) 22 | * @param user_rdd 用户评分 23 | * @param RDD[ItemAssociation] 返回物品相似度 24 | * 25 | */ 26 | def AssociationRules(user_ds: Dataset[ItemPref]): Dataset[ItemAssociation] = { 27 | import user_ds.sparkSession.implicits._ 28 | // 1 (用户:物品) => (用户:(物品集合)) 29 | val user_ds1 = user_ds.groupBy("userid").agg(collect_set("itemid")).withColumnRenamed("collect_set(itemid)", "itemid_set") 30 | 31 | // 2 物品:物品,上三角数据 32 | val user_ds2 = user_ds1.flatMap { row => 33 | val itemlist = row.getAs[WrappedArray[String]](1).toArray.sorted 34 | val result = new ArrayBuffer[(String, String, Double)]() 35 | for (i <- 0 to itemlist.length - 2) { 36 | for (j <- i + 1 to itemlist.length - 1) { 37 | result += ((itemlist(i), itemlist(j), 1.0)) 38 | } 39 | } 40 | result 41 | }.withColumnRenamed("_1", "itemidI").withColumnRenamed("_2", "itemidJ").withColumnRenamed("_3", "score") 42 | 43 | // 3 计算物品与物品,上三角,同现频次 44 | val user_ds3 = user_ds2.groupBy("itemidI", "itemidJ").agg(sum("score").as("sumIJ")) 45 | 46 | //4 计算物品总共出现的频次 47 | val user_ds0 = user_ds.withColumn("score", lit(1)).groupBy("itemid").agg(sum("score").as("score")) 48 | val user_all = user_ds1.count 49 | 50 | //5 计算支持度(Support) 51 | val user_ds4 = user_ds3.select("itemidI", "itemidJ", "sumIJ"). 52 | union(user_ds3.select($"itemidJ".as("itemidI"), $"itemidI".as("itemidJ"), $"sumIJ")). 53 | withColumn("support", $"sumIJ" / user_all.toDouble) 54 | 55 | // user_ds4.orderBy($"support".desc).show 56 | 57 | //6 置信度(Confidence) 58 | val user_ds5 = user_ds4. 59 | join(user_ds0.withColumnRenamed("itemid", "itemidI").withColumnRenamed("score", "sumI"), "itemidI"). 60 | withColumn("confidence", $"sumIJ" / $"sumI") 61 | 62 | // user_ds5.orderBy($"confidence".desc).show 63 | 64 | //7 提升度(lift) 65 | val user_ds6 = user_ds5. 66 | join(user_ds0.withColumnRenamed("itemid", "itemidJ").withColumnRenamed("score", "sumJ"), "itemidJ"). 67 | withColumn("lift", $"confidence" / ($"sumJ" / user_all.toDouble)) 68 | 69 | // user_ds6.orderBy($"lift".desc).show 70 | 71 | // 计算同现相似度 72 | val user_ds8 = user_ds6.withColumn("similar", col("sumIJ") / sqrt(col("sumI") * col("sumJ"))) 73 | // user_ds8.orderBy($"similar".desc).show 74 | 75 | // 8 结果返回 76 | val out = user_ds8.select("itemidI", "itemidJ", "support", "confidence", "lift", "similar").map { row => 77 | val itemidI = row.getString(0) 78 | val itemidJ = row.getString(1) 79 | val support = row.getDouble(2) 80 | val confidence = row.getDouble(3) 81 | val lift = row.getDouble(4) 82 | val similar = row.getDouble(5) 83 | ItemAssociation(itemidI, itemidJ, support, confidence, lift, similar) 84 | } 85 | out 86 | } 87 | 88 | /** 89 | * 余弦相似度矩阵计算. 90 | * T(x,y) = ∑x(i)y(i) / sqrt(∑(x(i)*x(i))) * sqrt(∑(y(i)*y(i))) 91 | * @param user_rdd 用户评分 92 | * @param RDD[ItemSimi] 返回物品相似度 93 | * 94 | */ 95 | def CosineSimilarity(user_ds: Dataset[ItemPref]): Dataset[ItemSimi] = { 96 | import user_ds.sparkSession.implicits._ 97 | 98 | // 1 数据做准备 99 | val user_ds1 = user_ds. 100 | withColumn("iv", concat_ws(":", $"itemid", $"pref")). 101 | groupBy("userid").agg(collect_set("iv")). 102 | withColumnRenamed("collect_set(iv)", "itemid_set"). 
103 | select("userid", "itemid_set") 104 | 105 | // 2 物品:物品,上三角数据 106 | val user_ds2 = user_ds1.flatMap { row => 107 | val itemlist = row.getAs[scala.collection.mutable.WrappedArray[String]](1).toArray.sorted 108 | val result = new ArrayBuffer[(String, String, Double, Double)]() 109 | for (i <- 0 to itemlist.length - 2) { 110 | for (j <- i + 1 to itemlist.length - 1) { 111 | result += ((itemlist(i).split(":")(0), itemlist(j).split(":")(0), itemlist(i).split(":")(1).toDouble, itemlist(j).split(":")(1).toDouble)) 112 | } 113 | } 114 | result 115 | }.withColumnRenamed("_1", "itemidI").withColumnRenamed("_2", "itemidJ").withColumnRenamed("_3", "scoreI").withColumnRenamed("_4", "scoreJ") 116 | 117 | // 3 按照公式计算sim 118 | // x*y = ∑x(i)y(i) 119 | // |x|^2 = ∑(x(i)*x(i)) 120 | // |y|^2 = ∑(y(i)*y(i)) 121 | // result = x*y / sqrt(|x|^2) * sqrt(|y|^2) 122 | val user_ds3 = user_ds2. 123 | withColumn("cnt", lit(1)). 124 | groupBy("itemidI", "itemidJ"). 125 | agg(sum(($"scoreI" * $"scoreJ")).as("sum_xy"), 126 | sum(($"scoreI" * $"scoreI")).as("sum_x"), 127 | sum(($"scoreJ" * $"scoreJ")).as("sum_y")). 128 | withColumn("result", $"sum_xy" / (sqrt($"sum_x") * sqrt($"sum_y"))) 129 | 130 | // 4 上、下三角合并 131 | val user_ds8 = user_ds3.select("itemidI", "itemidJ", "result"). 132 | union(user_ds3.select($"itemidJ".as("itemidI"), $"itemidI".as("itemidJ"), $"result")) 133 | 134 | // 5 结果返回 135 | val out = user_ds8.select("itemidI", "itemidJ", "result").map { row => 136 | val itemidI = row.getString(0) 137 | val itemidJ = row.getString(1) 138 | val similar = row.getDouble(2) 139 | ItemSimi(itemidI, itemidJ, similar) 140 | } 141 | out 142 | } 143 | 144 | /** 145 | * 欧氏距离相似度矩阵计算. 146 | * d(x, y) = sqrt(∑((x(i)-y(i)) * (x(i)-y(i)))) 147 | * sim(x, y) = n / (1 + d(x, y)) 148 | * @param user_rdd 用户评分 149 | * @param RDD[ItemSimi] 返回物品相似度 150 | * 151 | */ 152 | def EuclideanDistanceSimilarity(user_ds: Dataset[ItemPref]): Dataset[ItemSimi] = { 153 | import user_ds.sparkSession.implicits._ 154 | 155 | // 1 数据做准备 156 | val user_ds1 = user_ds. 157 | withColumn("iv", concat_ws(":", $"itemid", $"pref")). 158 | groupBy("userid").agg(collect_set("iv")). 159 | withColumnRenamed("collect_set(iv)", "itemid_set"). 160 | select("userid", "itemid_set") 161 | 162 | // 2 物品:物品,上三角数据 163 | val user_ds2 = user_ds1.flatMap { row => 164 | val itemlist = row.getAs[scala.collection.mutable.WrappedArray[String]](1).toArray.sorted 165 | val result = new ArrayBuffer[(String, String, Double, Double)]() 166 | for (i <- 0 to itemlist.length - 2) { 167 | for (j <- i + 1 to itemlist.length - 1) { 168 | result += ((itemlist(i).split(":")(0), itemlist(j).split(":")(0), itemlist(i).split(":")(1).toDouble, itemlist(j).split(":")(1).toDouble)) 169 | } 170 | } 171 | result 172 | }.withColumnRenamed("_1", "itemidI").withColumnRenamed("_2", "itemidJ").withColumnRenamed("_3", "scoreI").withColumnRenamed("_4", "scoreJ") 173 | 174 | // 3 按照公式计算sim 175 | // dist = sqrt(∑((x(i)-y(i)) * (x(i)-y(i)))) 176 | // cntSum = sum(1) 177 | // result = cntSum / (1 + dist) 178 | val user_ds3 = user_ds2. 179 | withColumn("cnt", lit(1)). 180 | groupBy("itemidI", "itemidJ"). 181 | agg(sqrt(sum(($"scoreI" - $"scoreJ") * ($"scoreI" - $"scoreJ"))).as("dist"), sum($"cnt").as("cntSum")). 
182 | withColumn("result", $"cntSum" / (lit(1.0) + $"dist")) 183 | 184 | // 4 上、下三角合并 185 | val user_ds8 = user_ds3.select("itemidI", "itemidJ", "result").union(user_ds3.select($"itemidJ".as("itemidI"), $"itemidI".as("itemidJ"), $"result")) 186 | 187 | // 5 结果返回 188 | val out = user_ds8.select("itemidI", "itemidJ", "result").map { row => 189 | val itemidI = row.getString(0) 190 | val itemidJ = row.getString(1) 191 | val similar = row.getDouble(2) 192 | ItemSimi(itemidI, itemidJ, similar) 193 | } 194 | out 195 | } 196 | 197 | /** 198 | * 同现相似度矩阵计算. 199 | * w(i,j) = N(i)∩N(j)/sqrt(N(i)*N(j)) 200 | * @param user_rdd 用户评分 201 | * @param RDD[ItemSimi] 返回物品相似度 202 | * 203 | */ 204 | def CooccurrenceSimilarity(user_ds: Dataset[ItemPref]): Dataset[ItemSimi] = { 205 | import user_ds.sparkSession.implicits._ 206 | 207 | // 1 (用户:物品) => (用户:(物品集合)) 208 | val user_ds1 = user_ds.groupBy("userid").agg(collect_set("itemid")).withColumnRenamed("collect_set(itemid)", "itemid_set") 209 | 210 | // 2 物品:物品,上三角数据 211 | val user_ds2 = user_ds1.flatMap { row => 212 | val itemlist = row.getAs[scala.collection.mutable.WrappedArray[String]](1).toArray.sorted 213 | val result = new ArrayBuffer[(String, String, Double)]() 214 | for (i <- 0 to itemlist.length - 2) { 215 | for (j <- i + 1 to itemlist.length - 1) { 216 | result += ((itemlist(i), itemlist(j), 1.0)) 217 | } 218 | } 219 | result 220 | }.withColumnRenamed("_1", "itemidI").withColumnRenamed("_2", "itemidJ").withColumnRenamed("_3", "score") 221 | 222 | // 3 计算物品与物品,上三角,同现频次 223 | val user_ds3 = user_ds2.groupBy("itemidI", "itemidJ").agg(sum("score").as("sumIJ")) 224 | 225 | // 4 计算物品总共出现的频次 226 | val user_ds0 = user_ds.withColumn("score", lit(1)).groupBy("itemid").agg(sum("score").as("score")) 227 | 228 | // 5 计算同现相似度 229 | val user_ds4 = user_ds3.join(user_ds0.withColumnRenamed("itemid", "itemidJ").withColumnRenamed("score", "sumJ").select("itemidJ", "sumJ"), "itemidJ") 230 | 231 | val user_ds5 = user_ds4.join(user_ds0.withColumnRenamed("itemid", "itemidI").withColumnRenamed("score", "sumI").select("itemidI", "sumI"), "itemidI") 232 | 233 | // 根据公式N(i)∩N(j)/sqrt(N(i)*N(j)) 计算 234 | val user_ds6 = user_ds5.withColumn("result", col("sumIJ") / sqrt(col("sumI") * col("sumJ"))) 235 | 236 | // 6 上、下三角合并 237 | println(s"user_ds6.count(): ${user_ds6.count()}") 238 | val user_ds8 = user_ds6.select("itemidI", "itemidJ", "result").union(user_ds6.select($"itemidJ".as("itemidI"), $"itemidI".as("itemidJ"), $"result")) 239 | println(s"user_ds8.count(): ${user_ds8.count()}") 240 | 241 | // 7 结果返回 242 | val out = user_ds8.select("itemidI", "itemidJ", "result").map { row => 243 | val itemidI = row.getString(0) 244 | val itemidJ = row.getString(1) 245 | val similar = row.getDouble(2) 246 | ItemSimi(itemidI, itemidJ, similar) 247 | } 248 | out 249 | } 250 | 251 | /** 252 | * 计算推荐结果. 
253 | * @param items_similar 物品相似矩阵 254 | * @param user_prefer 用户评分表 255 | * @param RDD[UserRecomm] 返回用户推荐结果 256 | * 257 | */ 258 | def Recommend(items_similar: Dataset[ItemSimi], 259 | user_prefer: Dataset[ItemPref]): Dataset[UserRecomm] = { 260 | import user_prefer.sparkSession.implicits._ 261 | 262 | // 1 数据准备 263 | val items_similar_ds1 = items_similar 264 | val user_prefer_ds1 = user_prefer 265 | // 2 根据用户的item召回相似物品 266 | val user_prefer_ds2 = items_similar_ds1.join(user_prefer_ds1, $"itemidI" === $"itemid", "inner") 267 | // user_prefer_ds2.show() 268 | // 3 计算召回的用户物品得分 269 | val user_prefer_ds3 = user_prefer_ds2.withColumn("score", col("pref") * col("similar")).select("userid", "itemidJ", "score") 270 | // user_prefer_ds3.show() 271 | // 4 得分汇总 272 | val user_prefer_ds4 = user_prefer_ds3.groupBy("userid", "itemidJ").agg(sum("score").as("score")).withColumnRenamed("itemidJ", "itemid") 273 | // user_prefer_ds4.show() 274 | // 5 用户得分排序结果,去除用户已评分物品 275 | val user_prefer_ds5 = user_prefer_ds4.join(user_prefer_ds1, Seq("userid", "itemid"), "left").where("pref is null") 276 | // user_prefer_ds5.show() 277 | // 6 结果返回 278 | val out1 = user_prefer_ds5.select("userid", "itemid", "score").map { row => 279 | val userid = row.getString(0) 280 | val itemid = row.getString(1) 281 | val pref = row.getDouble(2) 282 | UserRecomm(userid, itemid, pref) 283 | } 284 | // out1.orderBy($"userid", $"pref".desc).show 285 | out1 286 | } 287 | 288 | } 289 | 290 | 291 | -------------------------------------------------------------------------------- /推荐系统算法实践—源码下载/第4章协同过滤/ml-latest-small/README.txt: -------------------------------------------------------------------------------- 1 | Summary 2 | ======= 3 | 4 | This dataset (ml-latest-small) describes 5-star rating and free-text tagging activity from [MovieLens](http://movielens.org), a movie recommendation service. It contains 100836 ratings and 3683 tag applications across 9742 movies. These data were created by 610 users between March 29, 1996 and September 24, 2018. This dataset was generated on September 26, 2018. 5 | 6 | Users were selected at random for inclusion. All selected users had rated at least 20 movies. No demographic information is included. Each user is represented by an id, and no other information is provided. 7 | 8 | The data are contained in the files `links.csv`, `movies.csv`, `ratings.csv` and `tags.csv`. More details about the contents and use of all these files follows. 9 | 10 | This is a *development* dataset. As such, it may change over time and is not an appropriate dataset for shared research results. See available *benchmark* datasets if that is your intent. 11 | 12 | This and other GroupLens data sets are publicly available for download at . 13 | 14 | 15 | Usage License 16 | ============= 17 | 18 | Neither the University of Minnesota nor any of the researchers involved can guarantee the correctness of the data, its suitability for any particular purpose, or the validity of results based on the use of the data set. The data set may be used for any research purposes under the following conditions: 19 | 20 | * The user may not state or imply any endorsement from the University of Minnesota or the GroupLens Research Group. 21 | * The user must acknowledge the use of the data set in publications resulting from the use of the data set (see below for citation information). 22 | * The user may redistribute the data set, including transformations, so long as it is distributed under these same license conditions. 
23 | * The user may not use this information for any commercial or revenue-bearing purposes without first obtaining permission from a faculty member of the GroupLens Research Project at the University of Minnesota. 24 | * The executable software scripts are provided "as is" without warranty of any kind, either expressed or implied, including, but not limited to, the implied warranties of merchantability and fitness for a particular purpose. The entire risk as to the quality and performance of them is with you. Should the program prove defective, you assume the cost of all necessary servicing, repair or correction. 25 | 26 | In no event shall the University of Minnesota, its affiliates or employees be liable to you for any damages arising out of the use or inability to use these programs (including but not limited to loss of data or data being rendered inaccurate). 27 | 28 | If you have any further questions or comments, please email 29 | 30 | 31 | Citation 32 | ======== 33 | 34 | To acknowledge use of the dataset in publications, please cite the following paper: 35 | 36 | > F. Maxwell Harper and Joseph A. Konstan. 2015. The MovieLens Datasets: History and Context. ACM Transactions on Interactive Intelligent Systems (TiiS) 5, 4: 19:1–19:19. 37 | 38 | 39 | Further Information About GroupLens 40 | =================================== 41 | 42 | GroupLens is a research group in the Department of Computer Science and Engineering at the University of Minnesota. Since its inception in 1992, GroupLens's research projects have explored a variety of fields including: 43 | 44 | * recommender systems 45 | * online communities 46 | * mobile and ubiquitious technologies 47 | * digital libraries 48 | * local geographic information systems 49 | 50 | GroupLens Research operates a movie recommender based on collaborative filtering, MovieLens, which is the source of these data. We encourage you to visit to try it out! If you have exciting ideas for experimental work to conduct on MovieLens, send us an email at - we are always interested in working with external collaborators. 51 | 52 | 53 | Content and Use of Files 54 | ======================== 55 | 56 | Formatting and Encoding 57 | ----------------------- 58 | 59 | The dataset files are written as [comma-separated values](http://en.wikipedia.org/wiki/Comma-separated_values) files with a single header row. Columns that contain commas (`,`) are escaped using double-quotes (`"`). These files are encoded as UTF-8. If accented characters in movie titles or tag values (e.g. Misérables, Les (1995)) display incorrectly, make sure that any program reading the data, such as a text editor, terminal, or script, is configured for UTF-8. 60 | 61 | 62 | User Ids 63 | -------- 64 | 65 | MovieLens users were selected at random for inclusion. Their ids have been anonymized. User ids are consistent between `ratings.csv` and `tags.csv` (i.e., the same id refers to the same user across the two files). 66 | 67 | 68 | Movie Ids 69 | --------- 70 | 71 | Only movies with at least one rating or tag are included in the dataset. These movie ids are consistent with those used on the MovieLens web site (e.g., id `1` corresponds to the URL ). Movie ids are consistent between `ratings.csv`, `tags.csv`, `movies.csv`, and `links.csv` (i.e., the same id refers to the same movie across these four data files). 72 | 73 | 74 | Ratings Data File Structure (ratings.csv) 75 | ----------------------------------------- 76 | 77 | All ratings are contained in the file `ratings.csv`. 
Each line of this file after the header row represents one rating of one movie by one user, and has the following format: 78 | 79 | userId,movieId,rating,timestamp 80 | 81 | The lines within this file are ordered first by userId, then, within user, by movieId. 82 | 83 | Ratings are made on a 5-star scale, with half-star increments (0.5 stars - 5.0 stars). 84 | 85 | Timestamps represent seconds since midnight Coordinated Universal Time (UTC) of January 1, 1970. 86 | 87 | 88 | Tags Data File Structure (tags.csv) 89 | ----------------------------------- 90 | 91 | All tags are contained in the file `tags.csv`. Each line of this file after the header row represents one tag applied to one movie by one user, and has the following format: 92 | 93 | userId,movieId,tag,timestamp 94 | 95 | The lines within this file are ordered first by userId, then, within user, by movieId. 96 | 97 | Tags are user-generated metadata about movies. Each tag is typically a single word or short phrase. The meaning, value, and purpose of a particular tag is determined by each user. 98 | 99 | Timestamps represent seconds since midnight Coordinated Universal Time (UTC) of January 1, 1970. 100 | 101 | 102 | Movies Data File Structure (movies.csv) 103 | --------------------------------------- 104 | 105 | Movie information is contained in the file `movies.csv`. Each line of this file after the header row represents one movie, and has the following format: 106 | 107 | movieId,title,genres 108 | 109 | Movie titles are entered manually or imported from , and include the year of release in parentheses. Errors and inconsistencies may exist in these titles. 110 | 111 | Genres are a pipe-separated list, and are selected from the following: 112 | 113 | * Action 114 | * Adventure 115 | * Animation 116 | * Children's 117 | * Comedy 118 | * Crime 119 | * Documentary 120 | * Drama 121 | * Fantasy 122 | * Film-Noir 123 | * Horror 124 | * Musical 125 | * Mystery 126 | * Romance 127 | * Sci-Fi 128 | * Thriller 129 | * War 130 | * Western 131 | * (no genres listed) 132 | 133 | 134 | Links Data File Structure (links.csv) 135 | --------------------------------------- 136 | 137 | Identifiers that can be used to link to other sources of movie data are contained in the file `links.csv`. Each line of this file after the header row represents one movie, and has the following format: 138 | 139 | movieId,imdbId,tmdbId 140 | 141 | movieId is an identifier for movies used by . E.g., the movie Toy Story has the link . 142 | 143 | imdbId is an identifier for movies used by . E.g., the movie Toy Story has the link . 144 | 145 | tmdbId is an identifier for movies used by . E.g., the movie Toy Story has the link . 146 | 147 | Use of the resources listed above is subject to the terms of each provider. 148 | 149 | 150 | Cross-Validation 151 | ---------------- 152 | 153 | Prior versions of the MovieLens dataset included either pre-computed cross-folds or scripts to perform this computation. We no longer bundle either of these features with the dataset, since most modern toolkits provide this as a built-in feature. If you wish to learn about standard approaches to cross-fold computation in the context of recommender systems evaluation, see [LensKit](http://lenskit.org) for tools, documentation, and open-source code examples. 
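A minimal sketch of loading the two core files with pandas, following the column layouts documented above. The paths assume the `ml-latest-small/` directory is the working directory; pandas is used here only for illustration and is not part of the dataset itself.

import pandas as pd

# ratings.csv: userId,movieId,rating,timestamp (ratings on a 0.5-5.0 star scale)
ratings = pd.read_csv('ml-latest-small/ratings.csv')

# movies.csv: movieId,title,genres, where genres is a pipe-separated list
movies = pd.read_csv('ml-latest-small/movies.csv')
movies['genres'] = movies['genres'].str.split('|')

# join ratings with movie titles and check the "at least 20 ratings per user" property
rated = ratings.merge(movies[['movieId', 'title']], on='movieId', how='left')
print(rated.head())
print(ratings.groupby('userId')['rating'].count().min())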
154 | -------------------------------------------------------------------------------- /推荐系统算法实践—源码下载/第4章协同过滤/ml-latest-small/movies.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WallaceLiu/recommendation_algorithm/36cdd077446ab7a2831c168f7419b058bd2fcbb0/推荐系统算法实践—源码下载/第4章协同过滤/ml-latest-small/movies.csv -------------------------------------------------------------------------------- /推荐系统算法实践—源码下载/第5章Word2vec/Word2vec.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # ## 1)环境准备 5 | 6 | # In[1]: 7 | 8 | import tensorflow as tf 9 | import numpy as np 10 | import pandas as pd 11 | import random 12 | import math 13 | import re 14 | from os import path, listdir 15 | import os 16 | os.environ["CUDA_VISIBLE_DEVICES"]="0" 17 | 18 | print tf.__version__ 19 | print tf.__path__ 20 | 21 | 22 | # ## 2)数据准备Dataset格式 23 | 24 | # In[2]: 25 | 26 | # 每一行解析,解析标签csv格式 27 | # 5805 17357 28 | # 数据处理 29 | def process_data(my_path, batch_size=32, num_epochs=1): 30 | filenames = get_file_list(my_path) 31 | next_element = read_my_file_format(filenames, batch_size, num_epochs) 32 | return next_element 33 | # 创建session,指定GPU或者CPU使用率 34 | def get_session(gpu_fraction=0.1): 35 | gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=gpu_fraction, 36 | allow_growth=True) 37 | return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) 38 | 39 | 40 | # ## 3)Skip-gram模型 41 | 42 | # In[3]: 43 | 44 | class SkipGram(object): 45 | """ 初始化成员变量 """ 46 | def __init__(self, vocab_size, embed_size, num_sampled, train_optimizer, learning_rate): 47 | # 字典长度 48 | self.vocab_size = vocab_size 49 | # 词向量长度 50 | self.embed_size = embed_size 51 | # 负采样数量 52 | self.num_sampled = num_sampled 53 | # 优化方法 54 | self.train_optimizer = train_optimizer 55 | # 学习率 56 | self.learning_rate = learning_rate 57 | # aglobal_step 58 | self.global_step = tf.Variable(0, dtype=tf.int32, trainable=False, name='global_step') 59 | 60 | def train(self, batch_data): 61 | """ 1 定义输入数据 """ 62 | with tf.name_scope('input_data'): 63 | # center_words 64 | center_words = tf.reshape(batch_data['center_words'], shape=[-1]) 65 | # target_words 66 | target_words = tf.reshape(batch_data['target_words'], shape=[-1,1]) 67 | print("%s: %s" % ("center_words", center_words)) 68 | print("%s: %s" % ("target_words", target_words)) 69 | 70 | """ 2 定义网络输出 """ 71 | with tf.name_scope("Comput_Score"): 72 | # 词向量矩阵 73 | with tf.variable_scope("embed", reuse=tf.AUTO_REUSE): 74 | self.embedding_dict = tf.get_variable(name='embed', shape=[self.vocab_size, self.embed_size], initializer=tf.glorot_uniform_initializer()) 75 | print("%s: %s" % ("embedding_dict", self.embedding_dict)) 76 | 77 | # 模型内部参数矩阵 78 | with tf.variable_scope("nce", reuse=tf.AUTO_REUSE): 79 | self.nce_weight = tf.get_variable(name='nce_weight', shape=[self.vocab_size, self.embed_size], initializer=tf.glorot_normal_initializer()) 80 | self.nce_biases = tf.get_variable(name='nce_biases', shape=[1], initializer=tf.constant_initializer(0.0)) 81 | print("%s: %s" % ("nce_weight", self.nce_weight)) 82 | print("%s: %s" % ("nce_biases", self.nce_biases)) 83 | 84 | # 将输入序列向量化 85 | # 其实就是一个简单的查表 86 | embed = tf.nn.embedding_lookup(self.embedding_dict, center_words, name='embed') 87 | print("%s: %s" % ("embed", embed)) 88 | 89 | # 得到NCE损失(负采样得到的损失) 90 | loss = tf.reduce_mean( 91 | tf.nn.nce_loss( 92 | weights = self.nce_weight, # 权重 93 | biases = self.nce_biases, # 偏差 94 | labels = 
target_words, # 输入的标签 95 | inputs = embed, # 输入向量 96 | num_sampled = self.num_sampled, # 负采样的个数 97 | num_classes = self.vocab_size # 字典数目 98 | ) 99 | ) 100 | print("%s: %s" % ("loss", loss)) 101 | 102 | """ 3 设定optimizer """ 103 | with tf.name_scope("optimizer"): 104 | with tf.variable_scope("optimizer", reuse=tf.AUTO_REUSE): 105 | #------bulid optimizer------ 106 | if train_optimizer == 'Adam': 107 | optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-8) 108 | elif train_optimizer == 'Adagrad': 109 | optimizer = tf.train.AdagradOptimizer(learning_rate=learning_rate, initial_accumulator_value=1e-8) 110 | train_step = optimizer.minimize(loss, global_step=self.global_step) 111 | 112 | """4 设定summary,以便在Tensorboard里进行可视化 """ 113 | with tf.name_scope("summaries"): 114 | tf.summary.scalar("loss", loss) 115 | tf.summary.histogram("embedding_dict", self.embedding_dict) 116 | # 好几个summary,所以这里要merge_all 117 | summary_op = tf.summary.merge_all() 118 | 119 | """5 返回结果 """ 120 | return train_step, loss, summary_op 121 | 122 | 123 | # ## 4)模型训练测试 124 | 125 | # In[4]: 126 | 127 | # 测试数据 128 | filenames = "/data/windows_skip_sample.csv" 129 | batch_size = 100000 130 | num_epochs = 200 131 | next_element = process_data(filenames, batch_size, num_epochs) 132 | 133 | # 模型参数 134 | vocab_size = 6834 135 | embed_size = 30 136 | num_sampled = 50 137 | train_optimizer = 'Adam' 138 | learning_rate = 0.01 139 | log_path='/data/log/20180915' 140 | 141 | # 开始训练 142 | bea_model = SkipGram(vocab_size, embed_size, num_sampled, train_optimizer, learning_rate) 143 | train_step, loss, summary_op = bea_model.train(next_element) 144 | 145 | init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer(), tf.tables_initializer()) 146 | gpu_fraction = 0.4 147 | my_device='/gpu:0' 148 | with tf.device(my_device): 149 | sess = get_session(gpu_fraction) 150 | sess.run(init_op) 151 | batch_cnt = 0 152 | #选定可视化存储目录 153 | writer = tf.summary.FileWriter(log_path, sess.graph) 154 | try: 155 | while True: 156 | batch_cnt = batch_cnt + 1 157 | a, b, summary = sess.run([train_step, loss, summary_op]) 158 | if batch_cnt % 1000 == 0 or batch_cnt <= 10: 159 | print("batch: {} loss: {:.4f}".format(batch_cnt, b)) 160 | writer.add_summary(summary, batch_cnt) 161 | except tf.errors.OutOfRangeError: 162 | print("Train end of dataset") 163 | 164 | -------------------------------------------------------------------------------- /推荐系统算法实践—源码下载/第5章Word2vec/Word2vec.scala: -------------------------------------------------------------------------------- 1 | package book_code 2 | 3 | import org.apache.spark.sql.{ SparkSession, _ } 4 | import org.apache.spark.sql._ 5 | import org.apache.spark.sql.functions._ 6 | import org.apache.spark.mllib.feature.Word2Vec 7 | import java.util.Date 8 | import java.text.SimpleDateFormat 9 | 10 | object Word2vec { 11 | 12 | /** 13 | * word2vec实现: 14 | * 15 | * 1)读取训练样本 16 | * 2)w2v模型训练 17 | * 3)提取词向量,并且计算相似词 18 | * 19 | * @author sunbow 20 | */ 21 | 22 | def main(args: Array[String]): Unit = { 23 | 24 | /** 25 | * ############################################################# 26 | * 27 | * Step 1:初始化 28 | * 29 | * ############################################################## 30 | */ 31 | 32 | val spark = SparkSession 33 | .builder 34 | .appName("Word2vec") 35 | .config("spark.hadoop.validateOutputSpecs", "false") 36 | .enableHiveSupport() 37 | .getOrCreate() 38 | 39 | import spark.implicits._ 40 | val data_path = args(0) 41 | val conf_path = args(1) 
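    // Descriptive note on the command-line arguments read in this block
    // (inferred from how each value is used further down in this file):
    //   args(0) data_path         - training sequences, one space-separated item-id list per line
    //   args(1) conf_path         - item config table, "|"-delimited CSV of (id, title)
    //   args(2) defaultFS         - HDFS prefix used when saving the model and the word-vector results
    //   args(3) NumIterations     - Word2Vec training iterations
    //   args(4) MaxSentenceLength - maximum sentence length
    //   args(5) MinCount          - minimum token frequency kept in the vocabulary
    //   args(6) VectorSize        - embedding dimension
    //   args(7) WindowSize        - skip-gram context window size
    //   args(8) simil_size        - number of synonyms returned by findSynonyms for each word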
42 | val defaultFS = args(2) 43 | val NumIterations = args(3).toInt 44 | val MaxSentenceLength = args(4).toInt 45 | val MinCount = args(5).toInt 46 | val VectorSize = args(6).toInt 47 | val WindowSize = args(7).toInt 48 | val simil_size = args(8).toInt 49 | 50 | /** 51 | * ############################################################# 52 | * 53 | * Step 2:数据准备 54 | * 55 | * ############################################################## 56 | */ 57 | // 2.1读取item配置表 58 | val id_conf_df = spark.read.options(Map(("delimiter", "|"), ("header", "false"))).csv(conf_path) 59 | val id2title_map = id_conf_df.collect().map(row => (row(0).toString(), row(1).toString())).toMap 60 | 61 | // 2.2读取样本数据 62 | val sequence_sample = spark.read.text(data_path).map { 63 | case Row(id_list: String) => 64 | val seq = id_list.split(" ").toSeq 65 | seq 66 | } 67 | sequence_sample.repartition(500).cache() 68 | sequence_sample.count() 69 | println("sequence_sample.show()") 70 | sequence_sample.show() 71 | 72 | /** 73 | * ############################################################# 74 | * 75 | * Step 3:Word2Vec 76 | * 77 | * ############################################################## 78 | */ 79 | // 训练模型 80 | val word2Vec = new Word2Vec(). 81 | setNumIterations(NumIterations). 82 | setMaxSentenceLength(MaxSentenceLength). 83 | setMinCount(MinCount). 84 | setVectorSize(VectorSize). 85 | setWindowSize(WindowSize) 86 | val model = word2Vec.fit(sequence_sample.rdd) 87 | 88 | // 模型保存 89 | val now = new Date() 90 | val dateFormat1 = new SimpleDateFormat("yyyyMMddHHmmss") 91 | val time_stamp = dateFormat1.format(now) 92 | val model_path = s"${defaultFS}/Word2vec/model/${time_stamp}" 93 | println(model_path) 94 | model.save(spark.sparkContext, model_path) 95 | 96 | /** 97 | * ############################################################# 98 | * 99 | * Step 4:词向量结果保存 100 | * 101 | * ############################################################## 102 | */ 103 | val modelBC = spark.sparkContext.broadcast(model) 104 | val id2title_map_BC = spark.sparkContext.broadcast(id2title_map) 105 | // 词,向量,相似词 106 | val word2vector_rdd = spark.sparkContext.parallelize(model.getVectors.toSeq).map { 107 | case (word: String, vec: Array[Float]) => 108 | // 根据word查找相似word 109 | val simil_word = modelBC.value.findSynonyms(word, simil_size) 110 | val simil_word_str = simil_word.map(f => s"${f._1}:${f._2.formatted("%.4f")}").mkString(",") 111 | val title = id2title_map_BC.value.getOrElse(word, "") 112 | val simil_title = simil_word.map(f => id2title_map_BC.value.getOrElse(f._1, "")).mkString(",") 113 | // 向量 114 | val vec_str = vec.mkString(",") 115 | (word, vec_str, simil_word, title, simil_title) 116 | } 117 | 118 | println("word2vector_rdd.toDF().show(30)") 119 | word2vector_rdd.toDF().withColumnRenamed("_4", "word").withColumnRenamed("_5", "simil_word").select("word", "simil_word").show(20) 120 | 121 | // 结果保存 122 | val save_path = s"${defaultFS}/Word2vec/model_result/${time_stamp}" 123 | word2vector_rdd.map(f => s"${f._1}|${f._2}|${f._3}|${f._4}|${f._5}").saveAsTextFile(save_path) 124 | 125 | } 126 | 127 | } -------------------------------------------------------------------------------- /推荐系统算法实践—源码下载/第6章逻辑回归/LR.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # ## 1)环境准备 5 | 6 | # In[1]: 7 | 8 | import numpy as np 9 | import tensorflow as tf 10 | import pandas as pd 11 | import random 12 | import math 13 | import re 14 | 15 | from sklearn import preprocessing 16 | from os 
import path, listdir 17 | from sklearn.datasets import load_svmlight_files 18 | from sklearn.model_selection import train_test_split 19 | from sklearn import metrics 20 | from tensorflow.contrib import layers 21 | 22 | from sklearn import metrics 23 | 24 | import time 25 | import datetime 26 | 27 | import os 28 | os.environ["CUDA_VISIBLE_DEVICES"]="0" 29 | 30 | print tf.__version__ 31 | print tf.__path__ 32 | 33 | 34 | # ## 2)数据准备Dataset格式 35 | 36 | # In[6]: 37 | 38 | """ 39 | 解析CSV格式,对输入的每一行样本,进行格式解析,返回labels和dense_vector格式数据 40 | 例如输入csv格式字符串: 0.0,0.6666666666666666,0.5,0.0,0.0,0.0,0.0,0.7272727272727273,0.42857142857142855 41 | """ 42 | # 数据处理 43 | def process_data(data_type, my_path, feature_size, batch_size=32, num_epochs=1): 44 | filenames = get_file_list(my_path) 45 | next_element = read_my_file_format(data_type, filenames, feature_size, batch_size, num_epochs) 46 | return next_element 47 | 48 | # 创建session,指定GPU或者CPU使用率 49 | def get_session(gpu_fraction=0.1): 50 | gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=gpu_fraction, 51 | allow_growth=True) 52 | return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) 53 | 54 | 55 | # In[8]: 56 | 57 | # 测试数据 58 | filenames = '/data/all-csv' 59 | feature_size = 530 60 | batch_size = 3 61 | num_epochs = 1 62 | data_type = 'csv' 63 | next_element = process_data(data_type, filenames, feature_size, batch_size, num_epochs) 64 | print next_element['dense_vector'] 65 | print next_element['labels'] 66 | 67 | gpu_fraction = 0.2 68 | my_device='/gpu:0' 69 | init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer(), tf.tables_initializer()) 70 | with tf.device(my_device): 71 | sess = get_session(gpu_fraction) 72 | sess.run(init_op) 73 | dense_vector, labels = sess.run([next_element['dense_vector'],next_element['labels']]) 74 | print dense_vector 75 | print labels 76 | 77 | 78 | # ## 3)LR模型 79 | 80 | # In[7]: 81 | 82 | class LR(object): 83 | """ 初始化成员变量 """ 84 | def __init__(self, feature_size, loss_fuc, train_optimizer, learning_rate, reg_type, reg_param): 85 | # 特征向量长度 86 | self.feature_size = feature_size 87 | # 损失函数 88 | self.loss_fuc = loss_fuc 89 | # 优化方法 90 | self.train_optimizer = train_optimizer 91 | # 学习率 92 | self.learning_rate = learning_rate 93 | # 正则类型 94 | self.reg_type = reg_type 95 | # 正则因子 96 | self.reg_param = reg_param 97 | # aglobal_step 98 | self.global_step = tf.Variable(0, dtype=tf.int32, trainable=False, name='global_step') 99 | 100 | def train(self, batch_data): 101 | """ 1 定义输入数据 """ 102 | with tf.name_scope('input_data'): 103 | # 标签:[batch_size, 1] 104 | labels = batch_data['labels'] 105 | # 用户特征向量:[batch_size, feature_size] 106 | dense_vector = tf.reshape(batch_data['dense_vector'], shape=[-1, feature_size, 1]) # None * feature_size * 1 107 | print("%s: %s" % ("dense_vector", dense_vector)) 108 | print("%s: %s" % ("labels", labels)) 109 | 110 | """ 2 定义网络输出 """ 111 | with tf.name_scope("LR_Comput_Score"): 112 | # LR参数,生成或者获取w b 113 | with tf.variable_scope("lr_layer", reuse=tf.AUTO_REUSE): 114 | self.w = tf.get_variable(name='w', shape=[self.feature_size, 1], initializer=tf.glorot_normal_initializer()) 115 | self.b = tf.get_variable(name='bias', shape=[1], initializer=tf.constant_initializer(0.0)) 116 | print("%s: %s" % ("w", self.w)) 117 | print("%s: %s" % ("b", self.b)) 118 | 119 | # ---------- w * x + b---------- 120 | Y_first = tf.reduce_sum(tf.multiply(self.w, dense_vector), 2) # None * F 121 | print("%s: %s" % ("Y_first", Y_first)) 122 | # ---------- sum(w * x) + 
b---------- 123 | Y_Out = tf.reduce_sum(Y_first, 1) 124 | Y_bias = self.b * tf.ones_like(Y_Out, dtype=tf.float32) # None * 1 125 | print("%s: %s" % ("Y_bias", Y_bias)) 126 | Y_Out = tf.add(Y_Out, Y_bias, name='Y_Out') 127 | print("%s: %s" % ("Y_Out", Y_Out)) 128 | # ---------- score ---------- 129 | score=tf.nn.sigmoid(Y_Out,name='score') 130 | score=tf.reshape(score, shape=[-1, 1]) 131 | print("%s: %s" % ("score", score)) 132 | 133 | """ 3 定义损失函数和AUC指标 """ 134 | with tf.name_scope("loss"): 135 | # loss:Squared_error,Cross_entropy ,FTLR 136 | if reg_type == 'l1_reg': 137 | regularization = self.reg_param * tf.reduce_sum(tf.abs(self.w)) 138 | # tf.contrib.layers.l1_regularizer(self.reg_param)(self.w) 139 | elif reg_type == 'l2_reg': 140 | regularization = self.reg_param * tf.nn.l2_loss(self.w) 141 | else: 142 | regularization = self.reg_param * tf.nn.l2_loss(self.w) 143 | 144 | if loss_fuc == 'Squared_error': 145 | loss = tf.reduce_mean(tf.reduce_sum(tf.square(labels - score), reduction_indices=[1])) + regularization 146 | elif loss_fuc == 'Cross_entropy': 147 | loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=tf.reshape(Y_Out, [-1]), labels=tf.reshape(labels, [-1]))) + regularization 148 | elif loss_fuc == 'FTLR': 149 | loss = tf.reduce_mean(tf.reduce_sum(tf.square(labels - score), reduction_indices=[1])) + regularization 150 | # AUC 151 | auc = tf.metrics.auc(labels, score) 152 | print("%s: %s" % ("labels", labels)) 153 | # w为0的比例,w的平均值 154 | w_zero_ratio = tf.reduce_mean(tf.to_float(tf.abs(self.w) <= 1.0e-5)) 155 | w_avg = tf.reduce_mean(self.w) 156 | 157 | """ 4 设定optimizer """ 158 | with tf.name_scope("optimizer"): 159 | with tf.variable_scope("optimizer", reuse=tf.AUTO_REUSE): 160 | #------bulid optimizer------ 161 | if train_optimizer == 'Adam': 162 | optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-8) 163 | elif train_optimizer == 'Adagrad': 164 | optimizer = tf.train.AdagradOptimizer(learning_rate=learning_rate, initial_accumulator_value=1e-8) 165 | elif train_optimizer == 'Momentum': 166 | optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum=0.95) 167 | elif train_optimizer == 'ftrl': 168 | optimizer = tf.train.FtrlOptimizer(learning_rate) 169 | train_step = optimizer.minimize(loss, global_step=self.global_step) 170 | 171 | """5 设定summary,以便在Tensorboard里进行可视化 """ 172 | with tf.name_scope("summaries"): 173 | tf.summary.scalar("loss", loss) 174 | tf.summary.scalar("accumulate_auc", auc[0]) 175 | tf.summary.scalar("w_avg", w_avg) 176 | tf.summary.scalar("w_zero_ratio", w_zero_ratio) 177 | tf.summary.histogram("w", self.w) 178 | # 好几个summary,所以这里要merge_all 179 | summary_op = tf.summary.merge_all() 180 | 181 | """6 返回结果 """ 182 | return Y_Out, score, regularization, loss, auc, train_step, w_zero_ratio, w_avg, labels, score, summary_op 183 | 184 | 185 | # ## 4)模型训练测试 186 | 187 | # In[9]: 188 | 189 | # 数据准备 190 | filenames = '/data/csv-all' 191 | data_type='csv' 192 | feature_size = 530 193 | batch_size = 60000 194 | num_epochs = 200 195 | next_element = process_data(data_type, filenames, feature_size, batch_size, num_epochs) 196 | 197 | # 模型参数 198 | loss_fuc = 'Squared_error' 199 | train_optimizer = 'Adam' 200 | learning_rate = 0.01 201 | reg_type = 'l2_reg' 202 | reg_param = 0.0 203 | log_path='/data/log/Squared_error_lr_L2_0_20180816_01' 204 | 205 | # 开始训练 206 | bea_model = LR(feature_size, loss_fuc, train_optimizer, learning_rate, reg_type, reg_param) 207 | Y_Out, score, regularization, 
loss, auc, train_step, w_zero_ratio, w_avg, labels, score, summary_op = bea_model.train(next_element) 208 | 209 | init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer(), tf.tables_initializer()) 210 | gpu_fraction = 0.4 211 | my_device='/gpu:1' 212 | with tf.device(my_device): 213 | sess = get_session(gpu_fraction) 214 | sess.run(init_op) 215 | batch_cnt = 0 216 | #选定可视化存储目录 217 | writer = tf.summary.FileWriter(log_path, sess.graph) 218 | try: 219 | while True: 220 | batch_cnt = batch_cnt + 1 221 | a, b, c, d, e, summary = sess.run([loss, auc, w_zero_ratio, w_avg, train_step, summary_op]) 222 | if batch_cnt % 50 == 0 or batch_cnt <= 10: 223 | y, p = sess.run([labels, score]) 224 | if y.sum() > 0.0: 225 | batch_auc=metrics.roc_auc_score(y, p) 226 | else: 227 | batch_auc=0.0 228 | print("batch: {} loss: {:.4f} accumulate_auc: {:.4f} batch_auc: {:.4f} w_zero_ratio: {:.4f} w_avg: {:.4f}".format(batch_cnt, a, b[0], batch_auc, c, d)) 229 | writer.add_summary(summary, batch_cnt) 230 | except tf.errors.OutOfRangeError: 231 | print("3、Train end of dataset") 232 | -------------------------------------------------------------------------------- /推荐系统算法实践—源码下载/第6章逻辑回归/LogisticRegression.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # ## 1)环境设定 5 | 6 | # In[25]: 7 | 8 | from sklearn.linear_model import LogisticRegression 9 | from sklearn import metrics 10 | from os import path, listdir 11 | from sklearn.datasets import load_svmlight_files 12 | from sklearn.model_selection import train_test_split 13 | from sklearn.externals import joblib 14 | from sklearn import preprocessing 15 | import numpy as np 16 | import pandas as pd 17 | import random 18 | import platform 19 | print("Python Version: %s"%(platform.python_version())) 20 | 21 | 22 | # ## 2)数据准备 23 | 24 | # In[26]: 25 | 26 | """ 27 | 处理libSVM数据方法,生成样本,支持Batch格式返回,也支持X/Y返回 28 | """ 29 | def process_data(data_path, feature_size, test_rat=0.3, random_seed=0, train_batch_size=5000, test_batch_size=5000): 30 | # 读取文件 31 | # batch生成 32 | return {"train_batch": train_batch, "test_batch": test_batch, "X_train": X_train, "X_test": X_test, "Y_train": Y_train, "Y_test": Y_test} 33 | 34 | # In[27]: 35 | 36 | # 数据测试 37 | data_path = '/data/data01/' 38 | test_rat=0.4 39 | random_seed=0 40 | train_batch_size=20000 41 | test_batch_size=20000 42 | feature_size=530 43 | 44 | # 获取样本数据 45 | data = process_data(data_path, feature_size, test_rat, random_seed, train_batch_size, test_batch_size) 46 | 47 | train_batch = data['train_batch'] 48 | test_batch = data['test_batch'] 49 | X_train = data['X_train'] 50 | Y_train = data['Y_train'] 51 | X_test = data['X_test'] 52 | Y_test = data['Y_test'] 53 | 54 | # 查看样本数据大小 55 | print("X_train.shape: ") 56 | print(X_train.shape) 57 | print("Y_train.shape: ") 58 | print(Y_train.shape) 59 | print("X_test.shape: ") 60 | print(X_test.shape) 61 | print("Y_test.shape: ") 62 | print(Y_test.shape) 63 | 64 | 65 | # ## 3)LR模型 66 | 67 | # In[30]: 68 | 69 | # 3.1 建立逻辑回归模型,并且设定参数 70 | lr_model= LogisticRegression(penalty='l2', C=1000, solver='lbfgs', max_iter=500) 71 | 72 | # 3.2 训练逻辑回归模型 73 | lr_model.fit(X_train,Y_train.values.ravel()) 74 | 75 | 76 | # In[31]: 77 | 78 | # 3.3 采用测试集验证模型离线指标 79 | # 训练集AUC 80 | probs_train= lr_model.predict_proba(X_train) 81 | AUC1 = metrics.roc_auc_score(Y_train, probs_train[:,1]) 82 | print("Train Auc: %s"%(AUC1)) 83 | 84 | # 测试集AUC 85 | probs_test= lr_model.predict_proba(X_test) 86 | predict_test = 
lr_model.predict(X_test) 87 | AUC2 = metrics.roc_auc_score(Y_test, probs_test[:,1]) 88 | print("Test Auc: %s"%(AUC2)) 89 | 90 | # 准确率 91 | accuracy = metrics.accuracy_score(Y_test, predict_test) 92 | print("Test Accuracy: %s"%(accuracy)) 93 | 94 | # 召回率 95 | recall = metrics.recall_score(Y_test, predict_test) 96 | print("Test Recall: %s"%(recall)) 97 | 98 | # F1值 99 | f1 = metrics.f1_score(Y_test, predict_test) 100 | print("Test F1: %s"%(f1)) 101 | 102 | 103 | # In[42]: 104 | 105 | # 3.4 打印模型参数 106 | w=lr_model.coef_ 107 | print("参数大小:") 108 | print(w.shape) 109 | print("参数前10个:") 110 | print(lr_model.coef_[:,0:10]) 111 | print("截距:") 112 | print(lr_model.intercept_) 113 | print("稀疏化特征比率:%.2f%%" %(np.mean(lr_model.coef_.ravel()==0)*100)) 114 | print("sigmoid函数转化的值,即:概率p") 115 | print(lr_model.predict_proba(X_test[0:5])) 116 | 117 | 118 | # In[43]: 119 | 120 | # 3.5 模型保存 121 | joblib.dump(lr_model,"logistic_lr.model") 122 | #模型加载 123 | load_lr = joblib.load("logistic_lr.model") 124 | print(load_lr.predict_proba(X_test[0:5])) 125 | 126 | 127 | # In[ ]: 128 | 129 | # 3.1 建立逻辑回归模型,并且设定参数 130 | lr_model= LogisticRegression(penalty='l2', C=1000, solver='lbfgs', max_iter=500) 131 | 132 | # 3.2 训练逻辑回归模型 133 | lr_model.fit(X_train,Y_train) 134 | 135 | # 3.3 采用测试集验证模型离线指标 136 | # 训练集AUC 137 | probs_train= lr_model.predict_proba(X_train) 138 | AUC1 = metrics.roc_auc_score(Y_train, probs_train[:,1]) 139 | print("Train Auc: %s"%(AUC1)) 140 | 141 | # 测试集AUC 142 | probs_test= lr_model.predict_proba(X_test) 143 | predict_test = lr_model.predict(X_test) 144 | AUC2 = metrics.roc_auc_score(Y_test, probs_test[:,1]) 145 | print("Test Auc: %s"%(AUC2)) 146 | 147 | # 准确率 148 | accuracy = metrics.accuracy_score(Y_test, predict_test) 149 | print("Test Accuracy: %s"%(accuracy)) 150 | 151 | # 召回率 152 | recall = metrics.recall_score(Y_test, predict_test) 153 | print("Test Recall: %s"%(recall)) 154 | 155 | # F1值 156 | f1 = metrics.f1_score(Y_test, predict_test) 157 | print("Test F1: %s"%(f1)) 158 | 159 | # 3.4 打印模型参数 160 | print("参数:",lr_model.coef_) 161 | print("截距:",lr_model.intercept_) 162 | print("稀疏化特征比率:%.2f%%" %(np.mean(lr_model.coef_.ravel()==0)*100)) 163 | print("=========sigmoid函数转化的值,即:概率p=========") 164 | print(lr_model.predict_proba(X_test[0:5])) #sigmoid函数转化的值,即:概率p 165 | 166 | # 3.5 模型保存 167 | joblib.dump(lr_model,"logistic_lr.model") 168 | #模型加载 169 | load_lr = joblib.load("logistic_lr.model") 170 | print(load_lr.predict_proba(X_test[0:5])) 171 | 172 | # In[30]: 173 | 174 | # 3.1 建立逻辑回归模型,并且设定参数 175 | lr_model= LogisticRegression(penalty='l2', C=1000, solver='lbfgs', max_iter=500) 176 | 177 | # 3.2 训练逻辑回归模型 178 | lr_model.fit(X_train,Y_train) 179 | 180 | 181 | # In[46]: 182 | 183 | # 3.3 采用测试集验证模型离线指标 184 | # 训练集AUC 185 | probs_train= lr_model.predict_proba(X_train) 186 | AUC1 = metrics.roc_auc_score(Y_train, probs_train[:,1]) 187 | print("Train Auc: %s"%(AUC1)) 188 | 189 | # 测试集AUC 190 | probs_test= lr_model.predict_proba(X_test) 191 | predict_test = lr_model.predict(X_test) 192 | AUC2 = metrics.roc_auc_score(Y_test, probs_test[:,1]) 193 | print("Test Auc: %s"%(AUC2)) 194 | 195 | # 准确率 196 | accuracy = metrics.accuracy_score(Y_test, predict_test) 197 | print("Test Accuracy: %s"%(accuracy)) 198 | 199 | # 召回率 200 | recall = metrics.recall_score(Y_test, predict_test) 201 | print("Test Recall: %s"%(recall)) 202 | 203 | # F1值 204 | f1 = metrics.f1_score(Y_test, predict_test) 205 | print("Test F1: %s"%(f1)) 206 | 207 | 208 | # In[49]: 209 | 210 | # 3.4 打印模型参数 211 | print("参数:",lr_model.coef_) 212 | 
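# Note: for this binary problem lr_model.coef_ has shape (1, n_features) and
# lr_model.intercept_ has shape (1,); together they define the score
# sigmoid(w·x + b) that predict_proba reports in its second column.
# The "稀疏化特征比率" printed below counts coefficients that are exactly zero,
# which stays near 0% under the L2 penalty used here (an L1 penalty would raise it).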
print("截距:",lr_model.intercept_) 213 | print("稀疏化特征比率:%.2f%%" %(np.mean(lr_model.coef_.ravel()==0)*100)) 214 | print("=========sigmoid函数转化的值,即:概率p=========") 215 | print(lr_model.predict_proba(X_test[0:5])) #sigmoid函数转化的值,即:概率p 216 | 217 | 218 | # In[53]: 219 | 220 | # 3.5 模型保存 221 | joblib.dump(lr_model,"logistic_lr.model") 222 | #模型加载 223 | load_lr = joblib.load("logistic_lr.model") 224 | print(load_lr.predict_proba(X_test[0:5])) 225 | -------------------------------------------------------------------------------- /推荐系统算法实践—源码下载/第6章逻辑回归/LogisticRegression.scala: -------------------------------------------------------------------------------- 1 | package book_code 2 | 3 | import org.apache.spark.ml.classification.{ BinaryLogisticRegressionSummary, LogisticRegression, LogisticRegressionModel } 4 | import org.apache.spark.ml.evaluation.{ MulticlassClassificationEvaluator, BinaryClassificationEvaluator } 5 | import org.apache.spark.ml.linalg.{ Vector, Vectors } 6 | import org.apache.spark.sql.types._ 7 | import org.apache.spark.sql.functions._ 8 | import org.apache.spark.sql._ 9 | import org.apache.spark.sql.SparkSession 10 | import org.apache.spark.ml.feature._ 11 | import java.util.Date 12 | import java.text.SimpleDateFormat 13 | 14 | object LogisticRegression { 15 | 16 | def main(args: Array[String]): Unit = { 17 | 18 | val spark = SparkSession. 19 | builder(). 20 | appName("LogisticRegression"). 21 | enableHiveSupport(). 22 | getOrCreate() 23 | 24 | import spark.implicits._ 25 | 26 | //1 参数准备 27 | val dataPath = "hdfs://1.1.1.1:9000/user/data01/" 28 | val iter = 500 29 | val reg_param = 0.0 30 | val elastic_net_param = 0.0 31 | 32 | //2 训练样本准备 33 | val (training, test) = readLibSvmSampleData(spark, dataPath) 34 | training.cache() 35 | test.cache() 36 | println(s"training.count(): ${training.count()}") 37 | println(s"test.count(): ${test.count()}") 38 | println("training.show") 39 | training.show 40 | 41 | //3 建立逻辑回归模型 42 | val lr = new LogisticRegression(). 43 | setMaxIter(iter). 44 | setRegParam(reg_param). 45 | setElasticNetParam(elastic_net_param) 46 | 47 | //4 根据训练样本进行模型训练 48 | val lrModel = lr.fit(training) 49 | 50 | //5 打印模型信息 51 | println(s"Coefficients Top 10: ${lrModel.coefficients.toArray.slice(0, 10).mkString(" ")}") 52 | println(s"Intercept: ${lrModel.intercept}") 53 | 54 | //6 建立多元回归模型 55 | val mlr = new LogisticRegression(). 56 | setMaxIter(500). 57 | setRegParam(0.0). 58 | setElasticNetParam(0.0). 
59 | setFamily("multinomial") 60 | 61 | //7 根据训练样本进行模型训练 62 | val mlrModel = mlr.fit(training) 63 | 64 | //8 打印模型信息 65 | println(s"Multinomial coefficients: ${mlrModel.coefficientMatrix}") 66 | println(s"Multinomial intercepts: ${mlrModel.interceptVector}") 67 | 68 | //9 对模型进行测试 69 | val test_predict = lrModel.transform(test) 70 | test_predict.show 71 | test_predict.select("features", "label", "probability", "prediction").take(5).foreach { 72 | case Row(features: Vector, label: Double, prob: Vector, prediction: Double) => 73 | println(s"($features, $label) -> prob=$prob, prediction=$prediction") 74 | } 75 | 76 | //10 模型摘要 77 | val trainingSummary = lrModel.summary 78 | 79 | //11 每次迭代目标值 80 | val objectiveHistory = trainingSummary.objectiveHistory 81 | println("objectiveHistory:") 82 | objectiveHistory.foreach(loss => println(loss)) 83 | 84 | //12 计算模型指标数据 85 | val binarySummary = trainingSummary.asInstanceOf[BinaryLogisticRegressionSummary] 86 | 87 | //13 模型摘要AUC指标 88 | val roc = binarySummary.roc 89 | println("roc.show()") 90 | roc.show() 91 | val AUC = binarySummary.areaUnderROC 92 | println(s"areaUnderROC: ${binarySummary.areaUnderROC}") 93 | 94 | //14 测试集AUC指标 95 | val evaluator = new BinaryClassificationEvaluator(). 96 | setLabelCol("label"). 97 | setRawPredictionCol("probability"). 98 | setMetricName("areaUnderROC") 99 | val testAUC = evaluator.evaluate(test_predict) 100 | println("Test AUC = " + testAUC) 101 | 102 | //15 设置模型阈值 103 | // 不同的阈值,计算不同的F1,然后通过最大的F1找出并重设模型的最佳阈值。 104 | val fMeasure = binarySummary.fMeasureByThreshold 105 | fMeasure.show 106 | // 获得最大的F1值 107 | val maxFMeasure = fMeasure.select(max("F-Measure")).head().getDouble(0) 108 | // 找出最大F1值对应的阈值(最佳阈值) 109 | val bestThreshold = fMeasure.where($"F-Measure" === maxFMeasure).select("threshold").head().getDouble(0) 110 | // 并将模型的Threshold设置为选择出来的最佳分类阈值 111 | lrModel.setThreshold(bestThreshold) 112 | 113 | //16 模型保存与加载 114 | // 保存 115 | val now = new Date() 116 | val dateFormat1 = new SimpleDateFormat("yyyyMMddHHmmss") 117 | val time_stamp = dateFormat1.format(now) 118 | 119 | lrModel.save(s"hdfs://1.1.1.1:9000/lrmodel/${time_stamp}") 120 | // 加载 121 | val load_lrModel = LogisticRegressionModel.load(s"hdfs://1.1.1.1:9000/lrmodel/${time_stamp}") 122 | // 加载测试 123 | val load_predict = load_lrModel.transform(test) 124 | println("加载测试") 125 | load_predict.select("features", "label", "probability", "prediction").take(5).foreach { 126 | case Row(features: Vector, label: Double, prob: Vector, prediction: Double) => 127 | println(s"($features, $label) -> prob=$prob, prediction=$prediction") 128 | } 129 | 130 | } 131 | 132 | /** 133 | * 读取libSVM格式的文件,生成训练样本和测试样本。 134 | */ 135 | def readLibSvmSampleData( 136 | @transient spark: org.apache.spark.sql.SparkSession, 137 | dataPath: String): (Dataset[LabeledPoint], Dataset[LabeledPoint]) = { 138 | import spark.implicits._ 139 | // 2.1 读取样本 140 | 141 | // 2.3 划分样本 142 | 143 | (training, test) 144 | } 145 | 146 | } -------------------------------------------------------------------------------- /推荐系统算法实践—源码下载/第7章FM/FM.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # ## 0)环境准备 5 | 6 | # In[1]: 7 | 8 | import numpy as np 9 | import tensorflow as tf 10 | import pandas as pd 11 | import random 12 | import math 13 | import re 14 | 15 | from sklearn import preprocessing 16 | from os import path, listdir 17 | from sklearn.datasets import load_svmlight_files 18 | from sklearn.model_selection import train_test_split 19 | from 
sklearn import metrics 20 | from tensorflow.contrib import layers 21 | 22 | import time 23 | import datetime 24 | 25 | import os 26 | os.environ["CUDA_VISIBLE_DEVICES"]="0" 27 | 28 | print tf.__version__ 29 | print tf.__path__ 30 | 31 | 32 | # ## 1)数据准备Dataset格式 33 | 34 | # In[2]: 35 | 36 | # 每一行解析,解析标签csv格式 37 | # 0.0,0.6666666666666666,0.5,0.0,0.0,0.0,0.0,0.7272727272727273,0.42857142857142855 38 | # 数据处理 39 | def process_data(data_type, my_path, feature_size, batch_size=32, num_epochs=1): 40 | filenames = get_file_list(my_path) 41 | next_element = read_my_file_format(data_type, filenames, feature_size, batch_size, num_epochs) 42 | return next_element 43 | 44 | # 创建session,指定GPU或者CPU使用率 45 | def get_session(gpu_fraction=0.1): 46 | gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=gpu_fraction, 47 | allow_growth=True) 48 | # server = tf.train.Server.create_local_server() 49 | return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) 50 | 51 | 52 | # ## 2)FM模型 53 | 54 | # In[3]: 55 | 56 | class FM(object): 57 | """ 初始化成员变量 """ 58 | def __init__(self, feature_size, fm_v_size, loss_fuc, train_optimizer, learning_rate, reg_type, reg_param): 59 | # 特征向量长度 60 | self.feature_size = feature_size 61 | # fm_v_size向量长度 62 | self.fm_v_size = fm_v_size 63 | # 损失函数 64 | self.loss_fuc = loss_fuc 65 | # 优化方法 66 | self.train_optimizer = train_optimizer 67 | # 学习率 68 | self.learning_rate = learning_rate 69 | # 正则类型 70 | self.reg_type = reg_type 71 | # 正则因子 72 | self.reg_param = reg_param 73 | # aglobal_step 74 | self.global_step = tf.Variable(0, dtype=tf.int32, trainable=False, name='global_step') 75 | 76 | def train(self, batch_data): 77 | """ 1 定义输入数据 """ 78 | with tf.name_scope('input_data'): 79 | # 标签:[batch_size, 1] 80 | labels = batch_data['labels'] 81 | # 用户特征向量:[batch_size, feature_size] 82 | dense_vector = tf.reshape(batch_data['dense_vector'], shape=[-1, feature_size, 1]) # None * feature_size * 1 83 | print("%s: %s" % ("dense_vector", dense_vector)) 84 | print("%s: %s" % ("labels", labels)) 85 | 86 | """ 2 定义网络输出 """ 87 | with tf.name_scope("FM_Comput_Score"): 88 | # FM参数,生成或者获取W V 89 | with tf.variable_scope("fm_layer", reuse=tf.AUTO_REUSE): 90 | self.FM_W = tf.get_variable(name='fm_w', shape=[self.feature_size, 1], initializer=tf.glorot_normal_initializer()) 91 | self.FM_V = tf.get_variable(name='fm_v', shape=[self.feature_size, self.fm_v_size], initializer=tf.glorot_normal_initializer()) 92 | self.FM_B = tf.Variable(tf.constant(0.0), dtype=tf.float32 ,name="fm_bias") # W0 93 | print("%s: %s" % ("FM_W", self.FM_W)) 94 | print("%s: %s" % ("FM_V", self.FM_V)) 95 | print("%s: %s" % ("FM_B", self.FM_B)) 96 | 97 | # ---------- w * x---------- 98 | Y_first = tf.reduce_sum(tf.multiply(self.FM_W, dense_vector), 2) # None * F 99 | print("%s: %s" % ("Y_first", Y_first)) 100 | 101 | # ---------- Vij * Vij* Xij --------------- 102 | embeddings = tf.multiply(self.FM_V, dense_vector) # None * V * X 103 | # sum_square part 104 | summed_features_emb = tf.reduce_sum(embeddings, 1) # sum(v*x) 105 | summed_features_emb_square = tf.square(summed_features_emb) # (sum(v*x))^2 106 | 107 | # square_sum part 108 | squared_features_emb = tf.square(embeddings) # (v*x)^2 109 | squared_sum_features_emb = tf.reduce_sum(squared_features_emb, 1) # sum((v*x)^2) 110 | 111 | # second order 112 | Y_second = 0.5 * tf.subtract(summed_features_emb_square, squared_sum_features_emb) # 0.5*((sum(v*x))^2 - sum((v*x)^2)) 113 | print("%s: %s" % ("Y_second", Y_second)) 114 | 115 | # out = W * X + Vij * Vij* Xij 116 | 
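            # The three tensors above implement the standard FM trick for the pairwise term:
            #   sum_{i<j} <v_i, v_j> * x_i * x_j
            #     = 0.5 * sum_f [ (sum_i v_{i,f} * x_i)^2 - sum_i (v_{i,f} * x_i)^2 ]
            # summed_features_emb_square is the first bracketed term and
            # squared_sum_features_emb the second, so Y_second holds one value per
            # latent factor (shape None * fm_v_size) at O(feature_size * fm_v_size)
            # cost instead of a quadratic pass over feature pairs. The lines below
            # concatenate Y_first (the linear term) with Y_second, reduce them to a
            # single score per sample and add the global bias FM_B before the sigmoid.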
FM_out_lay1 = tf.concat([Y_first, Y_second], axis=1) 117 | Y_Out = tf.reduce_sum(FM_out_lay1, 1) 118 | # out = out + bias 119 | y_d = tf.reshape(Y_Out,shape=[-1]) 120 | Y_bias = self.FM_B * tf.ones_like(y_d, dtype=tf.float32) # Y_bias 121 | Y_Out = tf.add(Y_Out, Y_bias, name='Y_Out') 122 | print("%s: %s" % ("Y_bias", Y_bias)) 123 | print("%s: %s" % ("Y_Out", Y_Out)) 124 | # ---------- score ---------- 125 | score=tf.nn.sigmoid(Y_Out,name='score') 126 | score=tf.reshape(score, shape=[-1, 1]) 127 | print("%s: %s" % ("score", score)) 128 | 129 | """ 3 定义损失函数和AUC指标 """ 130 | with tf.name_scope("loss"): 131 | # loss:Squared_error,Cross_entropy ,FTLR 132 | if reg_type == 'l1_reg': 133 | regularization = tf.contrib.layers.l1_regularizer(self.reg_param)(self.FM_W) 134 | elif reg_type == 'l2_reg': 135 | regularization = self.reg_param * tf.nn.l2_loss(self.FM_W) 136 | else: 137 | regularization = self.reg_param * tf.nn.l2_loss(self.FM_W) 138 | 139 | if loss_fuc == 'Squared_error': 140 | loss = tf.reduce_mean(tf.reduce_sum(tf.square(labels - score), reduction_indices=[1])) + regularization 141 | elif loss_fuc == 'Cross_entropy': 142 | loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=tf.reshape(Y_Out, [-1]), labels=tf.reshape(labels, [-1]))) + regularization 143 | elif loss_fuc == 'FTLR': 144 | loss = tf.reduce_mean(tf.reduce_sum(tf.square(labels - score), reduction_indices=[1])) + regularization 145 | # AUC 146 | auc = tf.metrics.auc(labels, score) 147 | print("%s: %s" % ("labels", labels)) 148 | # w为0的比例,w的平均值 149 | w_zero_ratio = tf.reduce_mean(tf.to_float(tf.abs(self.FM_W) <= 1.0e-5)) 150 | w_avg = tf.reduce_mean(self.FM_W) 151 | v_zero_ratio = tf.reduce_mean(tf.to_float(tf.abs(self.FM_V) <= 1.0e-5)) 152 | v_avg = tf.reduce_mean(self.FM_V) 153 | 154 | """ 4 设定optimizer """ 155 | with tf.name_scope("optimizer"): 156 | #------bulid optimizer------ 157 | with tf.variable_scope("Optimizer", reuse=tf.AUTO_REUSE): 158 | if train_optimizer == 'Adam': 159 | optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-8) 160 | elif train_optimizer == 'Adagrad': 161 | optimizer = tf.train.AdagradOptimizer(learning_rate=learning_rate, initial_accumulator_value=1e-8) 162 | elif train_optimizer == 'Momentum': 163 | optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum=0.95) 164 | elif train_optimizer == 'ftrl': 165 | optimizer = tf.train.FtrlOptimizer(learning_rate) 166 | train_step = optimizer.minimize(loss, global_step=self.global_step) 167 | 168 | """5 设定summary,以便在Tensorboard里进行可视化 """ 169 | with tf.name_scope("summaries"): 170 | tf.summary.scalar("loss", loss) 171 | tf.summary.scalar("accumulate_auc", auc[0]) 172 | tf.summary.scalar("w_avg", w_avg) 173 | tf.summary.scalar("w_zero_ratio", w_zero_ratio) 174 | tf.summary.scalar("v_avg", v_avg) 175 | tf.summary.scalar("v_zero_ratio", v_zero_ratio) 176 | tf.summary.histogram("FM_W", self.FM_W) 177 | tf.summary.histogram("FM_V", self.FM_V) 178 | # 好几个summary,所以这里要merge_all 179 | summary_op = tf.summary.merge_all() 180 | 181 | """6 返回结果 """ 182 | return Y_Out, score, regularization, loss, auc, train_step, w_zero_ratio, w_avg, v_zero_ratio, v_avg, labels, score, summary_op 183 | 184 | 185 | # ## 3)模型训练测试 186 | 187 | # In[4]: 188 | 189 | # 测试数据 190 | filenames = '/data/csv-all' 191 | data_type='csv' 192 | feature_size = 530 193 | batch_size = 6000 194 | num_epochs = 200 195 | next_element = process_data(data_type, filenames, feature_size, batch_size, num_epochs) 196 | 197 | # 模型参数 198 
| feature_size = 530 199 | fm_v_size = 20 200 | loss_fuc = 'Cross_entropy' 201 | train_optimizer = 'Adam' 202 | learning_rate = 0.01 203 | reg_type = 'l2_reg' 204 | reg_param = 0.000 205 | log_path='/data/log/FM_Cross_entropy_L2_0_20180816_01' 206 | 207 | # 开始训练 208 | bea_model = FM(feature_size, fm_v_size, loss_fuc, train_optimizer, learning_rate, reg_type, reg_param) 209 | Y_Out, score, regularization, loss, auc, train_step, w_zero_ratio, w_avg, v_zero_ratio, v_avg, labels, score, summary_op = bea_model.train(next_element) 210 | 211 | init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer(), tf.tables_initializer()) 212 | gpu_fraction = 0.6 213 | my_device='/gpu:0' 214 | with tf.device(my_device): 215 | sess = get_session(gpu_fraction) 216 | sess.run(init_op) 217 | batch_cnt = 0 218 | #选定可视化存储目录 219 | writer = tf.summary.FileWriter(log_path, sess.graph) 220 | try: 221 | while True: 222 | batch_cnt = batch_cnt + 1 223 | a, b, c, d, e, summary = sess.run([loss, auc, w_zero_ratio, w_avg, train_step, summary_op]) 224 | if batch_cnt % 50 == 0 or batch_cnt <= 10: 225 | y, p = sess.run([labels, score]) 226 | if y.sum() > 0.0: 227 | batch_auc=metrics.roc_auc_score(y, p) 228 | else: 229 | batch_auc=0.0 230 | print("batch: {} loss: {:.4f} accumulate_auc: {:.4f} batch_auc: {:.4f} w_zero_ratio: {:.4f} w_avg: {:.4f}".format(batch_cnt, a, b[0], batch_auc, c, d)) 231 | writer.add_summary(summary, batch_cnt) 232 | except tf.errors.OutOfRangeError: 233 | print("3、Train end of dataset") 234 | -------------------------------------------------------------------------------- /推荐系统算法实践—源码下载/第7章FM/FM_Sk.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # ## 0)环境设定 5 | 6 | # In[1]: 7 | 8 | from sklearn import metrics 9 | from os import path, listdir 10 | from sklearn.datasets import load_svmlight_files 11 | from sklearn.model_selection import train_test_split 12 | from sklearn.externals import joblib 13 | from sklearn import preprocessing 14 | from sklearn import metrics 15 | from fastFM import als 16 | import numpy as np 17 | import pandas as pd 18 | import random 19 | 20 | 21 | # ## 1)数据准备 22 | 23 | # In[2]: 24 | 25 | # 数据处理,读取libSVM格式数据,并且将数据归一化,样本划分,并且根据batch参数生成batch 26 | def process_data(data_path, feature_size, test_rat=0.3, random_seed=0, train_batch_size=5000, test_batch_size=5000): 27 | # 读取文件 28 | # batch生成 29 | return {"train_batch": train_batch, "test_batch": test_batch, "X_train": X_train, "X_test": X_test, "Y_train": Y_train, "Y_test": Y_test} 30 | 31 | # In[3]: 32 | 33 | data_path = '/data/data01/' 34 | test_rat=0.4 35 | random_seed=0 36 | train_batch_size=20000 37 | test_batch_size=20000 38 | feature_size=530 39 | 40 | # 获取样本数据 41 | data = process_data(data_path, feature_size, test_rat, random_seed, train_batch_size, test_batch_size) 42 | 43 | train_batch = data['train_batch'] 44 | test_batch = data['test_batch'] 45 | 46 | X_train = data['X_train'] 47 | Y_train = data['Y_train'] 48 | X_test = data['X_test'] 49 | Y_test = data['Y_test'] 50 | 51 | print X_train.shape 52 | print Y_train.shape 53 | print X_test.shape 54 | print Y_test.shape 55 | 56 | 57 | # In[6]: 58 | 59 | print Y_train 60 | 61 | 62 | # ## 3)FM模型 63 | 64 | # In[4]: 65 | 66 | # 3.1 建立FM模型,并且设定参数 67 | fm_model = als.FMRegression(n_iter=1000, init_stdev=0.1, rank=10, random_state=0, l2_reg_w=0.0, l2_reg_V=0.0, l2_reg=0) 68 | 69 | # 3.2 训练FM模型 70 | fm_model.fit(X_train,Y_train) 71 | 72 | # 3.3 采用测试集验证模型离线指标 73 | # 训练集AUC 74 | probs_train= 
fm_model.predict_proba(X_train) 75 | AUC1 = metrics.roc_auc_score(Y_train, probs_train[:,1]) 76 | print("Train Auc: %s"%(AUC1)) 77 | 78 | # 测试集AUC 79 | probs_test= fm_model.predict_proba(X_test) 80 | predict_test = fm_model.predict(X_test) 81 | AUC2 = metrics.roc_auc_score(Y_test, probs_test[:,1]) 82 | print("Test Auc: %s"%(AUC2)) 83 | 84 | # 准确率 85 | accuracy = metrics.accuracy_score(Y_test, predict_test) 86 | print("Test Accuracy: %s"%(accuracy)) 87 | 88 | # 召回率 89 | recall = metrics.recall_score(Y_test, predict_test) 90 | print("Test Recall: %s"%(recall)) 91 | 92 | # F1值 93 | f1 = metrics.f1_score(Y_test, predict_test) 94 | print("Test F1: %s"%(f1)) 95 | 96 | # 3.5 模型保存 97 | joblib.dump(fm_model,"FM.model") 98 | #模型加载 99 | print("模型加载") 100 | load_lr = joblib.load("FM.model") 101 | print(load_lr.predict_proba(X_test[0:5])) 102 | 103 | 104 | # In[ ]: 105 | 106 | # 3.4 打印模型参数 107 | print("参数:",lr_model.coef_) 108 | print("截距:",lr_model.intercept_) 109 | print("稀疏化特征比率:%.2f%%" %(np.mean(lr_model.coef_.ravel()==0)*100)) 110 | print("=========sigmoid函数转化的值,即:概率p=========") 111 | print(lr_model.predict_proba(X_test[0:5])) #sigmoid函数转化的值,即:概率p 112 | 113 | # 3.5 模型保存 114 | joblib.dump(fm_model,"FM.model") 115 | #模型加载 116 | load_lr = joblib.load("FM.model") 117 | print(load_lr.predict_proba(X_test[0:5])) 118 | 119 | -------------------------------------------------------------------------------- /推荐系统算法实践—源码下载/第8章决策树/DecisionTrees.scala: -------------------------------------------------------------------------------- 1 | package book_code 2 | 3 | import org.apache.spark.ml.feature._ 4 | import org.apache.spark.ml.Pipeline 5 | import org.apache.spark.ml.classification.{ RandomForestClassificationModel, RandomForestClassifier } 6 | import org.apache.spark.ml.classification.{ DecisionTreeClassifier, DecisionTreeClassificationModel } 7 | import org.apache.spark.ml.classification.{ GBTClassificationModel, GBTClassifier } 8 | import org.apache.spark.ml.evaluation.{ MulticlassClassificationEvaluator, BinaryClassificationEvaluator } 9 | import org.apache.spark.ml.{ Pipeline, PipelineModel } 10 | import org.apache.spark.ml.param.ParamMap 11 | import org.apache.spark.ml.linalg.{ Vector, Vectors } 12 | import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder 13 | import org.apache.spark.sql.Encoder 14 | import org.apache.spark.sql.types._ 15 | import org.apache.spark.sql.functions._ 16 | import org.apache.spark.sql._ 17 | import org.apache.spark.sql.SparkSession 18 | import java.util.Date 19 | import java.text.SimpleDateFormat 20 | 21 | object DecisionTrees { 22 | 23 | def main(args: Array[String]): Unit = { 24 | 25 | val spark = SparkSession.builder(). 26 | master("local"). 27 | appName("decision_trees"). 28 | getOrCreate() 29 | 30 | import spark.implicits._ 31 | 32 | //1 参数准备 33 | val dataPath = "hdfs://1.1.1.1:9000/user/data01/" 34 | 35 | //2 训练样本准备 36 | val (training, test) = readLibSvmSampleData(spark, dataPath) 37 | training.cache() 38 | test.cache() 39 | println(s"training.count(): ${training.count()}") 40 | println(s"test.count(): ${test.count()}") 41 | println("training.show") 42 | training.show 43 | 44 | val data = training.unionAll(test) 45 | 46 | //2 标签进行索引编号 47 | val labelIndexer = new StringIndexer(). 48 | setInputCol("label"). 49 | setOutputCol("indexedLabel"). 50 | fit(data) 51 | // 对离散特征进行标记索引,以用来确定哪些特征是离散特征 52 | // 如果一个特征的值超过4个以上,该特征视为连续特征,否则将会标记得离散特征并进行索引编号 53 | val featureIndexer = new VectorIndexer(). 54 | setInputCol("features"). 55 | setOutputCol("indexedFeatures"). 
56 | setMaxCategories(4). 57 | fit(data) 58 | 59 | //3 样本划分 60 | val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3)) 61 | 62 | //4 训练决策树模型 63 | val dt = new DecisionTreeClassifier(). 64 | setLabelCol("indexedLabel"). 65 | setFeaturesCol("indexedFeatures") 66 | 67 | //4 训练随机森林模型 68 | val rf = new RandomForestClassifier() 69 | .setLabelCol("indexedLabel") 70 | .setFeaturesCol("indexedFeatures") 71 | .setNumTrees(10) 72 | .setMaxDepth(15) 73 | 74 | //4 训练GBDT模型 75 | val gbt = new GBTClassifier() 76 | .setLabelCol("indexedLabel") 77 | .setFeaturesCol("indexedFeatures") 78 | .setMaxIter(10) 79 | .setMaxDepth(15) 80 | 81 | //5 将索引的标签转回原始标签 82 | val labelConverter = new IndexToString(). 83 | setInputCol("prediction"). 84 | setOutputCol("predictedLabel"). 85 | setLabels(labelIndexer.labels) 86 | 87 | //6 构建Pipeline 88 | val pipeline1 = new Pipeline(). 89 | setStages(Array(labelIndexer, featureIndexer, dt, labelConverter)) 90 | val pipeline2 = new Pipeline(). 91 | setStages(Array(labelIndexer, featureIndexer, rf, labelConverter)) 92 | val pipeline3 = new Pipeline(). 93 | setStages(Array(labelIndexer, featureIndexer, gbt, labelConverter)) 94 | 95 | //7 Pipeline开始训练 96 | val model1 = pipeline1.fit(trainingData) 97 | val model2 = pipeline2.fit(trainingData) 98 | val model3 = pipeline3.fit(trainingData) 99 | 100 | //8 模型测试 101 | val predictions = model3.transform(testData) 102 | println("predictions.show") 103 | predictions.select("predictedLabel", "label", "features").show(10) 104 | 105 | //9 分类指标 106 | // 正确率 107 | val evaluator1 = new MulticlassClassificationEvaluator(). 108 | setLabelCol("indexedLabel"). 109 | setPredictionCol("prediction"). 110 | setMetricName("accuracy") 111 | val accuracy = evaluator1.evaluate(predictions) 112 | println("Test Error = " + (1.0 - accuracy)) 113 | // f1 114 | val evaluator2 = new MulticlassClassificationEvaluator(). 115 | setLabelCol("indexedLabel"). 116 | setPredictionCol("prediction"). 117 | setMetricName("f1") 118 | val f1 = evaluator2.evaluate(predictions) 119 | println("f1 = " + f1) 120 | // Precision 121 | val evaluator3 = new MulticlassClassificationEvaluator(). 122 | setLabelCol("indexedLabel"). 123 | setPredictionCol("prediction"). 124 | setMetricName("weightedPrecision") 125 | val Precision = evaluator3.evaluate(predictions) 126 | println("Precision = " + Precision) 127 | // Recall 128 | val evaluator4 = new MulticlassClassificationEvaluator(). 129 | setLabelCol("indexedLabel"). 130 | setPredictionCol("prediction"). 131 | setMetricName("weightedRecall") 132 | val Recall = evaluator4.evaluate(predictions) 133 | println("Recall = " + Recall) 134 | 135 | // AUC 136 | val evaluator5 = new BinaryClassificationEvaluator(). 137 | setLabelCol("indexedLabel"). 138 | setRawPredictionCol("prediction"). 139 | setMetricName("areaUnderROC") 140 | val AUC = evaluator5.evaluate(predictions) 141 | println("Test AUC = " + AUC) 142 | 143 | // aupr 144 | val evaluator6 = new BinaryClassificationEvaluator(). 145 | setLabelCol("indexedLabel"). 146 | setRawPredictionCol("prediction"). 
147 | setMetricName("areaUnderPR") 148 | val aupr = evaluator6.evaluate(predictions) 149 | println("Test aupr = " + aupr) 150 | 151 | //10 决策树打印 152 | val treeModel = model1.stages(2).asInstanceOf[DecisionTreeClassificationModel] 153 | println("Learned classification tree model:\n" + treeModel.toDebugString) 154 | 155 | //11 模型保存与加载 156 | val now = new Date() 157 | val dateFormat1 = new SimpleDateFormat("yyyyMMddHHmmss") 158 | val time_stamp = dateFormat1.format(now) 159 | model1.save("hdfs://1.1.1.1:9000/dtmodel/${time_stamp}") 160 | val load_treeModel = PipelineModel.load(s"hdfs://1.1.1.1:9000/dtmodel/${time_stamp}") 161 | 162 | } 163 | 164 | /** 165 | * 读取libSVM格式的文件,生成训练样本和测试样本。 166 | */ 167 | def readLibSvmSampleData( 168 | @transient spark: org.apache.spark.sql.SparkSession, 169 | dataPath: String): (Dataset[LabeledPoint], Dataset[LabeledPoint]) = { 170 | import spark.implicits._ 171 | // 2.1 读取样本 172 | // 2.3 划分样本 173 | (training, test) 174 | } 175 | 176 | } 177 | 178 | -------------------------------------------------------------------------------- /推荐系统算法实践—源码下载/第8章决策树/Tree.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # ## 1)环境设定 5 | 6 | # In[1]: 7 | 8 | from sklearn.linear_model import LogisticRegression 9 | from sklearn import metrics 10 | from os import path, listdir 11 | from sklearn.datasets import load_svmlight_files 12 | from sklearn.model_selection import train_test_split 13 | from sklearn.externals import joblib 14 | from sklearn import preprocessing 15 | import numpy as np 16 | import pandas as pd 17 | import random 18 | 19 | from sklearn import metrics 20 | from sklearn.svm import SVC 21 | from sklearn.neural_network import MLPClassifier 22 | from sklearn.neighbors import KNeighborsClassifier 23 | from sklearn.tree import DecisionTreeClassifier 24 | from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier 25 | from sklearn.naive_bayes import GaussianNB 26 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 27 | from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis 28 | 29 | 30 | # ## 2)数据准备 31 | 32 | # In[2]: 33 | 34 | # 数据处理,读取libSVM格式数据,并且将数据归一化,样本划分,并且根据batch参数生成batch 35 | def process_data(data_path, feature_size, test_rat=0.3, random_seed=0, train_batch_size=5000, test_batch_size=5000): 36 | # 读取文件 37 | # batch生成 38 | return {"train_batch": train_batch, "test_batch": test_batch, "X_train": X_train, "X_test": X_test, "Y_train": Y_train, "Y_test": Y_test} 39 | 40 | # In[3]: 41 | 42 | data_path = '/data/data01/' 43 | test_rat=0.4 44 | random_seed=0 45 | train_batch_size=20000 46 | test_batch_size=20000 47 | feature_size=530 48 | 49 | # 获取样本数据 50 | data = process_data(data_path, feature_size, test_rat, random_seed, train_batch_size, test_batch_size) 51 | 52 | train_batch = data['train_batch'] 53 | test_batch = data['test_batch'] 54 | 55 | X_train = data['X_train'] 56 | Y_train = data['Y_train'] 57 | X_test = data['X_test'] 58 | Y_test = data['Y_test'] 59 | 60 | print X_train.shape 61 | print Y_train.shape 62 | print X_test.shape 63 | print Y_test.shape 64 | 65 | 66 | # ## 3)Tree模型 67 | 68 | # In[4]: 69 | 70 | # 3.1 随机森林模型,并且设定参数 71 | rf_model= RandomForestClassifier( 72 | n_estimators=30, 73 | criterion='gini', 74 | max_depth=20, 75 | min_samples_leaf=200) 76 | 77 | # 3.1 GBDT模型,并且设定参数 78 | gbdt_model= GradientBoostingClassifier( 79 | n_estimators=30, 80 | criterion='friedman_mse', 81 | max_depth=20, 82 | min_samples_leaf=200) 83 | 84 | # 
3.2 训练模型 85 | rf_model.fit(X_train,Y_train.values.ravel()) 86 | gbdt_model.fit(X_train,Y_train.values.ravel()) 87 | 88 | 89 | # In[5]: 90 | 91 | # 3.3 采用测试集验证模型离线指标 92 | # 训练集AUC 93 | probs_train= rf_model.predict_proba(X_train) 94 | AUC1 = metrics.roc_auc_score(Y_train, probs_train[:,1]) 95 | print("RF Train Auc: %s"%(AUC1)) 96 | 97 | # 测试集AUC 98 | probs_test= rf_model.predict_proba(X_test) 99 | predict_test = rf_model.predict(X_test) 100 | AUC2 = metrics.roc_auc_score(Y_test, probs_test[:,1]) 101 | print("RF Test Auc: %s"%(AUC2)) 102 | 103 | # 训练集AUC 104 | probs_train2= gbdt_model.predict_proba(X_train) 105 | AUC3 = metrics.roc_auc_score(Y_train, probs_train2[:,1]) 106 | print("Gbdt Train Auc: %s"%(AUC3)) 107 | 108 | # 测试集AUC 109 | probs_test2= gbdt_model.predict_proba(X_test) 110 | AUC4 = metrics.roc_auc_score(Y_test, probs_test2[:,1]) 111 | print("Gbdt Test Auc: %s"%(AUC4)) 112 | 113 | 114 | # In[6]: 115 | 116 | # 准确率 117 | accuracy = metrics.accuracy_score(Y_test, predict_test) 118 | print("Test Accuracy: %s"%(accuracy)) 119 | 120 | # 召回率 121 | recall = metrics.recall_score(Y_test, predict_test) 122 | print("Test Recall: %s"%(recall)) 123 | 124 | # F1值 125 | f1 = metrics.f1_score(Y_test, predict_test) 126 | print("Test F1: %s"%(f1)) 127 | 128 | 129 | # In[7]: 130 | 131 | # 3.1 随机森林模型,并且设定参数 132 | rf_model= RandomForestClassifier( 133 | n_estimators=50, 134 | criterion='gini', 135 | max_depth=30, 136 | min_samples_leaf=100) 137 | 138 | # 3.1 GBDT模型,并且设定参数 139 | gbdt_model= GradientBoostingClassifier( 140 | n_estimators=50, 141 | criterion='friedman_mse', 142 | max_depth=30, 143 | min_samples_leaf=100) 144 | 145 | # 3.2 训练模型 146 | rf_model.fit(X_train,Y_train) 147 | gbdt_model.fit(X_train,Y_train) 148 | 149 | # 3.3 采用测试集验证模型离线指标 150 | # RF训练集AUC 151 | probs_train= rf_model.predict_proba(X_train) 152 | AUC1 = metrics.roc_auc_score(Y_train, probs_train[:,1]) 153 | print("RF Train Auc: %s"%(AUC1)) 154 | 155 | # RF测试集AUC 156 | probs_test= rf_model.predict_proba(X_test) 157 | predict_test = rf_model.predict(X_test) 158 | AUC2 = metrics.roc_auc_score(Y_test, probs_test[:,1]) 159 | print("RF Test Auc: %s"%(AUC2)) 160 | 161 | # Gbdt训练集AUC 162 | probs_train2= gbdt_model.predict_proba(X_train) 163 | AUC3 = metrics.roc_auc_score(Y_train, probs_train2[:,1]) 164 | print("Gbdt Train Auc: %s"%(AUC3)) 165 | 166 | # Gbdt测试集AUC 167 | probs_test2= gbdt_model.predict_proba(X_test) 168 | AUC4 = metrics.roc_auc_score(Y_test, probs_test2[:,1]) 169 | print("Gbdt Test Auc: %s"%(AUC4)) 170 | 171 | # 准确率 172 | accuracy = metrics.accuracy_score(Y_test, predict_test) 173 | print("Test Accuracy: %s"%(accuracy)) 174 | 175 | # 召回率 176 | recall = metrics.recall_score(Y_test, predict_test) 177 | print("Test Recall: %s"%(recall)) 178 | 179 | # F1值 180 | f1 = metrics.f1_score(Y_test, predict_test) 181 | print("Test F1: %s"%(f1)) 182 | 183 | # 3.5 模型保存 184 | joblib.dump(rf_model,"rf_model.model") 185 | joblib.dump(gbdt_model,"gbdt_model.model") 186 | #模型加载 187 | load_rf = joblib.load("rf_model.model") 188 | load_gbdt = joblib.load("gbdt_model.model") 189 | print(load_rf.predict_proba(X_test[0:5])) 190 | print(load_gbdt.predict_proba(X_test[0:5])) 191 | -------------------------------------------------------------------------------- /推荐系统算法实践—源码下载/第9章集成学习/GbdtLr.scala: -------------------------------------------------------------------------------- 1 | package book_code 2 | 3 | import org.apache.hadoop.conf.Configuration 4 | import org.apache.hadoop.fs.{ FileSystem, Path } 5 | import org.apache.spark.mllib.classification.{ 
LogisticRegressionModel, LogisticRegressionWithLBFGS } 6 | import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics 7 | import org.apache.spark.mllib.linalg.Vectors 8 | import org.apache.spark.ml.linalg.{ Vector => mlVector } 9 | import org.apache.spark.mllib.linalg.Vector 10 | import org.apache.spark.mllib.regression.LabeledPoint 11 | import org.apache.spark.mllib.tree.GradientBoostedTrees 12 | import org.apache.spark.mllib.tree.configuration.BoostingStrategy 13 | import org.apache.spark.mllib.tree.configuration.FeatureType._ 14 | import org.apache.spark.mllib.tree.model.{ GradientBoostedTreesModel, Node } 15 | import org.apache.spark.rdd.RDD 16 | import org.apache.spark.sql._ 17 | import scala.collection.mutable.ArrayBuffer 18 | 19 | object GbdtLr { 20 | 21 | def main(args: Array[String]): Unit = { 22 | 23 | val spark = SparkSession.builder(). 24 | master("local"). 25 | appName("GbdtLr"). 26 | getOrCreate() 27 | 28 | import spark.implicits._ 29 | 30 | //1 参数准备 31 | val iteratTree = 10 32 | val iteratDepth = 10 33 | val maxAuc = 0.0 34 | val maxDepth = 15 35 | val numTrees = 10 36 | val minInstancesPerNode = 2 37 | 38 | //2 训练样本准备 39 | val dataPath = "hdfs://1.1.1.1:9000/user/data01/" 40 | 41 | //2 训练样本准备 42 | val (trainingData, testData) = readLibSvmSampleData(spark, dataPath) 43 | trainingData.cache() 44 | testData.cache() 45 | println(s"trainingData.count(): ${trainingData.count()}") 46 | println(s"testData.count(): ${testData.count()}") 47 | println("trainingData.show") 48 | trainingData.show 49 | val data = trainingData.unionAll(testData) 50 | 51 | //3 Gbdt模型训练 52 | val boostingStrategy = BoostingStrategy.defaultParams("Regression") 53 | boostingStrategy.treeStrategy.categoricalFeaturesInfo = Map[Int, Int]() 54 | boostingStrategy.treeStrategy.minInstancesPerNode = minInstancesPerNode 55 | boostingStrategy.numIterations = numTrees 56 | boostingStrategy.treeStrategy.maxDepth = maxDepth 57 | val gbdtModel = GradientBoostedTrees.train(trainingData.rdd, boostingStrategy) 58 | 59 | //4 gbdt模型解析:取出所有树的叶子节点 60 | val treeLeafMap = getTreeLeafMap(gbdtModel) 61 | 62 | //5 样本数据转换成gbdt叶子节点编号的样本 63 | val lrSampleLablePoint = lrSample(data.rdd, treeLeafMap, gbdtModel) 64 | val lrSplits = lrSampleLablePoint.randomSplit(Array(0.7, 0.3)) 65 | val (lrTrainingData, lrTestData) = (lrSplits(0), lrSplits(1)) 66 | lrTrainingData.cache() 67 | lrTrainingData.count() 68 | lrTestData.cache() 69 | lrTestData.count() 70 | 71 | //6 lr模型训练 72 | val lr = new LogisticRegressionWithLBFGS().setNumClasses(2) 73 | lr.optimizer.setNumIterations(100) 74 | lr.optimizer.setRegParam(0.0) 75 | val lrModel = lr.run(lrTrainingData) 76 | 77 | //7 计算模型指标 78 | lrModel.clearThreshold() 79 | val scoreAndLabels = lrTestData.map { point => 80 | val score = lrModel.predict(point.features) 81 | (score, point.label) 82 | } 83 | val metrics = new BinaryClassificationMetrics(scoreAndLabels) 84 | val auc = metrics.areaUnderROC() 85 | val aupr = metrics.areaUnderPR() 86 | println(s"AUC: ${auc}") 87 | println(s"AUPR: ${aupr}") 88 | 89 | } 90 | 91 | /** 92 | * 根据gbdt模型生成gbdtlr模型的样本 93 | */ 94 | def lrSample(): RDD[LabeledPoint] = { 95 | lrSamplLablePoint 96 | } 97 | 98 | /** 99 | * gbdt模型解析叶子节点 100 | */ 101 | def getTreeLeafMap(gbdtModel: GradientBoostedTreesModel): Map[String, Int] = { 102 | lrFeatureMap 103 | } 104 | 105 | /** 106 | * 读取libSVM格式的文件,生成训练样本和测试样本。 107 | */ 108 | def readLibSvmSampleData( 109 | @transient spark: org.apache.spark.sql.SparkSession, 110 | dataPath: String): (Dataset[LabeledPoint], Dataset[LabeledPoint]) 
= { 111 | import spark.implicits._ 112 | // 2.1 读取样本 113 | // 2.3 划分样本 114 | (training, test) 115 | } 116 | 117 | } 118 | 119 | -------------------------------------------------------------------------------- /推荐系统算法实践—源码下载/第9章集成学习/gcForest.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # ## 1)环境设定 5 | 6 | # In[31]: 7 | 8 | import argparse 9 | import numpy as np 10 | import sys 11 | from keras.datasets import mnist 12 | import pickle 13 | from sklearn.ensemble import RandomForestClassifier 14 | from sklearn.metrics import accuracy_score 15 | sys.path.insert(0, "lib") 16 | 17 | from gcforest.gcforest import GCForest 18 | from gcforest.utils.config_utils import load_json 19 | 20 | from sklearn.linear_model import LogisticRegression 21 | from sklearn import metrics 22 | from os import path, listdir 23 | from sklearn.datasets import load_svmlight_files 24 | from sklearn.model_selection import train_test_split 25 | from sklearn.externals import joblib 26 | from sklearn import preprocessing 27 | import numpy as np 28 | import pandas as pd 29 | import random 30 | 31 | from sklearn import metrics 32 | from sklearn.svm import SVC 33 | 34 | 35 | # ## 2)数据准备 36 | 37 | # In[32]: 38 | 39 | # 数据处理,读取libSVM格式数据,并且将数据归一化,样本划分,并且根据batch参数生成batch 40 | def process_data(data_path, feature_size, test_rat=0.3, random_seed=0, train_batch_size=5000, test_batch_size=5000): 41 | # 读取文件 42 | # batch生成 43 | return {"train_batch": train_batch, "test_batch": test_batch, "X_train": X_train, "X_test": X_test, "Y_train": Y_train, "Y_test": Y_test} 44 | 45 | # In[40]: 46 | 47 | data_path = '/data/data01/' 48 | test_rat=0.4 49 | random_seed=0 50 | train_batch_size=20000 51 | test_batch_size=20000 52 | feature_size=530 53 | 54 | # 获取样本数据 55 | data = process_data(data_path, feature_size, test_rat, random_seed, train_batch_size, test_batch_size) 56 | 57 | train_batch = data['train_batch'] 58 | test_batch = data['test_batch'] 59 | 60 | X_train = np.array(data['X_train']) 61 | y_train = np.array(data['Y_train']).reshape(-1,) 62 | X_test = np.array(data['X_test']) 63 | y_test = np.array(data['Y_test']).reshape(-1,) 64 | 65 | print X_train.shape 66 | print y_train.shape 67 | print X_test.shape 68 | print y_test.shape 69 | 70 | 71 | # In[41]: 72 | 73 | print X_train[0:2] 74 | print y_train[0:2] 75 | 76 | # (a, b), (c, d) = mnist.load_data() 77 | # print a.shape 78 | # print b.shape 79 | # print c.shape 80 | # print d.shape 81 | # print a[0:2] 82 | # print b[0:2] 83 | 84 | 85 | # ## 3)gcForest模型 86 | 87 | # In[49]: 88 | 89 | # 模型参数 90 | def get_toy_config(): 91 | config = {} 92 | ca_config = {} 93 | ca_config["random_state"] = 0 94 | ca_config["max_layers"] = 100 95 | ca_config["early_stopping_rounds"] = 3 96 | ca_config["n_classes"] = 2 97 | ca_config["estimators"] = [] 98 | ca_config["estimators"].append({"n_folds": 2, "type": "RandomForestClassifier", "n_estimators": 10, "max_depth": None, "n_jobs": -1}) 99 | ca_config["estimators"].append({"n_folds": 2, "type": "ExtraTreesClassifier", "n_estimators": 10, "max_depth": None, "n_jobs": -1}) 100 | ca_config["estimators"].append({"n_folds": 2, "type": "LogisticRegression"}) 101 | config["cascade"] = ca_config 102 | return config 103 | 104 | 105 | # In[54]: 106 | 107 | # 模型参数 108 | config = get_toy_config() 109 | 110 | # 模型初始化 111 | gc = GCForest(config) 112 | 113 | # 模型训练 114 | X_train_enc = gc.fit_transform(X_train, y_train) 115 | 116 | # 模型预测 117 | y_pred = gc.predict(X_test) 118 | acc = accuracy_score(y_test, 
y_pred) 119 | print("Test Accuracy of GcForest = {:.2f} %".format(acc * 100)) 120 | 121 | # xgboost/RF预测分类. 122 | X_test_enc = gc.transform(X_test) 123 | X_train_enc = X_train_enc.reshape((X_train_enc.shape[0], -1)) 124 | X_test_enc = X_test_enc.reshape((X_test_enc.shape[0], -1)) 125 | X_train_origin = X_train.reshape((X_train.shape[0], -1)) 126 | X_test_origin = X_test.reshape((X_test.shape[0], -1)) 127 | X_train_enc = np.hstack((X_train_origin, X_train_enc)) 128 | X_test_enc = np.hstack((X_test_origin, X_test_enc)) 129 | print("X_train_enc.shape={}, X_test_enc.shape={}".format(X_train_enc.shape, X_test_enc.shape)) 130 | clf = RandomForestClassifier(n_estimators=50, max_depth=None, n_jobs=-1) 131 | clf.fit(X_train_enc, y_train) 132 | y_pred = clf.predict(X_test_enc) 133 | acc = accuracy_score(y_test, y_pred) 134 | print("Test Accuracy of Other classifier using gcforest's X_encode = {:.2f} %".format(acc * 100)) 135 | 136 | 137 | # In[55]: 138 | 139 | # 计算AUC指标 140 | probs_train= clf.predict_proba(X_train_enc) 141 | AUC1 = metrics.roc_auc_score(y_train, probs_train[:,1]) 142 | print("Train Auc: %s"%(AUC1)) 143 | 144 | probs_test= clf.predict_proba(X_test_enc) 145 | AUC2 = metrics.roc_auc_score(y_test, probs_test[:,1]) 146 | print("Test Auc: %s"%(AUC2)) 147 | 148 | # # dump 149 | # with open("test.pkl", "wb") as f: 150 | # pickle.dump(gc, f, pickle.HIGHEST_PROTOCOL) 151 | # # load 152 | # with open("test.pkl", "rb") as f: 153 | # gc = pickle.load(f) 154 | # y_pred = gc.predict(X_test) 155 | # acc = accuracy_score(y_test, y_pred) 156 | # print("Test Accuracy of GcForest (save and load) = {:.2f} %".format(acc * 100)) 157 | 158 | 159 | # In[ ]: 160 | 161 | # dump 162 | with open("test.pkl", "wb") as f: 163 | pickle.dump(gc, f, pickle.HIGHEST_PROTOCOL) 164 | # load 165 | with open("test.pkl", "rb") as f: 166 | gc = pickle.load(f) 167 | y_pred = gc.predict(X_test) 168 | acc = accuracy_score(y_test, y_pred) 169 | print("Test Accuracy of GcForest (save and load) = {:.2f} %".format(acc * 100)) 170 | 171 | -------------------------------------------------------------------------------- /推荐系统算法实践—补充部分/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WallaceLiu/recommendation_algorithm/36cdd077446ab7a2831c168f7419b058bd2fcbb0/推荐系统算法实践—补充部分/.DS_Store -------------------------------------------------------------------------------- /推荐系统算法实践—补充部分/第12章节/adult.names: -------------------------------------------------------------------------------- 1 | | This data was extracted from the census bureau database found at 2 | | http://www.census.gov/ftp/pub/DES/www/welcome.html 3 | | Donor: Ronny Kohavi and Barry Becker, 4 | | Data Mining and Visualization 5 | | Silicon Graphics. 6 | | e-mail: ronnyk@sgi.com for questions. 7 | | Split into train-test using MLC++ GenCVFiles (2/3, 1/3 random). 8 | | 48842 instances, mix of continuous and discrete (train=32561, test=16281) 9 | | 45222 if instances with unknown values are removed (train=30162, test=15060) 10 | | Duplicate or conflicting instances : 6 11 | | Class probabilities for adult.all file 12 | | Probability for the label '>50K' : 23.93% / 24.78% (without unknowns) 13 | | Probability for the label '<=50K' : 76.07% / 75.22% (without unknowns) 14 | | 15 | | Extraction was done by Barry Becker from the 1994 Census database. 
A set of 16 | | reasonably clean records was extracted using the following conditions: 17 | | ((AAGE>16) && (AGI>100) && (AFNLWGT>1)&& (HRSWK>0)) 18 | | 19 | | Prediction task is to determine whether a person makes over 50K 20 | | a year. 21 | | 22 | | First cited in: 23 | | @inproceedings{kohavi-nbtree, 24 | | author={Ron Kohavi}, 25 | | title={Scaling Up the Accuracy of Naive-Bayes Classifiers: a 26 | | Decision-Tree Hybrid}, 27 | | booktitle={Proceedings of the Second International Conference on 28 | | Knowledge Discovery and Data Mining}, 29 | | year = 1996, 30 | | pages={to appear}} 31 | | 32 | | Error Accuracy reported as follows, after removal of unknowns from 33 | | train/test sets): 34 | | C4.5 : 84.46+-0.30 35 | | Naive-Bayes: 83.88+-0.30 36 | | NBTree : 85.90+-0.28 37 | | 38 | | 39 | | Following algorithms were later run with the following error rates, 40 | | all after removal of unknowns and using the original train/test split. 41 | | All these numbers are straight runs using MLC++ with default values. 42 | | 43 | | Algorithm Error 44 | | -- ---------------- ----- 45 | | 1 C4.5 15.54 46 | | 2 C4.5-auto 14.46 47 | | 3 C4.5 rules 14.94 48 | | 4 Voted ID3 (0.6) 15.64 49 | | 5 Voted ID3 (0.8) 16.47 50 | | 6 T2 16.84 51 | | 7 1R 19.54 52 | | 8 NBTree 14.10 53 | | 9 CN2 16.00 54 | | 10 HOODG 14.82 55 | | 11 FSS Naive Bayes 14.05 56 | | 12 IDTM (Decision table) 14.46 57 | | 13 Naive-Bayes 16.12 58 | | 14 Nearest-neighbor (1) 21.42 59 | | 15 Nearest-neighbor (3) 20.35 60 | | 16 OC1 15.04 61 | | 17 Pebls Crashed. Unknown why (bounds WERE increased) 62 | | 63 | | Conversion of original data as follows: 64 | | 1. Discretized agrossincome into two ranges with threshold 50,000. 65 | | 2. Convert U.S. to US to avoid periods. 66 | | 3. Convert Unknown to "?" 67 | | 4. Run MLC++ GenCVFiles to generate data,test. 68 | | 69 | | Description of fnlwgt (final weight) 70 | | 71 | | The weights on the CPS files are controlled to independent estimates of the 72 | | civilian noninstitutional population of the US. These are prepared monthly 73 | | for us by Population Division here at the Census Bureau. We use 3 sets of 74 | | controls. 75 | | These are: 76 | | 1. A single cell estimate of the population 16+ for each state. 77 | | 2. Controls for Hispanic Origin by age and sex. 78 | | 3. Controls by Race, age and sex. 79 | | 80 | | We use all three sets of controls in our weighting program and "rake" through 81 | | them 6 times so that by the end we come back to all the controls we used. 82 | | 83 | | The term estimate refers to population totals derived from CPS by creating 84 | | "weighted tallies" of any specified socio-economic characteristics of the 85 | | population. 86 | | 87 | | People with similar demographic characteristics should have 88 | | similar weights. There is one important caveat to remember 89 | | about this statement. That is that since the CPS sample is 90 | | actually a collection of 51 state samples, each with its own 91 | | probability of selection, the statement only applies within 92 | | state. 93 | 94 | 95 | >50K, <=50K. 96 | 97 | age: continuous. 98 | workclass: Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked. 99 | fnlwgt: continuous. 100 | education: Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool. 101 | education-num: continuous. 
102 | marital-status: Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse. 103 | occupation: Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces. 104 | relationship: Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried. 105 | race: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black. 106 | sex: Female, Male. 107 | capital-gain: continuous. 108 | capital-loss: continuous. 109 | hours-per-week: continuous. 110 | native-country: United-States, Cambodia, England, Puerto-Rico, Canada, Germany, Outlying-US(Guam-USVI-etc), India, Japan, Greece, South, China, Cuba, Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador, Taiwan, Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, Yugoslavia, El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands. 111 | -------------------------------------------------------------------------------- /推荐系统算法实践—补充部分/第14章节/others.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import pandas as pd 4 | import random 5 | import math 6 | import re 7 | from os import path, listdir 8 | from tensorflow.contrib import layers 9 | from sklearn import metrics 10 | import time 11 | import datetime 12 | 13 | # ## 1)数据准备Dataset格式 14 | # 每一行解析,格式:0229|0,0,0,0,0,0,0,0,0,0,0,0,0,0|1,1173,0,0,0|18578 15 | def decode_sequence(line, continuous_size, item_size): 16 | columns = tf.string_split([line], '|') 17 | normalized_continuous_features = tf.string_to_number(tf.string_split([columns.values[1]], ',').values[0:continuous_size], out_type=tf.int32, name = "normalized_continuous_features") 18 | hist_click = tf.string_to_number(tf.string_split([columns.values[2]], ',').values[0:item_size], out_type=tf.int32, name = "hist_click") 19 | label = tf.reshape(tf.string_to_number(columns.values[3], out_type=tf.float32, name = "label"), [-1]) 20 | return {"label": label, "hist_click": hist_click, "normalized_continuous_features": normalized_continuous_features} 21 | 22 | # 文件读取,采用dataset格式 23 | def read_my_file_format(data_type, filenames, continuous_size, item_size, batch_size, num_epochs=1): 24 | # 读取文件 25 | print filenames 26 | dataset = tf.data.TextLineDataset(filenames).map(lambda x: decode_sequence(x, continuous_size, item_size)).prefetch(batch_size).cache() 27 | dataset = dataset.repeat(num_epochs) 28 | dataset = dataset.batch(batch_size) # Batch size to use 29 | iterator = dataset.make_one_shot_iterator() 30 | next_element = iterator.get_next() 31 | return next_element 32 | 33 | # 文件列表 34 | def get_file_list(my_path): 35 | files = [] 36 | if path.isdir(my_path): 37 | [files.append(path.join(my_path, p)) for p in listdir(my_path) if path.isfile(path.join(my_path, p))] 38 | else: 39 | files.append(my_path) 40 | return files 41 | 42 | # 数据处理 43 | def process_data(data_type, my_path, continuous_size, item_size, batch_size=32, num_epochs=1): 44 | filenames = get_file_list(my_path) 45 | next_element = read_my_file_format(data_type, filenames, continuous_size, item_size, batch_size, num_epochs) 46 | return next_element 47 | 48 | # In[15]: 49 | 50 | # 测试数据 51 | filenames = '/data/001' 52 | continuous_size = 16 53 | item_size = 5 54 | batch_size = 3 55 | num_epochs = 1 56 | 
data_type = 'sequence' 57 | next_element = process_data(data_type, filenames, continuous_size, item_size, batch_size, num_epochs) 58 | 59 | # ## 2)定义YouTubeNet模型 60 | """ 6 多层感知器神经网络计算,最终得到用户的embedding向量U:[batch_size, embedding_size] """ 61 | print("6 多层感知器神经网络计算") 62 | with tf.name_scope('MLP'): 63 | with tf.variable_scope("MLP", reuse=tf.AUTO_REUSE): 64 | # 第一层:(embedding_size + normalized_continuous_features_length) * embedding_size 65 | # 第二层: embedding_size * embedding_size 66 | weights = { 67 | 'h1': tf.Variable(tf.random_normal([self.embedding_size + self.normalized_continuous_features_length, self.embedding_size])), 68 | 'h2': tf.Variable(tf.random_normal([self.embedding_size, self.embedding_size])) 69 | } 70 | biases = { 71 | 'b1': tf.Variable(tf.random_normal([self.embedding_size])), 72 | 'out': tf.Variable(tf.random_normal([self.embedding_size])) 73 | } 74 | print("%s: %s" % ("weights", weights)) 75 | print("%s: %s" % ("biases", biases)) 76 | layer_1 = tf.add(tf.matmul(all_concat, weights['h1']), biases['b1']) 77 | layer_1 = tf.nn.relu(layer_1) 78 | print("%s: %s" % ("layer_1", layer_1)) 79 | layer_2 = tf.add(tf.matmul(layer_1, weights['h2']), biases['out']) 80 | print("%s: %s" % ("layer_2", layer_2)) 81 | layer_out = tf.nn.relu(layer_2) 82 | print("%s: %s" % ("layer_out", layer_out)) 83 | 84 | """ 7 Softmax计算,用户的embedding向量U 乘以 物品的embedding向量V,然后通过Softmax计算结果,其中Loss采用NCE负采样方法 """ 85 | print("7 最后一层Softmax计算") 86 | with tf.name_scope('Softmax_Classifer'): 87 | with tf.variable_scope("softmax_classifer", reuse=tf.AUTO_REUSE): 88 | # NCE LOSS 89 | loss = tf.reduce_mean( 90 | tf.nn.nce_loss( 91 | weights=self.weights, 92 | biases=self.biases, 93 | labels=label, 94 | inputs=layer_out, 95 | num_sampled=self.num_sampled, 96 | num_classes=self.item_count 97 | ) 98 | ) 99 | print("%s: %s" % ("loss", loss)) 100 | # LOSS优化方法 101 | train_step = tf.train.AdamOptimizer(learning_rate=self.learning_rate, beta1=0.9, beta2=0.999,epsilon=1e-8).minimize(loss) 102 | # Softmax的预测结果 103 | out = tf.nn.softmax(tf.matmul(layer_out, tf.transpose(self.weights)) + self.biases, dim=1) 104 | print("%s: %s" % ("out", out)) -------------------------------------------------------------------------------- /推荐系统算法实践—补充部分/第5章节/other.py: -------------------------------------------------------------------------------- 1 | # ## 2)数据准备Dataset格式 2 | # 每一行解析,解析标签csv格式 3 | # 5805 17357 4 | def decode_csv(line): 5 | # 按照,分割,取label和feature 6 | columns = tf.string_split([line], ' ') 7 | center_words = tf.reshape(tf.string_to_number(columns.values[0], out_type=tf.int32),[-1]) 8 | target_words = tf.reshape(tf.string_to_number(columns.values[1], out_type=tf.int32),[-1]) 9 | return {'center_words': center_words, 'target_words': target_words} 10 | # 文件读取,采用dataset格式 11 | def read_my_file_format(filenames, batch_size, num_epochs=1): 12 | # 读取文件 13 | dataset = tf.data.TextLineDataset(filenames).map(lambda x: decode_csv(x)).prefetch(batch_size).cache() 14 | dataset = dataset.repeat(num_epochs) 15 | dataset = dataset.batch(batch_size) 16 | iterator = dataset.make_one_shot_iterator() 17 | next_element = iterator.get_next() 18 | return next_element 19 | # 文件列表 20 | def get_file_list(my_path): 21 | files = [] 22 | if path.isdir(my_path): 23 | [files.append(path.join(my_path, p)) for p in listdir(my_path) if path.isfile(path.join(my_path, p))] 24 | else: 25 | files.append(my_path) 26 | return files 27 | # 数据处理 28 | def process_data(my_path, batch_size=32, num_epochs=1): 29 | filenames = get_file_list(my_path) 30 | next_element = 
read_my_file_format(filenames, batch_size, num_epochs) 31 | return next_element 32 | 33 | filenames = "./windows_skip_sample.csv" 34 | batch_size = 1000 35 | num_epochs = 200 36 | next_element = process_data(filenames, batch_size, num_epochs) -------------------------------------------------------------------------------- /推荐系统算法实践—补充部分/第6-11和13章节/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WallaceLiu/recommendation_algorithm/36cdd077446ab7a2831c168f7419b058bd2fcbb0/推荐系统算法实践—补充部分/第6-11和13章节/.DS_Store -------------------------------------------------------------------------------- /推荐系统算法实践—补充部分/第6-11和13章节/data/00000: -------------------------------------------------------------------------------- 1 | 0.0 0:5.0 1:2.0 6:8.0 7:5.0 8:22.0 9:3.0 10:2.000020024E9 11:15.0 12:12.0 13:8.0 14:156011.0 15:156.0 16:1.560110001E9 17:6.0 18:1363200.0 19:921600.0 20:1388.0 21:14246.0 22:850.0 23:12378.0 24:35.0 29:1.0 30:2.0 114:1.0 120:1.0 125:2.0 126:1.0 129:1.0 132:2.0 134:2.0 136:2.0 138:4.0 147:1.0 149:1.0 158:1.0 159:1.0 166:1.0 168:1.0 176:2.0 181:1.0 186:1.0 211:1.0 212:1.0 216:1.0 221:1.0 223:1.0 227:1.0 232:1.0 233:1.0 237:1.0 241:2.0 244:1.0 246:1.0 247:1.0 250:3.0 251:1.0 253:1.0 254:1.0 258:3.0 259:626.0 260:614.0 261:626.0 262:595.0 263:637.0 264:662.0 265:4.0 482:5.5 499:1.0 2 | 0.0 114:1.0 120:1.0 125:1.0 126:1.0 132:1.0 134:1.0 136:5.0 138:5.0 158:2.0 159:1.0 161:1.0 169:1.0 174:1.0 176:2.0 181:1.0 201:1.0 205:2.0 211:1.0 212:1.0 216:1.0 220:1.0 227:1.0 228:1.0 233:1.0 240:1.0 250:2.0 253:1.0 254:1.0 259:671.0 260:654.0 261:671.0 262:652.0 263:684.0 264:695.0 271:0.54714 274:0.57133 276:0.00557 278:0.03361 281:0.5464 282:0.0 321:0.54714 349:0.57133 365:0.00557 376:0.03361 392:0.52395 393:1.8E-4 395:0.01914 396:0.09437 397:0.5464 402:0.00487 404:0.0 487:1.0 3 | 0.0 0:5.0 1:1.0 6:11.0 7:8.0 8:21.0 9:5.0 10:2.000020024E9 11:16.0 12:24.0 13:4.0 14:156063.0 15:156.0 16:1.560630001E9 17:6.0 18:2265600.0 19:2073600.0 20:2835.0 21:14470.0 22:1609.0 23:12198.0 24:31.0 30:3.0 35:1.0 36:1.0 46:1.0 79:1.0 87:1.0 114:1.0 120:1.0 125:1.0 126:1.0 132:1.0 134:1.0 136:1.0 138:9.0 147:1.0 151:3.0 152:2.0 155:1.0 166:1.0 168:1.0 169:1.0 172:1.0 176:2.0 181:2.0 205:1.0 208:1.0 211:1.0 212:1.0 216:2.0 219:1.0 220:1.0 221:1.0 223:1.0 227:1.0 228:1.0 241:1.0 245:1.0 250:2.0 253:1.0 254:1.0 258:6.0 259:675.0 260:683.0 261:675.0 262:648.0 263:684.0 264:686.0 265:4.0 267:0.30439 273:0.04025 274:0.33068 275:0.14469 276:0.04278 281:0.84341 284:0.15797 285:0.02921 293:0.30439 344:0.04025 349:0.33068 351:7.8E-4 354:0.04441 358:0.14469 365:0.04278 392:0.84341 393:0.04843 395:0.0 396:0.53333 397:0.08287 401:0.4149 417:0.05761 419:0.15797 420:0.14199 421:0.09726 423:0.01132 424:0.21702 425:0.27164 426:0.72403 432:0.02921 483:156.0 484:156063.0 485:1.560630001E9 490:1.0 4 | 0.0 135:1.0 136:2.0 138:5.0 144:1.0 147:1.0 149:1.0 151:5.0 152:3.0 154:1.0 156:1.0 176:1.0 181:1.0 208:1.0 211:4.0 212:4.0 215:1.0 227:1.0 229:1.0 250:3.0 253:1.0 254:1.0 257:1.0 267:0.37017 268:0.04224 274:0.05251 275:0.21933 276:0.0 278:0.02015 281:1.0 293:0.37017 296:0.11551 298:0.04224 349:0.05251 358:0.15732 361:0.00786 364:0.0 366:0.08654 367:0.04391 376:0.02015 392:0.66854 394:1.0 397:1.0 486:1.0 5 | 0.0 0:7.0 1:2.0 6:5.0 7:5.0 8:18.0 9:2.0 10:2.000020024E9 11:8.0 12:8.0 13:4.0 14:156044.0 15:156.0 16:1.560440007E9 17:3.0 18:1190400.0 19:518400.0 20:1369.0 21:7063.0 22:599.0 23:5445.0 24:68.0 35:2.0 36:1.0 46:1.0 79:1.0 87:1.0 89:1.0 92:1.0 96:1.0 102:1.0 114:1.0 
120:1.0 125:2.0 126:2.0 136:3.0 138:4.0 149:1.0 176:1.0 201:1.0 205:1.0 220:1.0 221:1.0 223:1.0 227:1.0 228:1.0 233:1.0 237:1.0 250:1.0 253:1.0 258:6.0 259:687.0 260:678.0 261:687.0 262:672.0 263:691.0 264:707.0 265:5.0 267:0.49999 269:0.04716 280:0.13575 281:0.53648 284:0.10593 293:0.49999 309:0.04716 387:0.13575 392:0.53648 396:0.13134 419:0.10593 421:0.07479 482:5.8 483:156.0 484:156044.0 485:1.560440007E9 507:1.0 6 | 0.0 0:2.0 1:2.0 4:1381.57 6:8.0 7:5.0 8:22.0 9:2.0 10:2.000020024E9 11:8.0 12:8.0 13:4.0 14:156033.0 15:156.0 16:1.560330002E9 17:3.0 18:1190400.0 19:921600.0 20:1389.0 21:6881.0 22:500.0 23:5038.0 24:14.0 25:5.0 29:0.0 31:1381.57 136:1.0 138:2.0 221:1.0 223:1.0 250:2.0 251:1.0 253:1.0 258:5.0 259:629.0 260:617.0 261:629.0 262:599.0 263:640.0 264:664.0 265:1.0 267:0.65312 268:0.4772 275:0.0 281:0.59912 293:0.65312 298:0.4772 361:0.0 392:0.59912 482:5.0 489:1.0 7 | 0.0 0:4.0 1:1.0 6:8.0 7:5.0 8:18.0 9:2.0 10:2.000020024E9 11:8.0 12:8.0 13:4.0 14:156033.0 15:156.0 16:1.560330001E9 17:3.0 18:1190400.0 19:921600.0 20:1368.0 21:7064.0 22:699.0 23:5296.0 24:29.0 30:1.0 35:2.0 36:1.0 41:1.0 46:1.0 71:1.0 76:1.0 125:2.0 126:2.0 136:4.0 138:4.0 142:1.0 149:1.0 151:2.0 152:1.0 155:1.0 205:1.0 216:3.0 219:1.0 220:1.0 227:1.0 228:1.0 250:1.0 253:1.0 258:4.0 259:645.0 260:647.0 261:646.0 262:594.0 263:663.0 264:683.0 265:3.0 274:0.8074 281:1.0 282:0.09744 284:0.35719 349:0.53074 351:0.8074 391:5.4E-4 392:0.81208 393:0.54078 394:0.00257 396:0.51813 397:1.0 405:0.09744 415:0.33152 418:0.35719 482:5.25 483:156.0 484:156033.0 485:1.560330001E9 487:1.0 8 | 0.0 10:2.000020024E9 114:1.0 120:1.0 125:1.0 126:1.0 136:2.0 138:6.0 151:1.0 157:1.0 176:4.0 181:2.0 184:1.0 201:1.0 211:1.0 212:1.0 216:2.0 221:2.0 223:1.0 224:1.0 233:5.0 236:2.0 237:1.0 240:2.0 250:1.0 253:1.0 259:666.0 260:648.0 261:666.0 262:652.0 263:679.0 264:687.0 267:0.77939 268:0.22744 269:0.06879 271:0.06818 272:0.32865 273:0.12341 274:0.25096 276:0.05982 281:0.51142 282:0.0 284:1.0 285:0.89039 289:0.06818 293:0.77939 298:0.22744 308:0.2133 309:0.06879 324:0.06818 335:0.32865 344:0.12341 349:0.25096 351:0.03988 364:0.05982 392:0.51142 396:0.26317 400:0.10105 404:0.0 418:1.0 432:0.89039 450:0.06818 483:156.0 484:156037.0 485:1.560370002E9 487:1.0 9 | 0.0 0:4.0 1:2.0 4:6174.0 6:8.0 7:5.0 8:22.0 9:2.0 10:2.000020024E9 11:8.0 12:12.0 13:4.0 14:156013.0 15:156.0 16:1.56013001E9 17:3.0 18:1300000.0 19:921600.0 20:1399.0 21:6885.0 22:569.0 23:5155.0 24:27.0 25:7.0 29:0.0 30:1.0 31:6174.0 35:1.0 36:1.0 42:1.0 136:2.0 138:2.0 145:1.0 147:1.0 149:1.0 151:1.0 155:1.0 216:1.0 219:1.0 221:2.0 223:2.0 250:1.0 253:1.0 258:3.0 259:690.0 260:681.0 261:690.0 262:672.0 263:698.0 264:710.0 265:3.0 267:0.01275 268:1.0E-5 269:0.09674 273:0.10097 274:0.07664 276:0.00689 278:0.04202 281:6.3E-4 282:0.0 284:0.14291 293:0.01275 298:1.0E-5 308:0.09425 309:0.00198 344:0.10097 345:0.00794 349:0.05015 351:0.17359 364:0.00689 376:0.04202 392:0.54585 394:6.3E-4 396:0.1807 402:0.0 418:0.14291 482:5.0 483:156.0 484:156013.0 485:1.560130006E9 492:1.0 10 | 0.0 259:650.0 260:632.0 261:650.0 262:630.0 263:661.0 264:680.0 267:0.0 281:0.14551 293:0.0 392:0.0 394:0.0 397:0.16667 500:1.0 11 | -------------------------------------------------------------------------------- /推荐系统算法实践—补充部分/第6-11和13章节/data/00001: -------------------------------------------------------------------------------- 1 | 0.0 10:2.000020028E9 29:0.0 136:1.0 138:7.0 140:2.0 144:1.0 147:1.0 149:2.0 176:8.0 181:4.0 184:1.0 186:1.0 189:1.0 201:1.0 211:1.0 212:1.0 215:1.0 216:2.0 221:2.0 223:1.0 224:1.0 
233:1.0 237:1.0 241:1.0 243:1.0 250:3.0 254:1.0 255:1.0 256:1.0 259:692.0 260:675.0 261:692.0 262:682.0 263:700.0 264:711.0 267:0.52416 268:0.06194 269:0.35611 271:0.85714 273:0.45089 274:0.01163 275:0.85714 276:0.00545 279:0.0026 281:0.95455 282:0.27735 289:0.02589 293:0.52416 298:0.06194 309:0.35611 321:0.8 322:0.85714 324:0.02589 344:0.45089 349:0.01163 358:0.0 359:0.85714 361:0.03223 365:0.00545 383:0.0026 392:0.73692 394:0.07727 396:0.08167 398:0.95455 403:0.27735 450:0.02589 516:1.0 2 | 0.0 6:11.0 7:9.0 8:24.0 9:11.0 10:2.000020024E9 11:19.0 12:18.0 13:4.0 17:8.0 18:1593600.0 19:2073600.0 20:3416.0 21:59662.0 22:4850.0 23:55736.0 35:3.0 71:1.0 74:1.0 89:1.0 92:1.0 96:1.0 102:1.0 106:1.0 110:1.0 114:1.0 120:1.0 125:6.0 127:4.0 128:1.0 129:1.0 130:1.0 131:1.0 132:1.0 134:1.0 136:3.0 138:8.0 140:2.0 145:1.0 147:1.0 149:1.0 151:2.0 153:1.0 154:1.0 155:1.0 158:2.0 161:1.0 163:1.0 164:1.0 169:3.0 171:1.0 172:1.0 174:1.0 176:8.0 181:3.0 184:2.0 186:1.0 189:1.0 197:1.0 205:1.0 208:1.0 209:1.0 211:1.0 212:1.0 216:2.0 217:3.0 219:1.0 221:5.0 223:3.0 224:2.0 233:3.0 237:1.0 240:2.0 241:4.0 244:1.0 247:2.0 249:1.0 250:4.0 251:1.0 253:1.0 254:2.0 259:703.0 260:685.0 261:703.0 262:687.0 263:707.0 264:734.0 267:0.16675 268:0.0 269:0.10147 271:0.17491 272:0.10146 273:0.00631 274:0.22913 275:0.0 277:0.02697 280:0.04946 281:0.17956 282:0.0014 284:0.03471 289:0.17491 293:0.09711 296:0.04707 298:0.0 309:0.10147 324:0.17491 335:0.0 337:0.01117 344:0.00631 349:0.03624 351:0.2042 354:0.37031 358:0.0 369:0.02697 388:0.0308 389:0.04946 391:0.0 392:0.00499 394:0.06772 395:0.00109 396:0.0 398:0.0 403:0.0014 418:0.0 419:0.03471 420:0.0325 450:0.17491 483:156.0 484:156013.0 485:1.56013001E9 491:1.0 3 | 0.0 114:1.0 120:1.0 125:1.0 126:1.0 136:1.0 138:5.0 140:1.0 151:3.0 154:3.0 158:4.0 159:1.0 161:3.0 176:2.0 181:2.0 205:1.0 211:1.0 212:1.0 220:1.0 221:1.0 223:1.0 227:1.0 228:1.0 233:1.0 237:1.0 241:1.0 246:1.0 250:1.0 253:1.0 259:703.0 260:739.0 261:703.0 262:693.0 263:703.0 264:679.0 267:0.56983 268:0.0399 269:0.30573 273:0.01961 275:0.0 276:1.0 278:0.2631 280:1.0 281:0.96242 284:0.0 293:0.56983 298:0.0399 309:0.30573 344:0.01961 346:0.04637 347:0.28233 358:0.0 364:0.37661 367:1.0 376:0.28109 387:1.0 391:0.00943 392:0.96242 395:0.0 416:0.0 483:156.0 484:156037.0 485:1.560370002E9 487:1.0 4 | 0.0 0:3.0 1:1.0 6:11.0 7:8.0 8:21.0 9:8.0 10:2.000020024E9 12:22.0 13:4.0 14:156044.0 15:156.0 16:1.560440001E9 18:1900000.0 19:4096000.0 20:2774.0 22:3388.0 24:21.0 258:4.0 259:620.0 260:612.0 261:621.0 262:582.0 263:635.0 264:657.0 265:2.0 274:0.01803 275:0.05543 281:0.5 282:0.0 349:0.01803 358:0.05543 392:0.5 394:0.0 402:0.0 483:156.0 484:156044.0 485:1.560440001E9 490:1.0 5 | 0.0 35:1.0 71:1.0 74:1.0 114:2.0 120:2.0 125:4.0 127:3.0 128:1.0 132:1.0 134:1.0 136:2.0 138:9.0 140:1.0 145:1.0 151:2.0 154:1.0 155:1.0 166:3.0 168:3.0 169:1.0 174:1.0 176:6.0 181:4.0 186:1.0 196:1.0 208:1.0 211:1.0 212:1.0 215:1.0 216:2.0 219:1.0 221:1.0 223:1.0 233:3.0 237:1.0 240:2.0 250:3.0 253:1.0 254:1.0 256:1.0 259:624.0 260:612.0 261:624.0 262:593.0 263:635.0 264:660.0 486:1.0 6 | 0.0 6:11.0 7:9.0 8:26.0 9:11.0 10:2.000020024E9 11:19.0 12:22.0 13:8.0 17:8.0 18:1900800.0 19:2242080.0 20:3286.0 21:59401.0 22:4899.0 23:55243.0 259:691.0 260:684.0 261:690.0 262:679.0 263:690.0 264:710.0 482:5.8 488:1.0 7 | 0.0 29:1.0 273:0.14085 281:0.56153 282:1.0 285:0.9156 344:0.14085 391:0.06023 392:0.56153 393:0.08851 398:0.04462 403:1.0 432:0.9156 483:156.0 484:156022.0 485:1.560220002E9 496:1.0 8 | 0.0 121:1.0 122:1.0 125:1.0 129:1.0 138:3.0 166:1.0 
168:1.0 176:1.0 181:1.0 211:1.0 212:1.0 241:1.0 245:1.0 250:1.0 254:1.0 267:0.0 272:0.0 273:0.06312 274:0.12618 275:0.03488 276:0.51499 281:0.70897 284:0.24869 293:0.00134 295:0.0 335:0.0 344:0.06312 349:0.10628 351:0.12618 358:0.03488 367:0.51499 391:0.20173 392:0.70897 393:0.19528 395:0.0 396:0.00644 416:0.24869 417:0.17722 419:0.10241 421:0.01178 424:2.1E-4 427:0.0063 428:0.03733 430:0.02265 510:1.0 9 | 0.0 6:3.0 7:3.0 8:15.0 10:2.000020024E9 11:4.0 12:5.0 13:1.0 17:1.0 18:1008000.0 19:384000.0 20:645.0 21:2979.0 22:499.0 23:1727.0 29:1.0 125:1.0 129:1.0 136:1.0 138:2.0 166:1.0 168:1.0 176:2.0 181:1.0 201:1.0 211:1.0 214:1.0 216:2.0 241:1.0 247:1.0 250:1.0 253:1.0 259:632.0 260:620.0 261:633.0 262:595.0 263:640.0 264:678.0 275:0.02185 281:1.0 358:0.02185 392:1.0 396:0.01678 482:4.0 499:1.0 10 | 0.0 4:1024.0 6:8.0 7:5.0 8:22.0 9:2.0 10:2.000020024E9 11:8.0 12:8.0 13:4.0 17:3.0 18:1190400.0 19:921600.0 20:1389.0 21:6881.0 22:500.0 23:5038.0 25:4.0 29:0.0 31:1024.0 35:1.0 36:1.0 46:1.0 121:1.0 122:1.0 136:2.0 138:2.0 147:1.0 250:1.0 253:1.0 259:651.0 260:638.0 261:651.0 262:612.0 263:666.0 264:692.0 275:0.01446 276:0.0492 278:0.25841 281:0.61349 282:0.0 284:0.53165 358:0.01446 364:0.0492 375:0.25841 391:0.0 392:0.61349 396:0.23096 404:0.0 416:0.53165 421:0.08423 426:0.09111 427:0.0 482:5.0 483:156.0 484:156043.0 485:1.560430001E9 486:1.0 11 | -------------------------------------------------------------------------------- /推荐系统算法实践—补充部分/第6-11和13章节/sklearn_others.py: -------------------------------------------------------------------------------- 1 | from sklearn.linear_model import LogisticRegression 2 | from sklearn import metrics 3 | from os import path, listdir 4 | from sklearn.datasets import load_svmlight_files 5 | from sklearn.model_selection import train_test_split 6 | from sklearn.externals import joblib 7 | from sklearn import preprocessing 8 | import numpy as np 9 | import pandas as pd 10 | import random 11 | print("Python Version: %s"%(platform.python_version())) 12 | 13 | 14 | # ## 2)数据准备 15 | def process_data(data_path, feature_size, test_rat=0.3, random_seed=0, train_batch_size=5000, test_batch_size=5000): 16 | # 读取文件 17 | filenames = get_file_list(data_path) 18 | data = load_svmlight_files(filenames, n_features=feature_size, dtype=np.float32) 19 | # 合并所有文件 20 | merger_x = data[0].toarray() 21 | merger_y = data[1].reshape(-1, 1) 22 | for i in range(2,len(data)): 23 | if i % 2 == 0: 24 | x = data[i].toarray() 25 | merger_x=np.vstack((merger_x, x)) 26 | else: 27 | y = data[i].reshape(-1, 1) 28 | merger_y=np.vstack((merger_y, y)) 29 | 30 | # 生成x y datafarme 31 | feature_col = range(1,(feature_size + 1)) 32 | x_frame = pd.DataFrame(merger_x ,columns=feature_col) 33 | y_frame = pd.DataFrame(merger_y) 34 | # 数据归一化 35 | minmax_scala=preprocessing.MinMaxScaler(feature_range=(0,1)) 36 | scalafeature=minmax_scala.fit_transform(x_frame) 37 | scalafeature_frame = pd.DataFrame(scalafeature ,columns=x_frame.columns) 38 | # 训练样本,测试样本生成 39 | X_train, X_test, Y_train, Y_test = train_test_split(scalafeature_frame, y_frame, test_size=test_rat, random_state=random_seed) 40 | # batch生成 41 | all_train = pd.concat([Y_train, X_train], axis=1) 42 | all_test = pd.concat([Y_test, X_test], axis=1) 43 | xy_train = np.array(all_train).reshape(-1, feature_size + 1) 44 | xy_test = np.array(all_test).reshape(-1, feature_size + 1) 45 | train_batch = split_batch(xy_train, train_batch_size) 46 | test_batch = split_batch(xy_test, test_batch_size) 47 | return {"train_batch": train_batch, "test_batch": test_batch, 
"X_train": X_train, "X_test": X_test, "Y_train": Y_train, "Y_test": Y_test} 48 | 49 | # 按照batch_size大小将数据进行切分,返回Batch数据 50 | def split_batch(xy_data, batch_size=5000): 51 | # 计算batch数量 52 | all_len=xy_data.shape[0] 53 | n=int(round(float(all_len)/batch_size)) 54 | if n == 0: 55 | n = 1 56 | data_batch=[] 57 | # 生成每个batch 58 | for i in range(n): 59 | k1=i*batch_size 60 | if i < n-1: 61 | k2=(i+1)*batch_size 62 | elif i == (n-1) and (i+1)*batch_size <= all_len: 63 | k2=all_len 64 | else: 65 | k2=(i+1)*batch_size 66 | batch=xy_data[k1:k2,:] 67 | data_batch.append(batch) 68 | return data_batch 69 | 70 | # 根据文件目录,获取文件路径,返回文件路径列表 71 | def get_file_list(my_path): 72 | files = [] 73 | if path.isdir(my_path): 74 | [files.append(path.join(my_path, p)) for p in listdir(my_path) if path.isfile(path.join(my_path, p))] 75 | else: 76 | files.append(my_path) 77 | return files 78 | 79 | 80 | # 数据测试 81 | data_path = '/data' 82 | test_rat=0.4 83 | random_seed=0 84 | train_batch_size=2000 85 | test_batch_size=2000 86 | feature_size=530 87 | 88 | # 获取样本数据 89 | data = process_data(data_path, feature_size, test_rat, random_seed, train_batch_size, test_batch_size) -------------------------------------------------------------------------------- /推荐系统算法实践—补充部分/第6-11和13章节/spark_others.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * 读取libSVM格式的文件,生成训练样本和测试样本。 3 | * 1)读取文件 4 | * 2)生成标签索引 5 | * 3)样本处理 6 | * 4)样本划分 7 | */ 8 | def readLibSvmSampleData( 9 | @transient spark: org.apache.spark.sql.SparkSession, 10 | dataPath: String): (Dataset[LabeledPoint], Dataset[LabeledPoint]) = { 11 | import spark.implicits._ 12 | // 2.1 读取样本 13 | val dataRead = spark.read.options(Map(("delimiter", "|"), ("header", "false"))).csv(dataPath) 14 | // 2.2 获取样本中所有标签,并且建立索引关系 15 | val featureMap = dataRead.map { 16 | case Row(libSvmFeatrue: String) => 17 | val items = libSvmFeatrue.split(' ') 18 | val features = items.filter(_.nonEmpty). 19 | filter(f => f.split(':').size == 2). 20 | map { item => 21 | val indexAndValue = item.split(':') 22 | indexAndValue(0) 23 | } 24 | features 25 | }.flatMap(x => x).distinct().collect().sorted.zipWithIndex.toMap 26 | val numFeatures = featureMap.size 27 | // 2.3 样本校准化处理 28 | val readSampleData = dataRead.map { 29 | case Row(libSvmFeatrue: String) => 30 | val items = libSvmFeatrue.split(' ') 31 | val click = items(0).toString().toDouble 32 | val features = items.filter(_.nonEmpty). 33 | filter(f => f.split(':').size == 2). 34 | map { item => 35 | val indexAndValue = item.split(':') 36 | val id = featureMap.getOrElse(indexAndValue(0), -1) 37 | val value = indexAndValue(1).toDouble 38 | (id, value) 39 | }.filter(f => f._1 > 0).sortBy(f => f._1) 40 | val label = if (click > 0) 1.0 else 0.0 41 | LabeledPoint(label, Vectors.sparse(numFeatures, features.map(_._1), features.map(_._2))) 42 | } 43 | // 2.3 划分样本 44 | val splits = readSampleData.randomSplit(Array(0.6, 0.4)) 45 | val training = splits(0) 46 | val test = splits(1) 47 | (training, test) 48 | } 49 | 50 | /** 51 | * 根据gbdt模型对样本进行转换生成新样本 52 | * 每个样本通过每一棵树,可以找到对应的叶节点,该叶节点就是转换后的新特征。 53 | * @param sampleLablePoint 训练样本,格式为:RDD[LabeledPoint]. 54 | * @param treeLeafMap gbdt模型的叶子节点. 
55 | * @param gbdtModel gbdt模型 56 | * @return RDD[LabeledPoint] 57 | */ 58 | def lrSample( 59 | sampleLablePoint: RDD[LabeledPoint], 60 | lrFeatureMap: Map[String, Int], 61 | gbdtModel: GradientBoostedTreesModel): RDD[LabeledPoint] = { 62 | val treeNumber = gbdtModel.trees.length 63 | val lrFeatureNum = lrFeatureMap.size 64 | val lrSampleParsed = sampleLablePoint.map { point => 65 | val label = point.label 66 | val features = point.features 67 | val lrFeatures = ArrayBuffer[Int]() 68 | val lrValues = ArrayBuffer[Double]() 69 | val treeNumber = gbdtModel.trees.size 70 | for (treeIndex <- 0 to (treeNumber - 1)) { 71 | var node = gbdtModel.trees(treeIndex).topNode 72 | while (!node.isLeaf) { 73 | if (node.split.get.featureType == Continuous) { 74 | if (features(node.split.get.feature) <= node.split.get.threshold) 75 | node = node.leftNode.get 76 | else 77 | node = node.rightNode.get 78 | } else { 79 | if (node.split.get.categories.contains(features(node.split.get.feature))) 80 | node = node.leftNode.get 81 | else 82 | node = node.rightNode.get 83 | } 84 | } 85 | val key = treeIndex.toString + '_' + node.id 86 | 87 | lrFeatures += lrFeatureMap(key) 88 | lrValues += 1 89 | } 90 | (label, lrFeatures.sorted.toArray, lrValues.toArray) 91 | } 92 | val lrSamplLablePoint = lrSampleParsed.map { 93 | case (label, lrFeatures, lrValues) => 94 | LabeledPoint(label, Vectors.sparse(lrFeatureNum, lrFeatures, lrValues)) 95 | } 96 | (lrSamplLablePoint) 97 | } 98 | 99 | /** 100 | * gbdt模型解析叶子节点 101 | * @param gbdtModel gbdt模型. 102 | * @return 返回Map[String, Int],得到所有决策树的叶子节点,以及编号,数据格式为:(树id_叶子节点id, 编号) 103 | */ 104 | def getTreeLeafMap(gbdtModel: GradientBoostedTreesModel): Map[String, Int] = { 105 | val lrFeatureMap = scala.collection.mutable.Map[String, Int]() 106 | var featureId = 0 107 | val treeNumber = gbdtModel.trees.size 108 | for (treeIndex <- 0 to (treeNumber - 1)) { 109 | val treeNodeQueue = collection.mutable.Queue[Node]() 110 | val rootNode = gbdtModel.trees(treeIndex).topNode 111 | treeNodeQueue.enqueue(rootNode) 112 | while (!treeNodeQueue.isEmpty) { 113 | val resNode = treeNodeQueue.dequeue() 114 | if (resNode.isLeaf) { 115 | val key = treeIndex.toString + '_' + resNode.id.toString() 116 | lrFeatureMap(key) = featureId 117 | featureId = featureId + 1 118 | } 119 | if (resNode.leftNode.isDefined) 120 | treeNodeQueue.enqueue(resNode.leftNode.get) 121 | if (resNode.rightNode.isDefined) 122 | treeNodeQueue.enqueue(resNode.rightNode.get) 123 | } 124 | } 125 | (lrFeatureMap.toMap) 126 | } 127 | 128 | -------------------------------------------------------------------------------- /推荐系统算法实践—补充部分/第6-11和13章节/tf_others.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import pandas as pd 4 | import random 5 | import math 6 | import re 7 | 8 | from os import path, listdir 9 | from sklearn import metrics 10 | from tensorflow.contrib import layers 11 | 12 | import time 13 | import datetime 14 | 15 | # ## 2)数据准备Dataset格式 16 | 17 | # In[6]: 18 | 19 | """ 20 | 解析CSV格式,对输入的每一行样本,进行格式解析,返回labels和dense_vector格式数据 21 | 例如输入csv格式字符串: 0.0,0.6666666666666666,0.5,0.0,0.0,0.0,0.0,0.7272727272727273,0.42857142857142855 22 | 函数参数: 23 | line:需要解析的字符串; 24 | feature_size:特征长度; 25 | 函数返回: 26 | 返回字典,格式:{'labels': labels, 'dense_vector': dense_vector} 27 | labels:样本的labels; 28 | dense_vector:样本的特征向量; 29 | """ 30 | def decode_csv(line, feature_size): 31 | # 按照,分割,取label和feature 32 | columns = tf.string_split([line], ',') 33 | labels = 
tf.reshape(tf.string_to_number(columns.values[0], out_type=tf.float32),[-1]) 34 | dense_vector = tf.reshape(tf.string_to_number(columns.values[1:feature_size + 1], out_type=tf.float32),[feature_size]) 35 | return {'labels': labels, 'dense_vector': dense_vector} 36 | 37 | """ 38 | 采用DataSet格式读取文件 39 | 函数参数: 40 | data_type:文件格式; 41 | filenames:文件路径; 42 | batch_size:Batch大小; 43 | feature_size:特征长度; 44 | num_epochs:样本复制多少次; 45 | 函数返回: 46 | 返回DataSet 47 | """ 48 | def read_my_file_format(data_type, filenames, feature_size, batch_size, num_epochs=1): 49 | # 读取文件 50 | print filenames 51 | dataset = tf.data.TextLineDataset(filenames).map(lambda x: decode_csv(x, feature_size)).prefetch(batch_size).cache() 52 | dataset = dataset.repeat(num_epochs) 53 | dataset = dataset.batch(batch_size) # Batch size to use 54 | iterator = dataset.make_one_shot_iterator() 55 | next_element = iterator.get_next() 56 | return next_element 57 | 58 | # 文件列表 59 | def get_file_list(my_path): 60 | files = [] 61 | if path.isdir(my_path): 62 | [files.append(path.join(my_path, p)) for p in listdir(my_path) if path.isfile(path.join(my_path, p))] 63 | else: 64 | files.append(my_path) 65 | return files 66 | 67 | # 数据处理 68 | def process_data(data_type, my_path, feature_size, batch_size=32, num_epochs=1): 69 | filenames = get_file_list(my_path) 70 | next_element = read_my_file_format(data_type, filenames, feature_size, batch_size, num_epochs) 71 | return next_element 72 | 73 | # 测试数据 74 | filenames = '/data/csv-00000' 75 | feature_size = 530 76 | batch_size = 3 77 | num_epochs = 1 78 | data_type = 'csv' 79 | next_element = process_data(data_type, filenames, feature_size, batch_size, num_epochs) 80 | print next_element['dense_vector'] 81 | print next_element['labels'] 82 | --------------------------------------------------------------------------------
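
The final script above (tf_others.py) builds the TF 1.x `Dataset` pipeline and prints the `next_element` tensors, but never actually pulls a batch from the iterator. The sketch below is not part of the repository; it is a minimal, hypothetical driver assuming TensorFlow 1.x (the `tf.contrib`-era API used throughout this repo), that `process_data()` and its helpers from tf_others.py are already in scope (e.g. run in the same script or notebook), and that the placeholder path `/data/csv-00000` is replaced with a real CSV file whose rows contain a label followed by `feature_size` feature columns.

```python
# Hypothetical driver for tf_others.py (not in the original repo):
# pull batches from the one-shot iterator built by process_data().
import tensorflow as tf

feature_size = 530      # must match the number of feature columns in the CSV
batch_size = 3
num_epochs = 1

# Assumes process_data() from tf_others.py is in scope; the path is a placeholder.
next_element = process_data('csv', '/data/csv-00000', feature_size,
                            batch_size, num_epochs)

with tf.Session() as sess:
    sess.run(tf.group(tf.global_variables_initializer(),
                      tf.local_variables_initializer()))
    try:
        while True:
            # Each sess.run() fetches one batch:
            # dense_vector: (batch_size, feature_size), labels: (batch_size, 1).
            dense_vector, labels = sess.run([next_element['dense_vector'],
                                             next_element['labels']])
            print(dense_vector.shape)
            print(labels.shape)
    except tf.errors.OutOfRangeError:
        # Normal end-of-input signal once num_epochs passes over the files are exhausted.
        print('all batches consumed')
```

The `OutOfRangeError` branch is the expected termination path for a one-shot iterator; the same consumption loop applies to the other Dataset-based scripts in this repo (DNN.py, others.py, other.py), which expose their batches through the same `next_element` dictionary pattern.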