├── README.md
├── code_structure.jpg
├── construct_tree.py
├── dataset.py
├── din_model.py
├── prediction.py
├── relative_album_caculate.py
├── sample_init.py
├── tdm.py
├── test.py
└── treeAndDict2File.py

/README.md:
--------------------------------------------------------------------------------
1 | # Tree-based Deep Model (TDM) experiments
2 | This project refines and extends a lab-style TDM implementation by another GitHub author, optimizing it so that it runs successfully in production on large data volumes.
3 | ---
4 | 
5 | ### Code structure
6 | File overview
7 | tdm.py: entry point; drives training and evaluation of the full tree-based deep model
8 | sample_init.py: data preprocessing and sample generation, including the tree samples
9 | construct_tree.py: builds the binary trees (random and clustered)
10 | din_model.py: DIN network definition
11 | prediction.py: layer-wise tree-traversal prediction
12 | dataset.py: data iterator for the training/test files
13 | relative_album_caculate.py: related-album computation for each album
14 | 
15 | 
16 | ### Algorithm
17 | TDM training flow (see [1]):
18 | 1. Build a random binary tree
19 | 2. Generate samples from the tree
20 | 3. Train the DNN model until it converges
21 | 4. Take the item embeddings from the DNN and rebuild the tree by clustering
22 | 5. Repeat steps 2-4
23 | The whole loop is implemented in tdm.py
24 | 
25 | Run sample_init.py first,
26 | then run tdm.py
27 | ### Progress
28 | Runs end to end with 1M users and 5 play-history items per user
29 | 
30 | ### References
31 | [1] Learning Tree-based Deep Model for Recommender Systems, Han Zhu, Xiang Li, Pengye Zhang, etc.
32 | [2] Deep Interest Network for Click-Through Rate Prediction, Guorui Zhou, Chengru Song, Xiaoqiang Zhu, etc.
33 | [3] Empirical Evaluation of Rectified Activations in Convolution Network, Bing Xu, Naiyan Wang, Tianqi Chen, etc.
34 | [4] Delving Deep into Rectifiers: Surpassing Human-Level Performance on ImageNet Classification, Kaiming He, Xiangyu Zhang, Shaoqing Ren, etc.
35 | [5] Distributed Representations of Words and Phrases and their Compositionality, Tomas Mikolov, Ilya Sutskever, Kai Chen, etc.
36 | 
--------------------------------------------------------------------------------
/code_structure.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrew-zzz/tree-based-deep-model/15e2530211f965b5278bf43d354263f8a2e8c0ae/code_structure.jpg
--------------------------------------------------------------------------------
/construct_tree.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import time
3 | 
4 | 
5 | class TreeNode(object):
6 |     """define the tree node structure."""
7 |     def __init__(self, x, item_id=None):
8 |         self.val = x
9 |         self.item_id = item_id
10 |         self.parent = None
11 |         self.left = None
12 |         self.right = None
13 | 
14 | 
15 | class TreeInitialize(object):
16 |     """Build the random binary tree."""
17 |     def __init__(self, data):
18 |         self.data = data[['item_ID', 'category_ID']]
19 |         # unique item list, ordered by category
20 |         self.items = None
21 |         # root node
22 |         self.root = None
23 |         # leaf nodes: item_id -> TreeNode
24 |         self.leaf_dict = {}
25 |         # number of non-leaf nodes
26 |         self.node_size = 0
27 | 
28 |     def __random_sort(self):
29 |         self.data = self.data.drop_duplicates(['item_ID'])
30 |         items_total = self.data.groupby(by=['category_ID'])['item_ID'].apply(lambda x: x)
31 |         self.items = items_total.tolist()
32 |         return self.items
33 | 
34 |     def _build_binary_tree(self, root, items):
35 |         if len(items) == 1:
36 |             leaf_node = TreeNode(0, item_id=items[0])
37 |             leaf_node.parent = root.parent
38 |             return leaf_node
39 |         left_child, right_child = TreeNode(0), TreeNode(0)
40 |         left_child.parent, right_child.parent = root, root
41 |         mid = int(len(items) / 2)
42 |         left = self._build_binary_tree(left_child, items[:mid])
43 |         right = self._build_binary_tree(right_child, items[mid:])
44 |         root.left = left
45 |         root.right = right
46 |         return root
47 | 
48 |     def _define_node_index(self, root):
49 |         node_queue = [root]
50 |         i = 0
51 |         try:
52 |             while node_queue:
53 |                 current_node = node_queue.pop(0)
54
| if current_node.left: 55 | node_queue.append(current_node.left) 56 | if current_node.right: 57 | node_queue.append(current_node.right) 58 | if current_node.item_id is not None: 59 | self.leaf_dict[current_node.item_id] = current_node 60 | else: 61 | current_node.val = i 62 | i += 1 63 | self.node_size = i 64 | return 0 65 | except RuntimeError as err: 66 | print("Runtime Error Info: {0}".format(err)) 67 | return -1 68 | 69 | def random_binary_tree(self): 70 | root = TreeNode(0) 71 | items = self.__random_sort() 72 | self.root = self._build_binary_tree(root, items) 73 | _ = self._define_node_index(self.root) 74 | return self.root 75 | 76 | def _node_list(self, root): 77 | #将二叉树数据提出放入list 78 | def node_val(node): 79 | if(node.left or node.right): 80 | return (node.val,0) 81 | else: 82 | return (node.item_id,1) 83 | node_queue = [root] 84 | arr_arr_node = [] 85 | arr_arr_node.append([node_val(node_queue[0])]) 86 | while node_queue: 87 | tmp = [] 88 | tmp_val = [] 89 | for i in node_queue: 90 | if i.left: 91 | tmp.append(i.left) 92 | tmp_val.append(node_val(i.left)) 93 | if i.right: 94 | tmp.append(i.right) 95 | tmp_val.append(node_val(i.right)) 96 | if len(tmp_val) > 0: 97 | arr_arr_node.append(tmp_val) 98 | node_queue = tmp 99 | return arr_arr_node 100 | 101 | class TreeLearning(TreeInitialize): 102 | """Build the k-means clustering binary tree""" 103 | def __init__(self, items, index_dict): 104 | self.items = items #embedding according to item_list 105 | self.mapper = index_dict #item_list 106 | self.root = None 107 | self.leaf_dict = {} 108 | self.node_size = 0 109 | self.new_items_list = None 110 | 111 | def _balance_clutering(self, c1, c2, item1, item2): 112 | amount = item1.shape[0] - item2.shape[0] 113 | if amount > 1: 114 | num = int(amount / 2) 115 | distance = np.sum(np.square(item1 - c1), axis=1) 116 | item_move = item1[distance.argsort()[-num:]] 117 | item2_adjust = np.concatenate((item2, item_move), axis=0) 118 | item1_adjust = np.delete(item1, distance.argsort()[-num:], axis=0) 119 | elif amount < -1: 120 | num = int(abs(amount) / 2) 121 | distance = np.sum(np.square(item2 - c2), axis=1) 122 | item_move = item2[distance.argsort()[-num:]] 123 | item1_adjust = np.concatenate((item1, item_move), axis=0) 124 | item2_adjust = np.delete(item2, distance.argsort()[-num:], axis=0) 125 | else: 126 | item1_adjust, item2_adjust = item1, item2 127 | return item1_adjust, item2_adjust 128 | 129 | def _k_means_clustering(self, items): 130 | m1, m2 = items[0], items[1] 131 | while True: 132 | indicate = np.sum(np.square(items - m1), axis=1) - np.sum(np.square(items - m2), axis=1) 133 | items_m1, items_m2 = items[indicate < 0], items[indicate >= 0] 134 | m1_new = np.sum(items_m1, axis=0) / items_m1.shape[0] 135 | m2_new = np.sum(items_m2, axis=0) / items_m2.shape[0] 136 | if np.sum(np.absolute(m1_new - m1)) < 1e-3 and np.sum(np.absolute(m2_new - m2)) < 1e-3: 137 | break 138 | m1, m2 = m1_new, m2_new 139 | items_m1, items_m2 = self._balance_clutering(m1, m2, items_m1, items_m2) 140 | return items_m1, items_m2 141 | 142 | def _build_binary_tree(self, root, items): 143 | # root 144 | # self.items = items #embedding 145 | # self.mapper = index_dict #item_list 146 | if items.shape[0] == 1: 147 | leaf_node = TreeNode(0, item_id=self.mapper[self.items.index(items[0].tolist())]) 148 | leaf_node.parent = root.parent 149 | return leaf_node 150 | left_items, right_items = self._k_means_clustering(items) 151 | left_child, right_child = TreeNode(0), TreeNode(0) 152 | left_child.parent, right_child.parent = 
root, root
153 |         left = self._build_binary_tree(left_child, left_items)
154 |         right = self._build_binary_tree(right_child, right_items)
155 |         root.left, root.right = left, right
156 |         return root
157 | 
158 |     def clustering_binary_tree(self):
159 |         root = TreeNode(0)
160 |         items = np.array(self.items)
161 |         self.root = self._build_binary_tree(root, items)
162 |         _ = self._define_node_index(self.root)
163 |         return self.root
164 | 
165 |     def leaf(self,root,list):
166 |         if root == None:
167 |             return 0
168 |         elif root.left == None and root.right == None:
169 |             list.append(root.item_id)
170 |             return list
171 |         else:
172 |             self.leaf(root.left, list)
173 |             self.leaf(root.right, list)
174 |             return list
175 | 
176 |     def _rebuild_item_list(self):
177 |         a = []
178 |         self.leaf(self.root,a)
179 |         self.items = a
--------------------------------------------------------------------------------
/dataset.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import os
3 | import numpy as np
4 | 
5 | class DataGenerator(object):
6 |     def __init__(self, train_dir, test_dir, feature_dim, batch_size=20000):
7 |         """
8 |         Read csv files in the expected format and build the training and test datasets.
9 |         :param train_dir: training data directory
10 |         :param test_dir: test data directory
11 |         :param feature_dim: total number of raw features
12 |         :param batch_size: batch size
13 |         """
14 |         self.train_dir = train_dir
15 |         self.test_dir = test_dir
16 |         self.feature_dim = feature_dim
17 |         self.batch_size = batch_size
18 | 
19 |     def parse_csv(self, value):
20 |         """
21 |         Parse one csv line into tensors.
22 |         :param value:
23 |         :return:
24 |         """
25 |         # uid 0
26 |         # play_list 1-5
27 |         # list_len 6
28 |         # node_id 7
29 |         # is_leaf 8
30 |         # label 9
31 |         columns = tf.decode_csv(value, record_defaults=[
32 |             [0.0], [0.0], [0.0], [0.0], [0.0],
33 |             [0.0], [0.0], [0.0], [0.0], [0.0]
34 |         ])
35 |         features = columns[0:9]
36 |         label = columns[9]
37 |         return features,label
38 | 
39 |     def datasetCreate(self):
40 |         """
41 |         Create the train and test datasets.
42 |         :return:
43 |         """
44 |         train_filenames = [self.train_dir +'/'+ filename for filename in os.listdir(self.train_dir)]
45 |         train_dataset = tf.data.Dataset.from_tensor_slices(train_filenames)
46 |         train_dataset = train_dataset.flat_map(
47 |             lambda filename:
48 |             tf.data.TextLineDataset(filename).map(self.parse_csv)
49 |             # tf.data.TextLineDataset(filename).skip(1).apply(self.parse_csv)
50 |         ).batch(self.batch_size).repeat().prefetch(100)
51 |         #
52 |         test_filenames = [self.test_dir +'/'+ filename for filename in os.listdir(self.test_dir)]
53 |         test_dataset = tf.data.Dataset.from_tensor_slices(test_filenames)
54 |         test_dataset = test_dataset.flat_map(
55 |             lambda filename:
56 |             tf.data.TextLineDataset(filename).map(self.parse_csv)
57 |         ).batch(self.batch_size).repeat().prefetch(100)
58 |         return train_dataset,test_dataset
59 | 
60 | 
--------------------------------------------------------------------------------
/din_model.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import os
3 | from .sample_init import *
4 | # from Dice import dice
5 | import sklearn.metrics as metrics
6 | 
7 | class Model(object):
8 | 
9 |     def __init__(self, album_count, node_count,feature_size):
10 |         self.input = tf.cast(tf.placeholder(tf.double, [None, feature_size], name='input'), tf.int32)  # [B, feature_size]
11 |         # self.y = tf.cast(tf.placeholder(tf.int32, [None, 1], name='label'), tf.float32)
12 |         self.y = tf.placeholder(tf.float32, [None, ])
13 |         self.i = self.input[:,7]  # [B] node_id
14 |         self.hist_i = self.input[:,1:6]  # [B, T]
15 |
self.sl = self.input[:,6] # [B] 播放历史个数 16 | self.lr = tf.placeholder(tf.float64, []) #decay 17 | self.is_leaf = self.input[:,8] #node节点0 叶子节点1 18 | hidden_units = 32 19 | self.saver = None 20 | 21 | self.item_emb_w = tf.get_variable("item_emb_w", [album_count, hidden_units]) 22 | item_b = tf.get_variable("item_b", [album_count], 23 | initializer=tf.constant_initializer(0.0)) 24 | 25 | node_emb_w = tf.get_variable("node_emb_w", [node_count, hidden_units]) 26 | node_b = tf.get_variable("node_b", [node_count], 27 | initializer=tf.constant_initializer(0.0)) 28 | 29 | #处理item_embedding 30 | i_emb = tf.nn.embedding_lookup(self.item_emb_w,self.i) #[B,H] 31 | n_emb = tf.nn.embedding_lookup(node_emb_w,self.i) #[B,H] 32 | i_b = tf.gather(item_b, self.i) 33 | n_b = tf.gather(node_b, self.i) 34 | 35 | # Mask 根据leaf的值取出album还是node的 embdding,bias组装 36 | key_masks = tf.expand_dims(self.is_leaf,-1) # [B, 1] 37 | key_masks = tf.tile(key_masks, [1,hidden_units]) #[B,H] 38 | key_masks = key_masks > 0 39 | i_emb = tf.where(key_masks, i_emb, n_emb) # [B, H] 40 | 41 | key_masks_1 = self.is_leaf > 0 42 | i_b = tf.where(key_masks_1, i_b, n_b) #[B] 43 | 44 | #历史embedding and attention 45 | h_emb = tf.nn.embedding_lookup(self.item_emb_w, self.hist_i) 46 | hist_i = attention(i_emb, h_emb, self.sl) 47 | 48 | # -- attention end --- 49 | # hist_i = tf.layers.batch_normalization(inputs=hist_i) 50 | hist_i = tf.reshape(hist_i, [-1, hidden_units], name='hist_bn') 51 | hist_i = tf.layers.dense(hist_i, hidden_units, name='hist_fcn') 52 | 53 | u_emb_i = hist_i 54 | 55 | # -- fcn begin ------- 56 | din_i = tf.concat([u_emb_i, i_emb], axis=-1) 57 | # din_i = tf.layers.batch_normalization(inputs=din_i, name='b1') 58 | d_layer_1_i = tf.layers.dense(din_i, 64, activation=tf.nn.sigmoid, name='f1') 59 | d_layer_2_i = tf.layers.dense(d_layer_1_i, 32, activation=tf.nn.sigmoid, name='f2') 60 | d_layer_3_i = tf.layers.dense(d_layer_2_i, 1, activation=None, name='f3') 61 | d_layer_3_i = tf.reshape(d_layer_3_i, [-1]) 62 | self.logits = i_b + d_layer_3_i 63 | self.prediction = tf.cast(tf.reshape(tf.sigmoid(self.logits), [-1, 1]), tf.double, name='prediction') 64 | 65 | # Step variable 66 | self.global_step = tf.Variable(0, trainable=False, name='global_step') 67 | self.global_epoch_step = \ 68 | tf.Variable(0, trainable=False, name='global_epoch_step') 69 | self.global_epoch_step_op = \ 70 | tf.assign(self.global_epoch_step, self.global_epoch_step + 1) 71 | 72 | self.loss = tf.reduce_mean( 73 | tf.nn.sigmoid_cross_entropy_with_logits( 74 | logits=self.logits, 75 | labels=self.y) 76 | ) 77 | trainable_params = tf.trainable_variables() 78 | self.opt = tf.train.GradientDescentOptimizer(learning_rate=self.lr) 79 | gradients = tf.gradients(self.loss, trainable_params) 80 | clip_gradients, _ = tf.clip_by_global_norm(gradients, 5) 81 | self.train_op = self.opt.apply_gradients( 82 | list(zip(clip_gradients, trainable_params)), global_step=self.global_step) 83 | 84 | def train(self, sess, train_set, l): 85 | features,label = sess.run(train_set) 86 | features = np.array(features) 87 | loss, _ = sess.run([self.loss, self.train_op], feed_dict={ 88 | self.input:features, 89 | self.y:label, 90 | self.lr:l 91 | }) 92 | return loss 93 | 94 | def predict(self,data,sess): 95 | val = sess.run(self.prediction, feed_dict={ 96 | self.input: data, 97 | }) 98 | return val 99 | 100 | def _eval(self,sess,model,test_set,validation_step): 101 | score_arr = [] 102 | p_arr = [] 103 | for i in range(validation_step): 104 | # for _, uij in DataInput(test_set, 
test_batch_size): 105 | features, label = sess.run(test_set) 106 | features = np.array(features) 107 | score = sess.run(model.prediction, feed_dict={ 108 | self.input: features 109 | }) 110 | score_arr.extend(score) 111 | p_arr.extend(label) 112 | test_auc = metrics.roc_auc_score(p_arr, score_arr) 113 | # 保存pb 114 | # from tensorflow.python.framework import graph_util 115 | # constant_graph = graph_util.convert_variables_to_constants(sess, sess.graph_def, 116 | # ['in_i', 'in_hist', 'in_sl', 'prediction']) 117 | # with tf.gfile.FastGFile('/home/dev/data/andrew.zhu/vip/din/model/pb/buy_model.pb', 118 | # mode='wb') as f: # 模型的名字是model.pb 119 | # f.write(constant_graph.SerializeToString()) 120 | return test_auc 121 | 122 | def get_embeddings(self,item_list,save_path): 123 | with tf.Session() as sess: 124 | saver = tf.train.Saver() 125 | saver.restore(sess, save_path) 126 | item_embeddings = sess.run(tf.nn.embedding_lookup(self.item_emb_w, np.array(item_list))) 127 | # print(item_embeddings.tolist()) 128 | return item_embeddings.tolist() 129 | 130 | def save(self, sess, path): 131 | saver = tf.train.Saver() 132 | saver.save(sess, save_path=path) 133 | 134 | def restore(self, sess, path): 135 | saver = tf.train.Saver() 136 | saver.restore(sess, save_path=path) 137 | 138 | def extract_axis_1(data, ind): 139 | batch_range = tf.range(tf.shape(data)[0]) 140 | indices = tf.stack([batch_range, ind], axis=1) 141 | res = tf.gather_nd(data, indices) 142 | return res 143 | 144 | 145 | def attention(queries, keys, keys_length): 146 | ''' 147 | queries: [B, H] 148 | keys: [B, T, H] 149 | keys_length: [B] 150 | ''' 151 | queries_hidden_units = queries.get_shape().as_list()[-1] 152 | queries = tf.tile(queries, [1, tf.shape(keys)[1]]) 153 | queries = tf.reshape(queries, [-1, tf.shape(keys)[1], queries_hidden_units]) 154 | din_all = tf.concat([queries, keys, queries - keys, queries * keys], axis=-1) 155 | d_layer_1_all = tf.layers.dense(din_all, 80, activation=tf.nn.sigmoid, name='f1_att', reuse=tf.AUTO_REUSE) 156 | d_layer_2_all = tf.layers.dense(d_layer_1_all, 40, activation=tf.nn.sigmoid, name='f2_att', reuse=tf.AUTO_REUSE) 157 | d_layer_3_all = tf.layers.dense(d_layer_2_all, 1, activation=None, name='f3_att', reuse=tf.AUTO_REUSE) 158 | d_layer_3_all = tf.reshape(d_layer_3_all, [-1, 1, tf.shape(keys)[1]]) 159 | outputs = d_layer_3_all 160 | # Mask 161 | key_masks = tf.sequence_mask(keys_length, tf.shape(keys)[1]) # [B, T] 162 | key_masks = tf.expand_dims(key_masks, 1) # [B, 1, T] 163 | paddings = tf.ones_like(outputs) * (-2 ** 32 + 1) 164 | outputs = tf.where(key_masks, outputs, paddings) # [B, 1, T] 165 | 166 | # Scale 167 | outputs = outputs / (keys.get_shape().as_list()[-1] ** 0.5) 168 | 169 | # Activation 170 | outputs = tf.nn.softmax(outputs) # [B, 1, T] 171 | 172 | # Weighted sum 173 | outputs = tf.matmul(outputs, keys) # [B, 1, H] 174 | 175 | return outputs -------------------------------------------------------------------------------- /prediction.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from ximalaya_brain_jobs.train.vip.tdm.tdm import get_data 3 | from ximalaya_brain_jobs.model.util import upload_result_to_hdfs 4 | from ximalaya_brain_jobs.train.vip.tdm.din_model import Model 5 | import tensorflow as tf 6 | import pickle 7 | import sklearn.metrics as metrics 8 | 9 | def make_data(state,node): 10 | length = len(state) 11 | r_val = [0] 12 | r_val.extend(state) 13 | r_val.append(length) 14 | if node.item_id is not None: 15 | 
r_val.append(node.item_id)
16 |         r_val.append(1)
17 |     else:
18 |         r_val.append(node.val)
19 |         r_val.append(0)
20 |     return np.array([r_val])
21 | 
22 | 
23 | def candidates_generator(state, root, k, model,sess):
24 |     """layer-wise retrieval algorithm in prediction."""
25 |     Q, A = [root], []
26 |     time = 0
27 |     while Q:
28 |         Q_tmp = []
29 |         for node in Q:
30 |             if node.item_id is not None:
31 |                 A.append(node)
32 |             else:
33 |                 Q_tmp.append(node)
34 |         Q = Q_tmp
35 |         probs = []
36 |         for node in Q:
37 |             data = make_data(state,node)
38 |             prob = model.predict(data,sess)
39 |             probs.append(prob[0][0])
40 |         prob_list = list(zip(Q, probs))
41 |         prob_list = sorted(prob_list, key=lambda x: x[1], reverse=True)
42 |         print('prob_list %d' % time)
43 |         for i in prob_list:
44 |             print('time:%d,id %d:score %f' %(time,i[0].val,i[1]))
45 |         time = time + 1
46 |         I = []
47 |         for i in prob_list[0:k]:
48 |             I.append(i[0])
49 |         # print(I)
50 |         Q = []
51 |         for j in I:
52 |             if j.left:
53 |                 Q.append(j.left)
54 |             if j.right:
55 |                 Q.append(j.right)
56 |         t = []
57 |         for i in range(len(Q)):
58 |             if Q[i].item_id == None:
59 |                 t.append(Q[i].val)
60 |             else:
61 |                 t.append(Q[i].item_id)
62 |     probs = []
63 |     for leaf in A:
64 |         data = make_data(state,leaf)
65 |         prob = model.predict(data,sess)
66 |         probs.append(prob[0])
67 |     prob_list = list(zip(A, probs))
68 |     prob_list = sorted(prob_list, key=lambda x: x[1], reverse=True)
69 |     A = []
70 |     for i in range(k):
71 |         A.append(prob_list[i][0].item_id)
72 |     return A
73 | 
74 | def metrics_count(data, root, k, model):
75 |     """Recall/Precision/F-measure statistic."""
76 |     precision_rate, recall_rate, fm_rate, novelty_rate, num = 0, 0, 0, 0, 0
77 |     for items in data:
78 |         size = items.shape[0]
79 |         for i in range(size):
80 |             cands = candidates_generator((items[i][None, :],), root, k, model)
81 |             item_clip = list(set(items[i][items[i] != -2].tolist()))
82 |             m, g = len(cands), len(item_clip)
83 |             for item in item_clip:
84 |                 if item in cands:
85 |                     cands.remove(item)
86 |             n = len(cands)
87 |             p_rate, r_rate, n_rate = float(m - n) / m, float(m - n) / g, float(n) / k
88 |             f_rate = (2 * p_rate * r_rate) / (p_rate + r_rate)
89 |             precision_rate += p_rate
90 |             recall_rate += r_rate
91 |             fm_rate += f_rate
92 |             novelty_rate += n_rate
93 |             num += 1
94 |     precision_rate = float(precision_rate * 100) / num
95 |     recall_rate = float(recall_rate * 100) / num
96 |     fm_rate = float(fm_rate * 100) / num
97 |     novelty_rate = float(novelty_rate * 100) / num
98 |     print("================================= Performance Statistic =================================")
99 |     print("Precision rate: {:.2f}% | Recall rate: {:.2f}% | "
100 |           "F-Measure rate: {:.2f}% | Novelty rate: {:.2f}%".format(precision_rate, recall_rate, fm_rate, novelty_rate))
101 | 
102 | 
103 | def main():
104 |     data_train, data_validate, cache = get_data()
105 |     with open('/home/dev/data/andrew.zhu/tdm/data_flow/final_tree.pkl', 'rb') as f:
106 |         tree = pickle.load(f)
107 |     item_ids, item_size, node_size = tree.items, len(tree.items), tree.node_size
108 |     print(item_size)
109 |     print(node_size)
110 |     model = Model(item_size, node_size,9)
111 |     # play_hist = [29309729,23571206,28580191,321787,26565248]
112 |     # play_hist_index = []
113 |     # for i in play_hist:
114 |     #     play_hist_index.append(item_index[i])
115 |     play_hist_index = [100.0, 77.0, 800.0, 999.0,1200.0]
116 | 
117 |     # print(play_hist_index)
118 |     import os
119 |     os.environ['CUDA_VISIBLE_DEVICES'] = '1'  # select which GPU to use
120 |     with tf.Session() as sess:
121 |         saver = tf.train.Saver()
122 |         saver.restore(sess,
"/home/dev/data/andrew.zhu/tdm/model/tdm.ckpt") 123 | # data = np.array([[12.0, 552.0, 853.0, 12.0, 283.0, 210.0, 5.0, 0.0, 0.0]]) 124 | # import time 125 | # t1 = time.clock() 126 | # rval = model.predict(data, sess) 127 | # t2 = time.clock() 128 | # print(t2 - t1) 129 | # print('rval') 130 | # print(rval) 131 | import time 132 | ts = time.clock() 133 | result = candidates_generator(play_hist_index, tree.root, 50, model,sess) 134 | ts1 = time.clock() 135 | print(ts1 - ts) 136 | print(result) 137 | # result_albumId = [] 138 | # for i in result: 139 | # result_albumId.append(index_item[i]) 140 | # print(result_albumId) 141 | -------------------------------------------------------------------------------- /relative_album_caculate.py: -------------------------------------------------------------------------------- 1 | from .din_model import Model 2 | import pickle 3 | from .tdm import get_data 4 | import numpy as np 5 | from ximalaya_brain_jobs.model.util import upload_result_to_hdfs 6 | 7 | 8 | def load_data(): 9 | with open('/home/dev/data/andrew.zhu/tdm/data_flow/final_tree.pkl', 'wb') as f: 10 | tree = pickle.load(f) 11 | return tree 12 | 13 | def embedding_index(embeddings, space='ip'): 14 | """ 15 | 通过 hnswlib 建立 item向量索引,从而快速进行最近邻查找 16 | :param sess: 17 | :param space: 18 | :return: 19 | """ 20 | # embeddings = sess.run("vec/clip_V:0") 21 | print("embeddings type is %s" % type(embeddings)) 22 | dim = embeddings.shape[1] 23 | print('embeddings shape is %s' % str(embeddings.shape)) 24 | 25 | # # 建立索引 26 | import hnswlib 27 | nmsl_index = hnswlib.Index(space=space, dim=dim) 28 | nmsl_index.init_index(max_elements=100000, ef_construction=200) 29 | nmsl_index.set_ef(50) 30 | nmsl_index.add_items(embeddings) 31 | return nmsl_index 32 | 33 | def get_embedding(): 34 | data_train, data_validate, cache = get_data() 35 | print('data_train len %d'% len(data_train)) 36 | print('data_validate len %d' % len(data_validate)) 37 | # uid,ts,item_list,behavior_list + mask 38 | _, _, tree = cache 39 | item_ids, item_size ,node_size = tree.items, len(tree.items),tree.node_size 40 | print('item_size %d' % item_size) 41 | print('node_size %d' % node_size) 42 | model = Model(item_size, node_size,10) 43 | with tf.Session() as sess: 44 | saver = tf.train.Saver() 45 | saver.restore(sess, "/home/dev/data/andrew.zhu/tdm/model/tdm.ckpt") 46 | item_embeddings = sess.run(model.item_emb_w) 47 | # print(item_embeddings.tolist()) 48 | return np.array(item_embeddings) 49 | 50 | # print(item_embeddings.tolist()) 51 | # return item_embeddings 52 | 53 | def get_dict(): 54 | with open('/home/dev/data/andrew.zhu/tdm/data_flow/sample.pkl', 'rb') as f: 55 | data_train = pickle.load(f) 56 | data_validate = pickle.load(f) 57 | cache = pickle.load(f) 58 | return cache 59 | 60 | def get_item_similar_item(index_item_dict, nmsl, embeddings, save_path, file_name, topK=30): 61 | """ 62 | 获取item相似item 63 | :param index_album_dict: 64 | :param nmsl: 65 | :param embeddings: 66 | :param topK: 67 | :return: 68 | """ 69 | print("top k is %s" % topK) 70 | 71 | labels, distance = nmsl.knn_query(embeddings, k=topK) 72 | print(labels.shape) 73 | print(distance.shape) 74 | item_length = len(index_item_dict) 75 | print("sim album num is %d" % item_length) 76 | result = {} 77 | for i in range(item_length): 78 | # print(i) 79 | item_id = index_item_dict[i] 80 | label = labels[i] 81 | 82 | items = [] 83 | for j in label.tolist(): 84 | try: 85 | items.append(index_item_dict[int(j)]) 86 | except: 87 | print('-- %s -- %s' % (j, type(j))) 88 | # albums = 
[index_album_dict[j] for j in label.tolist()] 89 | similar_item = [] 90 | for k in range(topK): 91 | similar_item.append(str(items[k])) 92 | sim_item = '|'.join(similar_item) 93 | result[item_id] = sim_item 94 | # print("album_id is %s" % album_id) 95 | # print("sim album is %s" % sim_album) 96 | 97 | from pandas.core.frame import DataFrame 98 | re = DataFrame.from_dict(result, orient='index', columns=['re_items']) 99 | re = re.reset_index().rename(columns={'index': 'item_id'}) 100 | # re.rename(columns={0: 'album_id', 1: 're_albums'}, inplace=True) 101 | print(re.head(5)) 102 | re.to_csv(save_path + file_name, index=True) 103 | upload_result_to_hdfs("/user/dev/andrew.zhu/test", 104 | save_path + file_name) 105 | return result 106 | 107 | import tensorflow as tf 108 | 109 | def main(): 110 | # tree = load_data() 111 | # save_path = '/home/dev/data/andrew.zhu/tdm/model/tdm.ckpt' 112 | # item_ids, item_size, node_size = tree.items, len(tree.items), tree.node_size 113 | 114 | item_embedding = get_embedding() 115 | item_embedding_index = embedding_index(item_embedding) 116 | # 117 | (user_dict, item_dict, random_tree) = get_dict() 118 | item_sim_item_save_path='/home/dev/data/andrew.zhu/tdm/data_flow/' 119 | file_name='album_sim' 120 | item_dict = dict(zip(item_dict.values(), item_dict.keys())) 121 | get_item_similar_item(item_dict, item_embedding_index, item_embedding, item_sim_item_save_path, file_name, 4) -------------------------------------------------------------------------------- /sample_init.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import random 4 | import multiprocessing as mp 5 | import pandas as pd 6 | import numpy as np 7 | from ximalaya_brain_jobs.train.vip.tdm.construct_tree import TreeInitialize 8 | import pickle 9 | from ximalaya_brain_utils.hdfs_util import HdfsClient 10 | #载入csv处理写入pickle 11 | import glob,os 12 | 13 | def _mask_padding(data, max_len): 14 | size = data.shape[0] 15 | raw = data.values 16 | mask = np.array([[-2] * max_len for _ in range(size)]) 17 | for i in range(size): 18 | mask[i, :len(raw[i])] = raw[i] 19 | return mask.tolist() 20 | 21 | 22 | def data_process(local): 23 | """convert and split the raw data.""" 24 | #user_id,item_id,category_id,behavior_type index化 25 | path = local 26 | print(path) 27 | file = glob.glob(os.path.join(path, "*.csv")) 28 | dl = [] 29 | for f in file: 30 | dl.append(pd.read_csv(f, header=None, 31 | names=['user_ID', 'item_ID', 'category_ID'])) 32 | data_raw = pd.concat(dl).dropna().reset_index(drop=True) 33 | print('data_raw') 34 | print(data_raw) 35 | # print('finish load') 36 | # print(data_raw) 37 | user_list = data_raw.user_ID.drop_duplicates().to_list() 38 | user_dict = dict(zip(user_list, range(len(user_list)))) 39 | data_raw['user_ID'] = data_raw.user_ID.apply(lambda x: user_dict[x]) 40 | item_list = data_raw.item_ID.drop_duplicates().to_list() 41 | item_dict = dict(zip(item_list, range(len(item_list)))) 42 | data_raw['item_ID'] = data_raw.item_ID.apply(lambda x: item_dict[x]) 43 | category_list = data_raw.category_ID.drop_duplicates().to_list() 44 | category_dict = dict(zip(category_list, range(len(category_list)))) 45 | data_raw['category_ID'] = data_raw.category_ID.apply(lambda x: category_dict[x]) 46 | 47 | #建立二叉树 48 | random_tree = TreeInitialize(data_raw) 49 | _ = random_tree.random_binary_tree() 50 | print('stop build tree') 51 | 52 | #行为数据按user_id,timestamp聚合 53 | data = data_raw.groupby(['user_ID'])['item_ID'].apply(list).reset_index() 
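# Shape example with made-up values: after the groupby above, each row is one user with all
# of their item_IDs collapsed into a python list, e.g. user_ID=0, item_ID=[12, 7, 93, 5, 41];
# behavior_num below is just the length of that list, which the behavior_num == 5 filter uses.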
54 | data['behavior_num'] = data.item_ID.apply(lambda x: len(x)) 55 | print('computer behavior_num') 56 | #过滤行为数据小于10次的user 57 | mask_length = data.behavior_num.max() 58 | print('mask_length %d' % mask_length) 59 | data = data.sample(frac=1).reset_index(drop=True) 60 | data = data[data.behavior_num == 5] 61 | print('5 hist len') 62 | print(data.shape) 63 | # data = data[data.behavior_num < 10] 64 | # print('finish filter num > 10') 65 | #加mask 66 | # data['item_ID'] = _mask_padding(data['item_ID'], 6) 67 | #data 'user_ID', 'item_list', 'behaviors_num' 68 | # data_train, data_validate = data[:-100000], data[-100000:] 69 | data_train, data_validate = data[:-20000], data[-20000:] 70 | cache = (user_dict, item_dict, random_tree) 71 | # return data_train, data_validate.reset_index(drop=True), cache 72 | with open('/home/dev/data/andrew.zhu/tdm/data_flow/sample.pkl', 'wb') as f: 73 | pickle.dump(data_train, f, pickle.HIGHEST_PROTOCOL) # uid, iid 74 | pickle.dump(data_validate, f, pickle.HIGHEST_PROTOCOL) # cid of iid line 75 | pickle.dump(cache, 76 | f, pickle.HIGHEST_PROTOCOL) 77 | 78 | def test_pickle(): 79 | with open('/home/dev/data/andrew.zhu/tdm/data_flow/sample.pkl', 'rb') as f: 80 | data_train = pickle.load(f) 81 | data_validate = pickle.load(f) 82 | user_dict, item_dict, random_tree = pickle.load(f) 83 | print('data_train %d' % len(data_train)) 84 | print('data_validate %d' % len(data_validate)) 85 | print('user_num %d'% len(user_dict)) 86 | print('item_num %d' % len(item_dict)) 87 | print('tree_item_num %d' % len(random_tree.items)) 88 | print('tree_node_num %d' % random_tree.node_size) 89 | # print(user_dict) 90 | # print(item_dict) 91 | # print(random_tree) 92 | 93 | def df_split(df, num): 94 | row = df.shape[0] 95 | part_size = row // num 96 | df_list = [] 97 | for i in range(num): 98 | start, end = part_size * i, part_size * (i + 1) 99 | df_tmp = df.iloc[start: end, :] 100 | df_list.append(df_tmp) 101 | if row % num != 0: 102 | df_list.append(df.iloc[end:row, :]) 103 | return df_list 104 | 105 | def del_file(path_data): 106 | for i in os.listdir(path_data) : 107 | file_data = path_data + "/" + i 108 | if os.path.isfile(file_data) == True: 109 | os.remove(file_data) 110 | else: 111 | del_file(file_data) 112 | 113 | def sample_merge_multiprocess(data, tree_map,mode, split_num ,dir): 114 | del_file(dir) 115 | df_list = df_split(data, split_num) 116 | length = len(df_list) 117 | print("total dataset length %d df_list_length is %d" % (len(data),length)) 118 | from multiprocessing import Pool, Process 119 | # datas = Manager().list() 120 | p_list = [] 121 | for i in range(length): 122 | p = Process(target=merge_samples, args=(df_list[i], tree_map, mode, i)) 123 | p.start() 124 | p_list.append(p) 125 | for res in p_list: 126 | res.join() 127 | 128 | 129 | def _single_node_sample(item_id, node, root): 130 | samples = [] 131 | positive_info = {} 132 | i = 0 133 | s = time.clock() 134 | while node: 135 | if node.item_id is None: 136 | single_sample = [item_id, node.val, 0, 1] 137 | else: 138 | single_sample = [item_id, node.item_id, 1, 1] 139 | samples.append(single_sample) 140 | positive_info[i] = node 141 | node = node.parent 142 | i += 1 143 | #j代表 叶子节点到root一路的index k代表当前level 144 | j, k = i-1, 0 145 | level_nodes = [root] 146 | while level_nodes: 147 | tmp = [] 148 | for node in level_nodes: 149 | if node.left: 150 | tmp.append(node.left) 151 | if node.right: 152 | tmp.append(node.right) 153 | if j >= 0: 154 | level_nodes.remove(positive_info[j]) 155 | if level_nodes: 156 | if 
len(level_nodes) <= 2*k: 157 | index_list = range(len(level_nodes)) 158 | else: 159 | index_list = random.sample(range(len(level_nodes)), 2*k) 160 | if j == 0: 161 | index_list = random.sample(range(len(level_nodes)), 80) 162 | for level_index in index_list: 163 | if level_nodes[level_index].item_id is None: 164 | single_sample = [item_id, level_nodes[level_index].val, 0, 0] 165 | else: 166 | single_sample = [item_id, level_nodes[level_index].item_id, 1, 0] 167 | samples.append(single_sample) 168 | level_nodes = tmp 169 | k += 1 170 | j -= 1 171 | e = time.clock() 172 | print('time %f' % (e-s)) 173 | samples = pd.DataFrame(samples, columns=['item_ID', 'node', 'is_leaf', 'label']) 174 | return samples 175 | 176 | def map_generate(df): 177 | #生成map 为了提高访问速度 178 | r_value = {} 179 | df = df.values 180 | for i in df: 181 | value = r_value.get(i[0]) 182 | if value == None: 183 | r_value[i[0]] = [[i[1],i[2],i[3]]] 184 | else: 185 | r_value[i[0]].append([i[1], i[2], i[3]]) 186 | return r_value 187 | 188 | def _single_node_sample_1(item_id, node, node_list): 189 | samples = [] 190 | positive_info = [] 191 | i = 0 192 | while node: 193 | if node.item_id is None: 194 | single_sample = [item_id, node.val, 0, 1] 195 | id = node.val 196 | else: 197 | single_sample = [item_id, node.item_id, 1, 1] 198 | id = node.item_id 199 | samples.append(single_sample) 200 | positive_info.append(id) 201 | node = node.parent 202 | i += 1 203 | #j从root下面一层开始的层id 204 | j = i-2 205 | #当前tree_list_map数据结构为[[(id,is_leaf)],[]] 206 | tree_depth = len(node_list) 207 | for i in range(1,tree_depth): 208 | #i为数的当前层数从1开始 209 | tmp_map = node_list[i] 210 | # if(i <= 2): 211 | # index_list = random.sample(range(len(tmp_map)), 2) 212 | # else: 213 | index_list = random.sample(range(len(tmp_map)), 2*i) 214 | if j == 0: 215 | remove_item = (positive_info[j], 1) 216 | else: 217 | remove_item = (positive_info[j], 0) 218 | sample_iter = [] 219 | for level_index in index_list: 220 | single_sample = [item_id, tmp_map[level_index][0], tmp_map[level_index][1], 0] 221 | sample_iter.append(single_sample) 222 | 223 | if [item_id, remove_item[0], remove_item[1], 0] in sample_iter: 224 | sample_iter.remove([item_id, remove_item[0], remove_item[1], 0]) 225 | samples.extend(sample_iter) 226 | j -= 1 227 | if(j < 0): 228 | break 229 | return samples 230 | 231 | 232 | def tree_generate_samples(items, leaf_dict, node_list): 233 | """Sample based on the constructed tree with multiprocess.""" 234 | samples_total = [] 235 | for item in items: 236 | if item != -2: 237 | node = leaf_dict[item] 238 | samples = _single_node_sample_1(item, node, node_list) 239 | samples_total.extend(samples) 240 | # total_samples = pd.concat(samples, ignore_index=True) 241 | samples = pd.DataFrame(samples_total, columns=['item_ID', 'node', 'is_leaf', 'label']) 242 | return samples 243 | # return total_samples 244 | 245 | 246 | def _single_data_merge(data, tree_data): 247 | complete_data = None 248 | # tree_data ['item_ID', 'node', 'is_leaf', 'label'] 249 | # data ['user_ID','timestamp','item','behaviors'] 250 | item_ids = np.array(data.item_ID) 251 | # item_ids = item_ids[item_ids != -2] 252 | for item in item_ids: 253 | samples_tree_item = tree_data[tree_data.item_ID == item][['node', 'is_leaf', 'label']].reset_index(drop=True) 254 | size = samples_tree_item.shape[0] 255 | data_extend = pd.concat([data] * size, axis=1, ignore_index=True).T 256 | data_item = pd.concat([data_extend, samples_tree_item], axis=1) 257 | if complete_data is None: 258 | complete_data = data_item 259 
| else: 260 | complete_data = pd.concat([complete_data, data_item], axis=0, ignore_index=True) 261 | return complete_data 262 | 263 | def merge_samples(data, tree_map,mode,process_id): 264 | def list_tile(data, list_index): 265 | # [1,[2,3,4],5] -> [1,2,3,4,5] 266 | out = [] 267 | for j in range(len(data)): 268 | if j != list_index: 269 | out.append(data[j]) 270 | else: 271 | out.extend(data[j]) 272 | return out 273 | t_1 = time.clock() 274 | print('-----------> 进程: %d - chunk: %s <------------' % (process_id, data.shape[0])) 275 | #生成样本数据 为了效率 树生成的物品index改成map结构 276 | train_size = data.shape[0] 277 | r_value = [] 278 | #[user_ID,item_ID,behavior_num] ['node', 'is_leaf', 'label'] 279 | j = 0 280 | s = time.clock() 281 | for i in range(train_size): 282 | data_row = data.iloc[i] 283 | data_row_values = data_row.values 284 | item_list = data_row.item_ID 285 | data_row_values_tile = list_tile(data_row_values,1) 286 | # data_row_values_tile = data_row_values 287 | for item in item_list: 288 | # if(item == -2): 289 | # break 290 | l_len = len(tree_map[item]) 291 | tmp = np.append(l_len*[data_row_values_tile],tree_map[item],axis=1) 292 | r_value.extend(tmp) 293 | if(i % 10000 == 0 and i != 0): 294 | # np.savetxt('/home/dev/data/andrew.zhu/tdm/data_flow/%s/%s_%s.csv' % (mode,process_id,j), r_value, delimiter=",",fmt='%d') 295 | pd.DataFrame(r_value)\ 296 | .to_csv('/home/dev/data/andrew.zhu/tdm/data_flow/%s/%i_%s.csv' % (mode,process_id,j), 297 | header=False,index=False) 298 | print('mode:%s,process:%s,epoch:%d,time:%f,length:%d' % (mode,process_id,j, time.clock() - s,len(r_value))) 299 | s = time.clock() 300 | r_value = [] 301 | j = j + 1 302 | if len(r_value)!= 0: 303 | pd.DataFrame(r_value) \ 304 | .to_csv('/home/dev/data/andrew.zhu/tdm/data_flow/%s/%i_%s.csv' % (mode, process_id, j), 305 | header=False, index=False) 306 | t_2 = time.clock() 307 | print('进程 %d : time_use=%.2f s' % (process_id, t_2 - t_1)) 308 | """combine the preprocessed samples and tree samples.""" 309 | 310 | 311 | class DataInput: 312 | def __init__(self, data, batch_size): 313 | 314 | self.batch_size = batch_size 315 | self.data = data 316 | self.epoch_size = len(self.data) // self.batch_size 317 | if self.epoch_size * self.batch_size < len(self.data): 318 | self.epoch_size += 1 319 | self.i = 0 320 | 321 | def __iter__(self): 322 | return self 323 | 324 | def __next__(self): 325 | 326 | if self.i == self.epoch_size: 327 | raise StopIteration 328 | 329 | ts = self.data[self.i * self.batch_size : min((self.i+1) * self.batch_size, 330 | len(self.data))] 331 | self.i += 1 332 | # (reviewerID, hist, albumId, label) 333 | i, y,is_leaf, sl = [], [], [] , [] 334 | for t in ts: 335 | i.append(t[3]) 336 | y.append(t[5]) 337 | sl.append(t[2]) 338 | is_leaf.append(t[4]) 339 | max_sl = max(sl) 340 | 341 | hist_i = np.zeros([len(ts), max_sl], np.int64) 342 | 343 | k = 0 344 | for t in ts: 345 | for l in range(len(t[1])): 346 | hist_i[k][l] = t[1][l] 347 | k += 1 348 | 349 | return self.i, (i, y,is_leaf, hist_i, sl) 350 | 351 | class DataInputTest: 352 | def __init__(self, data, batch_size): 353 | 354 | self.batch_size = batch_size 355 | self.data = data 356 | self.epoch_size = len(self.data) // self.batch_size 357 | if self.epoch_size * self.batch_size < len(self.data): 358 | self.epoch_size += 1 359 | self.i = 0 360 | 361 | def __iter__(self): 362 | return self 363 | 364 | def __next__(self): 365 | 366 | if self.i == self.epoch_size: 367 | raise StopIteration 368 | 369 | ts = self.data[self.i * self.batch_size : min((self.i+1) * 
self.batch_size, 370 | len(self.data))] 371 | self.i += 1 372 | # reviewerID, hist, label 373 | u, i, j, sl = [], [], [], [] 374 | for t in ts: 375 | u.append(t[0]) 376 | i.append(t[2][0]) 377 | j.append(t[2][1]) 378 | sl.append(len(t[1])) 379 | max_sl = max(sl) 380 | hist_i = np.zeros([len(ts), max_sl], np.int64) 381 | 382 | k = 0 383 | for t in ts: 384 | for l in range(len(t[1])): 385 | hist_i[k][l] = t[1][l] 386 | k += 1 387 | 388 | return self.i, (u, i, j, hist_i, sl) 389 | 390 | 391 | def download(hdfs,local): 392 | # hdfs_path = ['/tmp/user/dev/andrew.zhu/vip/buy/*'] 393 | # 394 | hdfs_train_paths = hdfs 395 | local_train_path = local 396 | client = HdfsClient() 397 | print('------------------------') 398 | print(hdfs_train_paths) 399 | print(local_train_path) 400 | print('------------------------') 401 | client.download(hdfs_train_paths, 402 | local_train_path, 403 | overwrite=True) 404 | 405 | print('----------------> get data finished <-------------------' + str(local_train_path)) 406 | 407 | def main(): 408 | hdfs_path = '/user/dev/andrew.zhu/tdm/data' 409 | local_path = '/home/dev/data/andrew.zhu/tdm/' 410 | download(hdfs_path,local_path) 411 | #数据过滤了 >300 要修正 412 | data_process(local_path+"data") 413 | test_pickle() -------------------------------------------------------------------------------- /tdm.py: -------------------------------------------------------------------------------- 1 | from ximalaya_brain_jobs.train.vip.tdm.sample_init import data_process, tree_generate_samples, sample_merge_multiprocess, DataInput , map_generate 2 | from ximalaya_brain_jobs.train.vip.tdm.construct_tree import TreeLearning 3 | from ximalaya_brain_jobs.train.vip.tdm.din_model import Model 4 | import tensorflow as tf 5 | import pickle 6 | import os 7 | import sys 8 | import time 9 | import random 10 | from ximalaya_brain_jobs.train.vip.tdm.dataset import DataGenerator 11 | import pandas as pd 12 | from ximalaya_brain_jobs.model.util import get_train_test_steps_dir 13 | from ximalaya_brain_jobs.model.util import upload_result_to_hdfs 14 | 15 | def get_data(): 16 | with open('/home/dev/data/andrew.zhu/tdm/data_flow/sample.pkl', 'rb') as f: 17 | data_train = pickle.load(f) 18 | data_validate = pickle.load(f) 19 | cache = pickle.load(f) 20 | return data_train, data_validate, cache 21 | 22 | def run(model,train_set,test_set,model_save_path,train_step, validation_step,model_pb_save_path): 23 | gpu_options = tf.GPUOptions(allow_growth=True) 24 | with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess: 25 | sess.run(tf.global_variables_initializer()) 26 | sess.run(tf.local_variables_initializer()) 27 | # print('test_gauc: %.4f\t test_auc: %.4f' % _eval(sess, model)) 28 | lr = 0.01 29 | start_time = time.time() 30 | best_auc = 0.0 31 | for _ in range(1): 32 | loss_sum = 0.0 33 | for i in range(train_step): 34 | loss = model.train(sess, train_set, lr) 35 | loss_sum += loss 36 | if model.global_step.eval() % (train_step//3) == 0: 37 | auc = model._eval(sess, model,test_set,validation_step) 38 | print('Epoch %d Global_step %d\tTrain_loss: %.4f\tEval_AUC: %.4f' % 39 | (model.global_epoch_step.eval(), model.global_step.eval(), 40 | loss_sum / 1000, auc)) 41 | if best_auc < auc: 42 | best_auc = auc 43 | model.save(sess, model_save_path) 44 | from tensorflow.python.framework import graph_util 45 | constant_graph = graph_util.convert_variables_to_constants(sess, sess.graph_def, 46 | ['input', 'prediction']) 47 | with tf.gfile.FastGFile(model_pb_save_path, 48 | mode='wb') as f: # 模型的名字是model.pb 
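# The frozen graph only depends on the 'input' and 'prediction' tensor names defined in
# din_model.Model, so the exported tdm.pb can be loaded for serving without the python model code.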
49 | f.write(constant_graph.SerializeToString()) 50 | upload_result_to_hdfs("/user/dev/andrew.zhu/tdm/model/tdm.pb", 51 | model_pb_save_path) 52 | 53 | 54 | sys.stdout.flush() 55 | loss_sum = 0.0 56 | if model.global_step.eval() % 20000 == 0: 57 | lr = 0.001 58 | print('Epoch %d DONE\tCost time: %.2f' % 59 | (model.global_epoch_step.eval(), time.time()-start_time)) 60 | sys.stdout.flush() 61 | model.global_epoch_step_op.eval() 62 | 63 | print('best test_gauc:', best_auc) 64 | sys.stdout.flush() 65 | 66 | def main(): 67 | data_train, data_validate, cache = get_data() 68 | print('data_train len %d'% len(data_train)) 69 | print('data_validate len %d' % len(data_validate)) 70 | # uid,ts,item_list,behavior_list + mask 71 | _, _, tree = cache 72 | item_ids, item_size ,node_size = tree.items, len(tree.items),tree.node_size 73 | print('item_size %d' % item_size) 74 | print('node_size %d' % node_size) 75 | num_epoch = 2 76 | model_save_path = '/home/dev/data/andrew.zhu/tdm/model/tdm.ckpt' 77 | model_pb_save_path = '/home/dev/data/andrew.zhu/tdm/model/tdm.pb' 78 | train_dir = '/home/dev/data/andrew.zhu/tdm/data_flow/train' 79 | test_dir = '/home/dev/data/andrew.zhu/tdm/data_flow/test' 80 | model_feature_dim = 9 81 | data_generator_feature_dim = 10 82 | model = Model(item_size, node_size, model_feature_dim) 83 | batch_size = 20000 84 | while num_epoch > 0: 85 | # ['item_ID', 'node', 'is_leaf', 'label'] 86 | #生成树样本 把二叉树提出来用数组存储 提高访问效率 87 | node_list = tree._node_list(tree.root) 88 | print('node_list_len %d' % len(node_list)) 89 | item_ids, item_size, node_size = tree.items, len(tree.items), tree.node_size 90 | start = time.clock() 91 | #根据树生成正负样本 92 | tree_samples = tree_generate_samples(item_ids, tree.leaf_dict, node_list) 93 | o = time.clock() 94 | print('finish tree_samples %d' % (o-start)) 95 | print('tree_sample length %d' % len(tree_samples)) 96 | #优化数据结构生成map 97 | s = time.clock() 98 | tree_map = map_generate(tree_samples) 99 | o = time.clock() 100 | print('finish build map %f' % (o-s)) 101 | # 生成训练样本,文件写csv 102 | sample_merge_multiprocess(data_train , tree_map,'train',7,train_dir) 103 | sample_merge_multiprocess(data_validate,tree_map,'test',5,test_dir) 104 | # 获取训练数据,测试数据样本数 105 | train_step, validation_step = get_train_test_steps_dir(train_dir, test_dir, batch_size) 106 | print('train_step:%d' % train_step) 107 | print('test_step:%d' % validation_step) 108 | data_generator = DataGenerator(train_dir, test_dir, data_generator_feature_dim, batch_size=batch_size) 109 | train_dataset,test_dataset = data_generator.datasetCreate() 110 | train = train_dataset.make_one_shot_iterator().get_next() 111 | test = test_dataset.make_one_shot_iterator().get_next() 112 | run(model,train,test,model_save_path,train_step,validation_step,model_pb_save_path) 113 | num_epoch -= 1 114 | 115 | if num_epoch > 0: 116 | item_embeddings = model.get_embeddings(item_ids,model_save_path) 117 | tree = TreeLearning(item_embeddings, item_ids) 118 | _ = tree.clustering_binary_tree() 119 | tree._rebuild_item_list() 120 | with open('/home/dev/data/andrew.zhu/tdm/data_flow/final_tree.pkl', 'wb') as f: 121 | pickle.dump(tree, f, pickle.HIGHEST_PROTOCOL) # uid, iid 122 | # dtest = Dataset(vtrain, 100) 123 | # metrics_count(dtest, tree.root, 150, model) 124 | print("========================================== end ==========================================") 125 | 126 | 127 | -------------------------------------------------------------------------------- /test.py: 
-------------------------------------------------------------------------------- 1 | from construct_tree import TreeInitialize 2 | 3 | def preorder(self, root): 4 | if root is None: 5 | return '' 6 | if root.lef: 7 | self.preorder(root.left) 8 | if root.right: 9 | self.preorder(root.right) 10 | 11 | def leaf(root,list): 12 | if root == None: 13 | return 0 14 | elif root.left == None and root.right == None: 15 | list.append(root.item_id) 16 | return list 17 | else: 18 | leaf(root.left, list) 19 | leaf(root.right, list) 20 | return list 21 | 22 | def map_generate(df): 23 | #生成map 为了提高访问速度 24 | r_value = {} 25 | df = df.values 26 | for i in df: 27 | value = r_value.get(i[0]) 28 | if value == None: 29 | r_value[i[0]] = [[i[1],i[2],i[3]]] 30 | else: 31 | r_value[i[0]].append([i[1], i[2], i[3]]) 32 | r_value[i[0]] = r_value[i[0]] 33 | return r_value 34 | 35 | def _node_list1(root): 36 | # 将二叉树数据提出放入list 37 | def node_val(node): 38 | if (node.left or node.right): 39 | return str(node.val)+'-'+ str(0) 40 | else: 41 | return str(node.item_id)+'-'+ str(1) 42 | 43 | node_queue = [root] 44 | arr_arr_node = [] 45 | arr_arr_node.extend([node_val(node_queue[0])]) 46 | while node_queue: 47 | tmp = [] 48 | tmp_val = [] 49 | for i in node_queue: 50 | if i is None: 51 | tmp.append(None) 52 | tmp.append(None) 53 | tmp_val.append("") 54 | tmp_val.append("") 55 | else: 56 | if i.left: 57 | tmp.append(i.left) 58 | tmp_val.append(node_val(i.left)) 59 | else: 60 | tmp.append(None) 61 | tmp_val.append("") 62 | if i.right: 63 | tmp.append(i.right) 64 | tmp_val.append(node_val(i.right)) 65 | else: 66 | tmp.append(None) 67 | tmp_val.append("") 68 | node_queue = tmp 69 | is_break = True 70 | for j in tmp: 71 | if j != None: 72 | is_break = False 73 | if is_break: 74 | break 75 | else: 76 | arr_arr_node.extend(tmp_val) 77 | return arr_arr_node 78 | 79 | 80 | def printTree(root): 81 | if not root: 82 | return 83 | print('Binary Tree:') 84 | printInOrder(root, 0, 'H', 17) 85 | 86 | 87 | def printInOrder(root, height, preStr, length): 88 | if not root: 89 | return 90 | printInOrder(root.right, height + 1, 'v', length) 91 | string = preStr + str(root.val) + preStr 92 | leftLen = (length - len(string)) // 2 93 | rightLen = length - len(string) - leftLen 94 | res = " " * leftLen + string + " " * rightLen 95 | print(" " * height * length + res) 96 | printInOrder(root.left, height + 1, '^', length) 97 | 98 | 99 | 100 | def _node_list(root): 101 | # 将二叉树数据提出放入list 102 | def node_val(node): 103 | if (node.left or node.right): 104 | return (node.val, 0) 105 | else: 106 | return (node.item_id, 1) 107 | 108 | node_queue = [root] 109 | arr_arr_node = [] 110 | arr_arr_node.append([node_val(node_queue[0])]) 111 | while node_queue: 112 | tmp = [] 113 | tmp_val = [] 114 | for i in node_queue: 115 | if i is None: 116 | tmp.append(None) 117 | tmp.append(None) 118 | tmp_val.append("") 119 | tmp_val.append("") 120 | else: 121 | if i.left: 122 | tmp.append(i.left) 123 | tmp_val.append(node_val(i.left)) 124 | else: 125 | tmp.append(None) 126 | tmp_val.append("") 127 | if i.right: 128 | tmp.append(i.right) 129 | tmp_val.append(node_val(i.right)) 130 | else: 131 | tmp.append(None) 132 | tmp_val.append("") 133 | node_queue = tmp 134 | is_break = True 135 | for j in tmp: 136 | if j != None: 137 | is_break = False 138 | if is_break: 139 | break 140 | else: 141 | arr_arr_node.append(tmp_val) 142 | return arr_arr_node 143 | 144 | 145 | if __name__ == '__main__': 146 | a = [] 147 | import pandas as pd 148 | import numpy as np 149 | data = 
pd.DataFrame({'item_ID':range(20),'category_ID':range(20)}) 150 | # data1= data.sample(frac=1).reset_index(drop=True) 151 | # print(data1) 152 | tree = TreeInitialize(data) 153 | tree.random_binary_tree() 154 | print(leaf(tree.root,a)) 155 | r = _node_list(tree.root) 156 | r1 = _node_list1(tree.root) 157 | print(r) 158 | print(r1) 159 | # 160 | # 161 | # import numpy as np 162 | # a = np.array([[1,2,3],[4,4,5]]) 163 | # print(a.shape) -------------------------------------------------------------------------------- /treeAndDict2File.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | from ximalaya_brain_utils.hdfs_util import HdfsClient 3 | 4 | def _node_list(root): 5 | # 将二叉树数据提出放入list 6 | def node_val(node): 7 | if (node.left or node.right): 8 | return str(node.val)+'-'+ str(0) 9 | else: 10 | return str(node.item_id)+'-'+ str(1) 11 | 12 | node_queue = [root] 13 | arr_arr_node = [] 14 | arr_arr_node.extend([node_val(node_queue[0])]) 15 | while node_queue: 16 | tmp = [] 17 | tmp_val = [] 18 | for i in node_queue: 19 | if i is None: 20 | tmp.append(None) 21 | tmp.append(None) 22 | tmp_val.append("") 23 | tmp_val.append("") 24 | else: 25 | if i.left: 26 | tmp.append(i.left) 27 | tmp_val.append(node_val(i.left)) 28 | else: 29 | tmp.append(None) 30 | tmp_val.append("") 31 | if i.right: 32 | tmp.append(i.right) 33 | tmp_val.append(node_val(i.right)) 34 | else: 35 | tmp.append(None) 36 | tmp_val.append("") 37 | node_queue = tmp 38 | is_break = True 39 | for j in tmp: 40 | if j != None: 41 | is_break = False 42 | if is_break: 43 | break 44 | else: 45 | arr_arr_node.extend(tmp_val) 46 | return arr_arr_node 47 | 48 | def print_last_layer(root): 49 | # 将二叉树数据提出放入list 50 | node_queue = [root] 51 | arr_arr_node = [] 52 | layer = 0 53 | total = 0 54 | while node_queue: 55 | tmp = [] 56 | for i in node_queue: 57 | if i is None: 58 | continue 59 | else: 60 | if i.left: 61 | tmp.append(i.left) 62 | if i.right: 63 | tmp.append(i.right) 64 | node_queue = tmp 65 | print_out = [] 66 | for i in node_queue: 67 | if(i.item_id is not None): 68 | print_out.append((i.item_id,1)) 69 | else: 70 | print_out.append((i.val,0)) 71 | print('layer %d' % layer) 72 | total = total + len(print_out) 73 | layer = layer + 1 74 | return arr_arr_node 75 | 76 | 77 | def main(): 78 | hdfs_client = HdfsClient() 79 | with open('/home/dev/data/andrew.zhu/tdm/data_flow/final_tree.pkl', 'rb') as f: 80 | tree = pickle.load(f) 81 | # print_last_layer(tree.root) 82 | with open('/home/dev/data/andrew.zhu/tdm/data_flow/tree_str', "w", encoding='utf-8') as f: 83 | out = str(_node_list(tree.root)) 84 | f.write(out) 85 | f.close() 86 | hdfs_client.upload("/user/dev/andrew.zhu/tdm/model/tree_str.txt", "/home/dev/data/andrew.zhu/tdm/data_flow/tree_str", overwrite=True) 87 | 88 | with open('/home/dev/data/andrew.zhu/tdm/data_flow/sample.pkl', 'rb') as f: 89 | data_train = pickle.load(f) 90 | data_validate = pickle.load(f) 91 | user_dict, item_index, tree = pickle.load(f) 92 | 93 | with open('/home/dev/data/andrew.zhu/tdm/data_flow/item_dict', "w", encoding='utf-8') as f: 94 | f.write(str(item_index)) 95 | f.close() 96 | 97 | hdfs_client.upload("/user/dev/andrew.zhu/tdm/model/item_dict.txt", "/home/dev/data/andrew.zhu/tdm/data_flow/item_dict", overwrite=True) 98 | --------------------------------------------------------------------------------
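
The retrieval step described in the README (and implemented in prediction.candidates_generator) is a beam search down the tree: score the current level with the model, keep the top-k nodes, expand their children, and finally re-rank the collected leaves. Below is a minimal, self-contained sketch of that idea; the Node class and toy_score function are stand-ins invented for illustration, not the repo's TreeNode or DIN model.

import heapq

class Node:
    def __init__(self, node_id, item_id=None, left=None, right=None):
        self.node_id = node_id   # internal node index (TreeNode.val in the repo)
        self.item_id = item_id   # set only on leaf nodes
        self.left = left
        self.right = right

def toy_score(user_state, node):
    # stand-in for Model.predict: any callable returning a preference score works here
    return (hash((tuple(user_state), node.node_id)) % 1000) / 1000.0

def layer_wise_retrieval(user_state, root, k, score_fn):
    """Beam search down the tree; returns up to k item_ids."""
    frontier, leaves = [root], []
    while frontier:
        # leaves drop out of the frontier and wait for the final re-ranking
        leaves.extend(n for n in frontier if n.item_id is not None)
        internal = [n for n in frontier if n.item_id is None]
        # keep the k best-scoring internal nodes and expand only their children
        top = heapq.nlargest(k, internal, key=lambda n: score_fn(user_state, n))
        frontier = [c for n in top for c in (n.left, n.right) if c is not None]
    ranked = sorted(leaves, key=lambda n: score_fn(user_state, n), reverse=True)
    return [n.item_id for n in ranked[:k]]

if __name__ == '__main__':
    # tiny complete tree with 4 leaves, item_ids 100..103
    leaf_nodes = [Node(node_id=10 + i, item_id=100 + i) for i in range(4)]
    n1 = Node(1, left=leaf_nodes[0], right=leaf_nodes[1])
    n2 = Node(2, left=leaf_nodes[2], right=leaf_nodes[3])
    root = Node(0, left=n1, right=n2)
    print(layer_wise_retrieval([100.0, 77.0, 800.0], root, k=2, score_fn=toy_score))

The repo's version differs in that the score comes from the DIN network via make_data/model.predict and the frontier bookkeeping is done with the Q and A lists inside candidates_generator.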