├── README.md ├── algorithm_structure.png ├── code_structure.jpg ├── construct_tree.py ├── deep_network.py ├── prediction.py ├── requirements.txt ├── sample_init.py └── tdm.py /README.md: -------------------------------------------------------------------------------- 1 | # Tree-based Deep Model Experiments 2 | --- 3 | 4 | ## Contents 5 | - Environment 6 | - Code Structure 7 | - Algorithm 8 | - Progress 9 | - References 10 | 11 | 12 | ### Environment 13 | OS: Ubuntu 18.04 LTS 14 | IDE: PyCharm 2018.3 15 | [Dataset](https://tianchi.aliyun.com/dataset/dataDetail?dataId=649&userId=1) 16 | 17 | ### Code Structure 18 | ![code-structure](./code_structure.jpg) 19 | File descriptions 20 | tdm.py: entry point; drives training and testing of the complete tree-based deep model 21 | sample_init.py: data processing and sample generation; handles preprocessing and building the tree samples 22 | construct_tree.py: binary-tree builder; constructs the tree model 23 | deep_network.py: DNN implementation; builds the network 24 | prediction.py: tree-node prediction and evaluation; handles model inference and performance validation 25 | 26 | ### Algorithm 27 | ![algorithm-structure](./algorithm_structure.png) 28 | Tree-based deep model workflow (see [1]): 29 | 1. Construct a random binary tree 30 | 2. Generate training samples from the tree 31 | 3. Train the DNN until it converges 32 | 4. Take the item embeddings from the trained DNN and rebuild the tree by clustering 33 | 5. Repeat steps 2-4 34 | 35 | ### Progress 36 | Functional testing is done and the full pipeline runs end to end 37 | TODO: improve performance and validate the model 38 | 39 | ### References 40 | [1] Learning Tree-based Deep Model for Recommender Systems, Han Zhu, Xiang Li, Pengye Zhang, et al. 41 | [2] Deep Interest Network for Click-Through Rate Prediction, Guorui Zhou, Chengru Song, Xiaoqiang Zhu, et al. 42 | [3] Empirical Evaluation of Rectified Activations in Convolutional Network, Bing Xu, Naiyan Wang, Tianqi Chen, et al. 43 | [4] Delving Deep into Rectifiers: Surpassing Human-Level Performance on ImageNet Classification, Kaiming He, Xiangyu Zhang, Shaoqing Ren, et al. 44 | [5] Distributed Representations of Words and Phrases and their Compositionality, Tomas Mikolov, Ilya Sutskever, Kai Chen, et al. 45 | -------------------------------------------------------------------------------- /algorithm_structure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LRegan666/Tree_Deep_Model/eaa3a134baffe2b2b8b7d2ee1f2aea9be495700d/algorithm_structure.png -------------------------------------------------------------------------------- /code_structure.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LRegan666/Tree_Deep_Model/eaa3a134baffe2b2b8b7d2ee1f2aea9be495700d/code_structure.jpg -------------------------------------------------------------------------------- /construct_tree.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time 3 | 4 | 5 | class TreeNode(object): 6 | """define the tree node structure.""" 7 | def __init__(self, x, item_id=None): 8 | self.val = x 9 | self.item_id = item_id 10 | self.parent = None 11 | self.left = None 12 | self.right = None 13 | 14 | 15 | class TreeInitialize(object): 16 | """Build the random binary tree.""" 17 | def __init__(self, data): 18 | self.data = data[['item_ID', 'category_ID']] 19 | self.items = None 20 | self.root = None 21 | self.leaf_dict = {} 22 | self.node_size = 0 23 | 24 | def __random_sort(self): 25 | self.data = self.data.drop_duplicates(['item_ID']) 26 | items_total = self.data.groupby(by=['category_ID'])['item_ID'].apply(lambda x: x) 27 | self.items = items_total.tolist() 28 | return self.items 29 | 30 | def _build_binary_tree(self, root, items): 31 | if len(items) == 1: 32 | leaf_node = TreeNode(0, item_id=items[0]) 33 | leaf_node.parent = root.parent 34 | return leaf_node 35 | left_child, right_child = TreeNode(0), TreeNode(0) 36 | left_child.parent, right_child.parent = 
root, root 37 | mid = int(len(items) / 2) 38 | left = self._build_binary_tree(left_child, items[:mid]) 39 | right = self._build_binary_tree(right_child, items[mid:]) 40 | root.left = left 41 | root.right = right 42 | return root 43 | 44 | def _define_node_index(self, root): 45 | node_queue = [root] 46 | i = 0 47 | try: 48 | while node_queue: 49 | current_node = node_queue.pop(0) 50 | if current_node.left: 51 | node_queue.append(current_node.left) 52 | if current_node.right: 53 | node_queue.append(current_node.right) 54 | if current_node.item_id is not None: 55 | self.leaf_dict[current_node.item_id] = current_node 56 | else: 57 | current_node.val = i 58 | i += 1 59 | self.node_size = i 60 | return 0 61 | except RuntimeError as err: 62 | print("Runtime Error Info: {0}".format(err)) 63 | return -1 64 | 65 | def random_binary_tree(self): 66 | root = TreeNode(0) 67 | items = self.__random_sort() 68 | self.root = self._build_binary_tree(root, items) 69 | _ = self._define_node_index(self.root) 70 | return self.root 71 | 72 | 73 | class TreeLearning(TreeInitialize): 74 | """Build the k-means clustering binary tree""" 75 | def __init__(self, items, index_dict): 76 | self.items = items 77 | self.mapper = index_dict 78 | self.root = None 79 | self.leaf_dict = {} 80 | self.node_size = 0 81 | 82 | def _balance_clutering(self, c1, c2, item1, item2): 83 | amount = item1.shape[0] - item2.shape[0] 84 | if amount > 1: 85 | num = int(amount / 2) 86 | distance = np.sum(np.square(item1 - c1), axis=1) 87 | item_move = item1[distance.argsort()[-num:]] 88 | item2_adjust = np.concatenate((item2, item_move), axis=0) 89 | item1_adjust = np.delete(item1, distance.argsort()[-num:], axis=0) 90 | elif amount < -1: 91 | num = int(abs(amount) / 2) 92 | distance = np.sum(np.square(item2 - c2), axis=1) 93 | item_move = item2[distance.argsort()[-num:]] 94 | item1_adjust = np.concatenate((item1, item_move), axis=0) 95 | item2_adjust = np.delete(item2, distance.argsort()[-num:], axis=0) 96 | else: 97 | item1_adjust, item2_adjust = item1, item2 98 | return item1_adjust, item2_adjust 99 | 100 | def _k_means_clustering(self, items): 101 | m1, m2 = items[0], items[1] 102 | while True: 103 | indicate = np.sum(np.square(items - m1), axis=1) - np.sum(np.square(items - m2), axis=1) 104 | items_m1, items_m2 = items[indicate < 0], items[indicate >= 0] 105 | m1_new = np.sum(items_m1, axis=0) / items_m1.shape[0] 106 | m2_new = np.sum(items_m2, axis=0) / items_m2.shape[0] 107 | if np.sum(np.absolute(m1_new - m1)) < 1e-3 and np.sum(np.absolute(m2_new - m2)) < 1e-3: 108 | break 109 | m1, m2 = m1_new, m2_new 110 | items_m1, items_m2 = self._balance_clutering(m1, m2, items_m1, items_m2) 111 | return items_m1, items_m2 112 | 113 | def _build_binary_tree(self, root, items): 114 | if items.shape[0] == 1: 115 | leaf_node = TreeNode(0, item_id=self.mapper[self.items.index(items[0].tolist())]) 116 | leaf_node.parent = root.parent 117 | return leaf_node 118 | left_items, right_items = self._k_means_clustering(items) 119 | left_child, right_child = TreeNode(0), TreeNode(0) 120 | left_child.parent, right_child.parent = root, root 121 | left = self._build_binary_tree(left_child, left_items) 122 | right = self._build_binary_tree(right_child, right_items) 123 | root.left, root.right = left, right 124 | return root 125 | 126 | def clustering_binary_tree(self): 127 | root = TreeNode(0) 128 | items = np.array(self.items) 129 | self.root = self._build_binary_tree(root, items) 130 | _ = self._define_node_index(self.root) 131 | return self.root 132 | 133 | 
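For reference, the sketch below shows how the random tree builder above can be driven on its own. It is a minimal, hypothetical usage example: the toy DataFrame and its item/category IDs are invented, and in the real pipeline the frame comes from `data_process()` in sample_init.py.

```python
import pandas as pd

from construct_tree import TreeInitialize

# Toy interaction table with the two columns TreeInitialize expects.
toy = pd.DataFrame({
    'item_ID':     [0, 1, 2, 3, 4, 5],
    'category_ID': [0, 0, 1, 1, 2, 2],
})

tree = TreeInitialize(toy)
root = tree.random_binary_tree()       # recursive halving over category-sorted items
print(tree.node_size)                  # number of internal (non-leaf) nodes
print(sorted(tree.leaf_dict.keys()))   # one leaf per item_ID: [0, 1, 2, 3, 4, 5]
```

`leaf_dict` is what `tree_generate_samples` later uses to locate each item's leaf node, and `node_size` sizes the node embedding table in `NeuralNet`.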
-------------------------------------------------------------------------------- /deep_network.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import tensorflow as tf 4 | tf.enable_eager_execution() 5 | 6 | 7 | MODEL_DIR = os.path.dirname(os.path.abspath(__file__)) 8 | MODEL_NAME = MODEL_DIR + '/models/network_model.ckpt' 9 | SUMMARY_DIR = MODEL_DIR + '/logs' 10 | 11 | 12 | class NeuralNet(object): 13 | """Deep network structure: 14 | input_embedding+node_embedding >> 15 | attention_block >> 16 | union_embedding >> 17 | MLP(128>64>24>2) >> 18 | label_probabilities. 19 | """ 20 | def __init__(self, item_size, node_size, embedding_size): 21 | self.item_size = item_size 22 | self.embedding_size = embedding_size 23 | self.item_embeddings = tf.get_variable("item_embeddings", 24 | [self.item_size, self.embedding_size], 25 | use_resource=True) 26 | self.node_embeddings = tf.get_variable("node_embeddings", 27 | [node_size, self.embedding_size], 28 | use_resource=True) 29 | self.saver = None 30 | 31 | def _PRelu(self, x): 32 | m, n = tf.shape(x) 33 | value_init = 0.25 * tf.ones((1, n)) 34 | a = tf.Variable(initial_value=value_init, use_resource=True) 35 | y = tf.maximum(x, 0) + a * tf.minimum(x, 0) 36 | return y 37 | 38 | def _activation_unit(self, item, node): 39 | item, node = tf.reshape(item, [1, -1]), tf.reshape(node, [1, -1]) 40 | hybrid = item * node 41 | feature = tf.concat([item, hybrid, node], axis=1) 42 | layer1 = tf.layers.dense(feature, 36) 43 | layer1_prelu = self._PRelu(layer1) 44 | weight = tf.layers.dense(layer1_prelu, 1) 45 | return weight 46 | 47 | def _attention_feature(self, item, node, is_leafs, features): 48 | item_clip = item[item != -2] 49 | item_embedding = tf.nn.embedding_lookup(self.item_embeddings, item_clip) 50 | if is_leafs[0] == 0: 51 | node_embedding = tf.nn.embedding_lookup(self.node_embeddings, node) 52 | else: 53 | node_embedding = tf.nn.embedding_lookup(self.item_embeddings, node) 54 | item_num, _ = tf.shape(item_embedding) 55 | item_feature = None 56 | for i in range(item_num): 57 | item_weight = self._activation_unit(item_embedding[i], node_embedding[0])[0][0] 58 | if item_feature is None: 59 | item_feature = item_weight * item_embedding[i] 60 | else: 61 | item_feature = tf.add(item_feature, item_weight * item_embedding[i]) 62 | item_feature = tf.concat([tf.reshape(item_feature, [1, -1]), node_embedding], axis=1) 63 | if features is None: 64 | features = item_feature 65 | else: 66 | features = tf.concat([features, item_feature], axis=0) 67 | return features 68 | 69 | def _attention_block(self, items, nodes, is_leafs): 70 | batch, _ = tf.shape(items) 71 | features = None 72 | for i in range(batch): 73 | features = self._attention_feature(items[i], nodes[i], is_leafs[i], features) 74 | return features 75 | 76 | def _network_structure(self, items, nodes, is_leafs, is_training): 77 | batch_features = self._attention_block(items, nodes, is_leafs) 78 | layer1 = tf.layers.dense(batch_features, 128) 79 | layer1_prelu = self._PRelu(layer1) 80 | layer1_bn = tf.layers.batch_normalization(layer1_prelu, training=is_training) 81 | layer2 = tf.layers.dense(layer1_bn, 64) 82 | layer2_prelu = self._PRelu(layer2) 83 | layer2_bn = tf.layers.batch_normalization(layer2_prelu, training=is_training) 84 | layer3 = tf.layers.dense(layer2_bn, 24) 85 | layer3_prelu = self._PRelu(layer3) 86 | layer3_bn = tf.layers.batch_normalization(layer3_prelu, training=is_training) 87 | logits = tf.layers.dense(layer3_bn, 2) 
88 | return logits 89 | 90 | def _check_accuracy(self, iter_epoch, validate_data, is_training): 91 | num_correct, num_samples = 0, 0 92 | for items_val, nodes_val, is_leafs_val, labels_val in validate_data: 93 | scores = self._network_structure(items_val, nodes_val, is_leafs_val, is_training) 94 | scores = scores.numpy() 95 | label_predict = scores.argmax(axis=1) 96 | label_true = labels_val.argmax(axis=1) 97 | label_predict = label_predict[label_predict == label_true] 98 | label_predict = label_predict[label_predict == 0] 99 | label_true = label_true[label_true == 0] 100 | num_samples += label_true.shape[0] 101 | num_correct += label_predict.shape[0] 102 | accuracy = float(num_correct) / num_samples 103 | print("Iteration {}, total positive samples: {}, " 104 | "correct samples: {}, accuracy: {}".format(iter_epoch, num_samples, num_correct, accuracy)) 105 | 106 | def train(self, use_gpu=False, train_data=None, validate_data=None, 107 | lr=0.001, b1=0.9, b2=0.999, eps=1e-08, num_epoch=10, check_epoch=200, save_epoch=1000): 108 | device = '/device:GPU:0' if use_gpu else '/cpu:0' 109 | with tf.device(device): 110 | container = tf.contrib.eager.EagerVariableStore() 111 | check_point = tf.contrib.eager.Checkpointable() 112 | iter_epoch = 0 113 | for epoch in range(num_epoch): 114 | print("Start epoch %d" % epoch) 115 | for items_tr, nodes_tr, is_leafs_tr, labels_tr in train_data: 116 | with tf.GradientTape() as tape: 117 | with container.as_default(): 118 | scores = self._network_structure(items_tr, nodes_tr, is_leafs_tr, 1) 119 | loss = tf.nn.softmax_cross_entropy_with_logits_v2(labels=labels_tr, logits=scores) 120 | loss = tf.reduce_sum(loss) 121 | print("Epoch {}, Iteration {}, loss {}".format(epoch, iter_epoch, loss)) 122 | gradients = tape.gradient(loss, container.trainable_variables()) 123 | optimizer = tf.train.AdamOptimizer(learning_rate=lr, beta1=b1, beta2=b2, epsilon=eps) 124 | optimizer.apply_gradients(zip(gradients, container.trainable_variables())) 125 | if iter_epoch % check_epoch == 0: 126 | self._check_accuracy(iter_epoch, validate_data, 0) 127 | if iter_epoch % save_epoch == 0: 128 | for k, v in container._store._vars.items(): 129 | setattr(check_point, k, v) 130 | self.saver = tf.train.Checkpoint(checkpointable=check_point) 131 | self.saver.save(MODEL_NAME) 132 | iter_epoch += 1 133 | print("It's completed to train the network.") 134 | 135 | def get_embeddings(self, item_list, use_gpu=True): 136 | """ 137 | TODO: validate and optimize 138 | """ 139 | model_path = tf.train.latest_checkpoint(MODEL_DIR + '/models/') 140 | self.saver.restore(model_path) 141 | device = '/device:GPU:0' if use_gpu else '/cpu:0' 142 | with tf.device(device): 143 | item_embeddings = tf.nn.embedding_lookup(self.item_embeddings, np.array(item_list)) 144 | res = item_embeddings.numpy() 145 | return res.tolist() 146 | 147 | def predict(self, data, use_gpu=True): 148 | """ 149 | TODO: validate and optimize 150 | """ 151 | model_path = tf.train.latest_checkpoint(MODEL_DIR+'/models/') 152 | self.saver.restore(model_path) 153 | device = '/device:GPU:0' if use_gpu else '/cpu:0' 154 | with tf.device(device): 155 | items, nodes, is_leafs = data 156 | scores = self._network_structure(items, nodes, is_leafs, 0) 157 | scores = scores.numpy() 158 | return scores[:, 0] 159 | 160 | -------------------------------------------------------------------------------- /prediction.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def 
candidates_generator(state, root, k, model): 5 | """layer-wise retrieval algorithm in prediction.""" 6 | Q, A = [root], [] 7 | while Q: 8 | for node in Q: 9 | if node.item_id is not None: 10 | A.append(node) 11 | Q.remove(node) 12 | probs = [] 13 | for node in Q: 14 | data = state + (np.array([[node.val]]), np.array([[0]])) 15 | prob = model.predict(data) 16 | probs.append(prob[0]) 17 | prob_list = list(zip(Q, probs)) 18 | prob_list = sorted(prob_list, key=lambda x: x[1], reverse=True) 19 | I = [] 20 | if len(prob_list) > k: 21 | for i in range(k): 22 | I.append(prob_list[i][0]) 23 | else: 24 | for p in prob_list: 25 | I.append(p[0]) 26 | Q = [] 27 | while I: 28 | node = I.pop() 29 | if node.left: 30 | Q.append(node.left) 31 | if node.right: 32 | Q.append(node.right) 33 | probs = [] 34 | for leaf in A: 35 | data = state + (np.array([[leaf.item_id]]), np.array([[1]])) 36 | prob = model.predict(data) 37 | probs.append(prob[0]) 38 | prob_list = list(zip(A, probs)) 39 | prob_list = sorted(prob_list, key=lambda x: x[1], reverse=True) 40 | A = [] 41 | for i in range(k): 42 | A.append(prob_list[i][0].item_id) 43 | return A 44 | 45 | 46 | def metrics_count(data, root, k, model): 47 | """Recall/Precision/F-measure statistic.""" 48 | precision_rate, recall_rate, fm_rate, novelty_rate, num = 0, 0, 0, 0, 0 49 | for items in data: 50 | size = items.shape[0] 51 | for i in range(size): 52 | cands = candidates_generator((items[i][None, :],), root, k, model) 53 | item_clip = list(set(items[i][items[i] != -2].tolist())) 54 | m, g = len(cands), len(item_clip) 55 | for item in item_clip: 56 | if item in cands: 57 | cands.remove(item) 58 | n = len(cands) 59 | p_rate, r_rate, n_rate = float(m - n) / m, float(m - n) / g, float(n) / k 60 | f_rate = (2 * p_rate * r_rate) / (p_rate + r_rate) 61 | precision_rate += p_rate 62 | recall_rate += r_rate 63 | fm_rate += f_rate 64 | novelty_rate += n_rate 65 | num += 1 66 | precision_rate = float(precision_rate * 100) / num 67 | recall_rate = float(recall_rate * 100) / num 68 | fm_rate = float(fm_rate * 100) / num 69 | novelty_rate = float(novelty_rate * 100) / num 70 | print("================================= Performance Statistic =================================") 71 | print("Precision rate: {:.2f}% | Recall rate: {:.2f}% | " 72 | "F-Measure rate: {:.2f}% | Novelty rate: {:.2f}%".format(precision_rate, recall_rate, fm_rate, novelty_rate)) 73 | 74 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==0.7.0 2 | astor==0.7.1 3 | gast==0.2.2 4 | grpcio==1.18.0 5 | h5py==2.9.0 6 | Keras-Applications==1.0.7 7 | Keras-Preprocessing==1.0.9 8 | Markdown==3.0.1 9 | numpy==1.16.1 10 | pandas==0.24.1 11 | protobuf==3.6.1 12 | python-dateutil==2.8.0 13 | pytz==2018.9 14 | six==1.12.0 15 | tensorboard==1.12.2 16 | tensorflow-gpu==1.12.0 17 | termcolor==1.1.0 18 | Werkzeug==0.14.1 19 | -------------------------------------------------------------------------------- /sample_init.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import random 4 | import multiprocessing as mp 5 | import pandas as pd 6 | import numpy as np 7 | from construct_tree import TreeInitialize 8 | 9 | 10 | LOAD_DIR = os.path.dirname(os.path.abspath(__file__)) + '/datasets/UserBehavior_sp.csv' 11 | 12 | 13 | def _time_window_stamp(): 14 | boundaries = ['2017-11-26 00:00:00', '2017-11-27 00:00:00', 
'2017-11-28 00:00:00', 15 | '2017-11-29 00:00:00', '2017-11-30 00:00:00', '2017-12-01 00:00:00', 16 | '2017-12-02 00:00:00', '2017-12-03 00:00:00', '2017-12-04 00:00:00'] 17 | for i in range(len(boundaries)): 18 | time_array = time.strptime(boundaries[i], "%Y-%m-%d %H:%M:%S") 19 | time_stamp = int(time.mktime(time_array)) 20 | boundaries[i] = time_stamp 21 | return boundaries 22 | 23 | 24 | def _time_converter(x, boundaries): 25 | tag = -1 26 | if x > boundaries[-1]: 27 | tag = 9 28 | else: 29 | for i in range(len(boundaries)): 30 | if x <= boundaries[i]: 31 | tag = i 32 | break 33 | return tag 34 | 35 | 36 | def _mask_padding(data, max_len): 37 | size = data.shape[0] 38 | raw = data.values 39 | mask = np.array([[-2] * max_len for _ in range(size)]) 40 | for i in range(size): 41 | mask[i, :len(raw[i])] = raw[i] 42 | return mask.tolist() 43 | 44 | 45 | def data_process(): 46 | """convert and split the raw data.""" 47 | data_raw = pd.read_csv(LOAD_DIR, header=None, 48 | names=['user_ID', 'item_ID', 'category_ID', 'behavior_type', 'timestamp']) 49 | data_raw = data_raw[:10000] 50 | user_list = data_raw.user_ID.drop_duplicates().to_list() 51 | user_dict = dict(zip(user_list, range(len(user_list)))) 52 | data_raw['user_ID'] = data_raw.user_ID.apply(lambda x: user_dict[x]) 53 | item_list = data_raw.item_ID.drop_duplicates().to_list() 54 | item_dict = dict(zip(item_list, range(len(item_list)))) 55 | data_raw['item_ID'] = data_raw.item_ID.apply(lambda x: item_dict[x]) 56 | category_list = data_raw.category_ID.drop_duplicates().to_list() 57 | category_dict = dict(zip(category_list, range(len(category_list)))) 58 | data_raw['category_ID'] = data_raw.category_ID.apply(lambda x: category_dict[x]) 59 | behavior_dict = dict(zip(['pv', 'buy', 'cart', 'fav'], range(4))) 60 | data_raw['behavior_type'] = data_raw.behavior_type.apply(lambda x: behavior_dict[x]) 61 | time_window = _time_window_stamp() 62 | data_raw['timestamp'] = data_raw.timestamp.apply(_time_converter, args=(time_window,)) 63 | random_tree = TreeInitialize(data_raw) 64 | _ = random_tree.random_binary_tree() 65 | data = data_raw.groupby(['user_ID', 'timestamp'])['item_ID'].apply(list).reset_index() 66 | data['behaviors'] = data_raw.groupby(['user_ID', 67 | 'timestamp'])['behavior_type'].apply(list).reset_index()['behavior_type'] 68 | data['behavior_num'] = data.behaviors.apply(lambda x: len(x)) 69 | mask_length = data.behavior_num.max() 70 | data = data[data.behavior_num >= 10] 71 | data = data.drop(columns=['behavior_num']) 72 | data['item_ID'] = _mask_padding(data['item_ID'], mask_length) 73 | data['behaviors'] = _mask_padding(data['behaviors'], mask_length) 74 | data_train, data_validate, data_test = data[:-200], data[-200:-100], data[-100:] 75 | cache = (user_dict, item_dict, behavior_dict, random_tree) 76 | return data_train, data_validate.reset_index(drop=True), data_test.reset_index(drop=True), cache 77 | 78 | 79 | def _single_node_sample(item_id, node, root): 80 | sample_num = 200 81 | samples = [] 82 | positive_info = {} 83 | i = 0 84 | while node: 85 | if node.item_id is None: 86 | single_sample = [item_id, node.val, 0, [1, 0]] 87 | else: 88 | single_sample = [item_id, node.item_id, 1, [1, 0]] 89 | samples.append(single_sample) 90 | positive_info[i] = node 91 | node = node.parent 92 | i += 1 93 | j, k = i-1, 0 94 | level_nodes = [root] 95 | while level_nodes: 96 | tmp = [] 97 | for node in level_nodes: 98 | if node.left: 99 | tmp.append(node.left) 100 | if node.right: 101 | tmp.append(node.right) 102 | if j >= 0: 103 | 
level_nodes.remove(positive_info[j]) 104 | if level_nodes: 105 | if len(level_nodes) <= 2*k: 106 | index_list = range(len(level_nodes)) 107 | sample_num -= len(level_nodes) 108 | else: 109 | index_list = random.sample(range(len(level_nodes)), 2*k) 110 | sample_num -= 2*k 111 | if j == 0: 112 | index_list = random.sample(range(len(level_nodes)), sample_num + 2*k) 113 | for level_index in index_list: 114 | if level_nodes[level_index].item_id is None: 115 | single_sample = [item_id, level_nodes[level_index].val, 0, [0, 1]] 116 | else: 117 | single_sample = [item_id, level_nodes[level_index].item_id, 1, [0, 1]] 118 | samples.append(single_sample) 119 | level_nodes = tmp 120 | k += 1 121 | j -= 1 122 | samples = pd.DataFrame(samples, columns=['item_ID', 'node', 'is_leaf', 'label']) 123 | return samples 124 | 125 | 126 | def _tree_generate_worker(task_queue, sample_queue): 127 | while True: 128 | try: 129 | item_id, node, root = task_queue.get() 130 | node_sample = _single_node_sample(item_id, node, root) 131 | sample_queue.put(node_sample) 132 | except Exception as err: 133 | print("Tree Worker Process Exception Info: {}".format(str(err))) 134 | finally: 135 | task_queue.task_done() 136 | 137 | 138 | def tree_generate_samples(items, leaf_dict, root): 139 | """Sample based on the constructed tree with multiprocess.""" 140 | jobs = mp.JoinableQueue() 141 | tree_samples = mp.Queue() 142 | for _ in range(8): 143 | process = mp.Process(target=_tree_generate_worker, args=(jobs, tree_samples)) 144 | process.daemon = True 145 | process.start() 146 | total_samples = None 147 | for i in range(0, len(items), 50): 148 | sub_items = items[i:i+50] 149 | for item in sub_items: 150 | jobs.put((item, leaf_dict[item], root)) 151 | jobs.join() 152 | batch_samples = [] 153 | while not tree_samples.empty(): 154 | tree_sample = tree_samples.get_nowait() 155 | batch_samples.append(tree_sample) 156 | if total_samples is None: 157 | total_samples = pd.concat(batch_samples, ignore_index=True) 158 | else: 159 | batch_samples = pd.concat(batch_samples, ignore_index=True) 160 | total_samples = pd.concat([total_samples, batch_samples], ignore_index=True) 161 | return total_samples 162 | 163 | 164 | def _single_data_merge(data, tree_data): 165 | complete_data = None 166 | item_ids = np.array(data.item_ID) 167 | item_ids = item_ids[item_ids != -2] 168 | for item in item_ids: 169 | samples_tree_item = tree_data[tree_data.item_ID == item][['node', 'is_leaf', 'label']].reset_index(drop=True) 170 | size = samples_tree_item.shape[0] 171 | data_extend = pd.concat([data] * size, axis=1, ignore_index=True).T 172 | data_item = pd.concat([data_extend, samples_tree_item], axis=1) 173 | if complete_data is None: 174 | complete_data = data_item 175 | else: 176 | complete_data = pd.concat([complete_data, data_item], axis=0, ignore_index=True) 177 | return complete_data 178 | 179 | 180 | def _merge_generate_worker(tree_data, task_queue, sample_queue): 181 | while True: 182 | try: 183 | data_row = task_queue.get() 184 | complete_sample = _single_data_merge(data_row, tree_data) 185 | sample_queue.put(complete_sample) 186 | except Exception as err: 187 | print("Merge Worker Process Exception Info: {}".format(str(err))) 188 | finally: 189 | task_queue.task_done() 190 | 191 | 192 | def merge_samples(data, tree_sample): 193 | """combine the preprocessed samples and tree samples.""" 194 | jobs = mp.JoinableQueue() 195 | complete_samples = mp.Queue() 196 | for _ in range(8): 197 | process = mp.Process(target=_merge_generate_worker, 
args=(tree_sample, jobs, complete_samples)) 198 | process.daemon = True 199 | process.start() 200 | data_complete = None 201 | train_size = data.shape[0] 202 | for i in range(0, train_size, 50): 203 | for _ in range(50): 204 | if i == train_size: 205 | break 206 | jobs.put(data.iloc[i]) 207 | i += 1 208 | jobs.join() 209 | batch_samples = [] 210 | while not complete_samples.empty(): 211 | single_data_sample = complete_samples.get_nowait() 212 | batch_samples.append(single_data_sample) 213 | if data_complete is None: 214 | data_complete = pd.concat(batch_samples, ignore_index=True) 215 | else: 216 | batch_samples = pd.concat(batch_samples, ignore_index=True) 217 | data_complete = pd.concat([data_complete, batch_samples], ignore_index=True) 218 | return data_complete 219 | 220 | 221 | class Dataset(object): 222 | """construct the dataset iterator.""" 223 | def __init__(self, data, batch_size, shuffle=False): 224 | self.data = data 225 | self.batch_size = batch_size 226 | self.shuffle = shuffle 227 | 228 | def __iter__(self): 229 | self.data = self.data.drop(columns=['user_ID', 'timestamp']) 230 | N, B = self.data.shape[0], self.batch_size 231 | idxs = np.arange(N) 232 | if self.shuffle: 233 | np.random.shuffle(idxs) 234 | if self.data.shape[1] > 2: 235 | return ((np.array(self.data.loc[idxs[i:i+B], 'item_ID'].tolist()), 236 | self.data.loc[idxs[i:i+B], 'node'].values[:, None], 237 | self.data.loc[idxs[i:i+B], 'is_leaf'].values[:, None], 238 | np.array(self.data.loc[idxs[i:i+B], 'label'].tolist())) for i in range(0, N, B)) 239 | else: 240 | return (np.array(self.data.loc[idxs[i:i+B], 'item_ID'].tolist()) for i in range(0, N, B)) 241 | 242 | 243 | if __name__ == '__main__': 244 | data_train, data_validate, data_test, cache = data_process() 245 | user_dict, item_dict, _, tree = cache 246 | items = tree.items 247 | total_samples = tree_generate_samples(items, tree.leaf_dict, tree.root) 248 | data_complete = merge_samples(data_train, total_samples) 249 | dtrain = Dataset(data_complete, 50, shuffle=True) 250 | -------------------------------------------------------------------------------- /tdm.py: -------------------------------------------------------------------------------- 1 | from sample_init import data_process, tree_generate_samples, merge_samples, Dataset 2 | from deep_network import NeuralNet 3 | from prediction import metrics_count 4 | from construct_tree import TreeLearning 5 | 6 | 7 | def main(): 8 | data_train, data_val, data_test, cache = data_process() 9 | _, _, _, tree = cache 10 | item_ids, item_size = tree.items, len(tree.items) 11 | model = None 12 | num_epoch = 20 13 | while num_epoch > 0: 14 | tree_samples = tree_generate_samples(item_ids, tree.leaf_dict, tree.root) 15 | tdata, vdata = merge_samples(data_train, tree_samples), merge_samples(data_val, tree_samples) 16 | dtrain, vtrain = Dataset(tdata, 50, shuffle=True), Dataset(vdata, 50) 17 | vtest = Dataset(data_val, 50) 18 | model = NeuralNet(item_size, tree.node_size, 24) 19 | model.train(use_gpu=True, 20 | train_data=dtrain, 21 | validate_data=vtrain) 22 | metrics_count(vtest, tree.root, 10, model) 23 | num_epoch -= 1 24 | if num_epoch > 0: 25 | item_embeddings = model.get_embeddings(item_ids) 26 | tree = TreeLearning(item_embeddings, item_ids) 27 | _ = tree.clustering_binary_tree() 28 | dtest = Dataset(data_test, 100) 29 | metrics_count(dtest, tree.root, 150, model) 30 | print("========================================== end ==========================================") 31 | 32 | 33 | if __name__ == '__main__': 34 | main() 
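To make the input of `NeuralNet.train` concrete, here is a small, hypothetical illustration of the merged samples that `Dataset` (sample_init.py) iterates over in the loop above. All rows and values are invented; real frames come out of `merge_samples` and also carry a `behaviors` column, which `Dataset` does not read.

```python
import pandas as pd

from sample_init import Dataset

# Invented merged samples: a padded user behaviour sequence (padding value -2),
# a tree node, a leaf flag and a one-hot label, mirroring merge_samples output.
toy = pd.DataFrame({
    'user_ID':   [0, 0, 1, 1],
    'timestamp': [3, 3, 5, 5],
    'item_ID':   [[1, 4, -2, -2], [1, 4, -2, -2], [2, 3, 5, -2], [2, 3, 5, -2]],
    'node':      [7, 12, 3, 9],
    'is_leaf':   [0, 1, 0, 1],
    'label':     [[1, 0], [0, 1], [1, 0], [0, 1]],
})

for items, nodes, is_leafs, labels in Dataset(toy, 2):
    print(items.shape, nodes.shape, is_leafs.shape, labels.shape)
    # -> (2, 4) (2, 1) (2, 1) (2, 2) for each of the two batches
```

Each batch is a 4-tuple of numpy arrays, which is what `_attention_block` and the softmax cross-entropy loss in deep_network.py consume.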
35 | 36 | --------------------------------------------------------------------------------
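Finally, a simplified, model-free sketch of the layer-wise retrieval implemented by `candidates_generator` in prediction.py (the beam search of [1]). The `score` function below is a made-up stand-in for `model.predict`, and the toy tree is built from invented item/category IDs; unlike the original loop, it filters leaves out of the frontier with list comprehensions instead of removing elements from the list being iterated over.

```python
import random

import pandas as pd

from construct_tree import TreeInitialize


def layerwise_retrieval(root, k, score):
    """Beam search down the tree: keep the top-k scored nodes per level, expand
    their children, and finally re-rank the leaves collected along the way."""
    candidates, leaves = [root], []
    while candidates:
        leaves += [n for n in candidates if n.item_id is not None]   # finished leaves
        frontier = [n for n in candidates if n.item_id is None]      # still expandable
        top = sorted(frontier, key=score, reverse=True)[:k]
        candidates = [c for n in top for c in (n.left, n.right) if c is not None]
    best = sorted(leaves, key=score, reverse=True)[:k]
    return [leaf.item_id for leaf in best]


toy = pd.DataFrame({'item_ID': list(range(8)), 'category_ID': [0, 0, 0, 0, 1, 1, 1, 1]})
tree = TreeInitialize(toy)
root = tree.random_binary_tree()
print(layerwise_retrieval(root, k=3, score=lambda node: random.random()))  # 3 item IDs
```

In the real predictor the per-node score is `model.predict` on the user state plus the node, and the collected leaves are rescored with `is_leaf=1` before the final top-k items are returned.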