├── README.md
├── data
│   ├── graph_node
│   ├── side_info_feature
│   └── walk_seq
├── eges_multigpu.py
├── preprocess.py
└── utils.py

/README.md:
--------------------------------------------------------------------------------
A simple implementation of EGES from the paper "Billion-scale Commodity Embedding for E-commerce Recommendation in Alibaba", adapted from two existing open-source implementations (credited below). It runs on a single machine with multiple GPUs.

Training pipeline:

1. Count the co-occurrences of each item pair in the click sequences and save them as data/graph_node (a minimal sketch of this step is given below).

2. Generate the side info for each item and save it as data/side_info_feature.

3. Run preprocess.py to generate sequences by random walks on the graph and save them as data/walk_seq (preprocess.py also reads a data/id_mapping file that maps graph node ids to the integer item ids used for training).

4. Run eges_multigpu.py to train and dump the item embeddings to data/item_embeddings (see the loading example below).


- EGES code reference: https://github.com/wangzhegeek/EGES
- Single-machine multi-GPU code reference: https://github.com/lomyal/simple-word-embedding
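
Step 1 is not implemented in this repo. The sketch below shows one way the `in_node,out_node,weight` lines that preprocess.py expects in data/graph_node could be produced; the input file `data/click_sessions` and its format (one space-separated click session per line) are assumptions for illustration only, not part of the repo.

```python
from collections import Counter


def build_graph_node(session_file="data/click_sessions", output_file="data/graph_node"):
    """Count directed co-occurrences of consecutive clicks within each session."""
    pair_count = Counter()
    with open(session_file) as f:
        for line in f:
            items = line.strip().split(" ")
            for a, b in zip(items, items[1:]):
                if a != b:  # ignore repeated clicks on the same item
                    pair_count[(a, b)] += 1
    # write the "in_node,out_node,weight" format read by create_graph() in preprocess.py
    with open(output_file, "w") as f:
        for (a, b), weight in pair_count.items():
            f.write("{0},{1},{2}\n".format(a, b, weight))
```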
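
After training, eges_multigpu.py writes one `item_id v1 v2 ... v128` line per item to data/item_embeddings (see dump_embedding()). Below is a minimal sketch of loading that file and querying similar items by cosine similarity; note the query is addressed by row index in the loaded arrays, not by item id.

```python
import numpy as np


def load_item_embeddings(path="data/item_embeddings"):
    ids, vectors = [], []
    with open(path) as f:
        for line in f:
            parts = line.strip().split(" ")
            ids.append(int(parts[0]))
            vectors.append([float(x) for x in parts[1:]])
    return np.array(ids), np.array(vectors, dtype=np.float32)


ids, vectors = load_item_embeddings()
normed = vectors / np.linalg.norm(vectors, axis=1, keepdims=True)
query_row = 0  # row index into the loaded arrays, not an item id
scores = normed @ normed[query_row]
print(ids[np.argsort(-scores)[:10]])  # ten most similar items (the query itself ranks first)
```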
--------------------------------------------------------------------------------
/data/graph_node:
--------------------------------------------------------------------------------
1,2,2
2,6,2
2,3,3
5,10,3
5,21,2
6,26,3
10,11,2
10,23,2
13,21,2
13,30,3
--------------------------------------------------------------------------------
/data/side_info_feature:
--------------------------------------------------------------------------------
id0 cate1 shop3 price1 city1 brand1
id1 cate1 shop1 price1 city1 brand2
id2 cate2 shop3 price1 city2 brand2
id3 cate3 shop2 price1 city3 brand2
id4 cate1 shop2 price2 city4 brand3
id5 cate1 shop3 price2 city4 brand3
--------------------------------------------------------------------------------
/data/walk_seq:
--------------------------------------------------------------------------------
36073 35986 36145 36064 36071 36143 36539 36441 36540 36433 36413 36544 36358 36397 36353 36456 36353 36480 36472 36418 36381 36360 36560 36356 36409 22153 22189 35974 71804 71982 71433 87154 87277 87065 87089 87086 87164 87063 87205 87240
187301 187342 187343 187292 187300 187299 187291 187274
178479 178486 178255 178475 178386 178531 185661 185721 185717 185731 185678 185711 185650 185711 185670 185644 185670 185689 185696 185612 185669 185600 185679 185580 185579 185641 185670 185644 185653 185604 185642 185645 185660 185604 185630 185647 185650 185671
235533 235593 235661 235565 235547 235544 235570 235555 235563 235555 235554 235624
125346 125373 125396 125434 125465 47225 47171 47190 47193 47207 47064 47102 47073 47060 47086 47116 47104 47105 47127 47045 47048 47064 47059 47131 47120 47095 47057 47165 47134 47143 47067 47122 47066 47119 47158 47053 47225 47191 47265 47201
132984 132959 132934 132945 132943 132964 132966 132956 132969 132984 132943 132964 132966 132956 132957
136098 136094 136095 136097 136098 136094 136095 136097 136098 136094 136095 136097 136098 136099 136102 136105
66158 66068 66185 66076 66161 66162 66158 66002 66039 66027 66167 66184 66180 66182 66183 66003 66099 66128
60523 60433 60509
44550 44384 44390 44391 44387 44419 44423
--------------------------------------------------------------------------------
/eges_multigpu.py:
--------------------------------------------------------------------------------
import tensorflow.compat.v1 as tf
import numpy as np
import collections
import random
import math
import datetime

data = []
# side_info must already be label-encoded to integers: one row per item,
# item id first, then the encoded feature ids. (The sample data/side_info_feature
# file shows the raw string form before encoding.)
side_info = np.loadtxt('./data/side_info_feature', dtype=int)
item_size, feature_size = side_info.shape
embedding_size = 128
n_sampled = 50
num_gpus = 2
batch_size = 256
num_steps = 200001  # data_size / batch_size * n_epoch
every_k_step = 5000
num_skips = 4  # batch_size % num_skips == 0
window_size = 4
tf.disable_eager_execution()

item_set = set()


def read_data(filename):
    global item_set
    with open(filename) as f:
        for line in f.readlines():
            line = line.strip().split(' ')
            # walk_seq stores integer item ids; keep them as ints so they can
            # feed the int32 placeholders and index side_info
            data.extend(int(x) for x in line)
    item_set = set(data)
    return data


data_index = 0


def generate_batch(batch_size):
    global data_index
    batch = np.ndarray(shape=(batch_size,), dtype=np.int32)
    label = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * window_size + 1
    buffer = collections.deque(maxlen=span)
    if data_index + span > len(data):
        data_index = 0

    buffer.extend(data[data_index: data_index + span])
    data_index += span
    for i in range(batch_size // num_skips):
        tgt = window_size
        visited_tgt = [tgt]
        for j in range(num_skips):
            # sample num_skips distinct context positions around the window center
            while tgt in visited_tgt:
                tgt = random.randint(0, span - 1)
            visited_tgt.append(tgt)
            batch[i * num_skips + j] = buffer[window_size]
            label[i * num_skips + j, 0] = buffer[tgt]
        if data_index == len(data):
            # wrap around to the beginning of the corpus
            buffer.extend(data[0:span])
            data_index = span
        else:
            buffer.append(data[data_index])
            data_index += 1
    data_index = (data_index + len(data) - span) % len(data)
    return batch, label


def _variable_on_cpu(name, shape, initializer, dtype=np.float32):
    with tf.device('/cpu:0'):
        var = tf.get_variable(name, shape, initializer=initializer, dtype=dtype)
    return var


def tower_loss(scope, inputs, labels):
    embedding_list = []

    # one embedding table per side-info field; inputs are item ids, which are
    # assumed to equal the item's row index in side_info
    for i in range(feature_size):
        embedding = _variable_on_cpu('side_info_{0}_embeddings'.format(i), [max(side_info[:, i]) + 1, embedding_size],
                                     tf.random_uniform_initializer(-1.0, 1.0))
        side_info_index = tf.nn.embedding_lookup(side_info[:, i], inputs)
        side_info_embed = tf.nn.embedding_lookup(embedding, tf.cast(side_info_index[:], dtype=tf.int32))
        embedding_list.append(side_info_embed)

    # EGES attention: a softmax over per-item weights, one weight per side-info field
    alpha_embedding = _variable_on_cpu('alpha_embeddings', [item_size, feature_size],
                                       tf.random_uniform_initializer(-1.0, 1.0))
    stacked_embed = tf.stack(embedding_list, axis=-1)
    alpha_index = tf.nn.embedding_lookup(side_info[:, 0], inputs)
    alpha_embed = tf.nn.embedding_lookup(alpha_embedding, alpha_index)
    alpha_embed_expand = tf.expand_dims(alpha_embed, 1)
    alpha_i_sum = tf.reduce_sum(tf.exp(alpha_embed_expand), axis=-1)
    merge_embedding = tf.reduce_sum(stacked_embed * tf.exp(alpha_embed_expand), axis=-1) / alpha_i_sum

    ''' cold start item
    stacked_embed = tf.stack(embedding_list[1:], axis=-1)
    alpha_index = tf.nn.embedding_lookup(side_info[:, 1], inputs)
    alpha_embed = tf.nn.embedding_lookup(alpha_embedding, alpha_index[:])
    alpha_embed_expand = tf.expand_dims(alpha_embed, 1)
    alpha_i_sum = tf.reduce_sum(tf.exp(alpha_embed_expand), axis=-1)
    merge_embedding = tf.reduce_sum(stacked_embed * tf.exp(alpha_embed_expand), axis=-1) / alpha_i_sum
    cold_start_embedding = tf.reduce_sum(stacked_embed * tf.exp(alpha_embed_expand), axis=-1) / alpha_i_sum
    '''
    weights = _variable_on_cpu('w', [item_size, embedding_size],
                               tf.truncated_normal_initializer(stddev=1.0 / math.sqrt(embedding_size)))
    biases = _variable_on_cpu('b', [item_size], tf.zeros_initializer())
    loss = tf.reduce_mean(tf.nn.nce_loss(
        weights=weights,
        biases=biases,
        labels=labels,
        inputs=merge_embedding,
        num_sampled=n_sampled,
        num_classes=item_size
    ))
    return loss, merge_embedding


def average_gradient(tower_grads):
    # every gradient here comes from an embedding lookup, so it arrives as
    # tf.IndexedSlices; average by concatenating slices scaled by 1 / num_gpus
    avg_grads = []
    for grads_vars in zip(*tower_grads):
        values = tf.concat([g.values / num_gpus for g, _ in grads_vars], 0)
        indices = tf.concat([g.indices for g, _ in grads_vars], 0)
        grad = tf.IndexedSlices(values, indices)

        var = grads_vars[0][1]
        cur_grad_and_var = (grad, var)
        avg_grads.append(cur_grad_and_var)
    return avg_grads


def get_final_embedding():
    # pad the item list so it splits into full batches, then drop the padding
    remain = item_size % batch_size
    pad = (batch_size - remain) % batch_size
    cnt = (item_size + pad) // batch_size
    final_embedding = {}
    all_item = side_info[:, 0]
    all_item = np.concatenate([all_item, [0] * pad], axis=0)

    for i in range(cnt):
        eval_input = all_item[i * batch_size: (i + 1) * batch_size]
        eval_label = np.zeros((batch_size, 1), dtype=np.int32)
        eval_embedding = sess.run(merged_embedding, feed_dict={train_input: eval_input, train_label: eval_label})
        # for cold start item
        # cold_start_embedding = sess.run(cold_start_embedding, feed_dict={train_input: eval_input, train_label: eval_label})
        eval_embedding = eval_embedding.tolist()
        if i == cnt - 1 and pad > 0:
            eval_embedding = eval_embedding[:-pad]
        final_embedding.update({all_item[i * batch_size + k]: eval_embedding[k] for k in range(len(eval_embedding))})
    dump_embedding(final_embedding, 'data/item_embeddings')


def dump_embedding(embedding_result, output_file):
    with open(output_file, 'w') as f:
        for k, v in embedding_result.items():
            f.write("{0} {1}\n".format(k, " ".join(map(str, v))))


if __name__ == '__main__':
    d = read_data('data/walk_seq')

    graph = tf.Graph()
    with graph.as_default(), tf.device('/cpu:0'):
        train_input = tf.placeholder(tf.int32, shape=[batch_size])
        train_label = tf.placeholder(tf.int32, shape=[batch_size, 1])

        train_opt = tf.train.GradientDescentOptimizer(1.0)
        # train_opt = tf.train.AdamOptimizer(1.0)

        tower_grads = []
        tower_embeddings = []
        batch_size_gpu = batch_size // num_gpus
        with tf.variable_scope(tf.get_variable_scope()):
            for i in range(num_gpus):
                with tf.device('/gpu:{0}'.format(i)):
                    with tf.name_scope('tower_{0}'.format(i)) as scope:
                        train_input_gpu = tf.slice(train_input, [i * batch_size_gpu], [batch_size_gpu])
                        train_label_gpu = tf.slice(train_label, [i * batch_size_gpu, 0], [batch_size_gpu, 1])

                        loss, tower_embedding = tower_loss(scope, train_input_gpu, train_label_gpu)
                        tower_embeddings.append(tower_embedding)
                        tf.get_variable_scope().reuse_variables()

                        grads = train_opt.compute_gradients(loss)
                        tower_grads.append(grads)

        # concatenate the per-tower outputs so merged_embedding covers the whole batch
        merged_embedding = tf.concat(tower_embeddings, axis=0)
        grads = average_gradient(tower_grads)
        apply_gradient_op = train_opt.apply_gradients(grads)

        init = tf.global_variables_initializer()

        config = tf.ConfigProto(allow_soft_placement=True)
        config.gpu_options.allow_growth = True

        with tf.Session(graph=graph, config=config) as sess:
            start_time = datetime.datetime.now()
            init.run()
            print('Init finished')
            saver = tf.train.Saver(max_to_keep=4)

            avg_loss = 0
            for step in range(1, num_steps):
                batch_input, batch_label = generate_batch(batch_size)
                feed_dict = {train_input: batch_input, train_label: batch_label}
                _, loss_val = sess.run([apply_gradient_op, loss], feed_dict=feed_dict)

                avg_loss += loss_val

                if step % every_k_step == 0:
                    end_time = datetime.datetime.now()
                    avg_loss /= every_k_step
                    print("step: {0}, loss: {1}, time: {2}s".format(step, avg_loss, (end_time - start_time).seconds))
                    avg_loss = 0
                    start_time = datetime.datetime.now()

            get_final_embedding()
--------------------------------------------------------------------------------
/preprocess.py:
--------------------------------------------------------------------------------
import networkx as nx
import random
from joblib import Parallel, delayed
import time
from itertools import chain
import os

DATA_PATH = "data"
BATCH = 100000


def partition_num(num, workers):
    if num % workers == 0:
        return [num // workers] * workers
    else:
        return [num // workers] * workers + [num % workers]


def create_alias_table(area_ratio):
    """Build an alias table for O(1) sampling.

    :param area_ratio: probabilities rescaled so that they average 1,
        i.e. sum(area_ratio) == len(area_ratio), as produced in
        build_trans_prob / get_alias_edges
    :return: accept, alias
    """
    l = len(area_ratio)
    accept, alias = [0] * l, [0] * l
    small, large = [], []

    for i, prob in enumerate(area_ratio):
        if prob < 1.0:
            small.append(i)
        else:
            large.append(i)

    while small and large:
        small_idx, large_idx = small.pop(), large.pop()
        accept[small_idx] = area_ratio[small_idx]
        alias[small_idx] = large_idx
        area_ratio[large_idx] = area_ratio[large_idx] - (1 - area_ratio[small_idx])
        if area_ratio[large_idx] < 1.0:
            small.append(large_idx)
        else:
            large.append(large_idx)

    while large:
        large_idx = large.pop()
        accept[large_idx] = 1
    while small:
        small_idx = small.pop()
        accept[small_idx] = 1

    return accept, alias


def dump_seq(walks, id_mapping):
    with open(os.path.join(DATA_PATH, "walk_seq"), "w") as f:
        for line in walks:
            f.write("{0}\n".format(" ".join(map(lambda x: str(id_mapping[x]), line))))


def get_all_pairs(walks, id_mapping, window_size):
    all_pairs = []
    cnt = 0
    side_info = []
    with open(os.path.join(DATA_PATH, "side_info_feature")) as f:
        for line in f.readlines():
            line = line.strip().split("\t")
            side_info.append(line)

    for k in range(len(walks)):
        for i in range(len(walks[k])):
            # symmetric context window around position i
            for j in range(i - window_size, i + window_size + 1):
                if i == j or j < 0 or j >= len(walks[k]):
                    continue
                else:
                    line = [id_mapping[walks[k][i]]]
                    line.extend(side_info[id_mapping[walks[k][i]] - 1])
                    line.append(id_mapping[walks[k][j]])
                    all_pairs.append(line)

                if len(all_pairs) == 0:
                    return
                # flush to disk every BATCH pairs to keep memory bounded
                if len(all_pairs) % BATCH == 0:
                    with open(os.path.join(DATA_PATH, "all_pairs"), "a") as f:
                        for line in all_pairs:
                            f.write("{0}\n".format("\t".join(list(map(lambda x: str(x), line)))))
                    cnt += 1
                    print("{0} lines done. {1}".format(BATCH * cnt,
                                                       time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))))
                    all_pairs = []


class RandomWalker:
    def __init__(self, g):
        self.g = g
        self.alias_nodes = {}
        self.alias_edges = {}

    def deepwalk(self, start, walk_len):
        # uniform random walk (DeepWalk); edge weights are ignored here
        walk = [start]

        while len(walk) < walk_len:
            cur = walk[-1]
            neighbor = list(self.g.neighbors(cur))
            if len(neighbor) > 0:
                walk.append(random.choice(neighbor))
            else:
                break
        return walk

    def simulate(self, num_walks, walk_length, workers=4):
        g = self.g

        nodes = list(g.nodes())
        result = Parallel(n_jobs=workers)(delayed(self._simulate)(nodes, num, walk_length)
                                          for num in partition_num(num_walks, workers))
        result = list(chain(*result))
        # keep only walks long enough to provide context pairs
        result = list(filter(lambda x: len(x) > 2, result))
        return result

    def _simulate(self, nodes, num_walks, walk_length):
        walks = []
        for _ in range(num_walks):
            random.shuffle(nodes)
            for node in nodes:
                walks.append(self.deepwalk(node, walk_length))
        return walks

    def get_alias_edges(self, t, v):
        g = self.g
        p, q = 1, 1
        unnormalized = []
        for x in g.neighbors(v):
            weight = g[v][x].get('weight', 1.0)
            if x == t:
                unnormalized.append(weight / p)
            elif g.has_edge(x, t):
                unnormalized.append(weight)
            else:
                unnormalized.append(weight / q)
        norm = sum(unnormalized)
        normalized = list(map(lambda x: x / norm * len(unnormalized), unnormalized))
        return create_alias_table(normalized)

    def build_trans_prob(self):
        # node2vec-style alias tables; currently unused because deepwalk() samples uniformly
        g = self.g

        for node in g.nodes:
            unnormalized = [g[node][neighbor].get('weight', 1.0) for neighbor in g.neighbors(node)]
            norm = sum(unnormalized)
            normalized = list(map(lambda x: x / norm * len(unnormalized), unnormalized))
            self.alias_nodes[node] = create_alias_table(normalized)

        for edge in g.edges():
            self.alias_edges[edge] = self.get_alias_edges(edge[0], edge[1])


def create_graph():
    edges = []
    with open(os.path.join(DATA_PATH, "graph_node")) as f:
        for line in f.readlines():
            line = line.strip()
            in_node, out_node, weight = line.split(",")
            edges.append((in_node, out_node, float(weight)))

    di = nx.DiGraph()
    di.add_weighted_edges_from(edges)
    rand = RandomWalker(di)
    # rand.build_trans_prob()
    res = rand.simulate(15, 15)
    print('simulate finished')

    # data/id_mapping is expected to contain "graph_node_id<TAB>item_id" lines,
    # mapping graph nodes to the integer item ids used by eges_multigpu.py
    id_mapping = {}
    with open(os.path.join(DATA_PATH, "id_mapping")) as f:
        for line in f.readlines():
            line = line.strip()
            line = line.split("\t")
            id_mapping[line[0]] = int(line[1])

    dump_seq(res, id_mapping)


if __name__ == '__main__':
    create_graph()
--------------------------------------------------------------------------------
{1}".format(BATCH * cnt, 92 | time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))) 93 | 94 | all_pairs = [] 95 | cnt += 1 96 | 97 | 98 | class RandomWalker: 99 | def __init__(self, g): 100 | self.g = g 101 | self.alias_nodes = {} 102 | self.alias_edges = {} 103 | 104 | def deepwalk(self, start, walk_len): 105 | walk = [start] 106 | 107 | while len(walk) < walk_len: 108 | cur = walk[-1] 109 | neighbor = list(self.g.neighbors(cur)) 110 | if len(neighbor) > 0: 111 | walk.append(random.choice(neighbor)) 112 | else: 113 | break 114 | return walk 115 | 116 | def simulate(self, num_walks, walk_length, workers=4): 117 | g = self.g 118 | 119 | nodes = list(g.nodes()) 120 | result = Parallel(n_jobs=workers)(delayed(self._simulate)(nodes, num, walk_length) 121 | for num in partition_num(num_walks, workers)) 122 | result = list(chain(*result)) 123 | result = list(filter(lambda x: len(x) > 2, result)) 124 | return result 125 | 126 | def _simulate(self, nodes, num_walks, walk_length): 127 | walks = [] 128 | for _ in range(num_walks): 129 | random.shuffle(nodes) 130 | for node in nodes: 131 | walks.append(self.deepwalk(node, walk_length)) 132 | return walks 133 | 134 | def get_alias_edges(self, t, v): 135 | g = self.g 136 | p, q = 1, 1 137 | unnormalized = [] 138 | for x in g.neighbors(v): 139 | weight = g[v][x].get('weight', 1.0) 140 | if x == t: 141 | unnormalized.append(weight / p) 142 | elif g.has_edge(x, t): 143 | unnormalized.append(weight) 144 | else: 145 | unnormalized.append(weight / q) 146 | norm = sum(unnormalized) 147 | normalized = list(map(lambda x: x / norm * len(unnormalized), unnormalized)) 148 | return create_alias_table(normalized) 149 | 150 | def build_trans_prob(self): 151 | g = self.g 152 | 153 | for node in g.nodes: 154 | unnormalized = [g[node][neighbor].get('weight', 1.0) for neighbor in g.neighbors(node)] 155 | norm = sum(unnormalized) 156 | normalized = list(map(lambda x: x / norm * len(unnormalized), unnormalized)) 157 | self.alias_nodes[node] = create_alias_table(normalized) 158 | 159 | for edge in g.edges(): 160 | self.alias_edges[edge] = self.get_alias_edges(edge[0], edge[1]) 161 | 162 | 163 | def create_graph(): 164 | edges = [] 165 | with open(os.path.join(DATA_PATH, "graph_node")) as f: 166 | for line in f.readlines(): 167 | line = line.strip() 168 | in_node, out_node, weight = line.split(",") 169 | edges.append((in_node, out_node, float(weight))) 170 | 171 | di = nx.DiGraph() 172 | di.add_weighted_edges_from(edges) 173 | rand = RandomWalker(di) 174 | # rand.build_trans_prob() 175 | res = rand.simulate(15, 15) 176 | print('simulate finished') 177 | 178 | id_mapping = {} 179 | with open(os.path.join(DATA_PATH, "id_mapping")) as f: 180 | for line in f.readlines(): 181 | line = line.strip() 182 | line = line.split("\t") 183 | id_mapping[line[0]] = int(line[1]) 184 | 185 | dump_seq(res, id_mapping) 186 | 187 | 188 | if __name__ == '__main__': 189 | create_graph() 190 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def write_embedding(item_list, embedding_result, outputFileName): 5 | f = open(outputFileName, 'w') 6 | for i in range(len(embedding_result)): 7 | s = " ".join(str(f) for f in embedding_result[i].tolist()) 8 | f.write("{0} {1}\n".format(item_list[i], s)) 9 | f.close() 10 | 11 | 12 | def graph_context_batch_iter(all_pairs, batch_size, side_info, num_features): 13 | while True: 14 | start_idx = 
--------------------------------------------------------------------------------