├── README.md
├── data
│   ├── graph_node
│   ├── side_info_feature
│   └── walk_seq
├── eges_multigpu.py
├── preprocess.py
└── utils.py

/README.md:
--------------------------------------------------------------------------------
A simple implementation of EGES from the paper "Billion-scale Commodity Embedding for E-commerce Recommendation in Alibaba", adapted from two existing open-source implementations (credited below). It runs on a single machine with multiple GPUs.

Training pipeline:

1. Count the co-occurrences of each item pair in the click sequences and save them as data/graph_node (a minimal sketch of this step is given below).

2. Generate the side info for each item and save it as data/side_info_feature.

3. Run preprocess.py to generate sequences by random walks on the graph and save them as data/walk_seq (preprocess.py also reads a data/id_mapping file that maps graph node ids to the integer item ids used for training).

4. Run eges_multigpu.py to train and dump the item embeddings to data/item_embeddings (see the loading example below).


- EGES code reference: https://github.com/wangzhegeek/EGES
- Single-machine multi-GPU code reference: https://github.com/lomyal/simple-word-embedding
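
Step 1 is not implemented in this repo. The sketch below shows one way the `in_node,out_node,weight` lines that preprocess.py expects in data/graph_node could be produced; the input file `data/click_sessions` and its format (one space-separated click session per line) are assumptions for illustration only, not part of the repo.

```python
from collections import Counter


def build_graph_node(session_file="data/click_sessions", output_file="data/graph_node"):
    """Count directed co-occurrences of consecutive clicks within each session."""
    pair_count = Counter()
    with open(session_file) as f:
        for line in f:
            items = line.strip().split(" ")
            for a, b in zip(items, items[1:]):
                if a != b:  # ignore repeated clicks on the same item
                    pair_count[(a, b)] += 1
    # write the "in_node,out_node,weight" format read by create_graph() in preprocess.py
    with open(output_file, "w") as f:
        for (a, b), weight in pair_count.items():
            f.write("{0},{1},{2}\n".format(a, b, weight))
```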
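
After training, eges_multigpu.py writes one `item_id v1 v2 ... v128` line per item to data/item_embeddings (see dump_embedding()). Below is a minimal sketch of loading that file and querying similar items by cosine similarity; note the query is addressed by row index in the loaded arrays, not by item id.

```python
import numpy as np


def load_item_embeddings(path="data/item_embeddings"):
    ids, vectors = [], []
    with open(path) as f:
        for line in f:
            parts = line.strip().split(" ")
            ids.append(int(parts[0]))
            vectors.append([float(x) for x in parts[1:]])
    return np.array(ids), np.array(vectors, dtype=np.float32)


ids, vectors = load_item_embeddings()
normed = vectors / np.linalg.norm(vectors, axis=1, keepdims=True)
query_row = 0  # row index into the loaded arrays, not an item id
scores = normed @ normed[query_row]
print(ids[np.argsort(-scores)[:10]])  # ten most similar items (the query itself ranks first)
```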
--------------------------------------------------------------------------------
/data/graph_node:
--------------------------------------------------------------------------------
1,2,2
2,6,2
2,3,3
5,10,3
5,21,2
6,26,3
10,11,2
10,23,2
13,21,2
13,30,3
--------------------------------------------------------------------------------
/data/side_info_feature:
--------------------------------------------------------------------------------
id0 cate1 shop3 price1 city1 brand1
id1 cate1 shop1 price1 city1 brand2
id2 cate2 shop3 price1 city2 brand2
id3 cate3 shop2 price1 city3 brand2
id4 cate1 shop2 price2 city4 brand3
id5 cate1 shop3 price2 city4 brand3
--------------------------------------------------------------------------------
/data/walk_seq:
--------------------------------------------------------------------------------
36073 35986 36145 36064 36071 36143 36539 36441 36540 36433 36413 36544 36358 36397 36353 36456 36353 36480 36472 36418 36381 36360 36560 36356 36409 22153 22189 35974 71804 71982 71433 87154 87277 87065 87089 87086 87164 87063 87205 87240
187301 187342 187343 187292 187300 187299 187291 187274
178479 178486 178255 178475 178386 178531 185661 185721 185717 185731 185678 185711 185650 185711 185670 185644 185670 185689 185696 185612 185669 185600 185679 185580 185579 185641 185670 185644 185653 185604 185642 185645 185660 185604 185630 185647 185650 185671
235533 235593 235661 235565 235547 235544 235570 235555 235563 235555 235554 235624
125346 125373 125396 125434 125465 47225 47171 47190 47193 47207 47064 47102 47073 47060 47086 47116 47104 47105 47127 47045 47048 47064 47059 47131 47120 47095 47057 47165 47134 47143 47067 47122 47066 47119 47158 47053 47225 47191 47265 47201
132984 132959 132934 132945 132943 132964 132966 132956 132969 132984 132943 132964 132966 132956 132957
136098 136094 136095 136097 136098 136094 136095 136097 136098 136094 136095 136097 136098 136099 136102 136105
66158 66068 66185 66076 66161 66162 66158 66002 66039 66027 66167 66184 66180 66182 66183 66003 66099 66128
60523 60433 60509
44550 44384 44390 44391 44387 44419 44423
--------------------------------------------------------------------------------
/eges_multigpu.py:
--------------------------------------------------------------------------------
import tensorflow.compat.v1 as tf
import numpy as np
import collections
import random
import math
import datetime

data = []
# side_info must already be label-encoded to integers: one row per item,
# item id first, then the encoded feature ids. (The sample data/side_info_feature
# file shows the raw string form before encoding.)
side_info = np.loadtxt('./data/side_info_feature', dtype=int)
item_size, feature_size = side_info.shape
embedding_size = 128
n_sampled = 50
num_gpus = 2
batch_size = 256
num_steps = 200001  # data_size / batch_size * n_epoch
every_k_step = 5000
num_skips = 4  # batch_size % num_skips == 0
window_size = 4
tf.disable_eager_execution()

item_set = set()


def read_data(filename):
    global item_set
    with open(filename) as f:
        for line in f.readlines():
            line = line.strip().split(' ')
            # walk_seq stores integer item ids; keep them as ints so they can
            # feed the int32 placeholders and index side_info
            data.extend(int(x) for x in line)
    item_set = set(data)
    return data


data_index = 0


def generate_batch(batch_size):
    global data_index
    batch = np.ndarray(shape=(batch_size,), dtype=np.int32)
    label = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * window_size + 1
    buffer = collections.deque(maxlen=span)
    if data_index + span > len(data):
        data_index = 0

    buffer.extend(data[data_index: data_index + span])
    data_index += span
    for i in range(batch_size // num_skips):
        tgt = window_size
        visited_tgt = [tgt]
        for j in range(num_skips):
            # sample num_skips distinct context positions around the window center
            while tgt in visited_tgt:
                tgt = random.randint(0, span - 1)
            visited_tgt.append(tgt)
            batch[i * num_skips + j] = buffer[window_size]
            label[i * num_skips + j, 0] = buffer[tgt]
        if data_index == len(data):
            # wrap around to the beginning of the corpus
            buffer.extend(data[0:span])
            data_index = span
        else:
            buffer.append(data[data_index])
            data_index += 1
    data_index = (data_index + len(data) - span) % len(data)
    return batch, label


def _variable_on_cpu(name, shape, initializer, dtype=np.float32):
    with tf.device('/cpu:0'):
        var = tf.get_variable(name, shape, initializer=initializer, dtype=dtype)
    return var


def tower_loss(scope, inputs, labels):
    embedding_list = []

    # one embedding table per side-info field; inputs are item ids, which are
    # assumed to equal the item's row index in side_info
    for i in range(feature_size):
        embedding = _variable_on_cpu('side_info_{0}_embeddings'.format(i), [max(side_info[:, i]) + 1, embedding_size],
                                     tf.random_uniform_initializer(-1.0, 1.0))
        side_info_index = tf.nn.embedding_lookup(side_info[:, i], inputs)
        side_info_embed = tf.nn.embedding_lookup(embedding, tf.cast(side_info_index[:], dtype=tf.int32))
        embedding_list.append(side_info_embed)

    # EGES attention: a softmax over per-item weights, one weight per side-info field
    alpha_embedding = _variable_on_cpu('alpha_embeddings', [item_size, feature_size],
                                       tf.random_uniform_initializer(-1.0, 1.0))
    stacked_embed = tf.stack(embedding_list, axis=-1)
    alpha_index = tf.nn.embedding_lookup(side_info[:, 0], inputs)
    alpha_embed = tf.nn.embedding_lookup(alpha_embedding, alpha_index)
    alpha_embed_expand = tf.expand_dims(alpha_embed, 1)
    alpha_i_sum = tf.reduce_sum(tf.exp(alpha_embed_expand), axis=-1)
    merge_embedding = tf.reduce_sum(stacked_embed * tf.exp(alpha_embed_expand), axis=-1) / alpha_i_sum

    ''' cold start item
    stacked_embed = tf.stack(embedding_list[1:], axis=-1)
    alpha_index = tf.nn.embedding_lookup(side_info[:, 1], inputs)
    alpha_embed = tf.nn.embedding_lookup(alpha_embedding, alpha_index[:])
    alpha_embed_expand = tf.expand_dims(alpha_embed, 1)
    alpha_i_sum = tf.reduce_sum(tf.exp(alpha_embed_expand), axis=-1)
    merge_embedding = tf.reduce_sum(stacked_embed * tf.exp(alpha_embed_expand), axis=-1) / alpha_i_sum
    cold_start_embedding = tf.reduce_sum(stacked_embed * tf.exp(alpha_embed_expand), axis=-1) / alpha_i_sum
    '''
    weights = _variable_on_cpu('w', [item_size, embedding_size],
                               tf.truncated_normal_initializer(stddev=1.0 / math.sqrt(embedding_size)))
    biases = _variable_on_cpu('b', [item_size], tf.zeros_initializer())
    loss = tf.reduce_mean(tf.nn.nce_loss(
        weights=weights,
        biases=biases,
        labels=labels,
        inputs=merge_embedding,
        num_sampled=n_sampled,
        num_classes=item_size
    ))
    return loss, merge_embedding


def average_gradient(tower_grads):
    # every gradient here comes from an embedding lookup, so it arrives as
    # tf.IndexedSlices; average by concatenating slices scaled by 1 / num_gpus
    avg_grads = []
    for grads_vars in zip(*tower_grads):
        values = tf.concat([g.values / num_gpus for g, _ in grads_vars], 0)
        indices = tf.concat([g.indices for g, _ in grads_vars], 0)
        grad = tf.IndexedSlices(values, indices)

        var = grads_vars[0][1]
        cur_grad_and_var = (grad, var)
        avg_grads.append(cur_grad_and_var)
    return avg_grads


def get_final_embedding():
    # pad the item list so it splits into full batches, then drop the padding
    remain = item_size % batch_size
    pad = (batch_size - remain) % batch_size
    cnt = (item_size + pad) // batch_size
    final_embedding = {}
    all_item = side_info[:, 0]
    all_item = np.concatenate([all_item, [0] * pad], axis=0)

    for i in range(cnt):
        eval_input = all_item[i * batch_size: (i + 1) * batch_size]
        eval_label = np.zeros((batch_size, 1), dtype=np.int32)
        eval_embedding = sess.run(merged_embedding, feed_dict={train_input: eval_input, train_label: eval_label})
        # for cold start item
        # cold_start_embedding = sess.run(cold_start_embedding, feed_dict={train_input: eval_input, train_label: eval_label})
        eval_embedding = eval_embedding.tolist()
        if i == cnt - 1 and pad > 0:
            eval_embedding = eval_embedding[:-pad]
        final_embedding.update({all_item[i * batch_size + k]: eval_embedding[k] for k in range(len(eval_embedding))})
    dump_embedding(final_embedding, 'data/item_embeddings')


def dump_embedding(embedding_result, output_file):
    with open(output_file, 'w') as f:
        for k, v in embedding_result.items():
            f.write("{0} {1}\n".format(k, " ".join(map(str, v))))


if __name__ == '__main__':
    d = read_data('data/walk_seq')

    graph = tf.Graph()
    with graph.as_default(), tf.device('/cpu:0'):
        train_input = tf.placeholder(tf.int32, shape=[batch_size])
        train_label = tf.placeholder(tf.int32, shape=[batch_size, 1])

        train_opt = tf.train.GradientDescentOptimizer(1.0)
        # train_opt = tf.train.AdamOptimizer(1.0)

        tower_grads = []
        tower_embeddings = []
        batch_size_gpu = batch_size // num_gpus
        with tf.variable_scope(tf.get_variable_scope()):
            for i in range(num_gpus):
                with tf.device('/gpu:{0}'.format(i)):
                    with tf.name_scope('tower_{0}'.format(i)) as scope:
                        train_input_gpu = tf.slice(train_input, [i * batch_size_gpu], [batch_size_gpu])
                        train_label_gpu = tf.slice(train_label, [i * batch_size_gpu, 0], [batch_size_gpu, 1])

                        loss, tower_embedding = tower_loss(scope, train_input_gpu, train_label_gpu)
                        tower_embeddings.append(tower_embedding)
                        tf.get_variable_scope().reuse_variables()

                        grads = train_opt.compute_gradients(loss)
                        tower_grads.append(grads)

        # concatenate the per-tower outputs so merged_embedding covers the whole batch
        merged_embedding = tf.concat(tower_embeddings, axis=0)
        grads = average_gradient(tower_grads)
        apply_gradient_op = train_opt.apply_gradients(grads)

        init = tf.global_variables_initializer()

        config = tf.ConfigProto(allow_soft_placement=True)
        config.gpu_options.allow_growth = True

        with tf.Session(graph=graph, config=config) as sess:
            start_time = datetime.datetime.now()
            init.run()
            print('Init finished')
            saver = tf.train.Saver(max_to_keep=4)

            avg_loss = 0
            for step in range(1, num_steps):
                batch_input, batch_label = generate_batch(batch_size)
                feed_dict = {train_input: batch_input, train_label: batch_label}
                _, loss_val = sess.run([apply_gradient_op, loss], feed_dict=feed_dict)

                avg_loss += loss_val

                if step % every_k_step == 0:
                    end_time = datetime.datetime.now()
                    avg_loss /= every_k_step
                    print("step: {0}, loss: {1}, time: {2}s".format(step, avg_loss, (end_time - start_time).seconds))
                    avg_loss = 0
                    start_time = datetime.datetime.now()

            get_final_embedding()
--------------------------------------------------------------------------------
/preprocess.py:
--------------------------------------------------------------------------------
import networkx as nx
import random
from joblib import Parallel, delayed
import time
from itertools import chain
import os

DATA_PATH = "data"
BATCH = 100000


def partition_num(num, workers):
    if num % workers == 0:
        return [num // workers] * workers
    else:
        return [num // workers] * workers + [num % workers]


def create_alias_table(area_ratio):
    """Build an alias table for O(1) sampling.

    :param area_ratio: probabilities rescaled so that they average 1,
        i.e. sum(area_ratio) == len(area_ratio), as produced in
        build_trans_prob / get_alias_edges
    :return: accept, alias
    """
    l = len(area_ratio)
    accept, alias = [0] * l, [0] * l
    small, large = [], []

    for i, prob in enumerate(area_ratio):
        if prob < 1.0:
            small.append(i)
        else:
            large.append(i)

    while small and large:
        small_idx, large_idx = small.pop(), large.pop()
        accept[small_idx] = area_ratio[small_idx]
        alias[small_idx] = large_idx
        area_ratio[large_idx] = area_ratio[large_idx] - (1 - area_ratio[small_idx])
        if area_ratio[large_idx] < 1.0:
            small.append(large_idx)
        else:
            large.append(large_idx)

    while large:
        large_idx = large.pop()
        accept[large_idx] = 1
    while small:
        small_idx = small.pop()
        accept[small_idx] = 1

    return accept, alias


def dump_seq(walks, id_mapping):
    with open(os.path.join(DATA_PATH, "walk_seq"), "w") as f:
        for line in walks:
            f.write("{0}\n".format(" ".join(map(lambda x: str(id_mapping[x]), line))))


def get_all_pairs(walks, id_mapping, window_size):
    all_pairs = []
    cnt = 0
    side_info = []
    with open(os.path.join(DATA_PATH, "side_info_feature")) as f:
        for line in f.readlines():
            line = line.strip().split("\t")
            side_info.append(line)

    for k in range(len(walks)):
        for i in range(len(walks[k])):
            # symmetric context window around position i
            for j in range(i - window_size, i + window_size + 1):
                if i == j or j < 0 or j >= len(walks[k]):
                    continue
                else:
                    line = [id_mapping[walks[k][i]]]
                    line.extend(side_info[id_mapping[walks[k][i]] - 1])
                    line.append(id_mapping[walks[k][j]])
                    all_pairs.append(line)

                if len(all_pairs) == 0:
                    return
                # flush to disk every BATCH pairs to keep memory bounded
                if len(all_pairs) % BATCH == 0:
                    with open(os.path.join(DATA_PATH, "all_pairs"), "a") as f:
                        for line in all_pairs:
                            f.write("{0}\n".format("\t".join(list(map(lambda x: str(x), line)))))
                    cnt += 1
                    print("{0} lines done. {1}".format(BATCH * cnt,
                                                       time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))))
                    all_pairs = []


class RandomWalker:
    def __init__(self, g):
        self.g = g
        self.alias_nodes = {}
        self.alias_edges = {}

    def deepwalk(self, start, walk_len):
        # uniform random walk (DeepWalk); edge weights are ignored here
        walk = [start]

        while len(walk) < walk_len:
            cur = walk[-1]
            neighbor = list(self.g.neighbors(cur))
            if len(neighbor) > 0:
                walk.append(random.choice(neighbor))
            else:
                break
        return walk

    def simulate(self, num_walks, walk_length, workers=4):
        g = self.g

        nodes = list(g.nodes())
        result = Parallel(n_jobs=workers)(delayed(self._simulate)(nodes, num, walk_length)
                                          for num in partition_num(num_walks, workers))
        result = list(chain(*result))
        # keep only walks long enough to provide context pairs
        result = list(filter(lambda x: len(x) > 2, result))
        return result

    def _simulate(self, nodes, num_walks, walk_length):
        walks = []
        for _ in range(num_walks):
            random.shuffle(nodes)
            for node in nodes:
                walks.append(self.deepwalk(node, walk_length))
        return walks

    def get_alias_edges(self, t, v):
        g = self.g
        p, q = 1, 1
        unnormalized = []
        for x in g.neighbors(v):
            weight = g[v][x].get('weight', 1.0)
            if x == t:
                unnormalized.append(weight / p)
            elif g.has_edge(x, t):
                unnormalized.append(weight)
            else:
                unnormalized.append(weight / q)
        norm = sum(unnormalized)
        normalized = list(map(lambda x: x / norm * len(unnormalized), unnormalized))
        return create_alias_table(normalized)

    def build_trans_prob(self):
        # node2vec-style alias tables; currently unused because deepwalk() samples uniformly
        g = self.g

        for node in g.nodes:
            unnormalized = [g[node][neighbor].get('weight', 1.0) for neighbor in g.neighbors(node)]
            norm = sum(unnormalized)
            normalized = list(map(lambda x: x / norm * len(unnormalized), unnormalized))
            self.alias_nodes[node] = create_alias_table(normalized)

        for edge in g.edges():
            self.alias_edges[edge] = self.get_alias_edges(edge[0], edge[1])


def create_graph():
    edges = []
    with open(os.path.join(DATA_PATH, "graph_node")) as f:
        for line in f.readlines():
            line = line.strip()
            in_node, out_node, weight = line.split(",")
            edges.append((in_node, out_node, float(weight)))

    di = nx.DiGraph()
    di.add_weighted_edges_from(edges)
    rand = RandomWalker(di)
    # rand.build_trans_prob()
    res = rand.simulate(15, 15)
    print('simulate finished')

    # data/id_mapping is expected to contain "graph_node_id<TAB>item_id" lines,
    # mapping graph nodes to the integer item ids used by eges_multigpu.py
    id_mapping = {}
    with open(os.path.join(DATA_PATH, "id_mapping")) as f:
        for line in f.readlines():
            line = line.strip()
            line = line.split("\t")
            id_mapping[line[0]] = int(line[1])

    dump_seq(res, id_mapping)


if __name__ == '__main__':
    create_graph()
--------------------------------------------------------------------------------
{1}".format(BATCH * cnt, 92 | time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))) 93 | 94 | all_pairs = [] 95 | cnt += 1 96 | 97 | 98 | class RandomWalker: 99 | def __init__(self, g): 100 | self.g = g 101 | self.alias_nodes = {} 102 | self.alias_edges = {} 103 | 104 | def deepwalk(self, start, walk_len): 105 | walk = [start] 106 | 107 | while len(walk) < walk_len: 108 | cur = walk[-1] 109 | neighbor = list(self.g.neighbors(cur)) 110 | if len(neighbor) > 0: 111 | walk.append(random.choice(neighbor)) 112 | else: 113 | break 114 | return walk 115 | 116 | def simulate(self, num_walks, walk_length, workers=4): 117 | g = self.g 118 | 119 | nodes = list(g.nodes()) 120 | result = Parallel(n_jobs=workers)(delayed(self._simulate)(nodes, num, walk_length) 121 | for num in partition_num(num_walks, workers)) 122 | result = list(chain(*result)) 123 | result = list(filter(lambda x: len(x) > 2, result)) 124 | return result 125 | 126 | def _simulate(self, nodes, num_walks, walk_length): 127 | walks = [] 128 | for _ in range(num_walks): 129 | random.shuffle(nodes) 130 | for node in nodes: 131 | walks.append(self.deepwalk(node, walk_length)) 132 | return walks 133 | 134 | def get_alias_edges(self, t, v): 135 | g = self.g 136 | p, q = 1, 1 137 | unnormalized = [] 138 | for x in g.neighbors(v): 139 | weight = g[v][x].get('weight', 1.0) 140 | if x == t: 141 | unnormalized.append(weight / p) 142 | elif g.has_edge(x, t): 143 | unnormalized.append(weight) 144 | else: 145 | unnormalized.append(weight / q) 146 | norm = sum(unnormalized) 147 | normalized = list(map(lambda x: x / norm * len(unnormalized), unnormalized)) 148 | return create_alias_table(normalized) 149 | 150 | def build_trans_prob(self): 151 | g = self.g 152 | 153 | for node in g.nodes: 154 | unnormalized = [g[node][neighbor].get('weight', 1.0) for neighbor in g.neighbors(node)] 155 | norm = sum(unnormalized) 156 | normalized = list(map(lambda x: x / norm * len(unnormalized), unnormalized)) 157 | self.alias_nodes[node] = create_alias_table(normalized) 158 | 159 | for edge in g.edges(): 160 | self.alias_edges[edge] = self.get_alias_edges(edge[0], edge[1]) 161 | 162 | 163 | def create_graph(): 164 | edges = [] 165 | with open(os.path.join(DATA_PATH, "graph_node")) as f: 166 | for line in f.readlines(): 167 | line = line.strip() 168 | in_node, out_node, weight = line.split(",") 169 | edges.append((in_node, out_node, float(weight))) 170 | 171 | di = nx.DiGraph() 172 | di.add_weighted_edges_from(edges) 173 | rand = RandomWalker(di) 174 | # rand.build_trans_prob() 175 | res = rand.simulate(15, 15) 176 | print('simulate finished') 177 | 178 | id_mapping = {} 179 | with open(os.path.join(DATA_PATH, "id_mapping")) as f: 180 | for line in f.readlines(): 181 | line = line.strip() 182 | line = line.split("\t") 183 | id_mapping[line[0]] = int(line[1]) 184 | 185 | dump_seq(res, id_mapping) 186 | 187 | 188 | if __name__ == '__main__': 189 | create_graph() 190 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def write_embedding(item_list, embedding_result, outputFileName): 5 | f = open(outputFileName, 'w') 6 | for i in range(len(embedding_result)): 7 | s = " ".join(str(f) for f in embedding_result[i].tolist()) 8 | f.write("{0} {1}\n".format(item_list[i], s)) 9 | f.close() 10 | 11 | 12 | def graph_context_batch_iter(all_pairs, batch_size, side_info, num_features): 13 | while True: 14 | start_idx = 
--------------------------------------------------------------------------------