├── README.md ├── __init__.py ├── __pycache__ ├── aggregators.cpython-36.pyc ├── graphsage.cpython-36.pyc ├── inits.cpython-36.pyc ├── layers.cpython-36.pyc ├── minibatch.cpython-36.pyc ├── neigh_samplers.cpython-36.pyc ├── supervised_model.cpython-36.pyc ├── train.cpython-36.pyc └── utils.cpython-36.pyc ├── aggregators.py ├── example_run.sh ├── graphsage.py ├── inits.py ├── layers.py ├── log └── sup-sequential │ ├── max_small_0.0010 │ ├── events.out.tfevents.1561206683.zhouxuechaodeMacBook-Pro.local │ ├── events.out.tfevents.1561209809.zhouxuechaodeMacBook-Pro.local │ ├── events.out.tfevents.1561263709.zhouxuechaodeMacBook-Pro.local │ ├── events.out.tfevents.1561263857.zhouxuechaodeMacBook-Pro.local │ ├── events.out.tfevents.1561272163.zhouxuechaodeMacBook-Pro.local │ └── val_stats.txt │ └── mean_small_0.0010 │ ├── events.out.tfevents.1561196841.zhouxuechaodeMacBook-Pro.local │ └── val_stats.txt ├── minibatch.py ├── neigh_samplers.py ├── readFile.py ├── supervised_model.py ├── train.py └── utils.py /README.md: -------------------------------------------------------------------------------- 1 | # BurstGraph 2 | 3 | Large Scale Evolving Graphs with Burst Detection. 4 | 5 | Yifeng Zhao, Xiangwei Wang, Hongxia Yang, Le Song, and Jie Tang. 6 | 7 | The code is still under construction. 8 | 9 | ## Training 10 | 11 | You can run train.py with necessary parameters, such as shown in example\_run.sh . 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ericZhao93/BurstGraph/dd743b322c9b6dc666a8599d8fe9fb901bd83258/__init__.py -------------------------------------------------------------------------------- /__pycache__/aggregators.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ericZhao93/BurstGraph/dd743b322c9b6dc666a8599d8fe9fb901bd83258/__pycache__/aggregators.cpython-36.pyc -------------------------------------------------------------------------------- /__pycache__/graphsage.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ericZhao93/BurstGraph/dd743b322c9b6dc666a8599d8fe9fb901bd83258/__pycache__/graphsage.cpython-36.pyc -------------------------------------------------------------------------------- /__pycache__/inits.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ericZhao93/BurstGraph/dd743b322c9b6dc666a8599d8fe9fb901bd83258/__pycache__/inits.cpython-36.pyc -------------------------------------------------------------------------------- /__pycache__/layers.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ericZhao93/BurstGraph/dd743b322c9b6dc666a8599d8fe9fb901bd83258/__pycache__/layers.cpython-36.pyc -------------------------------------------------------------------------------- /__pycache__/minibatch.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ericZhao93/BurstGraph/dd743b322c9b6dc666a8599d8fe9fb901bd83258/__pycache__/minibatch.cpython-36.pyc -------------------------------------------------------------------------------- /__pycache__/neigh_samplers.cpython-36.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ericZhao93/BurstGraph/dd743b322c9b6dc666a8599d8fe9fb901bd83258/__pycache__/neigh_samplers.cpython-36.pyc -------------------------------------------------------------------------------- /__pycache__/supervised_model.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ericZhao93/BurstGraph/dd743b322c9b6dc666a8599d8fe9fb901bd83258/__pycache__/supervised_model.cpython-36.pyc -------------------------------------------------------------------------------- /__pycache__/train.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ericZhao93/BurstGraph/dd743b322c9b6dc666a8599d8fe9fb901bd83258/__pycache__/train.cpython-36.pyc -------------------------------------------------------------------------------- /__pycache__/utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ericZhao93/BurstGraph/dd743b322c9b6dc666a8599d8fe9fb901bd83258/__pycache__/utils.cpython-36.pyc -------------------------------------------------------------------------------- /aggregators.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from layers import Layer, Dense 4 | from inits import glorot, zeros 5 | 6 | class MeanAggregator(Layer): 7 | """ 8 | Aggregates via mean followed by matmul and non-linearity. 9 | """ 10 | 11 | def __init__(self, input_dim, output_dim, neigh_input_dim=None, 12 | dropout=0., bias=False, act=tf.nn.relu, 13 | name=None, concat=False, **kwargs): 14 | super(MeanAggregator, self).__init__(**kwargs) 15 | 16 | self.dropout = dropout 17 | self.bias = bias 18 | self.act = act 19 | self.concat = concat 20 | 21 | if neigh_input_dim is None: 22 | neigh_input_dim = input_dim 23 | 24 | if name is not None: 25 | name = '/' + name 26 | else: 27 | name = '' 28 | 29 | with tf.variable_scope(self.name + name + '_vars'): 30 | self.vars['neigh_weights'] = glorot([neigh_input_dim, output_dim], 31 | name='neigh_weights') 32 | self.vars['self_weights'] = glorot([input_dim, output_dim], 33 | name='self_weights') 34 | if self.bias: 35 | self.vars['bias'] = zeros([self.output_dim], name='bias') 36 | 37 | if self.logging: 38 | self._log_vars() 39 | 40 | self.input_dim = input_dim 41 | self.output_dim = output_dim 42 | 43 | def _call(self, inputs): 44 | self_vecs, neigh_vecs = inputs 45 | 46 | neigh_vecs = tf.nn.dropout(neigh_vecs, 1-self.dropout) 47 | self_vecs = tf.nn.dropout(self_vecs, 1-self.dropout) 48 | neigh_means = tf.reduce_mean(neigh_vecs, axis=1) 49 | 50 | # [nodes] x [out_dim] 51 | from_neighs = tf.matmul(neigh_means, self.vars['neigh_weights']) 52 | 53 | from_self = tf.matmul(self_vecs, self.vars["self_weights"]) 54 | 55 | if not self.concat: 56 | output = tf.add_n([from_self, from_neighs]) 57 | else: 58 | output = tf.concat([from_self, from_neighs], axis=1) 59 | 60 | # bias 61 | if self.bias: 62 | output += self.vars['bias'] 63 | 64 | return self.act(output) 65 | 66 | class GCNAggregator(Layer): 67 | """ 68 | Aggregates via mean followed by matmul and non-linearity. 69 | Same matmul parameters are used self vector and neighbor vectors. 
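    Unlike MeanAggregator, the self vector is appended to the neighbor vectors before the mean is taken, and a single shared weight matrix transforms the result (GCN-style), so the self and neighbor parts are never concatenated.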
70 | """ 71 | 72 | def __init__(self, input_dim, output_dim, neigh_input_dim=None, 73 | dropout=0., bias=False, act=tf.nn.relu, name=None, concat=False, **kwargs): 74 | super(GCNAggregator, self).__init__(**kwargs) 75 | 76 | self.dropout = dropout 77 | self.bias = bias 78 | self.act = act 79 | self.concat = concat 80 | 81 | if neigh_input_dim is None: 82 | neigh_input_dim = input_dim 83 | 84 | if name is not None: 85 | name = '/' + name 86 | else: 87 | name = '' 88 | 89 | with tf.variable_scope(self.name + name + '_vars'): 90 | self.vars['weights'] = glorot([neigh_input_dim, output_dim], 91 | name='neigh_weights') 92 | if self.bias: 93 | self.vars['bias'] = zeros([self.output_dim], name='bias') 94 | 95 | if self.logging: 96 | self._log_vars() 97 | 98 | self.input_dim = input_dim 99 | self.output_dim = output_dim 100 | 101 | def _call(self, inputs): 102 | self_vecs, neigh_vecs = inputs 103 | 104 | neigh_vecs = tf.nn.dropout(neigh_vecs, 1-self.dropout) 105 | self_vecs = tf.nn.dropout(self_vecs, 1-self.dropout) 106 | means = tf.reduce_mean(tf.concat([neigh_vecs, 107 | tf.expand_dims(self_vecs, axis=1)], axis=1), axis=1) 108 | 109 | # [nodes] x [out_dim] 110 | output = tf.matmul(means, self.vars['weights']) 111 | 112 | # bias 113 | if self.bias: 114 | output += self.vars['bias'] 115 | 116 | return self.act(output) 117 | 118 | 119 | class MaxPoolingAggregator(Layer): 120 | """ Aggregates via max-pooling over MLP functions. 121 | """ 122 | def __init__(self, input_dim, output_dim, model_size="small", neigh_input_dim=None, 123 | dropout=0., bias=False, act=tf.nn.relu, name=None, concat=False, **kwargs): 124 | super(MaxPoolingAggregator, self).__init__(**kwargs) 125 | 126 | self.dropout = dropout 127 | self.bias = bias 128 | self.act = act 129 | self.concat = concat 130 | 131 | if neigh_input_dim is None: 132 | neigh_input_dim = input_dim 133 | 134 | if name is not None: 135 | name = '/' + name 136 | else: 137 | name = '' 138 | 139 | if model_size == "small": 140 | hidden_dim = self.hidden_dim = 512 141 | elif model_size == "big": 142 | hidden_dim = self.hidden_dim = 1024 143 | 144 | self.mlp_layers = [] 145 | self.mlp_layers.append(Dense(input_dim=neigh_input_dim, 146 | output_dim=hidden_dim, 147 | act=tf.nn.relu, 148 | dropout=dropout, 149 | sparse_inputs=False, 150 | logging=self.logging)) 151 | 152 | with tf.variable_scope(self.name + name + '_vars'): 153 | self.vars['neigh_weights'] = glorot([hidden_dim, output_dim], 154 | name='neigh_weights') 155 | 156 | self.vars['self_weights'] = glorot([input_dim, output_dim], 157 | name='self_weights') 158 | if self.bias: 159 | self.vars['bias'] = zeros([self.output_dim], name='bias') 160 | 161 | if self.logging: 162 | self._log_vars() 163 | 164 | self.input_dim = input_dim 165 | self.output_dim = output_dim 166 | self.neigh_input_dim = neigh_input_dim 167 | 168 | def _call(self, inputs): 169 | self_vecs, neigh_vecs = inputs 170 | neigh_h = neigh_vecs 171 | 172 | dims = tf.shape(neigh_h) 173 | batch_size = dims[0] 174 | num_neighbors = dims[1] 175 | # [nodes * sampled neighbors] x [hidden_dim] 176 | h_reshaped = tf.reshape(neigh_h, (batch_size * num_neighbors, self.neigh_input_dim)) 177 | 178 | for l in self.mlp_layers: 179 | h_reshaped = l(h_reshaped) 180 | neigh_h = tf.reshape(h_reshaped, (batch_size, num_neighbors, self.hidden_dim)) 181 | neigh_h = tf.reduce_max(neigh_h, axis=1) 182 | 183 | from_neighs = tf.matmul(neigh_h, self.vars['neigh_weights']) 184 | from_self = tf.matmul(self_vecs, self.vars["self_weights"]) 185 | 186 | if not self.concat: 187 
| output = tf.add_n([from_self, from_neighs]) 188 | else: 189 | output = tf.concat([from_self, from_neighs], axis=1) 190 | 191 | # bias 192 | if self.bias: 193 | output += self.vars['bias'] 194 | 195 | return self.act(output) 196 | 197 | class MeanPoolingAggregator(Layer): 198 | """ Aggregates via mean-pooling over MLP functions. 199 | """ 200 | def __init__(self, input_dim, output_dim, model_size="small", neigh_input_dim=None, 201 | dropout=0., bias=False, act=tf.nn.relu, name=None, concat=False, **kwargs): 202 | super(MeanPoolingAggregator, self).__init__(**kwargs) 203 | 204 | self.dropout = dropout 205 | self.bias = bias 206 | self.act = act 207 | self.concat = concat 208 | 209 | if neigh_input_dim is None: 210 | neigh_input_dim = input_dim 211 | 212 | if name is not None: 213 | name = '/' + name 214 | else: 215 | name = '' 216 | 217 | if model_size == "small": 218 | hidden_dim = self.hidden_dim = 512 219 | elif model_size == "big": 220 | hidden_dim = self.hidden_dim = 1024 221 | 222 | self.mlp_layers = [] 223 | self.mlp_layers.append(Dense(input_dim=neigh_input_dim, 224 | output_dim=hidden_dim, 225 | act=tf.nn.relu, 226 | dropout=dropout, 227 | sparse_inputs=False, 228 | logging=self.logging)) 229 | 230 | with tf.variable_scope(self.name + name + '_vars'): 231 | self.vars['neigh_weights'] = glorot([hidden_dim, output_dim], 232 | name='neigh_weights') 233 | 234 | self.vars['self_weights'] = glorot([input_dim, output_dim], 235 | name='self_weights') 236 | if self.bias: 237 | self.vars['bias'] = zeros([self.output_dim], name='bias') 238 | 239 | if self.logging: 240 | self._log_vars() 241 | 242 | self.input_dim = input_dim 243 | self.output_dim = output_dim 244 | self.neigh_input_dim = neigh_input_dim 245 | 246 | def _call(self, inputs): 247 | self_vecs, neigh_vecs = inputs 248 | neigh_h = neigh_vecs 249 | 250 | dims = tf.shape(neigh_h) 251 | batch_size = dims[0] 252 | num_neighbors = dims[1] 253 | # [nodes * sampled neighbors] x [hidden_dim] 254 | h_reshaped = tf.reshape(neigh_h, (batch_size * num_neighbors, self.neigh_input_dim)) 255 | 256 | for l in self.mlp_layers: 257 | h_reshaped = l(h_reshaped) 258 | neigh_h = tf.reshape(h_reshaped, (batch_size, num_neighbors, self.hidden_dim)) 259 | neigh_h = tf.reduce_mean(neigh_h, axis=1) 260 | 261 | from_neighs = tf.matmul(neigh_h, self.vars['neigh_weights']) 262 | from_self = tf.matmul(self_vecs, self.vars["self_weights"]) 263 | 264 | if not self.concat: 265 | output = tf.add_n([from_self, from_neighs]) 266 | else: 267 | output = tf.concat([from_self, from_neighs], axis=1) 268 | 269 | # bias 270 | if self.bias: 271 | output += self.vars['bias'] 272 | 273 | return self.act(output) 274 | 275 | 276 | class TwoMaxLayerPoolingAggregator(Layer): 277 | """ Aggregates via pooling over two MLP functions. 
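    Neighbor vectors are passed through a two-layer MLP (hidden_dim_1 -> hidden_dim_2) and then max-pooled element-wise before the final linear transforms.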
278 | """ 279 | def __init__(self, input_dim, output_dim, model_size="small", neigh_input_dim=None, 280 | dropout=0., bias=False, act=tf.nn.relu, name=None, concat=False, **kwargs): 281 | super(TwoMaxLayerPoolingAggregator, self).__init__(**kwargs) 282 | 283 | self.dropout = dropout 284 | self.bias = bias 285 | self.act = act 286 | self.concat = concat 287 | 288 | if neigh_input_dim is None: 289 | neigh_input_dim = input_dim 290 | 291 | if name is not None: 292 | name = '/' + name 293 | else: 294 | name = '' 295 | 296 | if model_size == "small": 297 | hidden_dim_1 = self.hidden_dim_1 = 512 298 | hidden_dim_2 = self.hidden_dim_2 = 256 299 | elif model_size == "big": 300 | hidden_dim_1 = self.hidden_dim_1 = 1024 301 | hidden_dim_2 = self.hidden_dim_2 = 512 302 | 303 | self.mlp_layers = [] 304 | self.mlp_layers.append(Dense(input_dim=neigh_input_dim, 305 | output_dim=hidden_dim_1, 306 | act=tf.nn.relu, 307 | dropout=dropout, 308 | sparse_inputs=False, 309 | logging=self.logging)) 310 | self.mlp_layers.append(Dense(input_dim=hidden_dim_1, 311 | output_dim=hidden_dim_2, 312 | act=tf.nn.relu, 313 | dropout=dropout, 314 | sparse_inputs=False, 315 | logging=self.logging)) 316 | 317 | 318 | with tf.variable_scope(self.name + name + '_vars'): 319 | self.vars['neigh_weights'] = glorot([hidden_dim_2, output_dim], 320 | name='neigh_weights') 321 | 322 | self.vars['self_weights'] = glorot([input_dim, output_dim], 323 | name='self_weights') 324 | if self.bias: 325 | self.vars['bias'] = zeros([self.output_dim], name='bias') 326 | 327 | if self.logging: 328 | self._log_vars() 329 | 330 | self.input_dim = input_dim 331 | self.output_dim = output_dim 332 | self.neigh_input_dim = neigh_input_dim 333 | 334 | def _call(self, inputs): 335 | self_vecs, neigh_vecs = inputs 336 | neigh_h = neigh_vecs 337 | 338 | dims = tf.shape(neigh_h) 339 | batch_size = dims[0] 340 | num_neighbors = dims[1] 341 | # [nodes * sampled neighbors] x [hidden_dim] 342 | h_reshaped = tf.reshape(neigh_h, (batch_size * num_neighbors, self.neigh_input_dim)) 343 | 344 | for l in self.mlp_layers: 345 | h_reshaped = l(h_reshaped) 346 | neigh_h = tf.reshape(h_reshaped, (batch_size, num_neighbors, self.hidden_dim_2)) 347 | neigh_h = tf.reduce_max(neigh_h, axis=1) 348 | 349 | from_neighs = tf.matmul(neigh_h, self.vars['neigh_weights']) 350 | from_self = tf.matmul(self_vecs, self.vars["self_weights"]) 351 | 352 | if not self.concat: 353 | output = tf.add_n([from_self, from_neighs]) 354 | else: 355 | output = tf.concat([from_self, from_neighs], axis=1) 356 | 357 | # bias 358 | if self.bias: 359 | output += self.vars['bias'] 360 | 361 | return self.act(output) 362 | 363 | class SeqAggregator(Layer): 364 | """ Aggregates via a standard LSTM. 
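    The sampled neighbor vectors are treated as a sequence; the LSTM output at the last non-padding position is used as the neighborhood summary.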
365 | """ 366 | def __init__(self, input_dim, output_dim, model_size="small", neigh_input_dim=None, 367 | dropout=0., bias=False, act=tf.nn.relu, name=None, concat=False, **kwargs): 368 | super(SeqAggregator, self).__init__(**kwargs) 369 | 370 | self.dropout = dropout 371 | self.bias = bias 372 | self.act = act 373 | self.concat = concat 374 | 375 | if neigh_input_dim is None: 376 | neigh_input_dim = input_dim 377 | 378 | if name is not None: 379 | name = '/' + name 380 | else: 381 | name = '' 382 | 383 | if model_size == "small": 384 | hidden_dim = self.hidden_dim = 128 385 | elif model_size == "big": 386 | hidden_dim = self.hidden_dim = 256 387 | 388 | with tf.variable_scope(self.name + name + '_vars'): 389 | self.vars['neigh_weights'] = glorot([hidden_dim, output_dim], 390 | name='neigh_weights') 391 | 392 | self.vars['self_weights'] = glorot([input_dim, output_dim], 393 | name='self_weights') 394 | if self.bias: 395 | self.vars['bias'] = zeros([self.output_dim], name='bias') 396 | 397 | if self.logging: 398 | self._log_vars() 399 | 400 | self.input_dim = input_dim 401 | self.output_dim = output_dim 402 | self.neigh_input_dim = neigh_input_dim 403 | self.cell = tf.contrib.rnn.BasicLSTMCell(self.hidden_dim) 404 | 405 | def _call(self, inputs): 406 | self_vecs, neigh_vecs = inputs 407 | 408 | dims = tf.shape(neigh_vecs) 409 | batch_size = dims[0] 410 | initial_state = self.cell.zero_state(batch_size, tf.float32) 411 | used = tf.sign(tf.reduce_max(tf.abs(neigh_vecs), axis=2)) 412 | length = tf.reduce_sum(used, axis=1) 413 | length = tf.maximum(length, tf.constant(1.)) 414 | length = tf.cast(length, tf.int32) 415 | 416 | with tf.variable_scope(self.name) as scope: 417 | try: 418 | rnn_outputs, rnn_states = tf.nn.dynamic_rnn( 419 | self.cell, neigh_vecs, 420 | initial_state=initial_state, dtype=tf.float32, time_major=False, 421 | sequence_length=length) 422 | except ValueError: 423 | scope.reuse_variables() 424 | rnn_outputs, rnn_states = tf.nn.dynamic_rnn( 425 | self.cell, neigh_vecs, 426 | initial_state=initial_state, dtype=tf.float32, time_major=False, 427 | sequence_length=length) 428 | batch_size = tf.shape(rnn_outputs)[0] 429 | max_len = tf.shape(rnn_outputs)[1] 430 | out_size = int(rnn_outputs.get_shape()[2]) 431 | index = tf.range(0, batch_size) * max_len + (length - 1) 432 | flat = tf.reshape(rnn_outputs, [-1, out_size]) 433 | neigh_h = tf.gather(flat, index) 434 | 435 | from_neighs = tf.matmul(neigh_h, self.vars['neigh_weights']) 436 | from_self = tf.matmul(self_vecs, self.vars["self_weights"]) 437 | 438 | output = tf.add_n([from_self, from_neighs]) 439 | 440 | if not self.concat: 441 | output = tf.add_n([from_self, from_neighs]) 442 | else: 443 | output = tf.concat([from_self, from_neighs], axis=1) 444 | 445 | # bias 446 | if self.bias: 447 | output += self.vars['bias'] 448 | 449 | return self.act(output) 450 | 451 | -------------------------------------------------------------------------------- /example_run.sh: -------------------------------------------------------------------------------- 1 | #python3 -m supervised_train --train_prefix ../dataset/alibaba_gul_3 --model mean 2 | python3 -m train --train_prefix ../sequential/ --model max 3 | #python3 -m sup_graphsage_train --train_prefix ../merge_alibaba/alibaba --model mean 4 | -------------------------------------------------------------------------------- /graphsage.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | import tensorflow as tf 3 | import 
math 4 | 5 | import layers as layers 6 | 7 | class Model(object): 8 | 9 | def __init__(self, **kwargs): 10 | allowed_kwargs = {'name', 'logging', 'model_size'} 11 | for kwarg in kwargs.keys(): 12 | assert kwarg in allowed_kwargs, 'Invalid keyword: ' + kwarg 13 | name = kwargs.get('name') 14 | if not name: 15 | name = self.__class__.__name__.lower() 16 | self.name = name 17 | 18 | logging = kwargs.get('logging', False) 19 | self.logging = logging 20 | 21 | self.vars = {} 22 | self.placeholders = {} 23 | 24 | self.layers = [] 25 | self.activations = [] 26 | self.inputs = None 27 | self.outputs = None 28 | 29 | self.loss = 0 30 | self.accuracy = 0 31 | self.optimizer = None 32 | self.opt_op = None 33 | 34 | def _build(self): 35 | raise NotImplementedError 36 | 37 | def build(self): 38 | with tf.variable_scope(self.name): 39 | self._build() 40 | 41 | self.activations.append(self.inputs) 42 | for layer in self.layers: 43 | hidden = layer(self.activations[-1]) 44 | self.activations.append(hidden) 45 | self.outputs = self.activations[-1] 46 | 47 | variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.name) 48 | self.vars = {var.name : var for var in variables} 49 | 50 | self._loss() 51 | self._accuracy() 52 | 53 | self.opt_op = self.optimizer.minimize(self.loss) 54 | 55 | def predict(self): 56 | pass 57 | 58 | def _loss(self): 59 | raise NotImplementedError 60 | 61 | def _accuracy(self): 62 | raise NotImplementedError 63 | 64 | def save(self, sess =None): 65 | if not sess: 66 | raise AttributeError('Tensorflow session not provide') 67 | saver = tf.train.Saver(self.vars) 68 | save_path = saver.save(sess, "tmp/%s.ckpt" % self.name) 69 | print("Model saved in file: %s" % save_path) 70 | 71 | def load(self, sess=None): 72 | if not sess: 73 | raise AttributedError('Tensorflow session not provided') 74 | saver = tf.train.Saver(self.vars) 75 | save_path = "tmp/%s.ckpt" % self.name 76 | saver.restore(sess, save_path) 77 | print('Model restored from file: %s' % save_path) 78 | 79 | class GeneralizedModel(Model): 80 | def __init__(self, **kwargs): 81 | super(GeneralizedModel, self).__init__(**kwargs) 82 | 83 | def build(self): 84 | with tf.variable_scope(self.name): 85 | self._build() 86 | 87 | variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.name) 88 | self.vars = {var.name: var for var in variables} 89 | 90 | self._loss() 91 | self._accuracy() 92 | 93 | self.opt_op = self.optimizer.minimize(self.loss) 94 | 95 | 96 | 97 | SAGEInfo = namedtuple("SAGEInfo", ['layer_name', 'neigh_sampler', 'num_samples', 'output_dim']) 98 | 99 | class SampleAndAggregate(GeneralizedModel): 100 | 101 | def __init__(self, placeholders, features, adj, degrees, 102 | layer_infos, concat=True, aggregator_type="mean", 103 | model_size="small", identitity_dim=0, **kwargs): 104 | super(SampleAndAggregate, self).__init__(**kwargs) 105 | if aggregator_type == "mean": 106 | self.aggregator_cls = MeanAggregator 107 | elif aggregator_type == "meanpool": 108 | self.aggregator_cls = MeanPoolingAggregator 109 | elif aggregator_type == "maxpool": 110 | self.aggregator_cls = MaxPoolingAggregator 111 | else: 112 | raise Exception("Unknown aggregator:") 113 | 114 | #get info from palceholders ... 
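        # The block below caches everything the model needs from the caller:
        # the two input node batches, the adjacency info used by the neighbor
        # samplers, optional trainable identity embeddings, the node feature
        # matrix, the per-layer output dimensions, and the Adam optimizer.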
115 | self.inputs1 = placeholders["batch1"] 116 | self.inputs2 = placeholders["batch2"] 117 | self.model_size = model_size 118 | self.adj_info = adj 119 | if identity_dim > 0: 120 | self.embeds = tf.get_variable("node_embeddings", [adj.get_shape().as_list()[0], identity_dim]) 121 | else: 122 | self.embeds = None 123 | if features is None: 124 | if identity_dim == 0: 125 | raise Exception("Identity dim and features have no positive values") 126 | self.features = self.embeds 127 | else: 128 | self.features = tf.Variable(tf.conctant(features, dtype=tf.float32), trainable=False) 129 | if not self.embeds is None: 130 | self.features = tf.concat([self.embeds, self.features], axis=1) 131 | self.degrees = degrees 132 | self.concat = concat 133 | 134 | self.dims = [(0 if features is None else features.shape[1]) + identity_dim] 135 | self.dims.extend([layer_infos[i].output_dim for i in range(len(layer_infos))]) 136 | self.batch_size = placeholders["batch_size"] 137 | self.placeholders = placeholders 138 | self.layer_infos = layer_infos 139 | 140 | self.optimizer = tf.train.AdamOptimizer(learning_rate=FLAGS.learning_rate) 141 | 142 | self.build() 143 | 144 | def sample(self, inputs, layer_infos, batch_size=None, timestamp = None): 145 | 146 | if batch_size is None: 147 | batch_size = self.batch_size 148 | samples = [inputs] 149 | 150 | support_size = 1 151 | support_sizes = [support_size] 152 | for k in range(len(layer_infos)): 153 | t = len(layer_infos) - k - 1 154 | support_size *= layer_infos[t].num_samples 155 | sampler = layer_infos[t].neigh_sampler 156 | if timestamp is None: 157 | node = sampler((samples[k], layer_infos[t].num_samples)) 158 | else: 159 | node = sampler((samples[k], layer_infos[t].num_samples, timestamp)) 160 | samples.append(tf.reshape(node, [support_size * batch_size, ])) 161 | support_sizes.append(support_size) 162 | return samples, support_sizes 163 | 164 | 165 | def aggregate(self, samples, input_features, dims, num_samples, support_sizes, batch_size=None, 166 | aggregators=None, name=None, concat=False, model_size="small"): 167 | 168 | if batch_size is None: 169 | batch_size = self.batch_size 170 | hidden = [tf.nn.embedding_lookup(input_features, node_samples) for node_samples in samples] 171 | new_agg = aggregators is None 172 | if new_agg: 173 | aggregators = [] 174 | for layer in range(len(num_samples)): 175 | if new_agg: 176 | dim_mult = 2 if concat and (layer != 0) else 1 177 | if layer == len(num_samples) -1: 178 | aggregator = self.aggregator_cls(dim_mult*dims[layer], dims[layer+1], act=lambda x:x, 179 | dropout=self.placeholders['dropout'], 180 | name=name, concat=concat, model_size=model_size) 181 | else: 182 | aggregator = self.aggregator_cls(dim_mult*dims[layer], dims[layer+1], 183 | dropout=self.placeholders['dropout'], 184 | name=name, concat=concat, model_size=model_size) 185 | aggregators.append(aggregator) 186 | else: 187 | aggregator = aggregators[layer] 188 | 189 | next_hidden = [] 190 | for hop in range(len(num_samples) - layer): 191 | dim_mult = 2 if concat and (layer != 0) else 1 192 | neigh_dims = [batch_size * support_sizes[hop], 193 | num_samples[len(num_samples) - hop -1], 194 | dim_mult*dims[layer]] 195 | h = aggregator((hidden[hop], 196 | tf.reshape(hidden[hop+1], neigh_dims))) 197 | next_hidden.append(h) 198 | hidden = next_hidden 199 | return hidden[0], aggregators 200 | 201 | 202 | def _build_encoder(self, x): 203 | dim_mult = 2 if self.concat else 1 204 | h = layers.Dense(dim_mult* self.dims[-1], dim_mult* self.dims[-1], 205 | 
dropout=self.placeholders['dropout'], act=tf.nn.relu)(x) 206 | mu = layers.Dense(dim_mult* self.dims[-1], dim_mult* self.dims[-1], 207 | dropout=self.placeholders['dropout'])(h) 208 | log_sigma_squared = layers.Dense(dim_mult*self.dims[-1], dim_mult* self.dims[-1], 209 | dropout=self.placeholders['dropout'])(h) 210 | 211 | sigma_squared = tf.exp(log_sigma_squared) 212 | sigma = tf.sqrt(sigma_squared) 213 | return mu, log_sigma_squared, sigma_squared, sigma 214 | 215 | def _build_decoder(self, z): 216 | dim_mult = 2 if self.concat else 1 217 | h = tf.contrib.keras.layers.Dense(dim_mult * self.dims[-1], activation='relu')(z) 218 | y_logit = tf.contrib.keras.layers.Dense(dim_mult * self.dims[-1])(h) 219 | y = tf.sigmoid(y_logit) 220 | return y_logit, y 221 | 222 | 223 | def _build(self): 224 | labels = tf.reshape( 225 | tf.cast(self.placeholders['batch2'], tf.int64), 226 | [self.batch_size, 1]) 227 | 228 | samples, support_sizes = self.sample(self.inputs1, self.layer_infos) 229 | num_samples = [layer_info.num_samples for layer_info in self.layer_infos] 230 | self.hiddens, self.aggregators = self.aggregate(samples, [self.features], self.dims, num_samples, 231 | support_sizes, concat=self.concat, model_size=self.model_size) 232 | 233 | self.hiddens = tf.nn.l2_normalize(self.hiddens, 1) 234 | self.mu, log_sigma_squared, sigma_squared, sigma = self._build_encoder(self.hiddens) 235 | self.z = tf.random_normal([self.dims], mean=self.mu, stddev=sigma) 236 | y_logit, self.y = self._build_decoder(self.z) 237 | 238 | 239 | 240 | def _loss(self): 241 | for aggregator in self.aggregators: 242 | for var in aggregator.vars.values(): 243 | self.loss += FLAGS.weight_decay * tf.nn.l2_loss(var) 244 | 245 | self.loss += self.link_pred_layer.loss(self.outputs1, self.outputs2, self.neg_outputs) 246 | tf.summary.scalar('loss', self.loss) 247 | 248 | 249 | def build(self): 250 | self._build() 251 | self._loss() 252 | self._accuracy() 253 | self.loss = self.loss / tf.cast(self.batch_size, tf.float32) 254 | grads_and_vars = self.optimizer.compute_gradients(self.loss) 255 | clipped_grads_and_vars = [(tf.clip_by_value(grad, -5.0, 5.0) if grad is not None else None, var) 256 | for grad, var in grads_and_vars] 257 | self.grad, _ = clipped_grads_and_vars[0] 258 | self.opt_op = self.optimizer.apply_gradients(clipped_grad_and_vars) 259 | 260 | 261 | -------------------------------------------------------------------------------- /inits.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | # DISCLAIMER: 5 | # Parts of this code file are derived from 6 | # https://github.com/tkipf/gcn 7 | # which is under an identical MIT license as GraphSAGE 8 | 9 | def uniform(shape, scale=0.05, name=None): 10 | """Uniform init.""" 11 | initial = tf.random_uniform(shape, minval=-scale, maxval=scale, dtype=tf.float32) 12 | return tf.Variable(initial, name=name) 13 | 14 | 15 | def glorot(shape, name=None): 16 | """Glorot & Bengio (AISTATS 2010) init.""" 17 | init_range = np.sqrt(6.0/(shape[0]+shape[1])) 18 | initial = tf.random_uniform(shape, minval=-init_range, maxval=init_range, dtype=tf.float32) 19 | return tf.Variable(initial, name=name) 20 | 21 | 22 | def zeros(shape, name=None): 23 | """All zeros.""" 24 | initial = tf.zeros(shape, dtype=tf.float32) 25 | return tf.Variable(initial, name=name) 26 | 27 | def ones(shape, name=None): 28 | """All ones.""" 29 | initial = tf.ones(shape, dtype=tf.float32) 30 | return tf.Variable(initial, name=name) 31 | 
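A minimal usage sketch of the MeanAggregator defined in aggregators.py above, assuming TensorFlow 1.x as used throughout this repo; the toy shapes, random inputs, and printed result are illustrative only and not part of the original code.

import numpy as np
import tensorflow as tf
from aggregators import MeanAggregator

# Toy batch: 2 nodes, each with 3 sampled neighbors, feature dimension 16.
self_vecs = tf.constant(np.random.rand(2, 16), dtype=tf.float32)
neigh_vecs = tf.constant(np.random.rand(2, 3, 16), dtype=tf.float32)

# concat=True concatenates the transformed self and neighbor parts,
# so the output dimension is 2 * output_dim; concat=False sums them instead.
agg = MeanAggregator(input_dim=16, output_dim=8, concat=True)
out = agg((self_vecs, neigh_vecs))

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(out).shape)  # (2, 16)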
-------------------------------------------------------------------------------- /layers.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | 4 | import tensorflow as tf 5 | from inits import zeros 6 | 7 | flags = tf.app.flags 8 | FLAGS = flags.FLAGS 9 | 10 | _LAYER_UIDS = {} 11 | def get_layer_uid(layer_name=''): 12 | if layer_name not in _LAYER_UIDS: 13 | _LAYER_UIDS[layer_name] = 1 14 | return 1 15 | else: 16 | _LAYER_UIDS[layer_name] += 1 17 | return _LAYER_UIDS[layer_name] 18 | 19 | class Layer(object): 20 | 21 | def __init__(self, **kwargs): 22 | 23 | allowed_kwargs = {'name', 'logging', 'model_size'} 24 | for kwarg in kwargs.keys(): 25 | assert kwarg in allowed_kwargs, 'Invalid keyword argument: ' + kwarg 26 | name = kwargs.get('name') 27 | if not name: 28 | layer = self.__class__.__name__.lower() 29 | name = layer + '_' + str(get_layer_uid(layer)) 30 | self.name = name 31 | self.vars = {} 32 | logging = kwargs.get('logging', False) 33 | self.logging = logging 34 | self.sparse_inputs = False 35 | 36 | 37 | def _call(self, inputs): 38 | return inputs 39 | 40 | def __call__(self, inputs): 41 | with tf.name_scope(self.name): 42 | if self.logging and not self.sparse_inputs: 43 | tf.summary.histogram(self.name + '/inputs', inputs) 44 | outputs = self._call(inputs) 45 | if self.logging: 46 | tf.summary.histogram(self.name + '/outputs', outputs) 47 | return outputs 48 | 49 | def _log_vars(self): 50 | for var in self.vars: 51 | tf.summary.histogram(self.name + '/vars/' + var, self.vars[var]) 52 | 53 | 54 | class Dense(Layer): 55 | 56 | def __init__(self, input_dim, output_dim, dropout=0., 57 | act=None, placeholders=None, bias=True, featureless=False, 58 | sparse_inputs=False, **kwargs): 59 | super(Dense, self).__init__(**kwargs) 60 | 61 | self.dropout = dropout 62 | 63 | self.act = act 64 | self.featureless = featureless 65 | self.bias = bias 66 | self.input_dim = input_dim 67 | self.output_dim = output_dim 68 | 69 | self.sparse_inputs = sparse_inputs 70 | if sparse_inputs: 71 | self.num_features_nonzero = placeholders['num_features_nonzero'] 72 | 73 | with tf.variable_scope(self.name + '_vars'): 74 | self.vars['weights'] = tf.get_variable('weights', shape=(input_dim, output_dim), 75 | dtype=tf.float32, 76 | initializer=tf.contrib.layers.xavier_initializer(), 77 | regularizer=tf.contrib.layers.l2_regularizer(FLAGS.weight_decay)) 78 | if self.bias: 79 | self.vars['bias'] = zeros([output_dim], name='bias') 80 | 81 | if self.logging: 82 | self._log_vars() 83 | 84 | def _call(self, inputs): 85 | x = inputs 86 | x = tf.nn.dropout(x, 1-self.dropout) 87 | 88 | output = tf.matmul(x, self.vars['weights']) 89 | 90 | if self.bias: 91 | output += self.vars['bias'] 92 | if self.act is None: 93 | return output 94 | return self.act(output) 95 | 96 | -------------------------------------------------------------------------------- /log/sup-sequential/max_small_0.0010/events.out.tfevents.1561206683.zhouxuechaodeMacBook-Pro.local: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ericZhao93/BurstGraph/dd743b322c9b6dc666a8599d8fe9fb901bd83258/log/sup-sequential/max_small_0.0010/events.out.tfevents.1561206683.zhouxuechaodeMacBook-Pro.local -------------------------------------------------------------------------------- /log/sup-sequential/max_small_0.0010/events.out.tfevents.1561209809.zhouxuechaodeMacBook-Pro.local: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ericZhao93/BurstGraph/dd743b322c9b6dc666a8599d8fe9fb901bd83258/log/sup-sequential/max_small_0.0010/events.out.tfevents.1561209809.zhouxuechaodeMacBook-Pro.local -------------------------------------------------------------------------------- /log/sup-sequential/max_small_0.0010/events.out.tfevents.1561263709.zhouxuechaodeMacBook-Pro.local: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ericZhao93/BurstGraph/dd743b322c9b6dc666a8599d8fe9fb901bd83258/log/sup-sequential/max_small_0.0010/events.out.tfevents.1561263709.zhouxuechaodeMacBook-Pro.local -------------------------------------------------------------------------------- /log/sup-sequential/max_small_0.0010/events.out.tfevents.1561263857.zhouxuechaodeMacBook-Pro.local: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ericZhao93/BurstGraph/dd743b322c9b6dc666a8599d8fe9fb901bd83258/log/sup-sequential/max_small_0.0010/events.out.tfevents.1561263857.zhouxuechaodeMacBook-Pro.local -------------------------------------------------------------------------------- /log/sup-sequential/max_small_0.0010/events.out.tfevents.1561272163.zhouxuechaodeMacBook-Pro.local: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ericZhao93/BurstGraph/dd743b322c9b6dc666a8599d8fe9fb901bd83258/log/sup-sequential/max_small_0.0010/events.out.tfevents.1561272163.zhouxuechaodeMacBook-Pro.local -------------------------------------------------------------------------------- /log/sup-sequential/max_small_0.0010/val_stats.txt: -------------------------------------------------------------------------------- 1 | loss=0.53212 old_f1_micro= 0.79474 old_f1_macro= 0.77321 new_f1_micro= 0.76973 new_f1_macro= 0.75011 -------------------------------------------------------------------------------- /log/sup-sequential/mean_small_0.0010/events.out.tfevents.1561196841.zhouxuechaodeMacBook-Pro.local: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ericZhao93/BurstGraph/dd743b322c9b6dc666a8599d8fe9fb901bd83258/log/sup-sequential/mean_small_0.0010/events.out.tfevents.1561196841.zhouxuechaodeMacBook-Pro.local -------------------------------------------------------------------------------- /log/sup-sequential/mean_small_0.0010/val_stats.txt: -------------------------------------------------------------------------------- 1 | loss=0.38652 old_f1_micro= 0.79622 old_f1_macro= 0.77377 new_f1_micro= 0.76678 new_f1_macro= 0.74107 -------------------------------------------------------------------------------- /minibatch.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | 4 | import numpy as np 5 | 6 | 7 | np.random.seed(123) 8 | 9 | import tensorflow as tf 10 | flags = tf.app.flags 11 | FLAGS = flags.FLAGS 12 | 13 | class EdgeMinibatchIterator(object): 14 | 15 | """ This minibatch iterator iterates over batches of sampled edges or 16 | random pairs of co-occuring edges. 
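    Edges marked 'train_removed' in G are held out as validation edges; edges whose endpoints are missing from the graph, or whose non-test/val endpoints have no training neighbors, are dropped from the training split.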
17 | 18 | G -- networkx graph 19 | id2idx -- dict mapping node ids to index in feature tensor 20 | placeholders -- tensorflow placeholders object 21 | context_pairs -- if not none, then a list of co-occuring node pairs (from random walks) 22 | batch_size -- size of the minibatches 23 | max_degree -- maximum size of the downsampled adjacency lists 24 | n2v_retrain -- signals that the iterator is being used to add new embeddings to a n2v model 25 | fixed_n2v -- signals that the iterator is being used to retrain n2v with only existing nodes as context 26 | """ 27 | def __init__(self, G, id2idx, 28 | placeholders, context_pairs=None, batch_size=100, max_degree=25, 29 | n2v_retrain=False, fixed_n2v=False, 30 | **kwargs): 31 | 32 | self.G = G 33 | self.nodes = G.nodes() 34 | self.id2idx = id2idx 35 | self.placeholders = placeholders 36 | self.batch_size = batch_size 37 | self.max_degree = max_degree 38 | self.batch_num = 0 39 | 40 | self.nodes = np.random.permutation(G.nodes()) 41 | self.adj, self.deg = self.construct_adj() 42 | self.test_adj = self.construct_test_adj() 43 | if context_pairs is None: 44 | edges = G.edges() 45 | else: 46 | edges = context_pairs 47 | self.train_edges = self.edges = np.random.permutation(edges) 48 | if not n2v_retrain: 49 | self.train_edges = self._remove_isolated(self.train_edges) 50 | self.val_edges = [e for e in G.edges() if G[e[0]][e[1]]['train_removed']] 51 | else: 52 | if fixed_n2v: 53 | self.train_edges = self.val_edges = self._n2v_prune(self.edges) 54 | else: 55 | self.train_edges = self.val_edges = self.edges 56 | 57 | print(len([n for n in G.nodes() if not G.node[n]['test'] and not G.node[n]['val']]), 'train nodes') 58 | print(len([n for n in G.nodes() if G.node[n]['test'] or G.node[n]['val']]), 'test nodes') 59 | self.val_set_size = len(self.val_edges) 60 | 61 | def _n2v_prune(self, edges): 62 | is_val = lambda n : self.G.node[n]["val"] or self.G.node[n]["test"] 63 | return [e for e in edges if not is_val(e[1])] 64 | 65 | def _remove_isolated(self, edge_list): 66 | new_edge_list = [] 67 | missing = 0 68 | for n1, n2 in edge_list: 69 | if not n1 in self.G.node or not n2 in self.G.node: 70 | missing += 1 71 | continue 72 | if (self.deg[self.id2idx[n1]] == 0 or self.deg[self.id2idx[n2]] == 0) \ 73 | and (not self.G.node[n1]['test'] or self.G.node[n1]['val']) \ 74 | and (not self.G.node[n2]['test'] or self.G.node[n2]['val']): 75 | continue 76 | else: 77 | new_edge_list.append((n1,n2)) 78 | print("Unexpected missing:", missing) 79 | return new_edge_list 80 | 81 | def construct_adj(self): 82 | adj = len(self.id2idx)*np.ones((len(self.id2idx)+1, self.max_degree)) 83 | deg = np.zeros((len(self.id2idx),)) 84 | 85 | for nodeid in self.G.nodes(): 86 | if self.G.node[nodeid]['test'] or self.G.node[nodeid]['val']: 87 | continue 88 | neighbors = np.array([self.id2idx[neighbor] 89 | for neighbor in self.G.neighbors(nodeid) 90 | if (not self.G[nodeid][neighbor]['train_removed'])]) 91 | deg[self.id2idx[nodeid]] = len(neighbors) 92 | if len(neighbors) == 0: 93 | continue 94 | if len(neighbors) > self.max_degree: 95 | neighbors = np.random.choice(neighbors, self.max_degree, replace=False) 96 | elif len(neighbors) < self.max_degree: 97 | neighbors = np.random.choice(neighbors, self.max_degree, replace=True) 98 | adj[self.id2idx[nodeid], :] = neighbors 99 | return adj, deg 100 | 101 | def construct_test_adj(self): 102 | adj = len(self.id2idx)*np.ones((len(self.id2idx)+1, self.max_degree)) 103 | for nodeid in self.G.nodes(): 104 | neighbors = np.array([self.id2idx[neighbor] 
105 | for neighbor in self.G.neighbors(nodeid)]) 106 | if len(neighbors) == 0: 107 | continue 108 | if len(neighbors) > self.max_degree: 109 | neighbors = np.random.choice(neighbors, self.max_degree, replace=False) 110 | elif len(neighbors) < self.max_degree: 111 | neighbors = np.random.choice(neighbors, self.max_degree, replace=True) 112 | adj[self.id2idx[nodeid], :] = neighbors 113 | return adj 114 | 115 | def end(self): 116 | return self.batch_num * self.batch_size >= len(self.train_edges) 117 | 118 | def batch_feed_dict(self, batch_edges): 119 | batch1 = [] 120 | batch2 = [] 121 | for node1, node2 in batch_edges: 122 | batch1.append(self.id2idx[node1]) 123 | batch2.append(self.id2idx[node2]) 124 | 125 | feed_dict = dict() 126 | feed_dict.update({self.placeholders['batch_size'] : len(batch_edges)}) 127 | feed_dict.update({self.placeholders['batch1']: batch1}) 128 | feed_dict.update({self.placeholders['batch2']: batch2}) 129 | 130 | return feed_dict 131 | 132 | def next_minibatch_feed_dict(self): 133 | start_idx = self.batch_num * self.batch_size 134 | self.batch_num += 1 135 | end_idx = min(start_idx + self.batch_size, len(self.train_edges)) 136 | batch_edges = self.train_edges[start_idx : end_idx] 137 | return self.batch_feed_dict(batch_edges) 138 | 139 | def num_training_batches(self): 140 | return len(self.train_edges) // self.batch_size + 1 141 | 142 | def val_feed_dict(self, size=None): 143 | edge_list = self.val_edges 144 | if size is None: 145 | return self.batch_feed_dict(edge_list) 146 | else: 147 | ind = np.random.permutation(len(edge_list)) 148 | val_edges = [edge_list[i] for i in ind[:min(size, len(ind))]] 149 | return self.batch_feed_dict(val_edges) 150 | 151 | def incremental_val_feed_dict(self, size, iter_num): 152 | edge_list = self.val_edges 153 | val_edges = edge_list[iter_num*size:min((iter_num+1)*size, 154 | len(edge_list))] 155 | return self.batch_feed_dict(val_edges), (iter_num+1)*size >= len(self.val_edges), val_edges 156 | 157 | def incremental_embed_feed_dict(self, size, iter_num): 158 | node_list = self.nodes 159 | val_nodes = node_list[iter_num*size:min((iter_num+1)*size, 160 | len(node_list))] 161 | val_edges = [(n,n) for n in val_nodes] 162 | return self.batch_feed_dict(val_edges), (iter_num+1)*size >= len(node_list), val_edges 163 | 164 | def label_val(self): 165 | train_edges = [] 166 | val_edges = [] 167 | for n1, n2 in self.G.edges(): 168 | if (self.G.node[n1]['val'] or self.G.node[n1]['test'] 169 | or self.G.node[n2]['val'] or self.G.node[n2]['test']): 170 | val_edges.append((n1,n2)) 171 | else: 172 | train_edges.append((n1,n2)) 173 | return train_edges, val_edges 174 | 175 | def shuffle(self): 176 | """ Re-shuffle the training set. 177 | Also reset the batch number. 178 | """ 179 | self.train_edges = np.random.permutation(self.train_edges) 180 | self.nodes = np.random.permutation(self.nodes) 181 | self.batch_num = 0 182 | 183 | class NodeMinibatchIterator(object): 184 | 185 | """ 186 | This minibatch iterator iterates over nodes for supervised learning. 
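    When FLAGS.two_channel is set, each node is fed separate 'old' and 'new' label channels from label_map (labels_rec / labels_abn) together with per-class loss weights; otherwise a single combined label vector is used.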
187 | 188 | G -- networkx graph 189 | id2idx -- dict mapping node ids to integer values indexing feature tensor 190 | placeholders -- standard tensorflow placeholders object for feeding 191 | label_map -- map from node ids to class values (integer or list) 192 | num_classes -- number of output classes 193 | batch_size -- size of the minibatches 194 | max_degree -- maximum size of the downsampled adjacency lists 195 | """ 196 | def __init__(self, G, id2idx, 197 | placeholders, label_map, num_classes, 198 | batch_size=100, max_degree=25, 199 | **kwargs): 200 | 201 | self.G = G 202 | self.nodes = G.nodes() 203 | self.id2idx = id2idx 204 | self.placeholders = placeholders 205 | self.batch_size = batch_size 206 | self.max_degree = max_degree 207 | self.batch_num = 0 208 | self.label_map = label_map 209 | self.num_classes = num_classes 210 | 211 | self.adj, self.deg = self.construct_adj() 212 | self.test_adj = self.construct_test_adj() 213 | 214 | #self.val_nodes = [n for n in self.G.nodes() if self.G.node[n]['val']] 215 | self.test_nodes = [n for n in self.G.nodes() if self.G.node[n]["user"] and self.G.node[n]['test']] 216 | 217 | self.train_nodes = [n for n in self.G.nodes() if self.G.node[n]["user"] and not self.G.node[n]["test"]] 218 | # don't train on nodes that only have edges to test set 219 | self.train_nodes = [n for n in self.train_nodes if self.deg[id2idx[n]] > 0] 220 | 221 | 222 | def _make_label_vec(self, node): 223 | label = self.label_map[node] 224 | if isinstance(label, list): 225 | label_vec = np.array(label) 226 | elif isinstance(label, dict): 227 | label_vec = np.array(label['old']) + np.array(label['new']) 228 | else: 229 | label_vec = np.zeros((self.num_classes)) 230 | class_ind = self.label_map[node] 231 | label_vec[class_ind] = 1 232 | return label_vec 233 | 234 | def _make_weight_vec(self, node): 235 | label = self.label_map[node] 236 | if isinstance(label, list): 237 | weight_vec = np.array([FLAGS.weight_value if value >= 0.5 else 1.0 for value in label]) 238 | elif isinstance(label, dict): 239 | weight_vec = np.array([FLAGS.weight_value if label['old'][idx] + label['new'][idx] >= 0.5 else 1.0 for idx in range(len(label['old']))]) 240 | return weight_vec 241 | 242 | def _make_weights(self, vec, value): 243 | return np.array([value if vec[idx] >= 0.5 else 1.0 for idx in range(len(vec))]) 244 | 245 | def construct_adj(self): 246 | adj = len(self.id2idx)*np.ones((len(self.id2idx)+1, self.max_degree)) 247 | deg = np.zeros((len(self.id2idx),)) 248 | 249 | for nodeid in self.G.nodes(): 250 | if self.G.node[nodeid]['user'] and self.G.node[nodeid]['test']: 251 | continue 252 | neighbors = np.array([self.id2idx[neighbor] 253 | for neighbor in self.G.neighbors(nodeid) 254 | if (not self.G[nodeid][neighbor]['train_removed'])]) 255 | deg[self.id2idx[nodeid]] = len(neighbors) 256 | if len(neighbors) == 0: 257 | continue 258 | if len(neighbors) > self.max_degree: 259 | neighbors = np.random.choice(neighbors, self.max_degree, replace=False) 260 | elif len(neighbors) < self.max_degree: 261 | neighbors = np.random.choice(neighbors, self.max_degree, replace=True) 262 | adj[self.id2idx[nodeid], :] = neighbors 263 | return adj, deg 264 | 265 | def construct_test_adj(self): 266 | adj = len(self.id2idx)*np.ones((len(self.id2idx)+1, self.max_degree)) 267 | for nodeid in self.G.nodes(): 268 | neighbors = np.array([self.id2idx[neighbor] 269 | for neighbor in self.G.neighbors(nodeid)]) 270 | if len(neighbors) == 0: 271 | continue 272 | if len(neighbors) > self.max_degree: 273 | neighbors = 
np.random.choice(neighbors, self.max_degree, replace=False) 274 | elif len(neighbors) < self.max_degree: 275 | neighbors = np.random.choice(neighbors, self.max_degree, replace=True) 276 | adj[self.id2idx[nodeid], :] = neighbors 277 | return adj 278 | 279 | def end(self): 280 | return self.batch_num * self.batch_size >= len(self.train_nodes) 281 | 282 | def batch_feed_dict(self, batch_nodes, val=False): 283 | batch1id = batch_nodes 284 | batch1 = [self.id2idx[n] for n in batch1id] 285 | if not FLAGS.two_channel: 286 | labels = np.vstack([self._make_label_vec(node) for node in batch1id]) 287 | weights = np.vstack([self._make_weight_vec(node) for node in batch1id]) 288 | feed_dict = dict() 289 | feed_dict.update({self.placeholders['batch_size'] : len(batch1)}) 290 | feed_dict.update({self.placeholders['batch']: batch1}) 291 | feed_dict.update({self.placeholders['labels']: labels}) 292 | feed_dict.update({self.placeholders['weights']: weights}) 293 | else: 294 | labels_rec = np.vstack([ np.array(self.label_map[node]['old']) for node in batch1id]) 295 | weights_rec = np.vstack([self._make_weights(self.label_map[node]['old'], FLAGS.weight_value) for node in batch1id]) 296 | labels_abn = np.vstack([ np.array(self.label_map[node]['new']) for node in batch1id]) 297 | weights_abn = np.vstack([self._make_weights(self.label_map[node]['new'], FLAGS.weight_value*30) for node in batch1id]) 298 | feed_dict = dict() 299 | feed_dict.update({self.placeholders['batch_size']: len(batch1)}) 300 | feed_dict.update({self.placeholders['batch']: batch1}) 301 | feed_dict.update({self.placeholders['labels_rec']: labels_rec}) 302 | feed_dict.update({self.placeholders['labels_abn']: labels_abn}) 303 | feed_dict.update({self.placeholders['weights_rec']: weights_rec}) 304 | feed_dict.update({self.placeholders['weights_abn']: weights_abn}) 305 | labels = labels_rec + labels_abn 306 | return feed_dict, labels 307 | 308 | def node_val_feed_dict(self, size=None, test=False): 309 | #if test: 310 | # val_nodes = self.test_nodes 311 | #else: 312 | # val_nodes = self.val_nodes 313 | val_nodes = self.test_nodes 314 | if not size is None: 315 | val_nodes = np.random.choice(val_nodes, size, replace=True) 316 | # add a dummy neighbor 317 | ret_val = self.batch_feed_dict(val_nodes) 318 | return ret_val[0], ret_val[1] 319 | 320 | 321 | def incremental_node_val_feed_dict(self, size, iter_num, test=False): 322 | #if test: 323 | # val_nodes = self.test_nodes 324 | #else: 325 | # val_nodes = self.val_nodes 326 | val_nodes = self.test_nodes 327 | val_node_subset = val_nodes[iter_num*size:min((iter_num+1)*size, 328 | len(val_nodes))] 329 | 330 | # add a dummy neighbor 331 | ret_val = self.batch_feed_dict(val_node_subset) 332 | return ret_val[0], ret_val[1], (iter_num+1)*size >= len(val_nodes), val_node_subset 333 | 334 | def num_training_batches(self): 335 | return len(self.train_nodes) // self.batch_size + 1 336 | 337 | def next_minibatch_feed_dict(self): 338 | start_idx = self.batch_num * self.batch_size 339 | self.batch_num += 1 340 | end_idx = min(start_idx + self.batch_size, len(self.train_nodes)) 341 | batch_nodes = self.train_nodes[start_idx : end_idx] 342 | return self.batch_feed_dict(batch_nodes) 343 | 344 | def incremental_embed_feed_dict(self, size, iter_num): 345 | node_list = self.nodes 346 | val_nodes = node_list[iter_num*size:min((iter_num+1)*size, 347 | len(node_list))] 348 | return self.batch_feed_dict(val_nodes), (iter_num+1)*size >= len(node_list), val_nodes 349 | 350 | def shuffle(self): 351 | """ Re-shuffle the training 
set. 352 | Also reset the batch number. 353 | """ 354 | self.train_nodes = np.random.permutation(self.train_nodes) 355 | self.batch_num = 0 356 | 357 | class SeqNodeMinibatchIterator(object): 358 | 359 | """ 360 | This minibatch iterator iterates over nodes for supervised learning. 361 | 362 | G -- networkx graph in each time steps 363 | id2idx -- dict mapping node ids to integer values indexing feature tensor 364 | placeholders -- standard tensorflow placeholders object for feeding 365 | label_map -- map from node ids to class values (integer or list) 366 | num_classes -- number of output classes 367 | batch_size -- size of the minibatches 368 | max_degree -- maximum size of the downsampled adjacency lists 369 | """ 370 | def __init__(self, G, id2idx, 371 | placeholders, label_map, num_classes, 372 | batch_size=100, max_degree=25, num_steps=5, 373 | **kwargs): 374 | 375 | self.G = G 376 | self.nodes = self._get_all_nodes(G) 377 | self.id2idx = id2idx 378 | self.placeholders = placeholders 379 | self.batch_size = batch_size 380 | self.max_degree = max_degree 381 | self.batch_num = 0 382 | self.num_steps = num_steps 383 | self.label_map = label_map 384 | self.num_classes = num_classes 385 | 386 | self.adj, self.deg = self.construct_adj() 387 | self.test_adj = self.construct_test_adj() 388 | 389 | #self.val_nodes = [n for n in self.G.nodes() if self.G.node[n]['val']] 390 | self.test_nodes = self._get_test_nodes(G) 391 | 392 | self.train_nodes = self._get_train_nodes(G) 393 | # don't train on nodes that only have edges to test set 394 | 395 | def _get_all_nodes(self, Gs): 396 | node_set = set([]) 397 | node_list = [] 398 | for G in Gs: 399 | for node in G.nodes(): 400 | if not node in node_set: 401 | node_set.add(node) 402 | node_list.append(node) 403 | return node_list 404 | 405 | def _get_train_nodes(self, Gs): 406 | node_set = set([]) 407 | node_list = [] 408 | for G in Gs: 409 | for node in G.nodes(): 410 | if G.node[node]["user"] and not G.node[node]["test"] and not node in node_set: 411 | node_list.append(node) 412 | node_set.add(node) 413 | return node_list 414 | 415 | def _get_test_nodes(self, Gs): 416 | node_set = set([]) 417 | node_list = [] 418 | for G in Gs: 419 | for node in G.nodes(): 420 | if G.node[node]["user"] and G.node[node]["test"] and not node in node_set: 421 | node_list.append(node) 422 | node_set.add(node) 423 | return node_list 424 | 425 | 426 | def _make_label_vec(self, node): 427 | label = self.label_map[node] 428 | if isinstance(label, list): 429 | label_vec = np.array(label) 430 | elif isinstance(label, dict): 431 | label_vec = np.array(label['old']) + np.array(label['new']) 432 | else: 433 | label_vec = np.zeros((self.num_classes)) 434 | class_ind = self.label_map[node] 435 | label_vec[class_ind] = 1 436 | return label_vec 437 | 438 | def _make_weight_vec(self, node): 439 | label = self.label_map[node] 440 | if isinstance(label, list): 441 | weight_vec = np.array([FLAGS.weight_value if value >= 0.5 else 1.0 for value in label]) 442 | elif isinstance(label, dict): 443 | weight_vec = np.array([FLAGS.weight_value if label['old'][idx] + label['new'][idx] >= 0.5 else 1.0 for idx in range(len(label['old']))]) 444 | return weight_vec 445 | 446 | def _make_weights(self, vec, value): 447 | return np.array([value if vec[idx] >= 0.5 else 1.0 for idx in range(len(vec))]) 448 | 449 | def construct_adj(self): 450 | adj = [len(self.id2idx)*np.ones((len(self.id2idx)+1, self.max_degree)) for i in range(self.num_steps)] 451 | deg = [np.zeros((len(self.id2idx),)) for i in 
range(self.num_steps)] 452 | 453 | for Gid in range(self.num_steps): 454 | for nodeid in self.G[Gid].nodes(): 455 | if self.G[Gid].node[nodeid]['user'] and self.G[Gid].node[nodeid]['test']: 456 | continue 457 | neighbors = np.array([self.id2idx[neighbor] 458 | for neighbor in self.G[Gid].neighbors(nodeid) 459 | if (not self.G[Gid][nodeid][neighbor]['train_removed'])]) 460 | deg[Gid][self.id2idx[nodeid]] = len(neighbors) 461 | if len(neighbors) == 0: 462 | continue 463 | if len(neighbors) > self.max_degree: 464 | neighbors = np.random.choice(neighbors, self.max_degree, replace=False) 465 | elif len(neighbors) < self.max_degree: 466 | neighbors = np.random.choice(neighbors, self.max_degree, replace=True) 467 | adj[Gid][self.id2idx[nodeid], :] = neighbors 468 | return adj, deg 469 | 470 | def construct_test_adj(self): 471 | adj = [len(self.id2idx)*np.ones((len(self.id2idx)+1, self.max_degree)) for i in range(self.num_steps)] 472 | for Gid in range(self.num_steps): 473 | for nodeid in self.G[Gid].nodes(): 474 | neighbors = np.array([self.id2idx[neighbor] 475 | for neighbor in self.G[Gid].neighbors(nodeid)]) 476 | if len(neighbors) == 0: 477 | continue 478 | if len(neighbors) > self.max_degree: 479 | neighbors = np.random.choice(neighbors, self.max_degree, replace=False) 480 | elif len(neighbors) < self.max_degree: 481 | neighbors = np.random.choice(neighbors, self.max_degree, replace=True) 482 | adj[Gid][self.id2idx[nodeid], :] = neighbors 483 | return adj 484 | 485 | def end(self): 486 | return self.batch_num * self.batch_size >= len(self.train_nodes) 487 | 488 | def batch_feed_dict(self, batch_nodes, val=False): 489 | batch1id = batch_nodes 490 | batch1 = [self.id2idx[n] for n in batch1id] 491 | if not FLAGS.two_channel: 492 | labels = np.vstack([self._make_label_vec(node) for node in batch1id]) 493 | weights = np.vstack([self._make_weight_vec(node) for node in batch1id]) 494 | feed_dict = dict() 495 | feed_dict.update({self.placeholders['batch_size'] : len(batch1)}) 496 | feed_dict.update({self.placeholders['batch']: batch1}) 497 | feed_dict.update({self.placeholders['labels']: labels}) 498 | feed_dict.update({self.placeholders['weights']: weights}) 499 | else: 500 | labels_rec = np.array([ [np.array(self.label_map[step][node]['old']) for step in range(self.num_steps)] for node in batch1id]) 501 | weights_rec = np.array([[self._make_weights(self.label_map[step][node]['old'], FLAGS.weight_value) for step in range(self.num_steps)] for node in batch1id]) 502 | labels_abn = np.array([ [np.array(self.label_map[step][node]['new']) for step in range(self.num_steps)] for node in batch1id]) 503 | weights_abn = np.array([[self._make_weights(self.label_map[step][node]['new'], FLAGS.weight_value*20) for step in range(self.num_steps)] for node in batch1id]) 504 | feed_dict = dict() 505 | feed_dict.update({self.placeholders['batch_size']: len(batch1)}) 506 | feed_dict.update({self.placeholders['batch']: batch1}) 507 | feed_dict.update({self.placeholders['labels_rec']: labels_rec}) 508 | feed_dict.update({self.placeholders['labels_abn']: labels_abn}) 509 | feed_dict.update({self.placeholders['weights_rec']: weights_rec}) 510 | feed_dict.update({self.placeholders['weights_abn']: weights_abn}) 511 | labels = labels_rec + labels_abn 512 | return feed_dict, labels[:, -1, :] 513 | 514 | def node_val_feed_dict(self, size=None, test=False): 515 | #if test: 516 | # val_nodes = self.test_nodes 517 | #else: 518 | # val_nodes = self.val_nodes 519 | val_nodes = self.test_nodes 520 | if not size is None: 521 | 
val_nodes = np.random.choice(val_nodes, size, replace=True) 522 | # add a dummy neighbor 523 | ret_val = self.batch_feed_dict(val_nodes) 524 | return ret_val[0], ret_val[1] 525 | 526 | 527 | def incremental_node_val_feed_dict(self, size, iter_num, test=False): 528 | #if test: 529 | # val_nodes = self.test_nodes 530 | #else: 531 | # val_nodes = self.val_nodes 532 | val_nodes = self.test_nodes 533 | val_node_subset = val_nodes[iter_num*size:min((iter_num+1)*size, 534 | len(val_nodes))] 535 | 536 | # add a dummy neighbor 537 | ret_val = self.batch_feed_dict(val_node_subset) 538 | return ret_val[0], ret_val[1], (iter_num+1)*size >= len(val_nodes), val_node_subset 539 | 540 | def num_training_batches(self): 541 | return len(self.train_nodes) // self.batch_size + 1 542 | 543 | def next_minibatch_feed_dict(self): 544 | start_idx = self.batch_num * self.batch_size 545 | self.batch_num += 1 546 | end_idx = min(start_idx + self.batch_size, len(self.train_nodes)) 547 | batch_nodes = self.train_nodes[start_idx : end_idx] 548 | return self.batch_feed_dict(batch_nodes) 549 | 550 | def incremental_embed_feed_dict(self, size, iter_num): 551 | node_list = self.nodes 552 | val_nodes = node_list[iter_num*size:min((iter_num+1)*size, 553 | len(node_list))] 554 | return self.batch_feed_dict(val_nodes), (iter_num+1)*size >= len(node_list), val_nodes 555 | 556 | def shuffle(self): 557 | """ Re-shuffle the training set. 558 | Also reset the batch number. 559 | """ 560 | self.train_nodes = np.random.permutation(self.train_nodes) 561 | self.batch_num = 0 562 | -------------------------------------------------------------------------------- /neigh_samplers.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | 4 | from layers import Layer 5 | 6 | import tensorflow as tf 7 | flags = tf.app.flags 8 | FLAGS = flags.FLAGS 9 | 10 | 11 | """ 12 | Classes that are used to sample node neighborhoods 13 | """ 14 | 15 | class UniformNeighborSampler(Layer): 16 | """ 17 | Uniformly samples neighbors. 
18 | Assumes that adj lists are padded with random re-sampling 19 | """ 20 | def __init__(self, adj_info, **kwargs): 21 | super(UniformNeighborSampler, self).__init__(**kwargs) 22 | self.adj_info = adj_info 23 | 24 | def _call(self, inputs): 25 | ids, num_samples = inputs 26 | adj_lists = tf.nn.embedding_lookup(self.adj_info, ids) 27 | adj_lists = tf.transpose(tf.random_shuffle(tf.transpose(adj_lists))) 28 | adj_lists = tf.slice(adj_lists, [0,0], [-1, num_samples]) 29 | return adj_lists 30 | 31 | 32 | 33 | class SeqUniformNeighborSampler(Layer): 34 | 35 | def __init__(self, adj_info, **kwargs): 36 | super(SeqUniformNeighborSampler, self).__init__(**kwargs) 37 | self.adj_info = tf.unstack(adj_info, axis=0) 38 | print("adj_info :", tf.shape(self.adj_info)) 39 | 40 | def _call(self, inputs): 41 | ids, num_samples, aid = inputs 42 | print(aid) 43 | adj_lists = tf.nn.embedding_lookup(self.adj_info[aid], ids) 44 | adj_lists = tf.transpose(tf.random_shuffle(tf.transpose(adj_lists))) 45 | adj_lists = tf.slice(adj_lists, [0,0], [-1, num_samples]) 46 | return adj_lists 47 | -------------------------------------------------------------------------------- /readFile.py: -------------------------------------------------------------------------------- 1 | import json 2 | import networkx as nx 3 | from networkx.readwrite import json_graph 4 | 5 | with open('../dataset/graph_v1.json', 'r') as rf: 6 | data = json.loads(rf.readline()) 7 | G = json_graph.node_link_graph(data) 8 | broken_count = 0 9 | for node in G.nodes(): 10 | if not 'val' in G.node[node] or not 'test' in G.node[node]: 11 | broken_count += 1 12 | G.remove_node(node) 13 | print(broken_count) 14 | -------------------------------------------------------------------------------- /supervised_model.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | import graphsage as models 4 | import layers as layers 5 | from aggregators import MeanAggregator, MaxPoolingAggregator, MeanPoolingAggregator 6 | 7 | 8 | flags = tf.app.flags 9 | FLAGS = flags.FLAGS 10 | 11 | class SupervisedGraphsage(models.SampleAndAggregate): 12 | 13 | def __init__(self, num_classes, 14 | placeholders, features, adj, degrees, 15 | layer_infos, concat=True, aggregator_type="mean", 16 | model_size="small", sigmoid_loss=False, identity_dim=0, 17 | **kwargs): 18 | 19 | models.GeneralizedModel.__init__(self, **kwargs) 20 | 21 | if aggregator_type == "mean": 22 | self.aggregator_cls = MeanAggregator 23 | elif aggregator_type == "meanpool": 24 | self.aggregator_cls = MeanPoolingAggregator 25 | elif aggregator_type == "maxpool": 26 | self.aggregator_cls = MaxPoolingAggregator 27 | else: 28 | raise Exception("Unknown aggregator:", self.aggregator_cls) 29 | 30 | self.inputs = placeholders["batch"] 31 | self.model_size = model_size 32 | self.adj_info = adj 33 | if identity_dim > 0: 34 | self.embeds = tf.get_variable("node_embeddings", [adj.get_shape().as_list()[0], identity_dim]) 35 | else: 36 | self.embeds = None 37 | if features is None: 38 | if identity_dim == 0: 39 | raise Exception("identity feature dimension must be positive") 40 | self.features = self.embeds 41 | else: 42 | self.features = tf.Variable(tf.constant(features, dtype=tf.float32), trainable=False) 43 | if not self.embeds is None: 44 | self.features = tf.concat([self.embeds, self.features], axis=1) 45 | self.degrees = degrees 46 | self.concat = concat 47 | self.num_classes = num_classes 48 | self.dims = [ (0 if features is None else features.shape[1]) + 
identity_dim] 49 | self.dims.extend([layer_infos[i].output_dim for i in range(len(layer_infos))]) 50 | self.batch_size = placeholders["batch_size"] 51 | self.placeholders = placeholders 52 | self.layer_infos = layer_infos 53 | 54 | self.optimizer = tf.train.AdamOptimizer(learning_rate=FLAGS.learning_rate) 55 | 56 | self.build() 57 | 58 | def build(self): 59 | samples, support_size = self.sample(self.inputs, self.layer_infos) 60 | num_samples = [layer_info.num_samples for layer_info in self.layer_infos] 61 | self.hiddens, self.aggregators = self.aggregate(samples, [self.features], self.dims, num_samples, 62 | support_size, concat=self.concat, model_size=self.model_size) 63 | dim_mult = 2 if self.concat else 1 64 | 65 | self.hiddens = tf.nn.l2_normalize(self.hiddens, 1) 66 | 67 | # VAE 68 | self.mu, log_sigma_squared, sigma_squared, sigma = self._build_encoder(self.hiddens) 69 | self.z = tf.random_normal([dim_mult*self.dims[-1]], mean=self.mu, stddev=sigma) 70 | self.regular_vae = -0.5 * tf.reduce_sum(1 + log_sigma_squared - tf.square(self.mu) - sigma_squared, 1) 71 | self.node_pred = layers.Dense(dim_mult*self.dims[-1], self.num_classes, 72 | dropout=self.placeholders['dropout'], 73 | act=tf.nn.sigmoid) 74 | 75 | self.outputs = self.node_pred(self.z) 76 | 77 | self._loss() 78 | 79 | grads_and_vars = self.optimizer.compute_gradients(self.loss) 80 | clipped_grads_and_vars = [(tf.clip_by_value(grad, -5.0, 5.0) if grad is not None else None, var) 81 | for grad, var in grads_and_vars] 82 | self.grad, _ = clipped_grads_and_vars[0] 83 | self.opt_op = self.optimizer.apply_gradients(clipped_grads_and_vars) 84 | 85 | def _loss(self): 86 | 87 | for aggregator in self.aggregators: 88 | for var in aggregator.vars.values(): 89 | self.loss += FLAGS.weight_decay * tf.nn.l2_loss(var) 90 | for var in self.node_pred.vars.values(): 91 | self.loss += FLAGS.weight_decay * tf.nn.l2_loss(var) 92 | 93 | # multi-class loss 94 | #logits = tf.math.log(self.outputs) 95 | #one_hot_labels = tf.one_hot(self.placeholders['labels'], depth=self.num_classes, dtype=tf.float32, axis=-1) 96 | #multi_loss = tf.einsum('bij,bj->bi', one_hot_labels, logits) 97 | #multi_loss = - tf.einsum('bi,bi->b', multi_loss, self.label_masks) 98 | multi_loss = tf.nn.sigmoid_cross_entropy_with_logits(logits=self.outputs, labels=self.placeholders['labels']) 99 | multi_loss = tf.einsum('bi,bi->b', multi_loss, self.placeholders['weights']) 100 | self.loss += tf.reduce_mean(multi_loss) 101 | self.loss += tf.reduce_mean(self.regular_vae) 102 | 103 | tf.summary.scalar('loss', self.loss) 104 | 105 | def predict(self): 106 | return tf.outputs 107 | 108 | 109 | class TwoChannelGraphsage(models.SampleAndAggregate): 110 | 111 | def __init__(self, num_classes, 112 | placeholders, features, adj, degrees, 113 | layer_infos, concat=True, aggregator_type="mean", 114 | model_size="small", sigmoid_loss=False, identity_dim=0, 115 | **kwargs): 116 | 117 | models.GeneralizedModel.__init__(self, **kwargs) 118 | 119 | if aggregator_type == "mean": 120 | self.aggregator_cls = MeanAggregator 121 | elif aggregator_type == "meanpool": 122 | self.aggregator_cls = MeanPoolingAggregator 123 | elif aggregator_type == "maxpool": 124 | self.aggregator_cls = MaxPoolingAggregator 125 | else: 126 | raise Exception("Unknown aggregator:", self.aggregator_cls) 127 | 128 | self.inputs = placeholders["batch"] 129 | self.model_size = model_size 130 | self.adj_info = adj 131 | self.temperature = 1.0 132 | if identity_dim > 0: 133 | self.embeds = tf.get_variable("node_embeddings", 
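# The VAE head of SupervisedGraphsage above draws z ~ N(mu, sigma^2) and adds the
# KL term -0.5 * sum(1 + log sigma^2 - mu^2 - sigma^2) to the loss (`regular_vae`).
# A NumPy sketch of those two pieces, assuming a diagonal Gaussian posterior; the
# function names and shapes are illustrative only.
import numpy as np

def reparameterize(mu, log_sigma_squared):
    sigma = np.sqrt(np.exp(log_sigma_squared))
    eps = np.random.normal(size=np.shape(mu))
    return mu + sigma * eps  # same distribution as tf.random_normal(mean=mu, stddev=sigma)

def kl_to_standard_normal(mu, log_sigma_squared):
    # KL( N(mu, sigma^2) || N(0, I) ) per example, matching the `regular_vae` term above
    return -0.5 * np.sum(1.0 + log_sigma_squared - mu ** 2 - np.exp(log_sigma_squared), axis=1)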
[adj.get_shape().as_list()[0], identity_dim]) 134 | else: 135 | self.embeds = None 136 | if features is None: 137 | if identity_dim == 0: 138 | raise Exception("identity feature dimension must be positive") 139 | self.features = self.embeds 140 | else: 141 | self.features = tf.Variable(tf.constant(features, dtype=tf.float32), trainable=False) 142 | if not self.embeds is None: 143 | self.features = tf.concat([self.embeds, self.features], axis=1) 144 | self.degrees = degrees 145 | self.concat = concat 146 | self.num_classes = num_classes 147 | self.dims = [ (0 if features is None else features.shape[1]) + identity_dim] 148 | self.dims.extend([layer_infos[i].output_dim for i in range(len(layer_infos))]) 149 | self.batch_size = placeholders["batch_size"] 150 | self.placeholders = placeholders 151 | self.layer_infos = layer_infos 152 | 153 | self.optimizer = tf.train.AdamOptimizer(learning_rate=FLAGS.learning_rate) 154 | 155 | self.build() 156 | 157 | 158 | def _build_encoder(self, x): 159 | dim_mult = 2 if self.concat else 1 160 | fc1 = layers.Dense(dim_mult*self.dims[-1], dim_mult* self.dims[-1], 161 | dropout=self.placeholders['dropout'], act=tf.nn.relu) 162 | fc2 = layers.Dense(dim_mult* self.dims[-1], dim_mult* self.dims[-1], 163 | dropout=self.placeholders['dropout']) 164 | fc3 = layers.Dense(dim_mult* self.dims[-1], dim_mult* self.dims[-1], 165 | dropout=self.placeholders['dropout']) 166 | h = fc1(x) 167 | mu = fc2(h) 168 | log_sigma_squared = fc3(h) 169 | sigma_squared = tf.exp(log_sigma_squared) 170 | sigma = tf.sqrt(sigma_squared) 171 | return mu, log_sigma_squared, sigma_squared, sigma, [fc1,fc2,fc3] 172 | 173 | def build(self): 174 | samples, support_size = self.sample(self.inputs, self.layer_infos) 175 | num_samples = [layer_info.num_samples for layer_info in self.layer_infos] 176 | self.hiddens, self.aggregators = self.aggregate(samples, [self.features], self.dims, num_samples, 177 | support_size, concat=self.concat, model_size=self.model_size) 178 | dim_mult = 2 if self.concat else 1 179 | 180 | self.hiddens = tf.nn.l2_normalize(self.hiddens, 1) 181 | 182 | # Two Channel VAE 183 | self.mu_rec, log_sigma_squared, sigma_squared, sigma_rec, self.vars_vae_rec = self._build_encoder(self.hiddens) 184 | self.z_rec = tf.random_normal([dim_mult*self.dims[-1]], mean=self.mu_rec, stddev=sigma_rec) 185 | self.normal_rec = -0.5 * tf.reduce_sum(1 + log_sigma_squared - tf.square(self.mu_rec) - sigma_squared, 1) 186 | 187 | self.node_pred_rec = layers.Dense(dim_mult*self.dims[-1], self.num_classes, 188 | dropout=self.placeholders['dropout'], 189 | act=None) 190 | 191 | self.outputs_rec = self.node_pred_rec(self.z_rec) 192 | 193 | self.mu_abn, log_sigma_squared, sigma_squared, sigma_abn, self.vars_vae_abn = self._build_encoder(self.hiddens) 194 | self.z_abn = tf.random_normal([dim_mult*self.dims[-1]], mean=self.mu_abn, stddev=sigma_abn) 195 | self.normal_abn = -0.5 * tf.reduce_sum(1 + log_sigma_squared - tf.square(self.mu_abn) - sigma_squared, 1) 196 | 197 | u = tf.random_uniform(shape=(self.batch_size, dim_mult*self.dims[-1]), dtype=tf.float32) 198 | self.hidden_trans = layers.Dense(dim_mult* self.dims[-1], dim_mult* self.dims[-1], 199 | dropout=self.placeholders['dropout'], act=tf.nn.sigmoid) 200 | self.s = self.hidden_trans(self.hiddens) 201 | 202 | self.s_abn = tf.sigmoid((tf.log(self.s + 1e-20) + tf.log(u + 1e-20) - tf.log(1-u + 1e-20)) / self.temperature) 203 | self.bernoulli_abn = tf.reduce_sum(tf.log(self.s + 1e-20) + tf.log(1 - self.s + 1e-20) - 2 * tf.log(0.5), 1) 204 | 205 | self.r_abn 
= tf.multiply(self.z_abn, self.s_abn) 206 | 207 | self.node_pred_abn = layers.Dense(dim_mult*self.dims[-1], self.num_classes, 208 | dropout=self.placeholders['dropout'], 209 | act=None) 210 | self.outputs_abn = self.node_pred_abn(self.r_abn) 211 | print(self.outputs_rec.get_shape()) 212 | 213 | self.outputs = tf.reduce_max(tf.concat([tf.expand_dims(tf.nn.sigmoid(self.outputs_rec),-1) , tf.expand_dims(tf.nn.sigmoid(self.outputs_abn),-1) ], axis=-1), axis=-1) 214 | print(self.outputs.get_shape()) 215 | self._loss() 216 | 217 | self.output_rec = tf.nn.sigmoid(self.outputs_rec) 218 | self.output_abn = tf.nn.sigmoid(self.outputs_abn) 219 | grads_and_vars = self.optimizer.compute_gradients(self.loss) 220 | clipped_grads_and_vars = [(tf.clip_by_value(grad, -5.0, 5.0) if grad is not None else None, var) 221 | for grad, var in grads_and_vars] 222 | self.grad, _ = clipped_grads_and_vars[0] 223 | self.opt_op = self.optimizer.apply_gradients(clipped_grads_and_vars) 224 | 225 | def _loss(self): 226 | regular_weight = 0.3 227 | for aggregator in self.aggregators: 228 | for var in aggregator.vars.values(): 229 | self.loss += FLAGS.weight_decay * tf.nn.l2_loss(var) 230 | for var in self.node_pred_rec.vars.values(): 231 | self.loss += FLAGS.weight_decay * tf.reduce_sum(tf.abs(var)) 232 | for var in self.node_pred_abn.vars.values(): 233 | self.loss += FLAGS.weight_decay * tf.reduce_sum(tf.abs(var)) 234 | for var in self.hidden_trans.vars.values(): 235 | self.loss += FLAGS.weight_decay * tf.nn.l2_loss(var) 236 | for variable in self.vars_vae_abn: 237 | for var in variable.vars.values(): 238 | self.loss += FLAGS.weight_decay * tf.nn.l2_loss(var) 239 | for variable in self.vars_vae_rec: 240 | for var in variable.vars.values(): 241 | self.loss += FLAGS.weight_decay * tf.nn.l2_loss(var) 242 | # multi-class loss 243 | #logits = tf.math.log(self.outputs) 244 | #one_hot_labels = tf.one_hot(self.placeholders['labels'], depth=self.num_classes, dtype=tf.float32, axis=-1) 245 | #multi_loss = tf.einsum('bij,bj->bi', one_hot_labels, logits) 246 | #multi_loss = - tf.einsum('bi,bi->b', multi_loss, self.label_masks) 247 | multi_loss = tf.nn.sigmoid_cross_entropy_with_logits(logits=self.outputs_rec, labels=self.placeholders['labels_rec']) 248 | multi_loss = tf.einsum('bi,bi->b', multi_loss, self.placeholders['weights_rec']) 249 | self.loss += tf.reduce_mean(multi_loss) 250 | multi_loss = tf.nn.sigmoid_cross_entropy_with_logits(logits=self.outputs_abn, labels=self.placeholders['labels_abn']) 251 | multi_loss = tf.einsum('bi,bi->b', multi_loss, self.placeholders['weights_abn']) 252 | self.loss += tf.reduce_mean(multi_loss) 253 | 254 | self.loss += regular_weight * tf.reduce_mean(self.bernoulli_abn) 255 | self.loss += regular_weight * tf.reduce_mean(self.normal_rec) 256 | self.loss += regular_weight * tf.reduce_mean(self.normal_abn) 257 | 258 | tf.summary.scalar('loss', self.loss) 259 | 260 | def predict(self): 261 | return tf.outputs 262 | 263 | class SeqGraphsage(models.SampleAndAggregate): 264 | 265 | def __init__(self, num_classes, 266 | placeholders, features, adj, degrees, 267 | layer_infos, concat=True, aggregator_type="mean", 268 | model_size="small", sigmoid_loss=False, identity_dim=0, 269 | num_steps=5, **kwargs): 270 | 271 | models.GeneralizedModel.__init__(self, **kwargs) 272 | 273 | if aggregator_type == "mean": 274 | self.aggregator_cls = MeanAggregator 275 | elif aggregator_type == "meanpool": 276 | self.aggregator_cls = MeanPoolingAggregator 277 | elif aggregator_type == "maxpool": 278 | self.aggregator_cls = 
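# The "abnormal" channel of TwoChannelGraphsage above gates its latent code with a
# relaxed Bernoulli sample: s_abn = sigmoid((log s + log u - log(1 - u)) / temperature)
# with u ~ Uniform(0, 1), and then uses r_abn = z_abn * s_abn. A NumPy sketch of that
# Concrete/Gumbel-style soft gate (the function name and default temperature are
# illustrative assumptions).
import numpy as np

def relaxed_bernoulli_gate(s, temperature=1.0, eps=1e-20):
    # s: gate probabilities in (0, 1), e.g. the output of a sigmoid Dense layer
    u = np.random.uniform(size=np.shape(s))
    logits = np.log(s + eps) + np.log(u + eps) - np.log(1.0 - u + eps)
    return 1.0 / (1.0 + np.exp(-logits / temperature))  # in (0, 1); near-binary for small temperature

# gated latent code, mirroring r_abn = z_abn * s_abn above:
# r = z * relaxed_bernoulli_gate(s)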
MaxPoolingAggregator 279 | else: 280 | raise Exception("Unknown aggregator:", self.aggregator_cls) 281 | 282 | self.inputs = placeholders["batch"] 283 | self.model_size = model_size 284 | self.adj_info = adj 285 | self.temperature = 1.0 286 | self.num_steps = num_steps 287 | if identity_dim > 0: 288 | self.embeds = [tf.get_variable("node_embeddings_{:d}".format(idx), [adj.get_shape().as_list()[1], identity_dim]) for idx in range(num_steps)] 289 | else: 290 | self.embeds = None 291 | if features is None: 292 | if identity_dim == 0: 293 | raise Exception("identity feature dimension must be positive") 294 | self.features = self.embeds 295 | else: 296 | self.features = [tf.Variable(tf.constant(features, dtype=tf.float32), trainable=False, name="feature_{:d}".format(idx)) for idx in range(num_steps)] 297 | if not self.embeds is None: 298 | self.features = [tf.concat([self.embeds[idx], self.features[idx]], axis=1) for idx in range(num_steps)] 299 | self.degrees = degrees 300 | self.concat = concat 301 | self.wei_pos = FLAGS.weight_value 302 | self.num_classes = num_classes 303 | self.dims = [ (0 if features is None else features.shape[1]) + identity_dim] 304 | self.dims.extend([layer_infos[i].output_dim for i in range(len(layer_infos))]) 305 | self.batch_size = placeholders["batch_size"] 306 | self.placeholders = placeholders 307 | self.layer_infos = layer_infos 308 | 309 | self.optimizer = tf.train.AdamOptimizer(learning_rate=FLAGS.learning_rate) 310 | 311 | self.build() 312 | 313 | 314 | def _build_encoder(self, inputs, hiddens, fc1=None, fc2=None, fc3=None): 315 | dim_mult = 2 if self.concat else 1 316 | if fc1 is None: 317 | fc1 = layers.Dense(2*dim_mult*self.dims[-1], dim_mult* self.dims[-1], 318 | dropout=self.placeholders['dropout'], act=tf.nn.relu) 319 | if fc2 is None: 320 | fc2 = layers.Dense(dim_mult* self.dims[-1], dim_mult* self.dims[-1], 321 | dropout=self.placeholders['dropout']) 322 | if fc3 is None: 323 | fc3 = layers.Dense(dim_mult* self.dims[-1], dim_mult* self.dims[-1], 324 | dropout=self.placeholders['dropout']) 325 | x = tf.concat([inputs, hiddens], axis=-1) 326 | h = fc1(x) 327 | mu = tf.nn.l2_normalize(fc2(h), 1) 328 | log_sigma_squared = tf.nn.l2_normalize(fc3(h), 1) 329 | sigma_squared = tf.exp(log_sigma_squared) 330 | sigma = tf.sqrt(sigma_squared) 331 | return mu, log_sigma_squared, sigma_squared, sigma, fc1,fc2, fc3 332 | 333 | 334 | def _build_prior(self, x, fc1=None, fc2=None, fc3=None): 335 | dim_mult = 2 if self.concat else 1 336 | hidden = fc1(x) 337 | mu = tf.nn.l2_normalize(fc2(hidden), 1) 338 | log_sigma_squared = tf.nn.l2_normalize(fc3(hidden), 1) 339 | sigma_squared = tf.exp(log_sigma_squared) 340 | sigma = tf.sqrt(sigma_squared) 341 | return mu, log_sigma_squared, sigma_squared, sigma 342 | 343 | 344 | def build(self): 345 | dim_mult = 2 if self.concat else 1 346 | self.outputs_rec, self.outputs_abn = [], [] 347 | #print(self.placeholders["labels_abn"].get_shape().as_list()) 348 | self.hidden_states = [] 349 | for timestamp in range(self.num_steps): 350 | print("timestamp:", timestamp) 351 | samples, support_size = self.sample(self.inputs, self.layer_infos, timestamp=timestamp) 352 | num_samples = [layer_info.num_samples for layer_info in self.layer_infos] 353 | if timestamp == 0: 354 | self.hiddens, self.aggregators = self.aggregate(samples, self.features[timestamp], self.dims, num_samples, 355 | support_size, concat=self.concat, model_size=self.model_size) 356 | else: 357 | self.hiddens, _ = self.aggregate(samples, self.features[timestamp], self.dims, 
num_samples, 358 | support_size, aggregators=self.aggregators, concat=self.concat, 359 | model_size=self.model_size) 360 | 361 | self.hiddens = tf.nn.l2_normalize(self.hiddens, 1) 362 | 363 | # Two Channel VAE 364 | 365 | if timestamp == 0: 366 | self.hidden_states.append(tf.zeros_like(self.hiddens, dtype=tf.float32)) 367 | self.prior_noli_rec = layers.Dense(dim_mult * self.dims[-1], dim_mult * self.dims[-1], 368 | dropout=self.placeholders['dropout'], act=tf.nn.relu) 369 | self.prior_mu_rec = layers.Dense(dim_mult * self.dims[-1], dim_mult * self.dims[-1], 370 | dropout=self.placeholders['dropout'], act=None) 371 | self.prior_sigma_rec= layers.Dense(dim_mult * self.dims[-1], dim_mult * self.dims[-1], 372 | dropout=self.placeholders['dropout'], act=None) 373 | self.prior_noli_abn = layers.Dense(dim_mult * self.dims[-1], dim_mult * self.dims[-1], 374 | dropout=self.placeholders['dropout'], act=tf.nn.relu) 375 | self.prior_mu_abn = layers.Dense(dim_mult * self.dims[-1], dim_mult * self.dims[-1], 376 | dropout=self.placeholders['dropout'], act=None) 377 | self.prior_sigma_abn= layers.Dense(dim_mult * self.dims[-1], dim_mult * self.dims[-1], 378 | dropout=self.placeholders['dropout'], act=None) 379 | 380 | if timestamp == 0: 381 | self.mu_rec, log_sigma_squared_rec, sigma_squared_rec, sigma_rec, self.fc1_vae_rec, self.fc2_vae_rec, self.fc3_vae_rec = self._build_encoder(self.hiddens, self.hidden_states[-1]) 382 | else: 383 | self.mu_rec, log_sigma_squared_rec, sigma_squared_rec, sigma_rec, fc1, fc2, fc3 = self._build_encoder(self.hiddens, self.hidden_states[-1], 384 | fc1=self.fc1_vae_rec, fc2=self.fc2_vae_rec, fc3=self.fc3_vae_rec) 385 | self.z_rec = tf.random_normal([dim_mult* self.dims[-1]], mean=self.mu_rec, stddev=sigma_rec) 386 | self.sigma_rec = sigma_rec 387 | 388 | # KL divergence with prior distribution and approximate postier distribution 389 | if timestamp == 0: 390 | self.normal_rec = -0.5 * tf.reduce_mean(1 + log_sigma_squared_rec - tf.square(self.mu_rec) - sigma_squared_rec, 1) 391 | else: 392 | mu_pri, log_sigma_squared_pri, sigma_squared_pri, sigma_pri = self._build_prior(self.hidden_states[-1], fc1=self.prior_noli_rec, 393 | fc2=self.prior_mu_rec, fc3=self.prior_sigma_rec) 394 | sigma_squared_pri = sigma_squared_pri + 1e-10 395 | self.normal_rec += -0.5 * tf.reduce_mean(1 + log_sigma_squared_rec - log_sigma_squared_pri + tf.divide(sigma_squared_rec, sigma_squared_pri) - tf.divide(tf.square(self.mu_rec - mu_pri), sigma_squared_pri), 1) 396 | 397 | 398 | if timestamp == 0: 399 | self.node_pred_rec = layers.Dense(dim_mult*self.dims[-1], self.num_classes, 400 | dropout=self.placeholders['dropout'], 401 | act=None) 402 | 403 | outputs_rec = self.node_pred_rec(self.z_rec) 404 | 405 | if timestamp == 0: 406 | self.mu_abn, log_sigma_squared_abn, sigma_squared_abn, sigma_abn, self.fc1_vae_abn, self.fc2_vae_abn, self.fc3_vae_abn = self._build_encoder(self.hiddens, self.hidden_states[-1]) 407 | else: 408 | self.mu_abn, log_sigma_squared_abn, sigma_squared_abn, sigma_abn, fc1, fc2, fc3 = self._build_encoder(self.hiddens, self.hidden_states[-1], 409 | fc1=self.fc1_vae_rec, fc2=self.fc2_vae_rec, fc3=self.fc3_vae_rec) 410 | self.z_abn = tf.random_normal([dim_mult*self.dims[-1]], mean=self.mu_abn, stddev=sigma_abn) 411 | 412 | if timestamp == 0: 413 | self.normal_abn = -0.5 * tf.reduce_mean(1 + log_sigma_squared_abn - tf.square(self.mu_abn) - sigma_squared_abn, 1) 414 | else: 415 | mu_pri, log_sigma_squared_pri, sigma_squared_pri, sigma_pri = self._build_prior(self.hidden_states[-1], 
fc1=self.prior_noli_abn, 416 | fc2=self.prior_mu_abn, fc3=self.prior_sigma_abn) 417 | sigma_pri = tf.math.reciprocal(sigma_pri+1e-10) 418 | sigma_trace = tf.multiply(sigma_pri, sigma_abn) 419 | mu_sigma = tf.multiply(mu_pri - self.mu_abn, sigma_pri) 420 | mu_sigma = tf.multiply(sigma_pri, mu_pri - self.mu_abn) 421 | self.normal_abn += -0.5 * tf.reduce_mean(1 + log_sigma_squared_abn - log_sigma_squared_pri + tf.divide(sigma_squared_abn, sigma_squared_pri) - tf.divide(tf.square(self.mu_abn - mu_pri), sigma_squared_pri), 1) 422 | 423 | self.sigma_abn = sigma_abn 424 | u = tf.random_uniform(shape=(self.batch_size, dim_mult*self.dims[-1]), dtype=tf.float32) 425 | if timestamp == 0: 426 | self.bernoulli_trans = layers.Dense(2*dim_mult* self.dims[-1], dim_mult* self.dims[-1], 427 | dropout=self.placeholders['dropout'], act=tf.nn.sigmoid) 428 | self.bernoulli_prior = layers.Dense(dim_mult* self.dims[-1], dim_mult* self.dims[-1], 429 | dropout=self.placeholders['dropout'], act=tf.nn.sigmoid) 430 | self.s = self.bernoulli_trans(tf.concat([self.hiddens, self.hidden_states[-1]], axis=-1)) 431 | self.sp = self.bernoulli_prior(self.hidden_states[-1]) 432 | self.s_abn = tf.sigmoid((tf.log(self.s + 1e-20) + tf.log(u + 1e-20) - tf.log(1-u + 1e-20)) / self.temperature) 433 | 434 | if timestamp == 0: 435 | self.bernoulli_abn = - 0.5 * tf.reduce_mean(tf.log(self.s + 1e-20) + tf.log(1 - self.s + 1e-20) - 2 * tf.log(0.5), 1) 436 | else: 437 | self.bernoulli_abn += - tf.reduce_mean(self.s*(tf.log(self.s + 1e-20) - tf.log(self.sp + 1e-20)) +(1-self.s) *(tf.log(1 - self.s + 1e-20) - tf.log(1 - self.sp + 1e-20)) , 1) 438 | self.r_abn = tf.multiply(self.z_abn, self.s_abn) 439 | if timestamp == 0: 440 | self.node_pred_abn = layers.Dense(dim_mult*self.dims[-1], self.num_classes, 441 | dropout=self.placeholders['dropout'], 442 | act=None) 443 | 444 | outputs_abn = self.node_pred_abn(self.r_abn) 445 | self.outputs_rec.append(outputs_rec) 446 | self.outputs_abn.append(outputs_abn) 447 | 448 | if timestamp == 0: 449 | self.hidden_trans = layers.Dense(6*dim_mult*self.dims[-1], dim_mult*self.dims[-1], 450 | dropout=self.placeholders['dropout'], 451 | act=tf.nn.relu) 452 | 453 | next_hidden_state = self.hidden_trans(tf.concat([self.hidden_states[-1], self.mu_rec, self.sigma_rec, self.mu_abn, self.sigma_abn, self.s], axis=-1)) 454 | self.hidden_states.append(tf.nn.l2_normalize(next_hidden_state,1)) 455 | self._loss() 456 | 457 | self.output_rec = tf.nn.sigmoid(self.outputs_rec[-1]) 458 | self.output_abn = tf.nn.sigmoid(self.outputs_abn[-1]) 459 | self.outputs = tf.clip_by_value(self.output_abn, 1e-8, 1.0-1e-8) 460 | grads_and_vars = self.optimizer.compute_gradients(self.loss) 461 | clipped_grads_and_vars = [(tf.clip_by_value(grad, -1.0, 1.0) if grad is not None else None, var) 462 | for grad, var in grads_and_vars] 463 | self.grad, _ = clipped_grads_and_vars[0] 464 | self.opt_op = self.optimizer.apply_gradients(clipped_grads_and_vars) 465 | 466 | def _loss(self): 467 | regular_weight = 1.0 468 | for aggregator in self.aggregators: 469 | for var in aggregator.vars.values(): 470 | self.loss += FLAGS.weight_decay * tf.nn.l2_loss(var) 471 | for var in self.node_pred_rec.vars.values(): 472 | self.loss += FLAGS.weight_decay * tf.nn.l2_loss(var) 473 | for var in self.node_pred_abn.vars.values(): 474 | self.loss += FLAGS.weight_decay * tf.nn.l2_loss(var) 475 | for var in self.hidden_trans.vars.values(): 476 | self.loss += FLAGS.weight_decay * tf.nn.l2_loss(var) 477 | 478 | for variable in [self.fc1_vae_rec, self.fc2_vae_rec, 
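# For reference, the textbook closed forms of the two divergences that the per-step
# regularizers above are built around: a KL between diagonal Gaussians (posterior vs.
# learned prior) and a KL between element-wise Bernoullis (gate vs. gate prior).
# NumPy sketch with illustrative argument names; variances are assumed positive.
import numpy as np

def kl_diag_gaussians(mu_q, var_q, mu_p, var_p):
    # KL( N(mu_q, var_q) || N(mu_p, var_p) ), summed over latent dimensions
    return 0.5 * np.sum(np.log(var_p) - np.log(var_q)
                        + (var_q + (mu_q - mu_p) ** 2) / var_p - 1.0, axis=-1)

def kl_bernoullis(s, sp, eps=1e-20):
    # KL( Bernoulli(s) || Bernoulli(sp) ), summed over gate dimensions
    return np.sum(s * (np.log(s + eps) - np.log(sp + eps))
                  + (1.0 - s) * (np.log(1.0 - s + eps) - np.log(1.0 - sp + eps)), axis=-1)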
self.fc3_vae_rec]: 479 | for var in variable.vars.values(): 480 | self.loss += FLAGS.weight_decay * tf.nn.l2_loss(var) 481 | 482 | for variable in [self.fc1_vae_abn, self.fc2_vae_abn, self.fc3_vae_abn]: 483 | for var in variable.vars.values(): 484 | self.loss += FLAGS.weight_decay * tf.nn.l2_loss(var) 485 | 486 | for variable in [self.prior_noli_rec, self.prior_mu_rec, self.prior_sigma_rec]: 487 | for var in variable.vars.values(): 488 | self.loss += FLAGS.weight_decay * tf.nn.l2_loss(var) 489 | 490 | for variable in [self.prior_noli_abn, self.prior_mu_abn, self.prior_sigma_abn, self.bernoulli_trans, self.bernoulli_prior]: 491 | for var in variable.vars.values(): 492 | self.loss += FLAGS.weight_decay * tf.nn.l2_loss(var) 493 | 494 | labels_rec = tf.unstack(self.placeholders['labels_rec'], axis=1) 495 | weights_rec = tf.unstack(self.placeholders['weights_rec'], axis=1) 496 | 497 | 498 | for idx in range(self.num_steps): 499 | self.loss += tf.reduce_mean(self._balanced_loss(self.outputs_rec[idx], labels_rec[idx], axis=1, wpos=self.wei_pos)) 500 | self.loss += tf.reduce_mean(self._balanced_loss(self.outputs_rec[idx], labels_rec[idx], axis=0, wpos=self.wei_pos)) 501 | 502 | labels_abn = tf.unstack(self.placeholders['labels_abn'], axis=1) 503 | weights_abn = tf.unstack(self.placeholders['weights_abn'], axis=1) 504 | for idx in range(self.num_steps): 505 | self.loss += tf.reduce_mean(self._balanced_loss(self.outputs_abn[idx], labels_abn[idx], axis=1, wpos=self.wei_pos*2)) #223 506 | self.loss += tf.reduce_mean(self._balanced_loss(self.outputs_abn[idx], labels_abn[idx], axis=0, wpos=self.wei_pos*2)) #118 507 | 508 | self.loss += tf.reduce_mean(self.bernoulli_abn) 509 | self.loss += tf.reduce_mean(self.normal_rec) 510 | self.loss += tf.reduce_mean(self.normal_abn) 511 | 512 | tf.summary.scalar('loss', self.loss) 513 | 514 | def predict(self): 515 | return tf.outputs 516 | 517 | def _balanced_loss(self, logits, labels, axis=None, wpos=1.0): 518 | if axis is None: 519 | axis=-1 520 | pos_weight = tf.cast(tf.equal(labels, 1), tf.float32) 521 | neg_weight = 1 - pos_weight 522 | n_pos = tf.reduce_sum(pos_weight) 523 | n_neg = tf.reduce_sum(neg_weight) 524 | n_pos_divid = tf.reduce_sum(pos_weight, axis=axis) 525 | n_neg_divid = tf.reduce_sum(neg_weight, axis=axis) 526 | 527 | ce_loss = tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=labels) 528 | 529 | def has_pos(): 530 | return tf.reduce_sum(ce_loss * pos_weight, axis=axis) / (n_pos_divid + 1e-5) 531 | def has_neg(): 532 | return tf.reduce_sum(ce_loss * neg_weight, axis=axis) / (n_neg_divid + 1e-5) 533 | def is_zero(): 534 | return tf.constant(0.0) 535 | pos_loss = tf.cond(n_pos > 0, has_pos, is_zero) 536 | neg_loss = tf.cond(n_neg > 0, has_neg, is_zero) 537 | return (pos_loss * wpos + neg_loss) / (wpos + 1.0) 538 | 539 | class SeqTestGraphsage(models.SampleAndAggregate): 540 | 541 | def __init__(self, num_classes, 542 | placeholders, features, adj, degrees, 543 | layer_infos, concat=True, aggregator_type="mean", 544 | model_size="small", sigmoid_loss=False, identity_dim=0, 545 | num_steps=5, **kwargs): 546 | 547 | models.GeneralizedModel.__init__(self, **kwargs) 548 | 549 | if aggregator_type == "mean": 550 | self.aggregator_cls = MeanAggregator 551 | elif aggregator_type == "meanpool": 552 | self.aggregator_cls = MeanPoolingAggregator 553 | elif aggregator_type == "maxpool": 554 | self.aggregator_cls = MaxPoolingAggregator 555 | else: 556 | raise Exception("Unknown aggregator:", self.aggregator_cls) 557 | 558 | self.inputs = 
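# _balanced_loss above averages the sigmoid cross-entropy separately over positive and
# negative label positions and recombines them as (wpos * pos_loss + neg_loss) / (wpos + 1),
# so the sparse positive labels are not swamped by the many negatives. A NumPy sketch of
# the same weighting (the helper name and the epsilon guard standing in for the tf.cond
# checks are illustrative assumptions).
import numpy as np

def balanced_sigmoid_ce(logits, labels, axis, wpos=1.0, eps=1e-5):
    # numerically stable sigmoid cross-entropy, same formula TF uses
    ce = np.maximum(logits, 0) - logits * labels + np.log1p(np.exp(-np.abs(logits)))
    pos = (labels == 1).astype(np.float64)
    neg = 1.0 - pos
    pos_loss = np.sum(ce * pos, axis=axis) / (np.sum(pos, axis=axis) + eps)
    neg_loss = np.sum(ce * neg, axis=axis) / (np.sum(neg, axis=axis) + eps)
    return (pos_loss * wpos + neg_loss) / (wpos + 1.0)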
placeholders["batch"] 559 | self.model_size = model_size 560 | self.adj_info = adj 561 | self.temperature = 1.0 562 | self.num_steps = num_steps 563 | if identity_dim > 0: 564 | self.embeds = [tf.get_variable("node_embeddings_{:d}".format(idx), [adj.get_shape().as_list()[1], identity_dim]) for idx in range(num_steps)] 565 | else: 566 | self.embeds = None 567 | if features is None: 568 | if identity_dim == 0: 569 | raise Exception("identity feature dimension must be positive") 570 | self.features = self.embeds 571 | else: 572 | self.features = [tf.Variable(tf.constant(features, dtype=tf.float32), trainable=False, name="feature_{:d}".format(idx)) for idx in range(num_steps)] 573 | if not self.embeds is None: 574 | self.features = [tf.concat([self.embeds[idx], self.features[idx]], axis=1) for idx in range(num_steps)] 575 | self.degrees = degrees 576 | self.concat = concat 577 | self.num_classes = num_classes 578 | self.dims = [ (0 if features is None else features.shape[1]) + identity_dim] 579 | self.dims.extend([layer_infos[i].output_dim for i in range(len(layer_infos))]) 580 | self.batch_size = placeholders["batch_size"] 581 | self.placeholders = placeholders 582 | self.layer_infos = layer_infos 583 | 584 | self.optimizer = tf.train.AdamOptimizer(learning_rate=FLAGS.learning_rate) 585 | 586 | self.build() 587 | 588 | 589 | def _build_encoder(self, inputs, hiddens, fc1=None, fc2=None, fc3=None): 590 | dim_mult = 2 if self.concat else 1 591 | if fc1 is None: 592 | fc1 = layers.Dense(2*dim_mult*self.dims[-1], dim_mult* self.dims[-1], 593 | dropout=self.placeholders['dropout'], act=tf.nn.relu) 594 | if fc2 is None: 595 | fc2 = layers.Dense(dim_mult* self.dims[-1], dim_mult* self.dims[-1], 596 | dropout=self.placeholders['dropout']) 597 | if fc3 is None: 598 | fc3 = layers.Dense(dim_mult* self.dims[-1], dim_mult* self.dims[-1], 599 | dropout=self.placeholders['dropout']) 600 | x = tf.concat([inputs, hiddens], axis=-1) 601 | h = fc1(x) 602 | mu = fc2(h) 603 | log_sigma_squared = fc3(h) 604 | sigma_squared = tf.exp(log_sigma_squared) 605 | sigma = tf.sqrt(sigma_squared) 606 | return mu, log_sigma_squared, sigma_squared, sigma, fc1,fc2, fc3 607 | 608 | 609 | def _build_prior(self, x, fc1=None, fc2=None, fc3=None): 610 | dim_mult = 2 if self.concat else 1 611 | hidden = fc1(x) 612 | mu = fc2(hidden) 613 | log_sigma_squared = fc3(hidden) 614 | sigma_squared = tf.exp(log_sigma_squared) 615 | sigma = tf.sqrt(sigma_squared) 616 | return mu, log_sigma_squared, sigma_squared, sigma 617 | 618 | 619 | def build(self): 620 | dim_mult = 2 if self.concat else 1 621 | self.outputs_rec, self.outputs_abn = [], [] 622 | #print(self.placeholders["labels_abn"].get_shape().as_list()) 623 | self.hidden_states = [] 624 | for timestamp in range(self.num_steps): 625 | print("timestamp:", timestamp) 626 | samples, support_size = self.sample(self.inputs, self.layer_infos, timestamp=timestamp) 627 | num_samples = [layer_info.num_samples for layer_info in self.layer_infos] 628 | if timestamp == 0: 629 | self.hiddens, self.aggregators = self.aggregate(samples, self.features[timestamp], self.dims, num_samples, 630 | support_size, concat=self.concat, model_size=self.model_size) 631 | else: 632 | self.hiddens, _ = self.aggregate(samples, self.features[timestamp], self.dims, num_samples, 633 | support_size, aggregators=self.aggregators, concat=self.concat, 634 | model_size=self.model_size) 635 | 636 | self.hiddens = tf.nn.l2_normalize(self.hiddens, 1) 637 | 638 | # Two Channel VAE 639 | 640 | if timestamp == 0: 641 | 
self.hidden_states.append(tf.ones_like(self.hiddens, dtype=tf.float32)) 642 | self.prior_noli_rec = layers.Dense(dim_mult * self.dims[-1], dim_mult * self.dims[-1], 643 | dropout=self.placeholders['dropout'], act=tf.nn.relu) 644 | self.prior_mu_rec = layers.Dense(dim_mult * self.dims[-1], dim_mult * self.dims[-1], 645 | dropout=self.placeholders['dropout'], act=None) 646 | self.prior_sigma_rec= layers.Dense(dim_mult * self.dims[-1], dim_mult * self.dims[-1], 647 | dropout=self.placeholders['dropout'], act=None) 648 | 649 | if timestamp == 0: 650 | self.mu_rec, log_sigma_squared_rec, sigma_squared_rec, sigma_rec, self.fc1_vae_rec, self.fc2_vae_rec, self.fc3_vae_rec = self._build_encoder(self.hiddens, self.hidden_states[-1]) 651 | else: 652 | self.mu_rec, log_sigma_squared_rec, sigma_squared_rec, sigma_rec, fc1, fc2, fc3 = self._build_encoder(self.hiddens, self.hidden_states[-1], 653 | fc1=self.fc1_vae_rec, fc2=self.fc2_vae_rec, fc3=self.fc3_vae_rec) 654 | self.z = tf.random_normal([dim_mult* self.dims[-1]], mean=self.mu_rec, stddev=sigma_rec) 655 | self.sigma_rec = sigma_rec 656 | 657 | # KL divergence with prior distribution and approximate postier distribution 658 | if timestamp == 0: 659 | self.normal_rec = -0.5 * tf.reduce_sum(1 + log_sigma_squared_rec - tf.square(self.mu_rec) - sigma_squared_rec, 1) 660 | else: 661 | mu_pri, log_sigma_squared_pri, sigma_squared_pri, sigma_pri = self._build_prior(self.hidden_states[-1], fc1=self.prior_noli_rec, 662 | fc2=self.prior_mu_rec, fc3=self.prior_sigma_rec) 663 | sigma_pri = tf.math.reciprocal(sigma_pri+1e-10) 664 | sigma_trace = tf.multiply(sigma_pri, sigma_rec) 665 | mu_sigma = tf.multiply(mu_pri - self.mu_rec, sigma_pri) 666 | mu_sigma = tf.multiply(sigma_pri, mu_pri - self.mu_rec) 667 | self.normal_rec += -0.5 * tf.reduce_sum(1 + log_sigma_squared_rec - log_sigma_squared_pri - sigma_trace - mu_sigma, 1) 668 | 669 | 670 | if timestamp == 0: 671 | self.node_pred_rec = layers.Dense(dim_mult*self.dims[-1], self.num_classes, 672 | dropout=self.placeholders['dropout'], 673 | act=None) 674 | 675 | outputs_rec = self.node_pred_rec(self.z) 676 | 677 | 678 | u = tf.random_uniform(shape=(self.batch_size, FLAGS.num_classes), dtype=tf.float32) 679 | if timestamp == 0: 680 | self.bernoulli_trans = layers.Dense(2*dim_mult* self.dims[-1], FLAGS.num_classes, 681 | dropout=self.placeholders['dropout'], act=tf.nn.sigmoid) 682 | self.node_pred_abn = layers.Dense(2 * dim_mult * self.dims[-1], FLAGS.num_classes, 683 | dropout=self.placeholders['dropout'], act=None) 684 | 685 | self.s = self.bernoulli_trans(tf.concat([self.z, self.hidden_states[-1]], axis=-1)) 686 | self.s_abn = tf.sigmoid((tf.log(self.s + 1e-20) + tf.log(u + 1e-20) - tf.log(1-u + 1e-20)) / self.temperature) 687 | self.z_abn = self.node_pred_abn(tf.concat([self.z, self.hidden_states[-1]], axis=-1)) 688 | self.r_abn = tf.multiply(self.z_abn, self.s_abn) 689 | 690 | outputs_abn = self.r_abn 691 | print(outputs_rec.get_shape()) 692 | self.outputs_rec.append(outputs_rec) 693 | self.outputs_abn.append(outputs_abn) 694 | #self.outputs = tf.reduce_max(tf.concat([tf.expand_dims(tf.nn.sigmoid(self.outputs_rec),-1) , tf.expand_dims(tf.nn.sigmoid(self.outputs_abn),-1) ], axis=-1), axis=-1) 695 | 696 | # output next hidden states 697 | if timestamp == 0: 698 | self.hidden_trans = layers.Dense(3*dim_mult*self.dims[-1] + FLAGS.num_classes, dim_mult*self.dims[-1], 699 | dropout=self.placeholders['dropout'], 700 | act=tf.nn.relu) 701 | 702 | next_hidden_state = self.hidden_trans(tf.concat([self.hidden_states[-1], 
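# The hidden_trans call around this point is the recurrent step of the model: the
# previous hidden state is concatenated with the current posterior statistics, pushed
# through a ReLU Dense layer, and L2-normalized to become the next hidden state. A
# NumPy sketch of one such update (W, b, and the stats list are illustrative
# assumptions, not the repo's actual parameters).
import numpy as np

def next_hidden_state(prev_h, stats, W, b):
    x = np.concatenate([prev_h] + list(stats), axis=-1)    # [h_{t-1}; mu; sigma; s; ...]
    h = np.maximum(x @ W + b, 0.0)                          # ReLU Dense layer
    return h / (np.linalg.norm(h, axis=-1, keepdims=True) + 1e-12)  # L2 normalization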
self.mu_rec, self.sigma_rec, self.s], axis=-1)) 703 | self.hidden_states.append(tf.nn.l2_normalize(next_hidden_state,1)) 704 | self._loss() 705 | 706 | self.output_rec = tf.nn.sigmoid(self.outputs_rec[-1]) 707 | self.output_abn = tf.nn.sigmoid(self.outputs_abn[-1]) 708 | self.outputs = tf.clip_by_value(self.output_abn, 1e-8, 1.0-1e-8) 709 | grads_and_vars = self.optimizer.compute_gradients(self.loss) 710 | clipped_grads_and_vars = [(tf.clip_by_value(grad, -5.0, 5.0) if grad is not None else None, var) 711 | for grad, var in grads_and_vars] 712 | self.grad, _ = clipped_grads_and_vars[0] 713 | self.opt_op = self.optimizer.apply_gradients(clipped_grads_and_vars) 714 | 715 | def _loss(self): 716 | regular_weight = 0.1 717 | for aggregator in self.aggregators: 718 | for var in aggregator.vars.values(): 719 | self.loss += FLAGS.weight_decay * tf.nn.l2_loss(var) 720 | for var in self.node_pred_rec.vars.values(): 721 | self.loss += FLAGS.weight_decay * tf.nn.l2_loss(var) 722 | for var in self.node_pred_abn.vars.values(): 723 | self.loss += FLAGS.weight_decay * tf.nn.l2_loss(var) 724 | for var in self.hidden_trans.vars.values(): 725 | self.loss += FLAGS.weight_decay * tf.nn.l2_loss(var) 726 | 727 | for variable in [self.fc1_vae_rec, self.fc2_vae_rec, self.fc3_vae_rec]: 728 | for var in variable.vars.values(): 729 | self.loss += FLAGS.weight_decay * tf.nn.l2_loss(var) 730 | 731 | 732 | labels_rec = tf.unstack(self.placeholders['labels_rec'], axis=1) 733 | weights_rec = tf.unstack(self.placeholders['weights_rec'], axis=1) 734 | for idx in range(self.num_steps): 735 | multi_loss = tf.nn.sigmoid_cross_entropy_with_logits(logits=self.outputs_rec[idx], labels=labels_rec[idx]) 736 | multi_loss = tf.einsum('bi,bi->b', multi_loss, weights_rec[idx]) 737 | self.loss += tf.reduce_mean(multi_loss) 738 | labels_abn = tf.unstack(self.placeholders['labels_abn'], axis=1) 739 | weights_abn = tf.unstack(self.placeholders['weights_abn'], axis=1) 740 | for idx in range(self.num_steps): 741 | multi_loss = tf.nn.sigmoid_cross_entropy_with_logits(logits=self.outputs_abn[idx], labels=labels_abn[idx]) 742 | multi_loss = tf.einsum('bi,bi->b', multi_loss, weights_abn[idx]) 743 | self.loss += tf.reduce_mean(multi_loss) 744 | 745 | #self.loss += regular_weight * tf.reduce_mean(self.bernoulli_abn) 746 | self.loss += regular_weight * tf.reduce_mean(self.normal_rec) 747 | #self.loss += regular_weight * tf.reduce_mean(self.normal_abn) 748 | 749 | tf.summary.scalar('loss', self.loss) 750 | 751 | def predict(self): 752 | return tf.outputs 753 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | 4 | import os 5 | import time 6 | import tensorflow as tf 7 | import numpy as np 8 | import sklearn 9 | from sklearn import metrics 10 | 11 | from supervised_model import SupervisedGraphsage, TwoChannelGraphsage, SeqGraphsage, SeqTestGraphsage 12 | from minibatch import NodeMinibatchIterator, SeqNodeMinibatchIterator 13 | from neigh_samplers import UniformNeighborSampler, SeqUniformNeighborSampler 14 | from graphsage import SAGEInfo 15 | from utils import load_data, load_seq_data 16 | 17 | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" 18 | 19 | seed = 123 20 | np.random.seed(seed) 21 | tf.set_random_seed(seed) 22 | 23 | flags = tf.app.flags 24 | FLAGS = flags.FLAGS 25 | 26 | tf.app.flags.DEFINE_boolean('log_device_placement', False, 'log') 27 
| 28 | 29 | flags.DEFINE_string('model', 'mean', 'model version') 30 | flags.DEFINE_float('learning_rate', 0.001, 'learning rate') 31 | flags.DEFINE_string('model_size', 'small', 'small or big') 32 | flags.DEFINE_string('train_prefix', '', 'train path') 33 | 34 | flags.DEFINE_boolean('sequential', True, 'sequential dataset or not') 35 | flags.DEFINE_integer('epochs', 10, 'epochs') 36 | flags.DEFINE_float('dropout', 0.5, 'dropout') 37 | flags.DEFINE_float('weight_decay', 0.02, 'weight decay for l2-norm') 38 | flags.DEFINE_float('weight_value', 1.2, 'weight value for positive classes') 39 | flags.DEFINE_integer('max_degree', 32, 'max degree') 40 | flags.DEFINE_integer('samples_1', 25, 'number of samples in layer 1') 41 | flags.DEFINE_integer('samples_2', 10, 'number of samples in layer 2') 42 | flags.DEFINE_integer('samples_3', 0, 'number of samples in layer 3') 43 | flags.DEFINE_integer('dim_1', 128, 'Size of output dim') 44 | flags.DEFINE_integer('dim_2', 128, 'size of output dim') 45 | flags.DEFINE_integer('batch_size', 128, 'batch size') 46 | flags.DEFINE_integer('identity_dim', 100, 'identity dimension') 47 | flags.DEFINE_boolean('split_class', True, 'split category into old and new') 48 | flags.DEFINE_integer('num_classes', 123, 'item numbers') 49 | flags.DEFINE_integer('max_input_classes', 5, 'output length') 50 | flags.DEFINE_boolean('two_channel', True, 'normal and abnormal') 51 | flags.DEFINE_integer('num_steps', 5, 'step numbers') 52 | flags.DEFINE_string('base_log_dir', './log/', 'log save path') 53 | flags.DEFINE_integer('validate_iter', 20, 'how often to run validation') 54 | flags.DEFINE_integer('validate_batch_size', 128, 'batch size in valiation') 55 | flags.DEFINE_integer('print_every', 20, 'how often to print') 56 | flags.DEFINE_integer('gpu', 1, 'number of gpu') 57 | flags.DEFINE_boolean('random_context', True, 'use random context') 58 | flags.DEFINE_integer('max_total_steps', 10**5, 'max steps') 59 | os.environ["CUDA_VISIBLE_DEVICES"]=str(FLAGS.gpu) 60 | 61 | CPU_MEM_FRACTION = 0.8 62 | print(FLAGS.batch_size) 63 | print(FLAGS.two_channel) 64 | 65 | def construct_placeholders(): 66 | if not FLAGS.sequential: 67 | print("using single channel placeholders") 68 | placeholders = { 69 | 'labels' : tf.placeholder(tf.float32, shape=(None, FLAGS.num_steps, FLAGS.num_classes), name='labels'), 70 | 'batch' : tf.placeholder(tf.int32, shape=(None), name='batch'), 71 | 'dropout': tf.placeholder_with_default(0., shape=(), name='dropout'), 72 | 'weights': tf.placeholder(tf.float32, shape=(None, FLAGS.num_steps, FLAGS.num_classes), name='weights'), 73 | 'batch_size': tf.placeholder(tf.int32, name='batch_size'), 74 | } 75 | else: 76 | print("using two channel placeholders") 77 | placeholders = { 78 | 'labels_rec': tf.placeholder(tf.float32, shape=(None, FLAGS.num_steps, FLAGS.num_classes), name='labels_rec'), 79 | 'labels_abn': tf.placeholder(tf.float32, shape=(None, FLAGS.num_steps, FLAGS.num_classes), name='labels_abn'), 80 | 'weights_rec': tf.placeholder(tf.float32, shape=(None, FLAGS.num_steps, FLAGS.num_classes), name='weights_rec'), 81 | 'weights_abn': tf.placeholder(tf.float32, shape=(None, FLAGS.num_steps, FLAGS.num_classes), name='weights_abn'), 82 | 'batch': tf.placeholder(tf.int32, shape=(None), name='batch'), 83 | 'dropout': tf.placeholder_with_default(0., shape=(), name='dropout'), 84 | 'batch_size': tf.placeholder(tf.int32, name='batch_size'), 85 | } 86 | return placeholders 87 | 88 | 89 | def calc_f1(y_true, y_pred): 90 | y_pred[y_pred > 0.5] = 1 91 | y_pred[y_pred <= 0.5] 
= 0 92 | return metrics.f1_score(y_true.round(), y_pred, average="micro"), metrics.f1_score(y_true.round(), y_pred, average="macro") 93 | 94 | def log_dir(): 95 | log_dir = FLAGS.base_log_dir + "/sup-" + FLAGS.train_prefix.split("/")[-2] 96 | log_dir += "/{model:s}_{model_size:s}_{lr:0.4f}/".format( 97 | model=FLAGS.model, 98 | model_size=FLAGS.model_size, 99 | lr=FLAGS.learning_rate) 100 | if not os.path.exists(log_dir): 101 | os.makedirs(log_dir) 102 | return log_dir 103 | 104 | def evaluate(sess, model, minibatch_iter, size=None): 105 | t_test = time.time() 106 | feed_dict_val, labels = minibatch_iter.node_val_feed_dict(size) 107 | node_outs_val = sess.run([model.outputs, model.loss], feed_dict=feed_dict_val) 108 | mic, mac = calc_f1(labels, node_outs_val[0]) 109 | return node_outs_val[1], mic, mac, (time.time()- t_test) 110 | 111 | 112 | def incremental_evaluate(sess, model, minibatch_iter, class_map, test_labels, size, test=False): 113 | t_test = time.time() 114 | finished = False 115 | val_losses = [] 116 | val_preds = [] 117 | labels = [] 118 | iter_num = 0 119 | while not finished: 120 | feed_dict_val, batch_labels, finished, batch_nodes = minibatch_iter.incremental_node_val_feed_dict(size, iter_num, test=test) 121 | node_outs_val = sess.run([model.outputs, model.loss], feed_dict=feed_dict_val) 122 | preds = node_outs_val[0] 123 | val_losses.append(node_outs_val[1]) 124 | for idx, node in enumerate(batch_nodes): 125 | for cate in test_labels[-1][node]["pos"]: 126 | if cate in class_map: 127 | labels.append(1) 128 | val_preds.append(int(preds[idx,class_map[cate]] > 0.5)) 129 | for cate in test_labels[-1][node]["neg"]: 130 | if cate in class_map: 131 | labels.append(0) 132 | val_preds.append(int(preds[idx, class_map[cate]] > 0.5)) 133 | iter_num += 1 134 | val_preds = np.vstack(val_preds) 135 | labels = np.vstack(labels) 136 | f1_scores = calc_f1(labels, val_preds) 137 | return np.mean(val_losses), f1_scores[0], f1_scores[1], (time.time() - t_test) 138 | 139 | def incremental_evaluate_with_split_class(sess, model, minibatch_iter, class_map, test_labels, size, test=False): 140 | t_test = time.time() 141 | finished = False 142 | val_losses = [] 143 | labels_old = [] 144 | preds_old = [] 145 | labels_new = [] 146 | preds_new = [] 147 | iter_num = 0 148 | while not finished: 149 | feed_dict_val, batch_labels, finished, batch_nodes = minibatch_iter.incremental_node_val_feed_dict(size, iter_num, test=test) 150 | if not FLAGS.two_channel: 151 | node_outs_val = sess.run([model.outputs, model.loss], feed_dict = feed_dict_val) 152 | preds = node_outs_val[0] 153 | val_losses.append(node_outs_val[1]) 154 | else: 155 | output_old, output_new, output_loss = sess.run([model.output_rec, model.output_abn, model.loss], feed_dict = feed_dict_val) 156 | val_losses.append(output_loss) 157 | for idx, node in enumerate(batch_nodes): 158 | for cate in test_labels[-1][node]["old"]: 159 | if cate in class_map: 160 | labels_old.append(1) 161 | if not FLAGS.two_channel: 162 | preds_old.append(int(preds[idx, class_map[cate]] > 0.5)) 163 | else: 164 | preds_old.append(int(output_old[idx, class_map[cate]] > 0.5)) 165 | for cate in test_labels[-1][node]["neg"][:len(test_labels[-1][node]["old"])]: 166 | if cate in class_map: 167 | labels_old.append(0) 168 | if not FLAGS.two_channel: 169 | preds_old.append(int(preds[idx, class_map[cate]] > 0.5)) 170 | else: 171 | preds_old.append(int(output_old[idx, class_map[cate]] > 0.5)) 172 | for cate in test_labels[-1][node]["new"]: 173 | if cate in class_map: 174 | 
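# calc_f1 above binarizes the sigmoid outputs at 0.5 and reports micro / macro F1 with
# scikit-learn on multi-label indicator arrays. A tiny self-contained usage sketch
# (the toy arrays are illustrative).
import numpy as np
from sklearn import metrics

y_true = np.array([[1, 0, 1], [0, 1, 0]], dtype=np.float32)
y_prob = np.array([[0.9, 0.2, 0.4], [0.1, 0.8, 0.3]], dtype=np.float32)
y_pred = (y_prob > 0.5).astype(np.float32)
f1_micro = metrics.f1_score(y_true, y_pred, average="micro")
f1_macro = metrics.f1_score(y_true, y_pred, average="macro")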
labels_new.append(1) 175 | if not FLAGS.two_channel: 176 | preds_new.append(int(preds[idx, class_map[cate]] > 0.5)) 177 | else: 178 | preds_new.append(int(output_new[idx, class_map[cate]] > 0.5)) 179 | for cate in test_labels[-1][node]["neg"][:len(test_labels[-1][node]["new"])]: 180 | if cate in class_map: 181 | labels_new.append(0) 182 | if not FLAGS.two_channel: 183 | preds_new.append(int(preds[idx, class_map[cate]] > 0.5)) 184 | else: 185 | preds_new.append(int(output_new[idx, class_map[cate]] > 0.5)) 186 | iter_num += 1 187 | preds_old = np.vstack(preds_old) 188 | labels_old = np.vstack(labels_old) 189 | preds_new = np.vstack(preds_new) 190 | labels_new = np.vstack(labels_new) 191 | f1_old = calc_f1(labels_old, preds_old) 192 | f1_new = calc_f1(labels_new, preds_new) 193 | return np.mean(val_losses), f1_old[0], f1_old[1], f1_new[0], f1_new[1], (time.time() - t_test) 194 | 195 | 196 | def train(train_data, test_data=None): 197 | 198 | G = train_data[0] 199 | features = train_data[1] 200 | id_map = train_data[2] 201 | all_class = train_data[4] 202 | class_map = train_data[5] 203 | test_class = train_data[6] 204 | 205 | num_classes = FLAGS.num_classes 206 | 207 | if not features is None: 208 | features = np.vstack([features, np.zeros((features.shape[1], ))]) 209 | 210 | context_pairs = train_data[3] if FLAGS.random_context else None 211 | placeholders = construct_placeholders() 212 | 213 | minibatch = SeqNodeMinibatchIterator(G, 214 | id_map, 215 | placeholders, 216 | all_class, 217 | num_classes, 218 | batch_size=FLAGS.batch_size, 219 | max_degree=FLAGS.max_degree, 220 | context_pairs = context_pairs, 221 | num_steps = FLAGS.num_steps) 222 | adj_info_ph = tf.placeholder(tf.int32, shape=np.array(minibatch.adj).shape) 223 | adj_info = tf.Variable(adj_info_ph, trainable=False, name="adj_info") 224 | 225 | sampler = SeqUniformNeighborSampler(adj_info) 226 | layer_infos = [SAGEInfo("node", sampler, FLAGS.samples_1, FLAGS.dim_1)] 227 | if FLAGS.samples_2 != 0: 228 | layer_infos.append(SAGEInfo("node", sampler, FLAGS.samples_2, FLAGS.dim_2)) 229 | if FLAGS.samples_3 != 0: 230 | layer_infos.append(SAGEInfo("node", sampler, FLAGS.samples_3, FLAGS.dim_2)) 231 | if FLAGS.sequential: 232 | print("using sequential two channel inputs") 233 | model = SeqGraphsage(num_classes, placeholders, 234 | features, 235 | adj_info, 236 | minibatch.deg, 237 | layer_infos, 238 | model_size=FLAGS.model_size, 239 | identity_dim=FLAGS.identity_dim, 240 | num_steps = FLAGS.num_steps, 241 | logging=True) 242 | config = tf.ConfigProto(log_device_placement=FLAGS.log_device_placement) 243 | config.gpu_options.allow_growth = True 244 | config.allow_soft_placement = True 245 | 246 | sess = tf.Session(config=config) 247 | merged = tf.summary.merge_all() 248 | summary_writer = tf.summary.FileWriter(log_dir(), sess.graph) 249 | 250 | sess.run(tf.global_variables_initializer(), feed_dict={adj_info_ph: np.array(minibatch.adj)}) 251 | 252 | total_steps = 0 253 | avg_time = 0.0 254 | epoch_val_costs = [] 255 | 256 | train_adj_info = tf.assign(adj_info, np.array(minibatch.adj)) 257 | val_adj_info = tf.assign(adj_info, np.array(minibatch.test_adj)) 258 | best_vani_f1_mic , best_vani_f1_mac, best_burst_f1_mic, best_burst_f1_mac = 0, 0 , 0, 0 259 | for epoch in range(FLAGS.epochs): 260 | minibatch.shuffle() 261 | iter = 0 262 | print("Epoch: %04d" % (epoch+1)) 263 | epoch_val_costs.append(0) 264 | while not minibatch.end(): 265 | feed_dict, labels = minibatch.next_minibatch_feed_dict() 266 | feed_dict.update({placeholders['dropout']: 
FLAGS.dropout}) 267 | t = time.time() 268 | outs = sess.run([merged, model.opt_op, model.loss, model.outputs], feed_dict=feed_dict) 269 | train_cost = outs[2] 270 | if iter % FLAGS.validate_iter == 0: 271 | 272 | sess.run(val_adj_info.op) 273 | if not FLAGS.split_class: 274 | val_cost, val_f1_mic, val_f1_mac, duration = incremental_evaluate(sess, model, minibatch, class_map, test_class, FLAGS.validate_batch_size) 275 | else: 276 | val_cost, old_f1_mic, old_f1_mac, new_f1_mic, new_f1_mac, duration = incremental_evaluate_with_split_class(sess, model, minibatch, class_map, test_class, FLAGS.validate_batch_size) 277 | sess.run(train_adj_info.op) 278 | epoch_val_costs[-1] += val_cost 279 | 280 | if total_steps % FLAGS.print_every == 0: 281 | summary_writer.add_summary(outs[0], total_steps) 282 | 283 | avg_time = (avg_time * total_steps + time.time() - t) / (total_steps + 1) 284 | 285 | if total_steps % FLAGS.print_every == 0: 286 | if not FLAGS.split_class: 287 | print("Iter: {:04d}\ntrain_loss= {:.5f} test_loss= {:.5f}\nval_f1_mic= {:.5f} val_f1_mac= {:.5f} time= {:.5f}".format(iter, train_cost, val_cost, val_f1_mic, val_f1_mac, avg_time)) 288 | else: 289 | print("Iter: {:04d}\ntrain_loss= {:.5f} test_loss= {:.5f}\nvanilla_f1_mic= {:.5f} vanilla_f1_mac= {:.5f} burst_f1_mic= {:.5f} burst_f1_mac= {:.5f} time= {:.5f}".format(iter, train_cost, val_cost, old_f1_mic, old_f1_mac, new_f1_mic, new_f1_mac, avg_time)) 290 | if old_f1_mic > best_vani_f1_mic: 291 | best_vani_f1_mic = old_f1_mic 292 | best_vani_f1_mac = old_f1_mac 293 | best_burst_f1_mic = new_f1_mic 294 | best_burst_f1_mac = new_f1_mac 295 | iter += 1 296 | total_steps += 1 297 | 298 | if total_steps > FLAGS.max_total_steps: 299 | break 300 | if total_steps > FLAGS.max_total_steps: 301 | break 302 | 303 | print("Optimization finished") 304 | if FLAGS.split_class: 305 | print("Results:\nvanilla_f1_micro= {:.5f} vanilla_f1_macro={:.5f} burst_f1_micro= {:.5f} burst_f1_macro= {:.5f}".format( best_vani_f1_mic, best_vani_f1_mac, best_burst_f1_mic, best_burst_f1_mac)) 306 | 307 | def main(argv=None): 308 | print("Loading training data ..") 309 | if FLAGS.sequential: 310 | train_data = load_seq_data(FLAGS.train_prefix, num_steps=FLAGS.num_steps, split_class=FLAGS.split_class) 311 | else: 312 | train_data = load_data(FLAGS.train_prefix, split_class = FLAGS.split_class) 313 | print("Done loading training data ..") 314 | train(train_data) 315 | 316 | if __name__ == '__main__': 317 | tf.app.run() 318 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import random 5 | import json 6 | import sys 7 | import os 8 | 9 | import networkx as nx 10 | from networkx.readwrite import json_graph 11 | version_info = list(map(int, nx.__version__.split('.'))) 12 | print(version_info) 13 | major = version_info[0] 14 | minor = version_info[1] 15 | #assert (major <= 1) and (minor <= 11), "networkx major version > 1.11" 16 | 17 | WALK_LEN=4 18 | N_WALKS=10 19 | 20 | def load_data(prefix, normalize=True, load_walks=False, split_class=False): 21 | G_data = json.load(open(prefix + "-G.json")) 22 | G = json_graph.node_link_graph(G_data) 23 | if isinstance(list(G.nodes())[0], int): 24 | conversion = lambda n : int(n) 25 | else: 26 | conversion = lambda n : n 27 | 28 | if os.path.exists(prefix + "-feats.npy"): 29 | feats = np.load(prefix + "-feats.npy") 30 | else: 31 | print("No 
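# The loaders in this file, like readFile.py earlier, read graphs serialized in
# networkx node-link JSON format. A minimal sketch of that round trip, with an
# illustrative file name rather than a real path from this repo.
import json
from networkx.readwrite import json_graph

def read_node_link_graph(path):
    with open(path, 'r') as rf:
        data = json.load(rf)
    return json_graph.node_link_graph(data)

# G = read_node_link_graph('some-graph.json'), then G.nodes() / G.edges() as usual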
features present.. Only identity features will be used.") 32 | feats = None 33 | 34 | broken_count = 0 35 | pred_nodes = list(G.nodes()) 36 | print(len(pred_nodes)) 37 | for node in pred_nodes: 38 | if ('test' not in G.node[node]) and (not 'user' in G.node[node]): 39 | G.remove_node(node) 40 | broken_count += 1 41 | #break 42 | print("Removed {:d} nodes that lacked proper annotations due to networkx versioning issues".format(broken_count)) 43 | 44 | class_map = {} 45 | missed_class_set = set([]) 46 | for node in G.nodes(): 47 | if not G.node[node]["user"]: 48 | class_map[node] = len(class_map) 49 | print("{:d} categories in the network".format(len(class_map))) 50 | print(class_map) 51 | id_map = json.load(open(prefix + "-id_map.json")) 52 | id_map = {conversion(k):int(v) for k,v in id_map.items()} 53 | walks = [] 54 | train_class_map = json.load(open(prefix + "-train.json")) 55 | all_class = {} 56 | if not split_class: 57 | train_degrees = 0.0 58 | for node, category in train_class_map.items(): 59 | vec = [0.0 for i in range(len(class_map))] 60 | for cat in category["new_cate"]: 61 | if cat not in class_map: 62 | missed_class_set.add(cat) 63 | else: 64 | vec[class_map[cat]] = 1.0 65 | 66 | for cat in category["old_cate"]: 67 | if cat not in class_map: 68 | missed_class_set.add(cat) 69 | else: 70 | vec[class_map[cat]] = 1.0 71 | all_class[node] = vec[:] 72 | train_degrees += sum(vec) 73 | print("average categorys: {:.3f} in train dataset".format(train_degrees / len(train_class_map.items()))) 74 | else: 75 | train_old_degrees , train_new_degrees = 0.0, 0.0 76 | for node, category in train_class_map.items(): 77 | vec = [0.0 for i in range(len(class_map))] 78 | for cat in category["old_cate"]: 79 | if cat not in class_map: 80 | missed_class_set.add(cat) 81 | else: 82 | vec[class_map[cat]] = 1.0 83 | vec1 = [0.0 for i in range(len(class_map))] 84 | for cat in category["new_cate"]: 85 | if cat not in class_map: 86 | missed_class_set.add(cat) 87 | else: 88 | vec1[class_map[cat]] = 1.0 89 | all_class[node] = {'old': vec[:], 'new': vec1[:]} 90 | train_old_degrees += sum(vec) 91 | train_new_degrees += sum(vec1) 92 | print("average old categorys: {:.3f} , new categorys: {:.3f} in train dataset".format(train_old_degrees / len(train_class_map), train_new_degrees / len(train_class_map))) 93 | 94 | 95 | test_class_map = json.load(open(prefix + "-test.json")) 96 | test_class = {} 97 | if not split_class: 98 | test_degrees = 0.0 99 | for node, category in test_class_map.items(): 100 | vec = category['old_cate'][:] 101 | vec.extend(category['new_cate'][:]) 102 | test_class[node] = { 'pos': vec[:], 'neg': category['neg_cate'][:]} 103 | vec = [0.0 for i in range(len(class_map))] 104 | for cat in category["new_cate"]: 105 | if cat not in class_map: 106 | missed_class_set.add(cat) 107 | else: 108 | vec[class_map[cat]] = 1.0 109 | for cat in category["old_cate"]: 110 | if cat not in class_map: 111 | missed_class_set.add(cat) 112 | else: 113 | vec[class_map[cat]] = 1.0 114 | all_class[node] = vec[:] 115 | test_degrees += sum(vec) 116 | print("average categorys: {:.3f} in test dataset".format(test_degrees / len(test_class_map))) 117 | else: 118 | test_old_degrees , test_new_degrees = 0.0, 0.0 119 | for node, category in test_class_map.items(): 120 | test_class[node] = {'old': category["old_cate"][:], 'new': category["new_cate"][:], 'neg': category["neg_cate"][:]} 121 | vec = [0.0 for i in range(len(class_map))] 122 | for cat in category["old_cate"]: 123 | if cat not in class_map: 124 | missed_class_set.add(cat) 
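# The loops in load_data around this point turn each node's category lists into
# fixed-length multi-hot vectors indexed by class_map, while collecting categories
# missing from the map into missed_class_set. A small standalone sketch of that
# encoding (the toy class map and category names are illustrative assumptions; the
# sketch simply skips unknown categories rather than tracking them).
import numpy as np

def multi_hot(categories, class_map):
    vec = np.zeros(len(class_map), dtype=np.float32)
    for cat in categories:
        if cat in class_map:      # unknown categories are skipped here
            vec[class_map[cat]] = 1.0
    return vec

# e.g. multi_hot(['cate_a', 'cate_c'], {'cate_a': 0, 'cate_b': 1, 'cate_c': 2})
# -> array([1., 0., 1.], dtype=float32)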
125 |                 else:
126 |                     vec[class_map[cat]] = 1.0
127 |             vec1 = [0.0 for i in range(len(class_map))]
128 |             for cat in category["new_cate"]:
129 |                 if cat not in class_map:
130 |                     missed_class_set.add(cat)
131 |                 else:
132 |                     vec1[class_map[cat]] = 1.0
133 |             all_class[node] = {'old': vec[:], 'new': vec1[:]}
134 |             test_old_degrees += sum(vec)
135 |             test_new_degrees += sum(vec1)
136 |         print("average old categories: {:.3f}, new categories: {:.3f} in test dataset".format(test_old_degrees / len(test_class_map), test_new_degrees / len(test_class_map)))
137 |     print("missed class set length: {:d}".format(len(missed_class_set)))
138 |     ## Make sure the graph has edge train_removed annotations
139 |     ## (some datasets might already have this..)
140 |     print("Loaded data.. now preprocessing..")
141 |     for edge in G.edges():
142 |         if ((not G.node[edge[0]]['user'] or G.node[edge[0]]['test']) and (not G.node[edge[1]]['user'] or G.node[edge[1]]['test'])):
143 |             G[edge[0]][edge[1]]['train_removed'] = True
144 |         else:
145 |             G[edge[0]][edge[1]]['train_removed'] = False
146 | 
147 |     if normalize and not feats is None:
148 |         from sklearn.preprocessing import StandardScaler
149 |         train_ids = np.array([id_map[n] for n in G.nodes() if not G.node[n]['test'] and G.node[n]['user']])
150 |         train_feats = feats[train_ids]
151 |         scaler = StandardScaler()
152 |         scaler.fit(train_feats)
153 |         feats = scaler.transform(feats)
154 | 
155 |     if load_walks:
156 |         with open(prefix + "-walks.txt") as fp:
157 |             for line in fp:
158 |                 walks.append(map(conversion, line.split()))
159 | 
160 |     return G, feats, id_map, walks, all_class, class_map, test_class
161 | 
162 | 
163 | def load_seq_data(prefix, num_steps=5, normalize=True, load_walks=False, split_class=False):
164 |     Gs = []
165 |     all_results, test_results = [], []
166 |     id_map = json.load(open(prefix+'alibaba_id_map.json'))
167 |     id_map = {k: int(v) for k,v in id_map.items()}
168 |     class_map = {}
169 |     for step in range(num_steps):
170 |         G_data = json.load(open(prefix + "graph/alibaba_gul_graph_{:d}.json".format(13+step)))
171 |         G = json_graph.node_link_graph(G_data)
172 |         num = 0
173 |         for node in G.nodes():
174 |             if not G.node[node]["user"] and node not in class_map:
175 |                 class_map[node] = len(class_map)
176 |             if not G.node[node]["user"]:
177 |                 num += 1
178 |         print("{:d} categories in the graph {:d}".format(num, step))
179 |     print("class map length: {:d}".format(len(class_map)))
180 |     print(class_map)
181 |     for step in range(num_steps):
182 |         G_data = json.load(open(prefix + "graph/alibaba_gul_graph_{:d}.json".format(13+step)))
183 |         G = json_graph.node_link_graph(G_data)
184 |         if isinstance(list(G.nodes())[0], int):
185 |             conversion = lambda n : int(n)
186 |         else:
187 |             conversion = lambda n : n
188 | 
189 |         if os.path.exists(prefix + "features.npy"):
190 |             feats = np.load(prefix + "features.npy")
191 |         else:
192 |             print("No features present.. Only identity features will be used.")
193 |             feats = None
194 | 
195 |         broken_count = 0
196 |         pred_nodes = list(G.nodes())
197 |         print("total number of nodes in graph {:d} is {:d}".format(step, len(pred_nodes)))
198 |         for node in pred_nodes:
199 |             if ('test' not in G.node[node]) and (not 'user' in G.node[node]):
200 |                 G.remove_node(node)
201 |                 broken_count += 1
202 |         print("Removed {:d} nodes that lacked proper annotations due to networkx versioning issues".format(broken_count))
203 | 
204 |         missed_class_set = set([])
205 |         walks = []
206 |         train_class_map = json.load(open(prefix + "label/alibaba_gul_graph_train_label_{:d}.json".format(step+14)))
207 |         all_class = {}
208 |         if not split_class:
209 |             train_degrees = 0.0
210 |             for node, category in train_class_map.items():
211 |                 vec = [0.0 for i in range(len(class_map))]
212 |                 for cat in category["new_cate"]:
213 |                     if cat not in class_map:
214 |                         missed_class_set.add(cat)
215 |                     else:
216 |                         vec[class_map[cat]] = 1.0
217 | 
218 |                 for cat in category["old_cate"]:
219 |                     if cat not in class_map:
220 |                         missed_class_set.add(cat)
221 |                     else:
222 |                         vec[class_map[cat]] = 1.0
223 |                 all_class[node] = vec[:]
224 |                 train_degrees += sum(vec)
225 |             print("average categories: {:.3f} in train dataset".format(train_degrees / len(train_class_map.items())))
226 |         else:
227 |             train_old_degrees , train_new_degrees = 0.0, 0.0
228 |             for node, category in train_class_map.items():
229 |                 vec = [0.0 for i in range(len(class_map))]
230 |                 for cat in category["old_cate"]:
231 |                     if cat not in class_map:
232 |                         missed_class_set.add(cat)
233 |                     else:
234 |                         vec[class_map[cat]] = 1.0
235 |                 vec1 = [0.0 for i in range(len(class_map))]
236 |                 for cat in category["new_cate"]:
237 |                     if cat not in class_map:
238 |                         missed_class_set.add(cat)
239 |                     else:
240 |                         vec1[class_map[cat]] = 1.0
241 |                 all_class[node] = {'old': vec[:], 'new': vec1[:]}
242 |                 train_old_degrees += sum(vec)
243 |                 train_new_degrees += sum(vec1)
244 |             print("average old categories: {:.3f}, new categories: {:.3f} in train dataset".format(train_old_degrees / len(train_class_map), train_new_degrees / len(train_class_map)))
245 | 
246 | 
247 |         test_class_map = json.load(open(prefix + "label/alibaba_gul_graph_test_label_{:d}.json".format(step+14)))
248 |         test_class = {}
249 |         if not split_class:
250 |             test_degrees = 0.0
251 |             for node, category in test_class_map.items():
252 |                 vec = category['old_cate'][:]
253 |                 vec.extend(category['new_cate'][:])
254 |                 test_class[node] = { 'pos': vec[:], 'neg': category['neg_cate'][:]}
255 |                 vec = [0.0 for i in range(len(class_map))]
256 |                 for cat in category["new_cate"]:
257 |                     if cat not in class_map:
258 |                         missed_class_set.add(cat)
259 |                     else:
260 |                         vec[class_map[cat]] = 1.0
261 |                 for cat in category["old_cate"]:
262 |                     if cat not in class_map:
263 |                         missed_class_set.add(cat)
264 |                     else:
265 |                         vec[class_map[cat]] = 1.0
266 |                 all_class[node] = vec[:]
267 |                 test_degrees += sum(vec)
268 |             print("average categories: {:.3f} in test dataset".format(test_degrees / len(test_class_map)))
269 |         else:
270 |             test_old_degrees , test_new_degrees = 0.0, 0.0
271 |             for node, category in test_class_map.items():
272 |                 test_class[node] = {'old': category["old_cate"][:], 'new': category["new_cate"][:], 'neg': category["neg_cate"][:]}
273 |                 vec = [0.0 for i in range(len(class_map))]
274 |                 for cat in category["old_cate"]:
275 |                     if cat not in class_map:
276 |                         missed_class_set.add(cat)
277 |                     else:
278 |                         vec[class_map[cat]] = 1.0
279 |                 vec1 = [0.0 for i in range(len(class_map))]
280 |                 for cat in category["new_cate"]:
281 |                     if cat not in class_map:
282 |                         missed_class_set.add(cat)
283 |                     else:
284 |                         vec1[class_map[cat]] = 1.0
285 |                 all_class[node] = {'old': vec[:], 'new': vec1[:]}
286 |                 test_old_degrees += sum(vec)
287 |                 test_new_degrees += sum(vec1)
288 |             print("average old categories: {:.3f}, new categories: {:.3f} in test dataset".format(test_old_degrees / len(test_class_map), test_new_degrees / len(test_class_map)))
289 |         print("missed class set length: {:d}".format(len(missed_class_set)))
290 |         ## Make sure the graph has edge train_removed annotations
291 |         ## (some datasets might already have this..)
292 |         print("Loaded data.. now preprocessing..")
293 |         for edge in G.edges():
294 |             if ((not G.node[edge[0]]['user'] or G.node[edge[0]]['test']) and (not G.node[edge[1]]['user'] or G.node[edge[1]]['test'])):
295 |                 G[edge[0]][edge[1]]['train_removed'] = True
296 |             else:
297 |                 G[edge[0]][edge[1]]['train_removed'] = False
298 | 
299 |         if normalize and not feats is None:
300 |             from sklearn.preprocessing import StandardScaler
301 |             train_ids = np.array([id_map[n] for n in G.nodes() if not G.node[n]['test'] and G.node[n]['user']])
302 |             train_feats = feats[train_ids]
303 |             scaler = StandardScaler()
304 |             scaler.fit(train_feats)
305 |             feats = scaler.transform(feats)
306 | 
307 |         if load_walks:
308 |             with open(prefix + "-walks.txt") as fp:
309 |                 for line in fp:
310 |                     walks.append(map(conversion, line.split()))
311 |         Gs.append(G)
312 |         all_results.append(all_class)
313 |         test_results.append(test_class)
314 |     return Gs, feats, id_map, walks, all_results, class_map, test_results
315 | 
316 | 
317 | 
318 | def run_random_walks(G, nodes, num_walks=N_WALKS):
319 |     pairs = []
320 |     for count, node in enumerate(nodes):
321 |         if G.degree(node) == 0:
322 |             continue
323 |         for i in range(num_walks):
324 |             curr_node = node
325 |             for j in range(WALK_LEN):
326 |                 try:
327 |                     next_node = random.choice(list(G.neighbors(curr_node)))
328 |                     # self co-occurrences are useless
329 |                     if curr_node != node:
330 |                         pairs.append((node,curr_node))
331 |                     curr_node = next_node
332 |                 except:
333 |                     print(len(list(G.neighbors(curr_node))))
334 |         if count % 1000 == 0:
335 |             print("Done walks for", count, "nodes")
336 |     return pairs
337 | 
338 | if __name__ == "__main__":
339 |     load_seq_data("../sequential/")
340 | 
341 |     """ Run random walks """
342 |     '''
343 |     graph_file = sys.argv[1]
344 |     out_file = sys.argv[2]
345 |     G_data = json.load(open(graph_file))
346 |     G = json_graph.node_link_graph(G_data)
347 |     broken_count = 0
348 |     pre_list = list(G.nodes())
349 |     for n in pre_list:
350 |         if not "user" in G.node[n] or not "test" in G.node[n]:
351 |             broken_count+=1
352 |             G.remove_node(n)
353 |     item_nodes = []
354 |     for n in G.nodes():
355 |         if "user" in G.node[n] and not G.node[n]["user"]:
356 |             item_nodes.append(n)
357 |     print(len(set(item_nodes)))
358 |     remove_nodes = []
359 |     for n in G.nodes():
360 |         if "test" not in G.node[n]:
361 |             #print(n)
362 |             remove_nodes.append(n)
363 |     print(len(set(remove_nodes)))
364 |     print(list(G.nodes())[:10])
365 | 
366 |     print(list(G.neighbors(1)))
367 |     train_users_and_items = [n for n in G.nodes() if (G.node[n]["user"] and not G.node[n]["test"]) or (not G.node[n]["user"])]
368 |     nodes = [n for n in G.nodes() if G.node[n]["user"] and not G.node[n]["test"]]
369 |     G = G.subgraph(train_users_and_items)
370 |     pairs = run_random_walks(G, nodes)
371 |     with open(out_file, "w") as fp:
372 |         fp.write("\n".join([str(p[0]) + "\t" + str(p[1]) for p in pairs]))
373 |     '''
374 | 
--------------------------------------------------------------------------------