├── LICENSE ├── README.md ├── aggregators.py ├── baselines └── node2vec_cora.embeddings ├── data ├── cora_id_edge.txt ├── ind.cora.allx ├── ind.cora.ally ├── ind.cora.graph ├── ind.cora.test.index ├── ind.cora.tx ├── ind.cora.ty ├── ind.cora.x └── ind.cora.y ├── graph.py ├── inits.py ├── layers.py ├── main.py ├── minibatch.py ├── models.py ├── test.py └── utils.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Junliang Guo 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SPINE 2 | This repository provides a reference implementation of the paper "SPINE: Structural Identity Preserved Inductive Network Embedding". 3 | SPINE is an inductive embedding method that simultaneously preserves the local proximity and the structural identity of nodes. Details can be found [here](http://arxiv.org/abs/1802.03984). 4 | 5 | ### Requirements 6 | 7 | * tensorflow > 1.2.1 8 | * networkx == 1.11 9 | * gensim > 2.3.0 10 | * fastdtw 11 | 12 | ### Usage 13 | 14 | To run SPINE on the Cora dataset, execute: 15 | ``` 16 | python main.py --input data/cora_id_edge.txt --train_prefix cora --preprocess True 17 | ``` 18 | 19 | ### Options 20 | 21 | To evaluate the performance by Pearson and Spearman correlation instead of classification accuracy, set ``--CORR True``. 22 | 23 | To run SPINE and SPINE-p, deactivate and activate ``--PRETRAIN``, respectively (a combined example is given at the end of this README). 24 | 25 | For more options, please check ``main.py``. 26 | 27 | ### Acknowledgements 28 | 29 | We refer to [GraphSAGE](https://github.com/williamleif/GraphSAGE) and [GCN](https://github.com/tkipf/gcn) while constructing the code framework and preprocessing the datasets. Many thanks to the authors for making their code available. 30 | 31 | ### Miscellaneous 32 | 33 | Please cite our paper if you find SPINE useful in your research. 34 | ``` 35 | @inproceedings{guo2019spine, 36 | title={SPINE: Structural Identity Preserved Inductive Network Embedding}, 37 | author={Guo, Junliang and Xu, Linli and Liu, Jingchang}, 38 | booktitle={Twenty-Eighth International Joint Conference on Artificial Intelligence}, 39 | year={2019} 40 | } 41 | ``` 42 | 43 | This is only a reference implementation of SPINE; feel free to ask any questions by opening an issue or emailing me at .
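For example, assuming the preprocessing step from Usage has already been run once (so ``--preprocess`` can be left as ``False``), the options documented above can be combined to train SPINE-p with the node2vec embeddings shipped in ``baselines/`` and to evaluate by correlation instead of classification accuracy; this is only an illustrative invocation, all flags are defined in ``main.py``:
```
python main.py --input data/cora_id_edge.txt --train_prefix cora --preprocess False --PRETRAIN True --CORR True
```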
44 | -------------------------------------------------------------------------------- /aggregators.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import tensorflow as tf 5 | 6 | from layers import Layer, Dense 7 | from inits import glorot, zeros, uniform 8 | 9 | class WeightedAggregator(Layer): 10 | """ 11 | An multi-layer perceptron based feature aggregator 12 | """ 13 | 14 | def __init__(self, input_dim, output_dim, dropout = 0., bias = False, 15 | hidden_dim = 512, act = tf.nn.relu, name = None, **kwargs): 16 | super(WeightedAggregator, self).__init__(**kwargs) 17 | 18 | self.dropout = dropout 19 | self.bias = bias 20 | self.act = act 21 | self.hidden_dim = hidden_dim 22 | 23 | if name is not None: 24 | name = '/' + name 25 | else: 26 | name = '' 27 | 28 | with tf.variable_scope(self.name + name + '_vars'): 29 | self.vars['mlp_weights'] = glorot([input_dim, output_dim], name = 'mlp_weights') 30 | tf.summary.histogram("mlp_weights", self.vars['mlp_weights']) 31 | if self.bias: 32 | self.vars['bias'] = zeros([output_dim], name = 'bias') 33 | tf.summary.histogram("bias", self.vars['bias']) 34 | if self.logging: 35 | self._log_vars() 36 | 37 | self.input_dim = input_dim 38 | self.output_dim = output_dim 39 | 40 | def _call(self, inputs): 41 | # vec: bs * k * f, weights: bs * k 42 | node_vecs, node_weights, bs, k = inputs 43 | 44 | # weighted summation 45 | node_weights = tf.reshape(node_weights, [bs, k, 1]) 46 | node_vecs = tf.reduce_sum(tf.multiply(node_vecs, tf.cast(node_weights, dtype = tf.float32)), 1) 47 | out_k = tf.matmul(node_vecs, self.vars['mlp_weights']) 48 | if self.bias: 49 | out_k += self.vars['bias'] 50 | 51 | # out_k: bs * k * d 52 | out_k = self.act(out_k) 53 | return out_k 54 | 55 | class MeanAggregator(Layer): 56 | """ 57 | An multi-layer perceptron based feature aggregator 58 | """ 59 | 60 | def __init__(self, input_dim, output_dim, dropout = 0., bias = False, 61 | hidden_dim = 512, act = tf.nn.relu, name = None, **kwargs): 62 | super(MeanAggregator, self).__init__(**kwargs) 63 | 64 | self.dropout = dropout 65 | self.bias = bias 66 | self.act = act 67 | self.hidden_dim = hidden_dim 68 | 69 | if name is not None: 70 | name = '/' + name 71 | else: 72 | name = '' 73 | 74 | self.mlp_layers = [] 75 | self.mlp_layers.append(Dense(input_dim=input_dim, 76 | output_dim=hidden_dim, 77 | act=tf.nn.relu, 78 | dropout=dropout, 79 | sparse_inputs=False, 80 | bias = bias, 81 | logging=self.logging)) 82 | 83 | with tf.variable_scope(self.name + name + '_vars'): 84 | self.vars['mlp_weights'] = glorot([input_dim, output_dim], name = 'mlp_weights') 85 | tf.summary.histogram("mlp_weights", self.vars['mlp_weights']) 86 | if self.bias: 87 | self.vars['bias'] = zeros([output_dim], name = 'bias') 88 | tf.summary.histogram("bias", self.vars['bias']) 89 | if self.logging: 90 | self._log_vars() 91 | 92 | self.input_dim = input_dim 93 | self.output_dim = output_dim 94 | 95 | def _call(self, inputs): 96 | node_vecs, node_weights, bs, k = inputs 97 | 98 | node_means = tf.reduce_mean(node_vecs, 1) 99 | out_k = tf.matmul(node_means, self.vars['mlp_weights']) 100 | if self.bias: 101 | out_k += self.vars['bias'] 102 | return self.act(out_k) 103 | -------------------------------------------------------------------------------- /data/ind.cora.allx: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/lemmonation/spine/6b60528f7f10a9ae9346a273dc63262e18ae89b9/data/ind.cora.allx -------------------------------------------------------------------------------- /data/ind.cora.ally: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lemmonation/spine/6b60528f7f10a9ae9346a273dc63262e18ae89b9/data/ind.cora.ally -------------------------------------------------------------------------------- /data/ind.cora.graph: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lemmonation/spine/6b60528f7f10a9ae9346a273dc63262e18ae89b9/data/ind.cora.graph -------------------------------------------------------------------------------- /data/ind.cora.test.index: -------------------------------------------------------------------------------- 1 | 2692 2 | 2532 3 | 2050 4 | 1715 5 | 2362 6 | 2609 7 | 2622 8 | 1975 9 | 2081 10 | 1767 11 | 2263 12 | 1725 13 | 2588 14 | 2259 15 | 2357 16 | 1998 17 | 2574 18 | 2179 19 | 2291 20 | 2382 21 | 1812 22 | 1751 23 | 2422 24 | 1937 25 | 2631 26 | 2510 27 | 2378 28 | 2589 29 | 2345 30 | 1943 31 | 1850 32 | 2298 33 | 1825 34 | 2035 35 | 2507 36 | 2313 37 | 1906 38 | 1797 39 | 2023 40 | 2159 41 | 2495 42 | 1886 43 | 2122 44 | 2369 45 | 2461 46 | 1925 47 | 2565 48 | 1858 49 | 2234 50 | 2000 51 | 1846 52 | 2318 53 | 1723 54 | 2559 55 | 2258 56 | 1763 57 | 1991 58 | 1922 59 | 2003 60 | 2662 61 | 2250 62 | 2064 63 | 2529 64 | 1888 65 | 2499 66 | 2454 67 | 2320 68 | 2287 69 | 2203 70 | 2018 71 | 2002 72 | 2632 73 | 2554 74 | 2314 75 | 2537 76 | 1760 77 | 2088 78 | 2086 79 | 2218 80 | 2605 81 | 1953 82 | 2403 83 | 1920 84 | 2015 85 | 2335 86 | 2535 87 | 1837 88 | 2009 89 | 1905 90 | 2636 91 | 1942 92 | 2193 93 | 2576 94 | 2373 95 | 1873 96 | 2463 97 | 2509 98 | 1954 99 | 2656 100 | 2455 101 | 2494 102 | 2295 103 | 2114 104 | 2561 105 | 2176 106 | 2275 107 | 2635 108 | 2442 109 | 2704 110 | 2127 111 | 2085 112 | 2214 113 | 2487 114 | 1739 115 | 2543 116 | 1783 117 | 2485 118 | 2262 119 | 2472 120 | 2326 121 | 1738 122 | 2170 123 | 2100 124 | 2384 125 | 2152 126 | 2647 127 | 2693 128 | 2376 129 | 1775 130 | 1726 131 | 2476 132 | 2195 133 | 1773 134 | 1793 135 | 2194 136 | 2581 137 | 1854 138 | 2524 139 | 1945 140 | 1781 141 | 1987 142 | 2599 143 | 1744 144 | 2225 145 | 2300 146 | 1928 147 | 2042 148 | 2202 149 | 1958 150 | 1816 151 | 1916 152 | 2679 153 | 2190 154 | 1733 155 | 2034 156 | 2643 157 | 2177 158 | 1883 159 | 1917 160 | 1996 161 | 2491 162 | 2268 163 | 2231 164 | 2471 165 | 1919 166 | 1909 167 | 2012 168 | 2522 169 | 1865 170 | 2466 171 | 2469 172 | 2087 173 | 2584 174 | 2563 175 | 1924 176 | 2143 177 | 1736 178 | 1966 179 | 2533 180 | 2490 181 | 2630 182 | 1973 183 | 2568 184 | 1978 185 | 2664 186 | 2633 187 | 2312 188 | 2178 189 | 1754 190 | 2307 191 | 2480 192 | 1960 193 | 1742 194 | 1962 195 | 2160 196 | 2070 197 | 2553 198 | 2433 199 | 1768 200 | 2659 201 | 2379 202 | 2271 203 | 1776 204 | 2153 205 | 1877 206 | 2027 207 | 2028 208 | 2155 209 | 2196 210 | 2483 211 | 2026 212 | 2158 213 | 2407 214 | 1821 215 | 2131 216 | 2676 217 | 2277 218 | 2489 219 | 2424 220 | 1963 221 | 1808 222 | 1859 223 | 2597 224 | 2548 225 | 2368 226 | 1817 227 | 2405 228 | 2413 229 | 2603 230 | 2350 231 | 2118 232 | 2329 233 | 1969 234 | 2577 235 | 2475 236 | 2467 237 | 2425 238 | 1769 239 | 2092 240 | 2044 241 | 2586 242 | 2608 243 | 1983 244 | 2109 245 | 2649 246 | 1964 247 | 2144 248 | 1902 249 | 2411 250 | 2508 251 | 2360 252 | 
1721 253 | 2005 254 | 2014 255 | 2308 256 | 2646 257 | 1949 258 | 1830 259 | 2212 260 | 2596 261 | 1832 262 | 1735 263 | 1866 264 | 2695 265 | 1941 266 | 2546 267 | 2498 268 | 2686 269 | 2665 270 | 1784 271 | 2613 272 | 1970 273 | 2021 274 | 2211 275 | 2516 276 | 2185 277 | 2479 278 | 2699 279 | 2150 280 | 1990 281 | 2063 282 | 2075 283 | 1979 284 | 2094 285 | 1787 286 | 2571 287 | 2690 288 | 1926 289 | 2341 290 | 2566 291 | 1957 292 | 1709 293 | 1955 294 | 2570 295 | 2387 296 | 1811 297 | 2025 298 | 2447 299 | 2696 300 | 2052 301 | 2366 302 | 1857 303 | 2273 304 | 2245 305 | 2672 306 | 2133 307 | 2421 308 | 1929 309 | 2125 310 | 2319 311 | 2641 312 | 2167 313 | 2418 314 | 1765 315 | 1761 316 | 1828 317 | 2188 318 | 1972 319 | 1997 320 | 2419 321 | 2289 322 | 2296 323 | 2587 324 | 2051 325 | 2440 326 | 2053 327 | 2191 328 | 1923 329 | 2164 330 | 1861 331 | 2339 332 | 2333 333 | 2523 334 | 2670 335 | 2121 336 | 1921 337 | 1724 338 | 2253 339 | 2374 340 | 1940 341 | 2545 342 | 2301 343 | 2244 344 | 2156 345 | 1849 346 | 2551 347 | 2011 348 | 2279 349 | 2572 350 | 1757 351 | 2400 352 | 2569 353 | 2072 354 | 2526 355 | 2173 356 | 2069 357 | 2036 358 | 1819 359 | 1734 360 | 1880 361 | 2137 362 | 2408 363 | 2226 364 | 2604 365 | 1771 366 | 2698 367 | 2187 368 | 2060 369 | 1756 370 | 2201 371 | 2066 372 | 2439 373 | 1844 374 | 1772 375 | 2383 376 | 2398 377 | 1708 378 | 1992 379 | 1959 380 | 1794 381 | 2426 382 | 2702 383 | 2444 384 | 1944 385 | 1829 386 | 2660 387 | 2497 388 | 2607 389 | 2343 390 | 1730 391 | 2624 392 | 1790 393 | 1935 394 | 1967 395 | 2401 396 | 2255 397 | 2355 398 | 2348 399 | 1931 400 | 2183 401 | 2161 402 | 2701 403 | 1948 404 | 2501 405 | 2192 406 | 2404 407 | 2209 408 | 2331 409 | 1810 410 | 2363 411 | 2334 412 | 1887 413 | 2393 414 | 2557 415 | 1719 416 | 1732 417 | 1986 418 | 2037 419 | 2056 420 | 1867 421 | 2126 422 | 1932 423 | 2117 424 | 1807 425 | 1801 426 | 1743 427 | 2041 428 | 1843 429 | 2388 430 | 2221 431 | 1833 432 | 2677 433 | 1778 434 | 2661 435 | 2306 436 | 2394 437 | 2106 438 | 2430 439 | 2371 440 | 2606 441 | 2353 442 | 2269 443 | 2317 444 | 2645 445 | 2372 446 | 2550 447 | 2043 448 | 1968 449 | 2165 450 | 2310 451 | 1985 452 | 2446 453 | 1982 454 | 2377 455 | 2207 456 | 1818 457 | 1913 458 | 1766 459 | 1722 460 | 1894 461 | 2020 462 | 1881 463 | 2621 464 | 2409 465 | 2261 466 | 2458 467 | 2096 468 | 1712 469 | 2594 470 | 2293 471 | 2048 472 | 2359 473 | 1839 474 | 2392 475 | 2254 476 | 1911 477 | 2101 478 | 2367 479 | 1889 480 | 1753 481 | 2555 482 | 2246 483 | 2264 484 | 2010 485 | 2336 486 | 2651 487 | 2017 488 | 2140 489 | 1842 490 | 2019 491 | 1890 492 | 2525 493 | 2134 494 | 2492 495 | 2652 496 | 2040 497 | 2145 498 | 2575 499 | 2166 500 | 1999 501 | 2434 502 | 1711 503 | 2276 504 | 2450 505 | 2389 506 | 2669 507 | 2595 508 | 1814 509 | 2039 510 | 2502 511 | 1896 512 | 2168 513 | 2344 514 | 2637 515 | 2031 516 | 1977 517 | 2380 518 | 1936 519 | 2047 520 | 2460 521 | 2102 522 | 1745 523 | 2650 524 | 2046 525 | 2514 526 | 1980 527 | 2352 528 | 2113 529 | 1713 530 | 2058 531 | 2558 532 | 1718 533 | 1864 534 | 1876 535 | 2338 536 | 1879 537 | 1891 538 | 2186 539 | 2451 540 | 2181 541 | 2638 542 | 2644 543 | 2103 544 | 2591 545 | 2266 546 | 2468 547 | 1869 548 | 2582 549 | 2674 550 | 2361 551 | 2462 552 | 1748 553 | 2215 554 | 2615 555 | 2236 556 | 2248 557 | 2493 558 | 2342 559 | 2449 560 | 2274 561 | 1824 562 | 1852 563 | 1870 564 | 2441 565 | 2356 566 | 1835 567 | 2694 568 | 2602 569 | 2685 570 | 1893 571 | 2544 572 | 2536 573 | 1994 574 | 1853 575 | 
1838 576 | 1786 577 | 1930 578 | 2539 579 | 1892 580 | 2265 581 | 2618 582 | 2486 583 | 2583 584 | 2061 585 | 1796 586 | 1806 587 | 2084 588 | 1933 589 | 2095 590 | 2136 591 | 2078 592 | 1884 593 | 2438 594 | 2286 595 | 2138 596 | 1750 597 | 2184 598 | 1799 599 | 2278 600 | 2410 601 | 2642 602 | 2435 603 | 1956 604 | 2399 605 | 1774 606 | 2129 607 | 1898 608 | 1823 609 | 1938 610 | 2299 611 | 1862 612 | 2420 613 | 2673 614 | 1984 615 | 2204 616 | 1717 617 | 2074 618 | 2213 619 | 2436 620 | 2297 621 | 2592 622 | 2667 623 | 2703 624 | 2511 625 | 1779 626 | 1782 627 | 2625 628 | 2365 629 | 2315 630 | 2381 631 | 1788 632 | 1714 633 | 2302 634 | 1927 635 | 2325 636 | 2506 637 | 2169 638 | 2328 639 | 2629 640 | 2128 641 | 2655 642 | 2282 643 | 2073 644 | 2395 645 | 2247 646 | 2521 647 | 2260 648 | 1868 649 | 1988 650 | 2324 651 | 2705 652 | 2541 653 | 1731 654 | 2681 655 | 2707 656 | 2465 657 | 1785 658 | 2149 659 | 2045 660 | 2505 661 | 2611 662 | 2217 663 | 2180 664 | 1904 665 | 2453 666 | 2484 667 | 1871 668 | 2309 669 | 2349 670 | 2482 671 | 2004 672 | 1965 673 | 2406 674 | 2162 675 | 1805 676 | 2654 677 | 2007 678 | 1947 679 | 1981 680 | 2112 681 | 2141 682 | 1720 683 | 1758 684 | 2080 685 | 2330 686 | 2030 687 | 2432 688 | 2089 689 | 2547 690 | 1820 691 | 1815 692 | 2675 693 | 1840 694 | 2658 695 | 2370 696 | 2251 697 | 1908 698 | 2029 699 | 2068 700 | 2513 701 | 2549 702 | 2267 703 | 2580 704 | 2327 705 | 2351 706 | 2111 707 | 2022 708 | 2321 709 | 2614 710 | 2252 711 | 2104 712 | 1822 713 | 2552 714 | 2243 715 | 1798 716 | 2396 717 | 2663 718 | 2564 719 | 2148 720 | 2562 721 | 2684 722 | 2001 723 | 2151 724 | 2706 725 | 2240 726 | 2474 727 | 2303 728 | 2634 729 | 2680 730 | 2055 731 | 2090 732 | 2503 733 | 2347 734 | 2402 735 | 2238 736 | 1950 737 | 2054 738 | 2016 739 | 1872 740 | 2233 741 | 1710 742 | 2032 743 | 2540 744 | 2628 745 | 1795 746 | 2616 747 | 1903 748 | 2531 749 | 2567 750 | 1946 751 | 1897 752 | 2222 753 | 2227 754 | 2627 755 | 1856 756 | 2464 757 | 2241 758 | 2481 759 | 2130 760 | 2311 761 | 2083 762 | 2223 763 | 2284 764 | 2235 765 | 2097 766 | 1752 767 | 2515 768 | 2527 769 | 2385 770 | 2189 771 | 2283 772 | 2182 773 | 2079 774 | 2375 775 | 2174 776 | 2437 777 | 1993 778 | 2517 779 | 2443 780 | 2224 781 | 2648 782 | 2171 783 | 2290 784 | 2542 785 | 2038 786 | 1855 787 | 1831 788 | 1759 789 | 1848 790 | 2445 791 | 1827 792 | 2429 793 | 2205 794 | 2598 795 | 2657 796 | 1728 797 | 2065 798 | 1918 799 | 2427 800 | 2573 801 | 2620 802 | 2292 803 | 1777 804 | 2008 805 | 1875 806 | 2288 807 | 2256 808 | 2033 809 | 2470 810 | 2585 811 | 2610 812 | 2082 813 | 2230 814 | 1915 815 | 1847 816 | 2337 817 | 2512 818 | 2386 819 | 2006 820 | 2653 821 | 2346 822 | 1951 823 | 2110 824 | 2639 825 | 2520 826 | 1939 827 | 2683 828 | 2139 829 | 2220 830 | 1910 831 | 2237 832 | 1900 833 | 1836 834 | 2197 835 | 1716 836 | 1860 837 | 2077 838 | 2519 839 | 2538 840 | 2323 841 | 1914 842 | 1971 843 | 1845 844 | 2132 845 | 1802 846 | 1907 847 | 2640 848 | 2496 849 | 2281 850 | 2198 851 | 2416 852 | 2285 853 | 1755 854 | 2431 855 | 2071 856 | 2249 857 | 2123 858 | 1727 859 | 2459 860 | 2304 861 | 2199 862 | 1791 863 | 1809 864 | 1780 865 | 2210 866 | 2417 867 | 1874 868 | 1878 869 | 2116 870 | 1961 871 | 1863 872 | 2579 873 | 2477 874 | 2228 875 | 2332 876 | 2578 877 | 2457 878 | 2024 879 | 1934 880 | 2316 881 | 1841 882 | 1764 883 | 1737 884 | 2322 885 | 2239 886 | 2294 887 | 1729 888 | 2488 889 | 1974 890 | 2473 891 | 2098 892 | 2612 893 | 1834 894 | 2340 895 | 2423 896 | 2175 897 | 2280 898 | 
2617 899 | 2208 900 | 2560 901 | 1741 902 | 2600 903 | 2059 904 | 1747 905 | 2242 906 | 2700 907 | 2232 908 | 2057 909 | 2147 910 | 2682 911 | 1792 912 | 1826 913 | 2120 914 | 1895 915 | 2364 916 | 2163 917 | 1851 918 | 2391 919 | 2414 920 | 2452 921 | 1803 922 | 1989 923 | 2623 924 | 2200 925 | 2528 926 | 2415 927 | 1804 928 | 2146 929 | 2619 930 | 2687 931 | 1762 932 | 2172 933 | 2270 934 | 2678 935 | 2593 936 | 2448 937 | 1882 938 | 2257 939 | 2500 940 | 1899 941 | 2478 942 | 2412 943 | 2107 944 | 1746 945 | 2428 946 | 2115 947 | 1800 948 | 1901 949 | 2397 950 | 2530 951 | 1912 952 | 2108 953 | 2206 954 | 2091 955 | 1740 956 | 2219 957 | 1976 958 | 2099 959 | 2142 960 | 2671 961 | 2668 962 | 2216 963 | 2272 964 | 2229 965 | 2666 966 | 2456 967 | 2534 968 | 2697 969 | 2688 970 | 2062 971 | 2691 972 | 2689 973 | 2154 974 | 2590 975 | 2626 976 | 2390 977 | 1813 978 | 2067 979 | 1952 980 | 2518 981 | 2358 982 | 1789 983 | 2076 984 | 2049 985 | 2119 986 | 2013 987 | 2124 988 | 2556 989 | 2105 990 | 2093 991 | 1885 992 | 2305 993 | 2354 994 | 2135 995 | 2601 996 | 1770 997 | 1995 998 | 2504 999 | 1749 1000 | 2157 1001 | -------------------------------------------------------------------------------- /data/ind.cora.tx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lemmonation/spine/6b60528f7f10a9ae9346a273dc63262e18ae89b9/data/ind.cora.tx -------------------------------------------------------------------------------- /data/ind.cora.ty: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lemmonation/spine/6b60528f7f10a9ae9346a273dc63262e18ae89b9/data/ind.cora.ty -------------------------------------------------------------------------------- /data/ind.cora.x: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lemmonation/spine/6b60528f7f10a9ae9346a273dc63262e18ae89b9/data/ind.cora.x -------------------------------------------------------------------------------- /data/ind.cora.y: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lemmonation/spine/6b60528f7f10a9ae9346a273dc63262e18ae89b9/data/ind.cora.y -------------------------------------------------------------------------------- /graph.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """Graph utilities.""" 5 | 6 | import sys 7 | import math 8 | from io import open 9 | from os import path 10 | from glob import glob 11 | from six.moves import range, zip, zip_longest 12 | from six import iterkeys 13 | from time import time 14 | import random 15 | from random import shuffle 16 | from itertools import product,permutations 17 | from scipy.sparse import issparse 18 | from collections import defaultdict, Iterable 19 | from multiprocessing import cpu_count 20 | import logging 21 | 22 | import numpy as np 23 | from concurrent.futures import ProcessPoolExecutor 24 | 25 | from multiprocessing import Pool 26 | from multiprocessing import cpu_count 27 | 28 | logger = logging.getLogger("structural_embedding") 29 | 30 | class Graph(defaultdict): 31 | """Efficient basic implementation of nx `Graph' “ Undirected graphs with self loops""" 32 | def __init__(self): 33 | super(Graph, self).__init__(list) 34 | 35 | def nodes(self): 36 | return self.keys() 37 | 38 | def remove_node(self, n): 39 | nbrs = self[n] 
40 | for u in nbrs: 41 | self[u].remove(n) 42 | del self[n] 43 | 44 | def node(self): 45 | node = {} 46 | nodes = self.keys() 47 | for _id in nodes: 48 | node[_id] = {} 49 | return node 50 | 51 | def adjacency_iter(self): 52 | return self.iteritems() 53 | 54 | def subgraph(self, nodes={}): 55 | subgraph = Graph() 56 | 57 | for n in nodes: 58 | if n in self: 59 | subgraph[n] = [x for x in self[n] if x in nodes] 60 | 61 | return subgraph 62 | 63 | def make_undirected(self): 64 | 65 | t0 = time() 66 | 67 | for v in self.keys(): 68 | for other in self[v]: 69 | if v != other: 70 | self[other].append(v) 71 | 72 | t1 = time() 73 | #logger.info('make_directed: added missing edges {}s'.format(t1-t0)) 74 | 75 | self.make_consistent() 76 | return self 77 | 78 | def make_consistent(self): 79 | t0 = time() 80 | for k in iterkeys(self): 81 | self[k] = list(sorted(set(self[k]))) 82 | 83 | t1 = time() 84 | #logger.info('make_consistent: made consistent in {}s'.format(t1-t0)) 85 | 86 | #self.remove_self_loops() 87 | 88 | return self 89 | 90 | def remove_self_loops(self): 91 | 92 | removed = 0 93 | t0 = time() 94 | 95 | for x in self: 96 | if x in self[x]: 97 | self[x].remove(x) 98 | removed += 1 99 | 100 | t1 = time() 101 | 102 | #logger.info('remove_self_loops: removed {} loops in {}s'.format(removed, (t1-t0))) 103 | return self 104 | 105 | def check_self_loops(self): 106 | for x in self: 107 | for y in self[x]: 108 | if x == y: 109 | return True 110 | 111 | return False 112 | 113 | def has_edge(self, v1, v2): 114 | if v2 in self[v1] or v1 in self[v2]: 115 | return True 116 | return False 117 | 118 | def degree(self, nodes=None): 119 | if isinstance(nodes, Iterable): 120 | return {v:len(self[v]) for v in nodes} 121 | else: 122 | return len(self[nodes]) 123 | 124 | def order(self): 125 | "Returns the number of nodes in the graph" 126 | return len(self) 127 | 128 | def number_of_edges(self): 129 | "Returns the number of nodes in the graph" 130 | return sum([self.degree(x) for x in self.keys()])/2 131 | 132 | def number_of_nodes(self): 133 | "Returns the number of nodes in the graph" 134 | return self.order() 135 | 136 | def gToDict(self): 137 | d = {} 138 | for k,v in self.iteritems(): 139 | d[k] = v 140 | return d 141 | 142 | def printAdjList(self): 143 | for key,value in self.iteritems(): 144 | print (key,":",value) 145 | 146 | def random_walk(self, path_length, alpha=0, rand=random.Random(), start=None): 147 | """ Returns a truncated random walk. 148 | path_length: Length of the random walk. 149 | alpha: probability of restarts. 150 | start: the start node of the random walk. 
151 | """ 152 | G = self 153 | if start: 154 | path = [start] 155 | else: 156 | # Sampling is uniform w.r.t V, and not w.r.t E 157 | path = [rand.choice(G.keys())] 158 | while len(path) < path_length: 159 | cur = path[-1] 160 | if len(G[cur]) > 0: 161 | if rand.random() >= alpha: 162 | path.append(rand.choice(G[cur])) 163 | else: 164 | path.append(path[0]) 165 | else: 166 | break 167 | return path 168 | 169 | def normal_random_walk(self, path_length, rand = random.Random(), start = None): 170 | """ 171 | Define a normal random walk without restart to generate positive training pairs 172 | """ 173 | G = self 174 | pairs = [] 175 | if start: 176 | path = [start] 177 | else: 178 | # Sampling is uniform w.r.t V, and not w.r.t E 179 | path = [rand.choice(G.keys())] 180 | while len(path) < path_length: 181 | cur = path[-1] 182 | if len(G[cur]) > 0: 183 | next_node = rand.choice(G[cur]) 184 | path.append(next_node) 185 | if path[0] != next_node: 186 | pairs.append((path[0], next_node)) 187 | else: 188 | break 189 | return pairs 190 | 191 | def mask_nodes(G, mask_rate, rand = random.Random()): 192 | num_nodes = len(G.nodes()) 193 | train_num = int(mask_rate * num_nodes) 194 | test_num = num_nodes - train_num 195 | 196 | test_id = random.sample(G.nodes(), test_num) 197 | train_id = [i for i in G.nodes() if not i in test_id] 198 | 199 | train_id = sorted(train_id) 200 | test_id = sorted(test_id) 201 | 202 | G = G.subgraph(train_id) # a subgraph constructed only by training nodes 203 | # use a id-map to map the nodes id into (0, len(G)) 204 | sub_id_map = {} 205 | map_sub_id = {} 206 | i = 0 207 | for node in G.keys(): 208 | if i == len(G.keys()): 209 | break 210 | sub_id_map[node] = i 211 | map_sub_id[i] = node 212 | i += 1 213 | 214 | assert len(sub_id_map.keys()) == len(G.keys()), 'nodes in subgraph are not consisitent with in id map' 215 | 216 | map_G = Graph() 217 | for node in G.keys(): 218 | map_G[sub_id_map[node]] = [sub_id_map[i] for i in G[node] if i in G.keys()] 219 | 220 | return map_G, test_id, train_id, map_sub_id 221 | 222 | def write_normal_randomwalks(G, file_, num_paths = 50, path_length = 5, 223 | rand=random.Random(0)): 224 | nodes = list(G.nodes()) 225 | pairs = [] 226 | with open(file_, 'w') as fp: 227 | for cnt in range(num_paths): 228 | rand.shuffle(nodes) 229 | for node in nodes: 230 | pair = G.normal_random_walk(path_length, rand = rand, start = node) 231 | pairs.extend(pair) 232 | for p in pair: 233 | fp.write(unicode("{}\t{}\n".format(p[0], p[1]))) 234 | return pairs 235 | 236 | def build_deepwalk_corpus(G, num_paths, path_length, alpha=0, 237 | rand=random.Random(0)): 238 | walks = {} 239 | #walks = [] 240 | 241 | nodes = list(G.nodes()) 242 | 243 | for cnt in range(num_paths): 244 | rand.shuffle(nodes) 245 | for node in nodes: 246 | #walks.append(G.random_walk(path_length, rand=rand, alpha=alpha, start=node)) 247 | if node in walks.keys(): 248 | walks[node].append(G.random_walk(path_length, rand=rand, alpha=alpha, start=node)) 249 | else: 250 | walks[node] = [] 251 | walks[node].append(G.random_walk(path_length, rand=rand, alpha=alpha, start=node)) 252 | 253 | return walks 254 | 255 | def grouper(n, iterable, padvalue=None): 256 | "grouper(3, 'abcdefg', 'x') --> ('a','b','c'), ('d','e','f'), ('g','x','x')" 257 | return zip_longest(*[iter(iterable)]*n, fillvalue=padvalue) 258 | 259 | def parse_adjacencylist(f): 260 | adjlist = [] 261 | for l in f: 262 | if l and l[0] != "#": 263 | introw = [int(x) for x in l.strip().split()] 264 | row = [introw[0]] 265 | 
row.extend(set(sorted(introw[1:]))) 266 | adjlist.extend([row]) 267 | 268 | return adjlist 269 | 270 | def parse_adjacencylist_unchecked(f): 271 | adjlist = [] 272 | for l in f: 273 | if l and l[0] != "#": 274 | adjlist.extend([[int(x) for x in l.strip().split()]]) 275 | 276 | return adjlist 277 | 278 | def load_adjacencylist(file_, undirected=False, chunksize=10000, unchecked=True): 279 | 280 | if unchecked: 281 | parse_func = parse_adjacencylist_unchecked 282 | convert_func = from_adjlist_unchecked 283 | else: 284 | parse_func = parse_adjacencylist 285 | convert_func = from_adjlist 286 | 287 | adjlist = [] 288 | 289 | t0 = time() 290 | 291 | with open(file_) as f: 292 | with ProcessPoolExecutor(max_workers=cpu_count()) as executor: 293 | total = 0 294 | for idx, adj_chunk in enumerate(executor.map(parse_func, grouper(int(chunksize), f))): 295 | adjlist.extend(adj_chunk) 296 | total += len(adj_chunk) 297 | 298 | t1 = time() 299 | 300 | logger.info('Parsed {} edges with {} chunks in {}s'.format(total, idx, t1-t0)) 301 | 302 | t0 = time() 303 | G = convert_func(adjlist) 304 | t1 = time() 305 | 306 | logger.info('Converted edges to graph in {}s'.format(t1-t0)) 307 | 308 | if undirected: 309 | t0 = time() 310 | G = G.make_undirected() 311 | t1 = time() 312 | logger.info('Made graph undirected in {}s'.format(t1-t0)) 313 | 314 | return G 315 | 316 | def load_edgelist(file_, undirected=True): 317 | G = Graph() 318 | with open(file_) as f: 319 | for l in f: 320 | if(len(l.strip().split()[:2]) > 1): 321 | x, y = l.strip().split()[:2] 322 | x = int(x) 323 | y = int(y) 324 | G[x].append(y) 325 | if undirected: 326 | G[y].append(x) 327 | else: 328 | x = l.strip().split()[:2] 329 | x = int(x[0]) 330 | G[x] = [] 331 | 332 | G.make_consistent() 333 | return G 334 | 335 | def from_adjlist(adjlist): 336 | G = Graph() 337 | 338 | for row in adjlist: 339 | node = row[0] 340 | neighbors = row[1:] 341 | G[node] = list(sorted(set(neighbors))) 342 | 343 | return G 344 | 345 | 346 | def from_adjlist_unchecked(adjlist): 347 | G = Graph() 348 | 349 | for row in adjlist: 350 | node = row[0] 351 | neighbors = row[1:] 352 | G[node] = neighbors 353 | 354 | return G -------------------------------------------------------------------------------- /inits.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import tensorflow as tf 5 | import numpy as np 6 | 7 | # DISCLAIMER: 8 | # Parts of this code file are derived from 9 | # https://github.com/tkipf/gcn 10 | # which is under an identical MIT license as GraphSAGE 11 | 12 | def uniform(shape, scale=0.05, name=None): 13 | """Uniform init.""" 14 | initial = tf.random_uniform(shape, minval=-scale, maxval=scale, dtype=tf.float32) 15 | return tf.Variable(initial, name=name) 16 | 17 | 18 | def glorot(shape, name=None): 19 | """Glorot & Bengio (AISTATS 2010) init.""" 20 | init_range = np.sqrt(6.0/(shape[0]+shape[1])) 21 | initial = tf.random_uniform(shape, minval=-init_range, maxval=init_range, dtype=tf.float32) 22 | return tf.Variable(initial, name=name) 23 | 24 | 25 | def zeros(shape, name=None): 26 | """All zeros.""" 27 | initial = tf.zeros(shape, dtype=tf.float32) 28 | return tf.Variable(initial, name=name) 29 | 30 | def ones(shape, name=None): 31 | """All ones.""" 32 | initial = tf.ones(shape, dtype=tf.float32) 33 | return tf.Variable(initial, name=name) 34 | -------------------------------------------------------------------------------- /layers.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import tensorflow as tf 8 | 9 | from inits import zeros 10 | 11 | flags = tf.app.flags 12 | FLAGS = flags.FLAGS 13 | 14 | # DISCLAIMER: 15 | # Boilerplate parts of this code file were originally forked from 16 | # https://github.com/tkipf/gcn 17 | # which itself was very inspired by the keras package 18 | 19 | # global unique layer ID dictionary for layer name assignment 20 | _LAYER_UIDS = {} 21 | 22 | def get_layer_uid(layer_name=''): 23 | """Helper function, assigns unique layer IDs.""" 24 | if layer_name not in _LAYER_UIDS: 25 | _LAYER_UIDS[layer_name] = 1 26 | return 1 27 | else: 28 | _LAYER_UIDS[layer_name] += 1 29 | return _LAYER_UIDS[layer_name] 30 | 31 | class Layer(object): 32 | """Base layer class. Defines basic API for all layer objects. 33 | Implementation inspired by keras (http://keras.io). 34 | # Properties 35 | name: String, defines the variable scope of the layer. 36 | logging: Boolean, switches Tensorflow histogram logging on/off 37 | 38 | # Methods 39 | _call(inputs): Defines computation graph of layer 40 | (i.e. takes input, returns output) 41 | __call__(inputs): Wrapper for _call() 42 | _log_vars(): Log all variables 43 | """ 44 | 45 | def __init__(self, **kwargs): 46 | allowed_kwargs = {'name', 'logging'} 47 | for kwarg in kwargs.keys(): 48 | assert kwarg in allowed_kwargs, 'Invalid keyword argument: ' + kwarg 49 | name = kwargs.get('name') 50 | if not name: 51 | layer = self.__class__.__name__.lower() 52 | name = layer + '_' + str(get_layer_uid(layer)) 53 | self.name = name 54 | self.vars = {} 55 | logging = kwargs.get('logging', False) 56 | self.logging = logging 57 | self.sparse_inputs = False 58 | 59 | def _call(self, inputs): 60 | return inputs 61 | 62 | def __call__(self, inputs): 63 | with tf.name_scope(self.name): 64 | if self.logging and not self.sparse_inputs: 65 | tf.summary.histogram(self.name + '/inputs', inputs) 66 | outputs = self._call(inputs) 67 | if self.logging: 68 | tf.summary.histogram(self.name + '/outputs', outputs) 69 | return outputs 70 | 71 | def _log_vars(self): 72 | for var in self.vars: 73 | tf.summary.histogram(self.name + '/vars/' + var, self.vars[var]) 74 | 75 | 76 | class Dense(Layer): 77 | """Dense layer.""" 78 | def __init__(self, input_dim, output_dim, dropout=0., 79 | act=tf.nn.relu, placeholders=None, bias=True, featureless=False, 80 | sparse_inputs=False, **kwargs): 81 | super(Dense, self).__init__(**kwargs) 82 | 83 | self.dropout = dropout 84 | 85 | self.act = act 86 | self.featureless = featureless 87 | self.bias = bias 88 | self.input_dim = input_dim 89 | self.output_dim = output_dim 90 | 91 | # helper variable for sparse dropout 92 | self.sparse_inputs = sparse_inputs 93 | if sparse_inputs: 94 | self.num_features_nonzero = placeholders['num_features_nonzero'] 95 | 96 | with tf.variable_scope(self.name + '_vars'): 97 | self.vars['weights'] = tf.get_variable('weights', shape=(input_dim, output_dim), 98 | dtype=tf.float32, 99 | initializer=tf.contrib.layers.xavier_initializer(), 100 | regularizer=tf.contrib.layers.l2_regularizer(FLAGS.weight_decay)) 101 | if self.bias: 102 | self.vars['bias'] = zeros([output_dim], name='bias') 103 | 104 | if self.logging: 105 | self._log_vars() 106 | 107 | def _call(self, inputs): 108 | x = inputs 109 | 110 | x = tf.nn.dropout(x, 1-self.dropout) 111 | 112 | # transform 113 | 
output = tf.matmul(x, self.vars['weights']) 114 | 115 | # bias 116 | if self.bias: 117 | output += self.vars['bias'] 118 | 119 | return self.act(output) 120 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import os 8 | import time 9 | import tensorflow as tf 10 | from tensorflow.python import debug as tf_debug 11 | import numpy as np 12 | import scipy.io as sio 13 | from scipy import spatial 14 | import networkx as nx 15 | import random 16 | from tqdm import tqdm 17 | from collections import Counter 18 | 19 | import graph 20 | import utils 21 | from models import AggregateModel, PretrainModel 22 | from minibatch import MinibatchIterator 23 | import test 24 | 25 | os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID" 26 | 27 | flags = tf.app.flags 28 | FLAGS = flags.FLAGS 29 | 30 | tf.app.flags.DEFINE_boolean('log_device_placement', False, 31 | """Whether to log device placement.""") 32 | 33 | flags.DEFINE_string('input', '', "the input graph edge list file; must be specified") 34 | flags.DEFINE_string('train_prefix', 'cora', "dataset name.") 35 | flags.DEFINE_boolean('preprocess', False, "set to True on the first run; can be set to False once preprocessing has been done") 36 | flags.DEFINE_integer('dim', 200, "embedding dimension.") 37 | flags.DEFINE_integer('batchsize', 512, "batch size") 38 | flags.DEFINE_integer('epoch', 5, "number of training epochs") 39 | flags.DEFINE_float('learning_rate', 0.001, "learning rate") 40 | flags.DEFINE_float('stru_rate', 0.2, "probability of structural sampling versus neighbor sampling") 41 | flags.DEFINE_integer('walk_times', 10, "number of random walks started at every node.") 42 | flags.DEFINE_integer('walk_length', 40, "random walk length at each node.") 43 | flags.DEFINE_integer('k_RPR', 20, "top-k Rooted PageRank nodes of the current node.") 44 | flags.DEFINE_float('alpha', 0.5, "restart probability when random walking.") 45 | flags.DEFINE_integer('neg_sample_size', 50, "negative sampling size.") 46 | flags.DEFINE_integer('verbose', 20, "how often (in iterations) to print information") 47 | flags.DEFINE_float('dropout', 0., "dropout rate in MLP") 48 | flags.DEFINE_float('weight_decay', 0.001, 'weight for l2 loss on embedding matrix.') 49 | flags.DEFINE_integer('save_per_epoch', 200, 'how often (in epochs) to save the model') 50 | flags.DEFINE_integer('seed', 123, "random seed for random walks.") 51 | flags.DEFINE_integer('gpu', 0, "which gpu to use.") 52 | flags.DEFINE_boolean('PRETRAIN', True, 'whether W_mlp is pretrained by node2vec') 53 | flags.DEFINE_boolean('CORR', False, "use Pearson and Spearman correlation for evaluation") 54 | 55 | seed = FLAGS.seed 56 | np.random.seed(seed) 57 | tf.set_random_seed(seed) 58 | 59 | os.environ["CUDA_VISIBLE_DEVICES"]=str(FLAGS.gpu) 60 | 61 | GPU_MEM_FRACTION = 0.8 62 | 63 | def log_dir(): 64 | log_dir = "./log/" + FLAGS.train_prefix 65 | log_dir += "/{lr:0.3f}_{stru_rate:0.1f}_{rpr_k:d}/".format( 66 | lr = FLAGS.learning_rate, 67 | stru_rate = FLAGS.stru_rate, 68 | rpr_k = FLAGS.k_RPR 69 | ) 70 | if not os.path.exists(log_dir): 71 | os.makedirs(log_dir) 72 | return log_dir 73 | 74 | def read_graph(): 75 | ''' 76 | Reads the input network.
77 | ''' 78 | print (" - Loading graph...") 79 | G = graph.load_edgelist(FLAGS.input,undirected=True) 80 | print (" - Graph loaded.") 81 | return G 82 | 83 | def construct_rpr_matrix(G, INDUCTIVE = False): 84 | ''' 85 | Construct Rooted PageRank matrix 86 | ''' 87 | print ("Number of nodes: {}".format(len(G.nodes()))) 88 | num_walks = len(G.nodes()) * FLAGS.walk_times 89 | num_nodes = len(G.nodes()) 90 | 91 | print("Number of walks: {}".format(num_walks)) 92 | print("Walking...") 93 | walks = graph.build_deepwalk_corpus(G, num_paths=FLAGS.walk_times, path_length=FLAGS.walk_length, 94 | alpha=FLAGS.alpha, rand=random.Random(FLAGS.seed)) 95 | all_counts = {} 96 | for node in walks.keys(): 97 | walks_n = walks[node] 98 | all_counts[node] = Counter() 99 | for walk in walks_n: 100 | all_counts[node].update(walk) 101 | 102 | print("Normal random walks started...") 103 | pairs = graph.write_normal_randomwalks(G, 104 | file_= './var/' + FLAGS.train_prefix + '_normal_walks.txt',rand=random.Random(FLAGS.seed)) 105 | 106 | print("Normal random walks dumped.") 107 | 108 | rpr_matrix = [] 109 | rpr_arg = [] 110 | for node in tqdm(xrange(num_nodes)): 111 | if node not in all_counts.keys(): 112 | raise NotImplementedError 113 | temp = all_counts[node].most_common(FLAGS.k_RPR) 114 | temp_arg = [i[0] for i in temp] 115 | temp_value = [i[1] for i in temp] 116 | if len(temp) < FLAGS.k_RPR: 117 | for _ in xrange(FLAGS.k_RPR - len(temp)): 118 | temp_value.append(0.0) 119 | temp_arg.append(node) 120 | temp_value = np.asarray(temp_value, dtype = 'double') 121 | temp_value = temp_value / sum(temp_value) 122 | rpr_matrix.append(temp_value) 123 | rpr_arg.append(temp_arg) 124 | rpr_matrix = np.asarray(rpr_matrix, dtype = 'double') 125 | rpr_arg = np.asarray(rpr_arg, dtype = 'double') 126 | rpr_file = './var/' + FLAGS.train_prefix + '_rpr.mat' 127 | 128 | sio.savemat(rpr_file, {'rpr_matrix':rpr_matrix}) 129 | return rpr_matrix, pairs, rpr_arg 130 | 131 | def construct_placeholders(): 132 | placeholders = { 133 | 'train_inputs' : tf.placeholder(tf.int32, shape = (None), name = 'train_inputs'), 134 | 'train_labels' : tf.placeholder(tf.int32, shape = (None), name = 'train_labels'), 135 | 'batchsize' : tf.placeholder(tf.int32, name = 'batchsize') 136 | } 137 | return placeholders 138 | 139 | def main(): 140 | G = read_graph() 141 | if FLAGS.preprocess: 142 | print (" - Computing Rooted PageRank matrix...") 143 | rpr_matrix, pairs, rpr_arg = construct_rpr_matrix(G) 144 | utils.dump_to_disk(rpr_arg, './var/' + FLAGS.train_prefix + '_rpr_arg') 145 | print (" - RPR matrix completed.") 146 | degrees, degree_permuted = utils.create_degree(G) 147 | print (" - Dumping degree vectors to disk...") 148 | utils.dump_to_disk(degrees, './var/' + FLAGS.train_prefix + '_degrees') 149 | utils.dump_to_disk(degree_permuted, './var/' + FLAGS.train_prefix + '_degree_permuted') 150 | print (" - Degree vectors dumped.") 151 | else: 152 | print (" - Loading precomputed Rooted PageRank matrix...") 153 | rpr_file = './var/' + FLAGS.train_prefix + '_rpr.mat' 154 | rpr_matrix = sio.loadmat(rpr_file)['rpr_matrix'] 155 | rpr_arg = utils.load_pkl('./var/' + FLAGS.train_prefix + '_rpr_arg') 156 | print (" - RPR matrix loaded.") 157 | print (" - Loading Degree vectors...") 158 | degrees = utils.load_pkl('./var/' + FLAGS.train_prefix + '_degrees') 159 | degree_permuted = utils.load_pkl('./var/' + FLAGS.train_prefix + '_degree_permuted') 160 | print (" - Degree vectors loaded.") 161 | pairs = [] 162 | with open('./var/' + FLAGS.train_prefix + 
'_normal_walks.txt', 'r') as fp: 163 | for line in fp: 164 | n_pair = line.split() 165 | pairs.append((int(n_pair[0]), int(n_pair[1]))) 166 | print (" - Training pairs loaded") 167 | 168 | placeholders = construct_placeholders() 169 | 170 | minibatch = MinibatchIterator(G, placeholders, degrees, rpr_matrix, pairs, 171 | batchsize = FLAGS.batchsize, stru_rate = FLAGS.stru_rate, dataset = FLAGS.train_prefix) 172 | 173 | _, features, _, _ = utils.load_pdata(FLAGS.train_prefix) 174 | # TODO: maybe can be more efficiently written by sparse multipications 175 | features = np.asarray(features.todense()) 176 | 177 | if FLAGS.PRETRAIN: 178 | from gensim.models.keyedvectors import KeyedVectors 179 | n2v_embedding = './baselines/{}_{}.embeddings'.format('node2vec', FLAGS.train_prefix) 180 | n_model = KeyedVectors.load_word2vec_format(n2v_embedding, binary=False) 181 | pretrained = np.asarray([n_model[str(node)] for node in xrange(rpr_matrix.shape[0])]) 182 | model = PretrainModel(placeholders, features, pretrained, len(G.nodes()), 183 | degree_permuted, rpr_matrix, rpr_arg, 184 | dropout = FLAGS.dropout, 185 | nodevec_dim = FLAGS.dim, 186 | lr = FLAGS.learning_rate, 187 | logging = True) 188 | else: 189 | model = AggregateModel(placeholders, features, len(G.nodes()), 190 | degree_permuted, rpr_matrix, rpr_arg, 191 | dropout = FLAGS.dropout, 192 | nodevec_dim = FLAGS.dim, 193 | lr = FLAGS.learning_rate, 194 | logging = True) 195 | 196 | config = tf.ConfigProto(log_device_placement=FLAGS.log_device_placement) 197 | config.gpu_options.allow_growth = True 198 | config.allow_soft_placement = True 199 | 200 | sess = tf.Session(config = config) 201 | saver = tf.train.Saver(max_to_keep = 5) 202 | merged = tf.summary.merge_all() 203 | summary_writer = tf.summary.FileWriter(log_dir(), sess.graph) 204 | 205 | # Init variables 206 | sess.run(tf.global_variables_initializer()) 207 | 208 | # Train model 209 | total_steps = 0 210 | average_time = 0.0 211 | average_test = 0.0 212 | test_steps = 0 213 | epoch_test_acc = [0.0] 214 | 215 | for epoch in xrange(FLAGS.epoch): 216 | minibatch.shuffle() 217 | _iter = 0 218 | print ("Epoch : %02d" % (epoch + 1), "Batchs per epoch : %04d" % (len(pairs) / FLAGS.batchsize)) 219 | 220 | while not minibatch.end(): 221 | feed_dict = minibatch.next_minibatch_feed_dict() 222 | t = time.time() 223 | # training step 224 | outs = sess.run([merged, model.opt_op, 225 | model.loss, model.embeddings], feed_dict = feed_dict) 226 | train_cost = outs[2] 227 | 228 | average_time = (average_time * total_steps + time.time() - t) / (total_steps + 1) 229 | 230 | if _iter % FLAGS.verbose == 0: 231 | if FLAGS.CORR: 232 | all_feed = minibatch.all_feed_dict() 233 | out = sess.run([model.train_inputs_all, 234 | model.train_inputs_f, model.embed, model.loss], feed_dict = all_feed) 235 | str_corr = test.compute_correlation(FLAGS.train_prefix, out[1], rpr_matrix) 236 | print ("Epoch: ", '%02d' % (epoch + 1), 237 | "iter: ", '%03d' % _iter, 238 | "loss: ", "{:.3f}".format(train_cost), 239 | "corr: ", str_corr, 240 | "train time: ", "{:.3f}".format(average_time)) 241 | else: 242 | train_feed, test_feed = minibatch.test_feed_dict() 243 | out_train = sess.run([model.train_inputs_all, 244 | model.train_inputs_f, model.embed], feed_dict = train_feed) 245 | t1 = time.time() 246 | out_test = sess.run([model.train_inputs_all, 247 | model.train_inputs_f, model.embed], feed_dict = test_feed) 248 | average_test = (average_test * test_steps + time.time() - t1) / (test_steps + 1) 249 | test_steps += 1 250 | 251 | acc_f 
= test.feature_test(FLAGS.train_prefix, out_train[1], out_test[1]) 252 | epoch_test_acc.append(acc_f) 253 | print ("Epoch: ", '%02d' % (epoch + 1), 254 | "iter: ", '%03d' % _iter, 255 | "loss: ", "{:.3f}".format(train_cost), 256 | "now acc: ", "{:.3f}".format(epoch_test_acc[-1]), 257 | "best acc: ", "{:.3f}".format(max(epoch_test_acc)), 258 | "train time: ", "{:.3f}".format(average_time), 259 | "test time: ", "{:.3f}".format(average_test)) 260 | 261 | _iter += 1 262 | total_steps += 1 263 | if epoch % FLAGS.save_per_epoch: 264 | saver.save(sess, os.path.join(log_dir(), 'model.ckpt'), epoch) 265 | print ("Optimization finished !") 266 | 267 | if __name__ == '__main__': 268 | main() 269 | -------------------------------------------------------------------------------- /minibatch.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | 4 | import numpy as np 5 | import math 6 | import random 7 | from scipy.stats import pearsonr, spearmanr 8 | from fastdtw import fastdtw 9 | 10 | import graph 11 | import utils 12 | 13 | class MinibatchIterator(object): 14 | """ 15 | Minibatch Iterator to sample random pairs of positive nodes 16 | """ 17 | def __init__(self, G, placeholders, degrees, rpr_matrix, context_pair, 18 | batchsize = 128, stru_rate = 0.5, dataset = 'cora', **kwargs): 19 | self.G = G 20 | self.placeholders = placeholders 21 | self.node_num = len(G.nodes()) 22 | self.nodes = np.random.permutation(G.nodes()) 23 | self.edges = np.random.permutation(context_pair) 24 | self.batch_num = 0 25 | self.batchsize = batchsize 26 | self.degrees = degrees 27 | self.rpr_matrix = rpr_matrix 28 | self.stru_rate =stru_rate 29 | self.dataset = dataset 30 | 31 | def batch_feed_dict(self, batch_edges): 32 | train_inputs = [] 33 | train_labels = [] 34 | for node1, node2 in batch_edges: 35 | train_inputs.append(node1) 36 | if random.random() <= self.stru_rate: 37 | degree_neighbors = self.get_vertices(node1) 38 | rpr_sample_node = self.rpr_sample(node1, degree_neighbors) 39 | train_labels.append(rpr_sample_node) 40 | else: 41 | train_labels.append(node2) 42 | 43 | feed_dict = dict() 44 | feed_dict.update({self.placeholders['batchsize'] : len(batch_edges)}) 45 | feed_dict.update({self.placeholders['train_inputs'] : train_inputs}) 46 | feed_dict.update({self.placeholders['train_labels'] : train_labels}) 47 | 48 | return feed_dict 49 | 50 | def node_feed_dict(self, batch_nodes): 51 | train_inputs = train_labels = batch_nodes 52 | # train_labels is not important, thus equals them 53 | 54 | feed_dict = dict() 55 | feed_dict.update({self.placeholders['batchsize'] : len(batch_nodes)}) 56 | feed_dict.update({self.placeholders['train_inputs'] : train_inputs}) 57 | feed_dict.update({self.placeholders['train_labels'] : train_labels}) 58 | 59 | return feed_dict 60 | 61 | def all_feed_dict(self): 62 | id_range = range(self.node_num) 63 | return self.node_feed_dict(id_range) 64 | 65 | def test_feed_dict(self): 66 | _, _, test_train, test_test = utils.load_pdata(self.dataset) 67 | test_train = test_train[:, 0] 68 | test_test = test_test[:, 0] 69 | t_train_feed = self.node_feed_dict(test_train) 70 | t_test_feed = self.node_feed_dict(test_test) 71 | return t_train_feed, t_test_feed 72 | 73 | def next_minibatch_feed_dict(self): 74 | start = self.batch_num * self.batchsize 75 | self.batch_num += 1 76 | batch_edges = self.edges[start : start + self.batchsize] 77 | return self.batch_feed_dict(batch_edges) 78 | 79 | def 
rpr_sample(self, node, neighbors): 80 | node_rpr_v = self.rpr_matrix[node] 81 | sim_list = [] 82 | for _neighbor in neighbors: 83 | neighbor_rpr_v = self.rpr_matrix[_neighbor] 84 | dits_dtw, _ = fastdtw(node_rpr_v, neighbor_rpr_v, radius = 1, dist = utils.cost) 85 | sim_list.append(np.exp(-1.0 * dits_dtw)) 86 | 87 | norm_weight = [float(i) / sum(sim_list) for i in sim_list] 88 | sampled_neighbor = np.random.choice(neighbors, p = norm_weight) 89 | return sampled_neighbor 90 | 91 | def get_vertices(self, v): 92 | num_seleted = 2 * math.log(self.node_num, 2) 93 | vertices = [] 94 | 95 | degree_v = self.G.degree(v) 96 | 97 | try: 98 | c_v = 0 99 | 100 | for v2 in self.degrees[degree_v]['vertices']: 101 | if(v != v2): 102 | vertices.append(v2) 103 | c_v += 1 104 | if(c_v > num_seleted): 105 | raise StopIteration 106 | 107 | if('before' not in self.degrees[degree_v]): 108 | degree_b = -1 109 | else: 110 | degree_b = self.degrees[degree_v]['before'] 111 | if('after' not in self.degrees[degree_v]): 112 | degree_a = -1 113 | else: 114 | degree_a = self.degrees[degree_v]['after'] 115 | if(degree_b == -1 and degree_a == -1): 116 | raise StopIteration 117 | degree_now = utils.verifyDegrees(degree_v,degree_a,degree_b) 118 | 119 | while True: 120 | for v2 in self.degrees[degree_now]['vertices']: 121 | if(v != v2): 122 | vertices.append(v2) 123 | c_v += 1 124 | if(c_v > num_seleted): 125 | raise StopIteration 126 | 127 | if(degree_now == degree_b): 128 | if('before' not in self.degrees[degree_b]): 129 | degree_b = -1 130 | else: 131 | degree_b = self.degrees[degree_b]['before'] 132 | else: 133 | if('after' not in self.degrees[degree_a]): 134 | degree_a = -1 135 | else: 136 | degree_a = self.degrees[degree_a]['after'] 137 | 138 | if(degree_b == -1 and degree_a == -1): 139 | raise StopIteration 140 | 141 | degree_now = utils.verifyDegrees(degree_v,degree_a,degree_b) 142 | 143 | except StopIteration: 144 | return list(vertices) 145 | 146 | return list(vertices) 147 | 148 | def end(self): 149 | return self.batch_num * self.batchsize > len(self.edges) - self.batchsize + 1 150 | 151 | def shuffle(self): 152 | """ 153 | Re-shuffle the training set. 
154 | And the batch number 155 | """ 156 | self.nodes = np.random.permutation(self.nodes) 157 | self.edges = np.random.permutation(self.edges) 158 | self.batch_num = 0 -------------------------------------------------------------------------------- /models.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import tensorflow as tf 8 | from tensorflow.contrib.tensorboard.plugins import projector 9 | import numpy as np 10 | import math 11 | 12 | import inits 13 | import graph 14 | import utils 15 | from aggregators import WeightedAggregator, MeanAggregator 16 | 17 | flags = tf.app.flags 18 | FLAGS = flags.FLAGS 19 | 20 | class Model(object): 21 | def __init__(self, **kwargs): 22 | allowed_kwargs = {'name', 'logging'} 23 | for kwarg in kwargs.keys(): 24 | assert kwarg in allowed_kwargs, 'Invalid keyword argument: ' + kwarg 25 | name = kwargs.get('name') 26 | if not name: 27 | name = self.__class__.__name__.lower() 28 | self.name = name 29 | 30 | logging = kwargs.get('logging', False) 31 | self.logging = logging 32 | 33 | self.vars = {} 34 | self.placeholders = {} 35 | 36 | self.layers = [] 37 | self.activations = [] 38 | 39 | self.inputs = None 40 | self.outputs = None 41 | 42 | self.loss = 0 43 | self.accuracy = 0 44 | self.optimizer = None 45 | self.opt_op = None 46 | 47 | def _build(self): 48 | raise NotImplementedError 49 | 50 | def build(self): 51 | """ Wrapper for _build() """ 52 | with tf.variable_scope(self.name): 53 | self._build() 54 | 55 | # Build sequential layer model 56 | self.activations.append(self.inputs) 57 | for layer in self.layers: 58 | hidden = layer(self.activations[-1]) 59 | self.activations.append(hidden) 60 | self.outputs = self.activations[-1] 61 | 62 | # Store model variables for easy access 63 | variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.name) 64 | self.vars = {var.name: var for var in variables} 65 | 66 | # Build metrics 67 | self._loss() 68 | self._accuracy() 69 | 70 | self.opt_op = self.optimizer.minimize(self.loss) 71 | 72 | def predict(self): 73 | pass 74 | 75 | def _loss(self): 76 | raise NotImplementedError 77 | 78 | def _accuracy(self): 79 | raise NotImplementedError 80 | 81 | def save(self, sess=None): 82 | if not sess: 83 | raise AttributeError("TensorFlow session not provided.") 84 | saver = tf.train.Saver(self.vars) 85 | save_path = saver.save(sess, "tmp/%s.ckpt" % self.name) 86 | print("Model saved in file: %s" % save_path) 87 | 88 | def load(self, sess=None): 89 | if not sess: 90 | raise AttributeError("TensorFlow session not provided.") 91 | saver = tf.train.Saver(self.vars) 92 | save_path = "tmp/%s.ckpt" % self.name 93 | saver.restore(sess, save_path) 94 | print("Model restored from file: %s" % save_path) 95 | 96 | class GeneralizedModel(Model): 97 | """ 98 | Base class for models that aren't constructed from traditional, sequential layers. 
99 | Subclasses must set self.outputs in _build method 100 | 101 | (Removes the layers idiom from build method of the Model class) 102 | """ 103 | 104 | def __init__(self, **kwargs): 105 | super(GeneralizedModel, self).__init__(**kwargs) 106 | 107 | 108 | def build(self): 109 | """ Wrapper for _build() """ 110 | with tf.variable_scope(self.name): 111 | self._build() 112 | 113 | # Store model variables for easy access 114 | variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.name) 115 | self.vars = {var.name: var for var in variables} 116 | 117 | # Build metrics 118 | self._loss() 119 | self._accuracy() 120 | 121 | self.opt_op = self.optimizer.minimize(self.loss) 122 | 123 | class PretrainModel(GeneralizedModel): 124 | def __init__(self, placeholders, features, pretrained, dict_size, degree_permuted, rpr_matrix, 125 | rpr_arg, dropout = 0., nodevec_dim = 200, lr = 0.001, only_f = False, **kwargs): 126 | """ 127 | W_mlp is pretrained by node2vec 128 | """ 129 | 130 | super(PretrainModel, self).__init__(**kwargs) 131 | 132 | self.placeholders = placeholders 133 | self.degrees = degree_permuted 134 | self.only_f = only_f 135 | self.rpr_arg = tf.Variable(tf.constant(rpr_arg, dtype = tf.int64), trainable = False) 136 | self.rpr_matrix = tf.Variable(tf.constant(rpr_matrix, dtype = tf.float32), trainable = False) 137 | 138 | self.dropout = dropout 139 | self.feature_dim = features.shape[1] 140 | self.features = tf.Variable(tf.constant(features, dtype = tf.float32), trainable = False) 141 | self.train_inputs = placeholders["train_inputs"] 142 | self.train_labels = placeholders["train_labels"] 143 | self.batchsize = placeholders["batchsize"] 144 | self.dim = dict_size 145 | self.nodevec_dim = nodevec_dim 146 | self.lr = lr 147 | 148 | self.embeddings = tf.Variable(tf.constant(pretrained, dtype = tf.float32), 149 | trainable = True, name = "embeddings") 150 | self.nce_weights = tf.Variable(tf.constant(pretrained, dtype = tf.float32), 151 | trainable = True, name = "nce_weights") 152 | 153 | self.aggregator_t = WeightedAggregator(self.feature_dim, self.nodevec_dim, dropout = self.dropout, 154 | name = 'true_agg') 155 | 156 | self.optimizer = tf.train.AdamOptimizer(learning_rate = lr) 157 | 158 | self.build() 159 | 160 | def sample_aggregate(self, input_args, bs, aggregator): 161 | samples_arg = tf.nn.embedding_lookup(self.rpr_arg, input_args) 162 | samples_weights = tf.nn.embedding_lookup(self.rpr_matrix, input_args) 163 | samples_features = tf.nn.embedding_lookup(self.features, samples_arg) 164 | 165 | batch_out = aggregator((samples_features, samples_weights, bs, FLAGS.k_RPR)) 166 | 167 | # out should be bs * d 168 | return batch_out 169 | 170 | def _build(self): 171 | labels = tf.reshape(tf.cast(self.train_labels, dtype = tf.int64), 172 | [self.batchsize, 1]) 173 | self.neg_samples, _, _ = (tf.nn.fixed_unigram_candidate_sampler( 174 | true_classes = labels, 175 | num_true = 1, 176 | num_sampled = FLAGS.neg_sample_size, 177 | unique = True, 178 | range_max = len(self.degrees), 179 | distortion = 0.75, 180 | unigrams = self.degrees.tolist())) 181 | 182 | self.train_inputs_f = self.sample_aggregate(self.train_inputs, self.batchsize, self.aggregator_t) 183 | self.train_labels_f = self.sample_aggregate(self.train_labels, self.batchsize, self.aggregator_t) 184 | self.neg_samples_f = self.sample_aggregate(self.neg_samples, FLAGS.neg_sample_size, self.aggregator_t) 185 | 186 | self.embed = tf.nn.embedding_lookup(self.embeddings, self.train_inputs) 187 | self.true_w = 
tf.nn.embedding_lookup(self.nce_weights, self.train_labels) 188 | self.false_w = tf.nn.embedding_lookup(self.nce_weights, self.neg_samples) 189 | 190 | self.train_inputs_all = tf.add(self.train_inputs_f, self.embed) 191 | self.train_labels_all = tf.add(self.train_labels_f, self.true_w) 192 | self.neg_samples_all = tf.add(self.neg_samples_f, self.false_w) 193 | 194 | def build(self): 195 | self._build() 196 | self._loss() 197 | 198 | self._minimize_2() 199 | 200 | def _minimize(self): 201 | self.opt_op = self.optimizer.minimize(self.loss) 202 | 203 | def _minimize_2(self): 204 | var_list1 = [var for var in tf.trainable_variables() 205 | if var.name == "embeddings:0" or var.name == "nce_weights:0"] 206 | var_list2 = [var for var in tf.trainable_variables() if var not in var_list1] 207 | opt2 = tf.train.AdamOptimizer(learning_rate = self.lr) 208 | opt1 = tf.train.AdamOptimizer(learning_rate = 1e-5) 209 | grads = tf.gradients(self.loss, var_list1 + var_list2) 210 | grads1 = grads[:len(var_list1)] 211 | grads2 = grads[len(var_list1):] 212 | train_op1 = opt1.apply_gradients(zip(grads1, var_list1)) 213 | train_op2 = opt2.apply_gradients(zip(grads2, var_list2)) 214 | self.opt_op = tf.group(train_op1, train_op2) 215 | 216 | def _loss(self): 217 | p1 = tf.reduce_sum(tf.multiply(self.train_inputs_f, self.train_labels_f), 1) 218 | p1 = tf.log(tf.sigmoid(p1) + 0.001) 219 | 220 | p2 = tf.reduce_sum(tf.matmul(self.train_inputs_f, tf.transpose(self.neg_samples_f))) 221 | p2 = tf.log(tf.sigmoid(-p2) + 0.001) 222 | 223 | p3 = tf.reduce_sum(tf.multiply(self.embed, self.true_w), 1) 224 | p3 = tf.log(tf.sigmoid(p3) + 0.001) 225 | 226 | p4 = tf.reduce_sum(tf.matmul(self.embed, tf.transpose(self.false_w))) 227 | p4 = tf.log(tf.sigmoid(-p4) + 0.001) 228 | 229 | p5 = tf.reduce_sum(tf.multiply(self.embed, self.train_labels_f), 1) 230 | p5 = tf.log(tf.sigmoid(p5) + 0.001) 231 | 232 | p6 = tf.reduce_sum(tf.matmul(self.embed, tf.transpose(self.neg_samples_f))) 233 | p6 = tf.log(tf.sigmoid(-p6) + 0.001) 234 | 235 | p7 = tf.reduce_sum(tf.multiply(self.true_w, self.train_inputs_f), 1) 236 | p7 = tf.log(tf.sigmoid(p7) + 0.001) 237 | 238 | p8 = tf.reduce_sum(tf.matmul(self.true_w, tf.transpose(self.neg_samples_f))) 239 | p8 = tf.log(tf.sigmoid(-p8) + 0.001) 240 | 241 | rho1 = 1.5 242 | rho2 = 0.75 243 | rho3 = 1.5 244 | temp_loss = rho1*(p1+p2)+rho2*(p3+p4)+rho3*(p5+p6)+rho3*(p7+p8) 245 | self.loss += -tf.reduce_sum(temp_loss) / tf.cast(self.batchsize, tf.float32) 246 | tf.summary.scalar('loss', self.loss) 247 | 248 | class AggregateModel(GeneralizedModel): 249 | def __init__(self, placeholders, features, dict_size, degree_permuted, rpr_matrix, 250 | rpr_arg, dropout = 0., nodevec_dim = 200, lr = 0.001, only_f = False, **kwargs): 251 | """ 252 | Aggregate feature informations of the neighbors of the current node, 253 | weighted by Rooted PageRank vector of the current node. 
254 | """ 255 | 256 | super(AggregateModel, self).__init__(**kwargs) 257 | 258 | self.placeholders = placeholders 259 | self.degrees = degree_permuted 260 | self.only_f = only_f 261 | self.rpr_arg = tf.Variable(tf.constant(rpr_arg, dtype = tf.int64), trainable = False) 262 | self.rpr_matrix = tf.Variable(tf.constant(rpr_matrix, dtype = tf.float32), trainable = False) 263 | self.dropout = dropout 264 | self.feature_dim = features.shape[1] 265 | self.features = tf.Variable(tf.constant(features, dtype = tf.float32), trainable = False) 266 | self.train_inputs = placeholders["train_inputs"] 267 | self.train_labels = placeholders["train_labels"] 268 | self.batchsize = placeholders["batchsize"] 269 | self.dim = dict_size 270 | self.nodevec_dim = nodevec_dim 271 | 272 | self.embeddings = inits.glorot([dict_size, nodevec_dim], name = "embeddings") 273 | self.nce_weights = inits.glorot([dict_size, nodevec_dim], name = "nce_weights") 274 | 275 | self.aggregator_t = WeightedAggregator(self.feature_dim, self.nodevec_dim, dropout = self.dropout, 276 | name = 'true_agg') 277 | self.optimizer = tf.train.AdamOptimizer(learning_rate = lr) 278 | 279 | self.build() 280 | 281 | def sample_aggregate(self, input_args, bs, aggregator): 282 | samples_arg = tf.nn.embedding_lookup(self.rpr_arg, input_args) 283 | samples_weights = tf.nn.embedding_lookup(self.rpr_matrix, input_args) 284 | samples_features = tf.nn.embedding_lookup(self.features, samples_arg) 285 | 286 | batch_out = aggregator((samples_features, samples_weights, bs, FLAGS.k_RPR)) 287 | 288 | # out should be bs * d 289 | return batch_out 290 | 291 | def _build(self): 292 | labels = tf.reshape(tf.cast(self.train_labels, dtype = tf.int64), 293 | [self.batchsize, 1]) 294 | self.neg_samples, _, _ = (tf.nn.fixed_unigram_candidate_sampler( 295 | true_classes = labels, 296 | num_true = 1, 297 | num_sampled = FLAGS.neg_sample_size, 298 | unique = True, 299 | range_max = len(self.degrees), 300 | distortion = 0.75, 301 | unigrams = self.degrees.tolist())) 302 | 303 | self.train_inputs_f = self.sample_aggregate(self.train_inputs, self.batchsize, self.aggregator_t) 304 | self.train_labels_f = self.sample_aggregate(self.train_labels, self.batchsize, self.aggregator_t) 305 | self.neg_samples_f = self.sample_aggregate(self.neg_samples, FLAGS.neg_sample_size, self.aggregator_t) 306 | 307 | self.embed = tf.nn.embedding_lookup(self.embeddings, self.train_inputs) 308 | self.true_w = tf.nn.embedding_lookup(self.nce_weights, self.train_labels) 309 | self.false_w = tf.nn.embedding_lookup(self.nce_weights, self.neg_samples) 310 | 311 | self.train_inputs_all = tf.add(self.train_inputs_f, self.embed) 312 | self.train_labels_all = tf.add(self.train_labels_f, self.true_w) 313 | self.neg_samples_all = tf.add(self.neg_samples_f, self.false_w) 314 | 315 | def build(self): 316 | self._build() 317 | if self.only_f: 318 | self._f_loss() 319 | else: 320 | self._loss() 321 | self._minimize() 322 | 323 | def _minimize(self): 324 | self.opt_op = self.optimizer.minimize(self.loss) 325 | 326 | def _f_loss(self): 327 | p1 = tf.reduce_sum(tf.multiply(self.train_inputs_f, self.train_labels_f), 1) 328 | p1 = tf.log(tf.sigmoid(p1) + 0.001) 329 | 330 | p2 = tf.reduce_sum(tf.matmul(self.train_inputs_f, tf.transpose(self.neg_samples_f))) 331 | p2 = tf.log(tf.sigmoid(-p2) + 0.001) 332 | 333 | temp_loss = p1 + p2 334 | self.loss=-tf.reduce_sum(temp_loss) / tf.cast(self.batchsize, tf.float32) 335 | tf.summary.scalar('loss', self.loss) 336 | 337 | def _loss(self): 338 | p1 = 
tf.reduce_sum(tf.multiply(self.train_inputs_f, self.train_labels_f), 1) 339 | p1 = tf.log(tf.sigmoid(p1) + 0.001) 340 | 341 | p2 = tf.reduce_sum(tf.matmul(self.train_inputs_f, tf.transpose(self.neg_samples_f))) 342 | p2 = tf.log(tf.sigmoid(-p2) + 0.001) 343 | 344 | p3 = tf.reduce_sum(tf.multiply(self.embed, self.true_w), 1) 345 | p3 = tf.log(tf.sigmoid(p3) + 0.001) 346 | 347 | p4 = tf.reduce_sum(tf.matmul(self.embed, tf.transpose(self.false_w))) 348 | p4 = tf.log(tf.sigmoid(-p4) + 0.001) 349 | 350 | p5 = tf.reduce_sum(tf.multiply(self.embed, self.train_labels_f), 1) 351 | p5 = tf.log(tf.sigmoid(p5) + 0.001) 352 | 353 | p6 = tf.reduce_sum(tf.matmul(self.embed, tf.transpose(self.neg_samples_f))) 354 | p6 = tf.log(tf.sigmoid(-p6) + 0.001) 355 | 356 | p7 = tf.reduce_sum(tf.multiply(self.true_w, self.train_inputs_f), 1) 357 | p7 = tf.log(tf.sigmoid(p7) + 0.001) 358 | 359 | p8 = tf.reduce_sum(tf.matmul(self.true_w, tf.transpose(self.neg_samples_f))) 360 | p8 = tf.log(tf.sigmoid(-p8) + 0.001) 361 | 362 | rho1 = 1.5 363 | rho2 = 0.75 364 | rho3 = 1.5 365 | temp_loss = rho1*(p1+p2)+rho2*(p3+p4)+rho3*(p5+p6)+rho3*(p7+p8) 366 | self.loss += -tf.reduce_sum(temp_loss) / tf.cast(self.batchsize, tf.float32) 367 | tf.summary.scalar('loss', self.loss) -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import numpy as np 5 | from scipy import sparse 6 | from io import open 7 | import json 8 | import random 9 | import tensorflow as tf 10 | import sys 11 | from utils import load_pdata, cos_sim, cost 12 | from sklearn.linear_model import LogisticRegression 13 | from sklearn.metrics import accuracy_score, f1_score 14 | from sklearn.multiclass import OneVsRestClassifier 15 | from itertools import izip 16 | from sklearn.utils import shuffle as skshuffle 17 | from collections import defaultdict 18 | from scipy.stats import pearsonr, spearmanr 19 | from fastdtw import fastdtw 20 | 21 | 22 | def compute_correlation(dataset, embeddings, rpr_matrix): 23 | graph, _, _, _ = load_pdata(dataset) 24 | eu_dists = [] 25 | stru_dists = [] 26 | for node in graph: 27 | for nei in graph[node]: 28 | if node == nei: 29 | continue 30 | dist_eu = np.linalg.norm(embeddings[node] - embeddings[nei]) 31 | dist_stru, _ = fastdtw(embeddings[node], embeddings[nei], radius = 1, dist = cost) 32 | eu_dists.append(dist_eu) 33 | stru_dists.append(dist_stru) 34 | pear_rho, pear_p = pearsonr(stru_dists, eu_dists) 35 | spea_rho, spea_p = spearmanr(stru_dists, eu_dists) 36 | return "P ratio and p: {:.2f} + {:.2f}, S ratio and p: {:.2f} + {:.2f}".format(pear_rho, pear_p, spea_rho, spea_p) 37 | 38 | class TopKRanker(OneVsRestClassifier): 39 | def predict(self, X, top_k_list): 40 | assert X.shape[0] == len(top_k_list) 41 | probs = np.asarray(super(TopKRanker, self).predict_proba(X)) 42 | all_labels = [] 43 | for i, k in enumerate(top_k_list): 44 | probs_ = probs[i, :] 45 | labels = self.classes_[probs_.argsort()[-k:]].tolist() 46 | all_labels.append(labels) 47 | return all_labels 48 | 49 | def feature_test(dataset, train_embeddings, test_embeddings): 50 | if dataset == 'cora': 51 | classes = 7 52 | elif dataset == 'citeseer': 53 | classes = 6 54 | elif dataset == 'pubmed': 55 | classes = 3 56 | else: 57 | raise Exception('Error : wrong dataset name') 58 | 59 | _, _, train_data, test_data = load_pdata(dataset) 60 | 61 | test_l = test_data[:, 1] 62 | test_label = [] 63 | 
for i in xrange(test_data.shape[0]): 64 | temp = [0] * classes 65 | temp[test_data[i][1] - 1] += 1 66 | test_label.append(temp) 67 | test_label = np.array(test_label) #1000 * 6 68 | 69 | train_l = train_data[:, 1] 70 | train_label = [] 71 | for i in xrange(train_data.shape[0]): 72 | temp = [0] * classes 73 | temp[train_data[i][1] - 1] += 1 74 | train_label.append(temp) 75 | train_label = np.array(train_label) #120 * 6 76 | 77 | test_in = np.asarray(test_embeddings) 78 | train_in = np.asarray(train_embeddings) 79 | 80 | y_train_ = sparse.coo_matrix(train_label) 81 | y_train = [[] for x in xrange(y_train_.shape[0])] 82 | cy = y_train_.tocoo() 83 | for i, j in izip(cy.row, cy.col): 84 | y_train[i].append(j) 85 | 86 | assert sum(len(l) for l in y_train) == y_train_.nnz 87 | 88 | y_test_ = sparse.coo_matrix(test_label) 89 | 90 | y_test = [[] for x in xrange(y_test_.shape[0])] 91 | cy = y_test_.tocoo() 92 | for i, j in izip(cy.row, cy.col): 93 | y_test[i].append(j) 94 | y_train = np.array(y_train) 95 | #y_test = np.array(y_test) 96 | 97 | clf = TopKRanker(LogisticRegression()) 98 | clf.fit(train_in, y_train) 99 | 100 | top_k_list = [len(l) for l in y_test] 101 | preds = clf.predict(test_in, top_k_list) 102 | acc = accuracy_score(y_test, preds) 103 | return acc 104 | 105 | if __name__ == '__main__': 106 | prefix = sys.argv[1] 107 | mask_rate = float(sys.argv[2]) 108 | G, feats, train_data, test_data = load_pdata(prefix) 109 | features = np.asarray(feats.todense()) 110 | 111 | test_id = test_data[:, 0] 112 | train_id = train_data[:, 0] 113 | 114 | feat_train = [] 115 | feat_test = [] 116 | for id_ in train_id: 117 | feat_train.append(features[id_]) 118 | for id_ in test_id: 119 | feat_test.append(features[id_]) 120 | 121 | acc_f = feature_test(prefix, feat_train, feat_test) 122 | print ("feats: {:.3f}".format(acc_f)) -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import os 8 | import io 9 | import time 10 | import numpy as np 11 | import scipy.io as sio 12 | from scipy import sparse as sp 13 | from scipy import spatial 14 | import cPickle as pkl 15 | import networkx as nx 16 | import random 17 | import math 18 | 19 | from collections import Counter 20 | 21 | import graph 22 | 23 | def cost(a,b): 24 | ep = 0.001 25 | m = max(a,b) + ep 26 | mi = min(a,b) + ep 27 | return ((m/mi) - 1) 28 | 29 | def cos_sim(node_vec, neb_vec): 30 | cos_ = 1 - spatial.distance.cosine(node_vec, neb_vec) 31 | return cos_ 32 | 33 | def create_degree(G): 34 | print (" - Creating degree vectors...") 35 | degrees = {} 36 | degrees_sorted = set() 37 | degree_permuted = np.zeros((len(G.keys()), )) 38 | for v in G.keys(): 39 | degree = len(G[v]) 40 | degrees_sorted.add(degree) 41 | degree_permuted[v] = degree 42 | if(degree not in degrees): 43 | degrees[degree] = {} 44 | degrees[degree]['vertices'] = [] 45 | degrees[degree]['vertices'].append(v) 46 | degrees_sorted = np.array(list(degrees_sorted),dtype='int') 47 | #degree_permuted = degrees_sorted 48 | degrees_sorted = np.sort(degrees_sorted) 49 | l = len(degrees_sorted) 50 | for index, degree in enumerate(degrees_sorted): 51 | if(index > 0): 52 | degrees[degree]['before'] = degrees_sorted[index - 1] 53 | if(index < (l - 1)): 54 | degrees[degree]['after'] = degrees_sorted[index + 1] 55 | print ("- Degree vectors 
created.") 56 | return degrees, degree_permuted 57 | 58 | def verifyDegrees(degree_v_root,degree_a,degree_b): 59 | 60 | if(degree_b == -1): 61 | degree_now = degree_a 62 | elif(degree_a == -1): 63 | degree_now = degree_b 64 | elif(abs(degree_b - degree_v_root) < abs(degree_a - degree_v_root)): 65 | degree_now = degree_b 66 | else: 67 | degree_now = degree_a 68 | 69 | return degree_now 70 | 71 | def dump_to_disk(f, file_name): 72 | with open(file_name + '.pkl', 'wb') as handle: 73 | pkl.dump(f, handle, protocol = pkl.HIGHEST_PROTOCOL) 74 | 75 | def load_pkl(file_name): 76 | with open(file_name + '.pkl', 'rb') as handle: 77 | val = pkl.load(handle) 78 | return val 79 | 80 | def sparse_to_tuple(sparse_mx): 81 | if not sp.isspmatrix_coo(sparse_mx): 82 | sparse_mx = sparse_mx.tocoo() 83 | coords = np.vstack((sparse_mx.row, sparse_mx.col)).transpose() 84 | values = sparse_mx.data 85 | shape = sparse_mx.shape 86 | return coords, values, shape 87 | 88 | def parse_index_file(filename): 89 | index = [] 90 | for line in open(filename): 91 | index.append(int(line.strip())) 92 | return index 93 | 94 | def sample_mask(idx, l): 95 | """Create mask.""" 96 | mask = np.zeros(l) 97 | mask[idx] = 1 98 | return np.array(mask, dtype=np.bool) 99 | 100 | def load_pdata(dataset_str): 101 | if dataset_str != 'cora' and dataset_str != 'citeseer' and dataset_str != 'pubmed': 102 | print ('Use datasets other than Planetoid, change load functions') 103 | pass 104 | names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph'] 105 | objects = [] 106 | for i in xrange(len(names)): 107 | objects.append(pkl.load(open("./data/ind.{}.{}".format(dataset_str, names[i])))) 108 | x, y, tx, ty, allx, ally, graph = tuple(objects) 109 | test_idx_reorder = parse_index_file("./data/ind.{}.test.index".format(dataset_str)) 110 | test_idx_range = np.sort(test_idx_reorder) 111 | if dataset_str == 'citeseer': 112 | test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder)+1) 113 | tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1])) 114 | tx_extended[test_idx_range-min(test_idx_range), :] = tx 115 | tx = tx_extended 116 | ty_extended = np.zeros((len(test_idx_range_full), y.shape[1])) 117 | ty_extended[test_idx_range-min(test_idx_range), :] = ty 118 | ty = ty_extended 119 | features = sp.vstack((allx, tx)).tolil() 120 | features[test_idx_reorder, :] = features[test_idx_range, :] 121 | 122 | labels = np.vstack((ally, ty)) 123 | labels[test_idx_reorder, :] = labels[test_idx_range, :] 124 | 125 | idx_test = test_idx_range.tolist() 126 | idx_train = range(len(y)) 127 | 128 | train_mask = sample_mask(idx_train, labels.shape[0]) 129 | test_mask = sample_mask(idx_test, labels.shape[0]) 130 | 131 | y_train = np.zeros(labels.shape) 132 | y_test = np.zeros(labels.shape) 133 | y_train[train_mask, :] = labels[train_mask, :] 134 | y_test[test_mask, :] = labels[test_mask, :] 135 | 136 | train_out = [] 137 | for i in idx_train: 138 | ll = y_train[i].tolist() 139 | ll = ll.index(1) + 1 140 | train_out.append([i, ll]) 141 | train_out = np.array(train_out) 142 | 143 | test_out = [] 144 | for i in idx_test: 145 | ll = y_test[i].tolist() 146 | ll = ll.index(1) + 1 147 | test_out.append([i, ll]) 148 | test_out = np.array(test_out) 149 | adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph)) 150 | adj = adj - sp.dia_matrix((adj.diagonal()[np.newaxis, :], [0]), shape=adj.shape) 151 | adj.eliminate_zeros() 152 | # Check that diag is zero: 153 | assert np.diag(adj.todense()).sum() == 0 154 | 155 | adj_triu = sp.triu(adj) 156 | adj_tuple = 
sparse_to_tuple(adj_triu) 157 | edges = adj_tuple[0] 158 | edges_all = sparse_to_tuple(adj)[0] 159 | 160 | num_mask = int(np.floor(edges.shape[0] / 10.)) 161 | 162 | return graph, features, train_out, test_out 163 | --------------------------------------------------------------------------------
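
Two operations recur throughout ``models.py``: the Rooted-PageRank-weighted feature aggregation performed by ``WeightedAggregator`` via ``sample_aggregate``, and the ``log(sigmoid(...))`` pairwise terms assembled in ``_loss`` / ``_f_loss``. The NumPy sketch below restates those two steps for readability only; the array names, shapes, and toy sizes (``features``, ``rpr_arg``, ``rpr_matrix``, ``W_mlp``, ``N``, ``k``, ...) are illustrative assumptions, not the repository's exact tensors.

```python
import numpy as np

def rpr_aggregate(features, rpr_arg, rpr_matrix, node_ids, W_mlp, bias=None):
    """Aggregate the features of each node's top-k RPR neighbours
    (a NumPy analogue of sample_aggregate + WeightedAggregator).

    features   : (N, F) node feature matrix
    rpr_arg    : (N, k) indices of each node's k largest Rooted PageRank entries
    rpr_matrix : (N, k) the corresponding RPR weights
    node_ids   : (B,)   batch of node indices
    W_mlp      : (F, d) aggregator weight matrix
    """
    neigh_idx = rpr_arg[node_ids]                  # (B, k)
    weights = rpr_matrix[node_ids][..., None]      # (B, k, 1)
    neigh_feats = features[neigh_idx]              # (B, k, F)
    summed = (neigh_feats * weights).sum(axis=1)   # weighted sum over the k neighbours
    out = summed @ W_mlp                           # (B, d)
    if bias is not None:
        out = out + bias
    return np.maximum(out, 0.0)                    # ReLU activation

def pairwise_loss(u, v, negs, eps=1e-3):
    """The p1/p2 pattern from _f_loss: attract (u, v) pairs, repel negative samples."""
    sigmoid = lambda x: 1.0 / (1.0 + np.exp(-x))
    p1 = np.log(sigmoid((u * v).sum(axis=1)) + eps)   # one term per positive pair
    p2 = np.log(sigmoid(-(u @ negs.T).sum()) + eps)   # one term over all negative samples
    return -(p1 + p2).sum() / u.shape[0]

# Toy run with random data, only to show the shapes involved.
rng = np.random.RandomState(0)
N, F, d, k = 50, 16, 8, 5
features = rng.rand(N, F)
rpr_arg = rng.randint(0, N, size=(N, k))
rpr_matrix = rng.rand(N, k)
W_mlp = rng.randn(F, d) * 0.1

u = rpr_aggregate(features, rpr_arg, rpr_matrix, np.array([0, 1, 2, 3]), W_mlp)
v = rpr_aggregate(features, rpr_arg, rpr_matrix, np.array([4, 5, 6, 7]), W_mlp)
negs = rpr_aggregate(features, rpr_arg, rpr_matrix, np.array([8, 9]), W_mlp)
print(pairwise_loss(u, v, negs))
```

The full ``_loss`` combines four such pairs (feature-based, embedding-based, and two mixed terms) weighted by ``rho1``, ``rho2``, and ``rho3``.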
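
``PretrainModel._minimize_2`` splits the trainable variables into two groups: the node2vec-pretrained ``embeddings`` / ``nce_weights`` are updated with a very small learning rate (1e-5), while all remaining parameters use the configured rate, so the pretrained vectors are fine-tuned rather than overwritten. The repository does this with two Adam optimizers joined by ``tf.group``; the sketch below only illustrates the per-group learning rates with plain gradient descent on a made-up least-squares objective (``E``, ``W``, ``x``, ``y`` are all assumed toy variables).

```python
import numpy as np

rng = np.random.RandomState(1)
x = rng.randn(200, 4)
y = x @ np.array([[1.0], [-2.0], [0.5], [3.0]])

E = rng.randn(4, 4)                  # stands in for the pretrained embeddings / nce_weights
W = rng.randn(4, 1) * 0.1            # stands in for the freshly initialised aggregator weights
lr_pretrained, lr_new = 1e-5, 1e-3   # the two rates used by opt1 / opt2 in _minimize_2

for step in range(500):
    h = x @ E
    err = h @ W - y                      # gradient of 0.5 * mean squared error
    grad_W = h.T @ err / len(x)
    grad_E = x.T @ (err @ W.T) / len(x)
    W -= lr_new * grad_W                 # fast group: newly initialised parameters
    E -= lr_pretrained * grad_E          # slow group: pretrained parameters
```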
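
The evaluation in ``test.py`` scores a one-vs-rest logistic regression by letting each test node predict exactly as many labels as it truly has (``TopKRanker.predict`` with ``top_k_list``). The helper below is a standalone restatement of that selection step; the probability matrix and class array are made-up inputs.

```python
import numpy as np

def top_k_predict(probs, top_k_list, classes):
    """For each row, return the k classes with the highest predicted probability,
    where k is that sample's number of true labels (as in TopKRanker.predict)."""
    all_labels = []
    for row, k in zip(probs, top_k_list):
        all_labels.append(sorted(classes[np.argsort(row)[-k:]].tolist()))
    return all_labels

probs = np.array([[0.1, 0.7, 0.2],
                  [0.5, 0.3, 0.2]])
classes = np.array([0, 1, 2])
print(top_k_predict(probs, [1, 2], classes))   # [[1], [0, 1]]
```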