├── LICENSE ├── README.md ├── aggregators.py ├── baselines └── node2vec_cora.embeddings ├── data ├── cora_id_edge.txt ├── ind.cora.allx ├── ind.cora.ally ├── ind.cora.graph ├── ind.cora.test.index ├── ind.cora.tx ├── ind.cora.ty ├── ind.cora.x └── ind.cora.y ├── graph.py ├── inits.py ├── layers.py ├── main.py ├── minibatch.py ├── models.py ├── test.py └── utils.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Junliang Guo 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SPINE 2 | This repository provides a reference implementation of the paper "SPINE: Structural Identity Preserved Inductive Network Embedding". 3 | SPINE is an inductive embedding method that simultaneously preserves the local proximity and the structural identity of nodes. Details can be found [here](http://arxiv.org/abs/1802.03984). 4 | 5 | ### Requirements 6 | 7 | * tensorflow > 1.2.1 8 | * networkx == 1.11 9 | * gensim > 2.3.0 10 | * fastdtw 11 | 12 | ### Usage 13 | 14 | To run SPINE on the Cora dataset, execute: 15 | ``` 16 | python main.py --input data/cora_id_edge.txt --train_prefix cora --preprocess True 17 | ``` 18 | 19 | ### Options 20 | 21 | To evaluate the performance by Pearson and Spearman correlation instead of classification accuracy, set ``--CORR True``. 22 | 23 | To run SPINE and SPINE-p, deactivate and activate ``--PRETRAIN``, respectively (a combined example is given at the end of this README). 24 | 25 | For more options, please check ``main.py``. 26 | 27 | ### Acknowledgements 28 | 29 | We refer to [GraphSAGE](https://github.com/williamleif/GraphSAGE) and [GCN](https://github.com/tkipf/gcn) while constructing the code framework and preprocessing the datasets. Many thanks to the authors for making their code available. 30 | 31 | ### Miscellaneous 32 | 33 | Please cite our paper if you find SPINE useful in your research. 34 | ``` 35 | @inproceedings{guo2019spine, 36 | title={SPINE: Structural Identity Preserved Inductive Network Embedding}, 37 | author={Guo, Junliang and Xu, Linli and Liu, Jingchang}, 38 | booktitle={Twenty-Eighth International Joint Conference on Artificial Intelligence}, 39 | year={2019} 40 | } 41 | ``` 42 | 43 | This is only a reference implementation of SPINE; feel free to ask any questions by opening an issue or emailing me at .
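For example, assuming the preprocessing step from Usage has already been run once (so ``--preprocess`` can be left as ``False``), the options documented above can be combined to train SPINE-p with the node2vec embeddings shipped in ``baselines/`` and to evaluate by correlation instead of classification accuracy; this is only an illustrative invocation, all flags are defined in ``main.py``:
```
python main.py --input data/cora_id_edge.txt --train_prefix cora --preprocess False --PRETRAIN True --CORR True
```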
44 | -------------------------------------------------------------------------------- /aggregators.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import tensorflow as tf 5 | 6 | from layers import Layer, Dense 7 | from inits import glorot, zeros, uniform 8 | 9 | class WeightedAggregator(Layer): 10 | """ 11 | An multi-layer perceptron based feature aggregator 12 | """ 13 | 14 | def __init__(self, input_dim, output_dim, dropout = 0., bias = False, 15 | hidden_dim = 512, act = tf.nn.relu, name = None, **kwargs): 16 | super(WeightedAggregator, self).__init__(**kwargs) 17 | 18 | self.dropout = dropout 19 | self.bias = bias 20 | self.act = act 21 | self.hidden_dim = hidden_dim 22 | 23 | if name is not None: 24 | name = '/' + name 25 | else: 26 | name = '' 27 | 28 | with tf.variable_scope(self.name + name + '_vars'): 29 | self.vars['mlp_weights'] = glorot([input_dim, output_dim], name = 'mlp_weights') 30 | tf.summary.histogram("mlp_weights", self.vars['mlp_weights']) 31 | if self.bias: 32 | self.vars['bias'] = zeros([output_dim], name = 'bias') 33 | tf.summary.histogram("bias", self.vars['bias']) 34 | if self.logging: 35 | self._log_vars() 36 | 37 | self.input_dim = input_dim 38 | self.output_dim = output_dim 39 | 40 | def _call(self, inputs): 41 | # vec: bs * k * f, weights: bs * k 42 | node_vecs, node_weights, bs, k = inputs 43 | 44 | # weighted summation 45 | node_weights = tf.reshape(node_weights, [bs, k, 1]) 46 | node_vecs = tf.reduce_sum(tf.multiply(node_vecs, tf.cast(node_weights, dtype = tf.float32)), 1) 47 | out_k = tf.matmul(node_vecs, self.vars['mlp_weights']) 48 | if self.bias: 49 | out_k += self.vars['bias'] 50 | 51 | # out_k: bs * k * d 52 | out_k = self.act(out_k) 53 | return out_k 54 | 55 | class MeanAggregator(Layer): 56 | """ 57 | An multi-layer perceptron based feature aggregator 58 | """ 59 | 60 | def __init__(self, input_dim, output_dim, dropout = 0., bias = False, 61 | hidden_dim = 512, act = tf.nn.relu, name = None, **kwargs): 62 | super(MeanAggregator, self).__init__(**kwargs) 63 | 64 | self.dropout = dropout 65 | self.bias = bias 66 | self.act = act 67 | self.hidden_dim = hidden_dim 68 | 69 | if name is not None: 70 | name = '/' + name 71 | else: 72 | name = '' 73 | 74 | self.mlp_layers = [] 75 | self.mlp_layers.append(Dense(input_dim=input_dim, 76 | output_dim=hidden_dim, 77 | act=tf.nn.relu, 78 | dropout=dropout, 79 | sparse_inputs=False, 80 | bias = bias, 81 | logging=self.logging)) 82 | 83 | with tf.variable_scope(self.name + name + '_vars'): 84 | self.vars['mlp_weights'] = glorot([input_dim, output_dim], name = 'mlp_weights') 85 | tf.summary.histogram("mlp_weights", self.vars['mlp_weights']) 86 | if self.bias: 87 | self.vars['bias'] = zeros([output_dim], name = 'bias') 88 | tf.summary.histogram("bias", self.vars['bias']) 89 | if self.logging: 90 | self._log_vars() 91 | 92 | self.input_dim = input_dim 93 | self.output_dim = output_dim 94 | 95 | def _call(self, inputs): 96 | node_vecs, node_weights, bs, k = inputs 97 | 98 | node_means = tf.reduce_mean(node_vecs, 1) 99 | out_k = tf.matmul(node_means, self.vars['mlp_weights']) 100 | if self.bias: 101 | out_k += self.vars['bias'] 102 | return self.act(out_k) 103 | -------------------------------------------------------------------------------- /data/ind.cora.allx: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/lemmonation/spine/6b60528f7f10a9ae9346a273dc63262e18ae89b9/data/ind.cora.allx -------------------------------------------------------------------------------- /data/ind.cora.ally: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lemmonation/spine/6b60528f7f10a9ae9346a273dc63262e18ae89b9/data/ind.cora.ally -------------------------------------------------------------------------------- /data/ind.cora.graph: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lemmonation/spine/6b60528f7f10a9ae9346a273dc63262e18ae89b9/data/ind.cora.graph -------------------------------------------------------------------------------- /data/ind.cora.test.index: -------------------------------------------------------------------------------- 1 | 2692 2 | 2532 3 | 2050 4 | 1715 5 | 2362 6 | 2609 7 | 2622 8 | 1975 9 | 2081 10 | 1767 11 | 2263 12 | 1725 13 | 2588 14 | 2259 15 | 2357 16 | 1998 17 | 2574 18 | 2179 19 | 2291 20 | 2382 21 | 1812 22 | 1751 23 | 2422 24 | 1937 25 | 2631 26 | 2510 27 | 2378 28 | 2589 29 | 2345 30 | 1943 31 | 1850 32 | 2298 33 | 1825 34 | 2035 35 | 2507 36 | 2313 37 | 1906 38 | 1797 39 | 2023 40 | 2159 41 | 2495 42 | 1886 43 | 2122 44 | 2369 45 | 2461 46 | 1925 47 | 2565 48 | 1858 49 | 2234 50 | 2000 51 | 1846 52 | 2318 53 | 1723 54 | 2559 55 | 2258 56 | 1763 57 | 1991 58 | 1922 59 | 2003 60 | 2662 61 | 2250 62 | 2064 63 | 2529 64 | 1888 65 | 2499 66 | 2454 67 | 2320 68 | 2287 69 | 2203 70 | 2018 71 | 2002 72 | 2632 73 | 2554 74 | 2314 75 | 2537 76 | 1760 77 | 2088 78 | 2086 79 | 2218 80 | 2605 81 | 1953 82 | 2403 83 | 1920 84 | 2015 85 | 2335 86 | 2535 87 | 1837 88 | 2009 89 | 1905 90 | 2636 91 | 1942 92 | 2193 93 | 2576 94 | 2373 95 | 1873 96 | 2463 97 | 2509 98 | 1954 99 | 2656 100 | 2455 101 | 2494 102 | 2295 103 | 2114 104 | 2561 105 | 2176 106 | 2275 107 | 2635 108 | 2442 109 | 2704 110 | 2127 111 | 2085 112 | 2214 113 | 2487 114 | 1739 115 | 2543 116 | 1783 117 | 2485 118 | 2262 119 | 2472 120 | 2326 121 | 1738 122 | 2170 123 | 2100 124 | 2384 125 | 2152 126 | 2647 127 | 2693 128 | 2376 129 | 1775 130 | 1726 131 | 2476 132 | 2195 133 | 1773 134 | 1793 135 | 2194 136 | 2581 137 | 1854 138 | 2524 139 | 1945 140 | 1781 141 | 1987 142 | 2599 143 | 1744 144 | 2225 145 | 2300 146 | 1928 147 | 2042 148 | 2202 149 | 1958 150 | 1816 151 | 1916 152 | 2679 153 | 2190 154 | 1733 155 | 2034 156 | 2643 157 | 2177 158 | 1883 159 | 1917 160 | 1996 161 | 2491 162 | 2268 163 | 2231 164 | 2471 165 | 1919 166 | 1909 167 | 2012 168 | 2522 169 | 1865 170 | 2466 171 | 2469 172 | 2087 173 | 2584 174 | 2563 175 | 1924 176 | 2143 177 | 1736 178 | 1966 179 | 2533 180 | 2490 181 | 2630 182 | 1973 183 | 2568 184 | 1978 185 | 2664 186 | 2633 187 | 2312 188 | 2178 189 | 1754 190 | 2307 191 | 2480 192 | 1960 193 | 1742 194 | 1962 195 | 2160 196 | 2070 197 | 2553 198 | 2433 199 | 1768 200 | 2659 201 | 2379 202 | 2271 203 | 1776 204 | 2153 205 | 1877 206 | 2027 207 | 2028 208 | 2155 209 | 2196 210 | 2483 211 | 2026 212 | 2158 213 | 2407 214 | 1821 215 | 2131 216 | 2676 217 | 2277 218 | 2489 219 | 2424 220 | 1963 221 | 1808 222 | 1859 223 | 2597 224 | 2548 225 | 2368 226 | 1817 227 | 2405 228 | 2413 229 | 2603 230 | 2350 231 | 2118 232 | 2329 233 | 1969 234 | 2577 235 | 2475 236 | 2467 237 | 2425 238 | 1769 239 | 2092 240 | 2044 241 | 2586 242 | 2608 243 | 1983 244 | 2109 245 | 2649 246 | 1964 247 | 2144 248 | 1902 249 | 2411 250 | 2508 251 | 2360 252 | 
1721 253 | 2005 254 | 2014 255 | 2308 256 | 2646 257 | 1949 258 | 1830 259 | 2212 260 | 2596 261 | 1832 262 | 1735 263 | 1866 264 | 2695 265 | 1941 266 | 2546 267 | 2498 268 | 2686 269 | 2665 270 | 1784 271 | 2613 272 | 1970 273 | 2021 274 | 2211 275 | 2516 276 | 2185 277 | 2479 278 | 2699 279 | 2150 280 | 1990 281 | 2063 282 | 2075 283 | 1979 284 | 2094 285 | 1787 286 | 2571 287 | 2690 288 | 1926 289 | 2341 290 | 2566 291 | 1957 292 | 1709 293 | 1955 294 | 2570 295 | 2387 296 | 1811 297 | 2025 298 | 2447 299 | 2696 300 | 2052 301 | 2366 302 | 1857 303 | 2273 304 | 2245 305 | 2672 306 | 2133 307 | 2421 308 | 1929 309 | 2125 310 | 2319 311 | 2641 312 | 2167 313 | 2418 314 | 1765 315 | 1761 316 | 1828 317 | 2188 318 | 1972 319 | 1997 320 | 2419 321 | 2289 322 | 2296 323 | 2587 324 | 2051 325 | 2440 326 | 2053 327 | 2191 328 | 1923 329 | 2164 330 | 1861 331 | 2339 332 | 2333 333 | 2523 334 | 2670 335 | 2121 336 | 1921 337 | 1724 338 | 2253 339 | 2374 340 | 1940 341 | 2545 342 | 2301 343 | 2244 344 | 2156 345 | 1849 346 | 2551 347 | 2011 348 | 2279 349 | 2572 350 | 1757 351 | 2400 352 | 2569 353 | 2072 354 | 2526 355 | 2173 356 | 2069 357 | 2036 358 | 1819 359 | 1734 360 | 1880 361 | 2137 362 | 2408 363 | 2226 364 | 2604 365 | 1771 366 | 2698 367 | 2187 368 | 2060 369 | 1756 370 | 2201 371 | 2066 372 | 2439 373 | 1844 374 | 1772 375 | 2383 376 | 2398 377 | 1708 378 | 1992 379 | 1959 380 | 1794 381 | 2426 382 | 2702 383 | 2444 384 | 1944 385 | 1829 386 | 2660 387 | 2497 388 | 2607 389 | 2343 390 | 1730 391 | 2624 392 | 1790 393 | 1935 394 | 1967 395 | 2401 396 | 2255 397 | 2355 398 | 2348 399 | 1931 400 | 2183 401 | 2161 402 | 2701 403 | 1948 404 | 2501 405 | 2192 406 | 2404 407 | 2209 408 | 2331 409 | 1810 410 | 2363 411 | 2334 412 | 1887 413 | 2393 414 | 2557 415 | 1719 416 | 1732 417 | 1986 418 | 2037 419 | 2056 420 | 1867 421 | 2126 422 | 1932 423 | 2117 424 | 1807 425 | 1801 426 | 1743 427 | 2041 428 | 1843 429 | 2388 430 | 2221 431 | 1833 432 | 2677 433 | 1778 434 | 2661 435 | 2306 436 | 2394 437 | 2106 438 | 2430 439 | 2371 440 | 2606 441 | 2353 442 | 2269 443 | 2317 444 | 2645 445 | 2372 446 | 2550 447 | 2043 448 | 1968 449 | 2165 450 | 2310 451 | 1985 452 | 2446 453 | 1982 454 | 2377 455 | 2207 456 | 1818 457 | 1913 458 | 1766 459 | 1722 460 | 1894 461 | 2020 462 | 1881 463 | 2621 464 | 2409 465 | 2261 466 | 2458 467 | 2096 468 | 1712 469 | 2594 470 | 2293 471 | 2048 472 | 2359 473 | 1839 474 | 2392 475 | 2254 476 | 1911 477 | 2101 478 | 2367 479 | 1889 480 | 1753 481 | 2555 482 | 2246 483 | 2264 484 | 2010 485 | 2336 486 | 2651 487 | 2017 488 | 2140 489 | 1842 490 | 2019 491 | 1890 492 | 2525 493 | 2134 494 | 2492 495 | 2652 496 | 2040 497 | 2145 498 | 2575 499 | 2166 500 | 1999 501 | 2434 502 | 1711 503 | 2276 504 | 2450 505 | 2389 506 | 2669 507 | 2595 508 | 1814 509 | 2039 510 | 2502 511 | 1896 512 | 2168 513 | 2344 514 | 2637 515 | 2031 516 | 1977 517 | 2380 518 | 1936 519 | 2047 520 | 2460 521 | 2102 522 | 1745 523 | 2650 524 | 2046 525 | 2514 526 | 1980 527 | 2352 528 | 2113 529 | 1713 530 | 2058 531 | 2558 532 | 1718 533 | 1864 534 | 1876 535 | 2338 536 | 1879 537 | 1891 538 | 2186 539 | 2451 540 | 2181 541 | 2638 542 | 2644 543 | 2103 544 | 2591 545 | 2266 546 | 2468 547 | 1869 548 | 2582 549 | 2674 550 | 2361 551 | 2462 552 | 1748 553 | 2215 554 | 2615 555 | 2236 556 | 2248 557 | 2493 558 | 2342 559 | 2449 560 | 2274 561 | 1824 562 | 1852 563 | 1870 564 | 2441 565 | 2356 566 | 1835 567 | 2694 568 | 2602 569 | 2685 570 | 1893 571 | 2544 572 | 2536 573 | 1994 574 | 1853 575 | 
1838 576 | 1786 577 | 1930 578 | 2539 579 | 1892 580 | 2265 581 | 2618 582 | 2486 583 | 2583 584 | 2061 585 | 1796 586 | 1806 587 | 2084 588 | 1933 589 | 2095 590 | 2136 591 | 2078 592 | 1884 593 | 2438 594 | 2286 595 | 2138 596 | 1750 597 | 2184 598 | 1799 599 | 2278 600 | 2410 601 | 2642 602 | 2435 603 | 1956 604 | 2399 605 | 1774 606 | 2129 607 | 1898 608 | 1823 609 | 1938 610 | 2299 611 | 1862 612 | 2420 613 | 2673 614 | 1984 615 | 2204 616 | 1717 617 | 2074 618 | 2213 619 | 2436 620 | 2297 621 | 2592 622 | 2667 623 | 2703 624 | 2511 625 | 1779 626 | 1782 627 | 2625 628 | 2365 629 | 2315 630 | 2381 631 | 1788 632 | 1714 633 | 2302 634 | 1927 635 | 2325 636 | 2506 637 | 2169 638 | 2328 639 | 2629 640 | 2128 641 | 2655 642 | 2282 643 | 2073 644 | 2395 645 | 2247 646 | 2521 647 | 2260 648 | 1868 649 | 1988 650 | 2324 651 | 2705 652 | 2541 653 | 1731 654 | 2681 655 | 2707 656 | 2465 657 | 1785 658 | 2149 659 | 2045 660 | 2505 661 | 2611 662 | 2217 663 | 2180 664 | 1904 665 | 2453 666 | 2484 667 | 1871 668 | 2309 669 | 2349 670 | 2482 671 | 2004 672 | 1965 673 | 2406 674 | 2162 675 | 1805 676 | 2654 677 | 2007 678 | 1947 679 | 1981 680 | 2112 681 | 2141 682 | 1720 683 | 1758 684 | 2080 685 | 2330 686 | 2030 687 | 2432 688 | 2089 689 | 2547 690 | 1820 691 | 1815 692 | 2675 693 | 1840 694 | 2658 695 | 2370 696 | 2251 697 | 1908 698 | 2029 699 | 2068 700 | 2513 701 | 2549 702 | 2267 703 | 2580 704 | 2327 705 | 2351 706 | 2111 707 | 2022 708 | 2321 709 | 2614 710 | 2252 711 | 2104 712 | 1822 713 | 2552 714 | 2243 715 | 1798 716 | 2396 717 | 2663 718 | 2564 719 | 2148 720 | 2562 721 | 2684 722 | 2001 723 | 2151 724 | 2706 725 | 2240 726 | 2474 727 | 2303 728 | 2634 729 | 2680 730 | 2055 731 | 2090 732 | 2503 733 | 2347 734 | 2402 735 | 2238 736 | 1950 737 | 2054 738 | 2016 739 | 1872 740 | 2233 741 | 1710 742 | 2032 743 | 2540 744 | 2628 745 | 1795 746 | 2616 747 | 1903 748 | 2531 749 | 2567 750 | 1946 751 | 1897 752 | 2222 753 | 2227 754 | 2627 755 | 1856 756 | 2464 757 | 2241 758 | 2481 759 | 2130 760 | 2311 761 | 2083 762 | 2223 763 | 2284 764 | 2235 765 | 2097 766 | 1752 767 | 2515 768 | 2527 769 | 2385 770 | 2189 771 | 2283 772 | 2182 773 | 2079 774 | 2375 775 | 2174 776 | 2437 777 | 1993 778 | 2517 779 | 2443 780 | 2224 781 | 2648 782 | 2171 783 | 2290 784 | 2542 785 | 2038 786 | 1855 787 | 1831 788 | 1759 789 | 1848 790 | 2445 791 | 1827 792 | 2429 793 | 2205 794 | 2598 795 | 2657 796 | 1728 797 | 2065 798 | 1918 799 | 2427 800 | 2573 801 | 2620 802 | 2292 803 | 1777 804 | 2008 805 | 1875 806 | 2288 807 | 2256 808 | 2033 809 | 2470 810 | 2585 811 | 2610 812 | 2082 813 | 2230 814 | 1915 815 | 1847 816 | 2337 817 | 2512 818 | 2386 819 | 2006 820 | 2653 821 | 2346 822 | 1951 823 | 2110 824 | 2639 825 | 2520 826 | 1939 827 | 2683 828 | 2139 829 | 2220 830 | 1910 831 | 2237 832 | 1900 833 | 1836 834 | 2197 835 | 1716 836 | 1860 837 | 2077 838 | 2519 839 | 2538 840 | 2323 841 | 1914 842 | 1971 843 | 1845 844 | 2132 845 | 1802 846 | 1907 847 | 2640 848 | 2496 849 | 2281 850 | 2198 851 | 2416 852 | 2285 853 | 1755 854 | 2431 855 | 2071 856 | 2249 857 | 2123 858 | 1727 859 | 2459 860 | 2304 861 | 2199 862 | 1791 863 | 1809 864 | 1780 865 | 2210 866 | 2417 867 | 1874 868 | 1878 869 | 2116 870 | 1961 871 | 1863 872 | 2579 873 | 2477 874 | 2228 875 | 2332 876 | 2578 877 | 2457 878 | 2024 879 | 1934 880 | 2316 881 | 1841 882 | 1764 883 | 1737 884 | 2322 885 | 2239 886 | 2294 887 | 1729 888 | 2488 889 | 1974 890 | 2473 891 | 2098 892 | 2612 893 | 1834 894 | 2340 895 | 2423 896 | 2175 897 | 2280 898 | 
2617 899 | 2208 900 | 2560 901 | 1741 902 | 2600 903 | 2059 904 | 1747 905 | 2242 906 | 2700 907 | 2232 908 | 2057 909 | 2147 910 | 2682 911 | 1792 912 | 1826 913 | 2120 914 | 1895 915 | 2364 916 | 2163 917 | 1851 918 | 2391 919 | 2414 920 | 2452 921 | 1803 922 | 1989 923 | 2623 924 | 2200 925 | 2528 926 | 2415 927 | 1804 928 | 2146 929 | 2619 930 | 2687 931 | 1762 932 | 2172 933 | 2270 934 | 2678 935 | 2593 936 | 2448 937 | 1882 938 | 2257 939 | 2500 940 | 1899 941 | 2478 942 | 2412 943 | 2107 944 | 1746 945 | 2428 946 | 2115 947 | 1800 948 | 1901 949 | 2397 950 | 2530 951 | 1912 952 | 2108 953 | 2206 954 | 2091 955 | 1740 956 | 2219 957 | 1976 958 | 2099 959 | 2142 960 | 2671 961 | 2668 962 | 2216 963 | 2272 964 | 2229 965 | 2666 966 | 2456 967 | 2534 968 | 2697 969 | 2688 970 | 2062 971 | 2691 972 | 2689 973 | 2154 974 | 2590 975 | 2626 976 | 2390 977 | 1813 978 | 2067 979 | 1952 980 | 2518 981 | 2358 982 | 1789 983 | 2076 984 | 2049 985 | 2119 986 | 2013 987 | 2124 988 | 2556 989 | 2105 990 | 2093 991 | 1885 992 | 2305 993 | 2354 994 | 2135 995 | 2601 996 | 1770 997 | 1995 998 | 2504 999 | 1749 1000 | 2157 1001 | -------------------------------------------------------------------------------- /data/ind.cora.tx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lemmonation/spine/6b60528f7f10a9ae9346a273dc63262e18ae89b9/data/ind.cora.tx -------------------------------------------------------------------------------- /data/ind.cora.ty: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lemmonation/spine/6b60528f7f10a9ae9346a273dc63262e18ae89b9/data/ind.cora.ty -------------------------------------------------------------------------------- /data/ind.cora.x: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lemmonation/spine/6b60528f7f10a9ae9346a273dc63262e18ae89b9/data/ind.cora.x -------------------------------------------------------------------------------- /data/ind.cora.y: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lemmonation/spine/6b60528f7f10a9ae9346a273dc63262e18ae89b9/data/ind.cora.y -------------------------------------------------------------------------------- /graph.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """Graph utilities.""" 5 | 6 | import sys 7 | import math 8 | from io import open 9 | from os import path 10 | from glob import glob 11 | from six.moves import range, zip, zip_longest 12 | from six import iterkeys 13 | from time import time 14 | import random 15 | from random import shuffle 16 | from itertools import product,permutations 17 | from scipy.sparse import issparse 18 | from collections import defaultdict, Iterable 19 | from multiprocessing import cpu_count 20 | import logging 21 | 22 | import numpy as np 23 | from concurrent.futures import ProcessPoolExecutor 24 | 25 | from multiprocessing import Pool 26 | from multiprocessing import cpu_count 27 | 28 | logger = logging.getLogger("structural_embedding") 29 | 30 | class Graph(defaultdict): 31 | """Efficient basic implementation of nx `Graph' “ Undirected graphs with self loops""" 32 | def __init__(self): 33 | super(Graph, self).__init__(list) 34 | 35 | def nodes(self): 36 | return self.keys() 37 | 38 | def remove_node(self, n): 39 | nbrs = self[n] 
40 | for u in nbrs: 41 | self[u].remove(n) 42 | del self[n] 43 | 44 | def node(self): 45 | node = {} 46 | nodes = self.keys() 47 | for _id in nodes: 48 | node[_id] = {} 49 | return node 50 | 51 | def adjacency_iter(self): 52 | return self.iteritems() 53 | 54 | def subgraph(self, nodes={}): 55 | subgraph = Graph() 56 | 57 | for n in nodes: 58 | if n in self: 59 | subgraph[n] = [x for x in self[n] if x in nodes] 60 | 61 | return subgraph 62 | 63 | def make_undirected(self): 64 | 65 | t0 = time() 66 | 67 | for v in self.keys(): 68 | for other in self[v]: 69 | if v != other: 70 | self[other].append(v) 71 | 72 | t1 = time() 73 | #logger.info('make_directed: added missing edges {}s'.format(t1-t0)) 74 | 75 | self.make_consistent() 76 | return self 77 | 78 | def make_consistent(self): 79 | t0 = time() 80 | for k in iterkeys(self): 81 | self[k] = list(sorted(set(self[k]))) 82 | 83 | t1 = time() 84 | #logger.info('make_consistent: made consistent in {}s'.format(t1-t0)) 85 | 86 | #self.remove_self_loops() 87 | 88 | return self 89 | 90 | def remove_self_loops(self): 91 | 92 | removed = 0 93 | t0 = time() 94 | 95 | for x in self: 96 | if x in self[x]: 97 | self[x].remove(x) 98 | removed += 1 99 | 100 | t1 = time() 101 | 102 | #logger.info('remove_self_loops: removed {} loops in {}s'.format(removed, (t1-t0))) 103 | return self 104 | 105 | def check_self_loops(self): 106 | for x in self: 107 | for y in self[x]: 108 | if x == y: 109 | return True 110 | 111 | return False 112 | 113 | def has_edge(self, v1, v2): 114 | if v2 in self[v1] or v1 in self[v2]: 115 | return True 116 | return False 117 | 118 | def degree(self, nodes=None): 119 | if isinstance(nodes, Iterable): 120 | return {v:len(self[v]) for v in nodes} 121 | else: 122 | return len(self[nodes]) 123 | 124 | def order(self): 125 | "Returns the number of nodes in the graph" 126 | return len(self) 127 | 128 | def number_of_edges(self): 129 | "Returns the number of nodes in the graph" 130 | return sum([self.degree(x) for x in self.keys()])/2 131 | 132 | def number_of_nodes(self): 133 | "Returns the number of nodes in the graph" 134 | return self.order() 135 | 136 | def gToDict(self): 137 | d = {} 138 | for k,v in self.iteritems(): 139 | d[k] = v 140 | return d 141 | 142 | def printAdjList(self): 143 | for key,value in self.iteritems(): 144 | print (key,":",value) 145 | 146 | def random_walk(self, path_length, alpha=0, rand=random.Random(), start=None): 147 | """ Returns a truncated random walk. 148 | path_length: Length of the random walk. 149 | alpha: probability of restarts. 150 | start: the start node of the random walk. 
151 | """ 152 | G = self 153 | if start: 154 | path = [start] 155 | else: 156 | # Sampling is uniform w.r.t V, and not w.r.t E 157 | path = [rand.choice(G.keys())] 158 | while len(path) < path_length: 159 | cur = path[-1] 160 | if len(G[cur]) > 0: 161 | if rand.random() >= alpha: 162 | path.append(rand.choice(G[cur])) 163 | else: 164 | path.append(path[0]) 165 | else: 166 | break 167 | return path 168 | 169 | def normal_random_walk(self, path_length, rand = random.Random(), start = None): 170 | """ 171 | Define a normal random walk without restart to generate positive training pairs 172 | """ 173 | G = self 174 | pairs = [] 175 | if start: 176 | path = [start] 177 | else: 178 | # Sampling is uniform w.r.t V, and not w.r.t E 179 | path = [rand.choice(G.keys())] 180 | while len(path) < path_length: 181 | cur = path[-1] 182 | if len(G[cur]) > 0: 183 | next_node = rand.choice(G[cur]) 184 | path.append(next_node) 185 | if path[0] != next_node: 186 | pairs.append((path[0], next_node)) 187 | else: 188 | break 189 | return pairs 190 | 191 | def mask_nodes(G, mask_rate, rand = random.Random()): 192 | num_nodes = len(G.nodes()) 193 | train_num = int(mask_rate * num_nodes) 194 | test_num = num_nodes - train_num 195 | 196 | test_id = random.sample(G.nodes(), test_num) 197 | train_id = [i for i in G.nodes() if not i in test_id] 198 | 199 | train_id = sorted(train_id) 200 | test_id = sorted(test_id) 201 | 202 | G = G.subgraph(train_id) # a subgraph constructed only by training nodes 203 | # use a id-map to map the nodes id into (0, len(G)) 204 | sub_id_map = {} 205 | map_sub_id = {} 206 | i = 0 207 | for node in G.keys(): 208 | if i == len(G.keys()): 209 | break 210 | sub_id_map[node] = i 211 | map_sub_id[i] = node 212 | i += 1 213 | 214 | assert len(sub_id_map.keys()) == len(G.keys()), 'nodes in subgraph are not consisitent with in id map' 215 | 216 | map_G = Graph() 217 | for node in G.keys(): 218 | map_G[sub_id_map[node]] = [sub_id_map[i] for i in G[node] if i in G.keys()] 219 | 220 | return map_G, test_id, train_id, map_sub_id 221 | 222 | def write_normal_randomwalks(G, file_, num_paths = 50, path_length = 5, 223 | rand=random.Random(0)): 224 | nodes = list(G.nodes()) 225 | pairs = [] 226 | with open(file_, 'w') as fp: 227 | for cnt in range(num_paths): 228 | rand.shuffle(nodes) 229 | for node in nodes: 230 | pair = G.normal_random_walk(path_length, rand = rand, start = node) 231 | pairs.extend(pair) 232 | for p in pair: 233 | fp.write(unicode("{}\t{}\n".format(p[0], p[1]))) 234 | return pairs 235 | 236 | def build_deepwalk_corpus(G, num_paths, path_length, alpha=0, 237 | rand=random.Random(0)): 238 | walks = {} 239 | #walks = [] 240 | 241 | nodes = list(G.nodes()) 242 | 243 | for cnt in range(num_paths): 244 | rand.shuffle(nodes) 245 | for node in nodes: 246 | #walks.append(G.random_walk(path_length, rand=rand, alpha=alpha, start=node)) 247 | if node in walks.keys(): 248 | walks[node].append(G.random_walk(path_length, rand=rand, alpha=alpha, start=node)) 249 | else: 250 | walks[node] = [] 251 | walks[node].append(G.random_walk(path_length, rand=rand, alpha=alpha, start=node)) 252 | 253 | return walks 254 | 255 | def grouper(n, iterable, padvalue=None): 256 | "grouper(3, 'abcdefg', 'x') --> ('a','b','c'), ('d','e','f'), ('g','x','x')" 257 | return zip_longest(*[iter(iterable)]*n, fillvalue=padvalue) 258 | 259 | def parse_adjacencylist(f): 260 | adjlist = [] 261 | for l in f: 262 | if l and l[0] != "#": 263 | introw = [int(x) for x in l.strip().split()] 264 | row = [introw[0]] 265 | 
row.extend(set(sorted(introw[1:]))) 266 | adjlist.extend([row]) 267 | 268 | return adjlist 269 | 270 | def parse_adjacencylist_unchecked(f): 271 | adjlist = [] 272 | for l in f: 273 | if l and l[0] != "#": 274 | adjlist.extend([[int(x) for x in l.strip().split()]]) 275 | 276 | return adjlist 277 | 278 | def load_adjacencylist(file_, undirected=False, chunksize=10000, unchecked=True): 279 | 280 | if unchecked: 281 | parse_func = parse_adjacencylist_unchecked 282 | convert_func = from_adjlist_unchecked 283 | else: 284 | parse_func = parse_adjacencylist 285 | convert_func = from_adjlist 286 | 287 | adjlist = [] 288 | 289 | t0 = time() 290 | 291 | with open(file_) as f: 292 | with ProcessPoolExecutor(max_workers=cpu_count()) as executor: 293 | total = 0 294 | for idx, adj_chunk in enumerate(executor.map(parse_func, grouper(int(chunksize), f))): 295 | adjlist.extend(adj_chunk) 296 | total += len(adj_chunk) 297 | 298 | t1 = time() 299 | 300 | logger.info('Parsed {} edges with {} chunks in {}s'.format(total, idx, t1-t0)) 301 | 302 | t0 = time() 303 | G = convert_func(adjlist) 304 | t1 = time() 305 | 306 | logger.info('Converted edges to graph in {}s'.format(t1-t0)) 307 | 308 | if undirected: 309 | t0 = time() 310 | G = G.make_undirected() 311 | t1 = time() 312 | logger.info('Made graph undirected in {}s'.format(t1-t0)) 313 | 314 | return G 315 | 316 | def load_edgelist(file_, undirected=True): 317 | G = Graph() 318 | with open(file_) as f: 319 | for l in f: 320 | if(len(l.strip().split()[:2]) > 1): 321 | x, y = l.strip().split()[:2] 322 | x = int(x) 323 | y = int(y) 324 | G[x].append(y) 325 | if undirected: 326 | G[y].append(x) 327 | else: 328 | x = l.strip().split()[:2] 329 | x = int(x[0]) 330 | G[x] = [] 331 | 332 | G.make_consistent() 333 | return G 334 | 335 | def from_adjlist(adjlist): 336 | G = Graph() 337 | 338 | for row in adjlist: 339 | node = row[0] 340 | neighbors = row[1:] 341 | G[node] = list(sorted(set(neighbors))) 342 | 343 | return G 344 | 345 | 346 | def from_adjlist_unchecked(adjlist): 347 | G = Graph() 348 | 349 | for row in adjlist: 350 | node = row[0] 351 | neighbors = row[1:] 352 | G[node] = neighbors 353 | 354 | return G -------------------------------------------------------------------------------- /inits.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import tensorflow as tf 5 | import numpy as np 6 | 7 | # DISCLAIMER: 8 | # Parts of this code file are derived from 9 | # https://github.com/tkipf/gcn 10 | # which is under an identical MIT license as GraphSAGE 11 | 12 | def uniform(shape, scale=0.05, name=None): 13 | """Uniform init.""" 14 | initial = tf.random_uniform(shape, minval=-scale, maxval=scale, dtype=tf.float32) 15 | return tf.Variable(initial, name=name) 16 | 17 | 18 | def glorot(shape, name=None): 19 | """Glorot & Bengio (AISTATS 2010) init.""" 20 | init_range = np.sqrt(6.0/(shape[0]+shape[1])) 21 | initial = tf.random_uniform(shape, minval=-init_range, maxval=init_range, dtype=tf.float32) 22 | return tf.Variable(initial, name=name) 23 | 24 | 25 | def zeros(shape, name=None): 26 | """All zeros.""" 27 | initial = tf.zeros(shape, dtype=tf.float32) 28 | return tf.Variable(initial, name=name) 29 | 30 | def ones(shape, name=None): 31 | """All ones.""" 32 | initial = tf.ones(shape, dtype=tf.float32) 33 | return tf.Variable(initial, name=name) 34 | -------------------------------------------------------------------------------- /layers.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import tensorflow as tf 8 | 9 | from inits import zeros 10 | 11 | flags = tf.app.flags 12 | FLAGS = flags.FLAGS 13 | 14 | # DISCLAIMER: 15 | # Boilerplate parts of this code file were originally forked from 16 | # https://github.com/tkipf/gcn 17 | # which itself was very inspired by the keras package 18 | 19 | # global unique layer ID dictionary for layer name assignment 20 | _LAYER_UIDS = {} 21 | 22 | def get_layer_uid(layer_name=''): 23 | """Helper function, assigns unique layer IDs.""" 24 | if layer_name not in _LAYER_UIDS: 25 | _LAYER_UIDS[layer_name] = 1 26 | return 1 27 | else: 28 | _LAYER_UIDS[layer_name] += 1 29 | return _LAYER_UIDS[layer_name] 30 | 31 | class Layer(object): 32 | """Base layer class. Defines basic API for all layer objects. 33 | Implementation inspired by keras (http://keras.io). 34 | # Properties 35 | name: String, defines the variable scope of the layer. 36 | logging: Boolean, switches Tensorflow histogram logging on/off 37 | 38 | # Methods 39 | _call(inputs): Defines computation graph of layer 40 | (i.e. takes input, returns output) 41 | __call__(inputs): Wrapper for _call() 42 | _log_vars(): Log all variables 43 | """ 44 | 45 | def __init__(self, **kwargs): 46 | allowed_kwargs = {'name', 'logging'} 47 | for kwarg in kwargs.keys(): 48 | assert kwarg in allowed_kwargs, 'Invalid keyword argument: ' + kwarg 49 | name = kwargs.get('name') 50 | if not name: 51 | layer = self.__class__.__name__.lower() 52 | name = layer + '_' + str(get_layer_uid(layer)) 53 | self.name = name 54 | self.vars = {} 55 | logging = kwargs.get('logging', False) 56 | self.logging = logging 57 | self.sparse_inputs = False 58 | 59 | def _call(self, inputs): 60 | return inputs 61 | 62 | def __call__(self, inputs): 63 | with tf.name_scope(self.name): 64 | if self.logging and not self.sparse_inputs: 65 | tf.summary.histogram(self.name + '/inputs', inputs) 66 | outputs = self._call(inputs) 67 | if self.logging: 68 | tf.summary.histogram(self.name + '/outputs', outputs) 69 | return outputs 70 | 71 | def _log_vars(self): 72 | for var in self.vars: 73 | tf.summary.histogram(self.name + '/vars/' + var, self.vars[var]) 74 | 75 | 76 | class Dense(Layer): 77 | """Dense layer.""" 78 | def __init__(self, input_dim, output_dim, dropout=0., 79 | act=tf.nn.relu, placeholders=None, bias=True, featureless=False, 80 | sparse_inputs=False, **kwargs): 81 | super(Dense, self).__init__(**kwargs) 82 | 83 | self.dropout = dropout 84 | 85 | self.act = act 86 | self.featureless = featureless 87 | self.bias = bias 88 | self.input_dim = input_dim 89 | self.output_dim = output_dim 90 | 91 | # helper variable for sparse dropout 92 | self.sparse_inputs = sparse_inputs 93 | if sparse_inputs: 94 | self.num_features_nonzero = placeholders['num_features_nonzero'] 95 | 96 | with tf.variable_scope(self.name + '_vars'): 97 | self.vars['weights'] = tf.get_variable('weights', shape=(input_dim, output_dim), 98 | dtype=tf.float32, 99 | initializer=tf.contrib.layers.xavier_initializer(), 100 | regularizer=tf.contrib.layers.l2_regularizer(FLAGS.weight_decay)) 101 | if self.bias: 102 | self.vars['bias'] = zeros([output_dim], name='bias') 103 | 104 | if self.logging: 105 | self._log_vars() 106 | 107 | def _call(self, inputs): 108 | x = inputs 109 | 110 | x = tf.nn.dropout(x, 1-self.dropout) 111 | 112 | # transform 113 | 
output = tf.matmul(x, self.vars['weights']) 114 | 115 | # bias 116 | if self.bias: 117 | output += self.vars['bias'] 118 | 119 | return self.act(output) 120 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import os 8 | import time 9 | import tensorflow as tf 10 | from tensorflow.python import debug as tf_debug 11 | import numpy as np 12 | import scipy.io as sio 13 | from scipy import spatial 14 | import networkx as nx 15 | import random 16 | from tqdm import tqdm 17 | from collections import Counter 18 | 19 | import graph 20 | import utils 21 | from models import AggregateModel, PretrainModel 22 | from minibatch import MinibatchIterator 23 | import test 24 | 25 | os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID" 26 | 27 | flags = tf.app.flags 28 | FLAGS = flags.FLAGS 29 | 30 | tf.app.flags.DEFINE_boolean('log_device_placement', False, 31 | """Whether to log device placement.""") 32 | 33 | flags.DEFINE_string('input', '', "the input graph edge list file; must be specified") 34 | flags.DEFINE_string('train_prefix', 'cora', "dataset name.") 35 | flags.DEFINE_boolean('preprocess', False, "set to True on the first run; can be set to False once preprocessing has been done") 36 | flags.DEFINE_integer('dim', 200, "embedding dimension.") 37 | flags.DEFINE_integer('batchsize', 512, "batch size") 38 | flags.DEFINE_integer('epoch', 5, "number of training epochs") 39 | flags.DEFINE_float('learning_rate', 0.001, "learning rate") 40 | flags.DEFINE_float('stru_rate', 0.2, "probability of structural sampling versus neighbor sampling") 41 | flags.DEFINE_integer('walk_times', 10, "number of random walks started at every node.") 42 | flags.DEFINE_integer('walk_length', 40, "random walk length at each node.") 43 | flags.DEFINE_integer('k_RPR', 20, "top-k Rooted PageRank nodes of the current node.") 44 | flags.DEFINE_float('alpha', 0.5, "restart probability when random walking.") 45 | flags.DEFINE_integer('neg_sample_size', 50, "negative sampling size.") 46 | flags.DEFINE_integer('verbose', 20, "how often (in iterations) to print information") 47 | flags.DEFINE_float('dropout', 0., "dropout rate in MLP") 48 | flags.DEFINE_float('weight_decay', 0.001, 'weight for l2 loss on embedding matrix.') 49 | flags.DEFINE_integer('save_per_epoch', 200, 'how often (in epochs) to save the model') 50 | flags.DEFINE_integer('seed', 123, "random seed for random walks.") 51 | flags.DEFINE_integer('gpu', 0, "which gpu to use.") 52 | flags.DEFINE_boolean('PRETRAIN', True, 'whether W_mlp is pretrained by node2vec') 53 | flags.DEFINE_boolean('CORR', False, "use Pearson and Spearman correlation for evaluation") 54 | 55 | seed = FLAGS.seed 56 | np.random.seed(seed) 57 | tf.set_random_seed(seed) 58 | 59 | os.environ["CUDA_VISIBLE_DEVICES"]=str(FLAGS.gpu) 60 | 61 | GPU_MEM_FRACTION = 0.8 62 | 63 | def log_dir(): 64 | log_dir = "./log/" + FLAGS.train_prefix 65 | log_dir += "/{lr:0.3f}_{stru_rate:0.1f}_{rpr_k:d}/".format( 66 | lr = FLAGS.learning_rate, 67 | stru_rate = FLAGS.stru_rate, 68 | rpr_k = FLAGS.k_RPR 69 | ) 70 | if not os.path.exists(log_dir): 71 | os.makedirs(log_dir) 72 | return log_dir 73 | 74 | def read_graph(): 75 | ''' 76 | Reads the input network.
77 | ''' 78 | print (" - Loading graph...") 79 | G = graph.load_edgelist(FLAGS.input,undirected=True) 80 | print (" - Graph loaded.") 81 | return G 82 | 83 | def construct_rpr_matrix(G, INDUCTIVE = False): 84 | ''' 85 | Construct Rooted PageRank matrix 86 | ''' 87 | print ("Number of nodes: {}".format(len(G.nodes()))) 88 | num_walks = len(G.nodes()) * FLAGS.walk_times 89 | num_nodes = len(G.nodes()) 90 | 91 | print("Number of walks: {}".format(num_walks)) 92 | print("Walking...") 93 | walks = graph.build_deepwalk_corpus(G, num_paths=FLAGS.walk_times, path_length=FLAGS.walk_length, 94 | alpha=FLAGS.alpha, rand=random.Random(FLAGS.seed)) 95 | all_counts = {} 96 | for node in walks.keys(): 97 | walks_n = walks[node] 98 | all_counts[node] = Counter() 99 | for walk in walks_n: 100 | all_counts[node].update(walk) 101 | 102 | print("Normal random walks started...") 103 | pairs = graph.write_normal_randomwalks(G, 104 | file_= './var/' + FLAGS.train_prefix + '_normal_walks.txt',rand=random.Random(FLAGS.seed)) 105 | 106 | print("Normal random walks dumped.") 107 | 108 | rpr_matrix = [] 109 | rpr_arg = [] 110 | for node in tqdm(xrange(num_nodes)): 111 | if node not in all_counts.keys(): 112 | raise NotImplementedError 113 | temp = all_counts[node].most_common(FLAGS.k_RPR) 114 | temp_arg = [i[0] for i in temp] 115 | temp_value = [i[1] for i in temp] 116 | if len(temp) < FLAGS.k_RPR: 117 | for _ in xrange(FLAGS.k_RPR - len(temp)): 118 | temp_value.append(0.0) 119 | temp_arg.append(node) 120 | temp_value = np.asarray(temp_value, dtype = 'double') 121 | temp_value = temp_value / sum(temp_value) 122 | rpr_matrix.append(temp_value) 123 | rpr_arg.append(temp_arg) 124 | rpr_matrix = np.asarray(rpr_matrix, dtype = 'double') 125 | rpr_arg = np.asarray(rpr_arg, dtype = 'double') 126 | rpr_file = './var/' + FLAGS.train_prefix + '_rpr.mat' 127 | 128 | sio.savemat(rpr_file, {'rpr_matrix':rpr_matrix}) 129 | return rpr_matrix, pairs, rpr_arg 130 | 131 | def construct_placeholders(): 132 | placeholders = { 133 | 'train_inputs' : tf.placeholder(tf.int32, shape = (None), name = 'train_inputs'), 134 | 'train_labels' : tf.placeholder(tf.int32, shape = (None), name = 'train_labels'), 135 | 'batchsize' : tf.placeholder(tf.int32, name = 'batchsize') 136 | } 137 | return placeholders 138 | 139 | def main(): 140 | G = read_graph() 141 | if FLAGS.preprocess: 142 | print (" - Computing Rooted PageRank matrix...") 143 | rpr_matrix, pairs, rpr_arg = construct_rpr_matrix(G) 144 | utils.dump_to_disk(rpr_arg, './var/' + FLAGS.train_prefix + '_rpr_arg') 145 | print (" - RPR matrix completed.") 146 | degrees, degree_permuted = utils.create_degree(G) 147 | print (" - Dumping degree vectors to disk...") 148 | utils.dump_to_disk(degrees, './var/' + FLAGS.train_prefix + '_degrees') 149 | utils.dump_to_disk(degree_permuted, './var/' + FLAGS.train_prefix + '_degree_permuted') 150 | print (" - Degree vectors dumped.") 151 | else: 152 | print (" - Loading precomputed Rooted PageRank matrix...") 153 | rpr_file = './var/' + FLAGS.train_prefix + '_rpr.mat' 154 | rpr_matrix = sio.loadmat(rpr_file)['rpr_matrix'] 155 | rpr_arg = utils.load_pkl('./var/' + FLAGS.train_prefix + '_rpr_arg') 156 | print (" - RPR matrix loaded.") 157 | print (" - Loading Degree vectors...") 158 | degrees = utils.load_pkl('./var/' + FLAGS.train_prefix + '_degrees') 159 | degree_permuted = utils.load_pkl('./var/' + FLAGS.train_prefix + '_degree_permuted') 160 | print (" - Degree vectors loaded.") 161 | pairs = [] 162 | with open('./var/' + FLAGS.train_prefix + 
'_normal_walks.txt', 'r') as fp: 163 | for line in fp: 164 | n_pair = line.split() 165 | pairs.append((int(n_pair[0]), int(n_pair[1]))) 166 | print (" - Training pairs loaded") 167 | 168 | placeholders = construct_placeholders() 169 | 170 | minibatch = MinibatchIterator(G, placeholders, degrees, rpr_matrix, pairs, 171 | batchsize = FLAGS.batchsize, stru_rate = FLAGS.stru_rate, dataset = FLAGS.train_prefix) 172 | 173 | _, features, _, _ = utils.load_pdata(FLAGS.train_prefix) 174 | # TODO: maybe can be more efficiently written by sparse multipications 175 | features = np.asarray(features.todense()) 176 | 177 | if FLAGS.PRETRAIN: 178 | from gensim.models.keyedvectors import KeyedVectors 179 | n2v_embedding = './baselines/{}_{}.embeddings'.format('node2vec', FLAGS.train_prefix) 180 | n_model = KeyedVectors.load_word2vec_format(n2v_embedding, binary=False) 181 | pretrained = np.asarray([n_model[str(node)] for node in xrange(rpr_matrix.shape[0])]) 182 | model = PretrainModel(placeholders, features, pretrained, len(G.nodes()), 183 | degree_permuted, rpr_matrix, rpr_arg, 184 | dropout = FLAGS.dropout, 185 | nodevec_dim = FLAGS.dim, 186 | lr = FLAGS.learning_rate, 187 | logging = True) 188 | else: 189 | model = AggregateModel(placeholders, features, len(G.nodes()), 190 | degree_permuted, rpr_matrix, rpr_arg, 191 | dropout = FLAGS.dropout, 192 | nodevec_dim = FLAGS.dim, 193 | lr = FLAGS.learning_rate, 194 | logging = True) 195 | 196 | config = tf.ConfigProto(log_device_placement=FLAGS.log_device_placement) 197 | config.gpu_options.allow_growth = True 198 | config.allow_soft_placement = True 199 | 200 | sess = tf.Session(config = config) 201 | saver = tf.train.Saver(max_to_keep = 5) 202 | merged = tf.summary.merge_all() 203 | summary_writer = tf.summary.FileWriter(log_dir(), sess.graph) 204 | 205 | # Init variables 206 | sess.run(tf.global_variables_initializer()) 207 | 208 | # Train model 209 | total_steps = 0 210 | average_time = 0.0 211 | average_test = 0.0 212 | test_steps = 0 213 | epoch_test_acc = [0.0] 214 | 215 | for epoch in xrange(FLAGS.epoch): 216 | minibatch.shuffle() 217 | _iter = 0 218 | print ("Epoch : %02d" % (epoch + 1), "Batchs per epoch : %04d" % (len(pairs) / FLAGS.batchsize)) 219 | 220 | while not minibatch.end(): 221 | feed_dict = minibatch.next_minibatch_feed_dict() 222 | t = time.time() 223 | # training step 224 | outs = sess.run([merged, model.opt_op, 225 | model.loss, model.embeddings], feed_dict = feed_dict) 226 | train_cost = outs[2] 227 | 228 | average_time = (average_time * total_steps + time.time() - t) / (total_steps + 1) 229 | 230 | if _iter % FLAGS.verbose == 0: 231 | if FLAGS.CORR: 232 | all_feed = minibatch.all_feed_dict() 233 | out = sess.run([model.train_inputs_all, 234 | model.train_inputs_f, model.embed, model.loss], feed_dict = all_feed) 235 | str_corr = test.compute_correlation(FLAGS.train_prefix, out[1], rpr_matrix) 236 | print ("Epoch: ", '%02d' % (epoch + 1), 237 | "iter: ", '%03d' % _iter, 238 | "loss: ", "{:.3f}".format(train_cost), 239 | "corr: ", str_corr, 240 | "train time: ", "{:.3f}".format(average_time)) 241 | else: 242 | train_feed, test_feed = minibatch.test_feed_dict() 243 | out_train = sess.run([model.train_inputs_all, 244 | model.train_inputs_f, model.embed], feed_dict = train_feed) 245 | t1 = time.time() 246 | out_test = sess.run([model.train_inputs_all, 247 | model.train_inputs_f, model.embed], feed_dict = test_feed) 248 | average_test = (average_test * test_steps + time.time() - t1) / (test_steps + 1) 249 | test_steps += 1 250 | 251 | acc_f 
= test.feature_test(FLAGS.train_prefix, out_train[1], out_test[1]) 252 | epoch_test_acc.append(acc_f) 253 | print ("Epoch: ", '%02d' % (epoch + 1), 254 | "iter: ", '%03d' % _iter, 255 | "loss: ", "{:.3f}".format(train_cost), 256 | "now acc: ", "{:.3f}".format(epoch_test_acc[-1]), 257 | "best acc: ", "{:.3f}".format(max(epoch_test_acc)), 258 | "train time: ", "{:.3f}".format(average_time), 259 | "test time: ", "{:.3f}".format(average_test)) 260 | 261 | _iter += 1 262 | total_steps += 1 263 | if epoch % FLAGS.save_per_epoch: 264 | saver.save(sess, os.path.join(log_dir(), 'model.ckpt'), epoch) 265 | print ("Optimization finished !") 266 | 267 | if __name__ == '__main__': 268 | main() 269 | -------------------------------------------------------------------------------- /minibatch.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | 4 | import numpy as np 5 | import math 6 | import random 7 | from scipy.stats import pearsonr, spearmanr 8 | from fastdtw import fastdtw 9 | 10 | import graph 11 | import utils 12 | 13 | class MinibatchIterator(object): 14 | """ 15 | Minibatch Iterator to sample random pairs of positive nodes 16 | """ 17 | def __init__(self, G, placeholders, degrees, rpr_matrix, context_pair, 18 | batchsize = 128, stru_rate = 0.5, dataset = 'cora', **kwargs): 19 | self.G = G 20 | self.placeholders = placeholders 21 | self.node_num = len(G.nodes()) 22 | self.nodes = np.random.permutation(G.nodes()) 23 | self.edges = np.random.permutation(context_pair) 24 | self.batch_num = 0 25 | self.batchsize = batchsize 26 | self.degrees = degrees 27 | self.rpr_matrix = rpr_matrix 28 | self.stru_rate =stru_rate 29 | self.dataset = dataset 30 | 31 | def batch_feed_dict(self, batch_edges): 32 | train_inputs = [] 33 | train_labels = [] 34 | for node1, node2 in batch_edges: 35 | train_inputs.append(node1) 36 | if random.random() <= self.stru_rate: 37 | degree_neighbors = self.get_vertices(node1) 38 | rpr_sample_node = self.rpr_sample(node1, degree_neighbors) 39 | train_labels.append(rpr_sample_node) 40 | else: 41 | train_labels.append(node2) 42 | 43 | feed_dict = dict() 44 | feed_dict.update({self.placeholders['batchsize'] : len(batch_edges)}) 45 | feed_dict.update({self.placeholders['train_inputs'] : train_inputs}) 46 | feed_dict.update({self.placeholders['train_labels'] : train_labels}) 47 | 48 | return feed_dict 49 | 50 | def node_feed_dict(self, batch_nodes): 51 | train_inputs = train_labels = batch_nodes 52 | # train_labels is not important, thus equals them 53 | 54 | feed_dict = dict() 55 | feed_dict.update({self.placeholders['batchsize'] : len(batch_nodes)}) 56 | feed_dict.update({self.placeholders['train_inputs'] : train_inputs}) 57 | feed_dict.update({self.placeholders['train_labels'] : train_labels}) 58 | 59 | return feed_dict 60 | 61 | def all_feed_dict(self): 62 | id_range = range(self.node_num) 63 | return self.node_feed_dict(id_range) 64 | 65 | def test_feed_dict(self): 66 | _, _, test_train, test_test = utils.load_pdata(self.dataset) 67 | test_train = test_train[:, 0] 68 | test_test = test_test[:, 0] 69 | t_train_feed = self.node_feed_dict(test_train) 70 | t_test_feed = self.node_feed_dict(test_test) 71 | return t_train_feed, t_test_feed 72 | 73 | def next_minibatch_feed_dict(self): 74 | start = self.batch_num * self.batchsize 75 | self.batch_num += 1 76 | batch_edges = self.edges[start : start + self.batchsize] 77 | return self.batch_feed_dict(batch_edges) 78 | 79 | def 
rpr_sample(self, node, neighbors): 80 | node_rpr_v = self.rpr_matrix[node] 81 | sim_list = [] 82 | for _neighbor in neighbors: 83 | neighbor_rpr_v = self.rpr_matrix[_neighbor] 84 | dits_dtw, _ = fastdtw(node_rpr_v, neighbor_rpr_v, radius = 1, dist = utils.cost) 85 | sim_list.append(np.exp(-1.0 * dits_dtw)) 86 | 87 | norm_weight = [float(i) / sum(sim_list) for i in sim_list] 88 | sampled_neighbor = np.random.choice(neighbors, p = norm_weight) 89 | return sampled_neighbor 90 | 91 | def get_vertices(self, v): 92 | num_seleted = 2 * math.log(self.node_num, 2) 93 | vertices = [] 94 | 95 | degree_v = self.G.degree(v) 96 | 97 | try: 98 | c_v = 0 99 | 100 | for v2 in self.degrees[degree_v]['vertices']: 101 | if(v != v2): 102 | vertices.append(v2) 103 | c_v += 1 104 | if(c_v > num_seleted): 105 | raise StopIteration 106 | 107 | if('before' not in self.degrees[degree_v]): 108 | degree_b = -1 109 | else: 110 | degree_b = self.degrees[degree_v]['before'] 111 | if('after' not in self.degrees[degree_v]): 112 | degree_a = -1 113 | else: 114 | degree_a = self.degrees[degree_v]['after'] 115 | if(degree_b == -1 and degree_a == -1): 116 | raise StopIteration 117 | degree_now = utils.verifyDegrees(degree_v,degree_a,degree_b) 118 | 119 | while True: 120 | for v2 in self.degrees[degree_now]['vertices']: 121 | if(v != v2): 122 | vertices.append(v2) 123 | c_v += 1 124 | if(c_v > num_seleted): 125 | raise StopIteration 126 | 127 | if(degree_now == degree_b): 128 | if('before' not in self.degrees[degree_b]): 129 | degree_b = -1 130 | else: 131 | degree_b = self.degrees[degree_b]['before'] 132 | else: 133 | if('after' not in self.degrees[degree_a]): 134 | degree_a = -1 135 | else: 136 | degree_a = self.degrees[degree_a]['after'] 137 | 138 | if(degree_b == -1 and degree_a == -1): 139 | raise StopIteration 140 | 141 | degree_now = utils.verifyDegrees(degree_v,degree_a,degree_b) 142 | 143 | except StopIteration: 144 | return list(vertices) 145 | 146 | return list(vertices) 147 | 148 | def end(self): 149 | return self.batch_num * self.batchsize > len(self.edges) - self.batchsize + 1 150 | 151 | def shuffle(self): 152 | """ 153 | Re-shuffle the training set. 
154 | And the batch number 155 | """ 156 | self.nodes = np.random.permutation(self.nodes) 157 | self.edges = np.random.permutation(self.edges) 158 | self.batch_num = 0 -------------------------------------------------------------------------------- /models.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import tensorflow as tf 8 | from tensorflow.contrib.tensorboard.plugins import projector 9 | import numpy as np 10 | import math 11 | 12 | import inits 13 | import graph 14 | import utils 15 | from aggregators import WeightedAggregator, MeanAggregator 16 | 17 | flags = tf.app.flags 18 | FLAGS = flags.FLAGS 19 | 20 | class Model(object): 21 | def __init__(self, **kwargs): 22 | allowed_kwargs = {'name', 'logging'} 23 | for kwarg in kwargs.keys(): 24 | assert kwarg in allowed_kwargs, 'Invalid keyword argument: ' + kwarg 25 | name = kwargs.get('name') 26 | if not name: 27 | name = self.__class__.__name__.lower() 28 | self.name = name 29 | 30 | logging = kwargs.get('logging', False) 31 | self.logging = logging 32 | 33 | self.vars = {} 34 | self.placeholders = {} 35 | 36 | self.layers = [] 37 | self.activations = [] 38 | 39 | self.inputs = None 40 | self.outputs = None 41 | 42 | self.loss = 0 43 | self.accuracy = 0 44 | self.optimizer = None 45 | self.opt_op = None 46 | 47 | def _build(self): 48 | raise NotImplementedError 49 | 50 | def build(self): 51 | """ Wrapper for _build() """ 52 | with tf.variable_scope(self.name): 53 | self._build() 54 | 55 | # Build sequential layer model 56 | self.activations.append(self.inputs) 57 | for layer in self.layers: 58 | hidden = layer(self.activations[-1]) 59 | self.activations.append(hidden) 60 | self.outputs = self.activations[-1] 61 | 62 | # Store model variables for easy access 63 | variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.name) 64 | self.vars = {var.name: var for var in variables} 65 | 66 | # Build metrics 67 | self._loss() 68 | self._accuracy() 69 | 70 | self.opt_op = self.optimizer.minimize(self.loss) 71 | 72 | def predict(self): 73 | pass 74 | 75 | def _loss(self): 76 | raise NotImplementedError 77 | 78 | def _accuracy(self): 79 | raise NotImplementedError 80 | 81 | def save(self, sess=None): 82 | if not sess: 83 | raise AttributeError("TensorFlow session not provided.") 84 | saver = tf.train.Saver(self.vars) 85 | save_path = saver.save(sess, "tmp/%s.ckpt" % self.name) 86 | print("Model saved in file: %s" % save_path) 87 | 88 | def load(self, sess=None): 89 | if not sess: 90 | raise AttributeError("TensorFlow session not provided.") 91 | saver = tf.train.Saver(self.vars) 92 | save_path = "tmp/%s.ckpt" % self.name 93 | saver.restore(sess, save_path) 94 | print("Model restored from file: %s" % save_path) 95 | 96 | class GeneralizedModel(Model): 97 | """ 98 | Base class for models that aren't constructed from traditional, sequential layers. 
99 | Subclasses must set self.outputs in _build method 100 | 101 | (Removes the layers idiom from build method of the Model class) 102 | """ 103 | 104 | def __init__(self, **kwargs): 105 | super(GeneralizedModel, self).__init__(**kwargs) 106 | 107 | 108 | def build(self): 109 | """ Wrapper for _build() """ 110 | with tf.variable_scope(self.name): 111 | self._build() 112 | 113 | # Store model variables for easy access 114 | variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.name) 115 | self.vars = {var.name: var for var in variables} 116 | 117 | # Build metrics 118 | self._loss() 119 | self._accuracy() 120 | 121 | self.opt_op = self.optimizer.minimize(self.loss) 122 | 123 | class PretrainModel(GeneralizedModel): 124 | def __init__(self, placeholders, features, pretrained, dict_size, degree_permuted, rpr_matrix, 125 | rpr_arg, dropout = 0., nodevec_dim = 200, lr = 0.001, only_f = False, **kwargs): 126 | """ 127 | W_mlp is pretrained by node2vec 128 | """ 129 | 130 | super(PretrainModel, self).__init__(**kwargs) 131 | 132 | self.placeholders = placeholders 133 | self.degrees = degree_permuted 134 | self.only_f = only_f 135 | self.rpr_arg = tf.Variable(tf.constant(rpr_arg, dtype = tf.int64), trainable = False) 136 | self.rpr_matrix = tf.Variable(tf.constant(rpr_matrix, dtype = tf.float32), trainable = False) 137 | 138 | self.dropout = dropout 139 | self.feature_dim = features.shape[1] 140 | self.features = tf.Variable(tf.constant(features, dtype = tf.float32), trainable = False) 141 | self.train_inputs = placeholders["train_inputs"] 142 | self.train_labels = placeholders["train_labels"] 143 | self.batchsize = placeholders["batchsize"] 144 | self.dim = dict_size 145 | self.nodevec_dim = nodevec_dim 146 | self.lr = lr 147 | 148 | self.embeddings = tf.Variable(tf.constant(pretrained, dtype = tf.float32), 149 | trainable = True, name = "embeddings") 150 | self.nce_weights = tf.Variable(tf.constant(pretrained, dtype = tf.float32), 151 | trainable = True, name = "nce_weights") 152 | 153 | self.aggregator_t = WeightedAggregator(self.feature_dim, self.nodevec_dim, dropout = self.dropout, 154 | name = 'true_agg') 155 | 156 | self.optimizer = tf.train.AdamOptimizer(learning_rate = lr) 157 | 158 | self.build() 159 | 160 | def sample_aggregate(self, input_args, bs, aggregator): 161 | samples_arg = tf.nn.embedding_lookup(self.rpr_arg, input_args) 162 | samples_weights = tf.nn.embedding_lookup(self.rpr_matrix, input_args) 163 | samples_features = tf.nn.embedding_lookup(self.features, samples_arg) 164 | 165 | batch_out = aggregator((samples_features, samples_weights, bs, FLAGS.k_RPR)) 166 | 167 | # out should be bs * d 168 | return batch_out 169 | 170 | def _build(self): 171 | labels = tf.reshape(tf.cast(self.train_labels, dtype = tf.int64), 172 | [self.batchsize, 1]) 173 | self.neg_samples, _, _ = (tf.nn.fixed_unigram_candidate_sampler( 174 | true_classes = labels, 175 | num_true = 1, 176 | num_sampled = FLAGS.neg_sample_size, 177 | unique = True, 178 | range_max = len(self.degrees), 179 | distortion = 0.75, 180 | unigrams = self.degrees.tolist())) 181 | 182 | self.train_inputs_f = self.sample_aggregate(self.train_inputs, self.batchsize, self.aggregator_t) 183 | self.train_labels_f = self.sample_aggregate(self.train_labels, self.batchsize, self.aggregator_t) 184 | self.neg_samples_f = self.sample_aggregate(self.neg_samples, FLAGS.neg_sample_size, self.aggregator_t) 185 | 186 | self.embed = tf.nn.embedding_lookup(self.embeddings, self.train_inputs) 187 | self.true_w = 
tf.nn.embedding_lookup(self.nce_weights, self.train_labels) 188 | self.false_w = tf.nn.embedding_lookup(self.nce_weights, self.neg_samples) 189 | 190 | self.train_inputs_all = tf.add(self.train_inputs_f, self.embed) 191 | self.train_labels_all = tf.add(self.train_labels_f, self.true_w) 192 | self.neg_samples_all = tf.add(self.neg_samples_f, self.false_w) 193 | 194 | def build(self): 195 | self._build() 196 | self._loss() 197 | 198 | self._minimize_2() 199 | 200 | def _minimize(self): 201 | self.opt_op = self.optimizer.minimize(self.loss) 202 | 203 | def _minimize_2(self): 204 | var_list1 = [var for var in tf.trainable_variables() 205 | if var.name == "embeddings:0" or var.name == "nce_weights:0"] 206 | var_list2 = [var for var in tf.trainable_variables() if var not in var_list1] 207 | opt2 = tf.train.AdamOptimizer(learning_rate = self.lr) 208 | opt1 = tf.train.AdamOptimizer(learning_rate = 1e-5) 209 | grads = tf.gradients(self.loss, var_list1 + var_list2) 210 | grads1 = grads[:len(var_list1)] 211 | grads2 = grads[len(var_list1):] 212 | train_op1 = opt1.apply_gradients(zip(grads1, var_list1)) 213 | train_op2 = opt2.apply_gradients(zip(grads2, var_list2)) 214 | self.opt_op = tf.group(train_op1, train_op2) 215 | 216 | def _loss(self): 217 | p1 = tf.reduce_sum(tf.multiply(self.train_inputs_f, self.train_labels_f), 1) 218 | p1 = tf.log(tf.sigmoid(p1) + 0.001) 219 | 220 | p2 = tf.reduce_sum(tf.matmul(self.train_inputs_f, tf.transpose(self.neg_samples_f))) 221 | p2 = tf.log(tf.sigmoid(-p2) + 0.001) 222 | 223 | p3 = tf.reduce_sum(tf.multiply(self.embed, self.true_w), 1) 224 | p3 = tf.log(tf.sigmoid(p3) + 0.001) 225 | 226 | p4 = tf.reduce_sum(tf.matmul(self.embed, tf.transpose(self.false_w))) 227 | p4 = tf.log(tf.sigmoid(-p4) + 0.001) 228 | 229 | p5 = tf.reduce_sum(tf.multiply(self.embed, self.train_labels_f), 1) 230 | p5 = tf.log(tf.sigmoid(p5) + 0.001) 231 | 232 | p6 = tf.reduce_sum(tf.matmul(self.embed, tf.transpose(self.neg_samples_f))) 233 | p6 = tf.log(tf.sigmoid(-p6) + 0.001) 234 | 235 | p7 = tf.reduce_sum(tf.multiply(self.true_w, self.train_inputs_f), 1) 236 | p7 = tf.log(tf.sigmoid(p7) + 0.001) 237 | 238 | p8 = tf.reduce_sum(tf.matmul(self.true_w, tf.transpose(self.neg_samples_f))) 239 | p8 = tf.log(tf.sigmoid(-p8) + 0.001) 240 | 241 | rho1 = 1.5 242 | rho2 = 0.75 243 | rho3 = 1.5 244 | temp_loss = rho1*(p1+p2)+rho2*(p3+p4)+rho3*(p5+p6)+rho3*(p7+p8) 245 | self.loss += -tf.reduce_sum(temp_loss) / tf.cast(self.batchsize, tf.float32) 246 | tf.summary.scalar('loss', self.loss) 247 | 248 | class AggregateModel(GeneralizedModel): 249 | def __init__(self, placeholders, features, dict_size, degree_permuted, rpr_matrix, 250 | rpr_arg, dropout = 0., nodevec_dim = 200, lr = 0.001, only_f = False, **kwargs): 251 | """ 252 | Aggregate feature informations of the neighbors of the current node, 253 | weighted by Rooted PageRank vector of the current node. 
254 | """ 255 | 256 | super(AggregateModel, self).__init__(**kwargs) 257 | 258 | self.placeholders = placeholders 259 | self.degrees = degree_permuted 260 | self.only_f = only_f 261 | self.rpr_arg = tf.Variable(tf.constant(rpr_arg, dtype = tf.int64), trainable = False) 262 | self.rpr_matrix = tf.Variable(tf.constant(rpr_matrix, dtype = tf.float32), trainable = False) 263 | self.dropout = dropout 264 | self.feature_dim = features.shape[1] 265 | self.features = tf.Variable(tf.constant(features, dtype = tf.float32), trainable = False) 266 | self.train_inputs = placeholders["train_inputs"] 267 | self.train_labels = placeholders["train_labels"] 268 | self.batchsize = placeholders["batchsize"] 269 | self.dim = dict_size 270 | self.nodevec_dim = nodevec_dim 271 | 272 | self.embeddings = inits.glorot([dict_size, nodevec_dim], name = "embeddings") 273 | self.nce_weights = inits.glorot([dict_size, nodevec_dim], name = "nce_weights") 274 | 275 | self.aggregator_t = WeightedAggregator(self.feature_dim, self.nodevec_dim, dropout = self.dropout, 276 | name = 'true_agg') 277 | self.optimizer = tf.train.AdamOptimizer(learning_rate = lr) 278 | 279 | self.build() 280 | 281 | def sample_aggregate(self, input_args, bs, aggregator): 282 | samples_arg = tf.nn.embedding_lookup(self.rpr_arg, input_args) 283 | samples_weights = tf.nn.embedding_lookup(self.rpr_matrix, input_args) 284 | samples_features = tf.nn.embedding_lookup(self.features, samples_arg) 285 | 286 | batch_out = aggregator((samples_features, samples_weights, bs, FLAGS.k_RPR)) 287 | 288 | # out should be bs * d 289 | return batch_out 290 | 291 | def _build(self): 292 | labels = tf.reshape(tf.cast(self.train_labels, dtype = tf.int64), 293 | [self.batchsize, 1]) 294 | self.neg_samples, _, _ = (tf.nn.fixed_unigram_candidate_sampler( 295 | true_classes = labels, 296 | num_true = 1, 297 | num_sampled = FLAGS.neg_sample_size, 298 | unique = True, 299 | range_max = len(self.degrees), 300 | distortion = 0.75, 301 | unigrams = self.degrees.tolist())) 302 | 303 | self.train_inputs_f = self.sample_aggregate(self.train_inputs, self.batchsize, self.aggregator_t) 304 | self.train_labels_f = self.sample_aggregate(self.train_labels, self.batchsize, self.aggregator_t) 305 | self.neg_samples_f = self.sample_aggregate(self.neg_samples, FLAGS.neg_sample_size, self.aggregator_t) 306 | 307 | self.embed = tf.nn.embedding_lookup(self.embeddings, self.train_inputs) 308 | self.true_w = tf.nn.embedding_lookup(self.nce_weights, self.train_labels) 309 | self.false_w = tf.nn.embedding_lookup(self.nce_weights, self.neg_samples) 310 | 311 | self.train_inputs_all = tf.add(self.train_inputs_f, self.embed) 312 | self.train_labels_all = tf.add(self.train_labels_f, self.true_w) 313 | self.neg_samples_all = tf.add(self.neg_samples_f, self.false_w) 314 | 315 | def build(self): 316 | self._build() 317 | if self.only_f: 318 | self._f_loss() 319 | else: 320 | self._loss() 321 | self._minimize() 322 | 323 | def _minimize(self): 324 | self.opt_op = self.optimizer.minimize(self.loss) 325 | 326 | def _f_loss(self): 327 | p1 = tf.reduce_sum(tf.multiply(self.train_inputs_f, self.train_labels_f), 1) 328 | p1 = tf.log(tf.sigmoid(p1) + 0.001) 329 | 330 | p2 = tf.reduce_sum(tf.matmul(self.train_inputs_f, tf.transpose(self.neg_samples_f))) 331 | p2 = tf.log(tf.sigmoid(-p2) + 0.001) 332 | 333 | temp_loss = p1 + p2 334 | self.loss=-tf.reduce_sum(temp_loss) / tf.cast(self.batchsize, tf.float32) 335 | tf.summary.scalar('loss', self.loss) 336 | 337 | def _loss(self): 338 | p1 = 
tf.reduce_sum(tf.multiply(self.train_inputs_f, self.train_labels_f), 1) 339 | p1 = tf.log(tf.sigmoid(p1) + 0.001) 340 | 341 | p2 = tf.reduce_sum(tf.matmul(self.train_inputs_f, tf.transpose(self.neg_samples_f))) 342 | p2 = tf.log(tf.sigmoid(-p2) + 0.001) 343 | 344 | p3 = tf.reduce_sum(tf.multiply(self.embed, self.true_w), 1) 345 | p3 = tf.log(tf.sigmoid(p3) + 0.001) 346 | 347 | p4 = tf.reduce_sum(tf.matmul(self.embed, tf.transpose(self.false_w))) 348 | p4 = tf.log(tf.sigmoid(-p4) + 0.001) 349 | 350 | p5 = tf.reduce_sum(tf.multiply(self.embed, self.train_labels_f), 1) 351 | p5 = tf.log(tf.sigmoid(p5) + 0.001) 352 | 353 | p6 = tf.reduce_sum(tf.matmul(self.embed, tf.transpose(self.neg_samples_f))) 354 | p6 = tf.log(tf.sigmoid(-p6) + 0.001) 355 | 356 | p7 = tf.reduce_sum(tf.multiply(self.true_w, self.train_inputs_f), 1) 357 | p7 = tf.log(tf.sigmoid(p7) + 0.001) 358 | 359 | p8 = tf.reduce_sum(tf.matmul(self.true_w, tf.transpose(self.neg_samples_f))) 360 | p8 = tf.log(tf.sigmoid(-p8) + 0.001) 361 | 362 | rho1 = 1.5 363 | rho2 = 0.75 364 | rho3 = 1.5 365 | temp_loss = rho1*(p1+p2)+rho2*(p3+p4)+rho3*(p5+p6)+rho3*(p7+p8) 366 | self.loss += -tf.reduce_sum(temp_loss) / tf.cast(self.batchsize, tf.float32) 367 | tf.summary.scalar('loss', self.loss) -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import numpy as np 5 | from scipy import sparse 6 | from io import open 7 | import json 8 | import random 9 | import tensorflow as tf 10 | import sys 11 | from utils import load_pdata, cos_sim, cost 12 | from sklearn.linear_model import LogisticRegression 13 | from sklearn.metrics import accuracy_score, f1_score 14 | from sklearn.multiclass import OneVsRestClassifier 15 | from itertools import izip 16 | from sklearn.utils import shuffle as skshuffle 17 | from collections import defaultdict 18 | from scipy.stats import pearsonr, spearmanr 19 | from fastdtw import fastdtw 20 | 21 | 22 | def compute_correlation(dataset, embeddings, rpr_matrix): 23 | graph, _, _, _ = load_pdata(dataset) 24 | eu_dists = [] 25 | stru_dists = [] 26 | for node in graph: 27 | for nei in graph[node]: 28 | if node == nei: 29 | continue 30 | dist_eu = np.linalg.norm(embeddings[node] - embeddings[nei]) 31 | dist_stru, _ = fastdtw(embeddings[node], embeddings[nei], radius = 1, dist = cost) 32 | eu_dists.append(dist_eu) 33 | stru_dists.append(dist_stru) 34 | pear_rho, pear_p = pearsonr(stru_dists, eu_dists) 35 | spea_rho, spea_p = spearmanr(stru_dists, eu_dists) 36 | return "P ratio and p: {:.2f} + {:.2f}, S ratio and p: {:.2f} + {:.2f}".format(pear_rho, pear_p, spea_rho, spea_p) 37 | 38 | class TopKRanker(OneVsRestClassifier): 39 | def predict(self, X, top_k_list): 40 | assert X.shape[0] == len(top_k_list) 41 | probs = np.asarray(super(TopKRanker, self).predict_proba(X)) 42 | all_labels = [] 43 | for i, k in enumerate(top_k_list): 44 | probs_ = probs[i, :] 45 | labels = self.classes_[probs_.argsort()[-k:]].tolist() 46 | all_labels.append(labels) 47 | return all_labels 48 | 49 | def feature_test(dataset, train_embeddings, test_embeddings): 50 | if dataset == 'cora': 51 | classes = 7 52 | elif dataset == 'citeseer': 53 | classes = 6 54 | elif dataset == 'pubmed': 55 | classes = 3 56 | else: 57 | raise Exception('Error : wrong dataset name') 58 | 59 | _, _, train_data, test_data = load_pdata(dataset) 60 | 61 | test_l = test_data[:, 1] 62 | test_label = [] 63 | 
for i in xrange(test_data.shape[0]): 64 | temp = [0] * classes 65 | temp[test_data[i][1] - 1] += 1 66 | test_label.append(temp) 67 | test_label = np.array(test_label) #1000 * 6 68 | 69 | train_l = train_data[:, 1] 70 | train_label = [] 71 | for i in xrange(train_data.shape[0]): 72 | temp = [0] * classes 73 | temp[train_data[i][1] - 1] += 1 74 | train_label.append(temp) 75 | train_label = np.array(train_label) #120 * 6 76 | 77 | test_in = np.asarray(test_embeddings) 78 | train_in = np.asarray(train_embeddings) 79 | 80 | y_train_ = sparse.coo_matrix(train_label) 81 | y_train = [[] for x in xrange(y_train_.shape[0])] 82 | cy = y_train_.tocoo() 83 | for i, j in izip(cy.row, cy.col): 84 | y_train[i].append(j) 85 | 86 | assert sum(len(l) for l in y_train) == y_train_.nnz 87 | 88 | y_test_ = sparse.coo_matrix(test_label) 89 | 90 | y_test = [[] for x in xrange(y_test_.shape[0])] 91 | cy = y_test_.tocoo() 92 | for i, j in izip(cy.row, cy.col): 93 | y_test[i].append(j) 94 | y_train = np.array(y_train) 95 | #y_test = np.array(y_test) 96 | 97 | clf = TopKRanker(LogisticRegression()) 98 | clf.fit(train_in, y_train) 99 | 100 | top_k_list = [len(l) for l in y_test] 101 | preds = clf.predict(test_in, top_k_list) 102 | acc = accuracy_score(y_test, preds) 103 | return acc 104 | 105 | if __name__ == '__main__': 106 | prefix = sys.argv[1] 107 | mask_rate = float(sys.argv[2]) 108 | G, feats, train_data, test_data = load_pdata(prefix) 109 | features = np.asarray(feats.todense()) 110 | 111 | test_id = test_data[:, 0] 112 | train_id = train_data[:, 0] 113 | 114 | feat_train = [] 115 | feat_test = [] 116 | for id_ in train_id: 117 | feat_train.append(features[id_]) 118 | for id_ in test_id: 119 | feat_test.append(features[id_]) 120 | 121 | acc_f = feature_test(prefix, feat_train, feat_test) 122 | print ("feats: {:.3f}".format(acc_f)) -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import os 8 | import io 9 | import time 10 | import numpy as np 11 | import scipy.io as sio 12 | from scipy import sparse as sp 13 | from scipy import spatial 14 | import cPickle as pkl 15 | import networkx as nx 16 | import random 17 | import math 18 | 19 | from collections import Counter 20 | 21 | import graph 22 | 23 | def cost(a,b): 24 | ep = 0.001 25 | m = max(a,b) + ep 26 | mi = min(a,b) + ep 27 | return ((m/mi) - 1) 28 | 29 | def cos_sim(node_vec, neb_vec): 30 | cos_ = 1 - spatial.distance.cosine(node_vec, neb_vec) 31 | return cos_ 32 | 33 | def create_degree(G): 34 | print (" - Creating degree vectors...") 35 | degrees = {} 36 | degrees_sorted = set() 37 | degree_permuted = np.zeros((len(G.keys()), )) 38 | for v in G.keys(): 39 | degree = len(G[v]) 40 | degrees_sorted.add(degree) 41 | degree_permuted[v] = degree 42 | if(degree not in degrees): 43 | degrees[degree] = {} 44 | degrees[degree]['vertices'] = [] 45 | degrees[degree]['vertices'].append(v) 46 | degrees_sorted = np.array(list(degrees_sorted),dtype='int') 47 | #degree_permuted = degrees_sorted 48 | degrees_sorted = np.sort(degrees_sorted) 49 | l = len(degrees_sorted) 50 | for index, degree in enumerate(degrees_sorted): 51 | if(index > 0): 52 | degrees[degree]['before'] = degrees_sorted[index - 1] 53 | if(index < (l - 1)): 54 | degrees[degree]['after'] = degrees_sorted[index + 1] 55 | print ("- Degree vectors 
created.") 56 | return degrees, degree_permuted 57 | 58 | def verifyDegrees(degree_v_root,degree_a,degree_b): 59 | 60 | if(degree_b == -1): 61 | degree_now = degree_a 62 | elif(degree_a == -1): 63 | degree_now = degree_b 64 | elif(abs(degree_b - degree_v_root) < abs(degree_a - degree_v_root)): 65 | degree_now = degree_b 66 | else: 67 | degree_now = degree_a 68 | 69 | return degree_now 70 | 71 | def dump_to_disk(f, file_name): 72 | with open(file_name + '.pkl', 'wb') as handle: 73 | pkl.dump(f, handle, protocol = pkl.HIGHEST_PROTOCOL) 74 | 75 | def load_pkl(file_name): 76 | with open(file_name + '.pkl', 'rb') as handle: 77 | val = pkl.load(handle) 78 | return val 79 | 80 | def sparse_to_tuple(sparse_mx): 81 | if not sp.isspmatrix_coo(sparse_mx): 82 | sparse_mx = sparse_mx.tocoo() 83 | coords = np.vstack((sparse_mx.row, sparse_mx.col)).transpose() 84 | values = sparse_mx.data 85 | shape = sparse_mx.shape 86 | return coords, values, shape 87 | 88 | def parse_index_file(filename): 89 | index = [] 90 | for line in open(filename): 91 | index.append(int(line.strip())) 92 | return index 93 | 94 | def sample_mask(idx, l): 95 | """Create mask.""" 96 | mask = np.zeros(l) 97 | mask[idx] = 1 98 | return np.array(mask, dtype=np.bool) 99 | 100 | def load_pdata(dataset_str): 101 | if dataset_str != 'cora' and dataset_str != 'citeseer' and dataset_str != 'pubmed': 102 | print ('Use datasets other than Planetoid, change load functions') 103 | pass 104 | names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph'] 105 | objects = [] 106 | for i in xrange(len(names)): 107 | objects.append(pkl.load(open("./data/ind.{}.{}".format(dataset_str, names[i])))) 108 | x, y, tx, ty, allx, ally, graph = tuple(objects) 109 | test_idx_reorder = parse_index_file("./data/ind.{}.test.index".format(dataset_str)) 110 | test_idx_range = np.sort(test_idx_reorder) 111 | if dataset_str == 'citeseer': 112 | test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder)+1) 113 | tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1])) 114 | tx_extended[test_idx_range-min(test_idx_range), :] = tx 115 | tx = tx_extended 116 | ty_extended = np.zeros((len(test_idx_range_full), y.shape[1])) 117 | ty_extended[test_idx_range-min(test_idx_range), :] = ty 118 | ty = ty_extended 119 | features = sp.vstack((allx, tx)).tolil() 120 | features[test_idx_reorder, :] = features[test_idx_range, :] 121 | 122 | labels = np.vstack((ally, ty)) 123 | labels[test_idx_reorder, :] = labels[test_idx_range, :] 124 | 125 | idx_test = test_idx_range.tolist() 126 | idx_train = range(len(y)) 127 | 128 | train_mask = sample_mask(idx_train, labels.shape[0]) 129 | test_mask = sample_mask(idx_test, labels.shape[0]) 130 | 131 | y_train = np.zeros(labels.shape) 132 | y_test = np.zeros(labels.shape) 133 | y_train[train_mask, :] = labels[train_mask, :] 134 | y_test[test_mask, :] = labels[test_mask, :] 135 | 136 | train_out = [] 137 | for i in idx_train: 138 | ll = y_train[i].tolist() 139 | ll = ll.index(1) + 1 140 | train_out.append([i, ll]) 141 | train_out = np.array(train_out) 142 | 143 | test_out = [] 144 | for i in idx_test: 145 | ll = y_test[i].tolist() 146 | ll = ll.index(1) + 1 147 | test_out.append([i, ll]) 148 | test_out = np.array(test_out) 149 | adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph)) 150 | adj = adj - sp.dia_matrix((adj.diagonal()[np.newaxis, :], [0]), shape=adj.shape) 151 | adj.eliminate_zeros() 152 | # Check that diag is zero: 153 | assert np.diag(adj.todense()).sum() == 0 154 | 155 | adj_triu = sp.triu(adj) 156 | adj_tuple = 
sparse_to_tuple(adj_triu) 157 | edges = adj_tuple[0] 158 | edges_all = sparse_to_tuple(adj)[0] 159 | 160 | num_mask = int(np.floor(edges.shape[0] / 10.)) 161 | 162 | return graph, features, train_out, test_out 163 | --------------------------------------------------------------------------------
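
Two operations recur throughout ``models.py``: the Rooted-PageRank-weighted feature aggregation performed by ``WeightedAggregator`` via ``sample_aggregate``, and the ``log(sigmoid(...))`` pairwise terms assembled in ``_loss`` / ``_f_loss``. The NumPy sketch below restates those two steps for readability only; the array names, shapes, and toy sizes (``features``, ``rpr_arg``, ``rpr_matrix``, ``W_mlp``, ``N``, ``k``, ...) are illustrative assumptions, not the repository's exact tensors.

```python
import numpy as np

def rpr_aggregate(features, rpr_arg, rpr_matrix, node_ids, W_mlp, bias=None):
    """Aggregate the features of each node's top-k RPR neighbours
    (a NumPy analogue of sample_aggregate + WeightedAggregator).

    features   : (N, F) node feature matrix
    rpr_arg    : (N, k) indices of each node's k largest Rooted PageRank entries
    rpr_matrix : (N, k) the corresponding RPR weights
    node_ids   : (B,)   batch of node indices
    W_mlp      : (F, d) aggregator weight matrix
    """
    neigh_idx = rpr_arg[node_ids]                  # (B, k)
    weights = rpr_matrix[node_ids][..., None]      # (B, k, 1)
    neigh_feats = features[neigh_idx]              # (B, k, F)
    summed = (neigh_feats * weights).sum(axis=1)   # weighted sum over the k neighbours
    out = summed @ W_mlp                           # (B, d)
    if bias is not None:
        out = out + bias
    return np.maximum(out, 0.0)                    # ReLU activation

def pairwise_loss(u, v, negs, eps=1e-3):
    """The p1/p2 pattern from _f_loss: attract (u, v) pairs, repel negative samples."""
    sigmoid = lambda x: 1.0 / (1.0 + np.exp(-x))
    p1 = np.log(sigmoid((u * v).sum(axis=1)) + eps)   # one term per positive pair
    p2 = np.log(sigmoid(-(u @ negs.T).sum()) + eps)   # one term over all negative samples
    return -(p1 + p2).sum() / u.shape[0]

# Toy run with random data, only to show the shapes involved.
rng = np.random.RandomState(0)
N, F, d, k = 50, 16, 8, 5
features = rng.rand(N, F)
rpr_arg = rng.randint(0, N, size=(N, k))
rpr_matrix = rng.rand(N, k)
W_mlp = rng.randn(F, d) * 0.1

u = rpr_aggregate(features, rpr_arg, rpr_matrix, np.array([0, 1, 2, 3]), W_mlp)
v = rpr_aggregate(features, rpr_arg, rpr_matrix, np.array([4, 5, 6, 7]), W_mlp)
negs = rpr_aggregate(features, rpr_arg, rpr_matrix, np.array([8, 9]), W_mlp)
print(pairwise_loss(u, v, negs))
```

The full ``_loss`` combines four such pairs (feature-based, embedding-based, and two mixed terms) weighted by ``rho1``, ``rho2``, and ``rho3``.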
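
``PretrainModel._minimize_2`` splits the trainable variables into two groups: the node2vec-pretrained ``embeddings`` / ``nce_weights`` are updated with a very small learning rate (1e-5), while all remaining parameters use the configured rate, so the pretrained vectors are fine-tuned rather than overwritten. The repository does this with two Adam optimizers joined by ``tf.group``; the sketch below only illustrates the per-group learning rates with plain gradient descent on a made-up least-squares objective (``E``, ``W``, ``x``, ``y`` are all assumed toy variables).

```python
import numpy as np

rng = np.random.RandomState(1)
x = rng.randn(200, 4)
y = x @ np.array([[1.0], [-2.0], [0.5], [3.0]])

E = rng.randn(4, 4)                  # stands in for the pretrained embeddings / nce_weights
W = rng.randn(4, 1) * 0.1            # stands in for the freshly initialised aggregator weights
lr_pretrained, lr_new = 1e-5, 1e-3   # the two rates used by opt1 / opt2 in _minimize_2

for step in range(500):
    h = x @ E
    err = h @ W - y                      # gradient of 0.5 * mean squared error
    grad_W = h.T @ err / len(x)
    grad_E = x.T @ (err @ W.T) / len(x)
    W -= lr_new * grad_W                 # fast group: newly initialised parameters
    E -= lr_pretrained * grad_E          # slow group: pretrained parameters
```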
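
The evaluation in ``test.py`` scores a one-vs-rest logistic regression by letting each test node predict exactly as many labels as it truly has (``TopKRanker.predict`` with ``top_k_list``). The helper below is a standalone restatement of that selection step; the probability matrix and class array are made-up inputs.

```python
import numpy as np

def top_k_predict(probs, top_k_list, classes):
    """For each row, return the k classes with the highest predicted probability,
    where k is that sample's number of true labels (as in TopKRanker.predict)."""
    all_labels = []
    for row, k in zip(probs, top_k_list):
        all_labels.append(sorted(classes[np.argsort(row)[-k:]].tolist()))
    return all_labels

probs = np.array([[0.1, 0.7, 0.2],
                  [0.5, 0.3, 0.2]])
classes = np.array([0, 1, 2])
print(top_k_predict(probs, [1, 2], classes))   # [[1], [0, 1]]
```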