├── .gitignore ├── LICENSE ├── README.md ├── SGC-tuning ├── citeseer.txt ├── cora.txt └── pubmed.txt ├── args.py ├── citation.py ├── data ├── ind.citeseer.allx ├── ind.citeseer.ally ├── ind.citeseer.graph ├── ind.citeseer.test.index ├── ind.citeseer.tx ├── ind.citeseer.ty ├── ind.citeseer.x ├── ind.citeseer.y ├── ind.cora.allx ├── ind.cora.ally ├── ind.cora.graph ├── ind.cora.test.index ├── ind.cora.tx ├── ind.cora.ty ├── ind.cora.x ├── ind.cora.y ├── ind.pubmed.allx ├── ind.pubmed.ally ├── ind.pubmed.graph ├── ind.pubmed.test.index ├── ind.pubmed.tx ├── ind.pubmed.ty ├── ind.pubmed.x └── ind.pubmed.y ├── downstream └── TextSGC │ ├── README.md │ ├── build_corpus.py │ ├── build_graph.py │ ├── models.py │ ├── remove_words.py │ ├── requirements.txt │ ├── train.py │ ├── tuned_result │ ├── 20ng.SGC.tuning.txt │ ├── R52.SGC.tuning.txt │ ├── R8.SGC.tuning.txt │ ├── mr.SGC.tuning.txt │ └── ohsumed.SGC.tuning.txt │ ├── tuning.py │ └── utils.py ├── metrics.py ├── model.jpg ├── models.py ├── normalization.py ├── reddit.py ├── requirements.txt ├── tuning.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | data/*reddit* 2 | downstream/TextSGC/data 3 | __pycache__/ 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License 2 | 3 | Copyright (c) 2019 Tianyi Zhang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Simplifying Graph Convolutional Networks 2 | 3 | [![made-with-python](https://img.shields.io/badge/Made%20with-Python-red.svg)](#python) 4 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) 5 | 6 | #### Updates 7 | * As pointed out by #23, there was a subtle bug in our preprocessing code for the reddit dataset. After fixing this bug, SGC achieves a F1 score of 95.0 (previously, it was 94.9). 8 | * Practical advice: it is often very helpful to normalize the features to have zero mean with standard deviation one to accelerate the convergence of SGC (and many other linear models). For example, we apply this normalization for [the reddit dataset](./utils.py#L119). Please consider doing this when applying SGC to other datasets. 
For some relevant discussions, see [Ross et al, 2013](https://arxiv.org/pdf/1305.6646.pdf) and [Li and Zhang, 1998](https://www.jstor.org/stable/25051187?seq=1#metadata_info_tab_contents). 9 | 10 | #### Authors: 11 | * [Felix Wu](https://scholar.google.com.tw/citations?user=sNL8SSoAAAAJ&hl=en)* 12 | * [Tianyi Zhang](https://scholar.google.com/citations?user=OI0HSa0AAAAJ&hl=en)* 13 | * [Amauri Holanda de Souza Jr.](https://scholar.google.com/citations?hl=en&user=lP0LBI4AAAAJ&view_op=list_works&sortby=pubdate)* 14 | * [Christopher Fifty](https://scholar.google.com/citations?user=lg2M2RYAAAAJ&hl=en) 15 | * [Tao Yu](http://jhc.sjtu.edu.cn/public/home/taoyu/) 16 | * [Kilian Q. Weinberger](http://kilian.cs.cornell.edu/index.html) 17 | 18 | *: Equal Contribution 19 | 20 | ### Overview 21 | This repo contains an example implementation of the Simple Graph Convolution 22 | (SGC) model, described in the ICML 2019 paper [Simplifying Graph Convolutional Networks](https://arxiv.org/abs/1902.07153). 23 | 24 | SGC removes the nonlinearities and collapses the weight matrices in Graph Convolutional Networks (GCNs) and is essentially a linear model. 25 | For an illustration: ![](./model.jpg "SGC") 26 | 27 | SGC achieves competitive performance while requiring much less training time. For reference, on a GTX 1080 Ti: 28 | 29 | Dataset | Metric | Training Time 30 | :------:|:------:|:-----------:| 31 | Cora | Acc: 81.0 % | 0.13s 32 | Citeseer| Acc: 71.9 % | 0.14s 33 | Pubmed | Acc: 78.9 % | 0.29s 34 | Reddit | F1: 94.9 % | 2.7s 35 | 36 | This home repo contains the implementation for the citation networks (Cora, Citeseer, and Pubmed) and the social network (Reddit). 37 | We have a work-in-progress branch, [ablation](https://github.com/Tiiiger/SGC/tree/ablation), which contains additional code for our ablation studies. 38 | 39 | If you find this repo useful, please cite: 40 | ``` 41 | @InProceedings{pmlr-v97-wu19e, 42 | title = {Simplifying Graph Convolutional Networks}, 43 | author = {Wu, Felix and Souza, Amauri and Zhang, Tianyi and Fifty, Christopher and Yu, Tao and Weinberger, Kilian}, 44 | booktitle = {Proceedings of the 36th International Conference on Machine Learning}, 45 | pages = {6861--6871}, 46 | year = {2019}, 47 | publisher = {PMLR}, 48 | } 49 | ``` 50 | 51 | #### Other reference implementations 52 | Other reference implementations can be found in the following libraries. Note that in 53 | these examples, the hyperparameters are potentially different and 54 | the results may differ from those reported in the paper. 55 | 56 | - *Deep Graph Library*: [example](https://github.com/dmlc/dgl/tree/master/examples/pytorch/sgc). 57 | - *PyTorch Geometric*: 58 | [documentation](https://pytorch-geometric.readthedocs.io/en/latest/modules/nn.html#torch_geometric.nn.conv.SGConv) 59 | and [example](https://github.com/rusty1s/pytorch_geometric/blob/master/examples/sgc.py). 
60 | - *Spektral*: [example](https://github.com/danielegrattarola/spektral/blob/master/examples/node_classification_simple_gc.py) 61 | - *StellarGraph*: [example](https://github.com/stellargraph/stellargraph/blob/develop/demos/node-classification/sgc/sgc-node-classification-example.ipynb) 62 | - *tf_geometric*: [example](https://github.com/CrawlScript/tf_geometric/blob/master/demo/demo_sgc.py) 63 | 64 | ### Dependencies 65 | Our implementation works with PyTorch>=1.0.0. Install the other dependencies with: `$ pip install -r requirements.txt` 66 | 67 | ### Data 68 | We provide the citation network datasets under `data/`, which correspond to [the public data splits](https://github.com/tkipf/gcn/tree/master/gcn/data). 69 | Due to space limits, please download the Reddit dataset from [FastGCN](https://github.com/matenure/FastGCN/issues/9) and put `reddit_adj.npz` and `reddit.npz` under `data/`. 70 | 71 | ### Usage 72 | Citation Networks: We tune the only hyperparameter, weight decay, with hyperopt and put the resulting hyperparameters under `SGC-tuning`. 73 | See `tuning.py` for more details on the hyperparameter optimization. 74 | ``` 75 | $ python citation.py --dataset cora --tuned 76 | $ python citation.py --dataset citeseer --tuned --epochs 150 77 | $ python citation.py --dataset pubmed --tuned 78 | ``` 79 | 80 | Reddit: 81 | ``` 82 | $ python reddit.py --inductive --test 83 | ``` 84 | ### Downstream 85 | We collect the code for downstream tasks under `downstream`. Currently, we 86 | are releasing only the SGC implementation for text classification. 87 | 88 | ### Acknowledgement 89 | This repo is modified from [pygcn](https://github.com/tkipf/pygcn) and [FastGCN](https://github.com/matenure/FastGCN). 90 | 91 | We thank the *Deep Graph Library* team for providing a reference implementation of SGC and for benchmarking SGC in *Deep Graph Library*. 92 | We thank Matthias Fey, author of *PyTorch Geometric*, for his help in providing a reference implementation of SGC within *PyTorch Geometric*. 93 | We thank Daniele Grattarola, author of *Spektral*, for his help in providing a reference implementation of SGC within *Spektral*. 
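#### Minimal example (illustrative)
To make the Overview concrete, the sketch below shows the core SGC recipe end to end: propagate the (optionally standardized) features with the normalized adjacency matrix a fixed number of times as a one-off preprocessing step, then train a single linear layer with cross-entropy. This is a simplified stand-in for `sgc_precompute` in `utils.py` and the training loop in `citation.py`, not a drop-in copy; the dense toy adjacency, tensor shapes, and hyperparameters here are made up for illustration (the repo uses sparse matrix operations).
```
import torch
import torch.nn.functional as F

def sgc_precompute(features, adj, degree=2):
    # X <- S^degree X, where S is the normalized adjacency (dense here for simplicity).
    for _ in range(degree):
        features = adj @ features
    return features

# Toy data: 5 nodes, 4 features, 3 classes; identity adjacency (self-loops only).
n_nodes, n_feats, n_classes = 5, 4, 3
features = torch.randn(n_nodes, n_feats)
adj = torch.eye(n_nodes)
labels = torch.randint(0, n_classes, (n_nodes,))

# Standardize features (zero mean, unit std), as recommended in the Updates section above.
features = (features - features.mean(dim=0)) / (features.std(dim=0) + 1e-8)

# After precomputation, SGC is just a regularized linear (logistic-regression) model.
features = sgc_precompute(features, adj, degree=2)
classifier = torch.nn.Linear(n_feats, n_classes)
optimizer = torch.optim.Adam(classifier.parameters(), lr=0.2, weight_decay=5e-6)
for epoch in range(100):
    optimizer.zero_grad()
    loss = F.cross_entropy(classifier(features), labels)
    loss.backward()
    optimizer.step()
```
Because all of the graph propagation happens before training, each epoch only touches a feature matrix and one weight matrix, which is where the training-time savings in the table above come from.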
94 | -------------------------------------------------------------------------------- /SGC-tuning/citeseer.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tiiiger/SGC/2c7a2727e82e462d8ef9d6e57f0b08888e16488f/SGC-tuning/citeseer.txt -------------------------------------------------------------------------------- /SGC-tuning/cora.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tiiiger/SGC/2c7a2727e82e462d8ef9d6e57f0b08888e16488f/SGC-tuning/cora.txt -------------------------------------------------------------------------------- /SGC-tuning/pubmed.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tiiiger/SGC/2c7a2727e82e462d8ef9d6e57f0b08888e16488f/SGC-tuning/pubmed.txt -------------------------------------------------------------------------------- /args.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | 4 | def get_citation_args(): 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument('--no-cuda', action='store_true', default=False, 7 | help='Disables CUDA training.') 8 | parser.add_argument('--seed', type=int, default=42, help='Random seed.') 9 | parser.add_argument('--epochs', type=int, default=100, 10 | help='Number of epochs to train.') 11 | parser.add_argument('--lr', type=float, default=0.2, 12 | help='Initial learning rate.') 13 | parser.add_argument('--weight_decay', type=float, default=5e-6, 14 | help='Weight decay (L2 loss on parameters).') 15 | parser.add_argument('--hidden', type=int, default=0, 16 | help='Number of hidden units.') 17 | parser.add_argument('--dropout', type=float, default=0, 18 | help='Dropout rate (1 - keep probability).') 19 | parser.add_argument('--dataset', type=str, default="cora", 20 | help='Dataset to use.') 21 | parser.add_argument('--model', type=str, default="SGC", 22 | choices=["SGC", "GCN"], 23 | help='model to use.') 24 | parser.add_argument('--feature', type=str, default="mul", 25 | choices=['mul', 'cat', 'adj'], 26 | help='feature-type') 27 | parser.add_argument('--normalization', type=str, default='AugNormAdj', 28 | choices=['AugNormAdj'], 29 | help='Normalization method for the adjacency matrix.') 30 | parser.add_argument('--degree', type=int, default=2, 31 | help='degree of the approximation.') 32 | parser.add_argument('--per', type=int, default=-1, 33 | help='Number of each nodes so as to balance.') 34 | parser.add_argument('--experiment', type=str, default="base-experiment", 35 | help='feature-type') 36 | parser.add_argument('--tuned', action='store_true', help='use tuned hyperparams') 37 | 38 | args, _ = parser.parse_known_args() 39 | args.cuda = not args.no_cuda and torch.cuda.is_available() 40 | return args 41 | -------------------------------------------------------------------------------- /citation.py: -------------------------------------------------------------------------------- 1 | import time 2 | import argparse 3 | import numpy as np 4 | import torch 5 | import torch.nn.functional as F 6 | import torch.optim as optim 7 | from utils import load_citation, sgc_precompute, set_seed 8 | from models import get_model 9 | from metrics import accuracy 10 | import pickle as pkl 11 | from args import get_citation_args 12 | from time import perf_counter 13 | 14 | # Arguments 15 | args = get_citation_args() 16 | 17 | if args.tuned: 18 | if args.model 
== "SGC": 19 | with open("{}-tuning/{}.txt".format(args.model, args.dataset), 'rb') as f: 20 | args.weight_decay = pkl.load(f)['weight_decay'] 21 | print("using tuned weight decay: {}".format(args.weight_decay)) 22 | else: 23 | raise NotImplemented 24 | 25 | # setting random seeds 26 | set_seed(args.seed, args.cuda) 27 | 28 | adj, features, labels, idx_train, idx_val, idx_test = load_citation(args.dataset, args.normalization, args.cuda) 29 | 30 | model = get_model(args.model, features.size(1), labels.max().item()+1, args.hidden, args.dropout, args.cuda) 31 | 32 | if args.model == "SGC": features, precompute_time = sgc_precompute(features, adj, args.degree) 33 | print("{:.4f}s".format(precompute_time)) 34 | 35 | def train_regression(model, 36 | train_features, train_labels, 37 | val_features, val_labels, 38 | epochs=args.epochs, weight_decay=args.weight_decay, 39 | lr=args.lr, dropout=args.dropout): 40 | 41 | optimizer = optim.Adam(model.parameters(), lr=lr, 42 | weight_decay=weight_decay) 43 | t = perf_counter() 44 | for epoch in range(epochs): 45 | model.train() 46 | optimizer.zero_grad() 47 | output = model(train_features) 48 | loss_train = F.cross_entropy(output, train_labels) 49 | loss_train.backward() 50 | optimizer.step() 51 | train_time = perf_counter()-t 52 | 53 | with torch.no_grad(): 54 | model.eval() 55 | output = model(val_features) 56 | acc_val = accuracy(output, val_labels) 57 | 58 | return model, acc_val, train_time 59 | 60 | def test_regression(model, test_features, test_labels): 61 | model.eval() 62 | return accuracy(model(test_features), test_labels) 63 | 64 | if args.model == "SGC": 65 | model, acc_val, train_time = train_regression(model, features[idx_train], labels[idx_train], features[idx_val], labels[idx_val], 66 | args.epochs, args.weight_decay, args.lr, args.dropout) 67 | acc_test = test_regression(model, features[idx_test], labels[idx_test]) 68 | 69 | print("Validation Accuracy: {:.4f} Test Accuracy: {:.4f}".format(acc_val, acc_test)) 70 | print("Pre-compute time: {:.4f}s, train time: {:.4f}s, total: {:.4f}s".format(precompute_time, train_time, precompute_time+train_time)) 71 | -------------------------------------------------------------------------------- /data/ind.citeseer.allx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tiiiger/SGC/2c7a2727e82e462d8ef9d6e57f0b08888e16488f/data/ind.citeseer.allx -------------------------------------------------------------------------------- /data/ind.citeseer.ally: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tiiiger/SGC/2c7a2727e82e462d8ef9d6e57f0b08888e16488f/data/ind.citeseer.ally -------------------------------------------------------------------------------- /data/ind.citeseer.graph: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tiiiger/SGC/2c7a2727e82e462d8ef9d6e57f0b08888e16488f/data/ind.citeseer.graph -------------------------------------------------------------------------------- /data/ind.citeseer.test.index: -------------------------------------------------------------------------------- 1 | 2488 2 | 2644 3 | 3261 4 | 2804 5 | 3176 6 | 2432 7 | 3310 8 | 2410 9 | 2812 10 | 2520 11 | 2994 12 | 3282 13 | 2680 14 | 2848 15 | 2670 16 | 3005 17 | 2977 18 | 2592 19 | 2967 20 | 2461 21 | 3184 22 | 2852 23 | 2768 24 | 2905 25 | 2851 26 | 3129 27 | 3164 28 | 2438 29 | 2793 30 | 2763 31 | 2528 32 | 2954 33 | 2347 34 | 
2640 35 | 3265 36 | 2874 37 | 2446 38 | 2856 39 | 3149 40 | 2374 41 | 3097 42 | 3301 43 | 2664 44 | 2418 45 | 2655 46 | 2464 47 | 2596 48 | 3262 49 | 3278 50 | 2320 51 | 2612 52 | 2614 53 | 2550 54 | 2626 55 | 2772 56 | 3007 57 | 2733 58 | 2516 59 | 2476 60 | 2798 61 | 2561 62 | 2839 63 | 2685 64 | 2391 65 | 2705 66 | 3098 67 | 2754 68 | 3251 69 | 2767 70 | 2630 71 | 2727 72 | 2513 73 | 2701 74 | 3264 75 | 2792 76 | 2821 77 | 3260 78 | 2462 79 | 3307 80 | 2639 81 | 2900 82 | 3060 83 | 2672 84 | 3116 85 | 2731 86 | 3316 87 | 2386 88 | 2425 89 | 2518 90 | 3151 91 | 2586 92 | 2797 93 | 2479 94 | 3117 95 | 2580 96 | 3182 97 | 2459 98 | 2508 99 | 3052 100 | 3230 101 | 3215 102 | 2803 103 | 2969 104 | 2562 105 | 2398 106 | 3325 107 | 2343 108 | 3030 109 | 2414 110 | 2776 111 | 2383 112 | 3173 113 | 2850 114 | 2499 115 | 3312 116 | 2648 117 | 2784 118 | 2898 119 | 3056 120 | 2484 121 | 3179 122 | 3132 123 | 2577 124 | 2563 125 | 2867 126 | 3317 127 | 2355 128 | 3207 129 | 3178 130 | 2968 131 | 3319 132 | 2358 133 | 2764 134 | 3001 135 | 2683 136 | 3271 137 | 2321 138 | 2567 139 | 2502 140 | 3246 141 | 2715 142 | 3066 143 | 2390 144 | 2381 145 | 3162 146 | 2741 147 | 2498 148 | 2790 149 | 3038 150 | 3321 151 | 2481 152 | 3050 153 | 3161 154 | 3122 155 | 2801 156 | 2957 157 | 3177 158 | 2965 159 | 2621 160 | 3208 161 | 2921 162 | 2802 163 | 2357 164 | 2677 165 | 2519 166 | 2860 167 | 2696 168 | 2368 169 | 3241 170 | 2858 171 | 2419 172 | 2762 173 | 2875 174 | 3222 175 | 3064 176 | 2827 177 | 3044 178 | 2471 179 | 3062 180 | 2982 181 | 2736 182 | 2322 183 | 2709 184 | 2766 185 | 2424 186 | 2602 187 | 2970 188 | 2675 189 | 3299 190 | 2554 191 | 2964 192 | 2597 193 | 2753 194 | 2979 195 | 2523 196 | 2912 197 | 2896 198 | 2317 199 | 3167 200 | 2813 201 | 2482 202 | 2557 203 | 3043 204 | 3244 205 | 2985 206 | 2460 207 | 2363 208 | 3272 209 | 3045 210 | 3192 211 | 2453 212 | 2656 213 | 2834 214 | 2443 215 | 3202 216 | 2926 217 | 2711 218 | 2633 219 | 2384 220 | 2752 221 | 3285 222 | 2817 223 | 2483 224 | 2919 225 | 2924 226 | 2661 227 | 2698 228 | 2361 229 | 2662 230 | 2819 231 | 3143 232 | 2316 233 | 3196 234 | 2739 235 | 2345 236 | 2578 237 | 2822 238 | 3229 239 | 2908 240 | 2917 241 | 2692 242 | 3200 243 | 2324 244 | 2522 245 | 3322 246 | 2697 247 | 3163 248 | 3093 249 | 3233 250 | 2774 251 | 2371 252 | 2835 253 | 2652 254 | 2539 255 | 2843 256 | 3231 257 | 2976 258 | 2429 259 | 2367 260 | 3144 261 | 2564 262 | 3283 263 | 3217 264 | 3035 265 | 2962 266 | 2433 267 | 2415 268 | 2387 269 | 3021 270 | 2595 271 | 2517 272 | 2468 273 | 3061 274 | 2673 275 | 2348 276 | 3027 277 | 2467 278 | 3318 279 | 2959 280 | 3273 281 | 2392 282 | 2779 283 | 2678 284 | 3004 285 | 2634 286 | 2974 287 | 3198 288 | 2342 289 | 2376 290 | 3249 291 | 2868 292 | 2952 293 | 2710 294 | 2838 295 | 2335 296 | 2524 297 | 2650 298 | 3186 299 | 2743 300 | 2545 301 | 2841 302 | 2515 303 | 2505 304 | 3181 305 | 2945 306 | 2738 307 | 2933 308 | 3303 309 | 2611 310 | 3090 311 | 2328 312 | 3010 313 | 3016 314 | 2504 315 | 2936 316 | 3266 317 | 3253 318 | 2840 319 | 3034 320 | 2581 321 | 2344 322 | 2452 323 | 2654 324 | 3199 325 | 3137 326 | 2514 327 | 2394 328 | 2544 329 | 2641 330 | 2613 331 | 2618 332 | 2558 333 | 2593 334 | 2532 335 | 2512 336 | 2975 337 | 3267 338 | 2566 339 | 2951 340 | 3300 341 | 2869 342 | 2629 343 | 2747 344 | 3055 345 | 2831 346 | 3105 347 | 3168 348 | 3100 349 | 2431 350 | 2828 351 | 2684 352 | 3269 353 | 2910 354 | 2865 355 | 2693 356 | 2884 357 | 3228 358 | 2783 359 | 3247 360 | 2770 361 | 3157 362 | 2421 363 | 
2382 364 | 2331 365 | 3203 366 | 3240 367 | 2351 368 | 3114 369 | 2986 370 | 2688 371 | 2439 372 | 2996 373 | 3079 374 | 3103 375 | 3296 376 | 2349 377 | 2372 378 | 3096 379 | 2422 380 | 2551 381 | 3069 382 | 2737 383 | 3084 384 | 3304 385 | 3022 386 | 2542 387 | 3204 388 | 2949 389 | 2318 390 | 2450 391 | 3140 392 | 2734 393 | 2881 394 | 2576 395 | 3054 396 | 3089 397 | 3125 398 | 2761 399 | 3136 400 | 3111 401 | 2427 402 | 2466 403 | 3101 404 | 3104 405 | 3259 406 | 2534 407 | 2961 408 | 3191 409 | 3000 410 | 3036 411 | 2356 412 | 2800 413 | 3155 414 | 3224 415 | 2646 416 | 2735 417 | 3020 418 | 2866 419 | 2426 420 | 2448 421 | 3226 422 | 3219 423 | 2749 424 | 3183 425 | 2906 426 | 2360 427 | 2440 428 | 2946 429 | 2313 430 | 2859 431 | 2340 432 | 3008 433 | 2719 434 | 3058 435 | 2653 436 | 3023 437 | 2888 438 | 3243 439 | 2913 440 | 3242 441 | 3067 442 | 2409 443 | 3227 444 | 2380 445 | 2353 446 | 2686 447 | 2971 448 | 2847 449 | 2947 450 | 2857 451 | 3263 452 | 3218 453 | 2861 454 | 3323 455 | 2635 456 | 2966 457 | 2604 458 | 2456 459 | 2832 460 | 2694 461 | 3245 462 | 3119 463 | 2942 464 | 3153 465 | 2894 466 | 2555 467 | 3128 468 | 2703 469 | 2323 470 | 2631 471 | 2732 472 | 2699 473 | 2314 474 | 2590 475 | 3127 476 | 2891 477 | 2873 478 | 2814 479 | 2326 480 | 3026 481 | 3288 482 | 3095 483 | 2706 484 | 2457 485 | 2377 486 | 2620 487 | 2526 488 | 2674 489 | 3190 490 | 2923 491 | 3032 492 | 2334 493 | 3254 494 | 2991 495 | 3277 496 | 2973 497 | 2599 498 | 2658 499 | 2636 500 | 2826 501 | 3148 502 | 2958 503 | 3258 504 | 2990 505 | 3180 506 | 2538 507 | 2748 508 | 2625 509 | 2565 510 | 3011 511 | 3057 512 | 2354 513 | 3158 514 | 2622 515 | 3308 516 | 2983 517 | 2560 518 | 3169 519 | 3059 520 | 2480 521 | 3194 522 | 3291 523 | 3216 524 | 2643 525 | 3172 526 | 2352 527 | 2724 528 | 2485 529 | 2411 530 | 2948 531 | 2445 532 | 2362 533 | 2668 534 | 3275 535 | 3107 536 | 2496 537 | 2529 538 | 2700 539 | 2541 540 | 3028 541 | 2879 542 | 2660 543 | 3324 544 | 2755 545 | 2436 546 | 3048 547 | 2623 548 | 2920 549 | 3040 550 | 2568 551 | 3221 552 | 3003 553 | 3295 554 | 2473 555 | 3232 556 | 3213 557 | 2823 558 | 2897 559 | 2573 560 | 2645 561 | 3018 562 | 3326 563 | 2795 564 | 2915 565 | 3109 566 | 3086 567 | 2463 568 | 3118 569 | 2671 570 | 2909 571 | 2393 572 | 2325 573 | 3029 574 | 2972 575 | 3110 576 | 2870 577 | 3284 578 | 2816 579 | 2647 580 | 2667 581 | 2955 582 | 2333 583 | 2960 584 | 2864 585 | 2893 586 | 2458 587 | 2441 588 | 2359 589 | 2327 590 | 3256 591 | 3099 592 | 3073 593 | 3138 594 | 2511 595 | 2666 596 | 2548 597 | 2364 598 | 2451 599 | 2911 600 | 3237 601 | 3206 602 | 3080 603 | 3279 604 | 2934 605 | 2981 606 | 2878 607 | 3130 608 | 2830 609 | 3091 610 | 2659 611 | 2449 612 | 3152 613 | 2413 614 | 2722 615 | 2796 616 | 3220 617 | 2751 618 | 2935 619 | 3238 620 | 2491 621 | 2730 622 | 2842 623 | 3223 624 | 2492 625 | 3074 626 | 3094 627 | 2833 628 | 2521 629 | 2883 630 | 3315 631 | 2845 632 | 2907 633 | 3083 634 | 2572 635 | 3092 636 | 2903 637 | 2918 638 | 3039 639 | 3286 640 | 2587 641 | 3068 642 | 2338 643 | 3166 644 | 3134 645 | 2455 646 | 2497 647 | 2992 648 | 2775 649 | 2681 650 | 2430 651 | 2932 652 | 2931 653 | 2434 654 | 3154 655 | 3046 656 | 2598 657 | 2366 658 | 3015 659 | 3147 660 | 2944 661 | 2582 662 | 3274 663 | 2987 664 | 2642 665 | 2547 666 | 2420 667 | 2930 668 | 2750 669 | 2417 670 | 2808 671 | 3141 672 | 2997 673 | 2995 674 | 2584 675 | 2312 676 | 3033 677 | 3070 678 | 3065 679 | 2509 680 | 3314 681 | 2396 682 | 2543 683 | 2423 684 | 3170 685 | 2389 686 | 
3289 687 | 2728 688 | 2540 689 | 2437 690 | 2486 691 | 2895 692 | 3017 693 | 2853 694 | 2406 695 | 2346 696 | 2877 697 | 2472 698 | 3210 699 | 2637 700 | 2927 701 | 2789 702 | 2330 703 | 3088 704 | 3102 705 | 2616 706 | 3081 707 | 2902 708 | 3205 709 | 3320 710 | 3165 711 | 2984 712 | 3185 713 | 2707 714 | 3255 715 | 2583 716 | 2773 717 | 2742 718 | 3024 719 | 2402 720 | 2718 721 | 2882 722 | 2575 723 | 3281 724 | 2786 725 | 2855 726 | 3014 727 | 2401 728 | 2535 729 | 2687 730 | 2495 731 | 3113 732 | 2609 733 | 2559 734 | 2665 735 | 2530 736 | 3293 737 | 2399 738 | 2605 739 | 2690 740 | 3133 741 | 2799 742 | 2533 743 | 2695 744 | 2713 745 | 2886 746 | 2691 747 | 2549 748 | 3077 749 | 3002 750 | 3049 751 | 3051 752 | 3087 753 | 2444 754 | 3085 755 | 3135 756 | 2702 757 | 3211 758 | 3108 759 | 2501 760 | 2769 761 | 3290 762 | 2465 763 | 3025 764 | 3019 765 | 2385 766 | 2940 767 | 2657 768 | 2610 769 | 2525 770 | 2941 771 | 3078 772 | 2341 773 | 2916 774 | 2956 775 | 2375 776 | 2880 777 | 3009 778 | 2780 779 | 2370 780 | 2925 781 | 2332 782 | 3146 783 | 2315 784 | 2809 785 | 3145 786 | 3106 787 | 2782 788 | 2760 789 | 2493 790 | 2765 791 | 2556 792 | 2890 793 | 2400 794 | 2339 795 | 3201 796 | 2818 797 | 3248 798 | 3280 799 | 2570 800 | 2569 801 | 2937 802 | 3174 803 | 2836 804 | 2708 805 | 2820 806 | 3195 807 | 2617 808 | 3197 809 | 2319 810 | 2744 811 | 2615 812 | 2825 813 | 2603 814 | 2914 815 | 2531 816 | 3193 817 | 2624 818 | 2365 819 | 2810 820 | 3239 821 | 3159 822 | 2537 823 | 2844 824 | 2758 825 | 2938 826 | 3037 827 | 2503 828 | 3297 829 | 2885 830 | 2608 831 | 2494 832 | 2712 833 | 2408 834 | 2901 835 | 2704 836 | 2536 837 | 2373 838 | 2478 839 | 2723 840 | 3076 841 | 2627 842 | 2369 843 | 2669 844 | 3006 845 | 2628 846 | 2788 847 | 3276 848 | 2435 849 | 3139 850 | 3235 851 | 2527 852 | 2571 853 | 2815 854 | 2442 855 | 2892 856 | 2978 857 | 2746 858 | 3150 859 | 2574 860 | 2725 861 | 3188 862 | 2601 863 | 2378 864 | 3075 865 | 2632 866 | 2794 867 | 3270 868 | 3071 869 | 2506 870 | 3126 871 | 3236 872 | 3257 873 | 2824 874 | 2989 875 | 2950 876 | 2428 877 | 2405 878 | 3156 879 | 2447 880 | 2787 881 | 2805 882 | 2720 883 | 2403 884 | 2811 885 | 2329 886 | 2474 887 | 2785 888 | 2350 889 | 2507 890 | 2416 891 | 3112 892 | 2475 893 | 2876 894 | 2585 895 | 2487 896 | 3072 897 | 3082 898 | 2943 899 | 2757 900 | 2388 901 | 2600 902 | 3294 903 | 2756 904 | 3142 905 | 3041 906 | 2594 907 | 2998 908 | 3047 909 | 2379 910 | 2980 911 | 2454 912 | 2862 913 | 3175 914 | 2588 915 | 3031 916 | 3012 917 | 2889 918 | 2500 919 | 2791 920 | 2854 921 | 2619 922 | 2395 923 | 2807 924 | 2740 925 | 2412 926 | 3131 927 | 3013 928 | 2939 929 | 2651 930 | 2490 931 | 2988 932 | 2863 933 | 3225 934 | 2745 935 | 2714 936 | 3160 937 | 3124 938 | 2849 939 | 2676 940 | 2872 941 | 3287 942 | 3189 943 | 2716 944 | 3115 945 | 2928 946 | 2871 947 | 2591 948 | 2717 949 | 2546 950 | 2777 951 | 3298 952 | 2397 953 | 3187 954 | 2726 955 | 2336 956 | 3268 957 | 2477 958 | 2904 959 | 2846 960 | 3121 961 | 2899 962 | 2510 963 | 2806 964 | 2963 965 | 3313 966 | 2679 967 | 3302 968 | 2663 969 | 3053 970 | 2469 971 | 2999 972 | 3311 973 | 2470 974 | 2638 975 | 3120 976 | 3171 977 | 2689 978 | 2922 979 | 2607 980 | 2721 981 | 2993 982 | 2887 983 | 2837 984 | 2929 985 | 2829 986 | 3234 987 | 2649 988 | 2337 989 | 2759 990 | 2778 991 | 2771 992 | 2404 993 | 2589 994 | 3123 995 | 3209 996 | 2729 997 | 3252 998 | 2606 999 | 2579 1000 | 2552 1001 | -------------------------------------------------------------------------------- 
/data/ind.citeseer.tx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tiiiger/SGC/2c7a2727e82e462d8ef9d6e57f0b08888e16488f/data/ind.citeseer.tx -------------------------------------------------------------------------------- /data/ind.citeseer.ty: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tiiiger/SGC/2c7a2727e82e462d8ef9d6e57f0b08888e16488f/data/ind.citeseer.ty -------------------------------------------------------------------------------- /data/ind.citeseer.x: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tiiiger/SGC/2c7a2727e82e462d8ef9d6e57f0b08888e16488f/data/ind.citeseer.x -------------------------------------------------------------------------------- /data/ind.citeseer.y: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tiiiger/SGC/2c7a2727e82e462d8ef9d6e57f0b08888e16488f/data/ind.citeseer.y -------------------------------------------------------------------------------- /data/ind.cora.allx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tiiiger/SGC/2c7a2727e82e462d8ef9d6e57f0b08888e16488f/data/ind.cora.allx -------------------------------------------------------------------------------- /data/ind.cora.ally: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tiiiger/SGC/2c7a2727e82e462d8ef9d6e57f0b08888e16488f/data/ind.cora.ally -------------------------------------------------------------------------------- /data/ind.cora.graph: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tiiiger/SGC/2c7a2727e82e462d8ef9d6e57f0b08888e16488f/data/ind.cora.graph -------------------------------------------------------------------------------- /data/ind.cora.test.index: -------------------------------------------------------------------------------- 1 | 2692 2 | 2532 3 | 2050 4 | 1715 5 | 2362 6 | 2609 7 | 2622 8 | 1975 9 | 2081 10 | 1767 11 | 2263 12 | 1725 13 | 2588 14 | 2259 15 | 2357 16 | 1998 17 | 2574 18 | 2179 19 | 2291 20 | 2382 21 | 1812 22 | 1751 23 | 2422 24 | 1937 25 | 2631 26 | 2510 27 | 2378 28 | 2589 29 | 2345 30 | 1943 31 | 1850 32 | 2298 33 | 1825 34 | 2035 35 | 2507 36 | 2313 37 | 1906 38 | 1797 39 | 2023 40 | 2159 41 | 2495 42 | 1886 43 | 2122 44 | 2369 45 | 2461 46 | 1925 47 | 2565 48 | 1858 49 | 2234 50 | 2000 51 | 1846 52 | 2318 53 | 1723 54 | 2559 55 | 2258 56 | 1763 57 | 1991 58 | 1922 59 | 2003 60 | 2662 61 | 2250 62 | 2064 63 | 2529 64 | 1888 65 | 2499 66 | 2454 67 | 2320 68 | 2287 69 | 2203 70 | 2018 71 | 2002 72 | 2632 73 | 2554 74 | 2314 75 | 2537 76 | 1760 77 | 2088 78 | 2086 79 | 2218 80 | 2605 81 | 1953 82 | 2403 83 | 1920 84 | 2015 85 | 2335 86 | 2535 87 | 1837 88 | 2009 89 | 1905 90 | 2636 91 | 1942 92 | 2193 93 | 2576 94 | 2373 95 | 1873 96 | 2463 97 | 2509 98 | 1954 99 | 2656 100 | 2455 101 | 2494 102 | 2295 103 | 2114 104 | 2561 105 | 2176 106 | 2275 107 | 2635 108 | 2442 109 | 2704 110 | 2127 111 | 2085 112 | 2214 113 | 2487 114 | 1739 115 | 2543 116 | 1783 117 | 2485 118 | 2262 119 | 2472 120 | 2326 121 | 1738 122 | 2170 123 | 2100 124 | 2384 125 | 2152 126 | 2647 127 | 2693 128 | 2376 129 | 1775 130 | 1726 131 | 2476 132 | 2195 133 | 1773 134 | 1793 135 | 2194 136 | 2581 137 | 1854 138 | 2524 
139 | 1945 140 | 1781 141 | 1987 142 | 2599 143 | 1744 144 | 2225 145 | 2300 146 | 1928 147 | 2042 148 | 2202 149 | 1958 150 | 1816 151 | 1916 152 | 2679 153 | 2190 154 | 1733 155 | 2034 156 | 2643 157 | 2177 158 | 1883 159 | 1917 160 | 1996 161 | 2491 162 | 2268 163 | 2231 164 | 2471 165 | 1919 166 | 1909 167 | 2012 168 | 2522 169 | 1865 170 | 2466 171 | 2469 172 | 2087 173 | 2584 174 | 2563 175 | 1924 176 | 2143 177 | 1736 178 | 1966 179 | 2533 180 | 2490 181 | 2630 182 | 1973 183 | 2568 184 | 1978 185 | 2664 186 | 2633 187 | 2312 188 | 2178 189 | 1754 190 | 2307 191 | 2480 192 | 1960 193 | 1742 194 | 1962 195 | 2160 196 | 2070 197 | 2553 198 | 2433 199 | 1768 200 | 2659 201 | 2379 202 | 2271 203 | 1776 204 | 2153 205 | 1877 206 | 2027 207 | 2028 208 | 2155 209 | 2196 210 | 2483 211 | 2026 212 | 2158 213 | 2407 214 | 1821 215 | 2131 216 | 2676 217 | 2277 218 | 2489 219 | 2424 220 | 1963 221 | 1808 222 | 1859 223 | 2597 224 | 2548 225 | 2368 226 | 1817 227 | 2405 228 | 2413 229 | 2603 230 | 2350 231 | 2118 232 | 2329 233 | 1969 234 | 2577 235 | 2475 236 | 2467 237 | 2425 238 | 1769 239 | 2092 240 | 2044 241 | 2586 242 | 2608 243 | 1983 244 | 2109 245 | 2649 246 | 1964 247 | 2144 248 | 1902 249 | 2411 250 | 2508 251 | 2360 252 | 1721 253 | 2005 254 | 2014 255 | 2308 256 | 2646 257 | 1949 258 | 1830 259 | 2212 260 | 2596 261 | 1832 262 | 1735 263 | 1866 264 | 2695 265 | 1941 266 | 2546 267 | 2498 268 | 2686 269 | 2665 270 | 1784 271 | 2613 272 | 1970 273 | 2021 274 | 2211 275 | 2516 276 | 2185 277 | 2479 278 | 2699 279 | 2150 280 | 1990 281 | 2063 282 | 2075 283 | 1979 284 | 2094 285 | 1787 286 | 2571 287 | 2690 288 | 1926 289 | 2341 290 | 2566 291 | 1957 292 | 1709 293 | 1955 294 | 2570 295 | 2387 296 | 1811 297 | 2025 298 | 2447 299 | 2696 300 | 2052 301 | 2366 302 | 1857 303 | 2273 304 | 2245 305 | 2672 306 | 2133 307 | 2421 308 | 1929 309 | 2125 310 | 2319 311 | 2641 312 | 2167 313 | 2418 314 | 1765 315 | 1761 316 | 1828 317 | 2188 318 | 1972 319 | 1997 320 | 2419 321 | 2289 322 | 2296 323 | 2587 324 | 2051 325 | 2440 326 | 2053 327 | 2191 328 | 1923 329 | 2164 330 | 1861 331 | 2339 332 | 2333 333 | 2523 334 | 2670 335 | 2121 336 | 1921 337 | 1724 338 | 2253 339 | 2374 340 | 1940 341 | 2545 342 | 2301 343 | 2244 344 | 2156 345 | 1849 346 | 2551 347 | 2011 348 | 2279 349 | 2572 350 | 1757 351 | 2400 352 | 2569 353 | 2072 354 | 2526 355 | 2173 356 | 2069 357 | 2036 358 | 1819 359 | 1734 360 | 1880 361 | 2137 362 | 2408 363 | 2226 364 | 2604 365 | 1771 366 | 2698 367 | 2187 368 | 2060 369 | 1756 370 | 2201 371 | 2066 372 | 2439 373 | 1844 374 | 1772 375 | 2383 376 | 2398 377 | 1708 378 | 1992 379 | 1959 380 | 1794 381 | 2426 382 | 2702 383 | 2444 384 | 1944 385 | 1829 386 | 2660 387 | 2497 388 | 2607 389 | 2343 390 | 1730 391 | 2624 392 | 1790 393 | 1935 394 | 1967 395 | 2401 396 | 2255 397 | 2355 398 | 2348 399 | 1931 400 | 2183 401 | 2161 402 | 2701 403 | 1948 404 | 2501 405 | 2192 406 | 2404 407 | 2209 408 | 2331 409 | 1810 410 | 2363 411 | 2334 412 | 1887 413 | 2393 414 | 2557 415 | 1719 416 | 1732 417 | 1986 418 | 2037 419 | 2056 420 | 1867 421 | 2126 422 | 1932 423 | 2117 424 | 1807 425 | 1801 426 | 1743 427 | 2041 428 | 1843 429 | 2388 430 | 2221 431 | 1833 432 | 2677 433 | 1778 434 | 2661 435 | 2306 436 | 2394 437 | 2106 438 | 2430 439 | 2371 440 | 2606 441 | 2353 442 | 2269 443 | 2317 444 | 2645 445 | 2372 446 | 2550 447 | 2043 448 | 1968 449 | 2165 450 | 2310 451 | 1985 452 | 2446 453 | 1982 454 | 2377 455 | 2207 456 | 1818 457 | 1913 458 | 1766 459 | 1722 460 | 1894 461 | 2020 
462 | 1881 463 | 2621 464 | 2409 465 | 2261 466 | 2458 467 | 2096 468 | 1712 469 | 2594 470 | 2293 471 | 2048 472 | 2359 473 | 1839 474 | 2392 475 | 2254 476 | 1911 477 | 2101 478 | 2367 479 | 1889 480 | 1753 481 | 2555 482 | 2246 483 | 2264 484 | 2010 485 | 2336 486 | 2651 487 | 2017 488 | 2140 489 | 1842 490 | 2019 491 | 1890 492 | 2525 493 | 2134 494 | 2492 495 | 2652 496 | 2040 497 | 2145 498 | 2575 499 | 2166 500 | 1999 501 | 2434 502 | 1711 503 | 2276 504 | 2450 505 | 2389 506 | 2669 507 | 2595 508 | 1814 509 | 2039 510 | 2502 511 | 1896 512 | 2168 513 | 2344 514 | 2637 515 | 2031 516 | 1977 517 | 2380 518 | 1936 519 | 2047 520 | 2460 521 | 2102 522 | 1745 523 | 2650 524 | 2046 525 | 2514 526 | 1980 527 | 2352 528 | 2113 529 | 1713 530 | 2058 531 | 2558 532 | 1718 533 | 1864 534 | 1876 535 | 2338 536 | 1879 537 | 1891 538 | 2186 539 | 2451 540 | 2181 541 | 2638 542 | 2644 543 | 2103 544 | 2591 545 | 2266 546 | 2468 547 | 1869 548 | 2582 549 | 2674 550 | 2361 551 | 2462 552 | 1748 553 | 2215 554 | 2615 555 | 2236 556 | 2248 557 | 2493 558 | 2342 559 | 2449 560 | 2274 561 | 1824 562 | 1852 563 | 1870 564 | 2441 565 | 2356 566 | 1835 567 | 2694 568 | 2602 569 | 2685 570 | 1893 571 | 2544 572 | 2536 573 | 1994 574 | 1853 575 | 1838 576 | 1786 577 | 1930 578 | 2539 579 | 1892 580 | 2265 581 | 2618 582 | 2486 583 | 2583 584 | 2061 585 | 1796 586 | 1806 587 | 2084 588 | 1933 589 | 2095 590 | 2136 591 | 2078 592 | 1884 593 | 2438 594 | 2286 595 | 2138 596 | 1750 597 | 2184 598 | 1799 599 | 2278 600 | 2410 601 | 2642 602 | 2435 603 | 1956 604 | 2399 605 | 1774 606 | 2129 607 | 1898 608 | 1823 609 | 1938 610 | 2299 611 | 1862 612 | 2420 613 | 2673 614 | 1984 615 | 2204 616 | 1717 617 | 2074 618 | 2213 619 | 2436 620 | 2297 621 | 2592 622 | 2667 623 | 2703 624 | 2511 625 | 1779 626 | 1782 627 | 2625 628 | 2365 629 | 2315 630 | 2381 631 | 1788 632 | 1714 633 | 2302 634 | 1927 635 | 2325 636 | 2506 637 | 2169 638 | 2328 639 | 2629 640 | 2128 641 | 2655 642 | 2282 643 | 2073 644 | 2395 645 | 2247 646 | 2521 647 | 2260 648 | 1868 649 | 1988 650 | 2324 651 | 2705 652 | 2541 653 | 1731 654 | 2681 655 | 2707 656 | 2465 657 | 1785 658 | 2149 659 | 2045 660 | 2505 661 | 2611 662 | 2217 663 | 2180 664 | 1904 665 | 2453 666 | 2484 667 | 1871 668 | 2309 669 | 2349 670 | 2482 671 | 2004 672 | 1965 673 | 2406 674 | 2162 675 | 1805 676 | 2654 677 | 2007 678 | 1947 679 | 1981 680 | 2112 681 | 2141 682 | 1720 683 | 1758 684 | 2080 685 | 2330 686 | 2030 687 | 2432 688 | 2089 689 | 2547 690 | 1820 691 | 1815 692 | 2675 693 | 1840 694 | 2658 695 | 2370 696 | 2251 697 | 1908 698 | 2029 699 | 2068 700 | 2513 701 | 2549 702 | 2267 703 | 2580 704 | 2327 705 | 2351 706 | 2111 707 | 2022 708 | 2321 709 | 2614 710 | 2252 711 | 2104 712 | 1822 713 | 2552 714 | 2243 715 | 1798 716 | 2396 717 | 2663 718 | 2564 719 | 2148 720 | 2562 721 | 2684 722 | 2001 723 | 2151 724 | 2706 725 | 2240 726 | 2474 727 | 2303 728 | 2634 729 | 2680 730 | 2055 731 | 2090 732 | 2503 733 | 2347 734 | 2402 735 | 2238 736 | 1950 737 | 2054 738 | 2016 739 | 1872 740 | 2233 741 | 1710 742 | 2032 743 | 2540 744 | 2628 745 | 1795 746 | 2616 747 | 1903 748 | 2531 749 | 2567 750 | 1946 751 | 1897 752 | 2222 753 | 2227 754 | 2627 755 | 1856 756 | 2464 757 | 2241 758 | 2481 759 | 2130 760 | 2311 761 | 2083 762 | 2223 763 | 2284 764 | 2235 765 | 2097 766 | 1752 767 | 2515 768 | 2527 769 | 2385 770 | 2189 771 | 2283 772 | 2182 773 | 2079 774 | 2375 775 | 2174 776 | 2437 777 | 1993 778 | 2517 779 | 2443 780 | 2224 781 | 2648 782 | 2171 783 | 2290 784 | 2542 
785 | 2038 786 | 1855 787 | 1831 788 | 1759 789 | 1848 790 | 2445 791 | 1827 792 | 2429 793 | 2205 794 | 2598 795 | 2657 796 | 1728 797 | 2065 798 | 1918 799 | 2427 800 | 2573 801 | 2620 802 | 2292 803 | 1777 804 | 2008 805 | 1875 806 | 2288 807 | 2256 808 | 2033 809 | 2470 810 | 2585 811 | 2610 812 | 2082 813 | 2230 814 | 1915 815 | 1847 816 | 2337 817 | 2512 818 | 2386 819 | 2006 820 | 2653 821 | 2346 822 | 1951 823 | 2110 824 | 2639 825 | 2520 826 | 1939 827 | 2683 828 | 2139 829 | 2220 830 | 1910 831 | 2237 832 | 1900 833 | 1836 834 | 2197 835 | 1716 836 | 1860 837 | 2077 838 | 2519 839 | 2538 840 | 2323 841 | 1914 842 | 1971 843 | 1845 844 | 2132 845 | 1802 846 | 1907 847 | 2640 848 | 2496 849 | 2281 850 | 2198 851 | 2416 852 | 2285 853 | 1755 854 | 2431 855 | 2071 856 | 2249 857 | 2123 858 | 1727 859 | 2459 860 | 2304 861 | 2199 862 | 1791 863 | 1809 864 | 1780 865 | 2210 866 | 2417 867 | 1874 868 | 1878 869 | 2116 870 | 1961 871 | 1863 872 | 2579 873 | 2477 874 | 2228 875 | 2332 876 | 2578 877 | 2457 878 | 2024 879 | 1934 880 | 2316 881 | 1841 882 | 1764 883 | 1737 884 | 2322 885 | 2239 886 | 2294 887 | 1729 888 | 2488 889 | 1974 890 | 2473 891 | 2098 892 | 2612 893 | 1834 894 | 2340 895 | 2423 896 | 2175 897 | 2280 898 | 2617 899 | 2208 900 | 2560 901 | 1741 902 | 2600 903 | 2059 904 | 1747 905 | 2242 906 | 2700 907 | 2232 908 | 2057 909 | 2147 910 | 2682 911 | 1792 912 | 1826 913 | 2120 914 | 1895 915 | 2364 916 | 2163 917 | 1851 918 | 2391 919 | 2414 920 | 2452 921 | 1803 922 | 1989 923 | 2623 924 | 2200 925 | 2528 926 | 2415 927 | 1804 928 | 2146 929 | 2619 930 | 2687 931 | 1762 932 | 2172 933 | 2270 934 | 2678 935 | 2593 936 | 2448 937 | 1882 938 | 2257 939 | 2500 940 | 1899 941 | 2478 942 | 2412 943 | 2107 944 | 1746 945 | 2428 946 | 2115 947 | 1800 948 | 1901 949 | 2397 950 | 2530 951 | 1912 952 | 2108 953 | 2206 954 | 2091 955 | 1740 956 | 2219 957 | 1976 958 | 2099 959 | 2142 960 | 2671 961 | 2668 962 | 2216 963 | 2272 964 | 2229 965 | 2666 966 | 2456 967 | 2534 968 | 2697 969 | 2688 970 | 2062 971 | 2691 972 | 2689 973 | 2154 974 | 2590 975 | 2626 976 | 2390 977 | 1813 978 | 2067 979 | 1952 980 | 2518 981 | 2358 982 | 1789 983 | 2076 984 | 2049 985 | 2119 986 | 2013 987 | 2124 988 | 2556 989 | 2105 990 | 2093 991 | 1885 992 | 2305 993 | 2354 994 | 2135 995 | 2601 996 | 1770 997 | 1995 998 | 2504 999 | 1749 1000 | 2157 1001 | -------------------------------------------------------------------------------- /data/ind.cora.tx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tiiiger/SGC/2c7a2727e82e462d8ef9d6e57f0b08888e16488f/data/ind.cora.tx -------------------------------------------------------------------------------- /data/ind.cora.ty: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tiiiger/SGC/2c7a2727e82e462d8ef9d6e57f0b08888e16488f/data/ind.cora.ty -------------------------------------------------------------------------------- /data/ind.cora.x: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tiiiger/SGC/2c7a2727e82e462d8ef9d6e57f0b08888e16488f/data/ind.cora.x -------------------------------------------------------------------------------- /data/ind.cora.y: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tiiiger/SGC/2c7a2727e82e462d8ef9d6e57f0b08888e16488f/data/ind.cora.y 
-------------------------------------------------------------------------------- /data/ind.pubmed.allx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tiiiger/SGC/2c7a2727e82e462d8ef9d6e57f0b08888e16488f/data/ind.pubmed.allx -------------------------------------------------------------------------------- /data/ind.pubmed.ally: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tiiiger/SGC/2c7a2727e82e462d8ef9d6e57f0b08888e16488f/data/ind.pubmed.ally -------------------------------------------------------------------------------- /data/ind.pubmed.graph: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tiiiger/SGC/2c7a2727e82e462d8ef9d6e57f0b08888e16488f/data/ind.pubmed.graph -------------------------------------------------------------------------------- /data/ind.pubmed.test.index: -------------------------------------------------------------------------------- 1 | 18747 2 | 19392 3 | 19181 4 | 18843 5 | 19221 6 | 18962 7 | 19560 8 | 19097 9 | 18966 10 | 19014 11 | 18756 12 | 19313 13 | 19000 14 | 19569 15 | 19359 16 | 18854 17 | 18970 18 | 19073 19 | 19661 20 | 19180 21 | 19377 22 | 18750 23 | 19401 24 | 18788 25 | 19224 26 | 19447 27 | 19017 28 | 19241 29 | 18890 30 | 18908 31 | 18965 32 | 19001 33 | 18849 34 | 19641 35 | 18852 36 | 19222 37 | 19172 38 | 18762 39 | 19156 40 | 19162 41 | 18856 42 | 18763 43 | 19318 44 | 18826 45 | 19712 46 | 19192 47 | 19695 48 | 19030 49 | 19523 50 | 19249 51 | 19079 52 | 19232 53 | 19455 54 | 18743 55 | 18800 56 | 19071 57 | 18885 58 | 19593 59 | 19394 60 | 19390 61 | 18832 62 | 19445 63 | 18838 64 | 19632 65 | 19548 66 | 19546 67 | 18825 68 | 19498 69 | 19266 70 | 19117 71 | 19595 72 | 19252 73 | 18730 74 | 18913 75 | 18809 76 | 19452 77 | 19520 78 | 19274 79 | 19555 80 | 19388 81 | 18919 82 | 19099 83 | 19637 84 | 19403 85 | 18720 86 | 19526 87 | 18905 88 | 19451 89 | 19408 90 | 18923 91 | 18794 92 | 19322 93 | 19431 94 | 18912 95 | 18841 96 | 19239 97 | 19125 98 | 19258 99 | 19565 100 | 18898 101 | 19482 102 | 19029 103 | 18778 104 | 19096 105 | 19684 106 | 19552 107 | 18765 108 | 19361 109 | 19171 110 | 19367 111 | 19623 112 | 19402 113 | 19327 114 | 19118 115 | 18888 116 | 18726 117 | 19510 118 | 18831 119 | 19490 120 | 19576 121 | 19050 122 | 18729 123 | 18896 124 | 19246 125 | 19012 126 | 18862 127 | 18873 128 | 19193 129 | 19693 130 | 19474 131 | 18953 132 | 19115 133 | 19182 134 | 19269 135 | 19116 136 | 18837 137 | 18872 138 | 19007 139 | 19212 140 | 18798 141 | 19102 142 | 18772 143 | 19660 144 | 19511 145 | 18914 146 | 18886 147 | 19672 148 | 19360 149 | 19213 150 | 18810 151 | 19420 152 | 19512 153 | 18719 154 | 19432 155 | 19350 156 | 19127 157 | 18782 158 | 19587 159 | 18924 160 | 19488 161 | 18781 162 | 19340 163 | 19190 164 | 19383 165 | 19094 166 | 18835 167 | 19487 168 | 19230 169 | 18791 170 | 18882 171 | 18937 172 | 18928 173 | 18755 174 | 18802 175 | 19516 176 | 18795 177 | 18786 178 | 19273 179 | 19349 180 | 19398 181 | 19626 182 | 19130 183 | 19351 184 | 19489 185 | 19446 186 | 18959 187 | 19025 188 | 18792 189 | 18878 190 | 19304 191 | 19629 192 | 19061 193 | 18785 194 | 19194 195 | 19179 196 | 19210 197 | 19417 198 | 19583 199 | 19415 200 | 19443 201 | 18739 202 | 19662 203 | 18904 204 | 18910 205 | 18901 206 | 18960 207 | 18722 208 | 18827 209 | 19290 210 | 18842 211 | 19389 212 | 19344 213 | 18961 214 | 19098 215 | 19147 216 | 
19334 217 | 19358 218 | 18829 219 | 18984 220 | 18931 221 | 18742 222 | 19320 223 | 19111 224 | 19196 225 | 18887 226 | 18991 227 | 19469 228 | 18990 229 | 18876 230 | 19261 231 | 19270 232 | 19522 233 | 19088 234 | 19284 235 | 19646 236 | 19493 237 | 19225 238 | 19615 239 | 19449 240 | 19043 241 | 19674 242 | 19391 243 | 18918 244 | 19155 245 | 19110 246 | 18815 247 | 19131 248 | 18834 249 | 19715 250 | 19603 251 | 19688 252 | 19133 253 | 19053 254 | 19166 255 | 19066 256 | 18893 257 | 18757 258 | 19582 259 | 19282 260 | 19257 261 | 18869 262 | 19467 263 | 18954 264 | 19371 265 | 19151 266 | 19462 267 | 19598 268 | 19653 269 | 19187 270 | 19624 271 | 19564 272 | 19534 273 | 19581 274 | 19478 275 | 18985 276 | 18746 277 | 19342 278 | 18777 279 | 19696 280 | 18824 281 | 19138 282 | 18728 283 | 19643 284 | 19199 285 | 18731 286 | 19168 287 | 18948 288 | 19216 289 | 19697 290 | 19347 291 | 18808 292 | 18725 293 | 19134 294 | 18847 295 | 18828 296 | 18996 297 | 19106 298 | 19485 299 | 18917 300 | 18911 301 | 18776 302 | 19203 303 | 19158 304 | 18895 305 | 19165 306 | 19382 307 | 18780 308 | 18836 309 | 19373 310 | 19659 311 | 18947 312 | 19375 313 | 19299 314 | 18761 315 | 19366 316 | 18754 317 | 19248 318 | 19416 319 | 19658 320 | 19638 321 | 19034 322 | 19281 323 | 18844 324 | 18922 325 | 19491 326 | 19272 327 | 19341 328 | 19068 329 | 19332 330 | 19559 331 | 19293 332 | 18804 333 | 18933 334 | 18935 335 | 19405 336 | 18936 337 | 18945 338 | 18943 339 | 18818 340 | 18797 341 | 19570 342 | 19464 343 | 19428 344 | 19093 345 | 19433 346 | 18986 347 | 19161 348 | 19255 349 | 19157 350 | 19046 351 | 19292 352 | 19434 353 | 19298 354 | 18724 355 | 19410 356 | 19694 357 | 19214 358 | 19640 359 | 19189 360 | 18963 361 | 19218 362 | 19585 363 | 19041 364 | 19550 365 | 19123 366 | 19620 367 | 19376 368 | 19561 369 | 18944 370 | 19706 371 | 19056 372 | 19283 373 | 18741 374 | 19319 375 | 19144 376 | 19542 377 | 18821 378 | 19404 379 | 19080 380 | 19303 381 | 18793 382 | 19306 383 | 19678 384 | 19435 385 | 19519 386 | 19566 387 | 19278 388 | 18946 389 | 19536 390 | 19020 391 | 19057 392 | 19198 393 | 19333 394 | 19649 395 | 19699 396 | 19399 397 | 19654 398 | 19136 399 | 19465 400 | 19321 401 | 19577 402 | 18907 403 | 19665 404 | 19386 405 | 19596 406 | 19247 407 | 19473 408 | 19568 409 | 19355 410 | 18925 411 | 19586 412 | 18982 413 | 19616 414 | 19495 415 | 19612 416 | 19023 417 | 19438 418 | 18817 419 | 19692 420 | 19295 421 | 19414 422 | 19676 423 | 19472 424 | 19107 425 | 19062 426 | 19035 427 | 18883 428 | 19409 429 | 19052 430 | 19606 431 | 19091 432 | 19651 433 | 19475 434 | 19413 435 | 18796 436 | 19369 437 | 19639 438 | 19701 439 | 19461 440 | 19645 441 | 19251 442 | 19063 443 | 19679 444 | 19545 445 | 19081 446 | 19363 447 | 18995 448 | 19549 449 | 18790 450 | 18855 451 | 18833 452 | 18899 453 | 19395 454 | 18717 455 | 19647 456 | 18768 457 | 19103 458 | 19245 459 | 18819 460 | 18779 461 | 19656 462 | 19076 463 | 18745 464 | 18971 465 | 19197 466 | 19711 467 | 19074 468 | 19128 469 | 19466 470 | 19139 471 | 19309 472 | 19324 473 | 18814 474 | 19092 475 | 19627 476 | 19060 477 | 18806 478 | 18929 479 | 18737 480 | 18942 481 | 18906 482 | 18858 483 | 19456 484 | 19253 485 | 19716 486 | 19104 487 | 19667 488 | 19574 489 | 18903 490 | 19237 491 | 18864 492 | 19556 493 | 19364 494 | 18952 495 | 19008 496 | 19323 497 | 19700 498 | 19170 499 | 19267 500 | 19345 501 | 19238 502 | 18909 503 | 18892 504 | 19109 505 | 19704 506 | 18902 507 | 19275 508 | 19680 509 | 18723 510 | 19242 511 | 19112 512 | 
19169 513 | 18956 514 | 19343 515 | 19650 516 | 19541 517 | 19698 518 | 19521 519 | 19087 520 | 18976 521 | 19038 522 | 18775 523 | 18968 524 | 19671 525 | 19412 526 | 19407 527 | 19573 528 | 19027 529 | 18813 530 | 19357 531 | 19460 532 | 19673 533 | 19481 534 | 19036 535 | 19614 536 | 18787 537 | 19195 538 | 18732 539 | 18884 540 | 19613 541 | 19657 542 | 19575 543 | 19226 544 | 19589 545 | 19234 546 | 19617 547 | 19707 548 | 19484 549 | 18740 550 | 19424 551 | 18784 552 | 19419 553 | 19159 554 | 18865 555 | 19105 556 | 19315 557 | 19480 558 | 19664 559 | 19378 560 | 18803 561 | 19605 562 | 18870 563 | 19042 564 | 19426 565 | 18848 566 | 19223 567 | 19509 568 | 19532 569 | 18752 570 | 19691 571 | 18718 572 | 19209 573 | 19362 574 | 19090 575 | 19492 576 | 19567 577 | 19687 578 | 19018 579 | 18830 580 | 19530 581 | 19554 582 | 19119 583 | 19442 584 | 19558 585 | 19527 586 | 19427 587 | 19291 588 | 19543 589 | 19422 590 | 19142 591 | 18897 592 | 18950 593 | 19425 594 | 19002 595 | 19588 596 | 18978 597 | 19551 598 | 18930 599 | 18736 600 | 19101 601 | 19215 602 | 19150 603 | 19263 604 | 18949 605 | 18974 606 | 18759 607 | 19335 608 | 19200 609 | 19129 610 | 19328 611 | 19437 612 | 18988 613 | 19429 614 | 19368 615 | 19406 616 | 19049 617 | 18811 618 | 19296 619 | 19256 620 | 19385 621 | 19602 622 | 18770 623 | 19337 624 | 19580 625 | 19476 626 | 19045 627 | 19132 628 | 19089 629 | 19120 630 | 19265 631 | 19483 632 | 18767 633 | 19227 634 | 18934 635 | 19069 636 | 18820 637 | 19006 638 | 19459 639 | 18927 640 | 19037 641 | 19280 642 | 19441 643 | 18823 644 | 19015 645 | 19114 646 | 19618 647 | 18957 648 | 19176 649 | 18853 650 | 19648 651 | 19201 652 | 19444 653 | 19279 654 | 18751 655 | 19302 656 | 19505 657 | 18733 658 | 19601 659 | 19533 660 | 18863 661 | 19708 662 | 19387 663 | 19346 664 | 19152 665 | 19206 666 | 18851 667 | 19338 668 | 19681 669 | 19380 670 | 19055 671 | 18766 672 | 19085 673 | 19591 674 | 19547 675 | 18958 676 | 19146 677 | 18840 678 | 19051 679 | 19021 680 | 19207 681 | 19235 682 | 19086 683 | 18979 684 | 19300 685 | 18939 686 | 19100 687 | 19619 688 | 19287 689 | 18980 690 | 19277 691 | 19326 692 | 19108 693 | 18920 694 | 19625 695 | 19374 696 | 19078 697 | 18734 698 | 19634 699 | 19339 700 | 18877 701 | 19423 702 | 19652 703 | 19683 704 | 19044 705 | 18983 706 | 19330 707 | 19529 708 | 19714 709 | 19468 710 | 19075 711 | 19540 712 | 18839 713 | 19022 714 | 19286 715 | 19537 716 | 19175 717 | 19463 718 | 19167 719 | 19705 720 | 19562 721 | 19244 722 | 19486 723 | 19611 724 | 18801 725 | 19178 726 | 19590 727 | 18846 728 | 19450 729 | 19205 730 | 19381 731 | 18941 732 | 19670 733 | 19185 734 | 19504 735 | 19633 736 | 18997 737 | 19113 738 | 19397 739 | 19636 740 | 19709 741 | 19289 742 | 19264 743 | 19353 744 | 19584 745 | 19126 746 | 18938 747 | 19669 748 | 18964 749 | 19276 750 | 18774 751 | 19173 752 | 19231 753 | 18973 754 | 18769 755 | 19064 756 | 19040 757 | 19668 758 | 18738 759 | 19082 760 | 19655 761 | 19236 762 | 19352 763 | 19609 764 | 19628 765 | 18951 766 | 19384 767 | 19122 768 | 18875 769 | 18992 770 | 18753 771 | 19379 772 | 19254 773 | 19301 774 | 19506 775 | 19135 776 | 19010 777 | 19682 778 | 19400 779 | 19579 780 | 19316 781 | 19553 782 | 19208 783 | 19635 784 | 19644 785 | 18891 786 | 19024 787 | 18989 788 | 19250 789 | 18850 790 | 19317 791 | 18915 792 | 19607 793 | 18799 794 | 18881 795 | 19479 796 | 19031 797 | 19365 798 | 19164 799 | 18744 800 | 18760 801 | 19502 802 | 19058 803 | 19517 804 | 18735 805 | 19448 806 | 19243 807 | 19453 808 | 
19285 809 | 18857 810 | 19439 811 | 19016 812 | 18975 813 | 19503 814 | 18998 815 | 18981 816 | 19186 817 | 18994 818 | 19240 819 | 19631 820 | 19070 821 | 19174 822 | 18900 823 | 19065 824 | 19220 825 | 19229 826 | 18880 827 | 19308 828 | 19372 829 | 19496 830 | 18771 831 | 19325 832 | 19538 833 | 19033 834 | 18874 835 | 19077 836 | 19211 837 | 18764 838 | 19458 839 | 19571 840 | 19121 841 | 19019 842 | 19059 843 | 19497 844 | 18969 845 | 19666 846 | 19297 847 | 19219 848 | 19622 849 | 19184 850 | 18977 851 | 19702 852 | 19539 853 | 19329 854 | 19095 855 | 19675 856 | 18972 857 | 19514 858 | 19703 859 | 19188 860 | 18866 861 | 18812 862 | 19314 863 | 18822 864 | 18845 865 | 19494 866 | 19411 867 | 18916 868 | 19686 869 | 18967 870 | 19294 871 | 19143 872 | 19204 873 | 18805 874 | 19689 875 | 19233 876 | 18758 877 | 18748 878 | 19011 879 | 19685 880 | 19336 881 | 19608 882 | 19454 883 | 19124 884 | 18868 885 | 18807 886 | 19544 887 | 19621 888 | 19228 889 | 19154 890 | 19141 891 | 19145 892 | 19153 893 | 18860 894 | 19163 895 | 19393 896 | 19268 897 | 19160 898 | 19305 899 | 19259 900 | 19471 901 | 19524 902 | 18783 903 | 19396 904 | 18894 905 | 19430 906 | 19690 907 | 19348 908 | 19597 909 | 19592 910 | 19677 911 | 18889 912 | 19331 913 | 18773 914 | 19137 915 | 19009 916 | 18932 917 | 19599 918 | 18816 919 | 19054 920 | 19067 921 | 19477 922 | 19191 923 | 18921 924 | 18940 925 | 19578 926 | 19183 927 | 19004 928 | 19072 929 | 19710 930 | 19005 931 | 19610 932 | 18955 933 | 19457 934 | 19148 935 | 18859 936 | 18993 937 | 19642 938 | 19047 939 | 19418 940 | 19535 941 | 19600 942 | 19312 943 | 19039 944 | 19028 945 | 18879 946 | 19003 947 | 19026 948 | 19013 949 | 19149 950 | 19177 951 | 19217 952 | 18987 953 | 19354 954 | 19525 955 | 19202 956 | 19084 957 | 19032 958 | 18749 959 | 18867 960 | 19048 961 | 18999 962 | 19260 963 | 19630 964 | 18727 965 | 19356 966 | 19083 967 | 18926 968 | 18789 969 | 19370 970 | 18861 971 | 19311 972 | 19557 973 | 19531 974 | 19436 975 | 19140 976 | 19310 977 | 19501 978 | 18721 979 | 19604 980 | 19713 981 | 19262 982 | 19563 983 | 19507 984 | 19440 985 | 19572 986 | 19513 987 | 19515 988 | 19518 989 | 19421 990 | 19470 991 | 19499 992 | 19663 993 | 19508 994 | 18871 995 | 19528 996 | 19500 997 | 19307 998 | 19288 999 | 19594 1000 | 19271 1001 | -------------------------------------------------------------------------------- /data/ind.pubmed.tx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tiiiger/SGC/2c7a2727e82e462d8ef9d6e57f0b08888e16488f/data/ind.pubmed.tx -------------------------------------------------------------------------------- /data/ind.pubmed.ty: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tiiiger/SGC/2c7a2727e82e462d8ef9d6e57f0b08888e16488f/data/ind.pubmed.ty -------------------------------------------------------------------------------- /data/ind.pubmed.x: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tiiiger/SGC/2c7a2727e82e462d8ef9d6e57f0b08888e16488f/data/ind.pubmed.x -------------------------------------------------------------------------------- /data/ind.pubmed.y: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tiiiger/SGC/2c7a2727e82e462d8ef9d6e57f0b08888e16488f/data/ind.pubmed.y -------------------------------------------------------------------------------- 
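The `ind.<dataset>.test.index` files above are plain-text lists of test node ids (one per line), while the remaining `ind.<dataset>.{x,y,tx,ty,allx,ally,graph}` files are pickled objects in the Planetoid format of [the public data splits](https://github.com/tkipf/gcn/tree/master/gcn/data). The snippet below is a rough sketch of how such files are commonly read; the repo's own loader is `load_citation` in `utils.py`, and the helper names here are illustrative.
```
import pickle as pkl

def parse_index_file(path):
    # test.index files contain one integer node id per line, as shown above.
    with open(path) as f:
        return [int(line.strip()) for line in f]

def load_planetoid_object(dataset, name, data_dir="data"):
    # x, y, tx, ty, allx, ally, graph are pickles of scipy sparse matrices,
    # numpy arrays, and a dict adjacency list; encoding='latin1' handles
    # pickles produced under Python 2.
    with open("{}/ind.{}.{}".format(data_dir, dataset, name), "rb") as f:
        return pkl.load(f, encoding="latin1")

test_idx = parse_index_file("data/ind.cora.test.index")
allx = load_planetoid_object("cora", "allx")
graph = load_planetoid_object("cora", "graph")
```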
/downstream/TextSGC/README.md: -------------------------------------------------------------------------------- 1 | # TextSGC 2 | 3 | Implementation of SGC for text classification, following 4 | [Graph Convolutional Networks for Text Classification](https://arxiv.org/abs/1809.05679). 5 | The code is modified from the original Text-GCN repo 6 | (https://github.com/yao8839836/text_gcn) and the PyTorch implementation 7 | of GCN (https://github.com/tkipf/pygcn). 8 | 9 | For reference, 10 | 11 | Dataset | Accuracy | Training Time 12 | :------:|:------:|:-----------:| 13 | 20NG | 88.5 % | 19.06s 14 | R8 | 97.2 % | 1.90s 15 | R52 | 94.0 % | 3.01s 16 | Ohsumed | 68.5 % | 3.02s 17 | MR | 75.9 % | 4.00s 18 | ## Data 19 | 20 | We obtain the data from the [original repo](https://github.com/yao8839836/text_gcn) and 21 | process it with `remove_words.py` and `build_graph.py`. These two files are taken from 22 | the original repo with some slight modifications. 23 | We share the data in a zip file 24 | [online](https://drive.google.com/file/d/10kx3z3bjYFoeRjjg1_DZOAP39Jln0BCh/view?usp=sharing). 25 | Please uncompress the file and put it under `data/`. 26 | If you want to redo the processing, see the options with `python build_graph.py 27 | --help` and `python remove_words.py --help`. 28 | 29 | Training with TextGCN and TextSGC can be memory intensive (10+ GB of CPU memory 30 | required for the 20ng dataset). Therefore, we share a preprocessed version of 31 | the data [online](https://drive.google.com/file/d/1M3lxIjtqqsb9tzLXDeWTlh283dhc8CeV/view?usp=sharing). To use it, 32 | download and decompress it to `./preprocessed`, and use the `--preprocessed` flag. 33 | 34 | ## Usage 35 | 36 | There is only one hyperparameter that needs to be set, and we tune it with hyperopt. 37 | We provide the tuned hyperparameters under `tuned_result`. 38 | You can replicate the results reported in the paper with 39 | ``` 40 | python train.py --dataset <dataset>
--tuned 41 | ``` 42 | 43 | See `tuning.py` for the tuning details. 44 | -------------------------------------------------------------------------------- /downstream/TextSGC/build_corpus.py: -------------------------------------------------------------------------------- 1 | import re 2 | # build corpus 3 | 4 | 5 | dataset = '20ng' 6 | 7 | f = open('data/' + dataset + '.txt', 'r') 8 | lines = f.readlines() 9 | docs = [] 10 | for line in lines: 11 | temp = line.split("\t") 12 | doc_file = open(temp[0], 'r') 13 | doc_content = doc_file.read() 14 | doc_file.close() 15 | print(temp[0], doc_content) 16 | doc_content = doc_content.replace('\n', ' ') 17 | docs.append(doc_content) 18 | 19 | 20 | corpus_str = '\n'.join(docs) 21 | f.close() 22 | 23 | f = open('data/corpus/' + dataset + '.txt', 'w') 24 | f.write(corpus_str) 25 | f.close() 26 | 27 | 28 | ''' 29 | # datasets from PTE paper 30 | f = open('data/dblp/label_train.txt', 'r') 31 | lines = f.readlines() 32 | f.close() 33 | 34 | doc_id = 0 35 | doc_name_list = [] 36 | for line in lines: 37 | string = str(doc_id) + '\t' + 'train' + '\t' + line.strip() 38 | doc_name_list.append(string) 39 | doc_id += 1 40 | 41 | f = open('data/dblp/label_test.txt', 'r') 42 | lines = f.readlines() 43 | f.close() 44 | 45 | for line in lines: 46 | string = str(doc_id) + '\t' + 'test' + '\t' + line.strip() 47 | doc_name_list.append(string) 48 | doc_id += 1 49 | 50 | doc_list_str = '\n'.join(doc_name_list) 51 | 52 | f = open('data/dblp.txt', 'w') 53 | f.write(doc_list_str) 54 | f.close() 55 | 56 | # TREC, R8, R52, WebKB 57 | 58 | dataset = 'R52' 59 | 60 | f = open('data/' + dataset + '/train.txt', 'r') 61 | lines = f.readlines() 62 | f.close() 63 | 64 | doc_id = 0 65 | doc_name_list = [] 66 | doc_content_list = [] 67 | 68 | for line in lines: 69 | line = line.strip() 70 | label = line[:line.find('\t')] 71 | content = line[line.find('\t') + 1:] 72 | string = str(doc_id) + '\t' + 'train' + '\t' + label 73 | doc_name_list.append(string) 74 | doc_content_list.append(content) 75 | doc_id += 1 76 | 77 | f = open('data/' + dataset + '/test.txt', 'r') 78 | lines = f.readlines() 79 | f.close() 80 | 81 | for line in lines: 82 | line = line.strip() 83 | label = line[:line.find('\t')] 84 | content = line[line.find('\t') + 1:] 85 | string = str(doc_id) + '\t' + 'test' + '\t' + label 86 | doc_name_list.append(string) 87 | doc_content_list.append(content) 88 | doc_id += 1 89 | 90 | doc_list_str = '\n'.join(doc_name_list) 91 | 92 | f = open('data/' + dataset + '.txt', 'w') 93 | f.write(doc_list_str) 94 | f.close() 95 | 96 | doc_name_list_str = '\n'.join(doc_name_list) 97 | 98 | f = open('data/' + dataset + '.txt', 'w') 99 | f.write(doc_list_str) 100 | f.close() 101 | 102 | doc_content_list_str = '\n'.join(doc_content_list) 103 | 104 | f = open('data/corpus/' + dataset + '.txt', 'w') 105 | f.write(doc_content_list_str) 106 | f.close() 107 | ''' 108 | -------------------------------------------------------------------------------- /downstream/TextSGC/build_graph.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import random 4 | import numpy as np 5 | import pickle as pkl 6 | import networkx as nx 7 | import scipy.sparse as sp 8 | from utils import loadWord2Vec, clean_str 9 | from math import log 10 | from sklearn import svm 11 | from nltk.corpus import wordnet as wn 12 | from sklearn.feature_extraction.text import TfidfVectorizer 13 | from scipy.spatial.distance import cosine 14 | from tqdm import tqdm 15 | from 
collections import Counter 16 | import itertools 17 | 18 | parser = argparse.ArgumentParser(description='Build Document Graph') 19 | parser.add_argument('--dataset', type=str, default='20ng', 20 | choices=['20ng', 'R8', 'R52', 'ohsumed', 'mr', 'yelp', 'ag_news'], 21 | help='dataset name') 22 | parser.add_argument('--embedding_dim', type=int, default=300, 23 | help='word and document embedding size.') 24 | args = parser.parse_args() 25 | 26 | # build corpus 27 | dataset = args.dataset 28 | 29 | word_embeddings_dim = args.embedding_dim 30 | word_vector_map = {} # TODO: modify this to use embedding 31 | 32 | doc_name_list = [] 33 | train_val_ids = [] 34 | test_ids = [] 35 | label_names = set() 36 | train_val_labels = [] 37 | test_labels = [] 38 | 39 | with open('data/' + dataset + '.txt', 'r') as f: 40 | lines = f.readlines() 41 | for id, line in enumerate(lines): 42 | doc_name_list.append(line.strip()) 43 | _, data_name, data_label = line.strip().split("\t") 44 | if data_name.find('test') != -1: 45 | test_ids.append(id) 46 | elif data_name.find('train') != -1: 47 | train_val_ids.append(id) 48 | label_names.add(data_label) 49 | label_names = list(label_names) 50 | label_names_to_index = {name:i for i, name in enumerate(label_names)} 51 | for id, line in enumerate(lines): 52 | _, data_name, data_label_name = line.strip().split("\t") 53 | if data_name.find('test') != -1: 54 | test_labels.append(label_names_to_index[data_label_name]) 55 | elif data_name.find('train') != -1: 56 | train_val_labels.append(label_names_to_index[data_label_name]) 57 | 58 | with open('data/corpus/' + dataset + '_labels.txt', 'w') as f: 59 | f.write('\n'.join(label_names)) 60 | 61 | 62 | print("Loaded labels and indices") 63 | # Get document content, after removed words 64 | doc_content_list = [] 65 | with open('data/corpus/' + dataset + '.clean.txt', 'r') as f: 66 | lines = f.readlines() 67 | doc_content_list = [l.strip() for l in lines] 68 | 69 | print("Loaded document content") 70 | # Build vocab 71 | word_freq = Counter() 72 | progress_bar = tqdm(doc_content_list) 73 | progress_bar.set_postfix_str("building vocabulary") 74 | for doc_words in progress_bar: 75 | words = doc_words.split() 76 | word_freq.update(words) 77 | 78 | vocab, _ = zip(*word_freq.most_common()) 79 | # put words after documents 80 | word_id_map = dict(zip(vocab, np.array(range(len(vocab)))+len(train_val_ids+test_ids))) 81 | vocab_size = len(vocab) 82 | 83 | 84 | with open('data/corpus/' + dataset + '_vocab.txt', 'w') as f: 85 | vocab_str = '\n'.join(vocab) 86 | f.write(vocab_str) 87 | 88 | # split training and validation 89 | idx = list(range(len(train_val_labels))) 90 | random.shuffle(idx) 91 | train_val_ids = [train_val_ids[i] for i in idx] 92 | train_val_labels = [train_val_labels[i] for i in idx] 93 | 94 | idx = list(range(len(test_labels))) 95 | random.shuffle(idx) 96 | test_ids = [test_ids[i] for i in idx] 97 | test_labels = [test_labels[i] for i in idx] 98 | 99 | train_val_size = len(train_val_ids) 100 | val_size = int(0.1 * train_val_size) 101 | train_size = train_val_size - val_size 102 | train_ids, val_ids = train_val_ids[:train_size], train_val_ids[train_size:] 103 | train_labels, val_labels = train_val_labels[:train_size], train_val_labels[train_size:] 104 | 105 | # Construct feature vectors 106 | def average_word_vec(doc_id, doc_content_list, word_to_vector): 107 | doc_vec = np.array([0.0 for k in range(word_embeddings_dim)]) 108 | doc_words = doc_content_list[doc_id] 109 | words = doc_words.split() 110 | for word in words: 111 | if 
word in word_vector_map: 112 | word_vector = word_vector_map[word] 113 | doc_vec = doc_vec + np.array(word_vector) 114 | doc_vec /= len(words) 115 | return doc_vec 116 | 117 | def construct_feature_label_matrix(doc_ids, doc_content_list, word_vector_map): 118 | row_x = [] 119 | col_x = [] 120 | data_x = [] 121 | for i, doc_id in enumerate(doc_ids): 122 | doc_vec = average_word_vec(doc_id, doc_content_list, word_vector_map) 123 | for j in range(word_embeddings_dim): 124 | row_x.append(i) 125 | col_x.append(j) 126 | data_x.append(doc_vec[j]) 127 | x = sp.csr_matrix((data_x, (row_x, col_x)), shape=( 128 | real_train_size, word_embeddings_dim)) 129 | 130 | y = [] 131 | for label in train_labels: 132 | one_hot = [0 for l in range(len(label_list))] 133 | one_hot[label] = 1 134 | y.append(one_hot) 135 | y = np.array(y) 136 | return x, y 137 | 138 | # not used 139 | # train_x, train_y = construct_feature_label_matrix(train_ids, doc_content_list, word_vector_map) 140 | # val_x, val_y = construct_feature_label_matrix(val_ids, doc_content_list, word_vector_map) 141 | # test_x, test_y = construct_feature_label_matrix(test_ids, doc_content_list, word_vector_map) 142 | 143 | print("Finish building feature vectors") 144 | 145 | # Creating word and word edges 146 | def create_window(seq, n=2): 147 | """Returns a sliding window (of width n) over data from the iterable, 148 | code taken from https://docs.python.org/release/2.3.5/lib/itertools-example.html""" 149 | it = iter(seq) 150 | result = tuple(itertools.islice(it, n)) 151 | if len(result) == n: 152 | yield result 153 | for elem in it: 154 | result = result[1:] + (elem,) 155 | yield result 156 | 157 | # word co-occurence with context windows 158 | def construct_context_windows(ids, doc_words_list, window_size=20): 159 | windows = [] 160 | for id in ids: 161 | doc_words = doc_content_list[id] 162 | words = doc_words.split() 163 | length = len(words) 164 | if length <= window_size: 165 | windows.append(words) 166 | else: 167 | windows += list(create_window(words, window_size)) 168 | return windows 169 | 170 | def count_word_window_freq(windows): 171 | word_window_freq = Counter() 172 | progress_bar = tqdm(windows) 173 | progress_bar.set_postfix_str("constructing context window") 174 | for window in progress_bar: 175 | word_window_freq.update(set(window)) 176 | return word_window_freq 177 | 178 | def count_word_pair_count(windows): 179 | word_pair_count = Counter() 180 | progress_bar = tqdm(windows) 181 | progress_bar.set_postfix_str("counting word pair frequency") 182 | for window in progress_bar: 183 | word_pairs = list(itertools.permutations(window, 2)) 184 | word_pair_count.update(word_pairs) 185 | return word_pair_count 186 | 187 | def build_word_word_graph(num_window, word_id_map, word_window_freq, word_pair_count): 188 | row = [] 189 | col = [] 190 | weight = [] 191 | # pmi as weights 192 | for pair, count in word_pair_count.items(): 193 | i, j = pair 194 | word_freq_i = word_window_freq[i] 195 | word_freq_j = word_window_freq[j] 196 | pmi = log((1.0 * count / num_window) / 197 | (1.0 * word_freq_i * word_freq_j/(num_window * num_window))) 198 | if pmi <= 0: 199 | continue 200 | row.append(word_id_map[i]) 201 | col.append(word_id_map[j]) 202 | weight.append(pmi) 203 | return row, col, weight 204 | 205 | def calc_word_doc_freq(ids, doc_content_list): 206 | # Count number of documents that contain a word 207 | word_doc_list = {} # mapping from word to document id 208 | word_doc_freq = Counter() 209 | for doc_id in ids: 210 | doc_words = 
doc_content_list[doc_id] 211 | words = set(doc_words.split()) 212 | word_doc_freq.update(words) 213 | return word_doc_freq 214 | 215 | def calc_doc_word_freq(ids, doc_content_list): 216 | doc_word_freq = Counter() 217 | for doc_id in ids: 218 | doc_words = doc_content_list[doc_id] 219 | words = doc_words.split() 220 | word_ids = [word_id_map[word] for word in words] 221 | doc_word_pairs = zip([doc_id for _ in word_ids], word_ids) 222 | doc_word_freq.update(doc_word_pairs) 223 | return doc_word_freq 224 | 225 | def build_doc_word_graph(ids, doc_words_list, doc_word_freq, word_doc_freq, phase='B'): 226 | row = [] 227 | col = [] 228 | weight = [] 229 | for i, doc_id in enumerate(ids): 230 | doc_words = doc_words_list[doc_id] 231 | words = set(doc_words.split()) 232 | doc_word_set = set() 233 | for word in words: 234 | word_id = word_id_map[word] 235 | key = (doc_id, word_id) 236 | freq = doc_word_freq[key] 237 | idf = log(1.0 * len(ids) / 238 | word_doc_freq[word]) 239 | w = freq*idf 240 | if phase == "B": 241 | row.append(doc_id) 242 | col.append(word_id) 243 | weight.append(w) 244 | elif phase == "C": 245 | row.append(word_id) 246 | col.append(doc_id) 247 | weight.append(w) 248 | else: raise ValueError("wrong phase") 249 | return row, col, weight 250 | 251 | def concat_graph(*args): 252 | rows, cols, weights = zip(*args) 253 | row = list(itertools.chain(*rows)) 254 | col = list(itertools.chain(*cols)) 255 | weight = list(itertools.chain(*weights)) 256 | return row, col, weight 257 | 258 | def export_graph(graph, node_size, phase=""): 259 | row, col, weight = graph 260 | adj = sp.csr_matrix( 261 | (weight, (row, col)), shape=(node_size, node_size)) 262 | if phase == "": path = "data/ind.{}.adj".format(dataset) 263 | else: path = "data/ind.{}.{}.adj".format(dataset, phase) 264 | with open(path, 'wb') as f: 265 | pkl.dump(adj, f) 266 | 267 | ids = train_val_ids+test_ids 268 | windows = construct_context_windows(ids, doc_content_list) 269 | word_window_freq = count_word_window_freq(windows) 270 | word_pair_count = count_word_pair_count(windows) 271 | D = build_word_word_graph(len(windows), word_id_map, word_window_freq, word_pair_count) 272 | 273 | doc_word_freq = calc_doc_word_freq(ids, doc_content_list) 274 | word_doc_freq = calc_word_doc_freq(ids, doc_content_list) 275 | B = build_doc_word_graph(ids, doc_content_list, doc_word_freq, word_doc_freq, phase="B") 276 | C = build_doc_word_graph(ids, doc_content_list, doc_word_freq, word_doc_freq, phase="C") 277 | 278 | node_size = len(vocab)+len(train_val_ids)+len(test_ids) 279 | export_graph(concat_graph(B, C, D), node_size, phase="BCD") 280 | export_graph(concat_graph(B, C), node_size, phase="BC") 281 | export_graph(concat_graph(B, D), node_size, phase="BD") 282 | export_graph(B, node_size, phase="B") 283 | 284 | # dump objects 285 | f = open("data/ind.{}.{}.x".format(dataset, "train"), 'wb') 286 | pkl.dump(train_ids, f) 287 | f.close() 288 | 289 | f = open("data/ind.{}.{}.y".format(dataset, "train"), 'wb') 290 | pkl.dump(train_labels, f) 291 | f.close() 292 | 293 | f = open("data/ind.{}.{}.x".format(dataset, "val"), 'wb') 294 | pkl.dump(val_ids, f) 295 | f.close() 296 | 297 | f = open("data/ind.{}.{}.y".format(dataset, "val"), 'wb') 298 | pkl.dump(val_labels, f) 299 | f.close() 300 | 301 | f = open("data/ind.{}.{}.x".format(dataset, "test"), 'wb') 302 | pkl.dump(test_ids, f) 303 | f.close() 304 | 305 | f = open("data/ind.{}.{}.y".format(dataset, "test"), 'wb') 306 | pkl.dump(test_labels, f) 307 | f.close() 308 | 
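For quick sanity-checking, here is a minimal sketch of how the pickles written by `build_graph.py` above can be read back. The `dataset` value is an assumption (any corpus already processed by `remove_words.py` and this script works); the paths simply mirror the `export_graph` and `pkl.dump` calls above.

```python
import pickle as pkl

dataset = '20ng'  # assumed; use whichever corpus build_graph.py was run on

# Combined doc->word (B), word->doc (C) and word-word (D) adjacency written by export_graph.
with open('data/ind.{}.BCD.adj'.format(dataset), 'rb') as f:
    adj = pkl.load(f)  # scipy.sparse.csr_matrix of shape (node_size, node_size)

# Document id and label splits dumped at the end of the script.
with open('data/ind.{}.train.x'.format(dataset), 'rb') as f:
    train_ids = pkl.load(f)
with open('data/ind.{}.train.y'.format(dataset), 'rb') as f:
    train_labels = pkl.load(f)

print(adj.shape, adj.nnz, len(train_ids), len(train_labels))
```

These are the same files that `downstream/TextSGC/train.py` later consumes through `load_corpus` in `utils.py`.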
-------------------------------------------------------------------------------- /downstream/TextSGC/models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.nn.parameter import Parameter 4 | import torch.nn.functional as F 5 | 6 | class SGC(nn.Module): 7 | def __init__(self, nfeat, nclass, bias=False): 8 | super(SGC, self).__init__() 9 | 10 | self.W = nn.Linear(nfeat, nclass, bias=bias) 11 | torch.nn.init.xavier_normal_(self.W.weight) 12 | 13 | def forward(self, x): 14 | out = self.W(x) 15 | return out 16 | -------------------------------------------------------------------------------- /downstream/TextSGC/remove_words.py: -------------------------------------------------------------------------------- 1 | from nltk.corpus import stopwords 2 | import nltk 3 | from nltk.wsd import lesk 4 | from nltk.corpus import wordnet as wn 5 | from utils import clean_str, loadWord2Vec 6 | import sys 7 | import argparse 8 | import random 9 | from collections import Counter 10 | 11 | # nltk.download() 12 | stop_words = set(stopwords.words('english')) 13 | print(stop_words) 14 | 15 | parser = argparse.ArgumentParser(description='Build Document Graph') 16 | parser.add_argument('--dataset', type=str, default='20ng', 17 | choices=['20ng', 'R8', 'R52', 'ohsumed', 'mr'], 18 | help='dataset name') 19 | args = parser.parse_args() 20 | 21 | dataset = args.dataset 22 | 23 | train_val_ids = [] 24 | test_ids = [] 25 | 26 | with open('data/' + dataset + '.txt', 'r') as f: 27 | lines = f.readlines() 28 | for id, line in enumerate(lines): 29 | _, data_name, data_label = line.strip().split("\t") 30 | if data_name.find('test') != -1: 31 | test_ids.append(id) 32 | elif data_name.find('train') != -1: 33 | train_val_ids.append(id) 34 | 35 | idx = list(range(len(train_val_ids))) 36 | random.shuffle(idx) 37 | train_val_ids = [train_val_ids[i] for i in idx] 38 | 39 | idx = list(range(len(test_ids))) 40 | random.shuffle(idx) 41 | test_ids = [test_ids[i] for i in idx] 42 | 43 | train_val_size = len(train_val_ids) 44 | val_size = int(0.1 * train_val_size) 45 | train_size = train_val_size - val_size 46 | train_ids, val_ids = train_val_ids[:train_size], train_val_ids[train_size:] 47 | 48 | doc_content_list = [] 49 | f = open('data/corpus/' + dataset + '.txt', 'rb') 50 | for line in f.readlines(): 51 | doc_content_list.append(line.strip().decode('latin1')) 52 | f.close() 53 | 54 | with open('data/ind.train.ids', "w") as f: 55 | f.write('\n'.join([str(i) for i in train_ids])) 56 | with open('data/ind.val.ids', "w") as f: 57 | f.write('\n'.join([str(i) for i in val_ids])) 58 | with open('data/ind.test.ids', "w") as f: 59 | f.write('\n'.join([str(i) for i in test_ids])) 60 | 61 | def get_clean_words(docs): 62 | clean_words = [] 63 | for doc in docs: 64 | if args.dataset != "mr": 65 | temp = clean_str(doc).split() 66 | temp = list(filter(lambda x : x not in stop_words, temp)) 67 | else: 68 | temp = clean_str(doc).split() 69 | clean_words.append(temp) 70 | return clean_words 71 | clean_words = get_clean_words(doc_content_list) 72 | 73 | word_freq = Counter() 74 | # total = 0 75 | for i in train_ids+test_ids+val_ids: 76 | doc_words = clean_words[i] 77 | word_freq.update(doc_words) 78 | 79 | vocab, count = zip(*word_freq.most_common()) 80 | if dataset == "mr": 81 | cutoff = -1 82 | else: 83 | cutoff = count.index(5) 84 | 85 | vocab = set(vocab[:cutoff]) 86 | 87 | clean_docs = [] 88 | for words in clean_words: 89 | closed_words = [w for w in words 
if w in vocab ] 90 | doc_str = ' '.join(closed_words) 91 | clean_docs.append(doc_str) 92 | 93 | clean_corpus_str = '\n'.join(clean_docs) 94 | 95 | f = open('data/corpus/' + dataset + '.clean.txt', 'w') 96 | f.write(clean_corpus_str) 97 | f.close() 98 | 99 | dataset = args.dataset 100 | min_len = 10000 101 | aver_len = 0 102 | max_len = 0 103 | 104 | f = open('data/corpus/' + dataset + '.clean.txt', 'r') 105 | lines = f.readlines() 106 | for line in lines: 107 | line = line.strip() 108 | temp = line.split() 109 | aver_len = aver_len + len(temp) 110 | if len(temp) < min_len: 111 | min_len = len(temp) 112 | if len(temp) > max_len: 113 | max_len = len(temp) 114 | f.close() 115 | aver_len = 1.0 * aver_len / len(lines) 116 | print('min_len : ' + str(min_len)) 117 | print('max_len : ' + str(max_len)) 118 | print('average_len : ' + str(aver_len)) 119 | -------------------------------------------------------------------------------- /downstream/TextSGC/requirements.txt: -------------------------------------------------------------------------------- 1 | hyperopt==0.1.1 -------------------------------------------------------------------------------- /downstream/TextSGC/train.py: -------------------------------------------------------------------------------- 1 | import time 2 | import argparse 3 | import numpy as np 4 | import pickle 5 | import os 6 | from copy import deepcopy 7 | import torch 8 | import torch.nn.functional as F 9 | import torch.optim as optim 10 | import tabulate 11 | from functools import partial 12 | from utils import * 13 | from models import SGC 14 | 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument('--dataset', type=str, default='20ng', help='Dataset string.') 17 | parser.add_argument('--no-cuda', action='store_true', default=False, 18 | help='Disables CUDA training.') 19 | parser.add_argument('--seed', type=int, default=42, help='Random seed.') 20 | parser.add_argument('--epochs', type=int, default=3, 21 | help='Number of epochs to train.') 22 | parser.add_argument('--batch_size', type=int, default=128, 23 | help='training batch size.') 24 | parser.add_argument('--weight_decay', type=float, default=0, 25 | help='Weight for L2 loss on embedding matrix.') 26 | parser.add_argument('--degree', type=int, default=2, 27 | help='degree of the approximation.') 28 | parser.add_argument('--tuned', action='store_true', help='use tuned hyperparams') 29 | parser.add_argument('--preprocessed', action='store_true', 30 | help='use preprocessed data') 31 | args = parser.parse_args() 32 | args.cuda = not args.no_cuda and torch.cuda.is_available() 33 | args.device = 'cuda' if args.cuda else 'cpu' 34 | if args.tuned: 35 | with open("tuned_result/{}.SGC.tuning.txt".format(args.dataset), "r") as f: 36 | args.weight_decay = float(f.read()) 37 | 38 | torch.backends.cudnn.benchmark = True 39 | set_seed(args.seed, args.cuda) 40 | 41 | sp_adj, index_dict, label_dict = load_corpus(args.dataset) 42 | for k, v in label_dict.items(): 43 | if args.dataset == "mr": 44 | label_dict[k] = torch.Tensor(v).to(args.device) 45 | else: 46 | label_dict[k] = torch.LongTensor(v).to(args.device) 47 | features = torch.arange(sp_adj.shape[0]).to(args.device) 48 | 49 | adj = sparse_to_torch_sparse(sp_adj, device=args.device) 50 | 51 | 52 | def train_linear(model, feat_dict, weight_decay, binary=False): 53 | if not binary: 54 | act = partial(F.log_softmax, dim=1) 55 | criterion = F.nll_loss 56 | else: 57 | act = torch.sigmoid 58 | criterion = F.binary_cross_entropy 59 | optimizer = 
optim.LBFGS(model.parameters()) 60 | best_val_loss = float('inf') 61 | best_val_acc = 0 62 | plateau = 0 63 | start = time.perf_counter() 64 | for epoch in range(args.epochs): 65 | def closure(): 66 | optimizer.zero_grad() 67 | output = model(feat_dict["train"].cuda()).squeeze() 68 | l2_reg = 0.5*weight_decay*(model.W.weight**2).sum() 69 | loss = criterion(act(output), label_dict["train"].cuda())+l2_reg 70 | loss.backward() 71 | return loss 72 | 73 | optimizer.step(closure) 74 | 75 | train_time = time.perf_counter()-start 76 | val_res = eval_linear(model, feat_dict["val"].cuda(), 77 | label_dict["val"].cuda(), binary) 78 | return val_res['accuracy'], model, train_time 79 | 80 | def eval_linear(model, features, label, binary=False): 81 | model.eval() 82 | if not binary: 83 | act = partial(F.log_softmax, dim=1) 84 | criterion = F.nll_loss 85 | else: 86 | act = torch.sigmoid 87 | criterion = F.binary_cross_entropy 88 | 89 | with torch.no_grad(): 90 | output = model(features).squeeze() 91 | loss = criterion(act(output), label) 92 | if not binary: predict_class = output.max(1)[1] 93 | else: predict_class = act(output).gt(0.5).float() 94 | correct = torch.eq(predict_class, label).long().sum().item() 95 | acc = correct/predict_class.size(0) 96 | 97 | return { 98 | 'loss': loss.item(), 99 | 'accuracy': acc 100 | } 101 | if __name__ == '__main__': 102 | if args.dataset == "mr": nclass = 1 103 | else: nclass = label_dict["train"].max().item()+1 104 | if not args.preprocessed: 105 | adj_dense = sparse_to_torch_dense(sp_adj, device='cpu') 106 | feat_dict, precompute_time = sgc_precompute(adj, adj_dense, args.degree-1, index_dict) 107 | else: 108 | # load the relased degree 2 features 109 | with open(os.path.join("preprocessed", 110 | "{}.pkl".format(args.dataset)), "rb") as prep: 111 | feat_dict = pkl.load(prep) 112 | precompute_time = 0 113 | 114 | model = SGC(nfeat=feat_dict["train"].size(1), 115 | nclass=nclass) 116 | if args.cuda: model.cuda() 117 | val_acc, best_model, train_time = train_linear(model, feat_dict, args.weight_decay, args.dataset=="mr") 118 | test_res = eval_linear(best_model, feat_dict["test"].cuda(), 119 | label_dict["test"].cuda(), args.dataset=="mr") 120 | train_res = eval_linear(best_model, feat_dict["train"].cuda(), 121 | label_dict["train"].cuda(), args.dataset=="mr") 122 | print("Total Time: {:2f}s, Train acc: {:.4f}, Val acc: {:.4f}, Test acc: {:.4f}".format(precompute_time+train_time, train_res["accuracy"], val_acc, test_res["accuracy"])) 123 | -------------------------------------------------------------------------------- /downstream/TextSGC/tuned_result/20ng.SGC.tuning.txt: -------------------------------------------------------------------------------- 1 | 0.0013576034109430972 -------------------------------------------------------------------------------- /downstream/TextSGC/tuned_result/R52.SGC.tuning.txt: -------------------------------------------------------------------------------- 1 | 3.39184266146822e-06 -------------------------------------------------------------------------------- /downstream/TextSGC/tuned_result/R8.SGC.tuning.txt: -------------------------------------------------------------------------------- 1 | 0.000555422439238392 -------------------------------------------------------------------------------- /downstream/TextSGC/tuned_result/mr.SGC.tuning.txt: -------------------------------------------------------------------------------- 1 | 0.0031331167329072614 -------------------------------------------------------------------------------- 
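Each `tuned_result/*.SGC.tuning.txt` file above holds a single float: the validation-tuned weight decay that `train.py` reads back when launched with `--tuned`. A minimal sketch of that round trip (the dataset name below is an assumption; the path and parsing mirror the `--tuned` branch of `train.py`):

```python
# Sketch: how a stored tuned value is consumed by train.py's --tuned branch.
dataset = "mr"  # assumed
with open("tuned_result/{}.SGC.tuning.txt".format(dataset), "r") as f:
    weight_decay = float(f.read())
print(weight_decay)  # 0.0031331167329072614 for mr, per the file above
```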
/downstream/TextSGC/tuned_result/ohsumed.SGC.tuning.txt: -------------------------------------------------------------------------------- 1 | 0.003911359029974794 -------------------------------------------------------------------------------- /downstream/TextSGC/tuning.py: -------------------------------------------------------------------------------- 1 | import time 2 | import argparse 3 | import numpy as np 4 | from train import train 5 | import pickle as pkl 6 | from hyperopt import fmin, tpe, hp, STATUS_OK, Trials 7 | from args import get_text_args 8 | from utils import * 9 | from train import train_linear, adj, sp_adj, label_dict, index_dict 10 | import torch.nn.functional as F 11 | from models import get_model 12 | from math import log 13 | 14 | args = get_text_args() 15 | set_seed(args.seed, args.cuda) 16 | 17 | adj_dense = sparse_to_torch_dense(sp_adj, device='cpu') 18 | feat_dict, precompute_time = sgc_precompute(adj, adj_dense, args.degree-1, index_dict) 19 | if args.dataset == "mr": nclass = 1 20 | else: nclass = label_dict["train"].max().item()+1 21 | 22 | def linear_objective(space): 23 | model = get_model(args.model, nfeat=feat_dict["train"].size(1), 24 | nclass=nclass, 25 | nhid=0, dropout=0, cuda=args.cuda) 26 | val_acc, _, _ = train_linear(model, feat_dict, space['weight_decay'], args.dataset=="mr") 27 | print( 'weight decay ' + str(space['weight_decay']) + '\n' + \ 28 | 'overall accuracy: ' + str(val_acc)) 29 | return {'loss': -val_acc, 'status': STATUS_OK} 30 | 31 | # Hyperparameter optimization 32 | space = {'weight_decay' : hp.loguniform('weight_decay', log(1e-6), log(1e-0))} 33 | 34 | best = fmin(linear_objective, space=space, algo=tpe.suggest, max_evals=60) 35 | print(best) 36 | 37 | with open('{}.SGC.tuning.txt'.format(args.dataset), 'w') as f: 38 | f.write(str(best['weight_decay'])) 39 | -------------------------------------------------------------------------------- /downstream/TextSGC/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pickle as pkl 3 | import networkx as nx 4 | import scipy.sparse as sp 5 | from scipy.sparse.linalg.eigen.arpack import eigsh 6 | import sys 7 | import re 8 | import torch 9 | from time import perf_counter 10 | import tabulate 11 | 12 | def parse_index_file(filename): 13 | """Parse index file.""" 14 | index = [] 15 | for line in open(filename): 16 | index.append(int(line.strip())) 17 | return index 18 | 19 | def load_corpus(dataset_str): 20 | """ 21 | Loads input corpus from gcn/data directory 22 | 23 | ind.dataset_str.x => the feature vectors of the training docs as scipy.sparse.csr.csr_matrix object; 24 | ind.dataset_str.tx => the feature vectors of the test docs as scipy.sparse.csr.csr_matrix object; 25 | ind.dataset_str.allx => the feature vectors of both labeled and unlabeled training docs/words 26 | (a superset of ind.dataset_str.x) as scipy.sparse.csr.csr_matrix object; 27 | ind.dataset_str.y => the one-hot labels of the labeled training docs as numpy.ndarray object; 28 | ind.dataset_str.ty => the one-hot labels of the test docs as numpy.ndarray object; 29 | ind.dataset_str.ally => the labels for instances in ind.dataset_str.allx as numpy.ndarray object; 30 | ind.dataset_str.adj => adjacency matrix of word/doc nodes as scipy.sparse.csr.csr_matrix object; 31 | ind.dataset_str.train.index => the indices of training docs in original doc list. 32 | 33 | All objects above must be saved using python pickle module. 
34 | 35 | :param dataset_str: Dataset name 36 | :return: All data input files loaded (as well the training/test data). 37 | """ 38 | index_dict = {} 39 | label_dict = {} 40 | phases = ["train", "val", "test"] 41 | objects = [] 42 | def load_pkl(path): 43 | with open(path.format(dataset_str, p), 'rb') as f: 44 | if sys.version_info > (3, 0): 45 | return pkl.load(f, encoding='latin1') 46 | else: 47 | return pkl.load(f) 48 | 49 | for p in phases: 50 | index_dict[p] = load_pkl("data/ind.{}.{}.x".format(dataset_str, p)) 51 | label_dict[p] = load_pkl("data/ind.{}.{}.y".format(dataset_str, p)) 52 | 53 | adj = load_pkl("data/ind.{}.BCD.adj".format(dataset_str)) 54 | adj = adj.astype(np.float32) 55 | adj = preprocess_adj(adj) 56 | 57 | return adj, index_dict, label_dict 58 | 59 | def normalize_adj(adj): 60 | """Symmetrically normalize adjacency matrix.""" 61 | adj = sp.coo_matrix(adj) 62 | rowsum = np.array(adj.sum(1)) 63 | d_inv_sqrt = np.power(rowsum, -0.5).flatten() 64 | d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0. 65 | d_mat_inv_sqrt = sp.diags(d_inv_sqrt) 66 | return adj.dot(d_mat_inv_sqrt).transpose().dot(d_mat_inv_sqrt).transpose().tocoo() 67 | 68 | def preprocess_adj(adj): 69 | """Preprocessing of adjacency matrix for simple GCN model and conversion to tuple representation.""" 70 | adj_normalized = normalize_adj(adj + sp.eye(adj.shape[0])) 71 | return adj_normalized 72 | 73 | def loadWord2Vec(filename): 74 | """Read Word Vectors""" 75 | vocab = [] 76 | embd = [] 77 | word_vector_map = {} 78 | file = open(filename, 'r') 79 | for line in file.readlines(): 80 | row = line.strip().split(' ') 81 | if(len(row) > 2): 82 | vocab.append(row[0]) 83 | vector = row[1:] 84 | length = len(vector) 85 | for i in range(length): 86 | vector[i] = float(vector[i]) 87 | embd.append(vector) 88 | word_vector_map[row[0]] = vector 89 | print('Loaded Word Vectors!') 90 | file.close() 91 | return vocab, embd, word_vector_map 92 | 93 | def clean_str(string): 94 | string = re.sub(r'[?|$|.|!]',r'',string) 95 | string = re.sub(r'[^a-zA-Z0-9 ]',r'',string) 96 | string = re.sub(r"\'s", " \'s", string) 97 | string = re.sub(r"\'ve", " \'ve", string) 98 | string = re.sub(r"n\'t", " n\'t", string) 99 | string = re.sub(r"\'re", " \'re", string) 100 | string = re.sub(r"\'d", " \'d", string) 101 | string = re.sub(r"\'ll", " \'ll", string) 102 | string = re.sub(r",", " , ", string) 103 | string = re.sub(r"!", " ! ", string) 104 | string = re.sub(r"\(", " \( ", string) 105 | string = re.sub(r"\)", " \) ", string) 106 | string = re.sub(r"\?", " \? 
", string) 107 | string = re.sub(r"\s{2,}", " ", string) 108 | return string.strip().lower() 109 | 110 | def sparse_to_torch_sparse(sparse_mx, device='cuda'): 111 | """Convert a scipy sparse matrix to a torch sparse tensor.""" 112 | sparse_mx = sparse_mx.tocoo().astype(np.float32) 113 | indices = torch.from_numpy( 114 | np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64)) 115 | if device == 'cuda': 116 | indices = indices.cuda() 117 | values = torch.from_numpy(sparse_mx.data).cuda() 118 | shape = torch.Size(sparse_mx.shape) 119 | adj = torch.cuda.sparse.FloatTensor(indices, values, shape) 120 | elif device == 'cpu': 121 | values = torch.from_numpy(sparse_mx.data) 122 | shape = torch.Size(sparse_mx.shape) 123 | adj = torch.sparse.FloatTensor(indices, values, shape) 124 | return adj 125 | 126 | def sparse_to_torch_dense(sparse, device='cuda'): 127 | dense = sparse.todense().astype(np.float32) 128 | torch_dense = torch.from_numpy(dense).to(device=device) 129 | return torch_dense 130 | 131 | def sgc_precompute(adj, features, degree, index_dict): 132 | assert degree==1, "Only supporting degree 2 now" 133 | feat_dict = {} 134 | start = perf_counter() 135 | train_feats = features[:, index_dict["train"]].cuda() 136 | train_feats = torch.spmm(adj, train_feats).t() 137 | train_feats_max, _ = train_feats.max(dim=0, keepdim=True) 138 | train_feats_min, _ = train_feats.min(dim=0, keepdim=True) 139 | train_feats_range = train_feats_max-train_feats_min 140 | useful_features_dim = train_feats_range.squeeze().gt(0).nonzero().squeeze() 141 | train_feats = train_feats[:, useful_features_dim] 142 | train_feats_range = train_feats_range[:, useful_features_dim] 143 | train_feats_min = train_feats_min[:, useful_features_dim] 144 | train_feats = (train_feats-train_feats_min)/train_feats_range 145 | feat_dict["train"] = train_feats 146 | for phase in ["test", "val"]: 147 | feats = features[:, index_dict[phase]].cuda() 148 | feats = torch.spmm(adj, feats).t() 149 | feats = feats[:, useful_features_dim] 150 | feat_dict[phase] = ((feats-train_feats_min)/train_feats_range).cpu() # adj is symmetric! 
151 | precompute_time = perf_counter()-start 152 | return feat_dict, precompute_time 153 | 154 | def set_seed(seed, cuda): 155 | np.random.seed(seed) 156 | torch.manual_seed(seed) 157 | if cuda: torch.cuda.manual_seed(seed) 158 | 159 | def print_table(values, columns, epoch): 160 | table = tabulate.tabulate([values], columns, tablefmt='simple', floatfmt='8.4f') 161 | if epoch % 40 == 0: 162 | table = table.split('\n') 163 | table = '\n'.join([table[1]] + table) 164 | else: 165 | table = table.split('\n')[2] 166 | print(table) 167 | -------------------------------------------------------------------------------- /metrics.py: -------------------------------------------------------------------------------- 1 | from sklearn.metrics import f1_score 2 | 3 | def accuracy(output, labels): 4 | preds = output.max(1)[1].type_as(labels) 5 | correct = preds.eq(labels).double() 6 | correct = correct.sum() 7 | return correct / len(labels) 8 | 9 | def f1(output, labels): 10 | preds = output.max(1)[1] 11 | preds = preds.cpu().detach().numpy() 12 | labels = labels.cpu().detach().numpy() 13 | micro = f1_score(labels, preds, average='micro') 14 | macro = f1_score(labels, preds, average='macro') 15 | return micro, macro 16 | -------------------------------------------------------------------------------- /model.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tiiiger/SGC/2c7a2727e82e462d8ef9d6e57f0b08888e16488f/model.jpg -------------------------------------------------------------------------------- /models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.nn import Module 4 | import torch.nn.functional as F 5 | import math 6 | 7 | class SGC(nn.Module): 8 | """ 9 | A Simple PyTorch Implementation of Logistic Regression. 10 | Assuming the features have been preprocessed with k-step graph propagation. 11 | """ 12 | def __init__(self, nfeat, nclass): 13 | super(SGC, self).__init__() 14 | 15 | self.W = nn.Linear(nfeat, nclass) 16 | 17 | def forward(self, x): 18 | return self.W(x) 19 | 20 | class GraphConvolution(Module): 21 | """ 22 | A Graph Convolution Layer (GCN) 23 | """ 24 | 25 | def __init__(self, in_features, out_features, bias=True): 26 | super(GraphConvolution, self).__init__() 27 | self.in_features = in_features 28 | self.out_features = out_features 29 | self.W = nn.Linear(in_features, out_features, bias=bias) 30 | self.init() 31 | 32 | def init(self): 33 | stdv = 1. / math.sqrt(self.W.weight.size(1)) 34 | self.W.weight.data.uniform_(-stdv, stdv) 35 | 36 | def forward(self, input, adj): 37 | support = self.W(input) 38 | output = torch.spmm(adj, support) 39 | return output 40 | class GCN(nn.Module): 41 | """ 42 | A Two-layer GCN.
43 | """ 44 | def __init__(self, nfeat, nhid, nclass, dropout): 45 | super(GCN, self).__init__() 46 | 47 | self.gc1 = GraphConvolution(nfeat, nhid) 48 | self.gc2 = GraphConvolution(nhid, nclass) 49 | self.dropout = dropout 50 | 51 | def forward(self, x, adj, use_relu=True): 52 | x = self.gc1(x, adj) 53 | if use_relu: 54 | x = F.relu(x) 55 | x = F.dropout(x, self.dropout, training=self.training) 56 | x = self.gc2(x, adj) 57 | return x 58 | 59 | def get_model(model_opt, nfeat, nclass, nhid=0, dropout=0, cuda=True): 60 | if model_opt == "GCN": 61 | model = GCN(nfeat=nfeat, 62 | nhid=nhid, 63 | nclass=nclass, 64 | dropout=dropout) 65 | elif model_opt == "SGC": 66 | model = SGC(nfeat=nfeat, 67 | nclass=nclass) 68 | else: 69 | raise NotImplementedError('model:{} is not implemented!'.format(model_opt)) 70 | 71 | if cuda: model.cuda() 72 | return model 73 | -------------------------------------------------------------------------------- /normalization.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.sparse as sp 3 | import torch 4 | 5 | def aug_normalized_adjacency(adj): 6 | adj = adj + sp.eye(adj.shape[0]) 7 | adj = sp.coo_matrix(adj) 8 | row_sum = np.array(adj.sum(1)) 9 | d_inv_sqrt = np.power(row_sum, -0.5).flatten() 10 | d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0. 11 | d_mat_inv_sqrt = sp.diags(d_inv_sqrt) 12 | return d_mat_inv_sqrt.dot(adj).dot(d_mat_inv_sqrt).tocoo() 13 | 14 | def fetch_normalization(type): 15 | switcher = { 16 | 'AugNormAdj': aug_normalized_adjacency, # A' = (D + I)^-1/2 * ( A + I ) * (D + I)^-1/2 17 | } 18 | func = switcher.get(type, lambda: "Invalid normalization technique.") 19 | return func 20 | 21 | def row_normalize(mx): 22 | """Row-normalize sparse matrix""" 23 | rowsum = np.array(mx.sum(1)) 24 | r_inv = np.power(rowsum, -1).flatten() 25 | r_inv[np.isinf(r_inv)] = 0. 
26 | r_mat_inv = sp.diags(r_inv) 27 | mx = r_mat_inv.dot(mx) 28 | return mx 29 | -------------------------------------------------------------------------------- /reddit.py: -------------------------------------------------------------------------------- 1 | from time import perf_counter 2 | import argparse 3 | import numpy as np 4 | import torch 5 | import torch.nn.functional as F 6 | import torch.optim as optim 7 | from utils import load_reddit_data, sgc_precompute, set_seed 8 | from metrics import f1 9 | from models import SGC 10 | 11 | # Args 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument('--no-cuda', action='store_true', default=False, 14 | help='Disables CUDA training.') 15 | parser.add_argument('--inductive', action='store_true', default=False, 16 | help='inductive training.') 17 | parser.add_argument('--test', action='store_true', default=False, 18 | help='inductive training.') 19 | parser.add_argument('--seed', type=int, default=42, help='Random seed.') 20 | parser.add_argument('--epochs', type=int, default=2, 21 | help='Number of epochs to train.') 22 | parser.add_argument('--weight_decay', type=float, default=0, 23 | help='Weight decay (L2 loss on parameters).') 24 | parser.add_argument('--normalization', type=str, default='AugNormAdj', 25 | choices=['NormLap', 'Lap', 'RWalkLap', 'FirstOrderGCN', 26 | 'AugNormAdj', 'NormAdj', 'RWalk', 'AugRWalk', 'NoNorm'], 27 | help='Normalization method for the adjacency matrix.') 28 | parser.add_argument('--model', type=str, default="SGC", 29 | help='model to use.') 30 | parser.add_argument('--degree', type=int, default=2, 31 | help='degree of the approximation.') 32 | 33 | args = parser.parse_args() 34 | args.cuda = not args.no_cuda and torch.cuda.is_available() 35 | 36 | set_seed(args.seed, args.cuda) 37 | 38 | adj, train_adj, features, labels, idx_train, idx_val, idx_test = load_reddit_data(args.normalization, cuda=args.cuda) 39 | print("Finished data loading.") 40 | 41 | model = SGC(features.size(1), labels.max().item()+1) 42 | if args.cuda: model.cuda() 43 | processed_features, precompute_time = sgc_precompute(features, adj, args.degree) 44 | if args.inductive: 45 | train_features, _ = sgc_precompute(features[idx_train], train_adj, args.degree) 46 | else: 47 | train_features = processed_features[idx_train] 48 | 49 | test_features = processed_features[idx_test if args.test else idx_val] 50 | 51 | def train_regression(model, train_features, train_labels, epochs): 52 | optimizer = optim.LBFGS(model.parameters(), lr=1) 53 | model.train() 54 | def closure(): 55 | optimizer.zero_grad() 56 | output = model(train_features) 57 | loss_train = F.cross_entropy(output, train_labels) 58 | loss_train.backward() 59 | return loss_train 60 | t = perf_counter() 61 | for epoch in range(epochs): 62 | loss_train = optimizer.step(closure) 63 | train_time = perf_counter()-t 64 | return model, train_time 65 | 66 | def test_regression(model, test_features, test_labels): 67 | model.eval() 68 | return f1(model(test_features), test_labels) 69 | 70 | model, train_time = train_regression(model, train_features, labels[idx_train], args.epochs) 71 | test_f1, _ = test_regression(model, test_features, labels[idx_test if args.test else idx_val]) 72 | print("Total Time: {:.4f}s, {} F1: {:.4f}".format(train_time+precompute_time, 73 | "Test" if args.test else "Val", 74 | test_f1)) 75 | -------------------------------------------------------------------------------- /requirements.txt: 
-------------------------------------------------------------------------------- 1 | numpy 2 | scipy 3 | networkx==1.11 4 | scikit-learn 5 | hyperopt==0.1.1 6 | -------------------------------------------------------------------------------- /tuning.py: -------------------------------------------------------------------------------- 1 | import time 2 | import argparse 3 | import numpy as np 4 | import pickle as pkl 5 | import os 6 | from math import log 7 | from citation import train_regression 8 | from models import get_model 9 | from utils import sgc_precompute, load_citation, set_seed 10 | from args import get_citation_args 11 | import torch 12 | from hyperopt import fmin, tpe, hp, STATUS_OK, Trials 13 | 14 | # Arguments 15 | args = get_citation_args() 16 | 17 | # setting random seeds 18 | set_seed(args.seed, args.cuda) 19 | 20 | # Hyperparameter optimization 21 | space = {'weight_decay' : hp.loguniform('weight_decay', log(1e-10), log(1e-4))} 22 | 23 | adj, features, labels, idx_train, idx_val, idx_test = load_citation(args.dataset, args.normalization, args.cuda) 24 | if args.model == "SGC": features, precompute_time = sgc_precompute(features, adj, args.degree) 25 | 26 | def sgc_objective(space): 27 | model = get_model(args.model, features.size(1), labels.max().item()+1, args.hidden, args.dropout, args.cuda) 28 | model, acc_val, _ = train_regression(model, features[idx_train], labels[idx_train], features[idx_val], labels[idx_val], 29 | args.epochs, space['weight_decay'], args.lr, args.dropout) 30 | print('weight decay: {:.2e} '.format(space['weight_decay']) + 'accuracy: {:.4f}'.format(acc_val)) 31 | return {'loss': -acc_val, 'status': STATUS_OK} 32 | 33 | best = fmin(sgc_objective, space=space, algo=tpe.suggest, max_evals=60) 34 | print("Best weight decay: {:.2e}".format(best["weight_decay"])) 35 | 36 | os.makedirs("./{}-tuning".format(args.model), exist_ok=True) 37 | path = '{}-tuning/{}.txt'.format(args.model, args.dataset) 38 | with open(path, 'wb') as f: pkl.dump(best, f) 39 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.sparse as sp 3 | import torch 4 | import sys 5 | import pickle as pkl 6 | import networkx as nx 7 | from normalization import fetch_normalization, row_normalize 8 | from time import perf_counter 9 | 10 | def parse_index_file(filename): 11 | """Parse index file.""" 12 | index = [] 13 | for line in open(filename): 14 | index.append(int(line.strip())) 15 | return index 16 | 17 | def preprocess_citation(adj, features, normalization="FirstOrderGCN"): 18 | adj_normalizer = fetch_normalization(normalization) 19 | adj = adj_normalizer(adj) 20 | features = row_normalize(features) 21 | return adj, features 22 | 23 | def sparse_mx_to_torch_sparse_tensor(sparse_mx): 24 | """Convert a scipy sparse matrix to a torch sparse tensor.""" 25 | sparse_mx = sparse_mx.tocoo().astype(np.float32) 26 | indices = torch.from_numpy( 27 | np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64)) 28 | values = torch.from_numpy(sparse_mx.data) 29 | shape = torch.Size(sparse_mx.shape) 30 | return torch.sparse.FloatTensor(indices, values, shape) 31 | 32 | def load_citation(dataset_str="cora", normalization="AugNormAdj", cuda=True): 33 | """ 34 | Load Citation Networks Datasets. 
35 | """ 36 | names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph'] 37 | objects = [] 38 | for i in range(len(names)): 39 | with open("data/ind.{}.{}".format(dataset_str.lower(), names[i]), 'rb') as f: 40 | if sys.version_info > (3, 0): 41 | objects.append(pkl.load(f, encoding='latin1')) 42 | else: 43 | objects.append(pkl.load(f)) 44 | 45 | x, y, tx, ty, allx, ally, graph = tuple(objects) 46 | test_idx_reorder = parse_index_file("data/ind.{}.test.index".format(dataset_str)) 47 | test_idx_range = np.sort(test_idx_reorder) 48 | 49 | if dataset_str == 'citeseer': 50 | # Fix citeseer dataset (there are some isolated nodes in the graph) 51 | # Find isolated nodes, add them as zero-vecs into the right position 52 | test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder)+1) 53 | tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1])) 54 | tx_extended[test_idx_range-min(test_idx_range), :] = tx 55 | tx = tx_extended 56 | ty_extended = np.zeros((len(test_idx_range_full), y.shape[1])) 57 | ty_extended[test_idx_range-min(test_idx_range), :] = ty 58 | ty = ty_extended 59 | 60 | features = sp.vstack((allx, tx)).tolil() 61 | features[test_idx_reorder, :] = features[test_idx_range, :] 62 | adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph)) 63 | adj = adj + adj.T.multiply(adj.T > adj) - adj.multiply(adj.T > adj) 64 | labels = np.vstack((ally, ty)) 65 | labels[test_idx_reorder, :] = labels[test_idx_range, :] 66 | 67 | idx_test = test_idx_range.tolist() 68 | idx_train = range(len(y)) 69 | idx_val = range(len(y), len(y)+500) 70 | 71 | adj, features = preprocess_citation(adj, features, normalization) 72 | 73 | # porting to pytorch 74 | features = torch.FloatTensor(np.array(features.todense())).float() 75 | labels = torch.LongTensor(labels) 76 | labels = torch.max(labels, dim=1)[1] 77 | adj = sparse_mx_to_torch_sparse_tensor(adj).float() 78 | idx_train = torch.LongTensor(idx_train) 79 | idx_val = torch.LongTensor(idx_val) 80 | idx_test = torch.LongTensor(idx_test) 81 | 82 | if cuda: 83 | features = features.cuda() 84 | adj = adj.cuda() 85 | labels = labels.cuda() 86 | idx_train = idx_train.cuda() 87 | idx_val = idx_val.cuda() 88 | idx_test = idx_test.cuda() 89 | 90 | return adj, features, labels, idx_train, idx_val, idx_test 91 | 92 | def sgc_precompute(features, adj, degree): 93 | t = perf_counter() 94 | for i in range(degree): 95 | features = torch.spmm(adj, features) 96 | precompute_time = perf_counter()-t 97 | return features, precompute_time 98 | 99 | def set_seed(seed, cuda): 100 | np.random.seed(seed) 101 | torch.manual_seed(seed) 102 | if cuda: torch.cuda.manual_seed(seed) 103 | 104 | def loadRedditFromNPZ(dataset_dir): 105 | adj = sp.load_npz(dataset_dir+"reddit_adj.npz") 106 | data = np.load(dataset_dir+"reddit.npz") 107 | 108 | return adj, data['feats'], data['y_train'], data['y_val'], data['y_test'], data['train_index'], data['val_index'], data['test_index'] 109 | 110 | def load_reddit_data(data_path="data/", normalization="AugNormAdj", cuda=True): 111 | adj, features, y_train, y_val, y_test, train_index, val_index, test_index = loadRedditFromNPZ("data/") 112 | labels = np.zeros(adj.shape[0]) 113 | labels[train_index] = y_train 114 | labels[val_index] = y_val 115 | labels[test_index] = y_test 116 | adj = adj + adj.T 117 | train_adj = adj[train_index, :][:, train_index] 118 | features = torch.FloatTensor(np.array(features)) 119 | features = (features-features.mean(dim=0))/features.std(dim=0) 120 | adj_normalizer = fetch_normalization(normalization) 121 | adj = 
adj_normalizer(adj) 122 | adj = sparse_mx_to_torch_sparse_tensor(adj).float() 123 | train_adj = adj_normalizer(train_adj) 124 | train_adj = sparse_mx_to_torch_sparse_tensor(train_adj).float() 125 | labels = torch.LongTensor(labels) 126 | if cuda: 127 | adj = adj.cuda() 128 | train_adj = train_adj.cuda() 129 | features = features.cuda() 130 | labels = labels.cuda() 131 | return adj, train_adj, features, labels, train_index, val_index, test_index 132 | --------------------------------------------------------------------------------
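Putting the pieces above together, `load_citation`, `sgc_precompute`, the linear `SGC` model from `models.py`, and `metrics.accuracy` are all the citation-network pipeline needs. Below is a minimal end-to-end sketch on Cora; the optimizer choice and the learning-rate / weight-decay / epoch values are illustrative assumptions for this sketch, not the tuned settings that `tuning.py` writes to `SGC-tuning/`.

```python
import torch
import torch.nn.functional as F
import torch.optim as optim

from utils import load_citation, sgc_precompute, set_seed
from models import SGC
from metrics import accuracy

set_seed(42, cuda=False)

# Load and normalize Cora, then precompute the k-step propagated features S^k X (k=2).
adj, features, labels, idx_train, idx_val, idx_test = load_citation("cora", "AugNormAdj", cuda=False)
features, precompute_time = sgc_precompute(features, adj, degree=2)

# SGC is just a linear classifier on the precomputed features.
model = SGC(nfeat=features.size(1), nclass=labels.max().item() + 1)
optimizer = optim.Adam(model.parameters(), lr=0.2, weight_decay=5e-6)  # assumed values

for epoch in range(100):  # assumed epoch budget
    model.train()
    optimizer.zero_grad()
    loss = F.cross_entropy(model(features[idx_train]), labels[idx_train])
    loss.backward()
    optimizer.step()

model.eval()
with torch.no_grad():
    val_acc = accuracy(model(features[idx_val]), labels[idx_val]).item()
print("precompute: {:.4f}s, val acc: {:.4f}".format(precompute_time, val_acc))
```

Because the propagation is done once up front, the training loop itself touches only a single linear layer, which is what keeps the training times in the table at the top of the repository so low.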