├── src ├── __init__.py ├── DPGGAN │ ├── __init__.py │ ├── DPCounter.py │ ├── gcn_layer.py │ ├── px_expander.py │ ├── linear.py │ ├── adadp.py │ ├── utils_dp.py │ ├── functional.py │ ├── dp_aggregators.py │ ├── dp_encoders.py │ ├── model.py │ ├── gaussian_moments.py │ └── data_utils.py ├── GGAN │ ├── __init__.py │ ├── draw.py │ ├── linear.py │ ├── encoders.py │ ├── functional.py │ ├── logger.py │ ├── utils.py │ ├── graph_drawer.py │ ├── aggregators.py │ ├── dataloader.py │ ├── train.py │ ├── config.py │ ├── main.py │ └── model.py ├── logger.py ├── utils.py ├── graph_drawer.py ├── dataloader.py ├── test.py ├── train.py ├── config.py ├── main.py └── eval.py ├── graph_classification_exp ├── __init__.py ├── models │ ├── __init__.py │ ├── mlp.py │ └── graphcnn.py ├── README.md ├── sample_IMDBMULTI.py ├── preprocess_nx_data.py ├── random_pred.py ├── util.py └── main.py ├── data.zip ├── link_classification_exp ├── node2vec │ ├── requirements.txt │ ├── .gitignore │ ├── graph │ │ └── karate.edgelist │ ├── LICENSE.md │ ├── README.md │ └── src │ │ ├── main.py │ │ └── node2vec.py ├── test.png ├── draw.py ├── preprocess.py └── main.py ├── run_graph_classification_exp.sh ├── .gitignore ├── run.sh ├── run_link_classification_exp.sh ├── requirements.txt ├── README.md └── environment.yml /src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/DPGGAN/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/GGAN/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /graph_classification_exp/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /graph_classification_exp/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/haonan3/Secure-Network-Release-with-Link-Privacy/HEAD/data.zip -------------------------------------------------------------------------------- /link_classification_exp/node2vec/requirements.txt: -------------------------------------------------------------------------------- 1 | networkx==1.11 2 | numpy==1.11.2 3 | gensim==0.13.3 4 | -------------------------------------------------------------------------------- /link_classification_exp/test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/haonan3/Secure-Network-Release-with-Link-Privacy/HEAD/link_classification_exp/test.png -------------------------------------------------------------------------------- /link_classification_exp/node2vec/.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | .DS_Store 3 | target 4 | bin 5 | build 6 | .gradle 7 | *.iml 8 | *.ipr 9 | *.iws 10 | *.log 11 | .classpath 12 | .project 13 | .settings 14 | .idea -------------------------------------------------------------------------------- /src/DPGGAN/DPCounter.py: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | class DPCounter: 4 | def __init__(self, args, model_args): 5 | self.T = 0 6 | self.eps = 0 7 | self.delta = model_args.delta 8 | self.should_stop = False 9 | self.sigma = model_args.noise_sigma 10 | self.q = float(args.batch_size) / (args.num_samples) -------------------------------------------------------------------------------- /run_graph_classification_exp.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export PYTHONPATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 3 | 4 | python graph_classification_exp/preprocess_nx_data.py --dataset relabeled_dblp2 --model orig 5 | 6 | CUDA_VISIBLE_DEVICES=1 python graph_classification_exp/main.py --hidden_dim 256 --epochs 300 --lr 0.0005 --dataset relabeled_dblp2 & -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | log/ 2 | data/ 3 | src/__pycache__ 4 | src/graph_drawer.py 5 | src/test.py 6 | src/GGAN/__pycache__ 7 | src/DPGGAN/__pycache__ 8 | graph_classification_exp/__pycache__ 9 | graph_classification_exp/models/__pycache__ 10 | graph_classification_exp/logs 11 | graph_classification_exp/dataset 12 | link_classification_exp/dataset 13 | link_classification_exp/log 14 | .DS_Store 15 | .idea -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export PYTHONPATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 3 | 4 | python src/main.py --model_name GGAN --dataset new_dblp2 & 5 | 6 | python src/main.py --model_name GGAN --dataset new_IMDB_MULTI & 7 | 8 | python src/main.py --model_name GVAE --dataset new_dblp2 & 9 | 10 | python src/main.py --model_name GVAE --dataset new_IMDB_MULTI & -------------------------------------------------------------------------------- /run_link_classification_exp.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export PYTHONPATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 3 | 4 | CUDA_VISIBLE_DEVICES=1 python link_classification_exp/main.py --graph_name new_IMDB_MULTI --graph_type 'GGAN_{}' --epochs 40 & 5 | 6 | CUDA_VISIBLE_DEVICES=2 python link_classification_exp/main.py --graph_name new_dblp2 --graph_type 'GGAN_{}' --epochs 40 & -------------------------------------------------------------------------------- /graph_classification_exp/README.md: -------------------------------------------------------------------------------- 1 | The model we used for graph classification experiment is from How Powerful are Graph Neural Networks?(GIN) 2 | 3 | The code is from the official PyTorch implementation of the experiments in the following paper: 4 | 5 | Keyulu Xu*, Weihua Hu*, Jure Leskovec, Stefanie Jegelka. How Powerful are Graph Neural Networks? ICLR 2019. 
6 | 7 | [arXiv](https://arxiv.org/abs/1810.00826) [OpenReview](https://openreview.net/forum?id=ryGs6iA5Km) -------------------------------------------------------------------------------- /link_classification_exp/node2vec/graph/karate.edgelist: -------------------------------------------------------------------------------- 1 | 1 32 2 | 1 22 3 | 1 20 4 | 1 18 5 | 1 14 6 | 1 13 7 | 1 12 8 | 1 11 9 | 1 9 10 | 1 8 11 | 1 7 12 | 1 6 13 | 1 5 14 | 1 4 15 | 1 3 16 | 1 2 17 | 2 31 18 | 2 22 19 | 2 20 20 | 2 18 21 | 2 14 22 | 2 8 23 | 2 4 24 | 2 3 25 | 3 14 26 | 3 9 27 | 3 10 28 | 3 33 29 | 3 29 30 | 3 28 31 | 3 8 32 | 3 4 33 | 4 14 34 | 4 13 35 | 4 8 36 | 5 11 37 | 5 7 38 | 6 17 39 | 6 11 40 | 6 7 41 | 7 17 42 | 9 34 43 | 9 33 44 | 9 33 45 | 10 34 46 | 14 34 47 | 15 34 48 | 15 33 49 | 16 34 50 | 16 33 51 | 19 34 52 | 19 33 53 | 20 34 54 | 21 34 55 | 21 33 56 | 23 34 57 | 23 33 58 | 24 30 59 | 24 34 60 | 24 33 61 | 24 28 62 | 24 26 63 | 25 32 64 | 25 28 65 | 25 26 66 | 26 32 67 | 27 34 68 | 27 30 69 | 28 34 70 | 29 34 71 | 29 32 72 | 30 34 73 | 30 33 74 | 31 34 75 | 31 33 76 | 32 34 77 | 32 33 78 | 33 34 -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | certifi==2020.12.5 2 | cffi @ file:///home/conda/feedstock_root/build_artifacts/cffi_1606601121339/work 3 | cycler==0.10.0 4 | decorator==4.4.2 5 | gensim==3.8.3 6 | joblib==1.0.0 7 | kiwisolver==1.3.1 8 | matplotlib==3.3.3 9 | mpmath==1.1.0 10 | networkx==2.5 11 | numpy @ file:///home/conda/feedstock_root/build_artifacts/numpy_1610324545699/work 12 | olefile @ file:///home/conda/feedstock_root/build_artifacts/olefile_1602866521163/work 13 | Pillow @ file:///home/conda/feedstock_root/build_artifacts/pillow_1610407356860/work 14 | powerlaw==1.4.6 15 | pycparser @ file:///home/conda/feedstock_root/build_artifacts/pycparser_1593275161868/work 16 | pyparsing==2.4.7 17 | python-dateutil==2.8.1 18 | python-igraph==0.8.3 19 | scikit-learn==0.24.0 20 | scipy==1.5.4 21 | six @ file:///home/conda/feedstock_root/build_artifacts/six_1590081179328/work 22 | smart-open==4.1.2 23 | texttable==1.6.3 24 | threadpoolctl==2.1.0 25 | torch==1.1.0 26 | torchvision==0.3.0 27 | tqdm==4.56.0 28 | -------------------------------------------------------------------------------- /link_classification_exp/draw.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | import sys 5 | import pickle 6 | from scipy import stats 7 | 8 | models = ['Original', 'GGAN', 'DPGGAN'] 9 | styles = ['k1-', 'g1-', 'r.--'] #, 'g*--', 'b^--'] 10 | mean = {} 11 | mean['Original'] = [[0.1, 1.0, 10.0], [0.8661, 0.8661, 0.8661]] 12 | mean['GGAN'] = [[0.1, 1.0, 10.0], [0.6316, 0.6316, 0.6316]] 13 | mean['DPGGAN'] = [[0.1, 1.0, 10.0], [0.5798, 0.5889, 0.5931]] 14 | 15 | for m in models: 16 | plt.plot(mean[m][0], mean[m][1], styles[models.index(m)], label=m) 17 | 18 | plt.grid(linestyle='--', linewidth=0.5) 19 | plt.xlim(0, 10.1) # ind. 
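# Descriptive note (inferred from this script's own labels and data): the x-axis
# values (0.1, 1.0, 10.0) are the privacy budgets epsilon and the y-axis values are
# link-prediction AUC scores; 'Original' and 'GGAN' do not depend on epsilon, so they
# appear as flat reference lines.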
20 | plt.ylim(0.5, 0.9) #.MSG 21 | 22 | plt.xlabel('epsilon', fontsize=12) 23 | plt.ylabel('AUC', fontsize=12) 24 | plt.yticks(fontsize=12) 25 | plt.xticks(fontsize=12) 26 | plt.legend(fontsize=10, loc='lower right', ncol=1) 27 | plt.tight_layout() 28 | 29 | plt.savefig("test.png", format='png', dpi=200, bbox_inches='tight') 30 | plt.show() -------------------------------------------------------------------------------- /src/GGAN/draw.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | 4 | 5 | 6 | models = ['Original','GGAN (no DP)', 7 | 'GVAE','NetGAN',#'GraphRNN', 8 | 'DPGGAN\nepsilon=10', 'DPGGAN\nepsilon=1', 'DPGGAN\nepsilon=0.1'] 9 | 10 | 11 | 12 | mean_imdb = [0.8661,0.7743, 13 | 0.7714,0.7619, 14 | 0.5931,0.5889,0.5798] 15 | mean_dblp = [0.6824,0.6637, 16 | 0.7463,0.6536, 17 | 0.5527,0.5328,0.5137] 18 | y = range(7) 19 | 20 | plt.figure(figsize=(6,3)) 21 | bar_color=plt.get_cmap('RdYlGn')(np.linspace(0.15, 0.85, 2)) 22 | 23 | bar1 = plt.barh(y=[i + 0.2 for i in y], height = 0.4,width=mean_dblp, 24 | alpha = 0.8, color = bar_color[0],label = 'DBLP') 25 | 26 | bar2 = plt.barh(y=[i - 0.2 for i in y],height =0.4,width = mean_imdb, 27 | alpha = 0.8,color =bar_color[1],label = 'IMDB') 28 | 29 | plt.yticks(y,models) 30 | plt.xlim(0.5,0.9) 31 | plt.ylabel('Models') 32 | plt.xlabel('Accuracy') 33 | plt.legend() 34 | plt.tight_layout() 35 | 36 | 37 | plt.savefig("link_pred.png", format='png', dpi=200, bbox_inches='tight') 38 | plt.show() -------------------------------------------------------------------------------- /link_classification_exp/node2vec/LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016 Aditya Grover 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /src/DPGGAN/gcn_layer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch.nn.modules.module import Module 4 | from torch.nn.parameter import Parameter 5 | 6 | 7 | class GraphConvolution(Module): 8 | """ 9 | Simple GCN layer, similar to https://arxiv.org/abs/1609.02907 10 | """ 11 | 12 | def __init__(self, in_features, out_features, dropout=0., act=F.relu): 13 | super(GraphConvolution, self).__init__() 14 | self.in_features = in_features 15 | self.out_features = out_features 16 | self.dropout = dropout 17 | self.act = act 18 | self.weight = Parameter(torch.FloatTensor(in_features, out_features)) 19 | self.reset_parameters() 20 | 21 | def reset_parameters(self): 22 | torch.nn.init.xavier_uniform_(self.weight) 23 | 24 | def forward(self, input, adj): 25 | input = F.dropout(input, self.dropout, self.training) 26 | support = torch.mm(input, self.weight) 27 | output = torch.mm(adj, support) 28 | output = self.act(output) 29 | return output 30 | 31 | def __repr__(self): 32 | return self.__class__.__name__ + ' (' \ 33 | + str(self.in_features) + ' -> ' \ 34 | + str(self.out_features) + ')' -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Secure Deep Graph Generation with Link Differential Privacy 2 | 3 | This repository is the PyTorch implementation of DPGGan (IJCAI 2021). 4 | 5 | [arXiv](https://arxiv.org/abs/2005.00455) 6 | 7 | If you make use of the code/experiment, please cite our paper (Bibtex below). 8 | 9 | ``` 10 | @inproceedings{yang2020secure, 11 | title={Secure Deep Graph Generation with Link Differential Privacy}, 12 | author={Carl Yang and Haonan Wang and Ke Zhang and Liang Chen and Lichao Sun}, 13 | year={2021}, 14 | booktitle={The International Joint Conference on Artificial Intelligence (IJCAI)}, 15 | } 16 | 17 | ``` 18 | 19 | Contact: Haonan Wang (haonan3@illinois.edu), Carl Yang (yangji9181@gmail.com) 20 | 21 | 22 | ## Installation 23 | Install PyTorch following the instuctions on the [official website] (https://pytorch.org/). The code has been tested over PyTorch 1.1.0 versions. 24 | 25 | Then install the other dependencies. 26 | ``` 27 | conda env create -f environment.yml 28 | 29 | conda activate dpggan 30 | 31 | pip install -r requirements.txt 32 | ``` 33 | 34 | ## Test run 35 | Unzip the dataset file 36 | ``` 37 | unzip data.zip 38 | ``` 39 | 40 | and run 41 | 42 | ``` 43 | sh run.sh 44 | ``` 45 | 46 | Default parameters are not the best performing-hyper-parameters. Hyper-parameters need to be specified through the commandline arguments. 47 | 48 | For graph classification experiment and link prediction experiment, please refer `run_graph_classification_exp.sh` and `run_link_classification_exp.sh`. 49 | -------------------------------------------------------------------------------- /src/DPGGAN/px_expander.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import Variable 3 | 4 | # clip and accumulate clipped gradients 5 | def acc_scaled_grads(model, C, cum_grads): 6 | # this two 'batch size' should be equal. 
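    # Overview of the per-example DP mechanism implemented in this module:
    #   1. acc_scaled_grads() computes each example's gradient norm, scales gradients
    #      whose norm exceeds C down to norm C, and sums the clipped per-example
    #      gradients into cum_grads.
    #   2. add_noise_with_cum_grads() adds Gaussian noise with std sigma*C to the
    #      summed gradients and writes the averaged, noised result back into p.grad,
    #      so a subsequent optimizer step applies a differentially private update.
    # Parameters with 2-D weights (no per-example leading dimension) are skipped by both.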
7 | assert model.batch_size == model.batch_proc_size 8 | batch_size = model.batch_proc_size 9 | g_norm = Variable(torch.zeros(batch_size), requires_grad=False) 10 | counter1 = 0 11 | counter2 = 0 12 | g_norm = {} 13 | for p in filter(lambda p: p.requires_grad, model.parameters()): 14 | if len(p.data.shape) == 2: 15 | continue 16 | counter2 += 1 17 | if p.grad is not None: 18 | counter1 += 1 19 | g_norm[str(counter2)] = torch.sqrt(torch.sum(p.grad.view(p.shape[0], -1) ** 2, 1)) 20 | 21 | # do clipping and accumulate 22 | for p, key in zip(filter(lambda p: p.requires_grad, model.parameters()), cum_grads.keys()): 23 | if len(p.data.shape) == 2: 24 | continue 25 | if p is not None: 26 | cum_grads[key] = torch.sum((p.grad / torch.clamp(g_norm[key].contiguous().view(-1, 1, 1) / C, min=1)), dim=0) 27 | 28 | 29 | # add noise and replace model grads with cumulative grads 30 | def add_noise_with_cum_grads(model, C, sigma, cum_grads, samp_num): 31 | for p, key in zip(filter(lambda p: p.requires_grad, model.parameters()), cum_grads.keys()): 32 | if len(p.data.shape) == 2: 33 | continue 34 | proc_size = model.batch_size 35 | if key == '1': 36 | proc_size = proc_size * (samp_num+1) 37 | if p.grad is not None: 38 | # add noise to summed clipped pars 39 | if proc_size > 1: 40 | p.grad = ((cum_grads[key].expand(proc_size, -1, -1) +Variable((sigma * C)*torch.normal(mean=torch.zeros_like(p.grad[0]).data, std=1.0).expand(proc_size, -1, -1))) / proc_size) 41 | # p.grad = (torch.sum((p.grad), dim=0).expand(proc_size, -1, -1)) / proc_size -------------------------------------------------------------------------------- /graph_classification_exp/models/mlp.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | ###MLP with lienar output 6 | class MLP(nn.Module): 7 | def __init__(self, num_layers, input_dim, hidden_dim, output_dim): 8 | ''' 9 | num_layers: number of layers in the neural networks (EXCLUDING the input layer). If num_layers=1, this reduces to linear model. 
10 | input_dim: dimensionality of input features 11 | hidden_dim: dimensionality of hidden units at ALL layers 12 | output_dim: number of classes for prediction 13 | device: which device to use 14 | ''' 15 | 16 | super(MLP, self).__init__() 17 | 18 | self.linear_or_not = True #default is linear model 19 | self.num_layers = num_layers 20 | 21 | if num_layers < 1: 22 | raise ValueError("number of layers should be positive!") 23 | elif num_layers == 1: 24 | #Linear model 25 | self.linear = nn.Linear(input_dim, output_dim) 26 | else: 27 | #Multi-layer model 28 | self.linear_or_not = False 29 | self.linears = torch.nn.ModuleList() 30 | self.batch_norms = torch.nn.ModuleList() 31 | 32 | self.linears.append(nn.Linear(input_dim, hidden_dim)) 33 | for layer in range(num_layers - 2): 34 | self.linears.append(nn.Linear(hidden_dim, hidden_dim)) 35 | self.linears.append(nn.Linear(hidden_dim, output_dim)) 36 | 37 | for layer in range(num_layers - 1): 38 | self.batch_norms.append(nn.BatchNorm1d((hidden_dim))) 39 | 40 | def forward(self, x): 41 | if self.linear_or_not: 42 | #If linear model 43 | return self.linear(x) 44 | else: 45 | #If MLP 46 | h = x 47 | for layer in range(self.num_layers - 1): 48 | h = F.relu(self.batch_norms[layer](self.linears[layer](h))) 49 | return self.linears[self.num_layers - 1](h) -------------------------------------------------------------------------------- /src/DPGGAN/linear.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 3 | Linear module modified for the expander and clipping individual gradients. 4 | 5 | This code is due to Mikko Heikkilä (@mixheikk) 6 | 7 | ''' 8 | import math 9 | 10 | import torch 11 | from torch.nn.parameter import Parameter 12 | import src.DPGGAN.functional as F 13 | from torch.nn.modules import Module 14 | 15 | 16 | # The difference between original Linear and custom Linear is create Parameter for each item 17 | class Linear(Module): 18 | def __init__(self, in_features, out_features, bias=True, batch_size = None): 19 | super(Linear, self).__init__() 20 | self.in_features = in_features 21 | self.out_features = out_features 22 | self.batch_size = batch_size 23 | if batch_size is not None: 24 | self.weight = Parameter(torch.Tensor(batch_size, out_features, in_features)) 25 | else: 26 | self.weight = Parameter(torch.Tensor(out_features, in_features)) 27 | if bias: 28 | if batch_size is not None: 29 | self.bias = Parameter(torch.Tensor(batch_size, out_features)) 30 | else: 31 | self.bias = Parameter(torch.Tensor(out_features)) 32 | else: 33 | self.register_parameter('bias', None) 34 | self.reset_parameters() 35 | 36 | def reset_parameters(self): 37 | stdv = 1. 
/ math.sqrt(self.weight.size(1)) 38 | self.weight.data.uniform_(-stdv, stdv) 39 | if self.bias is not None: 40 | self.bias.data.uniform_(-stdv, stdv) 41 | 42 | def forward(self, input, for_test=False): 43 | if len(input.shape) == 2 and not for_test: 44 | input = input.view(input.shape[0],1,input.shape[1]) 45 | return F.linear(input, self.weight, self.bias, for_test=for_test) 46 | 47 | def __repr__(self): 48 | return self.__class__.__name__ + '(' \ 49 | + 'in_features=' + str(self.in_features) \ 50 | + ', out_features=' + str(self.out_features) \ 51 | + ', bias=' + str(self.bias is not None) + ')' 52 | -------------------------------------------------------------------------------- /src/GGAN/linear.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 3 | Linear module modified for the expander and clipping individual gradients. 4 | 5 | This code is due to Mikko Heikkilä (@mixheikk) 6 | 7 | ''' 8 | import math 9 | 10 | import torch 11 | from torch.nn.parameter import Parameter 12 | from torch.nn.modules import Module 13 | 14 | 15 | # The difference between original Linear and custom Linear is create Parameter for each item 16 | class Linear(Module): 17 | def __init__(self, in_features, out_features, bias=True, batch_size = None): 18 | super(Linear, self).__init__() 19 | self.in_features = in_features 20 | self.out_features = out_features 21 | self.batch_size = batch_size 22 | if batch_size is not None: 23 | self.weight = Parameter(torch.Tensor(batch_size, in_features, out_features)) 24 | else: 25 | self.weight = Parameter(torch.Tensor(in_features, out_features)) 26 | if bias: 27 | if batch_size is not None: 28 | self.bias = Parameter(torch.Tensor(batch_size, out_features)) 29 | else: 30 | self.bias = Parameter(torch.Tensor(out_features)) 31 | else: 32 | self.register_parameter('bias', None) 33 | self.reset_parameters() 34 | 35 | def reset_parameters(self): 36 | stdv = 1. 
/ math.sqrt(self.weight.size(1)) 37 | self.weight.data.uniform_(-stdv, stdv) 38 | if self.bias is not None: 39 | self.bias.data.uniform_(-stdv, stdv) 40 | 41 | def forward(self, input, nodes): 42 | hidden = input.view(input.shape[0], 1, input.shape[1]) 43 | sub_weight = self.weight[nodes,] 44 | output = hidden.matmul(sub_weight) 45 | output = output.view(output.shape[0], output.shape[2]) 46 | return output 47 | 48 | def __repr__(self): 49 | return self.__class__.__name__ + '(' \ 50 | + 'in_features=' + str(self.in_features) \ 51 | + ', out_features=' + str(self.out_features) \ 52 | + ', bias=' + str(self.bias is not None) + ')' 53 | -------------------------------------------------------------------------------- /src/GGAN/encoders.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.nn import init 4 | import torch.nn.functional as F 5 | 6 | class Encoder(nn.Module): 7 | """ 8 | Encodes a node's using 'convolutional' GraphSage approach 9 | """ 10 | def __init__(self, feature_dim, embed_dim, adj_lists, aggregator, first_layer,num_sample=10, gcn=False, cuda=False): 11 | super(Encoder, self).__init__() 12 | self.feat_dim = feature_dim 13 | self.adj_lists = adj_lists 14 | self.aggregator = aggregator 15 | self.num_sample = num_sample 16 | self.first_layer = first_layer 17 | 18 | self.gcn = gcn 19 | self.embed_dim = embed_dim 20 | self.cuda = cuda 21 | self.aggregator.cuda = cuda 22 | if self.gcn: 23 | output_dim = self.feat_dim 24 | else: 25 | output_dim = 2 * self.feat_dim 26 | self.weight = nn.Parameter(torch.FloatTensor(embed_dim, output_dim)) 27 | init.xavier_uniform_(self.weight) 28 | 29 | def forward(self, features, nodes, samp_neighs=None, feature_dict=None): 30 | """ 31 | Generates embeddings for a batch of nodes. 32 | 33 | nodes -- list of nodes 34 | """ 35 | if self.first_layer: 36 | neigh_feats = self.aggregator.forward(features, nodes, 37 | [self.adj_lists[int(node)] for node in nodes], 38 | self.num_sample) 39 | else: 40 | assert (samp_neighs != None) 41 | assert (feature_dict != None) 42 | neigh_feats = self.aggregator.forward(features, nodes, samp_neighs, 43 | self.num_sample, feature_dict=feature_dict) 44 | if not self.gcn: 45 | if self.first_layer: 46 | self_feats = features(torch.LongTensor(nodes)) 47 | else: 48 | self_feats = features[nodes] 49 | combined = torch.cat([self_feats, neigh_feats], dim=1) 50 | else: 51 | combined = neigh_feats 52 | combined = F.relu(self.weight.mm(combined.t())) 53 | return combined.t() 54 | -------------------------------------------------------------------------------- /src/GGAN/functional.py: -------------------------------------------------------------------------------- 1 | 2 | """Functional interface""" 3 | 4 | import warnings 5 | import math 6 | from operator import mul 7 | from functools import reduce 8 | import sys 9 | 10 | import torch 11 | #from torch._C import _infer_size, _add_docstr 12 | #from . 
import _functions 13 | from torch.nn import _functions 14 | #from .modules import utils 15 | from torch.nn.modules import utils 16 | #from ._functions.linear import Bilinear 17 | #from torch.nn._functions.linear import Bilinear 18 | #from ._functions.padding import ConstantPadNd 19 | #from torch.nn._functions.padding import ConstantPadNd 20 | #from ._functions import vision 21 | #from torch.nn._functions import vision 22 | #from ._functions.thnn.fold import Col2Im, Im2Col 23 | #from torch.nn._functions.thnn.fold import Col2Im,Im2Col 24 | from torch.autograd import Variable 25 | #from .modules.utils import _single, _pair, _triple 26 | #from torch.nn.modules.utils import _single, _pair, _triple 27 | 28 | 29 | ''' 30 | Linear layer modified for PX gradients 31 | 32 | The code is due to Mikko Heikkilä (@mixheikk) 33 | ''' 34 | 35 | 36 | # Note: bias not checked yet 37 | def linear(input, weight, bias=None, batch_size=None, nodes=None): 38 | """ 39 | Applies a linear transformation to the incoming data: :math:`y = xA^T + b`. 40 | 41 | Shape: 42 | - Input: :math:`(N, *, in\_features)` where `*` means any number of 43 | additional dimensions 44 | - Weight: :math:`(out\_features, in\_features)` 45 | - Bias: :math:`(out\_features)` 46 | - Output: :math:`(N, *, out\_features)` 47 | """ 48 | if input.dim() == 2 and bias is not None: 49 | # fused op is marginally faster 50 | print("!!!!") 51 | sys.exit(1) 52 | if batch_size is None: 53 | return torch.addmm(bias, input, weight.t()) 54 | else: 55 | print('fused op in functional.linear not implemented yet!') 56 | sys.exit(1) 57 | return torch.addmm(bias, input, weight.t()) 58 | 59 | sub_weight = weight[nodes,] 60 | output = input.matmul(torch.transpose(sub_weight,-2,-1)) 61 | 62 | # kts bias kun muu toimii 63 | if bias is not None: 64 | output += bias 65 | return output 66 | -------------------------------------------------------------------------------- /src/logger.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | 4 | dir_path = os.path.dirname(os.path.realpath(__file__)) 5 | root_path = os.path.abspath(os.path.join(dir_path, os.pardir)) 6 | 7 | 8 | class stat_logger: 9 | def __init__(self, args, current_time): 10 | self.log_file_name = '{}_{}_{}.txt'.format(current_time, args.model_name, args.dataset_str) 11 | self.log_save_folder = root_path + '/log/txt_log/' 12 | self.save_path = self.log_save_folder + self.log_file_name 13 | 14 | 15 | def check_folder(self): 16 | pass 17 | 18 | def write(self, log_info): 19 | with open(self.save_path, 'a') as log_file: 20 | log_file.write(log_info + '\n') 21 | 22 | 23 | def form_generated_stat_log(self, epoch, property_cache): 24 | stat_vec_cache = [] 25 | assert len(property_cache) > 1 26 | for property in property_cache: 27 | _, stat_vec = self.dict_to_vec(property) 28 | stat_vec_cache.append(stat_vec) 29 | stat_vec_mean = np.array(stat_vec_cache).mean(axis=0) 30 | stat_vec_str = ["%.3f" % number for number in stat_vec_mean] 31 | stat_vec_log = ' '.join(stat_vec_str) 32 | log = 'Epoch@{}: '.format(epoch) + stat_vec_log 33 | return log 34 | 35 | 36 | def from_dp_log(self, model): 37 | counter = model.dp_counter 38 | 39 | def form_original_stat_log(self, property): 40 | stat_name, stat_vec = self.dict_to_vec(property) 41 | stat_vec_str = ["%.3f" % number for number in stat_vec] 42 | stat_name_log = ' '.join(stat_name) 43 | stat_vec_log = ' '.join(stat_vec_str) 44 | return stat_name_log + '\n' + 'original_graph: ' + stat_vec_log 45 | 46 | 47 
| def form_args_log_content(self, args, model_args): 48 | args_info_str = str(args).split('Namespace')[1].split('(')[1].split(')')[0] 49 | model_args_info_str = str(model_args.__dict__).split('{')[1].split('}')[0] 50 | return 'Args: {}.\nModel_Args: {}.\n'.format(args_info_str, model_args_info_str) 51 | 52 | 53 | def dict_to_vec(self, stat_dict): 54 | stat_name = list(stat_dict.keys()) 55 | stat_vec = np.array(list(stat_dict.values())) 56 | return stat_name, stat_vec -------------------------------------------------------------------------------- /src/GGAN/logger.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | 4 | dir_path = os.path.dirname(os.path.realpath(__file__)) 5 | root_path = os.path.abspath(os.path.join(dir_path, os.pardir)) 6 | 7 | 8 | class stat_logger: 9 | def __init__(self, args, current_time): 10 | self.log_file_name = '{}_{}_{}.txt'.format(current_time, args.model_name, args.dataset_str) 11 | self.log_save_folder = root_path + '/log/txt_log/' 12 | self.save_path = self.log_save_folder + self.log_file_name 13 | 14 | 15 | def check_folder(self): 16 | pass 17 | 18 | def write(self, log_info): 19 | with open(self.save_path, 'a') as log_file: 20 | log_file.write(log_info + '\n') 21 | 22 | 23 | def form_generated_stat_log(self, epoch, property_cache): 24 | stat_vec_cache = [] 25 | assert len(property_cache) > 1 26 | for property in property_cache: 27 | _, stat_vec = self.dict_to_vec(property) 28 | stat_vec_cache.append(stat_vec) 29 | stat_vec_mean = np.array(stat_vec_cache).mean(axis=0) 30 | stat_vec_str = ["%.3f" % number for number in stat_vec_mean] 31 | stat_vec_log = ' '.join(stat_vec_str) 32 | log = 'Epoch@{}: '.format(epoch) + stat_vec_log 33 | return log 34 | 35 | 36 | def from_dp_log(self, model): 37 | counter = model.dp_counter 38 | 39 | def form_original_stat_log(self, property): 40 | stat_name, stat_vec = self.dict_to_vec(property) 41 | stat_vec_str = ["%.3f" % number for number in stat_vec] 42 | stat_name_log = ' '.join(stat_name) 43 | stat_vec_log = ' '.join(stat_vec_str) 44 | return stat_name_log + '\n' + 'original_graph: ' + stat_vec_log 45 | 46 | 47 | def form_args_log_content(self, args, model_args): 48 | args_info_str = str(args).split('Namespace')[1].split('(')[1].split(')')[0] 49 | model_args_info_str = str(model_args.__dict__).split('{')[1].split('}')[0] 50 | return 'Args: {}.\nModel_Args: {}.\n'.format(args_info_str, model_args_info_str) 51 | 52 | 53 | def dict_to_vec(self, stat_dict): 54 | stat_name = list(stat_dict.keys()) 55 | stat_vec = np.array(list(stat_dict.values())) 56 | return stat_name, stat_vec -------------------------------------------------------------------------------- /link_classification_exp/node2vec/README.md: -------------------------------------------------------------------------------- 1 | # node2vec 2 | 3 | This repository provides a reference implementation of *node2vec* as described in the paper:
4 | > node2vec: Scalable Feature Learning for Networks.
5 | > Aditya Grover and Jure Leskovec.
6 | > Knowledge Discovery and Data Mining, 2016.
7 | > 8 | 9 | The *node2vec* algorithm learns continuous representations for nodes in any (un)directed, (un)weighted graph. Please check the [project page](https://snap.stanford.edu/node2vec/) for more details. 10 | 11 | ### Basic Usage 12 | 13 | #### Example 14 | To run *node2vec* on Zachary's karate club network, execute the following command from the project home directory:
15 | ``python src/main.py --input graph/karate.edgelist --output emb/karate.emd`` 16 | 17 | #### Options 18 | You can check out the other options available to use with *node2vec* using:
19 | ``python src/main.py --help`` 20 | 21 | #### Input 22 | The supported input format is an edgelist: 23 | 24 | node1_id_int node2_id_int 25 | 26 | The graph is assumed to be undirected and unweighted by default. These options can be changed by setting the appropriate flags. 27 | 28 | #### Output 29 | The output file has *n+1* lines for a graph with *n* vertices. 30 | The first line has the following format: 31 | 32 | num_of_nodes dim_of_representation 33 | 34 | The next *n* lines are as follows: 35 | 36 | node_id dim1 dim2 ... dimd 37 | 38 | where dim1, ... , dimd is the *d*-dimensional representation learned by *node2vec*. 39 | 40 | ### Citing 41 | If you find *node2vec* useful for your research, please consider citing the following paper: 42 | 43 | @inproceedings{node2vec-kdd2016, 44 | author = {Grover, Aditya and Leskovec, Jure}, 45 | title = {node2vec: Scalable Feature Learning for Networks}, 46 | booktitle = {Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining}, 47 | year = {2016} 48 | } 49 | 50 | 51 | ### Miscellaneous 52 | 53 | Please send any questions you might have about the code and/or the algorithm to . 54 | 55 | *Note:* This is only a reference implementation of the *node2vec* algorithm and could benefit from several performance enhancement schemes, some of which are discussed in the paper. 56 | -------------------------------------------------------------------------------- /src/utils.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import numpy as np 3 | import networkx as nx 4 | import scipy 5 | import scipy.sparse as sp 6 | import matplotlib.pyplot as plt 7 | from src.DPGGAN.data_utils import make_adj_label 8 | 9 | def sigmoid(x): 10 | return 1 / (1 + np.exp(-x)) 11 | 12 | 13 | def graph_to_adj_list(adj): 14 | # Sparse adj matrix to adj lists 15 | G = nx.from_scipy_sparse_matrix(adj) 16 | adj_lists = defaultdict(set) 17 | 18 | # Check isolated node before training 19 | for node, adjacencies in enumerate(G.adjacency()): 20 | if len(list(adjacencies[1].keys())) == 0: 21 | print("Node %d is isolated !!!" 
% node) 22 | assert False 23 | adj_lists[node] = set(list(adjacencies[1].keys())) 24 | 25 | return adj_lists 26 | 27 | 28 | def save_top_n(adj, n, threshold=None): 29 | if threshold is None: 30 | il1 = np.tril_indices(adj.shape[0]) 31 | adj[il1] = float("-infinity") 32 | index = adj.reshape((-1,)).argsort()[-n:] 33 | (row, col) = divmod(index, adj.shape[0]) 34 | top_n = np.zeros_like(adj) 35 | top_n[row,col] = 1 36 | m = np.ones_like(adj) 37 | m[(top_n + top_n.T) == 0] = 0 38 | return m, 0 39 | else: 40 | # find neck_value 41 | adj_ = adj.copy() 42 | il1 = np.tril_indices(adj_.shape[0]) 43 | adj_[il1] = float("-infinity") 44 | index = adj_.reshape((-1,)).argsort()[-n:] 45 | last_one = index[0] 46 | (row, col) = divmod(last_one, adj_.shape[0]) 47 | neck_value = adj[row,col] 48 | # convert predict adj to adj(0,1) 49 | adj[adj >= threshold] = 1 50 | adj[adj < threshold] = 0 51 | return adj, neck_value 52 | 53 | 54 | def save_edge_num(graph): 55 | graph = graph - sp.dia_matrix((graph.diagonal()[np.newaxis, :], [0]), shape=graph.shape) 56 | graph.eliminate_zeros() 57 | assert np.diag(graph.todense()).sum() == 0 58 | original_garph = nx.from_scipy_sparse_matrix(graph) 59 | n = original_garph.number_of_edges() 60 | return n 61 | 62 | 63 | 64 | def sample_subgraph(args, node_num, dataset): 65 | index = np.random.choice(node_num, args.batch_size, replace=False) 66 | sub_adj = make_adj_label(index, dataset.adj) 67 | return index, sub_adj -------------------------------------------------------------------------------- /src/GGAN/utils.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import numpy as np 3 | import networkx as nx 4 | import scipy 5 | import scipy.sparse as sp 6 | import matplotlib.pyplot as plt 7 | from src.DPGGAN.data_utils import make_adj_label 8 | 9 | def sigmoid(x): 10 | return 1 / (1 + np.exp(-x)) 11 | 12 | 13 | def graph_to_adj_list(adj): 14 | # Sparse adj matrix to adj lists 15 | G = nx.from_scipy_sparse_matrix(adj) 16 | adj_lists = defaultdict(set) 17 | 18 | # Check isolated node before training 19 | for node, adjacencies in enumerate(G.adjacency()): 20 | if len(list(adjacencies[1].keys())) == 0: 21 | print("Node %d is isolated !!!" 
% node) 22 | assert False 23 | adj_lists[node] = set(list(adjacencies[1].keys())) 24 | 25 | return adj_lists 26 | 27 | 28 | def save_top_n(adj, n, threshold=None): 29 | if threshold is None: 30 | il1 = np.tril_indices(adj.shape[0]) 31 | adj[il1] = float("-infinity") 32 | index = adj.reshape((-1,)).argsort()[-n:] 33 | (row, col) = divmod(index, adj.shape[0]) 34 | top_n = np.zeros_like(adj) 35 | top_n[row,col] = 1 36 | m = np.ones_like(adj) 37 | m[(top_n + top_n.T) == 0] = 0 38 | return m, 0 39 | else: 40 | # find neck_value 41 | adj_ = adj.copy() 42 | il1 = np.tril_indices(adj_.shape[0]) 43 | adj_[il1] = float("-infinity") 44 | index = adj_.reshape((-1,)).argsort()[-n:] 45 | last_one = index[0] 46 | (row, col) = divmod(last_one, adj_.shape[0]) 47 | neck_value = adj[row,col] 48 | # convert predict adj to adj(0,1) 49 | adj[adj >= threshold] = 1 50 | adj[adj < threshold] = 0 51 | return adj, neck_value 52 | 53 | 54 | def save_edge_num(graph): 55 | graph = graph - sp.dia_matrix((graph.diagonal()[np.newaxis, :], [0]), shape=graph.shape) 56 | graph.eliminate_zeros() 57 | assert np.diag(graph.todense()).sum() == 0 58 | original_garph = nx.from_scipy_sparse_matrix(graph) 59 | n = original_garph.number_of_edges() 60 | return n 61 | 62 | 63 | 64 | def sample_subgraph(args, node_num, dataset): 65 | index = np.random.choice(node_num, args.batch_size, replace=False) 66 | sub_adj = make_adj_label(index, dataset.adj) 67 | return index, sub_adj -------------------------------------------------------------------------------- /src/DPGGAN/adadp.py: -------------------------------------------------------------------------------- 1 | ''' 2 | A code for implementing the ADADP algorithm for neural networks, 3 | described in 4 | 5 | Koskela, A. and Honkela, A., 6 | Learning rate adaptation for differentially private stochastic gradient descent. 7 | arXiv preprint arXiv:1809.03832. (2018) 8 | 9 | The code is due to Antti Koskela (@koskeant) 10 | 11 | ''' 12 | import torch 13 | from torch.optim.optimizer import Optimizer 14 | import numpy as np 15 | 16 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 17 | 18 | class ADADP(Optimizer): 19 | 20 | def __init__(self, params, lr=1e-3): 21 | defaults = dict(lr=lr) 22 | self.p0 = None 23 | self.p1 = None 24 | self.lrs = lr 25 | self.accepted = 0 26 | self.failed = 0 27 | 28 | self.lrs_history = [] 29 | 30 | super(ADADP, self).__init__(params, defaults) 31 | 32 | def step1(self): 33 | 34 | del self.p0 35 | self.p0 = [] 36 | 37 | del self.p1 38 | self.p1 = [] 39 | 40 | for group in self.param_groups: 41 | 42 | for p in group['params']: 43 | if p.grad is None: 44 | continue 45 | 46 | dd = p.data.clone() 47 | self.p0.append(dd) 48 | 49 | self.p1.append(p.data - self.lrs*p.grad.data) 50 | p.data.add_(-0.5*self.lrs, p.grad.data) 51 | 52 | def step2(self, tol=1.0): 53 | 54 | for group in self.param_groups: 55 | 56 | err_e = 0.0 57 | 58 | for ijk,p in enumerate(group['params']): 59 | p.data.add_(-0.5*self.lrs, p.grad.data) 60 | err_e += (((self.p1[ijk] - p.data)**2/(torch.max(torch.ones(self.p1[ijk].size()).to(device),self.p1[ijk]**2))).norm(1)) 61 | 62 | err_e = np.sqrt(err_e) 63 | 64 | self.lrs = float(self.lrs*min(max(np.sqrt(tol/err_e),0.9), 1.1)) 65 | 66 | ## Accept the step only if err < tol. 
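            # How the adaptive step works: step1() stores the current parameters (p0),
            # records the would-be full Euler step (p1 = p - lr*grad) and then moves the
            # parameters by half a step; in the ADADP scheme the gradients are re-evaluated
            # at the half-step point before step2() completes the second half step. err_e
            # above measures the relative distance between the two-half-step result and the
            # stored full step p1, and the learning rate is rescaled by sqrt(tol/err_e),
            # clamped to the factor range [0.9, 1.1].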
67 | #if err_e > 1.0*tol: 68 | # for ijk,p in enumerate(group['params']): 69 | # p.data = self.p0[ijk] 70 | #if err_e < tol: 71 | # self.accepted += 1 72 | #else : 73 | # self.failed += 1 74 | 75 | self.lrs_history.append(self.lrs) 76 | 77 | 78 | -------------------------------------------------------------------------------- /src/DPGGAN/utils_dp.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from collections import OrderedDict 3 | from torch.autograd import Variable 4 | from src.DPGGAN import gaussian_moments as gm, px_expander 5 | 6 | ''' 7 | Update privacy budget 8 | 9 | priv_pars: the privacy dictionary 10 | ''' 11 | def update_privacy_pars(dp_counter): 12 | verify = False 13 | max_lmbd = 32 14 | lmbds = range(1, max_lmbd + 1) 15 | log_moments = [] 16 | for lmbd in lmbds: 17 | log_moment = 0 18 | ''' 19 | print('Here q = ' + str(priv_pars['q'])) 20 | print('Here sigma = ' + str(priv_pars['sigma'])) 21 | print('Here T = ' + str(priv_pars['T'])) 22 | ''' 23 | log_moment += gm.compute_log_moment(dp_counter.q, dp_counter.sigma, dp_counter.T, lmbd, verify=verify) 24 | log_moments.append((lmbd, log_moment)) 25 | dp_counter.eps, _ = gm.get_privacy_spent(log_moments, target_delta=dp_counter.delta) 26 | return dp_counter 27 | 28 | 29 | ''' 30 | create container for accumulated gradient 31 | 32 | :return is the gradient container 33 | ''' 34 | def create_cum_grads(model): 35 | cum_grads = OrderedDict() 36 | for i, p in enumerate(model.parameters()): 37 | if p.requires_grad: 38 | cum_grads[str(i)] = Variable(torch.zeros(p.shape[1:]), requires_grad=False) 39 | return cum_grads 40 | 41 | 42 | 43 | def update_privacy_account(model_args, model): 44 | stop_signal = False 45 | if 'dp_counter' in set(model.__dict__.keys()): 46 | model.dp_counter.T += 1 47 | update_privacy_pars(model.dp_counter) 48 | model_args.grad_norm_max *= model_args.C_decay 49 | if model.dp_counter.eps > model_args.eps_requirement: 50 | model.dp_counter.should_stop = True 51 | stop_signal = model.dp_counter.should_stop 52 | return stop_signal 53 | 54 | 55 | 56 | def perturb_grad(model_args, model): 57 | # For DP model: accumulate grads in the container, cum_grads; add noise on sum of grads 58 | px_expander.acc_scaled_grads(model=model, C=model_args.grad_norm_max, cum_grads=model.cum_grads) 59 | 60 | # because we don't use lot-batch structure, so just add noise after acc_grads 61 | px_expander.add_noise_with_cum_grads(model=model, C=model_args.grad_norm_max, 62 | sigma=model_args.noise_sigma, cum_grads=model.cum_grads, 63 | samp_num=model_args.samp_num) -------------------------------------------------------------------------------- /src/DPGGAN/functional.py: -------------------------------------------------------------------------------- 1 | 2 | """Functional interface""" 3 | 4 | import warnings 5 | import math 6 | from operator import mul 7 | from functools import reduce 8 | import sys 9 | 10 | import torch 11 | #from torch._C import _infer_size, _add_docstr 12 | #from . 
import _functions 13 | from torch.nn import _functions 14 | #from .modules import utils 15 | from torch.nn.modules import utils 16 | #from ._functions.linear import Bilinear 17 | #from torch.nn._functions.linear import Bilinear 18 | #from ._functions.padding import ConstantPadNd 19 | #from torch.nn._functions.padding import ConstantPadNd 20 | #from ._functions import vision 21 | #from torch.nn._functions import vision 22 | #from ._functions.thnn.fold import Col2Im, Im2Col 23 | #from torch.nn._functions.thnn.fold import Col2Im,Im2Col 24 | from torch.autograd import Variable 25 | #from .modules.utils import _single, _pair, _triple 26 | #from torch.nn.modules.utils import _single, _pair, _triple 27 | 28 | 29 | ''' 30 | Linear layer modified for PX gradients 31 | 32 | The code is due to Mikko Heikkilä (@mixheikk) 33 | ''' 34 | 35 | 36 | # Note: bias not checked yet 37 | def linear(input, weight, bias=None, batch_size=None, for_test=None): 38 | """ 39 | Applies a linear transformation to the incoming data: :math:`y = xA^T + b`. 40 | 41 | Shape: 42 | - Input: :math:`(N, *, in\_features)` where `*` means any number of 43 | additional dimensions 44 | - Weight: :math:`(out\_features, in\_features)` 45 | - Bias: :math:`(out\_features)` 46 | - Output: :math:`(N, *, out\_features)` 47 | """ 48 | if input.dim() == 2 and bias is not None: 49 | # fused op is marginally faster 50 | if batch_size is None: 51 | return torch.mm(input, weight.t()) 52 | else: 53 | print('fused op in functional.linear not implemented yet!') 54 | sys.exit(1) 55 | return torch.addmm(bias, input, weight.t()) 56 | 57 | if for_test: 58 | if len(list(input.shape)) == 3: 59 | input = input.view(input.shape[0], input.shape[2]) 60 | # output = input.matmul(torch.transpose(weight,-2,-1)[0]) 61 | output = torch.mm(input, weight[0].t()) 62 | assert len(list(output.shape)) == 2 63 | else: 64 | # output = input.matmul(torch.transpose(weight,-2,-1)) 65 | output = torch.bmm(input, weight.permute(0,2,1)) 66 | output = output.view(output.shape[0], output.shape[2]) 67 | assert len(list(output.shape)) == 2 68 | 69 | # kts bias kun muu toimii 70 | if bias is not None: 71 | output += bias 72 | return output 73 | -------------------------------------------------------------------------------- /graph_classification_exp/sample_IMDBMULTI.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import pickle 4 | import random 5 | from collections import defaultdict 6 | 7 | import networkx as nx 8 | dir_path = os.path.dirname(os.path.realpath(__file__)) 9 | root_path = os.path.abspath(os.path.join(dir_path, os.pardir)) 10 | 11 | def relabel_dblp2(data_list): 12 | # according to the method that we create dblp2 data, we can relabel dblp2 data to three classes 13 | # For (label<24) ->0; (241; (482 14 | for graph in data_list: 15 | if graph.graph['label'] < 24: 16 | graph.graph['label'] = 0 17 | elif graph.graph['label'] < 48: 18 | graph.graph['label'] = 1 19 | elif graph.graph['label'] < 72: 20 | graph.graph['label'] = 2 21 | return data_list 22 | 23 | 24 | def read_graph(dataset): 25 | data_path = root_path + '/data/orig/{}.pkl'.format(dataset) 26 | with open(data_path, 'rb') as file: 27 | data_list = pickle.load(file) 28 | if dataset == 'dblp2': 29 | data_list = relabel_dblp2(data_list) 30 | return data_list 31 | 32 | 33 | def save_graph(dataset, data_list): 34 | if dataset == 'IMDB_MULTI': 35 | save_path = root_path + '/data/orig/Resampled_IMDB_MULTI.pkl' 36 | elif dataset == 'dblp2': 37 | 
save_path = root_path + '/data/orig/relabeled_dblp2.pkl' 38 | else: 39 | save_path = None 40 | 41 | with open(save_path, 'wb') as file: 42 | pickle.dump(data_list, file) 43 | print("finish dump.") 44 | 45 | 46 | def arg_parser(): 47 | parser = argparse.ArgumentParser() 48 | parser.add_argument('--dataset', type=str, default='dblp2', help='[dblp2, IMDB_MULTI]') 49 | parser.add_argument('--num_per_class', type=int, default=None, help='sample 200 graphs for imdb dataset.') 50 | args = parser.parse_args() 51 | return args 52 | 53 | 54 | if __name__ == '__main__': 55 | args = arg_parser() 56 | graphs_dict = defaultdict(list) 57 | data_list = read_graph(args.dataset) 58 | for graph in data_list: 59 | graphs_dict[graph.graph['label']].append(graph) 60 | 61 | if args.num_per_class is not None: 62 | sampled_graph_list = [] # random.sample(data_list, num) 63 | for label, g_list in graphs_dict.items(): 64 | sampled_graph_list.extend(random.sample(g_list, args.num_per_class)) 65 | else: 66 | sampled_graph_list = data_list 67 | 68 | save_graph(args.dataset, sampled_graph_list) 69 | label_dict = defaultdict(int) 70 | for graph in sampled_graph_list: 71 | label_dict[graph.graph['label']] += 1 72 | print(label_dict) -------------------------------------------------------------------------------- /src/graph_drawer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | 4 | import networkx as nx 5 | import scipy 6 | import matplotlib.pyplot as plt 7 | 8 | dir_path = os.path.dirname(os.path.realpath(__file__)) 9 | root_path = os.path.abspath(os.path.join(dir_path, os.pardir)) 10 | 11 | 12 | class drawer: 13 | def __init__(self): 14 | pass 15 | 16 | def check_image_save_dir(self): 17 | pass 18 | # check the image save dir, according to the timestamp 19 | 20 | def save_graph(self): 21 | pass 22 | # save graph to relevant dir 23 | # write according to the following function. 
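        # draw_graph() below removes isolated nodes (optional), then renders the graph:
        # by default with small black nodes, with a degree-sorted circular layout when
        # circle=True, or with the given node colors when color is provided, and finally
        # saves the figure to `path` and clears the current figure.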
24 | 25 | 26 | 27 | def draw_graph(G, path, circle=False, color=None, remove_isolated=True): 28 | # G = None 29 | # if scipy.sparse.issparse(adj): 30 | # G = nx.from_scipy_sparse_matrix(adj) 31 | # else: 32 | # G = nx.from_numpy_matrix(adj) 33 | 34 | if remove_isolated: 35 | print("remove isolated nodes") 36 | G.remove_nodes_from(list(nx.isolates(G))) 37 | 38 | options = { 39 | 'node_color': 'black', 40 | 'node_size': 5, 41 | # 'line_color': 'grey', 42 | 'linewidths': 0.1, 43 | 'width': 0.1, 44 | } 45 | 46 | if circle: 47 | node_list = sorted(G.degree, key=lambda x: x[1], reverse=True) 48 | node2order = {} 49 | for i, v in enumerate(node_list): 50 | node2order[v[0]] = i 51 | 52 | new_edge = [] 53 | for i in G.edges(): 54 | new_edge.append((node2order[i[0]], node2order[i[1]])) 55 | 56 | new_G = nx.Graph() 57 | new_G.add_nodes_from(range(len(node_list))) 58 | new_G.add_edges_from(new_edge) 59 | 60 | nx.draw_circular(new_G, with_labels=True) 61 | 62 | elif color is not None: 63 | nx.draw(G, node_color=color, node_size=20, with_labels=True) 64 | else: 65 | nx.draw(G, **options) 66 | plt.savefig(path) 67 | plt.clf() 68 | 69 | 70 | if __name__ == '__main__': 71 | print(root_path) 72 | # graph_list = ['GraphVAE_new_IMDB_MULTI', 'DPGraphVAE_new_IMDB_MULTI', 73 | # 'DPGraphGAN_new_IMDB_MULTI', 'GraphRNN_new_IMDB_MULTI', 'NetGAN_new_IMDB_MULTI', 74 | # 'GraphVAE_new_dblp2', 'DPGraphVAE_new_dblp2', 'DPGraphGAN_new_dblp2', 75 | # 'GraphRNN_new_dblp2', 'NetGAN_new_dblp2'] 76 | graph_list = ['dblp2', 'new_IMDB_MULTI'] 77 | ids = [0,1,2,3,4] 78 | for graph_name in graph_list: 79 | for id in ids: 80 | generated_graph_path = root_path + '/data/orig/{}.pkl'.format(graph_name) 81 | with open(generated_graph_path, 'rb') as file: 82 | graph_list = pickle.load(file) 83 | print(len(graph_list)) 84 | save_path = root_path + '/data/graph_figures/orig_{}_{}.png'.format(graph_name, str(id)) 85 | draw_graph(graph_list[id], path=save_path) -------------------------------------------------------------------------------- /src/GGAN/graph_drawer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | 4 | import networkx as nx 5 | import scipy 6 | import matplotlib.pyplot as plt 7 | 8 | dir_path = os.path.dirname(os.path.realpath(__file__)) 9 | root_path = os.path.abspath(os.path.join(dir_path, os.pardir)) 10 | 11 | 12 | class drawer: 13 | def __init__(self): 14 | pass 15 | 16 | def check_image_save_dir(self): 17 | pass 18 | # check the image save dir, according to the timestamp 19 | 20 | def save_graph(self): 21 | pass 22 | # save graph to relevant dir 23 | # write according to the following function. 
24 | 25 | 26 | 27 | def draw_graph(G, path, circle=False, color=None, remove_isolated=True): 28 | # G = None 29 | # if scipy.sparse.issparse(adj): 30 | # G = nx.from_scipy_sparse_matrix(adj) 31 | # else: 32 | # G = nx.from_numpy_matrix(adj) 33 | 34 | if remove_isolated: 35 | print("remove isolated nodes") 36 | G.remove_nodes_from(list(nx.isolates(G))) 37 | 38 | options = { 39 | 'node_color': 'black', 40 | 'node_size': 5, 41 | # 'line_color': 'grey', 42 | 'linewidths': 0.1, 43 | 'width': 0.1, 44 | } 45 | 46 | if circle: 47 | node_list = sorted(G.degree, key=lambda x: x[1], reverse=True) 48 | node2order = {} 49 | for i, v in enumerate(node_list): 50 | node2order[v[0]] = i 51 | 52 | new_edge = [] 53 | for i in G.edges(): 54 | new_edge.append((node2order[i[0]], node2order[i[1]])) 55 | 56 | new_G = nx.Graph() 57 | new_G.add_nodes_from(range(len(node_list))) 58 | new_G.add_edges_from(new_edge) 59 | 60 | nx.draw_circular(new_G, with_labels=True) 61 | 62 | elif color is not None: 63 | nx.draw(G, node_color=color, node_size=20, with_labels=True) 64 | else: 65 | nx.draw(G, **options) 66 | plt.savefig(path) 67 | plt.clf() 68 | 69 | 70 | if __name__ == '__main__': 71 | print(root_path) 72 | # graph_list = ['GraphVAE_new_IMDB_MULTI', 'DPGraphVAE_new_IMDB_MULTI', 73 | # 'DPGraphGAN_new_IMDB_MULTI', 'GraphRNN_new_IMDB_MULTI', 'NetGAN_new_IMDB_MULTI', 74 | # 'GraphVAE_new_dblp2', 'DPGraphVAE_new_dblp2', 'DPGraphGAN_new_dblp2', 75 | # 'GraphRNN_new_dblp2', 'NetGAN_new_dblp2'] 76 | graph_list = ['dblp2', 'new_IMDB_MULTI'] 77 | ids = [0,1,2,3,4] 78 | for graph_name in graph_list: 79 | for id in ids: 80 | generated_graph_path = root_path + '/data/orig/{}.pkl'.format(graph_name) 81 | with open(generated_graph_path, 'rb') as file: 82 | graph_list = pickle.load(file) 83 | print(len(graph_list)) 84 | save_path = root_path + '/data/graph_figures/orig_{}_{}.png'.format(graph_name, str(id)) 85 | draw_graph(graph_list[id], path=save_path) -------------------------------------------------------------------------------- /src/GGAN/aggregators.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.autograd import Variable 4 | import numpy as np 5 | import random 6 | 7 | """ 8 | Set of modules for aggregating embeddings of neighbors. 9 | """ 10 | 11 | class MeanAggregator(nn.Module): 12 | """ 13 | Aggregates a node's embeddings using mean of neighbors' embeddings 14 | """ 15 | def __init__(self, cuda=False, gcn=False, first_layer=True): 16 | """ 17 | Initializes the aggregator for a specific graph. 18 | 19 | features -- function mapping LongTensor of node ids to FloatTensor of feature values. 20 | cuda -- whether to use GPU 21 | gcn --- whether to perform concatenation GraphSAGE-style, or add self-loops GCN-style 22 | """ 23 | 24 | super(MeanAggregator, self).__init__() 25 | 26 | self.cuda = cuda 27 | self.gcn = gcn 28 | self.first_layer = first_layer 29 | 30 | def forward(self, features, nodes, to_neighs, num_sample=10, feature_dict=None): 31 | """ 32 | nodes --- list of nodes in a batch 33 | to_neighs --- list of sets, each set is the set of neighbors for node in batch 34 | num_sample --- number of neighbors to sample. No sampling if None. 
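        Returns a tensor with one row per node in `nodes`: the mean of the (sampled)
        neighbors' feature vectors. Sampling uses random.sample (without replacement)
        and only applies on the first layer when a node has at least num_sample
        neighbors; the mean is computed with a row-normalized 0/1 membership mask
        over the union of sampled neighbors.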
35 | """ 36 | # Local pointers to functions (speed hack) 37 | _set = set 38 | if not num_sample is None and self.first_layer: 39 | _sample = random.sample 40 | samp_neighs = [_set(_sample(to_neigh, num_sample,)) 41 | if len(to_neigh) >= num_sample 42 | else to_neigh for to_neigh in to_neighs] 43 | else: 44 | samp_neighs = to_neighs 45 | 46 | if self.gcn: 47 | samp_neighs = [samp_neigh + set([nodes[i]]) for i, samp_neigh in enumerate(samp_neighs)] 48 | unique_nodes_list = list(set.union(*samp_neighs)) 49 | unique_nodes = {n:i for i,n in enumerate(unique_nodes_list)} 50 | mask = Variable(torch.zeros(len(samp_neighs), len(unique_nodes))) 51 | column_indices = [unique_nodes[n] for samp_neigh in samp_neighs for n in samp_neigh] 52 | row_indices = [i for i in range(len(samp_neighs)) for j in range(len(samp_neighs[i]))] 53 | mask[row_indices, column_indices] = 1 54 | if self.cuda: 55 | mask = mask.cuda() 56 | num_neigh = mask.sum(1, keepdim=True) 57 | # num_neigh = mask.sum(1, keepdim=True) + 1 58 | mask = mask.div(num_neigh) 59 | if self.cuda: 60 | embed_matrix = features(torch.LongTensor(unique_nodes_list).cuda()) 61 | else: 62 | if self.first_layer: 63 | embed_matrix = features(torch.LongTensor(unique_nodes_list)) 64 | else: 65 | node_idx = [] 66 | for i, v in enumerate(unique_nodes_list): 67 | node_idx.append(feature_dict[v]) 68 | embed_matrix = features[node_idx] 69 | to_feats = mask.mm(embed_matrix) 70 | return to_feats 71 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: dpggan 2 | channels: 3 | - https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/pytorch 4 | - https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/main 5 | - https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge 6 | - https://mirrors.tuna.tsinghua.edu.cn/help/tensorflow/ 7 | - https://mirrors.ustc.edu.cn/anaconda/cloud/menpo/ 8 | - https://mirrors.ustc.edu.cn/anaconda/cloud/bioconda/ 9 | - https://mirrors.ustc.edu.cn/anaconda/cloud/msys2/ 10 | - https://mirrors.ustc.edu.cn/anaconda/cloud/conda-forge/ 11 | - https://mirrors.ustc.edu.cn/anaconda/pkgs/free/ 12 | - https://mirrors.ustc.edu.cn/anaconda/pkgs/main/ 13 | - https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/pytorch/ 14 | - https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge/ 15 | - https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/main/ 16 | - https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free/ 17 | - defaults 18 | dependencies: 19 | - _libgcc_mutex=0.1=conda_forge 20 | - _openmp_mutex=4.5=1_gnu 21 | - ca-certificates=2020.12.5=ha878542_0 22 | - certifi=2020.12.5=py36h5fab9bb_1 23 | - cffi=1.14.4=py36hc120d54_1 24 | - cudatoolkit=9.0=h13b8566_0 25 | - freetype=2.10.4=h7ca028e_0 26 | - intel-openmp=2020.2=254 27 | - jpeg=9d=h36c2ea0_0 28 | - lcms2=2.11=hcbb858e_1 29 | - ld_impl_linux-64=2.35.1=hea4e1c9_1 30 | - libblas=3.9.0=7_openblas 31 | - libcblas=3.9.0=7_openblas 32 | - libffi=3.3=h58526e2_2 33 | - libgcc-ng=9.3.0=h5dbcf3e_17 34 | - libgfortran-ng=9.3.0=he4bcb1c_17 35 | - libgfortran5=9.3.0=he4bcb1c_17 36 | - libgomp=9.3.0=h5dbcf3e_17 37 | - liblapack=3.9.0=7_openblas 38 | - libopenblas=0.3.12=pthreads_h4812303_1 39 | - libpng=1.6.37=h21135ba_2 40 | - libstdcxx-ng=9.3.0=h2ae2ef3_17 41 | - libtiff=4.2.0=hdc55705_0 42 | - libwebp-base=1.1.0=h36c2ea0_3 43 | - lz4-c=1.9.3=h9c3ff4c_0 44 | - mkl=2020.2=256 45 | - ncurses=6.2=h58526e2_4 46 | - ninja=1.10.2=h4bd325d_0 47 | - numpy=1.19.5=py36h2aa4a07_1 
48 | - olefile=0.46=pyh9f0ad1d_1 49 | - openssl=1.1.1i=h7f98852_0 50 | - pillow=8.1.0=py36h4f9996e_1 51 | - pip=20.3.3=pyhd8ed1ab_0 52 | - pycparser=2.20=pyh9f0ad1d_2 53 | - python=3.6.12=hffdb5ce_0_cpython 54 | - python_abi=3.6=1_cp36m 55 | - pytorch=1.1.0=py3.6_cuda9.0.176_cudnn7.5.1_0 56 | - readline=8.0=he28a2e2_2 57 | - setuptools=49.6.0=py36h5fab9bb_3 58 | - six=1.15.0=pyh9f0ad1d_0 59 | - sqlite=3.34.0=h74cdb3f_0 60 | - tk=8.6.10=h21135ba_1 61 | - torchvision=0.3.0=py36_cu9.0.176_1 62 | - wheel=0.36.2=pyhd3deb0d_0 63 | - xz=5.2.5=h516909a_1 64 | - zlib=1.2.11=h516909a_1010 65 | - zstd=1.4.8=ha95c52a_1 66 | - pip: 67 | - cycler==0.10.0 68 | - decorator==4.4.2 69 | - gensim==3.8.3 70 | - joblib==1.0.0 71 | - kiwisolver==1.3.1 72 | - matplotlib==3.3.3 73 | - mpmath==1.1.0 74 | - networkx==2.5 75 | - powerlaw==1.4.6 76 | - pyparsing==2.4.7 77 | - python-dateutil==2.8.1 78 | - python-igraph==0.8.3 79 | - scikit-learn==0.24.0 80 | - scipy==1.5.4 81 | - smart-open==4.1.2 82 | - texttable==1.6.3 83 | - threadpoolctl==2.1.0 84 | - tqdm==4.56.0 85 | prefix: /home/biaozhi.whn/miniconda3/envs/dpggan 86 | -------------------------------------------------------------------------------- /src/DPGGAN/dp_aggregators.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.autograd import Variable 4 | 5 | import random 6 | import numpy as np 7 | 8 | """ 9 | Set of modules for aggregating embeddings of neighbors. 10 | """ 11 | 12 | class MeanAggregator(nn.Module): 13 | """ 14 | Aggregates a node's embeddings using mean of neighbors' embeddings 15 | """ 16 | def __init__(self, cuda=False, gcn=False, first_layer=True): 17 | """ 18 | Initializes the aggregator for a specific graph. 19 | 20 | features -- function mapping LongTensor of node ids to FloatTensor of feature values. 21 | cuda -- whether to use GPU 22 | gcn --- whether to perform concatenation GraphSAGE-style, or add self-loops GCN-style 23 | """ 24 | 25 | super(MeanAggregator, self).__init__() 26 | 27 | self.cuda = cuda 28 | self.gcn = gcn 29 | self.first_layer = first_layer 30 | 31 | def forward(self, features, nodes, to_neighs, num_sample=10, feature_dict=None): 32 | """ 33 | nodes --- list of nodes in a batch 34 | to_neighs --- list of sets, each set is the set of neighbors for node in batch 35 | num_sample --- number of neighbors to sample. No sampling if None. 
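        Unlike the non-private aggregator, at the first layer the sampling below
        draws a fixed num_sample of neighbor ids with numpy's random.choice,
        which samples with replacement, so the number of draws per node stays
        constant even for small neighborhoods. A minimal sketch (toy values,
        illustrative names only):

            import numpy as np
            to_neigh = {0, 3, 7}
            samp = set(np.random.choice(list(to_neigh), 10))   # at most 3 unique ids from 10 draws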
36 | """ 37 | # Local pointers to functions (speed hack) 38 | _set = set 39 | if not num_sample is None and self.first_layer: 40 | _sample = np.random.choice 41 | # samp_neighs = [_set(_sample(to_neigh, num_sample,)) 42 | # if len(to_neigh) >= num_sample 43 | # else to_neigh for to_neigh in to_neighs] 44 | samp_neighs = [ _set(_sample(list(to_neigh), num_sample)) for to_neigh in to_neighs] 45 | 46 | else: 47 | samp_neighs = [ _set(list(to_neigh)) for to_neigh in to_neighs] 48 | 49 | # if self.gcn: 50 | # samp_neighs = [samp_neigh + set([nodes[i]]) for i, samp_neigh in enumerate(samp_neighs)] 51 | unique_nodes_list = list(set.union(*samp_neighs)) 52 | unique_nodes = {n:i for i,n in enumerate(unique_nodes_list)} 53 | mask = Variable(torch.zeros(len(samp_neighs), len(unique_nodes))) 54 | column_indices = [unique_nodes[n] for samp_neigh in samp_neighs for n in samp_neigh] 55 | row_indices = [i for i in range(len(samp_neighs)) for j in range(len(samp_neighs[i]))] 56 | mask[row_indices, column_indices] = 1 57 | if self.cuda: 58 | mask = mask.cuda() 59 | num_neigh = mask.sum(1, keepdim=True) 60 | # num_neigh = mask.sum(1, keepdim=True) + 1 61 | mask = mask.div(num_neigh) 62 | if self.cuda: 63 | embed_matrix = features(torch.LongTensor(unique_nodes_list).cuda()) 64 | else: 65 | if self.first_layer: 66 | embed_matrix = features(torch.LongTensor(unique_nodes_list)) 67 | else: 68 | node_idx = [] 69 | for i,v in enumerate(unique_nodes_list): 70 | node_idx.append(feature_dict[v]) 71 | embed_matrix = features[node_idx] 72 | to_feats = mask.mm(embed_matrix) 73 | return to_feats 74 | -------------------------------------------------------------------------------- /graph_classification_exp/preprocess_nx_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import shutil 4 | 5 | import networkx as nx 6 | dir_path = os.path.dirname(os.path.realpath(__file__)) 7 | root_path = os.path.abspath(os.path.join(dir_path, os.pardir)) 8 | import argparse 9 | 10 | def to_adj_str(adj): 11 | degree = adj.sum(axis = 1) 12 | node_num = adj.shape[0] 13 | str_cache = '' 14 | for i in range(node_num): 15 | neighbor = [] 16 | for idx, edge in enumerate(adj[i].tolist()): 17 | if edge: 18 | neighbor.append(str(idx)) 19 | str_cache += '0 ' + str(degree[i]) + ' ' + ' '.join(neighbor) + '\n' 20 | return str_cache 21 | 22 | def save_to_txt(total_graph_num, str_data_list, model, dataset): 23 | if model == 'orig': 24 | folder_path = dir_path + '/dataset/{}/'.format(dataset) 25 | if os.path.exists(folder_path): 26 | shutil.rmtree(folder_path) 27 | os.makedirs(folder_path) 28 | save_path = dir_path + '/dataset/{}/{}.txt'.format(dataset, dataset) 29 | else: 30 | folder_path = dir_path + '/dataset/{}_{}/'.format(model, dataset) 31 | if os.path.exists(folder_path): 32 | shutil.rmtree(folder_path) 33 | os.makedirs(folder_path) 34 | save_path = dir_path + '/dataset/{}_{}/{}_{}.txt'.format(model, dataset, model, dataset) 35 | 36 | with open(save_path, 'w') as file: 37 | file.write(str(total_graph_num) + '\n') 38 | for data in str_data_list: 39 | adj_str = data[0] 40 | node_num = data[1] 41 | graph_label = int(data[2])-1 42 | file.write('{} {}\n'.format(node_num, graph_label)) 43 | file.write(adj_str) 44 | 45 | 46 | def convert_data(model, dataset): 47 | # 1.read data 48 | if model == 'orig': 49 | data_path = root_path + '/data/orig/{}.pkl'.format(dataset) 50 | else: 51 | data_path = root_path + '/data/generated/{}_{}.pkl'.format(model, dataset) 52 | 53 | with 
open(data_path, 'rb') as file: 54 | data_list = pickle.load(file) 55 | 56 | # 2.convert to text data 57 | # demo: 58 | # 1500 59 | # 7 0 60 | # 0 6 1 2 3 4 5 6 61 | # 0 6 0 2 3 4 5 6 62 | # 0 6 0 1 3 4 5 6 63 | # 0 6 0 1 2 4 5 6 64 | # 0 6 0 1 2 3 5 6 65 | # 0 6 0 1 2 3 4 6 66 | # 0 6 0 1 2 3 4 5 67 | 68 | total_graph_num = len(data_list) 69 | str_data_list = [] 70 | for nx_graph in data_list: 71 | adj = nx.to_numpy_array(nx_graph) 72 | assert adj.shape[0] > 0 73 | label = nx_graph.graph['label'] 74 | str_data_list.append((to_adj_str(adj.astype(int)), str(adj.shape[0]), str(label))) 75 | 76 | save_to_txt(total_graph_num, str_data_list, model, dataset) 77 | 78 | 79 | 80 | def arg_parser(): 81 | parser = argparse.ArgumentParser() 82 | parser.add_argument('--dataset', type=str, default='new_dblp2', help='[new_IMDB_MULTI, new_dblp2]') 83 | parser.add_argument('--model', type=str, default='DPGGAN', help='[orig, DPGGAN, DPGVAE, GVAE, NetGAN, GraphRNN]') 84 | args = parser.parse_args() 85 | return args 86 | 87 | 88 | if __name__ == '__main__': 89 | print(dir_path) 90 | args = arg_parser() 91 | print(args) 92 | convert_data(args.model, args.dataset) 93 | -------------------------------------------------------------------------------- /src/DPGGAN/dp_encoders.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | import torch.nn as nn 5 | from torch.nn import init 6 | import torch.nn.functional as F 7 | import numpy as np 8 | 9 | class Encoder(nn.Module): 10 | """ 11 | Encodes a node's using 'convolutional' GraphSage approach 12 | """ 13 | def __init__(self, feature_dim, embed_dim, adj_lists, aggregator, first_layer, samp_neighs=None, 14 | num_sample=5, gcn=False, cuda=False, batch_size=None, 15 | feature_transform=False): 16 | super(Encoder, self).__init__() 17 | self.feat_dim = feature_dim 18 | self.adj_lists = adj_lists 19 | self.aggregator = aggregator 20 | self.num_sample = num_sample 21 | self.batch_size = batch_size 22 | self.first_layer = first_layer 23 | self.samp_neighs = samp_neighs 24 | # if base_model != None: 25 | # self.base_model = base_model 26 | 27 | self.gcn = gcn 28 | self.embed_dim = embed_dim 29 | self.cuda = cuda 30 | self.aggregator.cuda = cuda 31 | if self.gcn: 32 | input_dim = self.feat_dim 33 | else: 34 | input_dim = 2 * self.feat_dim 35 | self.weight = nn.Parameter(torch.FloatTensor(batch_size, input_dim, embed_dim)) 36 | init.xavier_uniform_(self.weight) 37 | 38 | # stdv = 1. / math.sqrt(self.weight.size(1)) 39 | # self.weight.data.uniform_(-stdv, stdv) 40 | 41 | 42 | def forward(self, features, nodes, samp_neighs=None, feature_dict=None, for_test=False): 43 | """ 44 | Generates embeddings for a batch of nodes. 
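        The weight here is a per-example parameter tensor of shape
        (batch_size, input_dim, embed_dim), so the training-time projection is a
        batched matrix product. A minimal shape sketch (toy sizes, names are
        illustrative only):

            import torch
            combined = torch.randn(192, 256)              # (batch, 2 * feat_dim)
            weight = torch.randn(192, 256, 128)           # one weight slice per example
            h = torch.bmm(combined.unsqueeze(1), weight)  # (192, 1, 128)
            h = torch.relu(h.squeeze(1))                  # (192, 128), as noted in the code below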
45 | 46 | nodes -- list of nodes 47 | """ 48 | if self.first_layer: 49 | neigh_feats = self.aggregator.forward(features, nodes, 50 | [self.adj_lists[int(node)] for node in nodes], 51 | self.num_sample) 52 | else: 53 | assert (samp_neighs != None) 54 | assert (feature_dict != None) 55 | neigh_feats = self.aggregator.forward(features, nodes, samp_neighs, 56 | self.num_sample, feature_dict=feature_dict) 57 | if not self.gcn: 58 | if self.cuda: 59 | self_feats = features(torch.LongTensor(nodes).cuda()) 60 | else: 61 | self_feats = features(torch.LongTensor(nodes)) 62 | combined = torch.cat([self_feats, neigh_feats], dim=1) 63 | else: 64 | combined = neigh_feats 65 | 66 | if for_test: 67 | combined = F.relu(torch.mm(combined, self.weight[0])) 68 | else: 69 | combined = combined.view(combined.shape[0],1,combined.shape[1]) 70 | combined = F.relu(torch.bmm(combined, self.weight)) 71 | combined = combined.view(combined.shape[0], combined.shape[2]) # (192,128) 72 | assert (len(list(combined.shape)) == 2) 73 | return combined 74 | 75 | # if self.first_layer: 76 | # neigh_feats = self.aggregator.forward(features, nodes, 77 | # [self.adj_lists[int(node)] for node in nodes], 78 | # self.num_sample) 79 | # else: 80 | # assert ~(samp_neighs == None) 81 | # assert ~(feature_dict == None) 82 | # neigh_feats = self.aggregator.forward(features, nodes, samp_neighs, 83 | # self.num_sample, feature_dict=feature_dict) 84 | # if not self.gcn: 85 | # if self.first_layer: 86 | # self_feats = features(torch.LongTensor(nodes)) 87 | # else: 88 | # self_feats = features[nodes] 89 | # combined = torch.cat([self_feats, neigh_feats], dim=1) 90 | # else: 91 | # combined = neigh_feats 92 | # combined = F.relu(self.weight.mm(combined.t())) 93 | # return combined.t() 94 | -------------------------------------------------------------------------------- /graph_classification_exp/random_pred.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pickle 3 | from collections import defaultdict 4 | import random 5 | 6 | from sklearn.metrics import accuracy_score 7 | 8 | from util import load_data, separate_data 9 | 10 | numberList = [111,222,333,444,555] 11 | print("random item from list is: ", random.choice(numberList)) 12 | 13 | def read_graph(data_path): 14 | with open(data_path, 'rb') as tf: 15 | graph_set = pickle.load(tf) 16 | return graph_set 17 | 18 | 19 | if __name__ == '__main__': 20 | parser = argparse.ArgumentParser(description='PyTorch graph convolutional neural net for whole-graph classification') 21 | parser.add_argument('--dataset', type=str, default="Resampled_IMDB_MULTI", 22 | help='name of dataset (default: MUTAG)') 23 | parser.add_argument('--device', type=int, default=0, 24 | help='which gpu to use if any (default: 0)') 25 | parser.add_argument('--batch_size', type=int, default=32, 26 | help='input batch size for training (default: 32)') 27 | parser.add_argument('--iters_per_epoch', type=int, default=50, 28 | help='number of iterations per each epoch (default: 50)') 29 | parser.add_argument('--epochs', type=int, default=150, 30 | help='number of epochs to train (default: 350)') 31 | parser.add_argument('--lr', type=float, default=0.01, 32 | help='learning rate (default: 0.01)') 33 | parser.add_argument('--seed', type=int, default=0, 34 | help='random seed for splitting the dataset into 10 (default: 0)') 35 | parser.add_argument('--fold_idx', type=int, default=0, 36 | help='the index of fold in 10-fold validation. 
Should be less then 10.') 37 | parser.add_argument('--num_layers', type=int, default=5, 38 | help='number of layers INCLUDING the input one (default: 5)') 39 | parser.add_argument('--num_mlp_layers', type=int, default=2, 40 | help='number of layers for MLP EXCLUDING the input one (default: 2). 1 means linear model.') 41 | parser.add_argument('--hidden_dim', type=int, default=64, 42 | help='number of hidden units (default: 64)') 43 | parser.add_argument('--final_dropout', type=float, default=0.5, 44 | help='final layer dropout (default: 0.5)') 45 | parser.add_argument('--graph_pooling_type', type=str, default="sum", choices=["sum", "average"], 46 | help='Pooling for over nodes in a graph: sum or average') 47 | parser.add_argument('--neighbor_pooling_type', type=str, default="sum", choices=["sum", "average", "max"], 48 | help='Pooling for over neighboring nodes: sum, average or max') 49 | parser.add_argument('--learn_eps', action="store_true", help='Whether to learn the epsilon weighting for the center nodes. Does not affect training accuracy though.') 50 | parser.add_argument('--degree_as_tag', type=int, default=1, 51 | help='let the input node features be the degree of nodes (heuristics for unlabeled graph)') 52 | parser.add_argument('--filename', type = str, default = "log.txt", help='output file') 53 | args = parser.parse_args() 54 | 55 | random_acc_list = [] 56 | for _ in range(20): 57 | train_label_list = [] 58 | test_label_list = [] 59 | pred_label_list = [] 60 | 61 | graphs, num_classes = load_data(args.dataset, args.degree_as_tag) 62 | train_graphs, test_graphs = separate_data(graphs, args.seed, args.fold_idx) 63 | for g in train_graphs: 64 | train_label_list.append(g.label) 65 | for g in test_graphs: 66 | test_label_list.append(g.label) 67 | 68 | for _ in test_graphs: 69 | pred_label_list.append(random.choice(test_label_list)) 70 | 71 | random_acc = accuracy_score(test_label_list, pred_label_list) 72 | random_acc_list.append(random_acc) 73 | 74 | print("random prediction acc score:{}".format(sum(random_acc_list)/len(random_acc_list))) -------------------------------------------------------------------------------- /link_classification_exp/node2vec/src/main.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Reference implementation of node2vec. 3 | 4 | Author: Aditya Grover 5 | 6 | For more details, refer to the paper: 7 | node2vec: Scalable Feature Learning for Networks 8 | Aditya Grover and Jure Leskovec 9 | Knowledge Discovery and Data Mining (KDD), 2016 10 | ''' 11 | 12 | import argparse 13 | import os 14 | 15 | import numpy as np 16 | import networkx as nx 17 | 18 | from gensim.models import Word2Vec 19 | 20 | from link_classification_exp.node2vec.src import node2vec 21 | dir_path = os.path.dirname(os.path.realpath(__file__)) 22 | root_path = os.path.abspath(os.path.join(dir_path, os.pardir)) 23 | 24 | def parse_args(): 25 | ''' 26 | Parses the node2vec arguments. 27 | ''' 28 | parser = argparse.ArgumentParser(description="Run node2vec.") 29 | 30 | parser.add_argument('--input', nargs='?', default='graph/karate.edgelist', 31 | help='Input graph path') 32 | 33 | parser.add_argument('--output', nargs='?', default='emb/karate.emb', 34 | help='Embeddings path') 35 | 36 | parser.add_argument('--dimensions', type=int, default=128, 37 | help='Number of dimensions. Default is 128.') 38 | 39 | parser.add_argument('--walk-length', type=int, default=80, 40 | help='Length of walk per source. 
Default is 80.') 41 | 42 | parser.add_argument('--num-walks', type=int, default=10, 43 | help='Number of walks per source. Default is 10.') 44 | 45 | parser.add_argument('--window-size', type=int, default=10, 46 | help='Context size for optimization. Default is 10.') 47 | 48 | parser.add_argument('--iter', default=1, type=int, 49 | help='Number of epochs in SGD') 50 | 51 | parser.add_argument('--workers', type=int, default=8, 52 | help='Number of parallel workers. Default is 8.') 53 | 54 | parser.add_argument('--p', type=float, default=1, 55 | help='Return hyperparameter. Default is 1.') 56 | 57 | parser.add_argument('--q', type=float, default=1, 58 | help='Inout hyperparameter. Default is 1.') 59 | 60 | parser.add_argument('--weighted', dest='weighted', action='store_true', 61 | help='Boolean specifying (un)weighted. Default is unweighted.') 62 | parser.add_argument('--unweighted', dest='unweighted', action='store_false') 63 | parser.set_defaults(weighted=False) 64 | 65 | parser.add_argument('--directed', dest='directed', action='store_true', 66 | help='Graph is (un)directed. Default is undirected.') 67 | parser.add_argument('--undirected', dest='undirected', action='store_false') 68 | parser.set_defaults(directed=False) 69 | 70 | return parser.parse_args() 71 | 72 | def read_graph(): 73 | ''' 74 | Reads the input network in networkx. 75 | ''' 76 | if args.weighted: 77 | G = nx.read_edgelist('{}/{}'.format(root_path, args.input), nodetype=int, data=(('weight',float),), create_using=nx.DiGraph()) 78 | else: 79 | G = nx.read_edgelist('{}/{}'.format(root_path, args.input), nodetype=int, create_using=nx.DiGraph()) 80 | for edge in G.edges(): 81 | G[edge[0]][edge[1]]['weight'] = 1 82 | 83 | if not args.directed: 84 | G = G.to_undirected() 85 | 86 | return G 87 | 88 | def learn_embeddings(args, walks): 89 | ''' 90 | Learn embeddings by optimizing the Skipgram objective using SGD. 91 | ''' 92 | # walks = [map(str, walk) for walk in walks] 93 | walks = [list(map(str, walk)) for walk in walks] # convert each vertex id to a string 94 | model = Word2Vec(walks, size=args.dimensions, window=args.window_size, min_count=0, sg=1, workers=args.workers, iter=args.iter) 95 | model.wv.save_word2vec_format('{}/{}'.format(root_path, args.output)) 96 | return model.wv 97 | 98 | def main(args): 99 | ''' 100 | Pipeline for representational learning for all nodes in a graph. 101 | ''' 102 | nx_G = read_graph() 103 | G = node2vec.Graph(nx_G, args.directed, args.p, args.q) 104 | G.preprocess_transition_probs() 105 | walks = G.simulate_walks(args.num_walks, args.walk_length) 106 | learn_embeddings(args, walks) 107 | 108 | if __name__ == "__main__": 109 | args = parse_args() 110 | main(args) -------------------------------------------------------------------------------- /link_classification_exp/node2vec/src/node2vec.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import networkx as nx 3 | import random 4 | 5 | 6 | class Graph(): 7 | def __init__(self, nx_G, is_directed, p, q): 8 | self.G = nx_G 9 | self.is_directed = is_directed 10 | self.p = p 11 | self.q = q 12 | 13 | def node2vec_walk(self, walk_length, start_node): 14 | ''' 15 | Simulate a random walk starting from start node. 
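        A minimal usage sketch, assuming a small weighted, undirected graph and
        p = q = 1 (the names below follow this module):

            import networkx as nx
            nx_G = nx.karate_club_graph()
            for u, v in nx_G.edges():
                nx_G[u][v]['weight'] = 1                 # the walker expects a 'weight' on every edge
            g = Graph(nx_G, is_directed=False, p=1, q=1)
            g.preprocess_transition_probs()
            walk = g.node2vec_walk(walk_length=10, start_node=0)   # list of visited node ids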
16 | ''' 17 | G = self.G 18 | alias_nodes = self.alias_nodes 19 | alias_edges = self.alias_edges 20 | 21 | walk = [start_node] 22 | 23 | while len(walk) < walk_length: 24 | cur = walk[-1] 25 | cur_nbrs = sorted(G.neighbors(cur)) 26 | if len(cur_nbrs) > 0: 27 | if len(walk) == 1: 28 | walk.append(cur_nbrs[alias_draw(alias_nodes[cur][0], alias_nodes[cur][1])]) 29 | else: 30 | prev = walk[-2] 31 | next = cur_nbrs[alias_draw(alias_edges[(prev, cur)][0], 32 | alias_edges[(prev, cur)][1])] 33 | walk.append(next) 34 | else: 35 | break 36 | 37 | return walk 38 | 39 | def simulate_walks(self, num_walks, walk_length): 40 | ''' 41 | Repeatedly simulate random walks from each node. 42 | ''' 43 | G = self.G 44 | walks = [] 45 | nodes = list(G.nodes()) 46 | print('Walk iteration:') 47 | for walk_iter in range(num_walks): 48 | print(str(walk_iter+1), '/', str(num_walks)) 49 | random.shuffle(nodes) 50 | for node in nodes: 51 | walks.append(self.node2vec_walk(walk_length=walk_length, start_node=node)) 52 | 53 | return walks 54 | 55 | def get_alias_edge(self, src, dst): 56 | ''' 57 | Get the alias edge setup lists for a given edge. 58 | ''' 59 | G = self.G 60 | p = self.p 61 | q = self.q 62 | 63 | unnormalized_probs = [] 64 | for dst_nbr in sorted(G.neighbors(dst)): 65 | if dst_nbr == src: 66 | unnormalized_probs.append(G[dst][dst_nbr]['weight']/p) 67 | elif G.has_edge(dst_nbr, src): 68 | unnormalized_probs.append(G[dst][dst_nbr]['weight']) 69 | else: 70 | unnormalized_probs.append(G[dst][dst_nbr]['weight']/q) 71 | norm_const = sum(unnormalized_probs) 72 | normalized_probs = [float(u_prob)/norm_const for u_prob in unnormalized_probs] 73 | 74 | return alias_setup(normalized_probs) 75 | 76 | def preprocess_transition_probs(self): 77 | ''' 78 | Preprocessing of transition probabilities for guiding the random walks. 79 | ''' 80 | G = self.G 81 | is_directed = self.is_directed 82 | 83 | alias_nodes = {} 84 | for node in G.nodes(): 85 | unnormalized_probs = [G[node][nbr]['weight'] for nbr in sorted(G.neighbors(node))] 86 | norm_const = sum(unnormalized_probs) 87 | normalized_probs = [float(u_prob)/norm_const for u_prob in unnormalized_probs] 88 | alias_nodes[node] = alias_setup(normalized_probs) 89 | 90 | alias_edges = {} 91 | triads = {} 92 | 93 | if is_directed: 94 | for edge in G.edges(): 95 | alias_edges[edge] = self.get_alias_edge(edge[0], edge[1]) 96 | else: 97 | for edge in G.edges(): 98 | alias_edges[edge] = self.get_alias_edge(edge[0], edge[1]) 99 | alias_edges[(edge[1], edge[0])] = self.get_alias_edge(edge[1], edge[0]) 100 | 101 | self.alias_nodes = alias_nodes 102 | self.alias_edges = alias_edges 103 | 104 | return 105 | 106 | 107 | def alias_setup(probs): 108 | ''' 109 | Compute utility lists for non-uniform sampling from discrete distributions. 
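    For example, probs = [0.5, 0.25, 0.25] builds a table from which alias_draw
    returns index 0 about half of the time and indices 1 and 2 about a quarter
    of the time each. A minimal usage sketch:

        probs = [0.5, 0.25, 0.25]
        J, q = alias_setup(probs)
        draws = [alias_draw(J, q) for _ in range(10000)]   # empirical frequencies approximate probs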
110 | Refer to https://hips.seas.harvard.edu/blog/2013/03/03/the-alias-method-efficient-sampling-with-many-discrete-outcomes/ 111 | for details 112 | ''' 113 | K = len(probs) 114 | q = np.zeros(K) 115 | J = np.zeros(K, dtype=np.int) 116 | 117 | smaller = [] 118 | larger = [] 119 | for kk, prob in enumerate(probs): 120 | q[kk] = K*prob 121 | if q[kk] < 1.0: 122 | smaller.append(kk) 123 | else: 124 | larger.append(kk) 125 | 126 | while len(smaller) > 0 and len(larger) > 0: 127 | small = smaller.pop() 128 | large = larger.pop() 129 | 130 | J[small] = large 131 | q[large] = q[large] + q[small] - 1.0 132 | if q[large] < 1.0: 133 | smaller.append(large) 134 | else: 135 | larger.append(large) 136 | 137 | return J, q 138 | 139 | def alias_draw(J, q): 140 | ''' 141 | Draw sample from a non-uniform discrete distribution using alias sampling. 142 | ''' 143 | K = len(J) 144 | 145 | kk = int(np.floor(np.random.rand()*K)) 146 | if np.random.rand() < q[kk]: 147 | return kk 148 | else: 149 | return J[kk] -------------------------------------------------------------------------------- /src/dataloader.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import torch 5 | import numpy as np 6 | import networkx as nx 7 | import scipy 8 | import scipy.sparse as sp 9 | import pickle 10 | 11 | from src.utils import graph_to_adj_list 12 | 13 | dir_path = os.path.dirname(os.path.realpath(__file__)) 14 | root_path = os.path.abspath(os.path.join(dir_path, os.pardir)) 15 | 16 | multi_graph_dataset = set(['relabeled_dblp2', 'new_dblp2', 'dblp2','new_IMDB_MULTI', 'IMDB_MULTI', 'Resampled_IMDB_MULTI']) 17 | 18 | def parse_index_file(filename): 19 | index = [] 20 | for line in open(filename): 21 | index.append(int(line.strip())) 22 | return index 23 | 24 | 25 | def generate_feature(adj, n_eigenvector): 26 | # Use eigenvector as feature 27 | adj_orig = adj.copy() 28 | adj_orig = adj_orig - sp.dia_matrix((adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape) # eliminate diag element 29 | adj_orig.eliminate_zeros() 30 | 31 | # n_eigenvector will be bounded by (0.65 * node_num , args.n_eigenvector) 32 | node_num = adj_orig.shape[0] 33 | if n_eigenvector > 0.65 * node_num: 34 | n_eigenvector = int(0.65 * node_num) 35 | 36 | # graph spectical transformation 37 | adj_ = sp.coo_matrix(adj_orig) 38 | adj_ = adj_orig + sp.eye(adj_.shape[0]) # add diag back 39 | rowsum = np.array(adj_.sum(1)) 40 | degree_mat_inv_sqrt = sp.diags(np.power(rowsum, -0.5).flatten()) 41 | adj_normalized = adj_.dot(degree_mat_inv_sqrt).transpose().dot(degree_mat_inv_sqrt).toarray() 42 | _, adj_features = scipy.linalg.eigh(adj_normalized, eigvals=(node_num - n_eigenvector, node_num - 1)) 43 | 44 | actual_feature_dim = n_eigenvector 45 | return adj_features, actual_feature_dim 46 | 47 | 48 | 49 | class Single_Graph_Dataset: 50 | def __init__(self, dataset_str, n_eigenvector, graph_adj=None, label=None): 51 | self.dataset_str = dataset_str 52 | self.n_eigenvector = n_eigenvector 53 | self.graph_adj = graph_adj 54 | self.label=label 55 | self.load_data() 56 | 57 | def load_data(self): 58 | if self.dataset_str == 'karate': 59 | G = nx.karate_club_graph() 60 | adj = nx.to_scipy_sparse_matrix(G) 61 | features = torch.eye(adj.shape[0]) 62 | elif self.dataset_str in multi_graph_dataset: 63 | adj = self.graph_adj 64 | else: 65 | print("dataset: {} is unkown, Single_Graph_Dataset.".format(self.dataset_str)) 66 | sys.exit(1) 67 | 68 | if self.n_eigenvector is not None and self.n_eigenvector 
!= 0: 69 | features, actual_feature_dim = generate_feature(adj, self.n_eigenvector) 70 | self.actual_feature_dim = actual_feature_dim 71 | 72 | 73 | adj_temp = adj.copy() # eliminate 0 --> adj_orig 74 | adj_temp = adj_temp - sp.dia_matrix((adj_temp.diagonal()[np.newaxis, :], [0]), shape=adj_temp.shape) 75 | adj_temp = adj_temp + sp.dia_matrix((np.ones(adj_temp.shape[0]), [0]), shape=adj_temp.shape) 76 | adj_temp.eliminate_zeros() 77 | self.adj = adj_temp 78 | self.features = features 79 | self.adj_list = graph_to_adj_list(self.adj) 80 | 81 | 82 | # TODO: add graph label 83 | class Multi_Graph_Dataset: 84 | def __init__(self, dataset_str, n_eigenvector): 85 | self.dataset_str = dataset_str 86 | self.n_eigenvector = n_eigenvector 87 | self.load_data() 88 | 89 | def load_data(self): 90 | graph_size_list = [] 91 | dataset_list = [] 92 | with open(root_path + '/data/orig/' + self.dataset_str + '.pkl', 'rb') as tf: 93 | graph_set = pickle.load(tf) 94 | for graph in graph_set: 95 | label = graph.graph['label'] 96 | graph_size_list.append(graph.number_of_nodes()) 97 | sp_adj_matrix = nx.to_scipy_sparse_matrix(graph) 98 | dataset_list.append(Single_Graph_Dataset(self.dataset_str, self.n_eigenvector, graph_adj=sp_adj_matrix, label=label) ) 99 | self.datasets = dataset_list 100 | 101 | 102 | 103 | # def pickle_load_data(dataset_str): 104 | # if dataset_str == 'IMDB': 105 | # with open(sys.path[1] + '/data/' + 'IMDB_MULTI' + '.pickle', 'rb') as tf: 106 | # graph_set = pkl.load(tf) 107 | # 108 | # elif dataset_str == 'reddit_binary_nx2': 109 | # with open(sys.path[1] + '/data/' + 'reddit_nx2' + '.pickle', 'rb') as tf: 110 | # graph_set = pkl.load(tf) 111 | # else: 112 | # print("dataset: {} doesn't exist.".format(dataset_str)) 113 | # sys.exit(1) 114 | # return graph_set -------------------------------------------------------------------------------- /src/GGAN/dataloader.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import torch 5 | import numpy as np 6 | import networkx as nx 7 | import scipy 8 | import scipy.sparse as sp 9 | import pickle 10 | 11 | from src.utils import graph_to_adj_list 12 | 13 | dir_path = os.path.dirname(os.path.realpath(__file__)) 14 | root_path = os.path.abspath(os.path.join(dir_path, os.pardir)) 15 | 16 | multi_graph_dataset = set(['relabeled_dblp2', 'new_dblp2', 'dblp2','new_IMDB_MULTI', 'IMDB_MULTI', 'Resampled_IMDB_MULTI']) 17 | 18 | def parse_index_file(filename): 19 | index = [] 20 | for line in open(filename): 21 | index.append(int(line.strip())) 22 | return index 23 | 24 | 25 | def generate_feature(adj, n_eigenvector): 26 | # Use eigenvector as feature 27 | adj_orig = adj.copy() 28 | adj_orig = adj_orig - sp.dia_matrix((adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape) # eliminate diag element 29 | adj_orig.eliminate_zeros() 30 | 31 | # n_eigenvector will be bounded by (0.65 * node_num , args.n_eigenvector) 32 | node_num = adj_orig.shape[0] 33 | if n_eigenvector > 0.65 * node_num: 34 | n_eigenvector = int(0.65 * node_num) 35 | 36 | # graph spectical transformation 37 | adj_ = sp.coo_matrix(adj_orig) 38 | adj_ = adj_orig + sp.eye(adj_.shape[0]) # add diag back 39 | rowsum = np.array(adj_.sum(1)) 40 | degree_mat_inv_sqrt = sp.diags(np.power(rowsum, -0.5).flatten()) 41 | adj_normalized = adj_.dot(degree_mat_inv_sqrt).transpose().dot(degree_mat_inv_sqrt).toarray() 42 | _, adj_features = scipy.linalg.eigh(adj_normalized, eigvals=(node_num - n_eigenvector, node_num - 1)) 43 | 44 | 
actual_feature_dim = n_eigenvector 45 | return adj_features, actual_feature_dim 46 | 47 | 48 | 49 | class Single_Graph_Dataset: 50 | def __init__(self, dataset_str, n_eigenvector, graph_adj=None, label=None): 51 | self.dataset_str = dataset_str 52 | self.n_eigenvector = n_eigenvector 53 | self.graph_adj = graph_adj 54 | self.label=label 55 | self.load_data() 56 | 57 | def load_data(self): 58 | if self.dataset_str == 'karate': 59 | G = nx.karate_club_graph() 60 | adj = nx.to_scipy_sparse_matrix(G) 61 | features = torch.eye(adj.shape[0]) 62 | elif self.dataset_str in multi_graph_dataset: 63 | adj = self.graph_adj 64 | else: 65 | print("dataset: {} is unkown, Single_Graph_Dataset.".format(self.dataset_str)) 66 | sys.exit(1) 67 | 68 | if self.n_eigenvector is not None and self.n_eigenvector != 0: 69 | features, actual_feature_dim = generate_feature(adj, self.n_eigenvector) 70 | self.actual_feature_dim = actual_feature_dim 71 | 72 | 73 | adj_temp = adj.copy() # eliminate 0 --> adj_orig 74 | adj_temp = adj_temp - sp.dia_matrix((adj_temp.diagonal()[np.newaxis, :], [0]), shape=adj_temp.shape) 75 | adj_temp = adj_temp + sp.dia_matrix((np.ones(adj_temp.shape[0]), [0]), shape=adj_temp.shape) 76 | adj_temp.eliminate_zeros() 77 | self.adj = adj_temp 78 | self.features = features 79 | self.adj_list = graph_to_adj_list(self.adj) 80 | 81 | 82 | # TODO: add graph label 83 | class Multi_Graph_Dataset: 84 | def __init__(self, dataset_str, n_eigenvector): 85 | self.dataset_str = dataset_str 86 | self.n_eigenvector = n_eigenvector 87 | self.load_data() 88 | 89 | def load_data(self): 90 | graph_size_list = [] 91 | dataset_list = [] 92 | with open(root_path + '/data/orig/' + self.dataset_str + '.pkl', 'rb') as tf: 93 | graph_set = pickle.load(tf) 94 | for graph in graph_set: 95 | label = graph.graph['label'] 96 | graph_size_list.append(graph.number_of_nodes()) 97 | sp_adj_matrix = nx.to_scipy_sparse_matrix(graph) 98 | dataset_list.append(Single_Graph_Dataset(self.dataset_str, self.n_eigenvector, graph_adj=sp_adj_matrix, label=label) ) 99 | self.datasets = dataset_list 100 | 101 | 102 | 103 | # def pickle_load_data(dataset_str): 104 | # if dataset_str == 'IMDB': 105 | # with open(sys.path[1] + '/data/' + 'IMDB_MULTI' + '.pickle', 'rb') as tf: 106 | # graph_set = pkl.load(tf) 107 | # 108 | # elif dataset_str == 'reddit_binary_nx2': 109 | # with open(sys.path[1] + '/data/' + 'reddit_nx2' + '.pickle', 'rb') as tf: 110 | # graph_set = pkl.load(tf) 111 | # else: 112 | # print("dataset: {} doesn't exist.".format(dataset_str)) 113 | # sys.exit(1) 114 | # return graph_set -------------------------------------------------------------------------------- /src/test.py: -------------------------------------------------------------------------------- 1 | def test(args, current_time, visual_path, result_filename, model, data_package,degree_distribution_folder=None): 2 | 3 | # unpack data_package 4 | adj_train = data_package['adj_train'] 5 | adj_orig = data_package['adj_orig'] 6 | test_edges = data_package['test_edges'] 7 | test_edges_false = data_package['test_edges_false'] 8 | color = data_package['color'] 9 | node_num = data_package['node_num'] 10 | priv_pars = data_package['priv_pars'] 11 | 12 | with torch.no_grad(): 13 | node_list = np.array(range(node_num)) 14 | mu, logvar, adj_without_sigmoid, _ = model.forward(node_list, use_L2_Loss=args.use_L2_Loss, KL_type=args.KL_type, for_test=True) 15 | 16 | # not full_data ==>> link prediction test 17 | if not args.full_data and args.single_graph: 18 | roc_score, ap_score = 
get_roc_score(adj_without_sigmoid, adj_orig, test_edges, test_edges_false) 19 | 20 | print('Test ROC score: ' + str(roc_score)) 21 | print('Test AP score: ' + str(ap_score)) 22 | log_info = 'Test ROC score: ' + str(roc_score) + '\tTest AP score: ' + str(ap_score) 23 | log_write(result_filename, log_info) 24 | 25 | 26 | # full_data ==>> generate some graph after training, in case of sampling variance. 27 | elif args.single_graph: 28 | for i in range(args.generated_graph_num): 29 | 30 | ## 1.generated graph ## 31 | seed = torch.randn(node_num, args.layer2_dim) 32 | generated_graph, gene_neck_value, _ = generate_graph(seed, None, model, adj_train, args, 33 | list(range(node_num)), threshold=args.threshold, 34 | for_test=True) 35 | 36 | generated_graph_property = compute_graph_statistics(generated_graph) 37 | print(generated_graph_property) 38 | 39 | graph_title = args.model_name+"_"+str(args.dataset_str)+'_predict_graph_test_'+str(i) 40 | draw_graph(generated_graph, visual_path, graph_title, circle=args.circle_plot, color=color) 41 | 42 | if args.single_graph: 43 | draw_degree_distribution(generated_graph, args.model_name+'_predict_graph_test_'+str(i), 44 | degree_distribution_folder) 45 | #################### 46 | 47 | ## 2.params_graph ## 48 | params_graph, params_graph_neck_value, _ = generate_graph(None, adj_without_sigmoid, model, adj_train, 49 | args, 50 | list(range(node_num)), 51 | threshold=args.threshold, for_test=True) 52 | 53 | params_graph_property = compute_graph_statistics(params_graph) 54 | print(params_graph_property) 55 | 56 | graph_title = args.model_name + "_" + str(args.dataset_str) + '_params_graph_test_' + str(i) 57 | draw_graph(params_graph, visual_path, graph_title, circle=args.circle_plot, color=color) 58 | 59 | if args.single_graph: 60 | draw_degree_distribution(params_graph, args.model_name+'_params_graph_test_ ' + str(i), 61 | degree_distribution_folder) 62 | #################### 63 | 64 | ## 3.embed_graph ## 65 | embed_graph_test, embed_test_neck_value, _ = generate_graph(mu, None, model, adj_train, args, 66 | list(range(node_num)), threshold=args.threshold, for_test=True) 67 | 68 | embed_graph_test_property = compute_graph_statistics(embed_graph_test) 69 | print(embed_graph_test_property) 70 | 71 | graph_title = args.model_name + "_" + str(args.dataset_str) + '_embed_graph_test' 72 | draw_graph(embed_graph_test, visual_path, graph_title, circle=args.circle_plot, color=color) 73 | 74 | if args.single_graph: 75 | draw_degree_distribution(embed_graph_test, args.model_name + '_embed_graph_test', 76 | degree_distribution_folder) 77 | #################### 78 | 79 | 80 | # save 'embed_graph_test' with networkx for presentation 81 | G = nx.from_numpy_array(embed_graph_test) 82 | file = sys.path[1] + '/presentation/' 83 | file = file + args.dataset_str + '_' + args.model_name + '_' + current_time + '_test.pickle' 84 | with open(file, 'wb') as handle: 85 | pickle.dump(G, handle) -------------------------------------------------------------------------------- /link_classification_exp/preprocess.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import pickle 4 | import random 5 | import shutil 6 | import numpy as np 7 | import networkx as nx 8 | 9 | from src.dataloader import Single_Graph_Dataset 10 | 11 | dir_path = os.path.dirname(os.path.realpath(__file__)) 12 | root_path = os.path.abspath(os.path.join(dir_path, os.pardir)) 13 | 14 | 15 | test_edge_ratio = 0.3 16 | 17 | 18 | def arg_parser(): 19 | 
parser = argparse.ArgumentParser() 20 | parser.add_argument('--seed', type=int, default=43, help='Random seed.') 21 | 22 | args = parser.parse_args() 23 | return args 24 | 25 | 26 | 27 | def process_orig_graph(graph_list, Gs): 28 | test_edges_list = [] 29 | test_edges_list_neg = [] 30 | train_edges_list = [] 31 | node_num_list = [] 32 | for i, graph_edges in enumerate(graph_list): 33 | node_num_list.append(Gs[i].number_of_nodes()) 34 | num_edge = len(graph_edges) 35 | hold_out_num = int(test_edge_ratio * num_edge) 36 | test_edge_idx = random.sample(range(num_edge), hold_out_num) 37 | train_edge_idx = list(set(list(range(num_edge))) - set(test_edge_idx)) 38 | # train_edge_idx = list(set(list(range(num_edge)))) 39 | assert hold_out_num > 0 40 | train_edges_list.append(list((np.array(graph_edges)[train_edge_idx]))) 41 | test_edges_list.append(list((np.array(graph_edges)[test_edge_idx]))) 42 | 43 | not_exist_edge = [] 44 | for non_edge in nx.non_edges(Gs[i]): 45 | not_exist_edge.append([int(non_edge[0]), int(non_edge[1])]) 46 | test_edge_idx_neg = random.sample(range(len(not_exist_edge)), hold_out_num) 47 | test_edges_list_neg.append(list((np.array(not_exist_edge)[test_edge_idx_neg]))) 48 | 49 | return train_edges_list, test_edges_list, test_edges_list_neg, node_num_list 50 | 51 | 52 | 53 | def process_generated_graph(graphs_list, test_edges_list): 54 | gen_train_edges_list = [] 55 | assert len(graph_list) == len(test_edges_list) 56 | for idx, edge_list in enumerate(graphs_list): 57 | test_edge_list = test_edges_list[idx] 58 | test_edge_tuple = [tuple(x) for x in test_edge_list] 59 | edge_tuple = [tuple(x) for x in edge_list] 60 | gen_train_edge_list = list(set(edge_tuple) - set(test_edge_tuple)) 61 | # gen_train_edge_list = list(set(edge_tuple)) 62 | gen_train_edges_list.append(gen_train_edge_list) 63 | return gen_train_edges_list 64 | 65 | 66 | def save_edge_list(node_num, file_name, edge_list): 67 | with open(file_name, 'w') as file: 68 | file.write(str(node_num) + '\n') 69 | for edge in edge_list: 70 | file.write(str(edge[0]) + ' ' + str(edge[1]) + '\n') 71 | 72 | 73 | 74 | def save_processed_graph(node_num_list, train_edges_list, test_edges_list=None, graph_dataset_name=None): 75 | # check folder exist or not 76 | folder_path = dir_path + '/dataset/{}/'.format(graph_dataset_name) 77 | if os.path.exists(folder_path): 78 | shutil.rmtree(folder_path) 79 | os.makedirs(folder_path) 80 | for idx, edge_list in enumerate(train_edges_list): 81 | save_edge_list(node_num_list[idx], folder_path + 'train_edge_list_{}.txt'.format(idx), edge_list) 82 | if test_edges_list is not None: 83 | for idx, (test_edges_list_pos, test_edges_list_neg) in enumerate(test_edges_list): 84 | with open(folder_path + 'test_edge_list_{}.txt'.format(idx), 'w') as file: 85 | for edge in test_edges_list_pos: 86 | file.write(str(edge[0]) + ' ' + str(edge[1]) + ' ' + '1' + '\n') 87 | for edge in test_edges_list_neg: 88 | file.write(str(edge[0]) + ' ' + str(edge[1]) + ' ' + '0' + '\n') 89 | 90 | 91 | 92 | def read_graph_list(dataset_folder): 93 | dataset_list = [] 94 | with open(root_path + '/data/' + dataset_folder + '.pkl', 'rb') as tf: 95 | graph_set = pickle.load(tf) 96 | for graph in graph_set: 97 | edge_list = [] 98 | for line in nx.generate_edgelist(graph, data=False): 99 | edge_list.append([int(line.split(' ')[0]), int(line.split(' ')[1])]) 100 | dataset_list.append(edge_list) 101 | return dataset_list, graph_set 102 | 103 | 104 | if __name__ == '__main__': 105 | args = arg_parser() 106 | for graph_set_name in 
['new_dblp2', 'new_IMDB_MULTI']: 107 | orig_graph_list, Gs = read_graph_list('orig/{}'.format(graph_set_name)) 108 | train_edges_list, test_edges_list, test_edges_list_neg, node_num_list = process_orig_graph(orig_graph_list, Gs) 109 | save_processed_graph(node_num_list, train_edges_list, list(zip(test_edges_list, test_edges_list_neg)), graph_set_name) 110 | # for model in ['GGAN_{}', 'DPGGAN_{}_eps:0.1', 'DPGGAN_{}_eps:1.0', 'DPGGAN_{}_eps:10.0']: 111 | for model in ['GVAE_{}']: 112 | graph_list, _ = read_graph_list('generated/{}'.format(model.format(graph_set_name))) 113 | gen_train_edges_list = process_generated_graph(graph_list, test_edges_list) 114 | save_processed_graph(node_num_list, gen_train_edges_list, test_edges_list=None, graph_dataset_name=model.format(graph_set_name)) -------------------------------------------------------------------------------- /src/train.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from tqdm import tqdm 4 | import numpy as np 5 | from src.DPGGAN import px_expander 6 | from src.DPGGAN.utils_dp import create_cum_grads, update_privacy_pars, perturb_grad, update_privacy_account 7 | from src.eval import compute_graph_statistics 8 | from src.utils import save_top_n, save_edge_num, sample_subgraph 9 | seed = 3 10 | 11 | def generate_graph(adj_without_sigmoid, adj, threshold): 12 | generated_graph, gene_neck_value = save_top_n(adj_without_sigmoid, save_edge_num(adj), threshold=threshold) 13 | return generated_graph, gene_neck_value 14 | 15 | 16 | def eval_generated_graph(args, epoch, model, adj, logger): 17 | generated_graph_property_cache = [] 18 | params_graph = None 19 | with torch.no_grad(): 20 | for i in range(args.stat_generate_num): 21 | # get embedding of all nodes 22 | node_list = np.array(range(adj.shape[0])) 23 | # recovered is a sigmoid prob matrix 24 | mu, logvar, adj_without_sigmoid, _, _ = model.forward(node_list, adj, for_test=True) 25 | # the generated_graph is a [0,1] matrix. 
recovered is the mu+var, so re-generate from mu 26 | params_graph, params_neck_value = generate_graph(adj_without_sigmoid.numpy(), adj, args.threshold) 27 | 28 | generated_graph_property_cache.append(compute_graph_statistics(params_graph)) 29 | stat_log_info = logger.form_generated_stat_log(epoch, generated_graph_property_cache) 30 | logger.write(stat_log_info) 31 | 32 | return params_graph, generated_graph_property_cache 33 | 34 | 35 | def get_loss_weight(node_num, dataset): 36 | pos_weight = float(node_num * node_num - dataset.adj.sum()) / dataset.adj.sum() 37 | return pos_weight 38 | 39 | 40 | def train(args, model_args, dataset, model, optimizer, logger): 41 | node_num, feature_dim = dataset.features.shape 42 | pos_weight = get_loss_weight(node_num, dataset) # Use global norm and weight 43 | 44 | original_graph_stat = compute_graph_statistics(dataset.adj) 45 | stat_log_info = logger.form_original_stat_log(original_graph_stat) 46 | logger.write(stat_log_info) 47 | 48 | stop_info = None 49 | for epoch in tqdm(range(args.epochs)): 50 | optimizer.zero_grad() 51 | index = np.array(list(range(dataset.adj.shape[0]))) 52 | sub_adj = torch.FloatTensor(np.array(dataset.adj.todense())) 53 | mu, logvar_sub, reconst_adj, gan_pred, gan_label = model.forward(index, sub_adj, for_test=False) 54 | # Calculate loss 55 | loss = model.loss_function(reconst_adj, sub_adj, logvar_sub, pos_weight, gan_pred, gan_label) 56 | # Optimize 57 | assert torch.isnan(loss).sum() == 0 # if nan appears, change lr 58 | loss.backward() 59 | 60 | if 'DP' in args.model_name: 61 | perturb_grad(model_args, model) 62 | 63 | # Perform one step optimize 64 | if args.optimizer == 'ADADP': 65 | optimizer.step1() if epoch % 2 == 0 else optimizer.step2(model_args.tol) 66 | else: 67 | optimizer.step() 68 | 69 | if 'DP' in args.model_name: 70 | stop_signal = update_privacy_account(model_args, model) # Update privacy budget 71 | else: 72 | stop_signal = False 73 | 74 | if stop_signal: 75 | stop_info = "Run out of DP budget at epoch@{}.".format(epoch) 76 | print(stop_info) 77 | break 78 | 79 | generated_adj, generated_graph_property_cache = eval_generated_graph(args, args.epochs, model, dataset.adj, logger) 80 | 81 | if stop_info is not None: 82 | logger.write(stop_info) 83 | 84 | logger.write('='*25 + '\n') 85 | print("Optimization Finished!") 86 | return model, generated_adj, generated_graph_property_cache 87 | 88 | 89 | 90 | # 91 | # def draw_graph(args, epoch, model, node_num, adj): 92 | # # No matter whether full_data, 93 | # # Generate graph from multi-normal distribution and paramterized distribution @epoch 94 | # if (epoch + 1) % args.draw_graph_epoch == 0 and args.dataset_type == 'single': 95 | # with torch.no_grad(): 96 | # mu, logvar, adj_without_sigmoid, _ = model.forward(np.array(range(node_num)), args.use_L2_Loss, 97 | # KL_type=args.KL_type, for_test=True) 98 | # 99 | # ##### generate from mu and logvar ##### 100 | # params_graph, params_neck_value, _ = \ 101 | # generate_graph(None, adj_without_sigmoid, model, adj, args, list(range(node_num)), 102 | # threshold=args.threshold, for_test=True) 103 | # 104 | # print('params_graph neck_value@' + str(epoch) + ": " + str(params_neck_value)) 105 | # ###################################### 106 | # 107 | # ##### generate only from mu ######## 108 | # emb_graph, emb_neck_value, _ = \ 109 | # generate_graph(mu, None, model, adj, args, list(range(node_num)), 110 | # threshold=args.threshold, for_test=True) 111 | # 112 | # print('emb_graph neck_value@' + str(epoch) + ": " + 
str(emb_neck_value)) 113 | # ###################################· ### 114 | # 115 | # ##### generate from random seed ###### 116 | # random_seed = torch.randn(node_num, args.layer2_dim) 117 | # generated_graph, gene_neck_value, _ = generate_graph(random_seed, None, model, adj, args, 118 | # list(range(node_num)), 119 | # threshold=args.threshold, for_test=True) 120 | # print('gene_graph neck_value@' + str(epoch) + ": " + str(gene_neck_value)) 121 | # ###################################### 122 | -------------------------------------------------------------------------------- /src/GGAN/train.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from tqdm import tqdm 4 | import numpy as np 5 | from src.DPGGAN import px_expander 6 | from src.DPGGAN.utils_dp import create_cum_grads, update_privacy_pars, perturb_grad, update_privacy_account 7 | from src.eval import compute_graph_statistics 8 | from src.utils import save_top_n, save_edge_num, sample_subgraph 9 | seed = 3 10 | 11 | def generate_graph(adj_without_sigmoid, adj, threshold): 12 | generated_graph, gene_neck_value = save_top_n(adj_without_sigmoid, save_edge_num(adj), threshold=threshold) 13 | return generated_graph, gene_neck_value 14 | 15 | 16 | def eval_generated_graph(args, epoch, model, adj, logger): 17 | generated_graph_property_cache = [] 18 | params_graph = None 19 | with torch.no_grad(): 20 | for i in range(args.stat_generate_num): 21 | # get embedding of all nodes 22 | node_list = np.array(range(adj.shape[0])) 23 | # recovered is a sigmoid prob matrix 24 | mu, logvar, adj_without_sigmoid, _, _ = model.forward(node_list, adj, for_test=True) 25 | # the generated_graph is a [0,1] matrix. recovered is the mu+var, so re-generate from mu 26 | params_graph, params_neck_value = generate_graph(adj_without_sigmoid.numpy(), adj, args.threshold) 27 | 28 | generated_graph_property_cache.append(compute_graph_statistics(params_graph)) 29 | stat_log_info = logger.form_generated_stat_log(epoch, generated_graph_property_cache) 30 | logger.write(stat_log_info) 31 | 32 | return params_graph, generated_graph_property_cache 33 | 34 | 35 | def get_loss_weight(node_num, dataset): 36 | pos_weight = float(node_num * node_num - dataset.adj.sum()) / dataset.adj.sum() 37 | return pos_weight 38 | 39 | 40 | def train(args, model_args, dataset, model, optimizer, logger): 41 | node_num, feature_dim = dataset.features.shape 42 | pos_weight = get_loss_weight(node_num, dataset) # Use global norm and weight 43 | 44 | original_graph_stat = compute_graph_statistics(dataset.adj) 45 | stat_log_info = logger.form_original_stat_log(original_graph_stat) 46 | logger.write(stat_log_info) 47 | 48 | stop_info = None 49 | for epoch in tqdm(range(args.epochs)): 50 | optimizer.zero_grad() 51 | index = np.array(list(range(dataset.adj.shape[0]))) 52 | sub_adj = torch.FloatTensor(np.array(dataset.adj.todense())) 53 | mu, logvar_sub, reconst_adj, gan_pred, gan_label = model.forward(index, sub_adj, for_test=False) 54 | # Calculate loss 55 | loss = model.loss_function(reconst_adj, sub_adj, logvar_sub, pos_weight, gan_pred, gan_label) 56 | # Optimize 57 | assert torch.isnan(loss).sum() == 0 # if nan appears, change lr 58 | loss.backward() 59 | 60 | if 'DP' in args.model_name: 61 | perturb_grad(model_args, model) 62 | 63 | # Perform one step optimize 64 | if args.optimizer == 'ADADP': 65 | optimizer.step1() if epoch % 2 == 0 else optimizer.step2(model_args.tol) 66 | else: 67 | optimizer.step() 68 | 69 | if 
'DP' in args.model_name: 70 | stop_signal = update_privacy_account(model_args, model) # Update privacy budget 71 | else: 72 | stop_signal = False 73 | 74 | if stop_signal: 75 | stop_info = "Run out of DP budget at epoch@{}.".format(epoch) 76 | print(stop_info) 77 | break 78 | 79 | generated_adj, generated_graph_property_cache = eval_generated_graph(args, args.epochs, model, dataset.adj, logger) 80 | 81 | if stop_info is not None: 82 | logger.write(stop_info) 83 | 84 | logger.write('='*25 + '\n') 85 | print("Optimization Finished!") 86 | return model, generated_adj, generated_graph_property_cache 87 | 88 | 89 | 90 | # 91 | # def draw_graph(args, epoch, model, node_num, adj): 92 | # # No matter whether full_data, 93 | # # Generate graph from multi-normal distribution and paramterized distribution @epoch 94 | # if (epoch + 1) % args.draw_graph_epoch == 0 and args.dataset_type == 'single': 95 | # with torch.no_grad(): 96 | # mu, logvar, adj_without_sigmoid, _ = model.forward(np.array(range(node_num)), args.use_L2_Loss, 97 | # KL_type=args.KL_type, for_test=True) 98 | # 99 | # ##### generate from mu and logvar ##### 100 | # params_graph, params_neck_value, _ = \ 101 | # generate_graph(None, adj_without_sigmoid, model, adj, args, list(range(node_num)), 102 | # threshold=args.threshold, for_test=True) 103 | # 104 | # print('params_graph neck_value@' + str(epoch) + ": " + str(params_neck_value)) 105 | # ###################################### 106 | # 107 | # ##### generate only from mu ######## 108 | # emb_graph, emb_neck_value, _ = \ 109 | # generate_graph(mu, None, model, adj, args, list(range(node_num)), 110 | # threshold=args.threshold, for_test=True) 111 | # 112 | # print('emb_graph neck_value@' + str(epoch) + ": " + str(emb_neck_value)) 113 | # ###################################· ### 114 | # 115 | # ##### generate from random seed ###### 116 | # random_seed = torch.randn(node_num, args.layer2_dim) 117 | # generated_graph, gene_neck_value, _ = generate_graph(random_seed, None, model, adj, args, 118 | # list(range(node_num)), 119 | # threshold=args.threshold, for_test=True) 120 | # print('gene_graph neck_value@' + str(epoch) + ": " + str(gene_neck_value)) 121 | # ###################################### 122 | -------------------------------------------------------------------------------- /graph_classification_exp/util.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import networkx as nx 4 | import numpy as np 5 | import random 6 | import torch 7 | from sklearn.model_selection import StratifiedKFold 8 | 9 | dir_path = os.path.dirname(os.path.realpath(__file__)) 10 | root_path = os.path.abspath(os.path.join(dir_path, os.pardir)) 11 | 12 | class S2VGraph(object): 13 | def __init__(self, g, label, node_tags=None, node_features=None): 14 | ''' 15 | g: a networkx graph 16 | label: an integer graph label 17 | node_tags: a list of integer node tags 18 | node_features: a torch float tensor, one-hot representation of the tag that is used as input to neural nets 19 | edge_mat: a torch long tensor, contain edge list, will be used to create torch sparse tensor 20 | neighbors: list of neighbors (without self-loop) 21 | ''' 22 | self.label = label 23 | self.g = g 24 | self.node_tags = node_tags 25 | self.neighbors = [] 26 | self.node_features = 0 27 | self.edge_mat = 0 28 | 29 | self.max_neighbor = 0 30 | 31 | 32 | def load_data(dataset, degree_as_tag): 33 | ''' 34 | dataset: name of dataset 35 | test_proportion: ratio of test train split 36 | seed: 
random seed for random splitting of dataset 37 | ''' 38 | 39 | print('loading data') 40 | g_list = [] 41 | label_dict = {} 42 | feat_dict = {} 43 | 44 | with open(dir_path + '/dataset/%s/%s.txt' % (dataset, dataset), 'r') as f: 45 | n_g = int(f.readline().strip()) 46 | for i in range(n_g): 47 | row = f.readline().strip().split() 48 | n, l = [int(w) for w in row] # n: num of nodes in this graph, l: graph label 49 | if not l in label_dict: # create graph label dict: l -> graph label 50 | mapped = len(label_dict) 51 | label_dict[l] = mapped 52 | g = nx.Graph() 53 | node_tags = [] 54 | node_features = [] 55 | n_edges = 0 56 | for j in range(n): 57 | g.add_node(j) 58 | row = f.readline().strip().split() 59 | tmp = int(row[1]) + 2 # row[1] is about neighbor number 60 | if tmp == len(row): 61 | # no node attributes 62 | row = [int(w) for w in row] 63 | attr = None 64 | else: 65 | print('row, attr = [int(w) for w in row[:tmp]], np.array([float(w) for w in row[tmp:]])') 66 | exit(1) 67 | row, attr = [int(w) for w in row[:tmp]], np.array([float(w) for w in row[tmp:]]) 68 | if not row[0] in feat_dict: # row[0] is orig node tags 69 | mapped = len(feat_dict) 70 | feat_dict[row[0]] = mapped 71 | node_tags.append(feat_dict[row[0]]) 72 | 73 | if tmp > len(row): 74 | node_features.append(attr) 75 | 76 | n_edges += row[1] 77 | for k in range(2, len(row)): 78 | g.add_edge(j, row[k]) 79 | 80 | if node_features != []: 81 | node_features = np.stack(node_features) 82 | node_feature_flag = True 83 | else: 84 | node_features = None 85 | node_feature_flag = False 86 | 87 | assert len(g) == n 88 | 89 | g_list.append(S2VGraph(g, l, node_tags)) 90 | 91 | #add labels and edge_mat 92 | for g in g_list: 93 | g.neighbors = [[] for i in range(len(g.g))] # init neighbors list 94 | for i, j in g.g.edges(): # add neighbor for each node 95 | g.neighbors[i].append(j) 96 | g.neighbors[j].append(i) 97 | degree_list = [] 98 | for i in range(len(g.g)): 99 | g.neighbors[i] = g.neighbors[i] 100 | degree_list.append(len(g.neighbors[i])) 101 | g.max_neighbor = max(degree_list) 102 | 103 | g.label = label_dict[g.label] # convert g.label(orig text) -> g.label(mapped) 104 | 105 | edges = [list(pair) for pair in g.g.edges()] 106 | edges.extend([[i, j] for j, i in edges]) # convert directed edge to two direction edge 107 | 108 | deg_list = list(dict(g.g.degree(range(len(g.g)))).values()) 109 | g.edge_mat = torch.LongTensor(edges).transpose(0,1) # edge_mat is edge pair list 110 | 111 | if degree_as_tag: 112 | for g in g_list: 113 | g.node_tags = list(dict(g.g.degree).values()) 114 | else: 115 | print("degree_as_tag == Flase.") 116 | exit(1) 117 | 118 | #Extracting unique tag labels 119 | tagset = set([]) 120 | for g in g_list: 121 | tagset = tagset.union(set(g.node_tags)) 122 | 123 | tagset = list(tagset) 124 | tag2index = {tagset[i]:i for i in range(len(tagset))} 125 | 126 | for g in g_list: 127 | g.node_features = torch.zeros(len(g.node_tags), len(tagset)) 128 | g.node_features[range(len(g.node_tags)), [tag2index[tag] for tag in g.node_tags]] = 1 129 | 130 | 131 | print('# classes: %d' % len(label_dict)) 132 | print('# maximum node tag: %d' % len(tagset)) 133 | 134 | print("# data: %d" % len(g_list)) 135 | 136 | return g_list, len(label_dict) 137 | 138 | def separate_data(graph_list, seed, fold_idx): 139 | assert 0 <= fold_idx and fold_idx < 10, "fold_idx must be from 0 to 9." 
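    # Note: although the assert above speaks of 10-fold validation, n_splits is set
    # to 3 below, and the (test_idx, train_idx) unpacking order means the single
    # held-out fold (one third of the graphs) becomes the training set while the
    # remaining two folds are used for testing. A minimal sketch of the call below
    # (assumed toy labels):
    #   skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)
    #   big_idx, small_idx = list(skf.split(np.zeros(9), [0, 0, 0, 1, 1, 1, 2, 2, 2]))[0]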
140 | skf = StratifiedKFold(n_splits=3, shuffle = True, random_state = seed) 141 | 142 | labels = [graph.label for graph in graph_list] 143 | idx_list = [] 144 | for idx in skf.split(np.zeros(len(labels)), labels): 145 | idx_list.append(idx) 146 | # train_idx, test_idx = idx_list[fold_idx] 147 | test_idx, train_idx = idx_list[fold_idx] 148 | 149 | train_graph_list = [graph_list[i] for i in train_idx] 150 | test_graph_list = [graph_list[i] for i in test_idx] 151 | 152 | return train_graph_list, test_graph_list 153 | 154 | 155 | -------------------------------------------------------------------------------- /src/config.py: -------------------------------------------------------------------------------- 1 | class DPGGAN_dblp2_Adam: 2 | def __init__(self): 3 | self.layer1_dim = 64 4 | self.layer2_dim = 64 5 | self.dec1_dim = 64 6 | self.dec2_dim = 64 7 | self.samp_num = 10 8 | self.delta = 1e-5 9 | self.eps_requirement = 10.0 10 | self.noise_sigma = 2.0 11 | self.batch_proc_size = 512 12 | self.grad_norm_max = 5 13 | self.C_decay = 0.99 14 | self.tol = 1.0 15 | 16 | 17 | class DPGGAN_dblp2_ADADP: 18 | def __init__(self): 19 | self.layer1_dim = 32 20 | self.layer2_dim = 16 21 | self.dec1_dim = 16 22 | self.dec2_dim = 32 23 | self.samp_num = 10 24 | self.delta = 1e-5 25 | self.eps_requirement = 10.0 26 | self.noise_sigma = 5.0 27 | self.batch_proc_size = 512 28 | self.grad_norm_max = 5 29 | self.C_decay = 0.99 30 | self.tol = 1.0 # 'tolerance parameter' 31 | 32 | 33 | class DPGVAE_dblp2_Adam: 34 | def __init__(self): 35 | self.layer1_dim = 64 36 | self.layer2_dim = 32 37 | self.dec1_dim = 32 38 | self.dec2_dim = 64 39 | self.samp_num = 10 40 | self.delta = 1e-5 41 | self.eps_requirement = 1.0 42 | self.noise_sigma = 2.0 43 | self.batch_proc_size = 512 44 | self.grad_norm_max = 5 45 | self.C_decay = 0.99 46 | self.tol = 1.0 47 | 48 | 49 | class DPGVAE_dblp2_ADADP: 50 | def __init__(self): 51 | self.layer1_dim = 32 52 | self.layer2_dim = 16 53 | self.dec1_dim = 16 54 | self.dec2_dim = 32 55 | self.samp_num = 10 56 | self.delta = 1e-5 57 | self.eps_requirement = 1.0 58 | self.noise_sigma = 5.0 59 | self.batch_proc_size = 512 60 | self.grad_norm_max = 5 61 | self.C_decay = 0.99 62 | self.tol = 1.0 # 'tolerance parameter' 63 | 64 | 65 | class DPGGAN_imdb_Adam: 66 | def __init__(self): 67 | self.layer1_dim = 64 68 | self.layer2_dim = 64 69 | self.dec1_dim = 64 70 | self.dec2_dim = 64 71 | self.samp_num = 10 72 | self.delta = 1e-5 73 | self.eps_requirement = 10.0 74 | self.noise_sigma = 2.0 75 | self.batch_proc_size = 512 76 | self.grad_norm_max = 5 77 | self.C_decay = 0.99 78 | 79 | 80 | class DPGVAE_imdb_Adam: 81 | def __init__(self): 82 | self.layer1_dim = 32 83 | self.layer2_dim = 16 84 | self.dec1_dim = 16 85 | self.dec2_dim = 32 86 | self.samp_num = 10 87 | self.delta = 1e-5 88 | self.eps_requirement = 0.1 89 | self.noise_sigma = 2.0 90 | self.batch_proc_size = 512 91 | self.grad_norm_max = 5 92 | self.C_decay = 0.99 93 | 94 | 95 | class DPGGAN_imdb_ADADP: 96 | def __init__(self): 97 | self.layer1_dim = 32 98 | self.layer2_dim = 16 99 | self.dec1_dim = 16 100 | self.dec2_dim = 32 101 | self.samp_num = 10 102 | self.delta = 1e-5 103 | self.eps_requirement = 1.0 104 | self.noise_sigma = 5.0 105 | self.batch_proc_size = 512 106 | self.grad_norm_max = 5 107 | self.C_decay = 0.99 108 | self.tol = 1.0 # 'tolerance parameter' 109 | 110 | 111 | class DPGVAE_imdb_ADADP: 112 | def __init__(self): 113 | self.layer1_dim = 32 114 | self.layer2_dim = 16 115 | self.dec1_dim = 16 116 | self.dec2_dim = 32 117 | 
self.samp_num = 10 118 | self.delta = 1e-5 119 | self.eps_requirement = 1.0 120 | self.noise_sigma = 5.0 121 | self.batch_proc_size = 512 122 | self.grad_norm_max = 5 123 | self.C_decay = 0.99 124 | self.tol = 1.0 # 'tolerance parameter' 125 | 126 | 127 | class GVAE_dblp2: 128 | def __init__(self): 129 | self.layer1_dim = 128 130 | self.layer2_dim = 64 131 | self.dec1_dim = 64 132 | self.dec2_dim = 128 133 | self.samp_num = 10 134 | 135 | 136 | class GVAE_imdb: 137 | def __init__(self): 138 | self.layer1_dim = 128 139 | self.layer2_dim = 64 140 | self.dec1_dim = 64 141 | self.dec2_dim = 128 142 | self.samp_num = 10 143 | 144 | 145 | class GGAN_dblp2: 146 | def __init__(self): 147 | self.layer1_dim = 128 148 | self.layer2_dim = 64 149 | self.dec1_dim = 64 150 | self.dec2_dim = 128 151 | self.samp_num = 10 152 | 153 | 154 | class GGAN_imdb: 155 | def __init__(self): 156 | self.layer1_dim = 128 157 | self.layer2_dim = 64 158 | self.dec1_dim = 64 159 | self.dec2_dim = 128 160 | self.samp_num = 10 161 | 162 | 163 | 164 | def load_config(model_name, dataset_str, optimizer): 165 | if model_name == 'GVAE' and 'dblp' in dataset_str: 166 | return GVAE_dblp2() 167 | elif model_name == 'GGAN' and 'dblp' in dataset_str: 168 | return GGAN_dblp2() 169 | elif model_name == 'DPGGAN' and 'dblp' in dataset_str and optimizer == 'ADADP': 170 | return DPGGAN_dblp2_ADADP() 171 | elif model_name == 'DPGGAN' and 'dblp' in dataset_str and optimizer == 'Adam': 172 | return DPGGAN_dblp2_Adam() 173 | elif model_name == 'DPGVAE' and 'dblp' in dataset_str and optimizer == 'ADADP': 174 | return DPGVAE_dblp2_ADADP() 175 | elif model_name == 'DPGVAE' and 'dblp' in dataset_str and optimizer == 'Adam': 176 | return DPGVAE_dblp2_Adam() 177 | elif model_name == 'GVAE' and 'IMDB_MULTI' in dataset_str: 178 | return GVAE_imdb() 179 | elif model_name == 'GGAN' and 'IMDB_MULTI' in dataset_str: 180 | return GGAN_imdb() 181 | elif model_name == 'DPGGAN' and 'IMDB_MULTI' in dataset_str and optimizer == 'ADADP': 182 | return DPGGAN_imdb_ADADP() 183 | elif model_name == 'DPGGAN' and 'IMDB_MULTI' in dataset_str and optimizer == 'Adam': 184 | return DPGGAN_imdb_Adam() 185 | elif model_name == 'DPGVAE' and 'IMDB_MULTI' in dataset_str and optimizer == 'ADADP': 186 | return DPGVAE_imdb_ADADP() 187 | elif model_name == 'DPGVAE' and 'IMDB_MULTI' in dataset_str and optimizer == 'Adam': 188 | return DPGVAE_imdb_Adam() 189 | else: 190 | print("Unknown config...") 191 | exit(1) -------------------------------------------------------------------------------- /src/GGAN/config.py: -------------------------------------------------------------------------------- 1 | class DPGGAN_dblp2_Adam: 2 | def __init__(self): 3 | self.layer1_dim = 64 4 | self.layer2_dim = 64 5 | self.dec1_dim = 64 6 | self.dec2_dim = 64 7 | self.samp_num = 10 8 | self.delta = 1e-5 9 | self.eps_requirement = 10.0 10 | self.noise_sigma = 2.0 11 | self.batch_proc_size = 512 12 | self.grad_norm_max = 5 13 | self.C_decay = 0.99 14 | self.tol = 1.0 15 | 16 | 17 | class DPGGAN_dblp2_ADADP: 18 | def __init__(self): 19 | self.layer1_dim = 32 20 | self.layer2_dim = 16 21 | self.dec1_dim = 16 22 | self.dec2_dim = 32 23 | self.samp_num = 10 24 | self.delta = 1e-5 25 | self.eps_requirement = 10.0 26 | self.noise_sigma = 5.0 27 | self.batch_proc_size = 512 28 | self.grad_norm_max = 5 29 | self.C_decay = 0.99 30 | self.tol = 1.0 # 'tolerance parameter' 31 | 32 | 33 | class DPGVAE_dblp2_Adam: 34 | def __init__(self): 35 | self.layer1_dim = 64 36 | self.layer2_dim = 32 37 | self.dec1_dim = 32 38 | 
self.dec2_dim = 64 39 | self.samp_num = 10 40 | self.delta = 1e-5 41 | self.eps_requirement = 1.0 42 | self.noise_sigma = 2.0 43 | self.batch_proc_size = 512 44 | self.grad_norm_max = 5 45 | self.C_decay = 0.99 46 | self.tol = 1.0 47 | 48 | 49 | class DPGVAE_dblp2_ADADP: 50 | def __init__(self): 51 | self.layer1_dim = 32 52 | self.layer2_dim = 16 53 | self.dec1_dim = 16 54 | self.dec2_dim = 32 55 | self.samp_num = 10 56 | self.delta = 1e-5 57 | self.eps_requirement = 1.0 58 | self.noise_sigma = 5.0 59 | self.batch_proc_size = 512 60 | self.grad_norm_max = 5 61 | self.C_decay = 0.99 62 | self.tol = 1.0 # 'tolerance parameter' 63 | 64 | 65 | class DPGGAN_imdb_Adam: 66 | def __init__(self): 67 | self.layer1_dim = 64 68 | self.layer2_dim = 64 69 | self.dec1_dim = 64 70 | self.dec2_dim = 64 71 | self.samp_num = 10 72 | self.delta = 1e-5 73 | self.eps_requirement = 10.0 74 | self.noise_sigma = 2.0 75 | self.batch_proc_size = 512 76 | self.grad_norm_max = 5 77 | self.C_decay = 0.99 78 | 79 | 80 | class DPGVAE_imdb_Adam: 81 | def __init__(self): 82 | self.layer1_dim = 32 83 | self.layer2_dim = 16 84 | self.dec1_dim = 16 85 | self.dec2_dim = 32 86 | self.samp_num = 10 87 | self.delta = 1e-5 88 | self.eps_requirement = 0.1 89 | self.noise_sigma = 2.0 90 | self.batch_proc_size = 512 91 | self.grad_norm_max = 5 92 | self.C_decay = 0.99 93 | 94 | 95 | class DPGGAN_imdb_ADADP: 96 | def __init__(self): 97 | self.layer1_dim = 32 98 | self.layer2_dim = 16 99 | self.dec1_dim = 16 100 | self.dec2_dim = 32 101 | self.samp_num = 10 102 | self.delta = 1e-5 103 | self.eps_requirement = 1.0 104 | self.noise_sigma = 5.0 105 | self.batch_proc_size = 512 106 | self.grad_norm_max = 5 107 | self.C_decay = 0.99 108 | self.tol = 1.0 # 'tolerance parameter' 109 | 110 | 111 | class DPGVAE_imdb_ADADP: 112 | def __init__(self): 113 | self.layer1_dim = 32 114 | self.layer2_dim = 16 115 | self.dec1_dim = 16 116 | self.dec2_dim = 32 117 | self.samp_num = 10 118 | self.delta = 1e-5 119 | self.eps_requirement = 1.0 120 | self.noise_sigma = 5.0 121 | self.batch_proc_size = 512 122 | self.grad_norm_max = 5 123 | self.C_decay = 0.99 124 | self.tol = 1.0 # 'tolerance parameter' 125 | 126 | 127 | class GVAE_dblp2: 128 | def __init__(self): 129 | self.layer1_dim = 128 130 | self.layer2_dim = 64 131 | self.dec1_dim = 64 132 | self.dec2_dim = 128 133 | self.samp_num = 10 134 | 135 | 136 | class GVAE_imdb: 137 | def __init__(self): 138 | self.layer1_dim = 128 139 | self.layer2_dim = 64 140 | self.dec1_dim = 64 141 | self.dec2_dim = 128 142 | self.samp_num = 10 143 | 144 | 145 | class GGAN_dblp2: 146 | def __init__(self): 147 | self.layer1_dim = 128 148 | self.layer2_dim = 64 149 | self.dec1_dim = 64 150 | self.dec2_dim = 128 151 | self.samp_num = 10 152 | 153 | 154 | class GGAN_imdb: 155 | def __init__(self): 156 | self.layer1_dim = 128 157 | self.layer2_dim = 64 158 | self.dec1_dim = 64 159 | self.dec2_dim = 128 160 | self.samp_num = 10 161 | 162 | 163 | 164 | def load_config(model_name, dataset_str, optimizer): 165 | if model_name == 'GVAE' and 'dblp' in dataset_str: 166 | return GVAE_dblp2() 167 | elif model_name == 'GGAN' and 'dblp' in dataset_str: 168 | return GGAN_dblp2() 169 | elif model_name == 'DPGGAN' and 'dblp' in dataset_str and optimizer == 'ADADP': 170 | return DPGGAN_dblp2_ADADP() 171 | elif model_name == 'DPGGAN' and 'dblp' in dataset_str and optimizer == 'Adam': 172 | return DPGGAN_dblp2_Adam() 173 | elif model_name == 'DPGVAE' and 'dblp' in dataset_str and optimizer == 'ADADP': 174 | return DPGVAE_dblp2_ADADP() 175 | 
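# In the DP* configs above, eps_requirement and delta give the target (epsilon, delta)
# privacy budget and noise_sigma the Gaussian noise multiplier; grad_norm_max appears to
# be the per-example gradient clipping bound and C_decay a decay factor applied to it.
# How much budget each training step consumes is accounted for elsewhere
# (src/DPGGAN/gaussian_moments.py appears to implement the moments accountant).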
elif model_name == 'DPGVAE' and 'dblp' in dataset_str and optimizer == 'Adam': 176 | return DPGVAE_dblp2_Adam() 177 | elif model_name == 'GVAE' and 'IMDB_MULTI' in dataset_str: 178 | return GVAE_imdb() 179 | elif model_name == 'GGAN' and 'IMDB_MULTI' in dataset_str: 180 | return GGAN_imdb() 181 | elif model_name == 'DPGGAN' and 'IMDB_MULTI' in dataset_str and optimizer == 'ADADP': 182 | return DPGGAN_imdb_ADADP() 183 | elif model_name == 'DPGGAN' and 'IMDB_MULTI' in dataset_str and optimizer == 'Adam': 184 | return DPGGAN_imdb_Adam() 185 | elif model_name == 'DPGVAE' and 'IMDB_MULTI' in dataset_str and optimizer == 'ADADP': 186 | return DPGVAE_imdb_ADADP() 187 | elif model_name == 'DPGVAE' and 'IMDB_MULTI' in dataset_str and optimizer == 'Adam': 188 | return DPGVAE_imdb_Adam() 189 | else: 190 | print("Unknown config...") 191 | exit(1) -------------------------------------------------------------------------------- /src/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import networkx as nx 4 | import torch 5 | import random 6 | import argparse 7 | import numpy as np 8 | from datetime import datetime 9 | from src.logger import stat_logger 10 | from src.GGAN.model import GGAN 11 | from src.DPGGAN.model import DPGGAN 12 | from src.DPGGAN.adadp import ADADP 13 | from src.config import load_config 14 | from src.dataloader import Multi_Graph_Dataset 15 | from src.train import train 16 | 17 | dir_path = os.path.dirname(os.path.realpath(__file__)) 18 | root_path = os.path.abspath(os.path.join(dir_path, os.pardir)) 19 | 20 | multi_graph_dataset = set(['relabeled_dblp2', 'new_dblp2', 'dblp2','new_IMDB_MULTI', 'IMDB_MULTI', 'Resampled_IMDB_MULTI']) 21 | 22 | def arg_parser(): 23 | # init the common args, expect the model specific args 24 | parser = argparse.ArgumentParser() 25 | parser.add_argument('--seed', type=int, default=43, help='Random seed.') 26 | parser.add_argument('--threads', type=int, default=4, help='Thread number.') 27 | parser.add_argument('--txt_log', type=bool, default=True, help='whether save the txt_log.') 28 | parser.add_argument('--model_name', type=str, default='GVAE', help='[DPGGAN, GGAN, DPGVAE, GVAE]') 29 | parser.add_argument('--dataset_str', type=str, default='new_IMDB_MULTI', 30 | help="[dblp2, IMDB_MULTI, Resampled_IMDB_MULTI, new_IMDB_MULTI, new_dblp2]") 31 | parser.add_argument('--n_eigenvector', type=int, default=128, help='use eigenvector as initial feature.') 32 | 33 | parser.add_argument('--epochs', type=int, default=100, help='Number of epochs to train.') 34 | parser.add_argument('--test_period', type=int, default=10, help='test period.') 35 | parser.add_argument('--batch_size', type=int, default=16) 36 | parser.add_argument('--learning_rate', default=0.005, help='the ratio of training set in whole dataset.') 37 | parser.add_argument('--optimizer', type=str, default='Adam') 38 | parser.add_argument('--stat_generate_num', type=int, default=5, help='generate a batch of graph for graph stat.') 39 | parser.add_argument('--threshold', type=float, default=None) 40 | parser.add_argument('--discriminator_ratio', type=float, default=0.1, help='factor of discriminator loss.') 41 | parser.add_argument('--kl_ratio', type=float, default=1.0, help='factor of kl loss.') 42 | 43 | parser.add_argument('--std', type=float, default=0.75, help='Standard deviation of the Gaussian prior.') 44 | parser.add_argument('--eval_period', type=int, default=10) 45 | parser.add_argument('--draw_graph_epoch', type=int, 
default=2) 46 | parser.add_argument('--generated_graph_num', type=int, default=5, help='generate multiple graph at one time to compute variance.') 47 | parser.add_argument('--save_generated_graph', type=int, default=1, help='whether save generated graph or not.') 48 | 49 | args = parser.parse_args() 50 | return args 51 | 52 | 53 | 54 | def set_env(args): 55 | random.seed(args.seed) 56 | np.random.seed(args.seed) 57 | torch.manual_seed(args.seed) 58 | torch.set_num_threads(args.threads) 59 | current_time = datetime.now().strftime('%b_%d_%H-%M-%S') 60 | model_args = load_config(args.model_name, args.dataset_str, args.optimizer) 61 | if args.model_name in ['DPGVAE', 'GVAE']: 62 | args.discriminator_ratio = 0 63 | return args, model_args, current_time 64 | 65 | 66 | def load_data(args, dataset_str): 67 | assert dataset_str in multi_graph_dataset 68 | args.dataset_type = 'multi' 69 | dataset = Multi_Graph_Dataset(dataset_str, args.n_eigenvector) 70 | args.num_samples = dataset.datasets[0].features.shape[0] 71 | return dataset 72 | 73 | 74 | def save_data(args, data_list): 75 | if 'DP' in args.model_name: 76 | save_path = root_path + '/data/generated/{}_{}_eps:{}.pkl'.format(args.model_name, args.dataset_str, args.model_args.eps_requirement) 77 | else: 78 | save_path = root_path + '/data/generated/{}_{}.pkl'.format(args.model_name, args.dataset_str) 79 | 80 | with open(save_path, 'wb') as file: 81 | pickle.dump(data_list, file) 82 | 83 | 84 | def create_model(args, model_args, dataset): 85 | if args.batch_size > dataset.features.shape[0]: 86 | args.batch_size = dataset.features.shape[0] 87 | model = None 88 | if args.model_name in ['GGAN', 'GVAE']: 89 | model = GGAN(args, model_args, dataset.features, dataset.adj_list) 90 | elif args.model_name in ['DPGGAN', 'DPGVAE']: 91 | model = DPGGAN(args, model_args, dataset.features, dataset.adj_list) 92 | else: 93 | print(args.model_name) 94 | print("Unknown model name.") 95 | exit(1) 96 | 97 | is_enc1 = True 98 | for v in model.parameters(): 99 | if v is not None and v.requires_grad is True: 100 | if len(v.data.shape) == 2: 101 | continue 102 | if is_enc1: 103 | is_enc1 = False 104 | v.data.copy_(v[0].data.clone().repeat(args.batch_size * (1 + model_args.samp_num), 1, 1)) 105 | else: 106 | v.data.copy_(v[0].data.clone().repeat(args.batch_size, 1, 1)) 107 | return model 108 | 109 | 110 | def create_optimizer(args, model): 111 | optimizer = None 112 | if args.optimizer == 'Adam': 113 | optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=args.learning_rate) 114 | elif args.optimizer == 'SGD': 115 | optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad, model.parameters()), lr=args.learning_rate) 116 | elif args.optimizer == 'ADADP': 117 | optimizer = ADADP(filter(lambda p: p.requires_grad, model.parameters())) 118 | return optimizer 119 | 120 | 121 | def main(args, model_args, dataset, current_time): 122 | model = create_model(args, model_args, dataset) 123 | optimizer = create_optimizer(args, model) 124 | logger = stat_logger(args, current_time) 125 | model, generated_adj, generated_graph_property_cache = train(args, model_args, dataset, model, optimizer, logger) 126 | return generated_adj, generated_graph_property_cache 127 | 128 | 129 | if __name__ == '__main__': 130 | print(root_path) 131 | args = arg_parser() # args is general arguments 132 | args, model_args, current_time = set_env(args) 133 | args.model_args = model_args # model_args is relevant to a specific model 134 | datasets = load_data(args, 
args.dataset_str) 135 | generated_graph_list = [] 136 | property_dict_all = [] 137 | 138 | for counter, dataset in enumerate(datasets.datasets): 139 | model_args.dec2_dim = dataset.actual_feature_dim # the feat dim is capped for some small graph 140 | print('='*10 + str(counter) + '='*10) 141 | args.batch_size = dataset.adj.shape[0] 142 | generated_adj, generated_graph_property_cache = main(args, model_args, dataset, current_time) 143 | assert generated_adj.shape[0] == dataset.adj.shape[0] 144 | # save stat and generated graph 145 | property_dict_all.extend(generated_graph_property_cache) 146 | G = nx.from_numpy_array(generated_adj) 147 | G.graph['label'] = dataset.label 148 | generated_graph_list.append(G) 149 | 150 | if args.save_generated_graph: 151 | save_data(args, generated_graph_list) 152 | 153 | logger = stat_logger(args, current_time) 154 | stat_log_info = logger.form_generated_stat_log('final', property_dict_all) 155 | logger.write(stat_log_info) -------------------------------------------------------------------------------- /src/GGAN/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import networkx as nx 4 | import torch 5 | import random 6 | import argparse 7 | import numpy as np 8 | from datetime import datetime 9 | from src.logger import stat_logger 10 | from src.GGAN.model import GGAN 11 | from src.DPGGAN.model import DPGGAN 12 | from src.DPGGAN.adadp import ADADP 13 | from src.config import load_config 14 | from src.dataloader import Multi_Graph_Dataset 15 | from src.train import train 16 | 17 | dir_path = os.path.dirname(os.path.realpath(__file__)) 18 | root_path = os.path.abspath(os.path.join(dir_path, os.pardir)) 19 | 20 | # what is relabeled_dblp2? new_dblp2? new_IMDB_MULTI? Resampled_IMDB_MULTI? 
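# A minimal sketch of reading one of the pickles written by save_data() below; the path
# is hypothetical and depends on the model, dataset and (for DP models) the epsilon used
# when the file was written.
import pickle

def load_generated_graphs(path='data/generated/GGAN_new_dblp2.pkl'):
    with open(path, 'rb') as f:
        graph_list = pickle.load(f)  # list of networkx graphs, one per input graph
    for g in graph_list:
        print(g.graph['label'], g.number_of_nodes(), g.number_of_edges())
    return graph_list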
21 | multi_graph_dataset = set(['relabeled_dblp2', 'new_dblp2', 'dblp2','new_IMDB_MULTI', 'IMDB_MULTI', 'Resampled_IMDB_MULTI']) 22 | 23 | def arg_parser(): 24 | # init the common args, expect the model specific args 25 | parser = argparse.ArgumentParser() 26 | parser.add_argument('--seed', type=int, default=43, help='Random seed.') 27 | parser.add_argument('--threads', type=int, default=4, help='Thread number.') 28 | parser.add_argument('--txt_log', type=bool, default=True, help='whether save the txt_log.') 29 | parser.add_argument('--model_name', type=str, default='GVAE', help='[DPGGAN, GGAN, DPGVAE, GVAE]') 30 | parser.add_argument('--dataset_str', type=str, default='new_IMDB_MULTI', 31 | help="[dblp2, IMDB_MULTI, Resampled_IMDB_MULTI, new_IMDB_MULTI, new_dblp2]") 32 | parser.add_argument('--n_eigenvector', type=int, default=128, help='use eigenvector as initial feature.') 33 | 34 | parser.add_argument('--epochs', type=int, default=100, help='Number of epochs to train.') 35 | parser.add_argument('--test_period', type=int, default=10, help='test period.') 36 | parser.add_argument('--batch_size', type=int, default=16) 37 | parser.add_argument('--learning_rate', default=0.005, help='the ratio of training set in whole dataset.') 38 | parser.add_argument('--optimizer', type=str, default='Adam') 39 | parser.add_argument('--stat_generate_num', type=int, default=5, help='generate a batch of graph for graph stat.') 40 | parser.add_argument('--threshold', type=float, default=None) 41 | parser.add_argument('--discriminator_ratio', type=float, default=0.1, help='factor of discriminator loss.') 42 | parser.add_argument('--kl_ratio', type=float, default=1.0, help='factor of kl loss.') 43 | 44 | parser.add_argument('--std', type=float, default=0.75, help='Standard deviation of the Gaussian prior.') 45 | parser.add_argument('--eval_period', type=int, default=10) 46 | parser.add_argument('--draw_graph_epoch', type=int, default=2) 47 | parser.add_argument('--generated_graph_num', type=int, default=5, help='generate multiple graph at one time to compute variance.') 48 | parser.add_argument('--save_generated_graph', type=int, default=1, help='whether save generated graph or not.') 49 | 50 | args = parser.parse_args() 51 | return args 52 | 53 | 54 | 55 | def set_env(args): 56 | random.seed(args.seed) 57 | np.random.seed(args.seed) 58 | torch.manual_seed(args.seed) 59 | torch.set_num_threads(args.threads) 60 | current_time = datetime.now().strftime('%b_%d_%H-%M-%S') 61 | model_args = load_config(args.model_name, args.dataset_str, args.optimizer) 62 | if args.model_name in ['DPGVAE', 'GVAE']: 63 | args.discriminator_ratio = 0 64 | return args, model_args, current_time 65 | 66 | 67 | def load_data(args, dataset_str): 68 | assert dataset_str in multi_graph_dataset 69 | args.dataset_type = 'multi' 70 | dataset = Multi_Graph_Dataset(dataset_str, args.n_eigenvector) 71 | args.num_samples = dataset.datasets[0].features.shape[0] 72 | return dataset 73 | 74 | 75 | def save_data(args, data_list): 76 | if 'DP' in args.model_name: 77 | save_path = root_path + '/data/generated/{}_{}_eps:{}.pkl'.format(args.model_name, args.dataset_str, args.model_args.eps_requirement) 78 | else: 79 | save_path = root_path + '/data/generated/{}_{}.pkl'.format(args.model_name, args.dataset_str) 80 | 81 | with open(save_path, 'wb') as file: 82 | pickle.dump(data_list, file) 83 | 84 | 85 | def create_model(args, model_args, dataset): 86 | if args.batch_size > dataset.features.shape[0]: 87 | args.batch_size = dataset.features.shape[0] 88 | 
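# dataset.features has one row per node, so the assignment above simply caps the batch
# size at the node count of the current graph (each node is one training sample).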
model = None 89 | if args.model_name in ['GGAN', 'GVAE']: 90 | model = GGAN(args, model_args, dataset.features, dataset.adj_list) 91 | elif args.model_name in ['DPGGAN', 'DPGVAE']: 92 | model = DPGGAN(args, model_args, dataset.features, dataset.adj_list) 93 | else: 94 | print(args.model_name) 95 | print("Unknown model name.") 96 | exit(1) 97 | 98 | is_enc1 = True 99 | for v in model.parameters(): 100 | if v is not None and v.requires_grad is True: 101 | if len(v.data.shape) == 2: 102 | continue 103 | if is_enc1: 104 | is_enc1 = False 105 | v.data.copy_(v[0].data.clone().repeat(args.batch_size * (1 + model_args.samp_num), 1, 1)) 106 | else: 107 | v.data.copy_(v[0].data.clone().repeat(args.batch_size, 1, 1)) 108 | return model 109 | 110 | 111 | def create_optimizer(args, model): 112 | optimizer = None 113 | if args.optimizer == 'Adam': 114 | optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=args.learning_rate) 115 | elif args.optimizer == 'SGD': 116 | optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad, model.parameters()), lr=args.learning_rate) 117 | elif args.optimizer == 'ADADP': 118 | optimizer = ADADP(filter(lambda p: p.requires_grad, model.parameters())) 119 | return optimizer 120 | 121 | 122 | def main(args, model_args, dataset, current_time): 123 | model = create_model(args, model_args, dataset) 124 | optimizer = create_optimizer(args, model) 125 | logger = stat_logger(args, current_time) 126 | model, generated_adj, generated_graph_property_cache = train(args, model_args, dataset, model, optimizer, logger) 127 | return generated_adj, generated_graph_property_cache 128 | 129 | 130 | if __name__ == '__main__': 131 | print(root_path) 132 | args = arg_parser() # args is general arguments 133 | args, model_args, current_time = set_env(args) 134 | args.model_args = model_args # model_args is relevant to a specific model 135 | datasets = load_data(args, args.dataset_str) 136 | generated_graph_list = [] 137 | property_dict_all = [] 138 | 139 | for counter, dataset in enumerate(datasets.datasets): 140 | model_args.dec2_dim = dataset.actual_feature_dim # the feat dim is capped for some small graph 141 | print('='*10 + str(counter) + '='*10) 142 | args.batch_size = dataset.adj.shape[0] 143 | generated_adj, generated_graph_property_cache = main(args, model_args, dataset, current_time) 144 | assert generated_adj.shape[0] == dataset.adj.shape[0] 145 | # save stat and generated graph 146 | property_dict_all.extend(generated_graph_property_cache) 147 | G = nx.from_numpy_array(generated_adj) 148 | G.graph['label'] = dataset.label 149 | generated_graph_list.append(G) 150 | 151 | if args.save_generated_graph: 152 | save_data(args, generated_graph_list) 153 | 154 | logger = stat_logger(args, current_time) 155 | stat_log_info = logger.form_generated_stat_log('final', property_dict_all) 156 | logger.write(stat_log_info) -------------------------------------------------------------------------------- /src/GGAN/model.py: -------------------------------------------------------------------------------- 1 | import math 2 | import random 3 | import torch 4 | import torch.nn.modules.loss 5 | import torch.nn.functional as F 6 | import torch.nn as nn 7 | from torch.nn import Parameter 8 | 9 | from src.GGAN.aggregators import MeanAggregator 10 | from src.GGAN.encoders import Encoder 11 | 12 | # from gcn_layer import GraphConvolution 13 | import numpy as np 14 | 15 | from src.GGAN.gcn_layer import GraphConvolution 16 | 17 | 18 | class GGAN(nn.Module): 19 | def 
__init__(self, args, model_args, features_np, adj_lists): 20 | super(GGAN,self).__init__() 21 | self.args = args 22 | self.features_np = features_np 23 | features = nn.Embedding(features_np.shape[0], features_np.shape[1]) 24 | features.weight = nn.Parameter(torch.FloatTensor(features_np), requires_grad=False) 25 | # features.weight = nn.Parameter(torch.randn(features_np.shape[0], features_np.shape[1]), requires_grad=False) 26 | self.features = features 27 | self.node_num = features_np.shape[0] 28 | self.feature_dim = features_np.shape[1] 29 | self.adj_lists = adj_lists 30 | self.layer1_dim = model_args.layer1_dim 31 | self.layer2_dim = model_args.layer2_dim 32 | self.dec1_dim = model_args.dec1_dim 33 | self.dec2_dim = model_args.dec2_dim 34 | self.samp_num = model_args.samp_num 35 | 36 | 37 | # Follow the mode used in graphSAGE, the agg with 'gcn=False' 38 | # layer1 39 | self.agg1 = MeanAggregator(cuda=False,first_layer=True,gcn=False) 40 | self.enc1 = Encoder(self.feature_dim, self.layer1_dim, adj_lists, self.agg1, num_sample=self.samp_num, 41 | gcn=True, cuda=False, first_layer=True) 42 | # layer2 * 2 43 | self.agg2 = MeanAggregator(cuda=False,first_layer=False,gcn=False) 44 | self.enc2 = Encoder(self.layer1_dim, self.layer2_dim, adj_lists, self.agg2, num_sample=self.samp_num, 45 | gcn=True, cuda=False, first_layer=False) 46 | 47 | self.agg3 = MeanAggregator(cuda=False,first_layer=False,gcn=False) 48 | self.enc3 = Encoder(self.layer1_dim, self.layer2_dim, adj_lists, self.agg3, num_sample=self.samp_num, 49 | gcn=True, cuda=False, first_layer=False) 50 | # decoder layer1 & layer2 51 | self.dec1 = nn.Linear(self.layer2_dim, self.dec1_dim, bias=True) 52 | self.dec2 = nn.Linear(self.dec1_dim, self.dec2_dim, bias=True) 53 | 54 | # mapping 55 | self.mapping1 = nn.Linear(self.dec2_dim, self.dec2_dim, bias=False) 56 | self.mapping2 = nn.Linear(self.dec2_dim, self.dec2_dim, bias=False) 57 | 58 | self.is_gan = 0 59 | if args.model_name == 'GGAN': 60 | self.disc_gcn = GraphConvolution(in_features=self.dec2_dim, out_features=self.dec2_dim) 61 | self.disc_linear = nn.Linear(in_features=self.dec2_dim, out_features=1, bias=False) 62 | self.is_gan = 1 63 | 64 | 65 | def discriminate(self, origin_feature, origin_adj, generated_adj): 66 | origin_embed = self.disc_gcn(origin_feature, origin_adj) 67 | orig_prob = self.disc_linear(origin_embed) 68 | pos_label = torch.ones_like(orig_prob) 69 | generated_embed = self.disc_gcn(origin_feature, generated_adj) 70 | generated_prob = self.disc_linear(generated_embed) 71 | neg_label = torch.zeros_like(generated_prob) 72 | pred = torch.cat((orig_prob, generated_prob), dim=0) 73 | label = torch.cat((pos_label, neg_label), dim=0) 74 | return pred, label 75 | 76 | 77 | def encode(self, nodes): 78 | _set = set 79 | _sample = random.sample 80 | node_num = len(self.adj_lists) 81 | 82 | # encode nodes by its neighs 83 | to_neighs = [self.adj_lists[int(node)] for node in nodes] 84 | 85 | # samp_neighs is neighs of nodes, self.enc2.num_sample == self.enc3.num_sample 86 | samp_neighs = [_set(_sample(to_neigh, self.enc2.num_sample, )) 87 | if len(to_neigh) >= self.enc2.num_sample else to_neigh 88 | for to_neigh in to_neighs] 89 | 90 | # unique_nodes_list is all nodes required in layer2 91 | unique_nodes_list = list(set.union(*samp_neighs) | set(nodes)) 92 | 93 | # encode unique_nodes_list in layer1 94 | embeds_layer1 = self.enc1(self.features, unique_nodes_list) 95 | 96 | # Look-up dict for layer1's embedding 97 | feature_dict = {} 98 | for i, v in enumerate(unique_nodes_list): 
99 | feature_dict[v] = i 100 | 101 | features_embeds = embeds_layer1 102 | 103 | # feed Look-up dict and features_embeds into layer2 104 | nodes_idx = [] 105 | for i, v in enumerate(nodes): 106 | nodes_idx.append(feature_dict[v]) 107 | mu = self.enc2(features_embeds, nodes_idx, samp_neighs, feature_dict=feature_dict) 108 | logvar = self.enc3(features_embeds, nodes_idx, samp_neighs, feature_dict=feature_dict) 109 | 110 | return mu, logvar 111 | 112 | 113 | def decode(self, input_features): 114 | output = self.dec1(input_features) 115 | output = F.relu(output) 116 | output = self.dec2(output) 117 | emb1 = self.mapping1(output) 118 | emb2 = self.mapping2(output) 119 | return emb1, emb2 120 | 121 | def inner_product_with_mapping(self, emb1,emb2): # cos similarity 122 | emb1 = F.normalize(emb1, dim=-1, p=2) 123 | emb2 = F.normalize(emb2, dim=-1, p=2) 124 | adj = torch.mm(emb1, emb2.t()) 125 | return adj 126 | 127 | def get_sub_adj_feat(self, nodes): 128 | subgraph_feature = [] 129 | for i,v in enumerate(nodes): 130 | subgraph_feature.append(self.features_np[v]) 131 | subgraph_feature_tensor = torch.FloatTensor(np.array(subgraph_feature)) 132 | return subgraph_feature_tensor 133 | 134 | def forward(self, nodes, sub_adj, for_test=False): 135 | gan_pred, gan_label = None, None 136 | sub_adj_feat = self.get_sub_adj_feat(nodes) 137 | mu, logvar = self.encode(nodes) 138 | mu_q = F.normalize(mu, dim=-1, p=2) 139 | logvar_sub = -logvar 140 | std0 = self.args.std 141 | std_q = torch.exp(0.5 * logvar_sub) * std0 142 | epsilon = torch.randn(std_q.shape) 143 | h = mu_q + int(for_test) * epsilon * std_q 144 | emb1, emb2 = self.decode(h) 145 | reconst_adj = self.inner_product_with_mapping(emb1, emb2) 146 | 147 | if not for_test and self.is_gan: 148 | gan_pred, gan_label = self.discriminate(sub_adj_feat, sub_adj, reconst_adj) 149 | return mu, logvar_sub, reconst_adj, gan_pred, gan_label 150 | 151 | 152 | def loss_function(self, preds, labels, logvar_sub, pos_weight, gan_pred, gan_label): 153 | cost = F.binary_cross_entropy_with_logits(preds, labels, pos_weight=torch.FloatTensor([pos_weight])) 154 | # see Appendix B from VAE paper: 155 | # Kingma and Welling. Auto-Encoding Variational Bayes. ICLR, 2014 156 | # https://arxiv.org/abs/1312.6114 157 | # 0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2) 158 | # Trick: KL is constant w.r.t. to mu_q after we normalize mu_q. 
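# For q = N(mu, sigma^2) and p = N(0, I): KL(q || p) = 0.5 * sum(mu^2 + sigma^2 - log(sigma^2) - 1).
# Since mu_q is L2-normalized in forward(), sum(mu^2) is constant per node, so only the
# variance terms survive below: 0.5 * sum(exp(logvar_sub) - logvar_sub - 1), with
# logvar_sub standing in for log(sigma^2).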
159 | kl = (0.5 * (-logvar_sub + torch.exp(logvar_sub) - 1.0)).sum(dim=1).mean() 160 | if self.is_gan: 161 | gan_loss = F.binary_cross_entropy_with_logits(gan_pred, gan_label) 162 | return cost + self.args.kl_ratio * kl + self.args.discriminator_ratio * gan_loss 163 | else: 164 | return cost + self.args.kl_ratio * kl -------------------------------------------------------------------------------- /graph_classification_exp/main.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | import torch.optim as optim 8 | import numpy as np 9 | 10 | from tqdm import tqdm 11 | 12 | from graph_classification_exp.models.graphcnn import GraphCNN 13 | from util import load_data, separate_data 14 | 15 | criterion = nn.CrossEntropyLoss() 16 | 17 | 18 | dir_path = os.path.dirname(os.path.realpath(__file__)) 19 | parent_path = os.path.abspath(os.path.join(dir_path, os.pardir)) 20 | 21 | def train(args, model, device, train_graphs, optimizer, epoch): 22 | model.train() 23 | 24 | total_iters = args.iters_per_epoch 25 | pbar = tqdm(range(total_iters), unit='batch') 26 | 27 | loss_accum = 0 28 | for pos in pbar: 29 | selected_idx = np.random.permutation(len(train_graphs))[:args.batch_size] 30 | batch_graph = [train_graphs[idx] for idx in selected_idx] 31 | output = model(batch_graph) 32 | labels = torch.LongTensor([graph.label for graph in batch_graph]).to(device) 33 | #compute loss 34 | loss = criterion(output, labels) 35 | 36 | #backprop 37 | if optimizer is not None: 38 | optimizer.zero_grad() 39 | loss.backward() 40 | optimizer.step() 41 | 42 | 43 | loss = loss.detach().cpu().numpy() 44 | loss_accum += loss 45 | 46 | #report 47 | pbar.set_description('epoch: %d' % (epoch)) 48 | 49 | average_loss = loss_accum/total_iters 50 | print("loss training: %f" % (average_loss)) 51 | 52 | return average_loss 53 | 54 | ###pass data to model with minibatch during testing to avoid memory overflow (does not perform backpropagation) 55 | def pass_data_iteratively(model, graphs, minibatch_size = 64): 56 | model.eval() 57 | output = [] 58 | idx = np.arange(len(graphs)) 59 | for i in range(0, len(graphs), minibatch_size): 60 | sampled_idx = idx[i:i+minibatch_size] 61 | if len(sampled_idx) == 0: 62 | continue 63 | output.append(model([graphs[j] for j in sampled_idx]).detach()) 64 | return torch.cat(output, 0) 65 | 66 | 67 | def test(args, model, device, train_graphs, test_graphs, epoch): 68 | model.eval() 69 | 70 | output = pass_data_iteratively(model, train_graphs) 71 | pred = output.max(1, keepdim=True)[1] 72 | labels = torch.LongTensor([graph.label for graph in train_graphs]).to(device) 73 | correct = pred.eq(labels.view_as(pred)).sum().cpu().item() 74 | acc_train = correct / float(len(train_graphs)) 75 | 76 | output = pass_data_iteratively(model, test_graphs) 77 | pred = output.max(1, keepdim=True)[1] 78 | labels = torch.LongTensor([graph.label for graph in test_graphs]).to(device) 79 | correct = pred.eq(labels.view_as(pred)).sum().cpu().item() 80 | acc_test = correct / float(len(test_graphs)) 81 | 82 | print("accuracy train: %f test: %f" % (acc_train, acc_test)) 83 | 84 | return acc_train, acc_test 85 | 86 | def main(): 87 | # Training settings 88 | # Note: Hyper-parameters need to be tuned in order to obtain results reported in the paper. 
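# Note that --epochs defaults to 50 below although its help text still reads
# "(default: 350)", a leftover from the upstream GIN code; the code default applies
# unless overridden on the command line.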
89 | parser = argparse.ArgumentParser(description='PyTorch graph convolutional neural net for whole-graph classification') 90 | parser.add_argument('--dataset', type=str, default="DPGraphGAN_Resampled_IMDB_MULTI", help='name of dataset') 91 | parser.add_argument('--device', type=int, default=0, 92 | help='which gpu to use if any (default: 0)') 93 | parser.add_argument('--batch_size', type=int, default=32, 94 | help='input batch size for training (default: 32)') 95 | parser.add_argument('--iters_per_epoch', type=int, default=50, 96 | help='number of iterations per each epoch (default: 50)') 97 | parser.add_argument('--epochs', type=int, default=50, 98 | help='number of epochs to train (default: 350)') 99 | parser.add_argument('--lr', type=float, default=0.01, 100 | help='learning rate (default: 0.01)') 101 | parser.add_argument('--seed', type=int, default=0, 102 | help='random seed for splitting the dataset into 10 (default: 0)') 103 | parser.add_argument('--fold_idx', type=int, default=0, 104 | help='the index of fold in 10-fold validation. Should be less then 10.') 105 | parser.add_argument('--num_layers', type=int, default=5, 106 | help='number of layers INCLUDING the input one (default: 5)') 107 | parser.add_argument('--num_mlp_layers', type=int, default=2, 108 | help='number of layers for MLP EXCLUDING the input one (default: 2). 1 means linear model.') 109 | parser.add_argument('--hidden_dim', type=int, default=64, 110 | help='number of hidden units (default: 64)') 111 | parser.add_argument('--final_dropout', type=float, default=0.5, 112 | help='final layer dropout (default: 0.5)') 113 | parser.add_argument('--graph_pooling_type', type=str, default="sum", choices=["sum", "average"], 114 | help='Pooling for over nodes in a graph: sum or average') 115 | parser.add_argument('--neighbor_pooling_type', type=str, default="sum", choices=["sum", "average", "max"], 116 | help='Pooling for over neighboring nodes: sum, average or max') 117 | parser.add_argument('--learn_eps', action="store_true", help='Whether to learn the epsilon weighting for the center nodes. Does not affect training accuracy though.') 118 | parser.add_argument('--degree_as_tag', type=int, default=1, 119 | help='let the input node features be the degree of nodes (heuristics for unlabeled graph)') 120 | parser.add_argument('--filename', type = str, default = "log.txt", help='output file') 121 | args = parser.parse_args() 122 | 123 | #set up seeds and gpu device 124 | torch.manual_seed(0) 125 | np.random.seed(0) 126 | device = torch.device("cuda:" + str(args.device)) if torch.cuda.is_available() else torch.device("cpu") 127 | if torch.cuda.is_available(): 128 | torch.cuda.manual_seed_all(0) 129 | 130 | graphs, num_classes = load_data(args.dataset, args.degree_as_tag) 131 | 132 | ##10-fold cross validation. Conduct an experiment on the fold specified by args.fold_idx. 
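# As shown earlier in util.py, separate_data() actually builds its folds with
# StratifiedKFold(n_splits=3) and unpacks the returned (train, test) indices as
# (test_idx, train_idx), so roughly one third of the graphs land in the training split
# and two thirds in the test split, despite the "10-fold" wording above.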
133 | train_graphs, test_graphs = separate_data(graphs, args.seed, args.fold_idx) 134 | 135 | model = GraphCNN(args.num_layers, args.num_mlp_layers, train_graphs[0].node_features.shape[1], args.hidden_dim, num_classes, args.final_dropout, args.learn_eps, args.graph_pooling_type, args.neighbor_pooling_type, device).to(device) 136 | 137 | optimizer = optim.Adam(model.parameters(), lr=args.lr) 138 | scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=50, gamma=0.5) 139 | 140 | best_acc_test = 0 141 | file_path = dir_path + '/logs/' + args.dataset + '_' + args.filename 142 | if os.path.exists(file_path): 143 | os.remove(file_path) 144 | for epoch in range(1, args.epochs + 1): 145 | scheduler.step() 146 | avg_loss = train(args, model, device, train_graphs, optimizer, epoch) 147 | acc_train, acc_test = test(args, model, device, train_graphs, test_graphs, epoch) 148 | print("%f %f %f" % (avg_loss, acc_train, acc_test)) 149 | 150 | if acc_test > best_acc_test: 151 | with open(file_path, 'a') as f: 152 | f.write("%f %f %f" % (avg_loss, acc_train, acc_test)) 153 | f.write("\n") 154 | best_acc_test = acc_test 155 | print("") 156 | 157 | print(model.eps) 158 | 159 | 160 | if __name__ == '__main__': 161 | print(dir_path) 162 | main() 163 | -------------------------------------------------------------------------------- /src/DPGGAN/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.modules.loss 3 | import torch.nn.functional as F 4 | import numpy as np 5 | import torch.nn as nn 6 | 7 | from src.DPGGAN.DPCounter import DPCounter 8 | from src.DPGGAN.dp_aggregators import MeanAggregator as DPMeanAggregator 9 | from src.DPGGAN.dp_encoders import Encoder as DPEncoder 10 | from src.DPGGAN.gcn_layer import GraphConvolution 11 | # from src.GGAN.aggregators import MeanAggregator 12 | # from src.GGAN.encoders import Encoder 13 | import src.DPGGAN.linear as linear 14 | from src.DPGGAN.utils_dp import create_cum_grads 15 | 16 | 17 | class DPGGAN(nn.Module): 18 | def __init__(self, args, model_args, features_np, adj_lists): 19 | super(DPGGAN, self).__init__() 20 | self.args = args 21 | self.features_np = features_np 22 | features = nn.Embedding(features_np.shape[0], features_np.shape[1]) 23 | features.weight = nn.Parameter(torch.FloatTensor(features_np), requires_grad=False) 24 | # features.weight = nn.Parameter(torch.randn(features_np.shape[0], features_np.shape[1]), requires_grad=False) 25 | self.features = features 26 | self.disc_factor = args.discriminator_ratio 27 | self.node_num = features_np.shape[0] 28 | self.feature_dim = features_np.shape[1] 29 | self.adj_lists = adj_lists 30 | self.layer1_dim = model_args.layer1_dim 31 | self.layer2_dim = model_args.layer2_dim 32 | self.dec1_dim = model_args.dec1_dim 33 | self.dec2_dim = model_args.dec2_dim 34 | self.samp_num = model_args.samp_num 35 | self.batch_size = args.batch_size 36 | self.batch_proc_size = model_args.batch_proc_size 37 | self.check_proc_size() 38 | self.dp_counter = DPCounter(args, model_args) 39 | 40 | 41 | # Follow the mode used in graphSAGE, the agg with 'gcn=False' 42 | # layer1 43 | self.agg1 = DPMeanAggregator(cuda=False,first_layer=True,gcn=False) 44 | self.enc1 = DPEncoder(self.feature_dim, self.layer1_dim, 45 | self.adj_lists, self.agg1, num_sample=self.samp_num, 46 | gcn=True, cuda=False, first_layer=True, batch_size=self.batch_size*(self.samp_num+1)) 47 | 48 | # layer2 49 | self.agg2 = DPMeanAggregator(cuda=False,first_layer=False,gcn=False) 50 | 
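# enc1 above is sized for batch_size * (samp_num + 1) rows because encode() pushes every
# batch node plus its samp_num sampled neighbours through layer 1, whereas enc2/enc3 only
# embed the batch nodes themselves. The explicit batch_size arguments here and on the
# linear.Linear decoder layers keep per-example parameter copies (cf. the repeat()
# initialisation in create_model), which appears to be what enables per-example gradient
# handling in the DP training path.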
self.enc2 = DPEncoder(self.layer1_dim, self.layer2_dim, 51 | self.adj_lists, self.agg2, num_sample=self.samp_num, 52 | gcn=True, cuda=False, first_layer=False, batch_size=self.batch_size) 53 | 54 | self.agg3 = DPMeanAggregator(cuda=False,first_layer=False,gcn=False) 55 | self.enc3 = DPEncoder(self.layer1_dim, self.layer2_dim, 56 | self.adj_lists, self.agg3, num_sample=self.samp_num, 57 | gcn=True, cuda=False, first_layer=False, batch_size=self.batch_size) 58 | 59 | # decoder layer1 60 | self.dec1 = linear.Linear(self.layer2_dim, self.dec1_dim, bias=False, batch_size=self.batch_size) 61 | # decoder layer2 62 | self.dec2 = linear.Linear(self.dec1_dim, self.dec2_dim, bias=False, batch_size=self.batch_size) 63 | 64 | self.mapping1 = linear.Linear(self.dec2_dim, self.dec2_dim, bias=False, batch_size=self.batch_size) 65 | self.mapping2 = linear.Linear(self.dec2_dim, self.dec2_dim, bias=False, batch_size=self.batch_size) 66 | 67 | self.cum_grads = create_cum_grads(self) 68 | 69 | # gan part 70 | self.is_gan = 0 71 | if args.model_name == 'DPGGAN': 72 | self.disc_gcn = GraphConvolution(in_features=self.dec2_dim, out_features=self.dec2_dim) 73 | self.disc_linear = nn.Linear(in_features=self.dec2_dim, out_features=1, bias=False) 74 | self.is_gan = 1 75 | 76 | def check_proc_size(self): 77 | self.batch_size = self.node_num if self.batch_size > self.node_num else self.batch_size 78 | self.batch_proc_size = self.batch_size 79 | 80 | 81 | 82 | def discriminate(self, origin_feature, origin_adj, generated_adj): 83 | origin_embed = self.disc_gcn(origin_feature, origin_adj) 84 | orig_prob = self.disc_linear(origin_embed) 85 | pos_label = torch.ones_like(orig_prob) 86 | generated_embed = self.disc_gcn(origin_feature, generated_adj) 87 | generated_prob = self.disc_linear(generated_embed) 88 | neg_label = torch.zeros_like(generated_prob) 89 | pred = torch.cat((orig_prob, generated_prob), dim=0) 90 | label = torch.cat((pos_label, neg_label), dim=0) 91 | return pred, label 92 | 93 | 94 | def encode(self, nodes,for_test): 95 | _set = set 96 | _sample = np.random.choice 97 | node_num = len(self.adj_lists) 98 | # encode nodes by its neighs 99 | to_neighs = [self.adj_lists[int(node)] for node in nodes] 100 | 101 | samp_neighs = [(_sample(list(to_neigh), self.enc2.num_sample)) for to_neigh in to_neighs] 102 | neighs = np.array(samp_neighs).flatten() 103 | 104 | # unique_nodes_list is all nodes required in layer2 105 | not_unique_nodes_list = list(np.concatenate((neighs, nodes))) 106 | 107 | # encode unique_nodes_list in layer1 108 | embeds_layer1 = self.enc1(self.features, not_unique_nodes_list, for_test=for_test) 109 | 110 | feature_dict = {} 111 | for i,v in enumerate(not_unique_nodes_list): 112 | feature_dict[v] = i 113 | 114 | features_embeds = F.relu(embeds_layer1) 115 | 116 | # feed Look-up dict and features_embeds into layer2 117 | nodes_idx = [] 118 | for i, v in enumerate(nodes): 119 | nodes_idx.append(feature_dict[v]) 120 | 121 | mu = self.enc2(features_embeds, nodes_idx, samp_neighs, feature_dict=feature_dict, for_test=for_test) 122 | logvar = self.enc3(features_embeds, nodes_idx, samp_neighs, feature_dict=feature_dict, for_test=for_test) 123 | 124 | return mu, logvar 125 | 126 | 127 | def decode(self, input, for_test): 128 | if not for_test: 129 | input = input.view(input.shape[0],1,input.shape[1]) 130 | 131 | # decoder1 132 | output = self.dec1(input, for_test) 133 | output = F.normalize(output, dim=-1, p=2) 134 | output = F.relu(output) 135 | 136 | # decoder2 137 | output = self.dec2(output, for_test) 
138 | output = F.normalize(output, dim=-1, p=2) 139 | output = F.relu(output) 140 | 141 | # linear transform 142 | emb1 = self.mapping1(output, for_test) 143 | emb2 = self.mapping2(output, for_test) 144 | 145 | return emb1, emb2 146 | 147 | 148 | def inner_product_with_mapping(self, emb1, emb2): 149 | emb1 = F.normalize(emb1, dim=-1, p=2) 150 | emb2 = F.normalize(emb2, dim=-1, p=2) 151 | adj = torch.mm(emb1, emb2.t()) 152 | return adj 153 | 154 | 155 | def get_sub_adj_feat(self, nodes): 156 | subgraph_feature = [] 157 | for i,v in enumerate(nodes): 158 | subgraph_feature.append(self.features_np[v]) 159 | subgraph_feature_tensor = torch.FloatTensor(np.array(subgraph_feature)) 160 | return subgraph_feature_tensor 161 | 162 | 163 | def forward(self, nodes, sub_adj, for_test=False): 164 | gan_pred, gan_label = None, None 165 | sub_adj_feat = self.get_sub_adj_feat(nodes) 166 | mu, logvar = self.encode(nodes, for_test) 167 | 168 | mu_q = F.normalize(mu, dim=-1, p=2) 169 | logvar_sub = -logvar 170 | std0 = self.args.std 171 | std_q = torch.exp(0.5 * logvar_sub) * std0 172 | epsilon = torch.randn(std_q.shape) 173 | h = mu_q + int(for_test) * epsilon * std_q 174 | emb1, emb2 = self.decode(h, for_test) 175 | 176 | reconst_adj = self.inner_product_with_mapping(emb1, emb2) 177 | # the prob of one graph is orig 178 | if not for_test and self.is_gan: 179 | gan_pred, gan_label = self.discriminate(sub_adj_feat, sub_adj, reconst_adj) 180 | return mu, logvar_sub, reconst_adj, gan_pred, gan_label 181 | 182 | 183 | 184 | def loss_function(self, preds, labels, logvar_sub, pos_weight, gan_pred, gan_label): 185 | cost = F.binary_cross_entropy_with_logits(preds, labels, pos_weight=torch.FloatTensor([pos_weight])) 186 | # see Appendix B from VAE paper: 187 | # Kingma and Welling. Auto-Encoding Variational Bayes. ICLR, 2014 188 | # https://arxiv.org/abs/1312.6114 189 | # 0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2) 190 | # Trick: KL is constant w.r.t. to mu_q after we normalize mu_q. 191 | kl = (0.5 * (-logvar_sub + torch.exp(logvar_sub) - 1.0)).sum(dim=1).mean() 192 | if self.is_gan: 193 | gan_loss = F.binary_cross_entropy_with_logits(gan_pred, gan_label) 194 | return cost + self.args.kl_ratio * kl + self.args.discriminator_ratio * gan_loss 195 | else: 196 | return cost + self.args.kl_ratio * kl -------------------------------------------------------------------------------- /link_classification_exp/main.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import random 4 | from random import shuffle 5 | import warnings 6 | warnings.filterwarnings("ignore", category=DeprecationWarning) 7 | 8 | import torch 9 | from torch import nn, optim 10 | import torch.nn.functional as F 11 | import torch.utils.data as tdata 12 | from tqdm import tqdm 13 | 14 | from link_classification_exp.node2vec.src import node2vec 15 | from link_classification_exp.node2vec.src.main import learn_embeddings 16 | 17 | dir_path = os.path.dirname(os.path.realpath(__file__)) 18 | root_path = os.path.abspath(os.path.join(dir_path, os.pardir)) 19 | 20 | new_dblp2_idx = [0,22] 21 | new_IMDB_MULTI_idx = [0,99] 22 | import networkx as nx 23 | import numpy as np 24 | from sklearn.metrics import roc_auc_score 25 | from datetime import datetime 26 | 27 | def parse_args(): 28 | ''' 29 | Parses the node2vec arguments. 
30 | ''' 31 | parser = argparse.ArgumentParser(description="Run node2vec.") 32 | 33 | parser.add_argument('--input', nargs='?', default='graph/karate.edgelist', 34 | help='Input graph path') 35 | 36 | parser.add_argument('--output', nargs='?', default='emb/karate.emb', 37 | help='Embeddings path') 38 | 39 | parser.add_argument('--dimensions', type=int, default=128, 40 | help='Number of dimensions. Default is 128.') 41 | 42 | parser.add_argument('--walk-length', type=int, default=80, 43 | help='Length of walk per source. Default is 80.') 44 | 45 | parser.add_argument('--num-walks', type=int, default=10, 46 | help='Number of walks per source. Default is 10.') 47 | 48 | parser.add_argument('--window-size', type=int, default=10, 49 | help='Context size for optimization. Default is 10.') 50 | 51 | parser.add_argument('--iter', default=1, type=int, 52 | help='Number of epochs in SGD') 53 | 54 | parser.add_argument('--workers', type=int, default=8, 55 | help='Number of parallel workers. Default is 8.') 56 | 57 | parser.add_argument('--p', type=float, default=1, 58 | help='Return hyperparameter. Default is 1.') 59 | 60 | parser.add_argument('--q', type=float, default=1, 61 | help='Inout hyperparameter. Default is 1.') 62 | 63 | parser.add_argument('--weighted', dest='weighted', action='store_true', 64 | help='Boolean specifying (un)weighted. Default is unweighted.') 65 | parser.add_argument('--unweighted', dest='unweighted', action='store_false') 66 | parser.set_defaults(weighted=False) 67 | 68 | parser.add_argument('--directed', dest='directed', action='store_true', 69 | help='Graph is (un)directed. Default is undirected.') 70 | parser.add_argument('--undirected', dest='undirected', action='store_false') 71 | 72 | parser.add_argument('--lr', type=float, default=0.01) 73 | parser.add_argument('--batch_size', type=int, default=128) 74 | parser.add_argument('--epochs', type=int, default=100) 75 | parser.add_argument('--graph_name', type=str, default='new_dblp2', help="[new_dblp2, new_IMDB_MULTI]") 76 | parser.add_argument('--graph_type', type=str, default='{}', help="['{}', 'DPGGAN_{}_eps:0.1', 'DPGGAN_{}_eps:1.0', 'DPGGAN_{}_eps:10.0']") 77 | 78 | parser.set_defaults(directed=False) 79 | 80 | return parser.parse_args() 81 | 82 | 83 | def read_train_edgelist(data_folder_path, idx): 84 | train_file = '{}/train_edge_list_{}.txt'.format(data_folder_path, idx) 85 | edge_list = [] 86 | node_num = None 87 | with open(train_file, 'r') as io: 88 | for idx, line in enumerate(io): 89 | if idx == 0: 90 | node_num = int(line.strip()) 91 | else: 92 | line = line.strip() 93 | edge_list.append([int(line.split(' ')[0]), int(line.split(' ')[1])]) 94 | G = nx.Graph() 95 | G.add_nodes_from(list(range(node_num))) 96 | G.add_edges_from(edge_list) 97 | for edge in G.edges(): 98 | G[edge[0]][edge[1]]['weight'] = 1 99 | G = G.to_undirected() 100 | return G 101 | 102 | 103 | 104 | def read_test_edgelist(graph_name, idx): 105 | test_file = '{}/dataset/{}/test_edge_list_{}.txt'.format(dir_path, graph_name, idx) 106 | edge_list = [] 107 | with open(test_file, 'r') as io: 108 | for line in io: 109 | edge_list.append([int(line.split(' ')[0]), int(line.split(' ')[1]), int(line.split(' ')[2])]) 110 | return edge_list 111 | 112 | 113 | 114 | 115 | def train_node2vec(nx_G, args, test_edge_array): 116 | 117 | G = node2vec.Graph(nx_G, args.directed, args.p, args.q) 118 | G.preprocess_transition_probs() 119 | walks = G.simulate_walks(args.num_walks, args.walk_length) 120 | w2v = learn_embeddings(args, walks) 121 | 122 | # get edge & 
non-exist edge list 123 | edge_list = [] 124 | for line in nx.generate_edgelist(nx_G, data=False): 125 | edge_list.append([line.split(' ')[0], line.split(' ')[1], 1]) 126 | pos_num = len(edge_list) 127 | neg_edge_list = [] 128 | for line_idx, line in enumerate(nx.non_edges(nx_G)): 129 | neg_edge_list.append([str(line[0]), str(line[1]), 0]) 130 | # balance neg and pos 131 | sampled_idx = random.sample(range(len(neg_edge_list)), min(int(pos_num*1.5), len(neg_edge_list))) 132 | sampled_neg_edge_list = list((np.array(neg_edge_list)[sampled_idx])) 133 | edge_list.extend(sampled_neg_edge_list) 134 | edge_array = np.array(edge_list) 135 | edge_array_idx = np.array(list(range(edge_array.shape[0]))) 136 | pairs_dataloader = tdata.DataLoader(torch.from_numpy(edge_array_idx), batch_size=args.batch_size, 137 | shuffle=True) 138 | 139 | input = np.concatenate((w2v[edge_array[:,0]], w2v[edge_array[:,1]]), axis=1) 140 | input_tensor = torch.FloatTensor(input).to(args.cuda) 141 | label = torch.FloatTensor(edge_array[:, -1].astype(int)).to(args.cuda) 142 | 143 | # make sure test node has embedding 144 | valid_idx = [] 145 | valid_set = set(w2v.index2word) 146 | for i, test_edge in enumerate(test_edge_array): 147 | if str(test_edge_array[i, 0]) in valid_set and str(test_edge_array[i, 1]) in valid_set: 148 | valid_idx.append(i) 149 | if not len(valid_idx) > 0: 150 | print('no valid test edge!') 151 | return -1 152 | test_edge_array = test_edge_array[valid_idx] 153 | test_input = np.concatenate((w2v[test_edge_array[:, 0].astype(str)], w2v[test_edge_array[:, 1].astype(str)]), axis=1) 154 | test_input_tensor = torch.FloatTensor(test_input).to(args.cuda) 155 | test_label = torch.FloatTensor(test_edge_array[:, -1].astype(int)).to(args.cuda) 156 | 157 | #### train mlp 158 | mlp = MLP(input_size=128*2, hidden_size=128).to(args.cuda) 159 | optimizer = optim.Adam(mlp.parameters(), lr=args.lr) 160 | 161 | max_auc = 0 162 | for epoch in range(args.epochs): 163 | print("\r", '{}/{}'.format(epoch, args.epochs), end="", flush=True) 164 | mlp.train() 165 | for batch_pairs_idx in pairs_dataloader: 166 | batch_input = input_tensor[batch_pairs_idx] 167 | batch_label = label[batch_pairs_idx] 168 | optimizer.zero_grad() 169 | pred = mlp(batch_input) 170 | loss = mlp.get_loss(pred, batch_label) 171 | loss.backward() 172 | optimizer.step() 173 | 174 | auc_result = link_pred_test(mlp,test_input_tensor, test_label) 175 | if auc_result > max_auc: 176 | max_auc = auc_result 177 | 178 | return max_auc 179 | 180 | 181 | def link_pred_test(mlp,test_input_tensor, test_label): 182 | with torch.no_grad(): 183 | pred = torch.sigmoid(mlp(test_input_tensor)) 184 | auc_score = mlp.get_auc_score(pred, test_label) 185 | return auc_score 186 | 187 | 188 | class MLP(nn.Module): 189 | def __init__(self, input_size, hidden_size): 190 | super(MLP, self).__init__() 191 | self.fc1 = nn.Linear(input_size, hidden_size) 192 | self.relu1 = nn.ReLU() 193 | self.fc2 = nn.Linear(hidden_size, hidden_size) 194 | self.relu2 = nn.ReLU() 195 | self.fc3 = nn.Linear(hidden_size, 1) 196 | 197 | def forward(self, x): 198 | out = self.relu1(self.fc1(x)) 199 | out = self.relu2(self.fc2(out)) 200 | out = self.fc3(out) 201 | return out 202 | 203 | def get_loss(self, pred, target): 204 | loss = F.binary_cross_entropy_with_logits(pred, target.reshape(-1,1)) 205 | return loss 206 | 207 | def get_auc_score(self, pred, target): 208 | if args.cuda: 209 | return roc_auc_score(target.to('cpu').numpy().astype(int), pred.detach().to('cpu').numpy().reshape(-1)) 210 | else: 211 | 
return roc_auc_score(target.numpy().astype(int), pred.detach().numpy().reshape(-1)) 212 | 213 | 214 | if __name__ == "__main__": 215 | now = datetime.now() 216 | dt_string = now.strftime("%d/%m/%Y %H:%M:%S") 217 | print("date and time =", dt_string) 218 | 219 | args = parse_args() 220 | args.cuda = 'cuda' if torch.cuda.is_available() else 'cpu' 221 | graph_name = args.graph_name 222 | graph_type = args.graph_type 223 | folder_name = graph_type.format(graph_name) 224 | data_folder_path = '{}/dataset/{}'.format(dir_path, folder_name) 225 | print(data_folder_path) 226 | 227 | idx_range = new_dblp2_idx if graph_name == 'new_dblp2' else new_IMDB_MULTI_idx 228 | max_auc_list = [] 229 | for idx in tqdm(range(idx_range[0], idx_range[1])): 230 | train_graph = read_train_edgelist(data_folder_path, idx) 231 | test_edgelist = read_test_edgelist(graph_name, idx) 232 | max_auc = train_node2vec(train_graph, args, np.array(test_edgelist)) 233 | if max_auc == -1: 234 | continue 235 | max_auc_list.append(max_auc) 236 | avg_auc = sum(max_auc_list) / len(max_auc_list) 237 | 238 | # write result to file 239 | log_path = '{}/log/{}.txt'.format(dir_path, folder_name) 240 | with open(log_path, 'a') as f: 241 | f.write('{}\t{}\n'.format(dt_string, avg_auc)) 242 | print('{}\t{}\n'.format(dt_string, avg_auc)) -------------------------------------------------------------------------------- /graph_classification_exp/models/graphcnn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | import sys 6 | 7 | from graph_classification_exp.models.mlp import MLP 8 | 9 | sys.path.append("models/") 10 | 11 | class GraphCNN(nn.Module): 12 | def __init__(self, num_layers, num_mlp_layers, input_dim, hidden_dim, output_dim, final_dropout, learn_eps, graph_pooling_type, neighbor_pooling_type, device): 13 | ''' 14 | num_layers: number of layers in the neural networks (INCLUDING the input layer) 15 | num_mlp_layers: number of layers in mlps (EXCLUDING the input layer) 16 | input_dim: dimensionality of input features 17 | hidden_dim: dimensionality of hidden units at ALL layers 18 | output_dim: number of classes for prediction 19 | final_dropout: dropout ratio on the final linear layer 20 | learn_eps: If True, learn epsilon to distinguish center nodes from neighboring nodes. If False, aggregate neighbors and center nodes altogether. 
21 | neighbor_pooling_type: how to aggregate neighbors (mean, average, or max) 22 | graph_pooling_type: how to aggregate entire nodes in a graph (mean, average) 23 | device: which device to use 24 | ''' 25 | 26 | super(GraphCNN, self).__init__() 27 | 28 | self.final_dropout = final_dropout 29 | self.device = device 30 | self.num_layers = num_layers 31 | self.graph_pooling_type = graph_pooling_type 32 | self.neighbor_pooling_type = neighbor_pooling_type 33 | self.learn_eps = learn_eps 34 | self.eps = nn.Parameter(torch.zeros(self.num_layers-1)) 35 | 36 | ###List of MLPs 37 | self.mlps = torch.nn.ModuleList() 38 | 39 | ###List of batchnorms applied to the output of MLP (input of the final prediction linear layer) 40 | self.batch_norms = torch.nn.ModuleList() 41 | 42 | for layer in range(self.num_layers-1): 43 | if layer == 0: 44 | self.mlps.append(MLP(num_mlp_layers, input_dim, hidden_dim, hidden_dim)) 45 | else: 46 | self.mlps.append(MLP(num_mlp_layers, hidden_dim, hidden_dim, hidden_dim)) 47 | 48 | self.batch_norms.append(nn.BatchNorm1d(hidden_dim)) 49 | 50 | #Linear function that maps the hidden representation at dofferemt layers into a prediction score 51 | self.linears_prediction = torch.nn.ModuleList() 52 | for layer in range(num_layers): 53 | if layer == 0: 54 | self.linears_prediction.append(nn.Linear(input_dim, output_dim)) 55 | else: 56 | self.linears_prediction.append(nn.Linear(hidden_dim, output_dim)) 57 | 58 | 59 | def __preprocess_neighbors_maxpool(self, batch_graph): 60 | ###create padded_neighbor_list in concatenated graph 61 | 62 | #compute the maximum number of neighbors within the graphs in the current minibatch 63 | max_deg = max([graph.max_neighbor for graph in batch_graph]) 64 | 65 | padded_neighbor_list = [] 66 | start_idx = [0] 67 | 68 | 69 | for i, graph in enumerate(batch_graph): 70 | start_idx.append(start_idx[i] + len(graph.g)) 71 | padded_neighbors = [] 72 | for j in range(len(graph.neighbors)): 73 | #add off-set values to the neighbor indices 74 | pad = [n + start_idx[i] for n in graph.neighbors[j]] 75 | #padding, dummy data is assumed to be stored in -1 76 | pad.extend([-1]*(max_deg - len(pad))) 77 | 78 | #Add center nodes in the maxpooling if learn_eps is False, i.e., aggregate center nodes and neighbor nodes altogether. 79 | if not self.learn_eps: 80 | pad.append(j + start_idx[i]) 81 | 82 | padded_neighbors.append(pad) 83 | padded_neighbor_list.extend(padded_neighbors) 84 | 85 | return torch.LongTensor(padded_neighbor_list) 86 | 87 | 88 | def __preprocess_neighbors_sumavepool(self, batch_graph): 89 | ###create block diagonal sparse matrix 90 | 91 | edge_mat_list = [] 92 | start_idx = [0] 93 | for i, graph in enumerate(batch_graph): 94 | start_idx.append(start_idx[i] + len(graph.g)) 95 | edge_mat_list.append(graph.edge_mat + start_idx[i]) 96 | Adj_block_idx = torch.cat(edge_mat_list, 1) 97 | Adj_block_elem = torch.ones(Adj_block_idx.shape[1]) 98 | 99 | #Add self-loops in the adjacency matrix if learn_eps is False, i.e., aggregate center nodes and neighbor nodes altogether. 
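# With self-loops included, torch.spmm(Adj_block, h) yields, for every node, the sum of its
# neighbours' features plus its own, so next_layer() needs no separate center-node term.
# When learn_eps is True the self-loops are omitted and next_layer_eps() instead adds the
# center node explicitly as (1 + eps) * h before the MLP.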
100 | 101 | if not self.learn_eps: 102 | num_node = start_idx[-1] 103 | self_loop_edge = torch.LongTensor([range(num_node), range(num_node)]) 104 | elem = torch.ones(num_node) 105 | Adj_block_idx = torch.cat([Adj_block_idx, self_loop_edge], 1) 106 | Adj_block_elem = torch.cat([Adj_block_elem, elem], 0) 107 | 108 | Adj_block = torch.sparse.FloatTensor(Adj_block_idx, Adj_block_elem, torch.Size([start_idx[-1],start_idx[-1]])) 109 | 110 | return Adj_block.to(self.device) 111 | 112 | 113 | def __preprocess_graphpool(self, batch_graph): 114 | ###create sum or average pooling sparse matrix over entire nodes in each graph (num graphs x num nodes) 115 | 116 | start_idx = [0] 117 | 118 | #compute the padded neighbor list 119 | for i, graph in enumerate(batch_graph): 120 | start_idx.append(start_idx[i] + len(graph.g)) 121 | 122 | idx = [] 123 | elem = [] 124 | for i, graph in enumerate(batch_graph): 125 | ###average pooling 126 | if self.graph_pooling_type == "average": 127 | elem.extend([1./len(graph.g)]*len(graph.g)) 128 | 129 | else: 130 | ###sum pooling 131 | elem.extend([1]*len(graph.g)) 132 | 133 | idx.extend([[i, j] for j in range(start_idx[i], start_idx[i+1], 1)]) 134 | elem = torch.FloatTensor(elem) 135 | idx = torch.LongTensor(idx).transpose(0,1) 136 | graph_pool = torch.sparse.FloatTensor(idx, elem, torch.Size([len(batch_graph), start_idx[-1]])) 137 | 138 | return graph_pool.to(self.device) 139 | 140 | def maxpool(self, h, padded_neighbor_list): 141 | ###Element-wise minimum will never affect max-pooling 142 | 143 | dummy = torch.min(h, dim = 0)[0] 144 | h_with_dummy = torch.cat([h, dummy.reshape((1, -1)).to(self.device)]) 145 | pooled_rep = torch.max(h_with_dummy[padded_neighbor_list], dim = 1)[0] 146 | return pooled_rep 147 | 148 | 149 | def next_layer_eps(self, h, layer, padded_neighbor_list = None, Adj_block = None): 150 | ###pooling neighboring nodes and center nodes separately by epsilon reweighting. 
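# Descriptive note: this layer computes the GIN update
#   h_v^(k) = ReLU( BatchNorm( MLP_k( (1 + eps_k) * h_v^(k-1) + AGG_{u in N(v)} h_u^(k-1) ) ) )
# where AGG is a plain sum (via the sparse block adjacency), a degree-normalized
# average, or an element-wise max over the padded neighbor lists, depending on
# neighbor_pooling_type.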
151 | 152 | if self.neighbor_pooling_type == "max": 153 | ##If max pooling 154 | pooled = self.maxpool(h, padded_neighbor_list) 155 | else: 156 | #If sum or average pooling 157 | pooled = torch.spmm(Adj_block, h) 158 | if self.neighbor_pooling_type == "average": 159 | #If average pooling 160 | degree = torch.spmm(Adj_block, torch.ones((Adj_block.shape[0], 1)).to(self.device)) 161 | pooled = pooled/degree 162 | 163 | #Reweights the center node representation when aggregating it with its neighbors 164 | pooled = pooled + (1 + self.eps[layer])*h 165 | pooled_rep = self.mlps[layer](pooled) 166 | h = self.batch_norms[layer](pooled_rep) 167 | 168 | #non-linearity 169 | h = F.relu(h) 170 | return h 171 | 172 | 173 | def next_layer(self, h, layer, padded_neighbor_list = None, Adj_block = None): 174 | ###pooling neighboring nodes and center nodes altogether 175 | 176 | if self.neighbor_pooling_type == "max": 177 | ##If max pooling 178 | pooled = self.maxpool(h, padded_neighbor_list) 179 | else: 180 | #If sum or average pooling 181 | pooled = torch.spmm(Adj_block, h) 182 | if self.neighbor_pooling_type == "average": 183 | #If average pooling 184 | degree = torch.spmm(Adj_block, torch.ones((Adj_block.shape[0], 1)).to(self.device)) 185 | pooled = pooled/degree 186 | 187 | #representation of neighboring and center nodes 188 | pooled_rep = self.mlps[layer](pooled) 189 | 190 | h = self.batch_norms[layer](pooled_rep) 191 | 192 | #non-linearity 193 | h = F.relu(h) 194 | return h 195 | 196 | 197 | def forward(self, batch_graph): 198 | X_concat = torch.cat([graph.node_features for graph in batch_graph], 0).to(self.device) 199 | graph_pool = self.__preprocess_graphpool(batch_graph) 200 | 201 | if self.neighbor_pooling_type == "max": 202 | padded_neighbor_list = self.__preprocess_neighbors_maxpool(batch_graph) 203 | else: 204 | Adj_block = self.__preprocess_neighbors_sumavepool(batch_graph) 205 | 206 | #list of hidden representation at each layer (including input) 207 | hidden_rep = [X_concat] 208 | h = X_concat 209 | 210 | for layer in range(self.num_layers-1): 211 | if self.neighbor_pooling_type == "max" and self.learn_eps: 212 | h = self.next_layer_eps(h, layer, padded_neighbor_list = padded_neighbor_list) 213 | elif not self.neighbor_pooling_type == "max" and self.learn_eps: 214 | h = self.next_layer_eps(h, layer, Adj_block = Adj_block) 215 | elif self.neighbor_pooling_type == "max" and not self.learn_eps: 216 | h = self.next_layer(h, layer, padded_neighbor_list = padded_neighbor_list) 217 | elif not self.neighbor_pooling_type == "max" and not self.learn_eps: 218 | h = self.next_layer(h, layer, Adj_block = Adj_block) 219 | 220 | hidden_rep.append(h) 221 | 222 | score_over_layer = 0 223 | 224 | #perform pooling over all nodes in each graph in every layer 225 | for layer, h in enumerate(hidden_rep): 226 | pooled_h = torch.spmm(graph_pool, h) 227 | score_over_layer += F.dropout(self.linears_prediction[layer](pooled_h), self.final_dropout, training = self.training) 228 | 229 | return score_over_layer 230 | -------------------------------------------------------------------------------- /src/DPGGAN/gaussian_moments.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """A standalone utility for computing the log moments. 17 | 18 | The utility for computing the log moments. It consists of two methods. 19 | compute_log_moment(q, sigma, T, lmbd) computes the log moment with sampling 20 | probability q, noise sigma, order lmbd, and T steps. get_privacy_spent computes 21 | delta (or eps) given log moments and eps (or delta). 22 | 23 | Example use: 24 | 25 | Suppose that we have run an algorithm with parameters, an array of 26 | (q1, sigma1, T1) ... (qk, sigmak, Tk), and we wish to compute eps for a given 27 | delta. The example code would be: 28 | 29 | max_lmbd = 32 30 | lmbds = xrange(1, max_lmbd + 1) 31 | log_moments = [] 32 | for lmbd in lmbds: 33 | log_moment = 0 34 | for q, sigma, T in parameters: 35 | log_moment += compute_log_moment(q, sigma, T, lmbd) 36 | log_moments.append((lmbd, log_moment)) 37 | eps, delta = get_privacy_spent(log_moments, target_delta=delta) 38 | 39 | To verify that the I1 >= I2 (see comments in GaussianMomentsAccountant in 40 | accountant.py for the context), run the same loop above with verify=True 41 | passed to compute_log_moment. 42 | """ 43 | import math 44 | import sys 45 | 46 | import numpy as np 47 | import scipy.integrate as integrate 48 | import scipy.stats 49 | #from sympy.mpmath import mp 50 | import mpmath as mp 51 | 52 | def _to_np_float64(v): 53 | if math.isnan(v) or math.isinf(v): 54 | return np.inf 55 | return np.float64(v) 56 | 57 | 58 | ###################### 59 | # FLOAT64 ARITHMETIC # 60 | ###################### 61 | 62 | 63 | def pdf_gauss(x, sigma, mean=0): 64 | return scipy.stats.norm.pdf(x, loc=mean, scale=sigma) 65 | 66 | 67 | def cropped_ratio(a, b): 68 | if a < 1E-50 and b < 1E-50: 69 | return 1. 
70 | else: 71 | return a / b 72 | 73 | 74 | def integral_inf(fn): 75 | integral, _ = integrate.quad(fn, -np.inf, np.inf) 76 | return integral 77 | 78 | 79 | def integral_bounded(fn, lb, ub): 80 | integral, _ = integrate.quad(fn, lb, ub) 81 | return integral 82 | 83 | 84 | def distributions(sigma, q): 85 | mu0 = lambda y: pdf_gauss(y, sigma=sigma, mean=0.0) 86 | mu1 = lambda y: pdf_gauss(y, sigma=sigma, mean=1.0) 87 | mu = lambda y: (1 - q) * mu0(y) + q * mu1(y) 88 | return mu0, mu1, mu 89 | 90 | 91 | def compute_a(sigma, q, lmbd, verbose=False): 92 | lmbd_int = int(math.ceil(lmbd)) 93 | if lmbd_int == 0: 94 | return 1.0 95 | 96 | a_lambda_first_term_exact = 0 97 | a_lambda_second_term_exact = 0 98 | for i in range(lmbd_int + 1): 99 | coef_i = scipy.special.binom(lmbd_int, i) * (q ** i) 100 | s1, s2 = 0, 0 101 | for j in range(i + 1): 102 | coef_j = scipy.special.binom(i, j) * (-1) ** (i - j) 103 | s1 += coef_j * np.exp((j * j - j) / (2.0 * (sigma ** 2))) 104 | s2 += coef_j * np.exp((j * j + j) / (2.0 * (sigma ** 2))) 105 | a_lambda_first_term_exact += coef_i * s1 106 | a_lambda_second_term_exact += coef_i * s2 107 | 108 | a_lambda_exact = ((1.0 - q) * a_lambda_first_term_exact + 109 | q * a_lambda_second_term_exact) 110 | if verbose: 111 | print("A: by binomial expansion {} = {} + {}".format( 112 | a_lambda_exact, 113 | (1.0 - q) * a_lambda_first_term_exact, 114 | q * a_lambda_second_term_exact)) 115 | return _to_np_float64(a_lambda_exact) 116 | 117 | 118 | def compute_b(sigma, q, lmbd, verbose=False): 119 | mu0, _, mu = distributions(sigma, q) 120 | 121 | b_lambda_fn = lambda z: mu0(z) * np.power(cropped_ratio(mu0(z), mu(z)), lmbd) 122 | b_lambda = integral_inf(b_lambda_fn) 123 | m = sigma ** 2 * (np.log((2. - q) / (1. - q)) + 1. / (2 * sigma ** 2)) 124 | 125 | b_fn = lambda z: (np.power(mu0(z) / mu(z), lmbd) - 126 | np.power(mu(-z) / mu0(z), lmbd)) 127 | if verbose: 128 | print("M =", m) 129 | print("f(-M) = {} f(M) = {}".format(b_fn(-m), b_fn(m))) 130 | assert b_fn(-m) < 0 and b_fn(m) < 0 131 | 132 | b_lambda_int1_fn = lambda z: (mu0(z) * 133 | np.power(cropped_ratio(mu0(z), mu(z)), lmbd)) 134 | b_lambda_int2_fn = lambda z: (mu0(z) * 135 | np.power(cropped_ratio(mu(z), mu0(z)), lmbd)) 136 | b_int1 = integral_bounded(b_lambda_int1_fn, -m, m) 137 | b_int2 = integral_bounded(b_lambda_int2_fn, -m, m) 138 | 139 | a_lambda_m1 = compute_a(sigma, q, lmbd - 1) 140 | b_bound = a_lambda_m1 + b_int1 - b_int2 141 | 142 | if verbose: 143 | print("B: by numerical integration", b_lambda) 144 | print("B must be no more than ", b_bound) 145 | print(b_lambda, b_bound) 146 | return _to_np_float64(b_lambda) 147 | 148 | 149 | ########################### 150 | # MULTIPRECISION ROUTINES # 151 | ########################### 152 | 153 | 154 | def pdf_gauss_mp(x, sigma, mean): 155 | return mp.mpf(1.) 
/ mp.sqrt(mp.mpf("2.") * sigma ** 2 * mp.pi) * mp.exp( 156 | - (x - mean) ** 2 / (mp.mpf("2.") * sigma ** 2)) 157 | 158 | 159 | def integral_inf_mp(fn): 160 | integral, _ = mp.quad(fn, [-mp.inf, mp.inf], error=True) 161 | return integral 162 | 163 | 164 | def integral_bounded_mp(fn, lb, ub): 165 | integral, _ = mp.quad(fn, [lb, ub], error=True) 166 | return integral 167 | 168 | 169 | def distributions_mp(sigma, q): 170 | mu0 = lambda y: pdf_gauss_mp(y, sigma=sigma, mean=mp.mpf(0)) 171 | mu1 = lambda y: pdf_gauss_mp(y, sigma=sigma, mean=mp.mpf(1)) 172 | mu = lambda y: (1 - q) * mu0(y) + q * mu1(y) 173 | return mu0, mu1, mu 174 | 175 | 176 | def compute_a_mp(sigma, q, lmbd, verbose=False): 177 | lmbd_int = int(math.ceil(lmbd)) 178 | if lmbd_int == 0: 179 | return 1.0 180 | 181 | mu0, mu1, mu = distributions_mp(sigma, q) 182 | a_lambda_fn = lambda z: mu(z) * (mu(z) / mu0(z)) ** lmbd_int 183 | a_lambda_first_term_fn = lambda z: mu0(z) * (mu(z) / mu0(z)) ** lmbd_int 184 | a_lambda_second_term_fn = lambda z: mu1(z) * (mu(z) / mu0(z)) ** lmbd_int 185 | 186 | a_lambda = integral_inf_mp(a_lambda_fn) 187 | a_lambda_first_term = integral_inf_mp(a_lambda_first_term_fn) 188 | a_lambda_second_term = integral_inf_mp(a_lambda_second_term_fn) 189 | 190 | if verbose: 191 | print("A: by numerical integration {} = {} + {}".format( 192 | a_lambda, 193 | (1 - q) * a_lambda_first_term, 194 | q * a_lambda_second_term)) 195 | 196 | return _to_np_float64(a_lambda) 197 | 198 | 199 | def compute_b_mp(sigma, q, lmbd, verbose=False): 200 | lmbd_int = int(math.ceil(lmbd)) 201 | if lmbd_int == 0: 202 | return 1.0 203 | 204 | mu0, _, mu = distributions_mp(sigma, q) 205 | 206 | b_lambda_fn = lambda z: mu0(z) * (mu0(z) / mu(z)) ** lmbd_int 207 | b_lambda = integral_inf_mp(b_lambda_fn) 208 | 209 | m = sigma ** 2 * (mp.log((2 - q) / (1 - q)) + 1 / (2 * (sigma ** 2))) 210 | b_fn = lambda z: ((mu0(z) / mu(z)) ** lmbd_int - 211 | (mu(-z) / mu0(z)) ** lmbd_int) 212 | if verbose: 213 | print("M =", m) 214 | print("f(-M) = {} f(M) = {}".format(b_fn(-m), b_fn(m))) 215 | assert b_fn(-m) < 0 and b_fn(m) < 0 216 | 217 | b_lambda_int1_fn = lambda z: mu0(z) * (mu0(z) / mu(z)) ** lmbd_int 218 | b_lambda_int2_fn = lambda z: mu0(z) * (mu(z) / mu0(z)) ** lmbd_int 219 | b_int1 = integral_bounded_mp(b_lambda_int1_fn, -m, m) 220 | b_int2 = integral_bounded_mp(b_lambda_int2_fn, -m, m) 221 | 222 | a_lambda_m1 = compute_a_mp(sigma, q, lmbd - 1) 223 | b_bound = a_lambda_m1 + b_int1 - b_int2 224 | 225 | if verbose: 226 | print("B by numerical integration", b_lambda) 227 | print("B must be no more than ", b_bound) 228 | assert b_lambda < b_bound + 1e-5 229 | return _to_np_float64(b_lambda) 230 | 231 | 232 | def _compute_delta(log_moments, eps): 233 | """Compute delta for given log_moments and eps. 234 | 235 | Args: 236 | log_moments: the log moments of privacy loss, in the form of pairs 237 | of (moment_order, log_moment) 238 | eps: the target epsilon. 239 | Returns: 240 | delta 241 | """ 242 | min_delta = 1.0 243 | for moment_order, log_moment in log_moments: 244 | if moment_order == 0: 245 | continue 246 | if math.isinf(log_moment) or math.isnan(log_moment): 247 | sys.stderr.write("The %d-th order is inf or Nan\n" % moment_order) 248 | continue 249 | if log_moment < moment_order * eps: 250 | min_delta = min(min_delta, 251 | math.exp(log_moment - moment_order * eps)) 252 | return min_delta 253 | 254 | 255 | def _compute_eps(log_moments, delta): 256 | """Compute epsilon for given log_moments and delta. 
257 | 258 | Args: 259 | log_moments: the log moments of privacy loss, in the form of pairs 260 | of (moment_order, log_moment) 261 | delta: the target delta. 262 | Returns: 263 | epsilon 264 | """ 265 | min_eps = float("inf") 266 | for moment_order, log_moment in log_moments: 267 | if moment_order == 0: 268 | continue 269 | if math.isinf(log_moment) or math.isnan(log_moment): 270 | sys.stderr.write("The %d-th order is inf or Nan\n" % moment_order) 271 | continue 272 | min_eps = min(min_eps, (log_moment - math.log(delta)) / moment_order) 273 | return min_eps 274 | 275 | 276 | def compute_log_moment(q, sigma, steps, lmbd, verify=False, verbose=False): 277 | """Compute the log moment of Gaussian mechanism for given parameters. 278 | 279 | Args: 280 | q: the sampling ratio. 281 | sigma: the noise sigma. 282 | steps: the number of steps.(priv_pars['T']) 283 | lmbd: the moment order. 284 | verify: if False, only compute the symbolic version. If True, computes 285 | both symbolic and numerical solutions and verifies the results match. 286 | verbose: if True, print out debug information. 287 | Returns: 288 | the log moment with type np.float64, could be np.inf. 289 | """ 290 | moment = compute_a(sigma, q, lmbd, verbose=verbose) 291 | if verify: 292 | mp.dps = 50 293 | moment_a_mp = compute_a_mp(sigma, q, lmbd, verbose=verbose) 294 | moment_b_mp = compute_b_mp(sigma, q, lmbd, verbose=verbose) 295 | print((moment,moment_a_mp)) 296 | np.testing.assert_allclose(moment, moment_a_mp, rtol=1e-10) 297 | if not np.isinf(moment_a_mp): 298 | # The following test fails for (1, np.inf)! 299 | np.testing.assert_array_less(moment_b_mp, moment_a_mp) 300 | if np.isinf(moment): 301 | return np.inf 302 | else: 303 | return np.log(moment) * steps 304 | 305 | 306 | def get_privacy_spent(log_moments, target_eps=None, target_delta=None): 307 | """Compute delta (or eps) for given eps (or delta) from log moments. 308 | 309 | Args: 310 | log_moments: array of (moment_order, log_moment) pairs. 311 | target_eps: if not None, the epsilon for which we would like to compute 312 | corresponding delta value. 313 | target_delta: if not None, the delta for which we would like to compute 314 | corresponding epsilon value. 315 | Exactly one of target_eps and target_delta is None. 
316 | Returns: 317 | eps, delta pair 318 | """ 319 | assert (target_eps is None) ^ (target_delta is None) 320 | assert not ((target_eps is None) and (target_delta is None)) 321 | if target_eps is not None: 322 | return (target_eps, _compute_delta(log_moments, target_eps)) 323 | else: 324 | return (_compute_eps(log_moments, target_delta), target_delta) 325 | -------------------------------------------------------------------------------- /src/DPGGAN/data_utils.py: -------------------------------------------------------------------------------- 1 | import pickle as pkl 2 | import scipy 3 | import sys 4 | 5 | import networkx as nx 6 | import numpy as np 7 | import scipy.sparse as sp 8 | import torch 9 | from sklearn.metrics import roc_auc_score, average_precision_score 10 | import matplotlib.pyplot as plt 11 | 12 | def load_data(dataset, n_eigenvector=None): 13 | adj, features = None, None 14 | 15 | # Set default feature_dim 16 | if n_eigenvector != None: 17 | feature_dim = n_eigenvector 18 | else: 19 | feature_dim = 15 20 | 21 | # Load data from specific dataset 22 | if dataset == 'karate': 23 | G = nx.karate_club_graph() 24 | adj = nx.to_scipy_sparse_matrix(G) 25 | elif dataset == 'cora' or dataset == 'citeseer': 26 | # load the data: x, tx, allx, graph 27 | names = ['x', 'tx', 'allx', 'graph'] 28 | objects = [] 29 | for i in range(len(names)): 30 | ''' 31 | fix Pickle incompatibility of numpy arrays between Python 2 and 3 32 | https://stackoverflow.com/questions/11305790/pickle-incompatibility-of-numpy-arrays-between-python-2-and-3 33 | ''' 34 | with open("data/ind.{}.{}".format(dataset, names[i]), 'rb') as rf: 35 | u = pkl._Unpickler(rf) 36 | u.encoding = 'latin1' 37 | cur_data = u.load() 38 | objects.append(cur_data) 39 | # objects.append( 40 | # pkl.load(open("data/ind.{}.{}".format(dataset, names[i]), 'rb'))) 41 | x, tx, allx, graph = tuple(objects) 42 | test_idx_reorder = parse_index_file( 43 | "data/ind.{}.test.index".format(dataset)) 44 | test_idx_range = np.sort(test_idx_reorder) 45 | 46 | if dataset == 'citeseer': 47 | # Fix citeseer dataset (there are some isolated nodes in the graph) 48 | # Find isolated nodes, add them as zero-vecs into the right position 49 | test_idx_range_full = range( 50 | min(test_idx_reorder), max(test_idx_reorder) + 1) 51 | tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1])) 52 | tx_extended[test_idx_range - min(test_idx_range), :] = tx 53 | tx = tx_extended 54 | 55 | features = sp.vstack((allx, tx)).tolil() 56 | features[test_idx_reorder, :] = features[test_idx_range, :] 57 | features = torch.FloatTensor(np.array(features.todense())) 58 | adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph)) 59 | 60 | # If use eigenvector as feature 61 | if n_eigenvector != None: 62 | node_num = adj.shape[0] 63 | adj_ = sp.coo_matrix(adj) 64 | adj_ = adj + sp.eye(adj_.shape[0]) 65 | rowsum = np.array(adj_.sum(1)) 66 | degree_mat_inv_sqrt = sp.diags(np.power(rowsum, -0.5).flatten()) 67 | adj_normalized = adj_.dot(degree_mat_inv_sqrt).transpose().dot(degree_mat_inv_sqrt).toarray() 68 | _, features = scipy.linalg.eigh(adj_normalized, eigvals=(node_num - feature_dim, node_num - 1)) 69 | features = torch.FloatTensor(features) 70 | 71 | features_normalize = True 72 | if features_normalize: 73 | features = normalize(features) 74 | 75 | return adj, features 76 | 77 | def normalize(x): 78 | import torch.nn.functional as F 79 | x_normed = F.normalize(x, p=2, dim=1) 80 | return x_normed 81 | 82 | 83 | def parse_index_file(filename): 84 | index = [] 85 | for line 
in open(filename): 86 | index.append(int(line.strip())) 87 | return index 88 | 89 | 90 | def sparse_to_tuple(sparse_mx): 91 | if not sp.isspmatrix_coo(sparse_mx): 92 | sparse_mx = sparse_mx.tocoo() 93 | coords = np.vstack((sparse_mx.row, sparse_mx.col)).transpose() 94 | values = sparse_mx.data 95 | shape = sparse_mx.shape 96 | return coords, values, shape 97 | 98 | 99 | def mask_test_edges(adj): 100 | # Function to build test set with 10% positive links 101 | # NOTE: Splits are randomized and results might slightly deviate from reported numbers in the paper. 102 | # TODO: Clean up. 103 | 104 | # Remove diagonal elements 105 | adj = adj - sp.dia_matrix((adj.diagonal()[np.newaxis, :], [0]), shape=adj.shape) 106 | adj.eliminate_zeros() 107 | # Check that diag is zero: 108 | assert np.diag(adj.todense()).sum() == 0 109 | 110 | # TODO: prevent delete edge from node having only one edge 111 | node_with_one_edge = set() 112 | for i,v in enumerate(adj.sum(axis=1)): 113 | if v[0] == 0: 114 | print("Node %d without neigh" % i) 115 | sys.exit() 116 | elif v[0] == 1: 117 | node_with_one_edge.add(i) 118 | 119 | adj_triu = sp.triu(adj) 120 | adj_tuple = sparse_to_tuple(adj_triu) 121 | edges = adj_tuple[0] # edges-(5278, 2) 122 | qualified_edges = [] 123 | for i in edges: 124 | if i[0] not in node_with_one_edge and i[1] not in node_with_one_edge: 125 | qualified_edges.append(i) 126 | qualified_edges = np.array(qualified_edges) 127 | num_test = int(np.floor(edges.shape[0] / 10.)) 128 | num_val = int(np.floor(edges.shape[0] / 20.)) 129 | edges_set = set() 130 | qualified_edges_set = set() 131 | for edge in edges: 132 | edges_set.add((edge[0],edge[1])) 133 | for edge in qualified_edges: 134 | qualified_edges_set.add((edge[0], edge[1])) 135 | diff_edges = [] 136 | for t in (edges_set - qualified_edges_set): 137 | diff_edges.append([t[0],t[1]]) 138 | diff_edges = np.array(diff_edges) 139 | 140 | qualified_edge_idx = list(range(qualified_edges.shape[0])) 141 | np.random.shuffle(qualified_edge_idx) 142 | val_edge_idx = qualified_edge_idx[:num_val] 143 | test_edge_idx = qualified_edge_idx[num_val:(num_val + num_test)] 144 | train_edge_idx = qualified_edge_idx[(num_val + num_test):] 145 | test_edges = qualified_edges[test_edge_idx] 146 | val_edges = qualified_edges[val_edge_idx] 147 | train_edges = qualified_edges[train_edge_idx] 148 | train_edges = np.vstack([train_edges,diff_edges]) 149 | # Catch-up for those isolated nodes 150 | data = np.ones(train_edges.shape[0]) 151 | adj_train = sp.csr_matrix((data, (train_edges[:, 0], train_edges[:, 1])), shape=adj.shape) 152 | adj_train = adj_train + adj_train.T 153 | isolated_nodes = set() 154 | for i,v in enumerate(adj_train.sum(axis=1)): 155 | if v[0] == 0: 156 | isolated_nodes.add(i) 157 | patch_edges = [] 158 | for i in isolated_nodes: 159 | j = np.random.choice(np.nonzero(adj[i].toarray())[1]) 160 | patch_edges.append([i,j]) 161 | patch_edges = np.array(patch_edges) 162 | train_edges = np.vstack([train_edges, patch_edges]) 163 | 164 | 165 | def ismember(a, b, tol=5): 166 | rows_close = np.all(np.round(a - b[:, None], tol) == 0, axis=-1) 167 | redundant_index = np.where(rows_close)[1] # redundant element's index of a 168 | return np.any(rows_close), redundant_index 169 | 170 | test_edges_false = [] 171 | edges_all = sparse_to_tuple(adj)[0] 172 | while len(test_edges_false) < len(test_edges): 173 | idx_i = np.random.randint(0, adj.shape[0]) 174 | idx_j = np.random.randint(0, adj.shape[0]) 175 | if idx_i == idx_j: 176 | continue 177 | if ismember([idx_i, idx_j], 
edges_all)[0]: 178 | continue 179 | if test_edges_false: 180 | if ismember([idx_j, idx_i], np.array(test_edges_false))[0]: 181 | continue 182 | if ismember([idx_i, idx_j], np.array(test_edges_false))[0]: 183 | continue 184 | test_edges_false.append([idx_i, idx_j]) 185 | 186 | val_edges_false = [] 187 | while len(val_edges_false) < len(val_edges): 188 | idx_i = np.random.randint(0, adj.shape[0]) 189 | idx_j = np.random.randint(0, adj.shape[0]) 190 | if idx_i == idx_j: 191 | continue 192 | if ismember([idx_i, idx_j], train_edges)[0]: 193 | continue 194 | if ismember([idx_j, idx_i], train_edges)[0]: 195 | continue 196 | if ismember([idx_i, idx_j], val_edges)[0]: 197 | continue 198 | if ismember([idx_j, idx_i], val_edges)[0]: 199 | continue 200 | if val_edges_false: 201 | if ismember([idx_j, idx_i], np.array(val_edges_false))[0]: 202 | continue 203 | if ismember([idx_i, idx_j], np.array(val_edges_false))[0]: 204 | continue 205 | val_edges_false.append([idx_i, idx_j]) 206 | 207 | assert ~ismember(test_edges_false, edges_all)[0] 208 | assert ~ismember(val_edges_false, edges_all)[0] 209 | assert ~ismember(val_edges, test_edges)[0] 210 | if ismember(val_edges, train_edges)[0]: 211 | remove_index = ismember(val_edges, train_edges)[1] 212 | print("elements to remove from val: %d" % len(remove_index)) 213 | val_edges = np.delete(val_edges,remove_index,0) 214 | if ismember(test_edges, train_edges)[0]: 215 | remove_index = ismember(test_edges, train_edges)[1] 216 | print("elements to remove from test: %d" % len(remove_index)) 217 | test_edges = np.delete(test_edges, remove_index, 0) 218 | assert ~ismember(val_edges, train_edges)[0] 219 | assert ~ismember(test_edges, train_edges)[0] 220 | 221 | data = np.ones(train_edges.shape[0]) 222 | 223 | # Re-build adj matrix 224 | adj_train = sp.csr_matrix((data, (train_edges[:, 0], train_edges[:, 1])), shape=adj.shape) 225 | adj_train = adj_train + adj_train.T 226 | 227 | # NOTE: these edge lists only contain a single direction of each edge!
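# Descriptive note: roughly 10% of the edges are held out for test and 5% for
# validation, drawn only from edges whose endpoints both have degree > 1 (edges
# touching degree-1 nodes always stay in training); each held-out set is paired
# with an equal number of sampled non-edges, and adj_train is rebuilt as a
# symmetric matrix from the training edges before being returned.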
228 | return adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false 229 | 230 | 231 | def preprocess_graph(adj): 232 | adj = sp.coo_matrix(adj) 233 | adj_ = adj + sp.eye(adj.shape[0]) 234 | rowsum = np.array(adj_.sum(1)) 235 | degree_mat_inv_sqrt = sp.diags(np.power(rowsum, -0.5).flatten()) 236 | adj_normalized = adj_.dot(degree_mat_inv_sqrt).transpose().dot(degree_mat_inv_sqrt).tocoo() 237 | # return sparse_to_tuple(adj_normalized) 238 | return sparse_mx_to_torch_sparse_tensor(adj_normalized) 239 | 240 | 241 | def sparse_mx_to_torch_sparse_tensor(sparse_mx): 242 | """Convert a scipy sparse matrix to a torch sparse tensor.""" 243 | sparse_mx = sparse_mx.tocoo().astype(np.float32) 244 | indices = torch.from_numpy( 245 | np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64)) 246 | values = torch.from_numpy(sparse_mx.data) 247 | shape = torch.Size(sparse_mx.shape) 248 | return torch.sparse.FloatTensor(indices, values, shape) 249 | 250 | 251 | def get_roc_score(emb, adj_orig, edges_pos, edges_neg): 252 | def sigmoid(x): 253 | return 1 / (1 + np.exp(-x)) 254 | # Predict on test set of edges 255 | adj_rec = np.dot(emb, emb.T) 256 | preds = [] 257 | pos = [] 258 | for e in edges_pos: 259 | preds.append(sigmoid(adj_rec[e[0], e[1]])) 260 | pos.append(adj_orig[e[0], e[1]]) 261 | 262 | preds_neg = [] 263 | neg = [] 264 | for e in edges_neg: 265 | preds_neg.append(sigmoid(adj_rec[e[0], e[1]])) 266 | neg.append(adj_orig[e[0], e[1]]) 267 | 268 | preds_all = np.hstack([preds, preds_neg]) 269 | labels_all = np.hstack([np.ones(len(preds)), np.zeros(len(preds_neg))]) 270 | roc_score = roc_auc_score(labels_all, preds_all) 271 | ap_score = average_precision_score(labels_all, preds_all) 272 | 273 | return roc_score, ap_score 274 | 275 | 276 | def make_adj_label(index, adj_matrix): 277 | adj_label = np.zeros((len(index),len(index))) 278 | for i,v in enumerate(index): 279 | adj_label[i] = adj_matrix[v,index].toarray() 280 | return adj_label 281 | 282 | 283 | def draw_graph(adj, path, plot_name, circle): 284 | G = None 285 | if scipy.sparse.issparse(adj): 286 | G = nx.from_scipy_sparse_matrix(adj) 287 | else: 288 | G = nx.from_numpy_matrix(adj) 289 | 290 | options = { 291 | 'node_color': 'black', 292 | 'node_size': 5, 293 | 'line_color': 'grey', 294 | 'linewidths': 0.1, 295 | 'width': 0.1, 296 | } 297 | 298 | if circle: 299 | node_list = sorted(G.degree, key=lambda x: x[1], reverse=True) 300 | node2order = {} 301 | for i, v in enumerate(node_list): 302 | node2order[v[0]] = i 303 | 304 | new_edge = [] 305 | for i in G.edges(): 306 | new_edge.append((node2order[i[0]], node2order[i[1]])) 307 | 308 | new_G = nx.Graph() 309 | new_G.add_nodes_from(range(len(node_list))) 310 | new_G.add_edges_from(new_edge) 311 | 312 | nx.draw_circular(new_G, with_labels=True) 313 | 314 | else: 315 | nx.draw(G, **options) 316 | plt.savefig(path + '/' + plot_name + ".png") 317 | plt.clf() -------------------------------------------------------------------------------- /src/eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | 4 | import scipy 5 | import igraph 6 | import networkx as nx 7 | import numpy as np 8 | import powerlaw 9 | import scipy.sparse as sp 10 | from scipy.sparse.csgraph import connected_components 11 | 12 | import matplotlib.pyplot as plt 13 | plt.switch_backend('agg') 14 | 15 | 16 | dir_path = os.path.dirname(os.path.realpath(__file__)) 17 | root_path = os.path.abspath(os.path.join(dir_path, os.pardir)) 18 | 19 | 20 | 
def statistics_degrees(A_in): 21 | """ 22 | Compute min, max, mean degree 23 | 24 | Parameters 25 | ---------- 26 | A_in: sparse matrix or np.array 27 | The input adjacency matrix. 28 | Returns 29 | ------- 30 | d_max. d_min, d_mean 31 | """ 32 | 33 | degrees = A_in.sum(axis=0) 34 | return np.max(degrees), np.min(degrees), np.mean(degrees) 35 | 36 | 37 | def statistics_LCC(A_in): 38 | """ 39 | Compute the size of the largest connected component (LCC) 40 | 41 | Parameters 42 | ---------- 43 | A_in: sparse matrix or np.array 44 | The input adjacency matrix. 45 | Returns 46 | ------- 47 | Size of LCC 48 | 49 | """ 50 | 51 | unique, counts = np.unique(connected_components(A_in)[1], return_counts=True) 52 | LCC = np.where(connected_components(A_in)[1] == np.argmax(counts))[0] 53 | return LCC 54 | 55 | 56 | def statistics_wedge_count(A_in): 57 | """ 58 | Compute the wedge count of the input graph 59 | 60 | Parameters 61 | ---------- 62 | A_in: sparse matrix or np.array 63 | The input adjacency matrix. 64 | 65 | Returns 66 | ------- 67 | The wedge count. 68 | """ 69 | 70 | degrees = A_in.sum(axis=0) 71 | return float(np.sum(np.array([0.5 * x * (x - 1) for x in degrees]))) 72 | 73 | 74 | def statistics_claw_count(A_in): 75 | """ 76 | Compute the claw count of the input graph 77 | 78 | Parameters 79 | ---------- 80 | A_in: sparse matrix or np.array 81 | The input adjacency matrix. 82 | 83 | Returns 84 | ------- 85 | Claw count 86 | """ 87 | 88 | degrees = A_in.sum(axis=0) 89 | return float(np.sum(np.array([1 / 6. * x * (x - 1) * (x - 2) for x in degrees]))) 90 | 91 | 92 | def statistics_triangle_count(A_in): 93 | """ 94 | Compute the triangle count of the input graph 95 | 96 | Parameters 97 | ---------- 98 | A_in: sparse matrix or np.array 99 | The input adjacency matrix. 100 | Returns 101 | ------- 102 | Triangle count 103 | """ 104 | 105 | A_graph = nx.from_numpy_matrix(A_in) 106 | triangles = nx.triangles(A_graph) 107 | t = np.sum(list(triangles.values())) / 3 108 | return int(t) 109 | 110 | 111 | def squares(g): 112 | """ 113 | Count the number of squares for each node 114 | Parameters 115 | ---------- 116 | g: igraph Graph object 117 | The input graph. 118 | 119 | Returns 120 | ------- 121 | List with N entries (N is number of nodes) that give the number of squares a node is part of. 122 | """ 123 | 124 | cliques = g.cliques(min=4, max=4) 125 | result = [0] * g.vcount() 126 | for i, j, k, l in cliques: 127 | result[i] += 1 128 | result[j] += 1 129 | result[k] += 1 130 | result[l] += 1 131 | return result 132 | 133 | 134 | def statistics_square_count(A_in): 135 | """ 136 | Compute the square count of the input graph 137 | 138 | Parameters 139 | ---------- 140 | A_in: sparse matrix or np.array 141 | The input adjacency matrix. 142 | Returns 143 | ------- 144 | Square count 145 | """ 146 | 147 | A_igraph = igraph.Graph.Adjacency((A_in > 0).tolist()).as_undirected() 148 | return int(np.sum(squares(A_igraph)) / 4) 149 | 150 | 151 | def statistics_power_law_alpha(A_in): 152 | """ 153 | Compute the power law coefficient of the degree distribution of the input graph 154 | 155 | Parameters 156 | ---------- 157 | A_in: sparse matrix or np.array 158 | The input adjacency matrix. 
159 | 160 | Returns 161 | ------- 162 | Power law coefficient 163 | """ 164 | 165 | degrees = A_in.sum(axis=0) 166 | return powerlaw.Fit(degrees, xmin=max(np.min(degrees),1)).power_law.alpha 167 | 168 | 169 | def statistics_gini(A_in): 170 | """ 171 | Compute the Gini coefficient of the degree distribution of the input graph 172 | 173 | Parameters 174 | ---------- 175 | A_in: sparse matrix or np.array 176 | The input adjacency matrix. 177 | 178 | Returns 179 | ------- 180 | Gini coefficient 181 | """ 182 | 183 | n = A_in.shape[0] 184 | degrees = A_in.sum(axis=0) 185 | degrees_sorted = np.sort(degrees) 186 | G = (2 * np.sum(np.array([i * degrees_sorted[i] for i in range(len(degrees))]))) / (n * np.sum(degrees)) - ( 187 | n + 1) / n 188 | return float(G) 189 | 190 | 191 | def statistics_edge_distribution_entropy(A_in): 192 | """ 193 | Compute the relative edge distribution entropy of the input graph. 194 | 195 | Parameters 196 | ---------- 197 | A_in: sparse matrix or np.array 198 | The input adjacency matrix. 199 | 200 | Returns 201 | ------- 202 | Rel. edge distribution entropy 203 | """ 204 | 205 | degrees = A_in.sum(axis=0) 206 | m = 0.5 * np.sum(np.square(A_in)) 207 | n = A_in.shape[0] 208 | 209 | H_er = 1 / np.log(n) * np.sum(-degrees / (2 * float(m)) * np.log((degrees+.0001) / (2 * float(m)))) 210 | return H_er 211 | 212 | 213 | def statistics_compute_cpl(A): 214 | """Compute characteristic path length.""" 215 | P = sp.csgraph.shortest_path(sp.csr_matrix(A)) 216 | return P[((1 - np.isinf(P)) * (1 - np.eye(P.shape[0]))).astype(np.bool)].mean() 217 | 218 | 219 | def symmetrize_and_without_self_loop(adj_orig): 220 | def symmetrize(a): 221 | # print("symmetrize A!") 222 | a = a + a.T 223 | sum_a = a - np.diag(a.diagonal()) 224 | sum_a[sum_a >= 1] = 1 225 | sum_a[sum_a < 1] = 0 226 | return sum_a 227 | 228 | # input must be np.array not sparse matrix 229 | if scipy.sparse.issparse(adj_orig): 230 | adj_ = adj_orig.todense() 231 | else: 232 | adj_ = adj_orig 233 | 234 | # remove self_loop 235 | np.fill_diagonal(adj_, 0) 236 | adj_orig = symmetrize(adj_) 237 | 238 | G = nx.from_numpy_array(adj_) 239 | G.remove_nodes_from(list(nx.isolates(G))) 240 | adj = nx.to_numpy_array(G) 241 | return adj 242 | 243 | 244 | def compute_graph_statistics(A_in, Z_obs=None): 245 | """ 246 | 247 | Parameters 248 | ---------- 249 | A_in: sparse matrix 250 | The input adjacency matrix. 251 | Z_obs: np.matrix [N, K], where K is the number of classes. 252 | Matrix whose rows are one-hot vectors indicating the class membership of the respective node. 
253 | 254 | Returns 255 | ------- 256 | Dictionary containing the following statistics: 257 | * Maximum, minimum, mean degree of nodes 258 | * Size of the largest connected component (LCC) 259 | * Wedge count 260 | * Claw count 261 | * Triangle count 262 | * Square count 263 | * Power law exponent 264 | * Gini coefficient 265 | * Relative edge distribution entropy 266 | * Assortativity 267 | * Clustering coefficient 268 | * Number of connected components 269 | * Intra- and inter-community density (if Z_obs is passed) 270 | * Characteristic path length 271 | """ 272 | A = A_in.copy() 273 | 274 | 275 | # important restriction 276 | A = symmetrize_and_without_self_loop(A) 277 | 278 | A_graph = nx.from_numpy_matrix(A).to_undirected() 279 | 280 | statistics = {} 281 | 282 | d_max, d_min, d_mean = statistics_degrees(A) 283 | 284 | # Degree statistics 285 | statistics['d_max'] = d_max 286 | statistics['d_min'] = d_min 287 | statistics['d'] = d_mean 288 | 289 | # node number & edger number 290 | statistics['node_num'] = A_graph.number_of_nodes() 291 | statistics['edge_num'] = A_graph.number_of_edges() 292 | 293 | # largest connected component 294 | LCC = statistics_LCC(A) 295 | 296 | statistics['LCC'] = LCC.shape[0] 297 | # wedge count 298 | # statistics['wedge_count'] = statistics_wedge_count(A) 299 | 300 | # claw count 301 | # statistics['claw_count'] = statistics_claw_count(A) 302 | 303 | # triangle count 304 | statistics['triangle_count'] = statistics_triangle_count(A) 305 | 306 | # Square count 307 | # statistics['square_count'] = statistics_square_count(A) 308 | 309 | # power law exponent 310 | # statistics['power_law_exp'] = statistics_power_law_alpha(A) 311 | 312 | # gini coefficient 313 | statistics['gini'] = statistics_gini(A) 314 | 315 | # Relative edge distribution entropy 316 | statistics['rel_edge_distr_entropy'] = statistics_edge_distribution_entropy(A) 317 | 318 | # Assortativity 319 | # statistics['assortativity'] = nx.degree_assortativity_coefficient(A_graph) 320 | 321 | # Clustering coefficient 322 | # statistics['clustering_coefficient'] = 3 * statistics['triangle_count'] / statistics['claw_count'] 323 | 324 | # Number of connected components 325 | # statistics['n_components'] = connected_components(A)[0] 326 | 327 | # if Z_obs is not None: 328 | # # inter- and intra-community density 329 | # intra, inter = statistics_cluster_props(A, Z_obs) 330 | # statistics['intra_community_density'] = intra 331 | # statistics['inter_community_density'] = inter 332 | 333 | statistics['cpl'] = statistics_compute_cpl(A) 334 | 335 | return statistics 336 | 337 | 338 | def stat_eval(G): 339 | return compute_graph_statistics(nx.to_scipy_sparse_matrix(G).toarray()) 340 | 341 | 342 | 343 | def load_graphs(file_path): 344 | with open(file_path, 'rb') as pkl_file: 345 | data = pickle.load(pkl_file) 346 | return data 347 | 348 | 349 | # load saved graphs and calculate avg of each metric 350 | if __name__ == '__main__': 351 | print(root_path) 352 | dataset = 'imdb' # 'dblp' 353 | 354 | data_folder = root_path + '/data/' 355 | 356 | if dataset == 'imdb': 357 | orig = load_graphs(data_folder + 'orig/new_IMDB_MULTI.pkl') 358 | dpgraphgan_graph = load_graphs(data_folder + 'generated/DPGraphGAN_new_IMDB_MULTI.pkl') 359 | dpgraphvae_graph = load_graphs(data_folder + 'generated/DPGraphVAE_new_IMDB_MULTI.pkl') 360 | netgan_graph = load_graphs(data_folder + 'generated/NetGAN_new_IMDB_MULTI.pkl') 361 | graphrnn_graph = load_graphs(data_folder + 'generated/GraphRNN_new_IMDB_MULTI.pkl') 362 | graphvae_graph = 
load_graphs(data_folder + 'generated/GraphVAE_new_IMDB_MULTI.pkl') 363 | graphgan_graph = load_graphs(data_folder + 'generated/GraphGAN_new_IMDB_MULTI.pkl') 364 | else: 365 | orig = load_graphs(data_folder + 'orig/new_dblp2.pkl') 366 | dpgraphgan_graph = load_graphs(data_folder + 'generated/DPGraphGAN_new_dblp2.pkl') 367 | dpgraphvae_graph = load_graphs(data_folder + 'generated/DPGraphVAE_new_dblp2.pkl') 368 | netgan_graph = load_graphs(data_folder + 'generated/NetGAN_new_dblp2.pkl') 369 | graphrnn_graph = load_graphs(data_folder + 'generated/GraphRNN_new_dblp2.pkl') 370 | graphvae_graph = load_graphs(data_folder + 'generated/GraphVAE_new_dblp2.pkl') 371 | graphgan_graph = load_graphs(data_folder + 'generated/GraphGAN_new_dblp2.pkl') 372 | 373 | # # link density 374 | # upper_link_density = 0 375 | # lower_link_density = 1 376 | # for g in orig: 377 | # num_nodes = len(g) 378 | # full_edge_num = num_nodes**2 379 | # actual_edge_num = g.number_of_edges() 380 | # edge_density = actual_edge_num/full_edge_num 381 | # if edge_density > upper_link_density: 382 | # upper_link_density = edge_density 383 | # if edge_density < lower_link_density: 384 | # lower_link_density = edge_density 385 | # print("Upper:{}; Lower:{}".format(upper_link_density, lower_link_density)) 386 | 387 | 388 | # graph stat 389 | print(len(orig)) 390 | print(len(graphgan_graph)) 391 | # LCC, triangle_count, cpl, gini, rel_edge_distr_entropy 392 | LCC_list = [] 393 | TC_list = [] 394 | CPL_list = [] 395 | GINI_list = [] 396 | REDE_list = [] 397 | for g, generated_g in zip(orig, graphgan_graph): 398 | LCC_list.append(stat_eval(generated_g)['LCC']) 399 | TC_list.append(stat_eval(generated_g)['triangle_count']) 400 | CPL_list.append(stat_eval(generated_g)['cpl']) 401 | GINI_list.append(stat_eval(generated_g)['gini']) 402 | REDE_list.append(stat_eval(generated_g)['rel_edge_distr_entropy']) 403 | 404 | print("avg LCC:{}".format(sum(LCC_list)/len(LCC_list))) 405 | print("avg TC:{}".format(sum(TC_list) / len(TC_list))) 406 | print("avg CPL:{}".format(sum(CPL_list) / len(CPL_list))) 407 | print("avg GINI:{}".format(sum(GINI_list) / len(GINI_list))) 408 | print("avg REDE:{}".format(sum(REDE_list) / len(REDE_list))) 409 | 410 | # dblp 411 | # avg LCC:163.2173913043478 412 | # avg TC:643.9565217391304 413 | # avg CPL:3.6229098437512697 414 | # avg GINI:0.5010830435774121 415 | # avg REDE:0.9010612433648646 416 | # imdb 417 | # avg LCC:31.12 418 | # avg TC:1508.62 419 | # avg CPL:1.6412643551055979 420 | # avg GINI:0.17521018241531106 421 | # avg REDE:0.9630393055580962 422 | 423 | --------------------------------------------------------------------------------
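The module docstring of src/DPGGAN/gaussian_moments.py sketches the accountant loop with Python 2's xrange; a minimal Python 3 driver might look like the sketch below. The sampling ratio q, noise sigma, step count T, target delta and maximum order are placeholder values chosen only for illustration, not settings taken from the repository.

from src.DPGGAN.gaussian_moments import compute_log_moment, get_privacy_spent

q, sigma, T = 0.01, 2.0, 1000          # assumed values, for illustration only
max_lmbd = 32
log_moments = []
for lmbd in range(1, max_lmbd + 1):
    # log moment of the subsampled Gaussian mechanism after T steps, at order lmbd
    log_moments.append((lmbd, compute_log_moment(q, sigma, T, lmbd)))

# convert the accumulated log moments into an (eps, delta) guarantee at a fixed delta
eps, delta = get_privacy_spent(log_moments, target_delta=1e-5)
print("epsilon spent: %.3f at delta = %g" % (eps, delta))

Likewise, the statistics helpers in src/eval.py can be exercised on a single graph. The sketch below uses networkx's built-in karate-club graph purely as a stand-in for the pickled graph lists loaded in __main__ above, and assumes the repository root is on PYTHONPATH.

import networkx as nx
from src.eval import stat_eval

G = nx.karate_club_graph()
stats = stat_eval(G)  # LCC size, triangle count, CPL, Gini, edge-distribution entropy, ...
for key in ('node_num', 'edge_num', 'LCC', 'triangle_count', 'cpl', 'gini', 'rel_edge_distr_entropy'):
    print(key, stats[key])

As a design note, the __main__ loop above calls stat_eval five times per generated graph; computing the statistics dictionary once per graph and reading the five metrics from it would avoid the repeated work.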