├── GAE ├── __init__.py ├── __pycache__ │ ├── layers.cpython-36.pyc │ ├── model.cpython-36.pyc │ ├── __init__.cpython-36.pyc │ ├── optimizer.cpython-36.pyc │ ├── train_model.cpython-36.pyc │ ├── preprocessing.cpython-36.pyc │ └── initialization.cpython-36.pyc ├── initialization.py ├── optimizer.py ├── layers.py ├── model.py ├── preprocessing.py └── train_model.py ├── OpenNE ├── __init__.py ├── __pycache__ │ ├── RWR.cpython-36.pyc │ ├── gf.cpython-36.pyc │ ├── lap.cpython-36.pyc │ ├── graph.cpython-36.pyc │ ├── hope.cpython-36.pyc │ ├── line.cpython-36.pyc │ ├── sdne.cpython-36.pyc │ ├── __init__.cpython-36.pyc │ ├── classify.cpython-36.pyc │ ├── grarep.cpython-36.pyc │ ├── node2vec.cpython-36.pyc │ └── walker.cpython-36.pyc ├── node2vec.py ├── lap.py ├── gf.py ├── L3Hope.py ├── grarep.py ├── classify.py ├── RWR.py ├── graph.py ├── hope.py ├── walker.py ├── line.py └── sdne.py ├── DGI ├── models │ ├── __init__.py │ ├── __pycache__ │ │ ├── dgi.cpython-36.pyc │ │ ├── logreg.cpython-36.pyc │ │ └── __init__.cpython-36.pyc │ ├── logreg.py │ └── dgi.py ├── layers │ ├── __init__.py │ ├── __pycache__ │ │ ├── gcn.cpython-36.pyc │ │ ├── __init__.cpython-36.pyc │ │ ├── readout.cpython-36.pyc │ │ └── discriminator.cpython-36.pyc │ ├── readout.py │ ├── discriminator.py │ └── gcn.py └── utils │ ├── Laplacian.py │ └── process.py ├── README.md ├── evaluation.py ├── utils.py ├── main.py └── embed_train.py /GAE/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | -------------------------------------------------------------------------------- /OpenNE/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | -------------------------------------------------------------------------------- /DGI/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .dgi import DGI 2 | from .logreg import LogReg 3 | -------------------------------------------------------------------------------- /DGI/layers/__init__.py: -------------------------------------------------------------------------------- 1 | from .gcn import GCN 2 | from .readout import AvgReadout 3 | from .discriminator import Discriminator 4 | -------------------------------------------------------------------------------- /GAE/__pycache__/layers.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mustafaCoskunAgu/SiGraC/HEAD/GAE/__pycache__/layers.cpython-36.pyc -------------------------------------------------------------------------------- /GAE/__pycache__/model.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mustafaCoskunAgu/SiGraC/HEAD/GAE/__pycache__/model.cpython-36.pyc -------------------------------------------------------------------------------- /OpenNE/__pycache__/RWR.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mustafaCoskunAgu/SiGraC/HEAD/OpenNE/__pycache__/RWR.cpython-36.pyc -------------------------------------------------------------------------------- /OpenNE/__pycache__/gf.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mustafaCoskunAgu/SiGraC/HEAD/OpenNE/__pycache__/gf.cpython-36.pyc 
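The two __init__.py files above for DGI/models and DGI/layers expose the package's public classes (DGI, LogReg, GCN, AvgReadout, Discriminator). A minimal usage sketch, not part of the repository, assuming the repository root is on PYTHONPATH; the toy sizes, random features and identity adjacency are placeholders, and tensors follow the (batch, nodes, features) convention documented in DGI/layers/gcn.py:

import torch
from DGI.models import DGI, LogReg
from DGI.layers import GCN, AvgReadout, Discriminator   # building blocks composed inside DGI

n_nodes, n_feat, n_hid = 4, 8, 16                        # toy sizes (placeholders)
features = torch.rand(1, n_nodes, n_feat)                # (batch, nodes, features)
shuffled = features[:, torch.randperm(n_nodes)]          # corrupted features for negative samples
adj = torch.eye(n_nodes).unsqueeze(0)                    # stand-in for a normalized adjacency, shape (1, n, n)

model = DGI(n_feat, n_hid, 'prelu')
logits = model(features, shuffled, adj, False, None, None, None)   # discriminator scores for positive/negative pairs
emb, summary = model.embed(features, adj, False, None)             # detached node embeddings and graph summary
classifier = LogReg(n_hid, nb_classes=2)                           # downstream head trained on emb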
-------------------------------------------------------------------------------- /DGI/layers/readout.py: -------------------------------------------------------------------------------- 1 | 
import torch 2 | import torch.nn as nn 3 | 4 | # Applies an average on seq, of shape (batch, nodes, features) 5 | # While taking into account the masking of msk 6 | class AvgReadout(nn.Module): 7 | def __init__(self): 8 | super(AvgReadout, self).__init__() 9 | 10 | def forward(self, seq, msk): 11 | if msk is None: 12 | return torch.mean(seq, 1) 13 | else: 14 | msk = torch.unsqueeze(msk, -1) 15 | return torch.sum(seq * msk, 1) / torch.sum(msk) 16 | 17 | -------------------------------------------------------------------------------- /GAE/initialization.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy as np 4 | import tensorflow as tf 5 | 6 | 7 | def weight_variable_glorot(input_dim, output_dim, name=""): 8 | """Create a weight variable with Glorot & Bengio (AISTATS 2010) 9 | initialization. 10 | """ 11 | init_range = np.sqrt(6.0 / (input_dim + output_dim)) 12 | initial = tf.random_uniform([input_dim, output_dim], minval=-init_range, 13 | maxval=init_range, dtype=tf.float32) 14 | return tf.Variable(initial, name=name) 15 | -------------------------------------------------------------------------------- /DGI/models/logreg.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | class LogReg(nn.Module): 6 | def __init__(self, ft_in, nb_classes): 7 | super(LogReg, self).__init__() 8 | self.fc = nn.Linear(ft_in, nb_classes) 9 | 10 | for m in self.modules(): 11 | self.weights_init(m) 12 | 13 | def weights_init(self, m): 14 | if isinstance(m, nn.Linear): 15 | torch.nn.init.xavier_uniform_(m.weight.data) 16 | if m.bias is not None: 17 | m.bias.data.fill_(0.0) 18 | 19 | def forward(self, seq): 20 | ret = self.fc(seq) 21 | return ret 22 | 23 | -------------------------------------------------------------------------------- /DGI/utils/Laplacian.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed May 20 19:31:11 2020 4 | 5 | @author: Secil 6 | """ 7 | 8 | import numpy as np 9 | import scipy.sparse as sp 10 | 11 | def normalize_adj(adj): 12 | adj = sp.coo_matrix(adj) 13 | rowsum = np.array(adj.sum(1)) 14 | d_inv_sqrt = np.power(rowsum, -0.5).flatten() 15 | d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0. 
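# rowsum holds the node degrees; the two lines below build D^{-1/2} as a sparse diagonal and
# return the symmetrically normalized adjacency D^{-1/2} A D^{-1/2} (A is treated as symmetric,
# so the transpose inside the chained dot products does not change the result).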
16 | d_mat_inv_sqrt = sp.diags(d_inv_sqrt) 17 | return adj.dot(d_mat_inv_sqrt).transpose().dot(d_mat_inv_sqrt).tocoo() 18 | 19 | A = [[0, 1, 2], 20 | [1, 0, 4], 21 | [2, 4, 0]] 22 | 23 | A2 = [[1, 1, 2], 24 | [1, 1, 4], 25 | [2, 4, 1]] 26 | 27 | L = normalize_adj(A) 28 | 29 | LE = normalize_adj(A2) -------------------------------------------------------------------------------- /DGI/models/dgi.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from DGI.layers import GCN, AvgReadout, Discriminator 4 | 5 | class DGI(nn.Module): 6 | def __init__(self, n_in, n_h, activation): 7 | super(DGI, self).__init__() 8 | self.gcn = GCN(n_in, n_h, activation) 9 | self.read = AvgReadout() 10 | 11 | self.sigm = nn.Sigmoid() 12 | 13 | self.disc = Discriminator(n_h) 14 | 15 | def forward(self, seq1, seq2, adj, sparse, msk, samp_bias1, samp_bias2): 16 | h_1 = self.gcn(seq1, adj, sparse) 17 | 18 | c = self.read(h_1, msk) 19 | c = self.sigm(c) 20 | 21 | h_2 = self.gcn(seq2, adj, sparse) 22 | 23 | ret = self.disc(c, h_1, h_2, samp_bias1, samp_bias2) 24 | 25 | return ret 26 | 27 | # Detach the return variables 28 | def embed(self, seq, adj, sparse, msk): 29 | h_1 = self.gcn(seq, adj, sparse) 30 | c = self.read(h_1, msk) 31 | 32 | return h_1.detach(), c.detach() 33 | 34 | -------------------------------------------------------------------------------- /DGI/layers/discriminator.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | class Discriminator(nn.Module): 5 | def __init__(self, n_h): 6 | super(Discriminator, self).__init__() 7 | self.f_k = nn.Bilinear(n_h, n_h, 1) 8 | 9 | for m in self.modules(): 10 | self.weights_init(m) 11 | 12 | def weights_init(self, m): 13 | if isinstance(m, nn.Bilinear): 14 | torch.nn.init.xavier_uniform_(m.weight.data) 15 | if m.bias is not None: 16 | m.bias.data.fill_(0.0) 17 | 18 | def forward(self, c, h_pl, h_mi, s_bias1=None, s_bias2=None): 19 | c_x = torch.unsqueeze(c, 1) 20 | c_x = c_x.expand_as(h_pl) 21 | 22 | sc_1 = torch.squeeze(self.f_k(h_pl, c_x), 2) 23 | sc_2 = torch.squeeze(self.f_k(h_mi, c_x), 2) 24 | 25 | if s_bias1 is not None: 26 | sc_1 += s_bias1 27 | if s_bias2 is not None: 28 | sc_2 += s_bias2 29 | 30 | logits = torch.cat((sc_1, sc_2), 1) 31 | 32 | return logits 33 | 34 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SiGraC 2 | 3 | Requirements 4 | 5 | Python 3.6 6 | 7 | pytorch 8 | 9 | networkx 10 | 11 | pandas 12 | 13 | scipy 14 | 15 | scikit-learn 16 | 17 | numpy 18 | 19 | ... 20 | 21 | PS: I got a few questions about the torch version. Pyton 3.6.10 torch: 1.9.0 + cpu 22 | 23 | Run 24 | 25 | main.py 26 | 27 | Change parser.add_argument('--embTech', choices=[ 'DGI', 'CN', 'AA', 28 | 29 | ], default='HDI', help='The embedding learning method') 30 | For various convolution matrix option change HDI to RA, HPI,etc. 
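For example, the change described above would look roughly like this in main.py (a sketch only: the exact choices list shipped with the repository may differ; 'RA' and 'HPI' are the option names mentioned here):

parser.add_argument('--embTech', choices=['DGI', 'CN', 'AA', 'RA', 'HPI', 'HDI'],
                    default='RA', help='The embedding learning method')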
31 | 32 | For any question email me via coskunmustafa@ankara.edu.tr 33 | 34 | If you find this code useful, please cite: 35 | 36 | 37 | @article{cocskun2021node, 38 | title={Node similarity-based graph convolution for link prediction in biological networks}, 39 | author={Co{\c{s}}kun, Mustafa and Koyut{\"u}rk, Mehmet}, 40 | journal={Bioinformatics}, 41 | volume={37}, 42 | number={23}, 43 | pages={4501--4508}, 44 | year={2021}, 45 | publisher={Oxford University Press} 46 | } 47 | 48 | 49 | PS: I got a few questions about the torch version. 50 | Pyton 3.6.10 51 | torch: 1.9.0 + cpu 52 | -------------------------------------------------------------------------------- /DGI/layers/gcn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | class GCN(nn.Module): 5 | def __init__(self, in_ft, out_ft, act, bias=True): 6 | super(GCN, self).__init__() 7 | self.fc = nn.Linear(in_ft, out_ft, bias=False) 8 | self.act = nn.PReLU() if act == 'prelu' else act 9 | 10 | if bias: 11 | self.bias = nn.Parameter(torch.FloatTensor(out_ft)) 12 | self.bias.data.fill_(0.0) 13 | else: 14 | self.register_parameter('bias', None) 15 | 16 | for m in self.modules(): 17 | self.weights_init(m) 18 | 19 | def weights_init(self, m): 20 | if isinstance(m, nn.Linear): 21 | torch.nn.init.xavier_uniform_(m.weight.data) 22 | if m.bias is not None: 23 | m.bias.data.fill_(0.0) 24 | 25 | # Shape of seq: (batch, nodes, features) 26 | def forward(self, seq, adj, sparse=False): 27 | seq_fts = self.fc(seq) 28 | if sparse: 29 | out = torch.unsqueeze(torch.spmm(adj, torch.squeeze(seq_fts, 0)), 0) 30 | else: 31 | out = torch.bmm(adj, seq_fts) 32 | if self.bias is not None: 33 | out += self.bias 34 | 35 | return self.act(out) 36 | 37 | -------------------------------------------------------------------------------- /OpenNE/node2vec.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from gensim.models import Word2Vec 4 | 5 | from OpenNE import walker 6 | 7 | 8 | class Node2vec(object): 9 | 10 | def __init__(self, graph, path_length, num_paths, dim, p=1.0, q=1.0, dw=False, **kwargs): 11 | 12 | kwargs["workers"] = kwargs.get("workers", 1) 13 | if dw: 14 | kwargs["hs"] = 1 15 | p = 1.0 16 | q = 1.0 17 | 18 | self.graph = graph 19 | if dw: 20 | self.walker = walker.BasicWalker(graph, workers=kwargs["workers"]) 21 | else: 22 | self.walker = walker.Walker( 23 | graph, p=p, q=q, workers=kwargs["workers"]) 24 | print("Preprocess transition probs...") 25 | self.walker.preprocess_transition_probs() 26 | sentences = self.walker.simulate_walks( 27 | num_walks=num_paths, walk_length=path_length) 28 | kwargs["sentences"] = sentences 29 | kwargs["min_count"] = kwargs.get("min_count", 0) 30 | kwargs["size"] = kwargs.get("size", dim) 31 | kwargs["sg"] = 1 32 | 33 | self.size = kwargs["size"] 34 | print("Learning representation...") 35 | word2vec = Word2Vec(**kwargs) 36 | self.vectors = {} 37 | for word in graph.G.nodes(): 38 | self.vectors[word] = word2vec.wv[word] 39 | del word2vec 40 | 41 | def save_embeddings(self, filename): 42 | fout = open(filename, 'w') 43 | node_num = len(self.vectors.keys()) 44 | fout.write("{} {}\n".format(node_num, self.size)) 45 | for node, vec in self.vectors.items(): 46 | fout.write("{} {}\n".format(node, 47 | ' '.join([str(x) for x in vec]))) 48 | fout.close() 49 | -------------------------------------------------------------------------------- /GAE/optimizer.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import tensorflow as tf 4 | 5 | 6 | class OptimizerAE(object): 7 | def __init__(self, preds, labels, pos_weight, norm, learning_rate): 8 | preds_sub = preds 9 | labels_sub = labels 10 | 11 | self.cost = norm * tf.reduce_mean( 12 | tf.nn.weighted_cross_entropy_with_logits(logits=preds_sub, targets=labels_sub, pos_weight=pos_weight)) 13 | self.optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate) # Adam Optimizer 14 | 15 | self.opt_op = self.optimizer.minimize(self.cost) 16 | self.grads_vars = self.optimizer.compute_gradients(self.cost) 17 | 18 | self.correct_prediction = tf.equal(tf.cast(tf.greater_equal(tf.sigmoid(preds_sub), 0.5), tf.int32), 19 | tf.cast(labels_sub, tf.int32)) 20 | self.accuracy = tf.reduce_mean(tf.cast(self.correct_prediction, tf.float32)) 21 | 22 | 23 | class OptimizerVAE(object): 24 | def __init__(self, preds, labels, model, num_nodes, pos_weight, norm, learning_rate): 25 | preds_sub = preds 26 | labels_sub = labels 27 | 28 | self.cost = norm * tf.reduce_mean( 29 | tf.nn.weighted_cross_entropy_with_logits(logits=preds_sub, targets=labels_sub, pos_weight=pos_weight)) 30 | self.optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate) # Adam Optimizer 31 | 32 | # Latent loss 33 | self.log_lik = self.cost 34 | self.kl = (0.5 / num_nodes) * tf.reduce_mean(tf.reduce_sum(1 + 2 * model.z_log_std - tf.square(model.z_mean) - 35 | tf.square(tf.exp(model.z_log_std)), 1)) 36 | self.cost -= self.kl 37 | 38 | self.opt_op = self.optimizer.minimize(self.cost) 39 | self.grads_vars = self.optimizer.compute_gradients(self.cost) 40 | 41 | self.correct_prediction = tf.equal(tf.cast(tf.greater_equal(tf.sigmoid(preds_sub), 0.5), tf.int32), 42 | tf.cast(labels_sub, tf.int32)) 43 | self.accuracy = tf.reduce_mean(tf.cast(self.correct_prediction, tf.float32)) 44 | -------------------------------------------------------------------------------- /OpenNE/lap.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import networkx as nx 4 | import numpy as np 5 | from scipy.sparse.linalg import eigsh 6 | 7 | __author__ = "Wang Binlu" 8 | __email__ = "wblmail@whu.edu.cn" 9 | 10 | 11 | class LaplacianEigenmaps(object): 12 | def __init__(self, graph, rep_size=128): 13 | self.g = graph 14 | self.node_size = self.g.G.number_of_nodes() 15 | self.rep_size = rep_size 16 | self.adj_mat = nx.to_numpy_array(self.g.G) 17 | self.vectors = {} 18 | self.embeddings = self.get_train() 19 | look_back = self.g.look_back_list 20 | 21 | for i, embedding in enumerate(self.embeddings): 22 | self.vectors[look_back[i]] = embedding 23 | 24 | def getAdj(self): 25 | node_size = self.g.node_size 26 | look_up = self.g.look_up_dict 27 | adj = np.zeros((node_size, node_size)) 28 | for edge in self.g.G.edges(): 29 | adj[look_up[edge[0]]][look_up[edge[1]]] = self.g.G[edge[0]][edge[1]]['weight'] 30 | return adj 31 | 32 | def getLap(self): 33 | # degree_mat = np.diagflat(np.sum(self.adj_mat, axis=1)) 34 | # print('np.diagflat(np.sum(self.adj_mat, axis=1))') 35 | # deg_trans = np.diagflat(np.reciprocal(np.sqrt(np.sum(self.adj_mat, axis=1)))) 36 | # print('np.diagflat(np.reciprocal(np.sqrt(np.sum(self.adj_mat, axis=1))))') 37 | # deg_trans = np.nan_to_num(deg_trans) 38 | # L = degree_mat-self.adj_mat 39 | # print('begin norm_lap_mat') 40 | # # eye = np.eye(self.node_size) 41 | # 42 | # norm_lap_mat = np.matmul(np.matmul(deg_trans, L), 
deg_trans) 43 | G = self.g.G.to_undirected() 44 | print('begin norm_lap_mat') 45 | norm_lap_mat = nx.normalized_laplacian_matrix(G) 46 | print('finish norm_lap_mat') 47 | return norm_lap_mat 48 | 49 | def get_train(self): 50 | lap_mat = self.getLap() 51 | print('finish getLap...') 52 | w, vec = eigsh(lap_mat, k=self.rep_size) 53 | print('finish eigh(lap_mat)...') 54 | # start = 0 55 | # for i in range(self.node_size): 56 | # if w[i] > 1e-10: 57 | # start = i 58 | # break 59 | # vec = vec[:, start:start+self.rep_size] 60 | 61 | return vec 62 | 63 | def save_embeddings(self, filename): 64 | fout = open(filename, 'w') 65 | node_num = len(self.vectors) 66 | fout.write("{} {}\n".format(node_num, self.rep_size)) 67 | for node, vec in self.vectors.items(): 68 | fout.write("{} {}\n".format(node, ' '.join([str(x) for x in vec]))) 69 | fout.close() 70 | -------------------------------------------------------------------------------- /OpenNE/gf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy as np 4 | import tensorflow as tf 5 | 6 | __author__ = "Wang Binlu" 7 | __email__ = "wblmail@whu.edu.cn" 8 | 9 | 10 | class GraphFactorization(object): 11 | def __init__(self, graph, rep_size=128, epoch=120, learning_rate=0.003, weight_decay=1.): 12 | self.g = graph 13 | 14 | self.node_size = graph.G.number_of_nodes() 15 | self.rep_size = rep_size 16 | self.max_iter = epoch 17 | self.lr = learning_rate 18 | self.lamb = weight_decay 19 | self.sess = tf.Session() 20 | self.adj_mat = self.getAdj() 21 | self.vectors = {} 22 | 23 | self.embeddings = self.get_train() 24 | 25 | look_back = self.g.look_back_list 26 | 27 | for i, embedding in enumerate(self.embeddings): 28 | self.vectors[look_back[i]] = embedding 29 | 30 | def getAdj(self): 31 | node_size = self.g.node_size 32 | look_up = self.g.look_up_dict 33 | adj = np.zeros((node_size, node_size)) 34 | for edge in self.g.G.edges(): 35 | adj[look_up[edge[0]]][look_up[edge[1]]] = self.g.G[edge[0]][edge[1]]['weight'] 36 | return adj 37 | 38 | def get_train(self): 39 | 40 | adj_mat = self.adj_mat 41 | 42 | mat_mask = 1. 
* (adj_mat > 0) 43 | 44 | _embeddings = tf.Variable(tf.contrib.layers.xavier_initializer()([self.node_size, self.rep_size]), 45 | dtype=tf.float32, name='embeddings') 46 | 47 | Adj = tf.placeholder(tf.float32, [self.node_size, self.node_size], name='adj_mat') 48 | AdjMask = tf.placeholder(tf.float32, [self.node_size, self.node_size], name='adj_mask') 49 | 50 | cost = tf.reduce_sum( 51 | tf.square(Adj - tf.matmul(_embeddings, tf.transpose(_embeddings)) * AdjMask)) + \ 52 | self.lamb * tf.reduce_sum(tf.square(_embeddings)) 53 | 54 | optimizer = tf.train.AdamOptimizer(self.lr) 55 | train_op = optimizer.minimize(cost) 56 | 57 | init = tf.global_variables_initializer() 58 | self.sess.run(init) 59 | 60 | print("total iter: %i" % self.max_iter) 61 | for step in range(self.max_iter): 62 | self.sess.run(train_op, feed_dict={Adj: adj_mat, AdjMask: mat_mask}) 63 | if step % 50 == 0: 64 | print("step %i: cost: %g" % (step, self.sess.run(cost, feed_dict={Adj: adj_mat, AdjMask: mat_mask}))) 65 | return self.sess.run(_embeddings) 66 | 67 | def save_embeddings(self, filename): 68 | fout = open(filename, 'w') 69 | node_num = len(self.vectors) 70 | fout.write("{} {}\n".format(node_num, self.rep_size)) 71 | for node, vec in self.vectors.items(): 72 | fout.write("{} {}\n".format(node, ' '.join([str(x) for x in vec]))) 73 | fout.close() 74 | -------------------------------------------------------------------------------- /OpenNE/L3Hope.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Apr 9 20:26:32 2020 4 | 5 | @author: Secil 6 | """ 7 | 8 | # -*- coding: utf-8 -*- 9 | 10 | import networkx as nx 11 | import numpy as np 12 | import scipy.sparse.linalg as lg 13 | 14 | __author__ = "Alan WANG" 15 | __email__ = "alan1995wang@outlook.com" 16 | 17 | import scipy.sparse as sp 18 | class HOPE(object): 19 | def __init__(self, graph, d): 20 | ''' 21 | d: representation vector dimension 22 | ''' 23 | self._d = d 24 | self._graph = graph.G 25 | self.g = graph 26 | self._node_num = graph.node_size 27 | self.learn_embedding() 28 | 29 | def calc_A_hat(adj_matrix): 30 | nnodes = adj_matrix.shape[0] 31 | mu = 0.95 32 | eta = 1e-6 33 | A = adj_matrix# + sp.eye(nnodes) 34 | D_vec = np.sum(A, axis=1) 35 | D_vec_invsqrt_corr = 1 / np.sqrt(D_vec) 36 | D_invsqrt_corr = sp.diags(D_vec_invsqrt_corr) 37 | return mu*D_invsqrt_corr @ A @ D_invsqrt_corr + (1-mu)*sp.eye(nnodes) + eta*sp.eye(nnodes) 38 | 39 | 40 | def learn_embedding(self): 41 | 42 | #graph = self.g.G 43 | graph = self.g.G.to_undirected() 44 | A = nx.to_numpy_matrix(graph) 45 | mu = 0.1; 46 | eta = 1e-6 47 | 48 | norm_lap_mat = nx.laplacian_matrix(graph) 49 | 50 | A = mu*norm_lap_mat + (1-mu)*np.eye(graph.number_of_nodes()) + eta*np.eye(graph.number_of_nodes()) 51 | #A = norm_lap_mat 52 | # self._beta = 0.0728 53 | 54 | # M_g = np.eye(graph.number_of_nodes()) - self._beta * A 55 | # M_l = self._beta * A 56 | print("dimension = ", self._d) 57 | print("PPR") 58 | M_g = np.eye(graph.number_of_nodes()) 59 | M_l = np.dot(A, A) 60 | 61 | S = np.dot(np.linalg.inv(M_g), M_l) 62 | # s: \sigma_k 63 | u, s, vt = lg.svds(S, k=self._d // 2) 64 | sigma = np.diagflat(np.sqrt(s)) 65 | X1 = np.dot(u, sigma) 66 | X2 = np.dot(vt.T, sigma) 67 | # self._X = X2 68 | self._X = np.concatenate((X1, X2), axis=1) 69 | 70 | @property 71 | def vectors(self): 72 | vectors = {} 73 | look_back = self.g.look_back_list 74 | for i, embedding in enumerate(self._X): 75 | vectors[look_back[i]] = embedding 76 | return 
vectors 77 | 78 | def save_embeddings(self, filename): 79 | fout = open(filename, 'w') 80 | node_num = len(self.vectors.keys()) 81 | fout.write("{} {}\n".format(node_num, self._d)) 82 | for node, vec in self.vectors.items(): 83 | fout.write("{} {}\n".format(node, 84 | ' '.join([str(x) for x in vec]))) 85 | fout.close() 86 | -------------------------------------------------------------------------------- /OpenNE/grarep.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy as np 4 | from scipy.sparse.linalg import svds 5 | from sklearn.preprocessing import normalize 6 | 7 | 8 | class GraRep(object): 9 | 10 | def __init__(self, graph, Kstep, dim): 11 | self.g = graph 12 | self.Kstep = Kstep 13 | assert dim % Kstep == 0 14 | self.dim = int(dim / Kstep) 15 | self.train() 16 | 17 | def getAdjMat(self): 18 | graph = self.g.G 19 | node_size = self.g.node_size 20 | look_up = self.g.look_up_dict 21 | adj = np.zeros((node_size, node_size)) 22 | for edge in self.g.G.edges(): 23 | adj[look_up[edge[0]]][look_up[edge[1]]] = 1.0 24 | adj[look_up[edge[1]]][look_up[edge[0]]] = 1.0 25 | # ScaleSimMat 26 | # print('finish getAdjMat') 27 | return np.matrix(adj) 28 | 29 | def GetProbTranMat(self, Ak): 30 | # print(np.sum(Ak, axis=0)) 31 | tileMat = np.tile(np.sum(Ak, axis=0), (self.node_size, 1)) 32 | # print(np.min(tileMat)) 33 | probTranMat = np.log(Ak / tileMat) - np.log(1.0 / self.node_size) 34 | probTranMat[probTranMat < 0] = 0 35 | probTranMat[probTranMat == np.nan] = 0 36 | return probTranMat 37 | 38 | def GetRepUseSVD(self, probTranMat, alpha): 39 | # U, S, VT = la.svd(probTranMat) 40 | 41 | U, Sigma, VT = svds(probTranMat, self.dim) 42 | # print("finish svd..") 43 | Sigma = np.diag(Sigma) 44 | W = np.matmul(U, np.power(Sigma, alpha)) 45 | C = np.matmul(VT.T, np.power(Sigma, alpha)) 46 | # print(np.sum(U)) 47 | embeddings = W + C 48 | return embeddings 49 | # Ud = U[:, 0:self.dim] 50 | # Sd = S[0:self.dim] 51 | # return np.array(Ud)*np.power(Sd, alpha).reshape((self.dim)) 52 | 53 | def save_embeddings(self, filename): 54 | fout = open(filename, 'w') 55 | node_num = len(self.vectors.keys()) 56 | fout.write("{} {}\n".format(node_num, self.Kstep * self.dim)) 57 | for node, vec in self.vectors.items(): 58 | fout.write("{} {}\n".format(node, ' '.join([str(x) for x in vec]))) 59 | fout.close() 60 | 61 | def train(self): 62 | self.adj = self.getAdjMat() 63 | self.node_size = self.adj.shape[0] 64 | self.Ak = np.matrix(np.identity(self.node_size)) 65 | self.RepMat = np.zeros((self.node_size, int(self.dim * self.Kstep))) 66 | for i in range(self.Kstep): 67 | print('Kstep =', i) 68 | self.Ak = np.dot(self.Ak, self.adj) 69 | # print('finish np.dot(self.Ak, self.adj)') 70 | probTranMat = self.GetProbTranMat(self.Ak) 71 | # print('finish GetProbTranMat') 72 | Rk = self.GetRepUseSVD(probTranMat, 0.5) 73 | # print('finish GetRepUseSVD') 74 | Rk = normalize(Rk, axis=1, norm='l2') 75 | # print('finish normalize') 76 | self.RepMat[:, self.dim * i:self.dim * (i + 1)] = Rk[:, :] 77 | # print('finish RepMat[:, self.dim*i:self.dim*(i+1)] = Rk[:, :]') 78 | # get embeddings 79 | self.vectors = {} 80 | look_back = self.g.look_back_list 81 | for i, embedding in enumerate(self.RepMat): 82 | self.vectors[look_back[i]] = embedding 83 | -------------------------------------------------------------------------------- /OpenNE/classify.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 
import numpy 4 | from sklearn.metrics import f1_score 5 | from sklearn.multiclass import OneVsRestClassifier 6 | from sklearn.preprocessing import MultiLabelBinarizer 7 | 8 | 9 | class TopKRanker(OneVsRestClassifier): 10 | def predict(self, X, top_k_list): 11 | probs = numpy.asarray(super(TopKRanker, self).predict_proba(X)) 12 | all_labels = [] 13 | for i, k in enumerate(top_k_list): 14 | probs_ = probs[i, :] 15 | labels = self.classes_[probs_.argsort()[-k:]].tolist() 16 | probs_[:] = 0 17 | probs_[labels] = 1 18 | all_labels.append(probs_) 19 | return numpy.asarray(all_labels) 20 | 21 | 22 | class Classifier(object): 23 | 24 | def __init__(self, vectors, clf): 25 | self.embeddings = vectors 26 | self.clf = TopKRanker(clf) 27 | self.binarizer = MultiLabelBinarizer(sparse_output=True) 28 | 29 | def train(self, X, Y, Y_all): 30 | self.binarizer.fit(Y_all) 31 | X_train = [self.embeddings[x] for x in X] 32 | Y = self.binarizer.transform(Y) 33 | self.clf.fit(X_train, Y) 34 | 35 | def evaluate(self, X, Y): 36 | top_k_list = [len(l) for l in Y] 37 | Y_ = self.predict(X, top_k_list) 38 | Y = self.binarizer.transform(Y) 39 | averages = ["micro", "macro", "samples", "weighted"] 40 | results = {} 41 | for average in averages: 42 | results[average] = f1_score(Y, Y_, average=average) 43 | # print('Results, using embeddings of dimensionality', len(self.embeddings[X[0]])) 44 | # print('-------------------') 45 | print(results) 46 | return results 47 | # print('-------------------') 48 | 49 | def predict(self, X, top_k_list): 50 | X_ = numpy.asarray([self.embeddings[x] for x in X]) 51 | Y = self.clf.predict(X_, top_k_list=top_k_list) 52 | return Y 53 | 54 | def split_train_evaluate(self, X, Y, train_precent, seed=0): 55 | state = numpy.random.get_state() 56 | 57 | training_size = int(train_precent * len(X)) 58 | numpy.random.seed(seed) 59 | shuffle_indices = numpy.random.permutation(numpy.arange(len(X))) 60 | X_train = [X[shuffle_indices[i]] for i in range(training_size)] 61 | Y_train = [Y[shuffle_indices[i]] for i in range(training_size)] 62 | X_test = [X[shuffle_indices[i]] for i in range(training_size, len(X))] 63 | Y_test = [Y[shuffle_indices[i]] for i in range(training_size, len(X))] 64 | 65 | self.train(X_train, Y_train, Y) 66 | numpy.random.set_state(state) 67 | return self.evaluate(X_test, Y_test) 68 | 69 | 70 | def load_embeddings(filename): 71 | fin = open(filename, 'r') 72 | node_num, size = [int(x) for x in fin.readline().strip().split()] 73 | vectors = {} 74 | while 1: 75 | l = fin.readline() 76 | if l == '': 77 | break 78 | vec = l.strip().split(' ') 79 | assert len(vec) == size + 1 80 | vectors[vec[0]] = [float(x) for x in vec[1:]] 81 | fin.close() 82 | assert len(vectors) == node_num 83 | return vectors 84 | 85 | 86 | def read_node_label(filename): 87 | fin = open(filename, 'r') 88 | X = [] 89 | Y = [] 90 | while 1: 91 | l = fin.readline() 92 | if l == '': 93 | break 94 | vec = l.strip().split(' ') 95 | X.append(vec[0]) 96 | Y.append(vec[1:]) 97 | fin.close() 98 | return X, Y 99 | -------------------------------------------------------------------------------- /OpenNE/RWR.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Apr 7 22:36:32 2020 4 | 5 | @author: Secil 6 | """ 7 | 8 | # -*- coding: utf-8 -*- 9 | 10 | import networkx as nx 11 | import numpy as np 12 | from scipy.sparse.linalg import eigsh 13 | import scipy.sparse as sp 14 | import scipy.io as sio 15 | 16 | import scipy.sparse.linalg as 
slinalg 17 | import scipy.linalg as linalg 18 | import scipy.sparse as sp 19 | 20 | __author__ = "Mustafa Coskun" 21 | __email__ = "mxc522@case.edu" 22 | 23 | 24 | class RWR(object): 25 | def __init__(self, graph, rep_size=100): 26 | self.g = graph 27 | self.node_size = self.g.G.number_of_nodes() 28 | self.rep_size = rep_size 29 | adj_mat = nx.to_numpy_array(self.g.G) 30 | self.adj_mat = adj_mat 31 | self.vectors = {} 32 | self.embeddings = self.get_train() 33 | look_back = self.g.look_back_list 34 | 35 | for i, embedding in enumerate(self.embeddings): 36 | self.vectors[look_back[i]] = embedding 37 | 38 | def getAdj(self): 39 | node_size = self.g.node_size 40 | look_up = self.g.look_up_dict 41 | adj = np.zeros((node_size, node_size)) 42 | for edge in self.g.G.edges(): 43 | adj[look_up[edge[0]]][look_up[edge[1]]] = self.g.G[edge[0]][edge[1]]['weight'] 44 | return adj 45 | 46 | def getLap(self): 47 | # degree_mat = np.diagflat(np.sum(self.adj_mat, axis=1)) 48 | # print('np.diagflat(np.sum(self.adj_mat, axis=1))') 49 | # deg_trans = np.diagflat(np.reciprocal(np.sqrt(np.sum(self.adj_mat, axis=1)))) 50 | # print('np.diagflat(np.reciprocal(np.sqrt(np.sum(self.adj_mat, axis=1))))') 51 | # deg_trans = np.nan_to_num(deg_trans) 52 | # L = degree_mat-self.adj_mat 53 | # print('begin norm_lap_mat') 54 | # # eye = np.eye(self.node_size) 55 | # 56 | # norm_lap_mat = np.matmul(np.matmul(deg_trans, L), deg_trans) 57 | G = self.g.G.to_undirected() 58 | print('begin norm_lap_mat') 59 | norm_lap_mat = nx.normalized_laplacian_matrix(G) 60 | print('finish norm_lap_mat') 61 | return norm_lap_mat 62 | def calc_A_hat(self): 63 | #nnodes = adj_matrix.shape[0] 64 | A = self.adj_mat 65 | D_vec = np.sum(A, axis=1) 66 | D_vec_invsqrt_corr = 1 / np.sqrt(D_vec) 67 | D_invsqrt_corr = sp.diags(D_vec_invsqrt_corr) 68 | return D_invsqrt_corr @ A @ D_invsqrt_corr 69 | 70 | 71 | def calc_ppr_exact(self): 72 | adj_matrix = self.adj_mat 73 | nnodes = adj_matrix.shape[0] 74 | M = self.calc_A_hat() 75 | A_inner = sp.eye(nnodes) - (1 - 0.85) * M 76 | return 0.85 * np.linalg.inv(A_inner) 77 | 78 | def get_train(self): 79 | #lap_mat = self.calc_ppr_exact() 80 | 81 | mat = sio.loadmat('DDIEmb.mat') 82 | 83 | #index = mat['index2'] 84 | print('finish getLap...') 85 | vec = mat['vec'] 86 | print('finish eigh(lap_mat)...') 87 | # start = 0 88 | # for i in range(self.node_size): 89 | # if w[i] > 1e-10: 90 | # start = i 91 | # break 92 | # vec = vec[:, start:start+self.rep_size] 93 | 94 | return vec 95 | 96 | def save_embeddings(self, filename): 97 | fout = open(filename, 'w') 98 | node_num = len(self.vectors) 99 | fout.write("{} {}\n".format(node_num, self.rep_size)) 100 | for node, vec in self.vectors.items(): 101 | fout.write("{} {}\n".format(node, ' '.join([str(x) for x in vec]))) 102 | fout.close() 103 | -------------------------------------------------------------------------------- /OpenNE/graph.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """Graph utilities.""" 4 | 5 | import networkx as nx 6 | 7 | import numpy as np 8 | from scipy.io import loadmat 9 | 10 | __author__ = "Zhang Zhengyan" 11 | __email__ = "zhangzhengyan14@mails.tsinghua.edu.cn" 12 | 13 | 14 | class Graph(object): 15 | def __init__(self): 16 | self.G = None 17 | self.look_up_dict = {} 18 | self.look_back_list = [] 19 | self.node_size = 0 20 | 21 | def encode_node(self): 22 | look_up = self.look_up_dict 23 | look_back = self.look_back_list 24 | for node in self.G.nodes(): 25 | look_up[node] = 
self.node_size 26 | look_back.append(node) 27 | self.node_size += 1 28 | self.G.nodes[node]['status'] = '' 29 | 30 | def read_g(self, g): 31 | self.G = g 32 | self.encode_node() 33 | 34 | def read_adjlist(self, filename): 35 | """ Read graph from adjacency file in which the edge must be unweighted 36 | the format of each line: v1 n1 n2 n3 ... nk 37 | :param filename: the filename of input file 38 | """ 39 | self.G = nx.read_adjlist(filename, create_using=nx.DiGraph()) 40 | for i, j in self.G.edges(): 41 | self.G[i][j]['weight'] = 1.0 42 | self.encode_node() 43 | 44 | def read_edgelist(self, filename, weighted=False, directed=False): 45 | self.G = nx.DiGraph() 46 | 47 | if directed: 48 | def read_unweighted(l): 49 | src, dst = l.split() 50 | self.G.add_edge(src, dst) 51 | self.G[src][dst]['weight'] = 1.0 52 | 53 | def read_weighted(l): 54 | src, dst, w = l.split() 55 | self.G.add_edge(src, dst) 56 | self.G[src][dst]['weight'] = float(w) 57 | else: 58 | def read_unweighted(l): 59 | src, dst = l.split() 60 | self.G.add_edge(src, dst) 61 | self.G.add_edge(dst, src) 62 | self.G[src][dst]['weight'] = 1.0 63 | self.G[dst][src]['weight'] = 1.0 64 | 65 | def read_weighted(l): 66 | src, dst, w = l.split() 67 | # print(src, dst, float(w)) 68 | self.G.add_edge(src, dst) 69 | self.G.add_edge(dst, src) 70 | self.G[src][dst]['weight'] = float(w) 71 | self.G[dst][src]['weight'] = float(w) 72 | fin = open(filename, 'r') 73 | func = read_unweighted 74 | if weighted: 75 | func = read_weighted 76 | while 1: 77 | l = fin.readline() 78 | if l == '': 79 | break 80 | func(l) 81 | fin.close() 82 | self.encode_node() 83 | 84 | def readMatFile(self,filename): 85 | ne = loadmat(filename) 86 | ne = ne['adj'] 87 | G=nx.from_numpy_matrix(ne) 88 | self.G =G 89 | self.encode_node() 90 | 91 | def read_node_label(self, filename): 92 | fin = open(filename, 'r') 93 | while 1: 94 | l = fin.readline() 95 | if l == '': 96 | break 97 | vec = l.split() 98 | self.G.nodes[vec[0]]['label'] = vec[1:] 99 | fin.close() 100 | 101 | def read_node_features(self, filename): 102 | fin = open(filename, 'r') 103 | for l in fin.readlines(): 104 | vec = l.split() 105 | self.G.nodes[vec[0]]['feature'] = np.array( 106 | [float(x) for x in vec[1:]]) 107 | fin.close() 108 | 109 | def read_node_status(self, filename): 110 | fin = open(filename, 'r') 111 | while 1: 112 | l = fin.readline() 113 | if l == '': 114 | break 115 | vec = l.split() 116 | self.G.nodes[vec[0]]['status'] = vec[1] # train test valid 117 | fin.close() 118 | 119 | def read_edge_label(self, filename): 120 | fin = open(filename, 'r') 121 | while 1: 122 | l = fin.readline() 123 | if l == '': 124 | break 125 | vec = l.split() 126 | self.G[vec[0]][vec[1]]['label'] = vec[2:] 127 | fin.close() 128 | -------------------------------------------------------------------------------- /OpenNE/hope.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import networkx as nx 4 | import numpy as np 5 | import scipy.sparse.linalg as lg 6 | import scipy.io as sio 7 | #import hdf5storage as hd 8 | from scipy.sparse.linalg import svds 9 | import scipy.sparse as sp 10 | 11 | __author__ = "Alan WANG" 12 | __email__ = "alan1995wang@outlook.com" 13 | 14 | 15 | class HOPE(object): 16 | def __init__(self, graph, d): 17 | ''' 18 | d: representation vector dimension 19 | ''' 20 | self._d = d 21 | self._graph = graph.G 22 | self.g = graph 23 | self._node_num = graph.node_size 24 | self.learn_embedding() 25 | 26 | def learn_embedding(self): 27 | 
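# The variants in this method all follow the same HOPE-style recipe: build a similarity matrix
# S = M_g^{-1} M_l (the active block uses M_g = I and M_l = A.A, i.e. common neighbours; the
# commented-out blocks switch M_g / M_l to RWR, L3, Katz, or load a precomputed S), then take a
# truncated SVD of S and concatenate [U sqrt(Sigma), V sqrt(Sigma)] as the embedding self._X.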
28 | graph = self.g.G.to_undirected() 29 | A = nx.to_numpy_matrix(graph) 30 | # idSave={} 31 | # idSave['Net']=A 32 | # sio.savemat('Node2VecPPIAdj.mat',idSave) 33 | 34 | 35 | #--------------------Open for RWR --------------- 36 | # print("Page Rank") 37 | # norm_lap_mat = nx.laplacian_matrix(graph) 38 | # alpha = 0.1 39 | # 40 | # M_g = np.eye(graph.number_of_nodes())- alpha*norm_lap_mat 41 | # M_l = (1-alpha)*np.eye(graph.number_of_nodes()) 42 | 43 | #---------------------------Open this L3---------------------- 44 | # print("L3G") 45 | # norm_lap_mat = nx.laplacian_matrix(graph) 46 | # mu = 0.1; 47 | # eta = 1e-6; 48 | # M_g = mu*norm_lap_mat + (1-mu)*np.eye(graph.number_of_nodes()) + eta*np.eye(graph.number_of_nodes()) 49 | # M_l = np.eye(graph.number_of_nodes()) 50 | 51 | #----------------------Open for Katz-------------------------- 52 | # print("Katz Measure") 53 | # self._beta = 0.0728 54 | # M_g = np.eye(graph.number_of_nodes()) - self._beta * A 55 | # M_l = self._beta * A 56 | #------------------------------------------------------------- 57 | 58 | 59 | #----------------------------Open this part for CN --------------------- 60 | # 61 | M_g = np.eye(graph.number_of_nodes()) 62 | 63 | M_l = np.dot(A, A) 64 | # # ------------------------------------- 65 | S = np.dot(np.linalg.inv(M_g), M_l) 66 | # s: \sigma_k 67 | u, s, vt = lg.svds(S, k=self._d // 2) 68 | sigma = np.diagflat(np.sqrt(s)) 69 | X1 = np.dot(u, sigma) 70 | X2 = np.dot(vt.T, sigma) 71 | # self._X = X2 72 | self._X = np.concatenate((X1, X2), axis=1) 73 | #--------------------LoadTopKEmbeddings-------------------------- 74 | # print("Load Top-k Embedding") 75 | # mydata = sio.loadmat('TopKEmbedding50.mat') 76 | # self._X = mydata['Embedding'] 77 | #---------------------------------------------------------------- 78 | 79 | 80 | ###################Correlation based S matrix-------------------- 81 | # mat = hd.loadmat('S50.mat') 82 | # S = mat['S'] 83 | # u, s, vt = lg.svds(S, k=self._d // 2) 84 | # sigma = np.diagflat(np.sqrt(s)) 85 | # X1 = np.dot(u, sigma) 86 | # X2 = np.dot(vt.T, sigma) 87 | # # self._X = X2 88 | # self._X = np.concatenate((X1, X2), axis=1) 89 | 90 | ####################Direct SVD------------------------------------- 91 | # 92 | # print("LP3D SVD") 93 | # norm_lap_mat = nx.laplacian_matrix(graph) 94 | # mu = 0.9; 95 | # eta = 1e-6; 96 | # M_g = mu*norm_lap_mat + (1-mu)*np.eye(graph.number_of_nodes()) + eta*np.eye(graph.number_of_nodes()) 97 | # #M_l = np.eye(graph.number_of_nodes()) 98 | # U, Sigma, VT = svds(M_g, k=self._d) 99 | # Sigma = np.diag(Sigma) 100 | # W = np.matmul(U, np.sqrt(Sigma)) 101 | # C = np.matmul(VT.T, np.sqrt(Sigma)) 102 | # # print(np.sum(U)) 103 | # embeddings = W + C 104 | # self._X = embeddings 105 | @property 106 | def vectors(self): 107 | vectors = {} 108 | look_back = self.g.look_back_list 109 | for i, embedding in enumerate(self._X): 110 | vectors[look_back[i]] = embedding 111 | return vectors 112 | 113 | def save_embeddings(self, filename): 114 | fout = open(filename, 'w') 115 | node_num = len(self.vectors.keys()) 116 | fout.write("{} {}\n".format(node_num, self._d)) 117 | for node, vec in self.vectors.items(): 118 | fout.write("{} {}\n".format(node, 119 | ' '.join([str(x) for x in vec]))) 120 | fout.close() -------------------------------------------------------------------------------- /GAE/layers.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import tensorflow as tf 4 | 5 | from GAE.initialization 
import * 6 | 7 | # global unique layer ID dictionary for layer name assignment 8 | _LAYER_UIDS = {} 9 | 10 | 11 | def get_layer_uid(layer_name=''): 12 | """Helper function, assigns unique layer IDs 13 | """ 14 | if layer_name not in _LAYER_UIDS: 15 | _LAYER_UIDS[layer_name] = 1 16 | return 1 17 | else: 18 | _LAYER_UIDS[layer_name] += 1 19 | return _LAYER_UIDS[layer_name] 20 | 21 | 22 | def dropout_sparse(x, keep_prob, num_nonzero_elems): 23 | """Dropout for sparse tensors. Currently fails for very large sparse tensors (>1M elements) 24 | """ 25 | noise_shape = [num_nonzero_elems] 26 | random_tensor = keep_prob 27 | random_tensor += tf.random_uniform(noise_shape) 28 | dropout_mask = tf.cast(tf.floor(random_tensor), dtype=tf.bool) 29 | pre_out = tf.sparse_retain(x, dropout_mask) 30 | return pre_out * (1. / keep_prob) 31 | 32 | 33 | class Layer(object): 34 | """Base layer class. Defines basic API for all layer objects. 35 | 36 | # Properties 37 | name: String, defines the variable scope of the layer. 38 | 39 | # Methods 40 | _call(inputs): Defines computation graph of layer 41 | (i.e. takes input, returns output) 42 | __call__(inputs): Wrapper for _call() 43 | """ 44 | 45 | def __init__(self, **kwargs): 46 | allowed_kwargs = {'name', 'logging'} 47 | for kwarg in kwargs.keys(): 48 | assert kwarg in allowed_kwargs, 'Invalid keyword argument: ' + kwarg 49 | name = kwargs.get('name') 50 | if not name: 51 | layer = self.__class__.__name__.lower() 52 | name = layer + '_' + str(get_layer_uid(layer)) 53 | self.name = name 54 | self.vars = {} 55 | logging = kwargs.get('logging', False) 56 | self.logging = logging 57 | self.issparse = False 58 | 59 | def _call(self, inputs): 60 | return inputs 61 | 62 | def __call__(self, inputs): 63 | with tf.name_scope(self.name): 64 | outputs = self._call(inputs) 65 | return outputs 66 | 67 | 68 | class GraphConvolution(Layer): 69 | """Basic graph convolution layer for undirected graph without edge labels.""" 70 | 71 | def __init__(self, input_dim, output_dim, adj, dropout=0., act=tf.nn.relu, **kwargs): 72 | super(GraphConvolution, self).__init__(**kwargs) 73 | with tf.variable_scope(self.name + '_vars'): 74 | self.vars['weights'] = weight_variable_glorot(input_dim, output_dim, name="weights") 75 | self.dropout = dropout 76 | self.adj = adj 77 | self.act = act 78 | 79 | def _call(self, inputs): 80 | x = inputs 81 | x = tf.nn.dropout(x, 1 - self.dropout) 82 | x = tf.matmul(x, self.vars['weights']) 83 | x = tf.sparse_tensor_dense_matmul(self.adj, x) 84 | outputs = self.act(x) 85 | return outputs 86 | 87 | 88 | class GraphConvolutionSparse(Layer): 89 | """Graph convolution layer for sparse inputs.""" 90 | 91 | def __init__(self, input_dim, output_dim, adj, features_nonzero, dropout=0., act=tf.nn.relu, **kwargs): 92 | super(GraphConvolutionSparse, self).__init__(**kwargs) 93 | with tf.variable_scope(self.name + '_vars'): 94 | self.vars['weights'] = weight_variable_glorot(input_dim, output_dim, name="weights") 95 | self.dropout = dropout 96 | self.adj = adj 97 | self.act = act 98 | self.issparse = True 99 | self.features_nonzero = features_nonzero 100 | 101 | def _call(self, inputs): 102 | x = inputs 103 | x = dropout_sparse(x, 1 - self.dropout, self.features_nonzero) 104 | x = tf.sparse_tensor_dense_matmul(x, self.vars['weights']) 105 | x = tf.sparse_tensor_dense_matmul(self.adj, x) 106 | outputs = self.act(x) 107 | return outputs 108 | 109 | 110 | class InnerProductDecoder(Layer): 111 | """Decoder model layer for link prediction.""" 112 | 113 | def __init__(self, 
input_dim, dropout=0., act=tf.nn.sigmoid, **kwargs): 114 | super(InnerProductDecoder, self).__init__(**kwargs) 115 | self.dropout = dropout 116 | self.act = act 117 | 118 | def _call(self, inputs): 119 | inputs = tf.nn.dropout(inputs, 1 - self.dropout) 120 | x = tf.transpose(inputs) 121 | x = tf.matmul(inputs, x) 122 | x = tf.reshape(x, [-1]) 123 | outputs = self.act(x) 124 | return outputs 125 | -------------------------------------------------------------------------------- /evaluation.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from sklearn.linear_model import LogisticRegression 4 | from sklearn.metrics import accuracy_score, average_precision_score, f1_score, roc_auc_score 5 | from sklearn.multiclass import OneVsRestClassifier 6 | from sklearn.preprocessing import MultiLabelBinarizer 7 | 8 | from utils import * 9 | 10 | 11 | def LinkPrediction(embedding_look_up, original_graph, train_graph, test_pos_edges, seed): 12 | random.seed(seed) 13 | 14 | train_neg_edges = generate_neg_edges(original_graph, len(train_graph.edges()), seed) 15 | 16 | # create a auxiliary graph to ensure that testing negative edges will not used in training 17 | G_aux = copy.deepcopy(original_graph) 18 | G_aux.add_edges_from(train_neg_edges) 19 | test_neg_edges = generate_neg_edges(G_aux, len(test_pos_edges), seed) 20 | 21 | # construct X_train, y_train, X_test, y_test 22 | X_train = [] 23 | y_train = [] 24 | for edge in train_graph.edges(): 25 | node_u_emb = embedding_look_up[edge[0]] 26 | node_v_emb = embedding_look_up[edge[1]] 27 | #feature_vector = np.append(node_v_emb, node_u_emb) 28 | ####################### Perform Hadamard Product################# 29 | feature_vector = np.multiply(node_u_emb,node_v_emb) 30 | X_train.append(feature_vector) 31 | y_train.append(1) 32 | for edge in train_neg_edges: 33 | node_u_emb = embedding_look_up[edge[0]] 34 | node_v_emb = embedding_look_up[edge[1]] 35 | #feature_vector = np.append(node_v_emb, node_u_emb) 36 | feature_vector = np.multiply(node_u_emb,node_v_emb) 37 | X_train.append(feature_vector) 38 | y_train.append(0) 39 | 40 | X_test = [] 41 | y_test = [] 42 | for edge in test_pos_edges: 43 | node_u_emb = embedding_look_up[edge[0]] 44 | node_v_emb = embedding_look_up[edge[1]] 45 | #feature_vector = np.append(node_v_emb, node_u_emb) 46 | feature_vector = np.multiply(node_u_emb,node_v_emb) 47 | X_test.append(feature_vector) 48 | y_test.append(1) 49 | for edge in test_neg_edges: 50 | node_u_emb = embedding_look_up[edge[0]] 51 | node_v_emb = embedding_look_up[edge[1]] 52 | #feature_vector = np.append(node_v_emb, node_u_emb) 53 | feature_vector = np.multiply(node_u_emb,node_v_emb) 54 | X_test.append(feature_vector) 55 | y_test.append(0) 56 | 57 | # shuffle for training and testing 58 | c = list(zip(X_train, y_train)) 59 | random.shuffle(c) 60 | X_train, y_train = zip(*c) 61 | 62 | c = list(zip(X_test, y_test)) 63 | random.shuffle(c) 64 | X_test, y_test = zip(*c) 65 | 66 | X_train = np.array(X_train) 67 | y_train = np.array(y_train) 68 | 69 | X_test = np.array(X_test) 70 | y_test = np.array(y_test) 71 | 72 | clf1 = LogisticRegression(random_state=seed, solver='lbfgs') 73 | clf1.fit(X_train, y_train) 74 | y_pred_proba = clf1.predict_proba(X_test)[:, 1] 75 | y_pred = clf1.predict(X_test) 76 | auc_roc = roc_auc_score(y_test, y_pred_proba) 77 | auc_pr = average_precision_score(y_test, y_pred_proba) 78 | accuracy = accuracy_score(y_test, y_pred) 79 | f1 = f1_score(y_test, y_pred) 80 | print('#' * 9 + ' Link 
Prediction Performance ' + '#' * 9) 81 | print(f'AUC-ROC: {auc_roc:.3f}, AUC-PR: {auc_pr:.3f}, Accuracy: {accuracy:.3f}, F1: {f1:.3f}') 82 | print('#' * 50) 83 | return auc_roc, auc_pr, accuracy, f1 84 | 85 | 86 | def NodeClassification(embedding_look_up, node_list, labels, testing_ratio, seed): 87 | 88 | X_train, y_train, X_test, y_test = split_train_test_classify(embedding_look_up, node_list, labels, 89 | testing_ratio=testing_ratio,seed=seed) 90 | binarizer = MultiLabelBinarizer(sparse_output=True) 91 | y_all = np.append(y_train, y_test) 92 | binarizer.fit(y_all) 93 | y_train = binarizer.transform(y_train).todense() 94 | y_test = binarizer.transform(y_test).todense() 95 | model = OneVsRestClassifier(LogisticRegression(random_state=seed, solver='lbfgs')) 96 | model.fit(X_train, y_train) 97 | y_pred_prob = model.predict_proba(X_test) 98 | 99 | ## small trick : we assume that we know how many label to predict 100 | y_pred = get_y_pred(y_test, y_pred_prob) 101 | 102 | accuracy = accuracy_score(y_test, y_pred) 103 | micro_f1 = f1_score(y_test, y_pred, average="micro") 104 | macro_f1 = f1_score(y_test, y_pred, average="macro") 105 | 106 | print('#' * 9 + ' Node Classification Performance ' + '#' * 9) 107 | print(f'Accuracy: {accuracy:.3f}, Micro-F1: {micro_f1:.3f}, Macro-F1: {macro_f1:.3f}') 108 | print('#' * 50) 109 | return accuracy, micro_f1, macro_f1 110 | -------------------------------------------------------------------------------- /GAE/model.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import tensorflow as tf 4 | 5 | from GAE.layers import GraphConvolution, GraphConvolutionSparse, InnerProductDecoder 6 | 7 | flags = tf.app.flags 8 | FLAGS = flags.FLAGS 9 | 10 | 11 | class Model(object): 12 | def __init__(self, **kwargs): 13 | allowed_kwargs = {'name', 'logging'} 14 | for kwarg in kwargs.keys(): 15 | assert kwarg in allowed_kwargs, 'Invalid keyword argument: ' + kwarg 16 | 17 | for kwarg in kwargs.keys(): 18 | assert kwarg in allowed_kwargs, 'Invalid keyword argument: ' + kwarg 19 | name = kwargs.get('name') 20 | if not name: 21 | name = self.__class__.__name__.lower() 22 | self.name = name 23 | 24 | logging = kwargs.get('logging', False) 25 | self.logging = logging 26 | 27 | self.vars = {} 28 | 29 | def _build(self): 30 | raise NotImplementedError 31 | 32 | def build(self): 33 | """ Wrapper for _build() """ 34 | with tf.variable_scope(self.name): 35 | self._build() 36 | variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.name) 37 | self.vars = {var.name: var for var in variables} 38 | 39 | def fit(self): 40 | pass 41 | 42 | def predict(self): 43 | pass 44 | 45 | 46 | class GCNModelAE(Model): 47 | def __init__(self, placeholders, num_features, features_nonzero, hidden1, hidden2, **kwargs): 48 | super(GCNModelAE, self).__init__(**kwargs) 49 | 50 | self.inputs = placeholders['features'] 51 | self.input_dim = num_features 52 | self.features_nonzero = features_nonzero 53 | self.adj = placeholders['adj'] 54 | self.dropout = placeholders['dropout'] 55 | self.hidden_dim_1 = hidden1 56 | self.hidden_dim_2 = hidden2 57 | self.build() 58 | 59 | def _build(self): 60 | self.hidden1 = GraphConvolutionSparse(input_dim=self.input_dim, 61 | output_dim=self.hidden_dim_1, 62 | adj=self.adj, 63 | features_nonzero=self.features_nonzero, 64 | act=tf.nn.relu, 65 | dropout=self.dropout, 66 | logging=self.logging)(self.inputs) 67 | 68 | self.embeddings = GraphConvolution(input_dim=self.hidden_dim_1, 69 | 
output_dim=self.hidden_dim_2, 70 | adj=self.adj, 71 | act=lambda x: x, 72 | dropout=self.dropout, 73 | logging=self.logging)(self.hidden1) 74 | 75 | self.z_mean = self.embeddings 76 | 77 | self.reconstructions = InnerProductDecoder(input_dim=self.hidden_dim_2, 78 | act=lambda x: x, 79 | logging=self.logging)(self.embeddings) 80 | 81 | 82 | class GCNModelVAE(Model): 83 | def __init__(self, placeholders, num_features, num_nodes, features_nonzero, hidden1, hidden2, **kwargs): 84 | super(GCNModelVAE, self).__init__(**kwargs) 85 | 86 | self.inputs = placeholders['features'] 87 | self.input_dim = num_features 88 | self.features_nonzero = features_nonzero 89 | self.n_samples = num_nodes 90 | self.adj = placeholders['adj'] 91 | self.dropout = placeholders['dropout'] 92 | self.hidden_dim_1 = hidden1 93 | self.hidden_dim_2 = hidden2 94 | self.build() 95 | 96 | def _build(self): 97 | self.hidden1 = GraphConvolutionSparse(input_dim=self.input_dim, 98 | output_dim=self.hidden_dim_1, 99 | adj=self.adj, 100 | features_nonzero=self.features_nonzero, 101 | act=tf.nn.relu, 102 | dropout=self.dropout, 103 | logging=self.logging)(self.inputs) 104 | 105 | self.z_mean = GraphConvolution(input_dim=self.hidden_dim_1, 106 | output_dim=self.hidden_dim_2, 107 | adj=self.adj, 108 | act=lambda x: x, 109 | dropout=self.dropout, 110 | logging=self.logging)(self.hidden1) 111 | 112 | self.z_log_std = GraphConvolution(input_dim=self.hidden_dim_1, 113 | output_dim=self.hidden_dim_2, 114 | adj=self.adj, 115 | act=lambda x: x, 116 | dropout=self.dropout, 117 | logging=self.logging)(self.hidden1) 118 | 119 | self.z = self.z_mean + tf.random_normal([self.n_samples, self.hidden_dim_2]) * tf.exp(self.z_log_std) 120 | 121 | self.reconstructions = InnerProductDecoder(input_dim=self.hidden_dim_2, 122 | act=lambda x: x, 123 | logging=self.logging)(self.z) 124 | -------------------------------------------------------------------------------- /GAE/preprocessing.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy as np 4 | import scipy.sparse as sp 5 | 6 | 7 | def sparse_to_tuple(sparse_mx): 8 | if not sp.isspmatrix_coo(sparse_mx): 9 | sparse_mx = sparse_mx.tocoo() 10 | coords = np.vstack((sparse_mx.row, sparse_mx.col)).transpose() 11 | values = sparse_mx.data 12 | shape = sparse_mx.shape 13 | return coords, values, shape 14 | #####################################Original Code######################### 15 | # alpha = 0.1 16 | # print('@' * 70) 17 | # print('Alpha: %f' % (alpha)) 18 | # print('@' * 70) 19 | # adj = sp.coo_matrix(adj) 20 | # adj_ = adj + sp.eye(adj.shape[0]) 21 | # rowsum = np.array(adj_.sum(1)) 22 | # degree_mat_inv_sqrt = sp.diags(np.power(rowsum, -0.5).flatten()) 23 | # adj_normalized = adj_.dot(degree_mat_inv_sqrt).transpose().dot(degree_mat_inv_sqrt).tocoo() 24 | # return sparse_to_tuple(adj_normalized) 25 | ########################################################################### 26 | 27 | def preprocess_graph(adj): 28 | # alpha = 0.1 29 | # print('@' * 70) 30 | # print('Alpha with inverse: %f' % (alpha)) 31 | # print('@' * 70) 32 | # adj = sp.coo_matrix(adj) 33 | # adj_ = adj + sp.eye(adj.shape[0]) 34 | # rowsum = np.array(adj_.sum(1)) 35 | # degree_mat_inv_sqrt = sp.diags(np.power(rowsum, -0.5).flatten()) 36 | # adj_normalized = adj_.dot(degree_mat_inv_sqrt).transpose().dot(degree_mat_inv_sqrt).tocoo() 37 | # A_inner = sp.eye(adj.shape[0]) - (1 - alpha) * adj_normalized 38 | # adj_normalized = alpha * sp.linalg.inv(A_inner) 39 | 
alpha = 0.1 40 | print('@' * 70) 41 | print('Alpha: %f' % (alpha)) 42 | print('@' * 70) 43 | adj = sp.coo_matrix(adj) 44 | adj_ = adj + sp.eye(adj.shape[0]) 45 | rowsum = np.array(adj_.sum(1)) 46 | degree_mat_inv_sqrt = sp.diags(np.power(rowsum, -0.5).flatten()) 47 | adj_normalized = adj_.dot(degree_mat_inv_sqrt).transpose().dot(degree_mat_inv_sqrt).tocoo() 48 | return sparse_to_tuple(adj_normalized) 49 | 50 | 51 | def construct_feed_dict(adj_normalized, adj, features, placeholders): 52 | # construct feed dictionary 53 | feed_dict = dict() 54 | feed_dict.update({placeholders['features']: features}) 55 | feed_dict.update({placeholders['adj']: adj_normalized}) 56 | feed_dict.update({placeholders['adj_orig']: adj}) 57 | return feed_dict 58 | 59 | 60 | def mask_test_edges(adj): 61 | # Function to build test set with 10% positive links 62 | # NOTE: Splits are randomized and results might slightly deviate from reported numbers in the paper. 63 | 64 | # Remove diagonal elements 65 | adj = adj - sp.dia_matrix((adj.diagonal()[np.newaxis, :], [0]), shape=adj.shape) 66 | adj.eliminate_zeros() 67 | # Check that diag is zero: 68 | assert np.diag(adj.todense()).sum() == 0 69 | 70 | adj_triu = sp.triu(adj) 71 | adj_tuple = sparse_to_tuple(adj_triu) 72 | edges = adj_tuple[0] 73 | edges_all = sparse_to_tuple(adj)[0] 74 | num_test = int(np.floor(edges.shape[0] / 10.)) 75 | num_val = int(np.floor(edges.shape[0] / 20.)) 76 | 77 | all_edge_idx = list(range(edges.shape[0])) 78 | np.random.shuffle(all_edge_idx) 79 | val_edge_idx = all_edge_idx[:num_val] 80 | test_edge_idx = all_edge_idx[num_val:(num_val + num_test)] 81 | test_edges = edges[test_edge_idx] 82 | val_edges = edges[val_edge_idx] 83 | train_edges = np.delete(edges, np.hstack([test_edge_idx, val_edge_idx]), axis=0) 84 | 85 | def ismember(a, b, tol=5): 86 | rows_close = np.all(np.round(a - b[:, None], tol) == 0, axis=-1) 87 | return np.any(rows_close) 88 | 89 | test_edges_false = [] 90 | while len(test_edges_false) < len(test_edges): 91 | idx_i = np.random.randint(0, adj.shape[0]) 92 | idx_j = np.random.randint(0, adj.shape[0]) 93 | if idx_i == idx_j: 94 | continue 95 | if ismember([idx_i, idx_j], edges_all): 96 | continue 97 | if test_edges_false: 98 | if ismember([idx_j, idx_i], np.array(test_edges_false)): 99 | continue 100 | if ismember([idx_i, idx_j], np.array(test_edges_false)): 101 | continue 102 | test_edges_false.append([idx_i, idx_j]) 103 | 104 | val_edges_false = [] 105 | while len(val_edges_false) < len(val_edges): 106 | idx_i = np.random.randint(0, adj.shape[0]) 107 | idx_j = np.random.randint(0, adj.shape[0]) 108 | if idx_i == idx_j: 109 | continue 110 | if ismember([idx_i, idx_j], train_edges): 111 | continue 112 | if ismember([idx_j, idx_i], train_edges): 113 | continue 114 | if ismember([idx_i, idx_j], val_edges): 115 | continue 116 | if ismember([idx_j, idx_i], val_edges): 117 | continue 118 | if val_edges_false: 119 | if ismember([idx_j, idx_i], np.array(val_edges_false)): 120 | continue 121 | if ismember([idx_i, idx_j], np.array(val_edges_false)): 122 | continue 123 | val_edges_false.append([idx_i, idx_j]) 124 | 125 | assert ~ismember(test_edges_false, edges_all) 126 | assert ~ismember(val_edges_false, edges_all) 127 | assert ~ismember(val_edges, train_edges) 128 | assert ~ismember(test_edges, train_edges) 129 | assert ~ismember(val_edges, test_edges) 130 | 131 | data = np.ones(train_edges.shape[0]) 132 | 133 | # Re-build adj matrix 134 | adj_train = sp.csr_matrix((data, (train_edges[:, 0], train_edges[:, 1])), shape=adj.shape) 135 | 
adj_train = adj_train + adj_train.T 136 | 137 | # NOTE: these edge lists only contain single direction of edge! 138 | return adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false 139 | -------------------------------------------------------------------------------- /GAE/train_model.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import time 4 | 5 | import numpy as np 6 | import scipy.sparse as sp 7 | import tensorflow as tf 8 | 9 | from GAE.model import GCNModelAE, GCNModelVAE 10 | from GAE.optimizer import OptimizerAE, OptimizerVAE 11 | from GAE.preprocessing import construct_feed_dict, preprocess_graph, sparse_to_tuple 12 | #from fast_pagerank import pagerank 13 | 14 | # # Train on CPU (hide GPU) due to memory constraints 15 | # os.environ['CUDA_VISIBLE_DEVICES'] = "" 16 | 17 | 18 | class gae_model(object): 19 | def __init__(self, args): 20 | super(gae_model, self).__init__() 21 | self.learning_rate = args.lr 22 | self.epochs = args.epochs 23 | self.hidden1 = args.hidden 24 | self.hidden2 = args.dimensions 25 | self.weight_decay = args.weight_decay 26 | self.dropout = args.dropout 27 | self.model_selection = args.gae_model_selection 28 | self.model = None 29 | 30 | def save_embeddings(self, output, node_list): 31 | self.feed_dict.update({self.placeholders['dropout']: 0}) 32 | emb = self.sess.run(self.model.z_mean, feed_dict=self.feed_dict) 33 | print(emb.shape[0]) 34 | print(emb.shape[1]) 35 | fout = open(output, 'w') 36 | fout.write("{} {}\n".format(emb.shape[0], emb.shape[1])) 37 | for idx in range(emb.shape[0]): 38 | fout.write("{} {}\n".format(node_list[idx], ' '.join([str(x) for x in emb[idx, :]]))) 39 | fout.close() 40 | 41 | def train(self, adj): 42 | # Store original adjacency matrix (without diagonal entries) for later 43 | adj_orig = adj 44 | adj_orig = adj_orig - sp.dia_matrix((adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape) 45 | adj_orig.eliminate_zeros() 46 | 47 | adj_train = adj 48 | features = sp.identity(adj.shape[0]) # featureless 49 | #pr=pagerank(adj, p=0.85) 50 | 51 | 52 | # What happends if I use PageRank scores as features 53 | #features = sp.diags(pr) 54 | # Some preprocessing 55 | print("You calling this function") 56 | adj_norm = preprocess_graph(adj) 57 | print("Yes") 58 | # Define placeholders 59 | self.placeholders = { 60 | 'features': tf.sparse_placeholder(tf.float32), 61 | 'adj': tf.sparse_placeholder(tf.float32), 62 | 'adj_orig': tf.sparse_placeholder(tf.float32), 63 | 'dropout': tf.placeholder_with_default(0., shape=()) 64 | } 65 | 66 | num_nodes = adj.shape[0] 67 | features = sparse_to_tuple(features.tocoo()) 68 | num_features = features[2][1] 69 | features_nonzero = features[1].shape[0] 70 | 71 | # Create model 72 | if self.model_selection == 'gcn_ae': 73 | self.model = GCNModelAE(self.placeholders, num_features, features_nonzero, self.hidden1, self.hidden2) 74 | elif self.model_selection == 'gcn_vae': 75 | self.model = GCNModelVAE(self.placeholders, num_features, num_nodes, features_nonzero, self.hidden1, 76 | self.hidden2) 77 | 78 | pos_weight = float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum() 79 | norm = adj.shape[0] * adj.shape[0] / float((adj.shape[0] * adj.shape[0] - adj.sum()) * 2) 80 | 81 | # Optimizer 82 | with tf.name_scope('optimizer'): 83 | if self.model_selection == 'gcn_ae': 84 | opt = OptimizerAE(preds=self.model.reconstructions, 85 | labels=tf.reshape(tf.sparse_tensor_to_dense(self.placeholders['adj_orig'], 86 | 
validate_indices=False), [-1]), 87 | pos_weight=pos_weight, 88 | norm=norm, 89 | learning_rate=self.learning_rate 90 | ) 91 | elif self.model_selection == 'gcn_vae': 92 | opt = OptimizerVAE(preds=self.model.reconstructions, 93 | labels=tf.reshape(tf.sparse_tensor_to_dense(self.placeholders['adj_orig'], 94 | validate_indices=False), [-1]), 95 | model=self.model, 96 | num_nodes=num_nodes, 97 | pos_weight=pos_weight, 98 | norm=norm, 99 | learning_rate=self.learning_rate 100 | ) 101 | 102 | # Initialize session 103 | self.sess = tf.Session() 104 | self.sess.run(tf.global_variables_initializer()) 105 | 106 | adj_label = adj_train + sp.eye(adj_train.shape[0]) 107 | adj_label = sparse_to_tuple(adj_label) 108 | 109 | # Train model 110 | for epoch in range(self.epochs): 111 | t = time.time() 112 | # Construct feed dictionary 113 | self.feed_dict = construct_feed_dict(adj_norm, adj_label, features, self.placeholders) 114 | self.feed_dict.update({self.placeholders['dropout']: self.dropout}) 115 | # Run single weight update 116 | outs = self.sess.run([opt.opt_op, opt.cost, opt.accuracy], feed_dict=self.feed_dict) 117 | 118 | # Compute average loss 119 | avg_cost = outs[1] 120 | avg_accuracy = outs[2] 121 | 122 | print("Epoch:", '%04d' % (epoch + 1), "train_loss=", "{:.5f}".format(avg_cost), 123 | "train_acc=", "{:.5f}".format(avg_accuracy), 124 | "time=", "{:.5f}".format(time.time() - t)) 125 | 126 | print("Optimization Finished!") 127 | -------------------------------------------------------------------------------- /OpenNE/walker.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import random 4 | 5 | import numpy as np 6 | 7 | 8 | def deepwalk_walk_wrapper(class_instance, walk_length, start_node): 9 | class_instance.deepwalk_walk(walk_length, start_node) 10 | 11 | 12 | class BasicWalker: 13 | def __init__(self, G, workers): 14 | self.G = G.G 15 | self.node_size = G.node_size 16 | self.look_up_dict = G.look_up_dict 17 | 18 | def deepwalk_walk(self, walk_length, start_node): 19 | ''' 20 | Simulate a random walk starting from start node. 21 | ''' 22 | G = self.G 23 | look_up_dict = self.look_up_dict 24 | node_size = self.node_size 25 | 26 | walk = [start_node] 27 | 28 | while len(walk) < walk_length: 29 | cur = walk[-1] 30 | cur_nbrs = list(G.neighbors(cur)) 31 | if len(cur_nbrs) > 0: 32 | walk.append(random.choice(cur_nbrs)) 33 | else: 34 | break 35 | return walk 36 | 37 | def simulate_walks(self, num_walks, walk_length): 38 | ''' 39 | Repeatedly simulate random walks from each node. 40 | ''' 41 | G = self.G 42 | walks = [] 43 | nodes = list(G.nodes()) 44 | print('Begin random walks...') 45 | for walk_iter in range(num_walks): 46 | # pool = multiprocessing.Pool(processes = 4) 47 | # print(str(walk_iter+1), '/', str(num_walks)) 48 | random.shuffle(nodes) 49 | for node in nodes: 50 | # walks.append(pool.apply_async(deepwalk_walk_wrapper, (self, walk_length, node, ))) 51 | walks.append(self.deepwalk_walk( 52 | walk_length=walk_length, start_node=node)) 53 | # pool.close() 54 | # pool.join() 55 | # print(len(walks)) 56 | print('Walk finished...') 57 | return walks 58 | 59 | 60 | class Walker: 61 | def __init__(self, G, p, q, workers): 62 | self.G = G.G 63 | self.p = p 64 | self.q = q 65 | self.node_size = G.node_size 66 | self.look_up_dict = G.look_up_dict 67 | 68 | def node2vec_walk(self, walk_length, start_node): 69 | ''' 70 | Simulate a random walk starting from start node. 
71 | ''' 72 | G = self.G 73 | alias_nodes = self.alias_nodes 74 | alias_edges = self.alias_edges 75 | look_up_dict = self.look_up_dict 76 | node_size = self.node_size 77 | 78 | walk = [start_node] 79 | 80 | while len(walk) < walk_length: 81 | cur = walk[-1] 82 | cur_nbrs = list(G.neighbors(cur)) 83 | if len(cur_nbrs) > 0: 84 | if len(walk) == 1: 85 | walk.append( 86 | cur_nbrs[alias_draw(alias_nodes[cur][0], alias_nodes[cur][1])]) 87 | else: 88 | prev = walk[-2] 89 | pos = (prev, cur) 90 | next = cur_nbrs[alias_draw(alias_edges[pos][0], 91 | alias_edges[pos][1])] 92 | walk.append(next) 93 | else: 94 | break 95 | 96 | return walk 97 | 98 | def simulate_walks(self, num_walks, walk_length): 99 | ''' 100 | Repeatedly simulate random walks from each node. 101 | ''' 102 | G = self.G 103 | walks = [] 104 | nodes = list(G.nodes()) 105 | print('Begin random walk...') 106 | for walk_iter in range(num_walks): 107 | # print(str(walk_iter+1), '/', str(num_walks)) 108 | random.shuffle(nodes) 109 | for node in nodes: 110 | walks.append(self.node2vec_walk( 111 | walk_length=walk_length, start_node=node)) 112 | print('Walk finished...') 113 | return walks 114 | 115 | def get_alias_edge(self, src, dst): 116 | ''' 117 | Get the alias edge setup lists for a given edge. 118 | ''' 119 | G = self.G 120 | p = self.p 121 | q = self.q 122 | 123 | unnormalized_probs = [] 124 | for dst_nbr in G.neighbors(dst): 125 | if dst_nbr == src: 126 | unnormalized_probs.append(G[dst][dst_nbr]['weight'] / p) 127 | elif G.has_edge(dst_nbr, src): 128 | unnormalized_probs.append(G[dst][dst_nbr]['weight']) 129 | else: 130 | unnormalized_probs.append(G[dst][dst_nbr]['weight'] / q) 131 | norm_const = sum(unnormalized_probs) 132 | normalized_probs = [ 133 | float(u_prob) / norm_const for u_prob in unnormalized_probs] 134 | 135 | return alias_setup(normalized_probs) 136 | 137 | def preprocess_transition_probs(self): 138 | ''' 139 | Preprocessing of transition probabilities for guiding the random walks. 140 | ''' 141 | G = self.G 142 | 143 | alias_nodes = {} 144 | for node in G.nodes(): 145 | unnormalized_probs = [G[node][nbr]['weight'] 146 | for nbr in G.neighbors(node)] 147 | norm_const = sum(unnormalized_probs) 148 | normalized_probs = [ 149 | float(u_prob) / norm_const for u_prob in unnormalized_probs] 150 | alias_nodes[node] = alias_setup(normalized_probs) 151 | 152 | alias_edges = {} 153 | triads = {} 154 | 155 | look_up_dict = self.look_up_dict 156 | node_size = self.node_size 157 | for edge in G.edges(): 158 | alias_edges[edge] = self.get_alias_edge(edge[0], edge[1]) 159 | 160 | self.alias_nodes = alias_nodes 161 | self.alias_edges = alias_edges 162 | 163 | return 164 | 165 | 166 | def alias_setup(probs): 167 | ''' 168 | Compute utility lists for non-uniform sampling from discrete distributions. 
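    For example, probs = [0.5, 0.3, 0.2] yields J = [0, 0, 0] and q = [1.0, 0.9, 0.6];
    alias_draw then picks a column kk uniformly, keeps it with probability q[kk] and
    otherwise returns the alias J[kk], which reproduces the original distribution with
    O(1) work per sample after the O(K) setup done here.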
169 | Refer to https://hips.seas.harvard.edu/blog/2013/03/03/the-alias-method-efficient-sampling-with-many-discrete-outcomes/ 170 | for details 171 | ''' 172 | K = len(probs) 173 | q = np.zeros(K, dtype=np.float32) 174 | J = np.zeros(K, dtype=np.int32) 175 | 176 | smaller = [] 177 | larger = [] 178 | for kk, prob in enumerate(probs): 179 | q[kk] = K * prob 180 | if q[kk] < 1.0: 181 | smaller.append(kk) 182 | else: 183 | larger.append(kk) 184 | 185 | while len(smaller) > 0 and len(larger) > 0: 186 | small = smaller.pop() 187 | large = larger.pop() 188 | 189 | J[small] = large 190 | q[large] = q[large] + q[small] - 1.0 191 | if q[large] < 1.0: 192 | smaller.append(large) 193 | else: 194 | larger.append(large) 195 | 196 | return J, q 197 | 198 | 199 | def alias_draw(J, q): 200 | ''' 201 | Draw sample from a non-uniform discrete distribution using alias sampling. 202 | ''' 203 | K = len(J) 204 | 205 | kk = int(np.floor(np.random.rand() * K)) 206 | if np.random.rand() < q[kk]: 207 | return kk 208 | else: 209 | return J[kk] 210 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import copy 4 | import itertools 5 | import random 6 | 7 | import networkx as nx 8 | import numpy as np 9 | 10 | import OpenNE.graph as og 11 | #import struc2vec.graph as sg 12 | 13 | 14 | def read_for_OpenNE(filename, weighted=False): 15 | G = og.Graph() 16 | print("Loading training graph for learning embedding...") 17 | G.read_edgelist(filename=filename, weighted=weighted) 18 | print("Graph Loaded...") 19 | return G 20 | 21 | def read_for_OpenNE_from_mat(filename): 22 | 23 | G = og.Graph() 24 | 25 | print("Loading mat file for classification only") 26 | G.readMatFile(filename) 27 | print("Mat Graph loaded") 28 | return G 29 | 30 | 31 | def read_for_gae(filename, weighted=False): 32 | print("Loading training graph for learning embedding...") 33 | edgelist = np.loadtxt(filename, dtype='float') 34 | if weighted: 35 | edgelist = [(int(edgelist[idx, 0]), int(edgelist[idx, 1])) for idx in range(edgelist.shape[0]) if 36 | edgelist[idx, 2] > 0] 37 | else: 38 | edgelist = [(int(edgelist[idx, 0]), int(edgelist[idx, 1])) for idx in range(edgelist.shape[0])] 39 | G=nx.from_edgelist(edgelist) 40 | node_list=list(G.nodes) 41 | adj = nx.adjacency_matrix(G, nodelist=node_list) 42 | print("Graph Loaded...") 43 | return (adj,node_list) 44 | 45 | 46 | def read_for_SVD(filename, weighted=False): 47 | if weighted: 48 | G = nx.read_weighted_edgelist(filename) 49 | else: 50 | G = nx.read_edgelist(filename) 51 | return G 52 | 53 | 54 | def split_train_test_graph(input_edgelist, seed, testing_ratio, weighted=False): 55 | 56 | if (weighted): 57 | G = nx.read_weighted_edgelist(input_edgelist) 58 | else: 59 | G = nx.read_edgelist(input_edgelist) 60 | node_num1, edge_num1 = len(G.nodes), len(G.edges) 61 | print('Original Graph: nodes:', node_num1, 'edges:', edge_num1) 62 | testing_edges_num = int(len(G.edges) * testing_ratio) 63 | random.seed(seed) 64 | testing_pos_edges = random.sample(G.edges, testing_edges_num) 65 | G_train = copy.deepcopy(G) 66 | for edge in testing_pos_edges: 67 | node_u, node_v = edge 68 | if (G_train.degree(node_u) > 1 and G_train.degree(node_v) > 1): 69 | G_train.remove_edge(node_u, node_v) 70 | 71 | G_train.remove_nodes_from(nx.isolates(G_train)) 72 | node_num2, edge_num2 = len(G_train.nodes), len(G_train.edges) 73 | assert node_num1 == node_num2 74 | 
train_graph_filename = 'graph_train.edgelist' 75 | if weighted: 76 | nx.write_edgelist(G_train, train_graph_filename, data=['weight']) 77 | else: 78 | nx.write_edgelist(G_train, train_graph_filename, data=False) 79 | 80 | node_num1, edge_num1 = len(G_train.nodes), len(G_train.edges) 81 | print('Training Graph: nodes:', node_num1, 'edges:', edge_num1) 82 | # idSave={} 83 | # idSave['G']=G 84 | # import scipy.io as sio 85 | # #idSave['Label'] = labels 86 | # #idSave['Attributes'] = features 87 | # 88 | # sio.savemat('DrugBankAdj.mat',idSave) 89 | return G, G_train, testing_pos_edges, train_graph_filename 90 | 91 | 92 | def split_train_test_graphReal(input_edgelist1, input_edgelist2, seed, testing_ratio, weighted=False): 93 | 94 | if (weighted): 95 | G1 = nx.read_weighted_edgelist(input_edgelist1) 96 | G2 = nx.read_weighted_edgelist(input_edgelist2) 97 | else: 98 | G1 = nx.read_edgelist(input_edgelist1) 99 | G2 = nx.read_edgelist(input_edgelist2) 100 | node_num1, edge_num1 = len(G1.nodes), len(G1.edges) 101 | node_num2, edge_num2 = len(G2.nodes), len(G2.edges) 102 | 103 | print('Original CoExp Graph: nodes:', node_num1, 'edges:', edge_num1) 104 | print('Original Exper Graph: nodes:', node_num2, 'edges:', edge_num2) 105 | testing_edges_num = int(len(G2.edges)) 106 | random.seed(seed) 107 | testing_pos_edges = G2.edges 108 | G_train = copy.deepcopy(G1) 109 | overlapCount = 0 110 | for edge in testing_pos_edges: 111 | node_u, node_v = edge 112 | if(G_train.has_edge(node_u,node_v)): 113 | overlapCount = overlapCount+1 114 | print("Number of edge Overlap: ", overlapCount) 115 | 116 | 117 | def generate_neg_edges(original_graph, testing_edges_num, seed): 118 | L = list(original_graph.nodes()) 119 | 120 | # create a complete graph 121 | G = nx.Graph() 122 | G.add_nodes_from(L) 123 | G.add_edges_from(itertools.combinations(L, 2)) 124 | # remove original edges 125 | G.remove_edges_from(original_graph.edges()) 126 | random.seed(seed) 127 | neg_edges = random.sample(G.edges, testing_edges_num) 128 | return neg_edges 129 | 130 | 131 | def load_embedding(embedding_file_name, node_list=None): 132 | with open(embedding_file_name) as f: 133 | node_num, emb_size = f.readline().split() 134 | print('Nodes with embedding: %s'%node_num) 135 | embedding_look_up = {} 136 | if node_list: 137 | for line in f: 138 | vec = line.strip().split() 139 | node_id = vec[0] 140 | if (node_id in node_list): 141 | emb = [float(x) for x in vec[1:]] 142 | emb = emb / np.linalg.norm(emb) 143 | emb[np.isnan(emb)] = 0 144 | embedding_look_up[node_id] = np.array(emb) 145 | 146 | # if len(node_list) != len(embedding_look_up): 147 | # diff_nodes=set(node_list).difference(set(embedding_look_up.keys())) 148 | # for node in diff_nodes: 149 | # emb = np.random.random((int(emb_size))) 150 | # emb = emb / np.linalg.norm(emb) 151 | # emb[np.isnan(emb)] = 0 152 | # embedding_look_up[node] = np.array(emb) 153 | 154 | assert len(node_list) == len(embedding_look_up) 155 | else: 156 | for line in f: 157 | vec = line.strip().split() 158 | node_id = vec[0] 159 | embeddings = vec[1:] 160 | emb = [float(x) for x in embeddings] 161 | emb = emb / np.linalg.norm(emb) 162 | emb[np.isnan(emb)] = 0 163 | embedding_look_up[node_id] = list(emb) 164 | assert int(node_num) == len(embedding_look_up) 165 | f.close() 166 | return embedding_look_up 167 | 168 | 169 | def read_node_labels(filename): 170 | fin = open(filename, 'r') 171 | node_list = [] 172 | labels = [] 173 | while 1: 174 | l = fin.readline() 175 | if l == '': 176 | break 177 | vec = l.strip().split() 
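        # Each line of the label file is "<node_id> <label_1> [<label_2> ...]":
        # the first token is kept as the node id and the remaining tokens as that
        # node's (possibly multi-label) label list, which is binarized later for
        # node classification.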
178 | node_list.append(vec[0]) 179 | labels.append(vec[1:]) 180 | fin.close() 181 | print('Nodes with labels: %s'%len(node_list)) 182 | return node_list, labels 183 | 184 | 185 | def split_train_test_classify(embedding_look_up, X, Y, seed, testing_ratio=0.5): 186 | state = np.random.get_state() 187 | training_ratio = 1 - testing_ratio 188 | training_size = int(training_ratio * len(X)) 189 | np.random.seed(seed) 190 | shuffle_indices = np.random.permutation(np.arange(len(X))) 191 | X_train = [embedding_look_up[X[shuffle_indices[i]]] for i in range(training_size)] 192 | Y_train = [Y[shuffle_indices[i]] for i in range(training_size)] 193 | X_test = [embedding_look_up[X[shuffle_indices[i]]] for i in range(training_size, len(X))] 194 | Y_test = [Y[shuffle_indices[i]] for i in range(training_size, len(X))] 195 | 196 | X_train = np.array(X_train) 197 | Y_train = np.array(Y_train) 198 | X_test = np.array(X_test) 199 | Y_test = np.array(Y_test) 200 | 201 | np.random.set_state(state) 202 | return X_train, Y_train, X_test, Y_test 203 | 204 | 205 | def get_y_pred(y_test, y_pred_prob): 206 | y_pred = np.zeros(y_pred_prob.shape) 207 | sort_index = np.flip(np.argsort(y_pred_prob, axis=1), 1) 208 | for i in range(y_test.shape[0]): 209 | num = np.sum(y_test[i]) 210 | for j in range(num): 211 | y_pred[i][sort_index[i][j]] = 1 212 | return y_pred 213 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | 2 | import datetime 3 | import getpass 4 | import json 5 | import os 6 | import random 7 | import time 8 | import scipy.io as sio 9 | from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser 10 | 11 | import numpy as np 12 | import scipy.sparse as sp 13 | 14 | from embed_train import embedding_training, load_embedding, read_node_labels, split_train_test_graph 15 | from evaluation import LinkPrediction, NodeClassification 16 | 17 | 18 | 19 | 20 | def parse_args(): 21 | parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter, 22 | conflict_handler='resolve') 23 | 24 | parser.add_argument('--input', choices=[ 25 | 'DrugBank_DDI.edgelist', 26 | 'NDFRT_DDA.edgelist', 27 | 'CTD_DDA.edgelist'], default='DDI5.edgelist', 28 | help='Input Graph file' 29 | 'None represents no evaluation, and only run for training embedding.') 30 | parser.add_argument('--output', choices=[ 31 | 'DGI_RA_1_DrugBank_DDI.txt', 32 | 'out2', 33 | 'out'], default='Default.txt', 34 | help='Yada yada' 35 | 'None represents no evaluation, and only run for training embedding.') 36 | 37 | parser.add_argument('--embTech', choices=[ 38 | 'DGI', 39 | 'CN', 40 | 'AA', 41 | 42 | ], default='CN', help='The embedding learning method') 43 | 44 | parser.add_argument('--method', choices=[ 45 | 'Laplacian', 46 | 'SVD', 47 | ], default='DGI', help='The embedding learning method') 48 | parser.add_argument('--task', choices=[ 49 | 'link-prediction', 50 | 'node-classification'], default='link-prediction', 51 | help='Choose to evaluate the embedding quality based on a specific prediction task. ' 52 | 'None represents no evaluation, and only run for training embedding.') 53 | parser.add_argument('--testingratio', default=0.1, type=float, 54 | help='Testing set ratio for prediction tasks.' 
55 | 'In link prediction, it splits all the known edges; ' 56 | 'in node classification, it splits all the labeled nodes.') 57 | parser.add_argument('--number-walks', default=32, type=int, 58 | help='Number of random walks to start at each node. ' 59 | 'Only for random walk-based methods: DeepWalk, node2vec, struc2vec') 60 | parser.add_argument('--walk-length', default=64, type=int, 61 | help='Length of the random walk started at each node. ' 62 | 'Only for random walk-based methods: DeepWalk, node2vec, struc2vec') 63 | parser.add_argument('--workers', default=8, type=int, 64 | help='Number of parallel processes. ' 65 | 'Only for random walk-based methods: DeepWalk, node2vec, struc2vec') 66 | parser.add_argument('--dimensions', default=100, type=int, 67 | help='the dimensions of embedding for each node.') 68 | parser.add_argument('--window-size', default=10, type=int, 69 | help='Window size of word2vec model. ' 70 | 'Only for random walk-based methods: DeepWalk, node2vec, struc2vec') 71 | parser.add_argument('--epochs', default=100, type=int, 72 | help='The training epochs of LINE, SDNE and GAE') 73 | parser.add_argument('--p', default=1.0, type=float, 74 | help='p is a hyper-parameter for node2vec, ' 75 | 'and it controls how fast the walk explores.') 76 | parser.add_argument('--q', default=1.0, type=float, 77 | help='q is a hyper-parameter for node2vec, ' 78 | 'and it controls how fast the walk leaves the neighborhood of starting node.') 79 | 80 | 81 | 82 | 83 | parser.add_argument('--label-file', default='node2vec_PPI_labels.txt', 84 | help='The label file for node classification') 85 | parser.add_argument('--negative-ratio', default=5, type=int, 86 | help='the negative ratio of LINE') 87 | parser.add_argument('--weighted', type=bool, default=False, 88 | help='Treat graph as weighted') 89 | parser.add_argument('--directed', type=bool, default=False, 90 | help='Treat graph as directed') 91 | parser.add_argument('--order', default=2, type=int, 92 | help='Choose the order of LINE, 1 means first order, 2 means second order, 3 means first order + second order') 93 | parser.add_argument('--weight-decay', type=float, default=5e-4, 94 | help='coefficient for L2 regularization for Graph Factorization.') 95 | parser.add_argument('--kstep', default=4, type=int, 96 | help='Use k-step transition probability matrix for GraRep.') 97 | parser.add_argument('--lr', default=0.01, type=float, 98 | help='learning rate') 99 | parser.add_argument('--alpha', default=0.3, type=float, 100 | help='alhpa is a hyperparameter in SDNE') 101 | parser.add_argument('--beta', default=0, type=float, 102 | help='beta is a hyperparameter in SDNE') 103 | parser.add_argument('--nu1', default=1e-5, type=float, 104 | help='nu1 is a hyperparameter in SDNE') 105 | parser.add_argument('--nu2', default=1e-4, type=float, 106 | help='nu2 is a hyperparameter in SDNE') 107 | parser.add_argument('--bs', default=200, type=int, 108 | help='batch size of SDNE') 109 | parser.add_argument('--encoder-list', default='[1000, 128]', type=str, 110 | help='a list of numbers of the neuron at each encoder layer, the last number is the ' 111 | 'dimension of the output node representation') 112 | parser.add_argument('--OPT1', default=True, type=bool, 113 | help='optimization 1 for struc2vec') 114 | parser.add_argument('--OPT2', default=True, type=bool, 115 | help='optimization 2 for struc2vec') 116 | parser.add_argument('--OPT3', default=True, type=bool, 117 | help='optimization 3 for struc2vec') 118 | parser.add_argument('--until-layer', type=int, 
default=6, 119 | help='Calculation until the layer. A hyper-parameter for struc2vec.') 120 | parser.add_argument('--dropout', default=0, type=float, help='Dropout rate (1 - keep probability).') 121 | parser.add_argument('--hidden', default=32, type=int, help='Number of units in hidden layer.') 122 | parser.add_argument('--gae_model_selection', default='gcn_ae', type=str, 123 | help='gae model selection: gcn_ae or gcn_vae') 124 | parser.add_argument('--eval-result-file', help='save evaluation performance') 125 | parser.add_argument('--seed',default=0, type=int, help='seed value') 126 | args = parser.parse_args() 127 | 128 | return args 129 | 130 | 131 | 132 | def main(args): 133 | print('#' * 70) 134 | print('Embedding Method: %s, Evaluation Task: %s' % (args.method, args.task)) 135 | print('#' * 70) 136 | 137 | if args.task == 'link-prediction': 138 | partitiondata = ['DDI1.edgelist'] 139 | techniques = ['DGI'] 140 | 141 | for d in partitiondata: 142 | print(d) 143 | args.input = d 144 | for x in techniques: 145 | print(x) 146 | args.method = x 147 | for i in range(3): 148 | G, G_train, testing_pos_edges, train_graph_filename = split_train_test_graph(args.input, args.seed, args.testingratio,weighted=args.weighted) 149 | # time2 = time.time() 150 | # print('Compute RWR ') 151 | # calc_ppr_exact(G[0], 0.1) 152 | # time2 = time.time() 153 | # print('Exact PPR took ', time2) 154 | # 155 | time1 = time.time() 156 | #idSave={} 157 | #idSave['G']=G 158 | #idSave['Label'] = labels 159 | #idSave['Attributes'] = features 160 | 161 | #sio.savemat('DrugBankAdj.mat',idSave) 162 | 163 | embedding_training(args, train_graph_filename) 164 | embed_train_time = time.time() - time1 165 | print('Embedding Learning Time: %.2f s' % embed_train_time) 166 | embedding_look_up = load_embedding(args.output) 167 | time1 = time.time() 168 | print('Begin evaluation...') 169 | result = LinkPrediction(embedding_look_up, G, G_train, testing_pos_edges,args.seed) 170 | eval_time = time.time() - time1 171 | print('Prediction Task Time: %.2f s' % eval_time) 172 | os.remove(train_graph_filename) 173 | elif args.task == 'node-classification': 174 | if not args.label_file: 175 | raise ValueError("No input label file. 
Exit.") 176 | node_list, labels = read_node_labels(args.label_file) 177 | idSave={} 178 | idSave['labels'] = labels 179 | sio.savemat('LabelNode2VecPPI.mat',idSave) 180 | train_graph_filename = args.input 181 | time1 = time.time() 182 | embedding_training(args, train_graph_filename) 183 | embed_train_time = time.time() - time1 184 | print('Embedding Learning Time: %.2f s' % embed_train_time) 185 | embedding_look_up = load_embedding('N2V_DW_Emb.txt', node_list) 186 | time1 = time.time() 187 | print('Begin evaluation...') 188 | result = NodeClassification(embedding_look_up, node_list, labels, args.testingratio, args.seed) 189 | eval_time = time.time() - time1 190 | print('Prediction Task Time: %.2f s' % eval_time) 191 | else: 192 | train_graph_filename = args.input 193 | time1 = time.time() 194 | embedding_training(args, train_graph_filename) 195 | embed_train_time = time.time() - time1 196 | print('Embedding Learning Time: %.2f s' % embed_train_time) 197 | os.remove(train_graph_filename) 198 | 199 | if args.eval_result_file and result: 200 | _results = dict( 201 | input=args.input, 202 | task=args.task, 203 | method=args.method, 204 | dimension=args.dimensions, 205 | user=getpass.getuser(), 206 | date=datetime.datetime.now().strftime('%Y-%m-%d-%H%M%S'), 207 | seed=args.seed, 208 | ) 209 | 210 | if args.task == 'link-prediction': 211 | auc_roc, auc_pr, accuracy, f1 = result 212 | _results['results'] = dict( 213 | auc_roc=auc_roc, 214 | auc_pr=auc_pr, 215 | accuracy=accuracy, 216 | f1=f1, 217 | ) 218 | else: 219 | accuracy, f1_micro, f1_macro = result 220 | _results['results'] = dict( 221 | accuracy=accuracy, 222 | f1_micro=f1_micro, 223 | f1_macro=f1_macro, 224 | ) 225 | 226 | with open(args.eval_result_file, 'a+') as wf: 227 | print(json.dumps(_results, sort_keys=True), file=wf) 228 | 229 | 230 | def more_main(): 231 | args = parse_args() 232 | seed = args.seed 233 | random.seed(seed) 234 | np.random.seed(seed) 235 | main(parse_args()) 236 | 237 | 238 | if __name__ == "__main__": 239 | more_main() 240 | -------------------------------------------------------------------------------- /OpenNE/line.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import math 4 | import random 5 | 6 | import numpy as np 7 | import tensorflow as tf 8 | from sklearn.linear_model import LogisticRegression 9 | 10 | from OpenNE.classify import Classifier, read_node_label 11 | 12 | 13 | class _LINE(object): 14 | 15 | def __init__(self, graph, rep_size=128, batch_size=1000, negative_ratio=5, order=3): 16 | self.cur_epoch = 0 17 | self.order = order 18 | self.g = graph 19 | self.node_size = graph.G.number_of_nodes() 20 | self.rep_size = rep_size 21 | self.batch_size = batch_size 22 | self.negative_ratio = negative_ratio 23 | 24 | self.gen_sampling_table() 25 | self.sess = tf.Session() 26 | cur_seed = random.getrandbits(32) 27 | initializer = tf.contrib.layers.xavier_initializer( 28 | uniform=False, seed=cur_seed) 29 | with tf.variable_scope("model", reuse=None, initializer=initializer): 30 | self.build_graph() 31 | self.sess.run(tf.global_variables_initializer()) 32 | 33 | def build_graph(self): 34 | self.h = tf.placeholder(tf.int32, [None]) 35 | self.t = tf.placeholder(tf.int32, [None]) 36 | self.sign = tf.placeholder(tf.float32, [None]) 37 | 38 | cur_seed = random.getrandbits(32) 39 | self.embeddings = tf.get_variable(name="embeddings" + str(self.order), shape=[ 40 | self.node_size, self.rep_size], 
initializer=tf.contrib.layers.xavier_initializer(uniform=False, 41 | seed=cur_seed)) 42 | self.context_embeddings = tf.get_variable(name="context_embeddings" + str(self.order), shape=[ 43 | self.node_size, self.rep_size], initializer=tf.contrib.layers.xavier_initializer(uniform=False, 44 | seed=cur_seed)) 45 | # self.h_e = tf.nn.l2_normalize(tf.nn.embedding_lookup(self.embeddings, self.h), 1) 46 | # self.t_e = tf.nn.l2_normalize(tf.nn.embedding_lookup(self.embeddings, self.t), 1) 47 | # self.t_e_context = tf.nn.l2_normalize(tf.nn.embedding_lookup(self.context_embeddings, self.t), 1) 48 | self.h_e = tf.nn.embedding_lookup(self.embeddings, self.h) 49 | self.t_e = tf.nn.embedding_lookup(self.embeddings, self.t) 50 | self.t_e_context = tf.nn.embedding_lookup( 51 | self.context_embeddings, self.t) 52 | self.second_loss = -tf.reduce_mean(tf.log_sigmoid( 53 | self.sign * tf.reduce_sum(tf.multiply(self.h_e, self.t_e_context), axis=1))) 54 | self.first_loss = -tf.reduce_mean(tf.log_sigmoid( 55 | self.sign * tf.reduce_sum(tf.multiply(self.h_e, self.t_e), axis=1))) 56 | if self.order == 1: 57 | self.loss = self.first_loss 58 | else: 59 | self.loss = self.second_loss 60 | optimizer = tf.train.AdamOptimizer(0.001) 61 | self.train_op = optimizer.minimize(self.loss) 62 | 63 | def train_one_epoch(self): 64 | sum_loss = 0.0 65 | batches = self.batch_iter() 66 | batch_id = 0 67 | for batch in batches: 68 | h, t, sign = batch 69 | feed_dict = { 70 | self.h: h, 71 | self.t: t, 72 | self.sign: sign, 73 | } 74 | _, cur_loss = self.sess.run([self.train_op, self.loss], feed_dict) 75 | sum_loss += cur_loss 76 | batch_id += 1 77 | print('epoch:{} sum of loss:{!s}'.format(self.cur_epoch, sum_loss)) 78 | self.cur_epoch += 1 79 | 80 | def batch_iter(self): 81 | look_up = self.g.look_up_dict 82 | 83 | table_size = 1e8 84 | numNodes = self.node_size 85 | 86 | edges = [(look_up[x[0]], look_up[x[1]]) for x in self.g.G.edges()] 87 | 88 | data_size = self.g.G.number_of_edges() 89 | edge_set = set([x[0] * numNodes + x[1] for x in edges]) 90 | shuffle_indices = np.random.permutation(np.arange(data_size)) 91 | 92 | # positive or negative mod 93 | mod = 0 94 | mod_size = 1 + self.negative_ratio 95 | h = [] 96 | t = [] 97 | sign = 0 98 | 99 | start_index = 0 100 | end_index = min(start_index + self.batch_size, data_size) 101 | while start_index < data_size: 102 | if mod == 0: 103 | sign = 1. 104 | h = [] 105 | t = [] 106 | for i in range(start_index, end_index): 107 | if not random.random() < self.edge_prob[shuffle_indices[i]]: 108 | shuffle_indices[i] = self.edge_alias[shuffle_indices[i]] 109 | cur_h = edges[shuffle_indices[i]][0] 110 | cur_t = edges[shuffle_indices[i]][1] 111 | h.append(cur_h) 112 | t.append(cur_t) 113 | else: 114 | sign = -1. 
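                # Negative phase: the heads h from the current positive batch are reused,
                # the tails are redrawn from the degree^0.75 table built in
                # gen_sampling_table(), and sign = -1 flips the log-sigmoid loss so these
                # sampled pairs are pushed apart.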
115 | t = [] 116 | for i in range(len(h)): 117 | t.append( 118 | self.sampling_table[random.randint(0, table_size - 1)]) 119 | 120 | yield h, t, [sign] 121 | mod += 1 122 | mod %= mod_size 123 | if mod == 0: 124 | start_index = end_index 125 | end_index = min(start_index + self.batch_size, data_size) 126 | 127 | def gen_sampling_table(self): 128 | table_size = 1e8 129 | power = 0.75 130 | numNodes = self.node_size 131 | 132 | print("Pre-procesing for non-uniform negative sampling!") 133 | node_degree = np.zeros(numNodes) # out degree 134 | 135 | look_up = self.g.look_up_dict 136 | for edge in self.g.G.edges(): 137 | node_degree[look_up[edge[0]] 138 | ] += self.g.G[edge[0]][edge[1]]["weight"] 139 | 140 | norm = sum([math.pow(node_degree[i], power) for i in range(numNodes)]) 141 | 142 | self.sampling_table = np.zeros(int(table_size), dtype=np.uint32) 143 | 144 | p = 0 145 | i = 0 146 | for j in range(numNodes): 147 | p += float(math.pow(node_degree[j], power)) / norm 148 | while i < table_size and float(i) / table_size < p: 149 | self.sampling_table[i] = j 150 | i += 1 151 | 152 | data_size = self.g.G.number_of_edges() 153 | self.edge_alias = np.zeros(data_size, dtype=np.int32) 154 | self.edge_prob = np.zeros(data_size, dtype=np.float32) 155 | large_block = np.zeros(data_size, dtype=np.int32) 156 | small_block = np.zeros(data_size, dtype=np.int32) 157 | 158 | total_sum = sum([self.g.G[edge[0]][edge[1]]["weight"] 159 | for edge in self.g.G.edges()]) 160 | norm_prob = [self.g.G[edge[0]][edge[1]]["weight"] * 161 | data_size / total_sum for edge in self.g.G.edges()] 162 | num_small_block = 0 163 | num_large_block = 0 164 | cur_small_block = 0 165 | cur_large_block = 0 166 | for k in range(data_size - 1, -1, -1): 167 | if norm_prob[k] < 1: 168 | small_block[num_small_block] = k 169 | num_small_block += 1 170 | else: 171 | large_block[num_large_block] = k 172 | num_large_block += 1 173 | while num_small_block and num_large_block: 174 | num_small_block -= 1 175 | cur_small_block = small_block[num_small_block] 176 | num_large_block -= 1 177 | cur_large_block = large_block[num_large_block] 178 | self.edge_prob[cur_small_block] = norm_prob[cur_small_block] 179 | self.edge_alias[cur_small_block] = cur_large_block 180 | norm_prob[cur_large_block] = norm_prob[cur_large_block] + \ 181 | norm_prob[cur_small_block] - 1 182 | if norm_prob[cur_large_block] < 1: 183 | small_block[num_small_block] = cur_large_block 184 | num_small_block += 1 185 | else: 186 | large_block[num_large_block] = cur_large_block 187 | num_large_block += 1 188 | 189 | while num_large_block: 190 | num_large_block -= 1 191 | self.edge_prob[large_block[num_large_block]] = 1 192 | while num_small_block: 193 | num_small_block -= 1 194 | self.edge_prob[small_block[num_small_block]] = 1 195 | 196 | def get_embeddings(self): 197 | vectors = {} 198 | embeddings = self.embeddings.eval(session=self.sess) 199 | # embeddings = self.sess.run(tf.nn.l2_normalize(self.embeddings.eval(session=self.sess), 1)) 200 | look_back = self.g.look_back_list 201 | for i, embedding in enumerate(embeddings): 202 | vectors[look_back[i]] = embedding 203 | return vectors 204 | 205 | 206 | class LINE(object): 207 | 208 | def __init__(self, graph, rep_size=128, batch_size=1000, epoch=10, negative_ratio=5, order=3, label_file=None, 209 | clf_ratio=0.5, auto_save=True): 210 | self.rep_size = rep_size 211 | self.order = order 212 | self.best_result = 0 213 | self.vectors = {} 214 | if order == 3: 215 | self.model1 = _LINE(graph, rep_size / 2, batch_size, 216 | 
negative_ratio, order=1) 217 | self.model2 = _LINE(graph, rep_size / 2, batch_size, 218 | negative_ratio, order=2) 219 | for i in range(epoch): 220 | self.model1.train_one_epoch() 221 | self.model2.train_one_epoch() 222 | if label_file: 223 | self.get_embeddings() 224 | X, Y = read_node_label(label_file) 225 | print("Training classifier using {:.2f}% nodes...".format( 226 | clf_ratio * 100)) 227 | clf = Classifier(vectors=self.vectors, 228 | clf=LogisticRegression()) 229 | result = clf.split_train_evaluate(X, Y, clf_ratio) 230 | 231 | if result['macro'] > self.best_result: 232 | self.best_result = result['macro'] 233 | if auto_save: 234 | self.best_vector = self.vectors 235 | 236 | else: 237 | self.model = _LINE(graph, rep_size, batch_size, 238 | negative_ratio, order=self.order) 239 | for i in range(epoch): 240 | self.model.train_one_epoch() 241 | if label_file: 242 | self.get_embeddings() 243 | X, Y = read_node_label(label_file) 244 | print("Training classifier using {:.2f}% nodes...".format( 245 | clf_ratio * 100)) 246 | clf = Classifier(vectors=self.vectors, 247 | clf=LogisticRegression()) 248 | result = clf.split_train_evaluate(X, Y, clf_ratio) 249 | 250 | if result['macro'] > self.best_result: 251 | self.best_result = result['macro'] 252 | if auto_save: 253 | self.best_vector = self.vectors 254 | 255 | self.get_embeddings() 256 | if auto_save and label_file: 257 | self.vectors = self.best_vector 258 | 259 | def get_embeddings(self): 260 | self.last_vectors = self.vectors 261 | self.vectors = {} 262 | if self.order == 3: 263 | vectors1 = self.model1.get_embeddings() 264 | vectors2 = self.model2.get_embeddings() 265 | for node in vectors1.keys(): 266 | self.vectors[node] = np.append(vectors1[node], vectors2[node]) 267 | else: 268 | self.vectors = self.model.get_embeddings() 269 | 270 | def save_embeddings(self, filename): 271 | fout = open(filename, 'w') 272 | node_num = len(self.vectors.keys()) 273 | fout.write("{} {}\n".format(node_num, self.rep_size)) 274 | for node, vec in self.vectors.items(): 275 | fout.write("{} {}\n".format(node, 276 | ' '.join([str(x) for x in vec]))) 277 | fout.close() 278 | -------------------------------------------------------------------------------- /OpenNE/sdne.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy as np 4 | import tensorflow as tf 5 | 6 | __author__ = "Wang Binlu" 7 | __email__ = "wblmail@whu.edu.cn" 8 | 9 | 10 | def fc_op(input_op, name, n_out, layer_collector, act_func=tf.nn.leaky_relu): 11 | n_in = input_op.get_shape()[-1].value 12 | with tf.name_scope(name) as scope: 13 | kernel = tf.Variable(tf.contrib.layers.xavier_initializer()([n_in, n_out]), dtype=tf.float32, name=scope + "w") 14 | 15 | # kernel = tf.Variable(tf.random_normal([n_in, n_out])) 16 | biases = tf.Variable(tf.constant(0, shape=[1, n_out], dtype=tf.float32), name=scope + 'b') 17 | 18 | fc = tf.add(tf.matmul(input_op, kernel), biases) 19 | activation = act_func(fc, name=scope + 'act') 20 | layer_collector.append([kernel, biases]) 21 | return activation 22 | 23 | 24 | class SDNE(object): 25 | def __init__(self, graph, encoder_layer_list, alpha=1e-6, beta=5., nu1=1e-5, nu2=1e-4, 26 | batch_size=200, epoch=100, learning_rate=None): 27 | """ 28 | encoder_layer_list: a list of numbers of the neuron at each ecdoer layer, the last number is the 29 | dimension of the output node representation 30 | Eg: 31 | if node size is 2000, encoder_layer_list=[1000, 128], then the whole neural network would be 
32 | 2000(input)->1000->128->1000->2000, SDNE extract the middle layer as the node representation 33 | """ 34 | self.g = graph 35 | 36 | self.node_size = self.g.G.number_of_nodes() 37 | self.dim = encoder_layer_list[-1] 38 | 39 | self.encoder_layer_list = [self.node_size] 40 | self.encoder_layer_list.extend(encoder_layer_list) 41 | self.encoder_layer_num = len(encoder_layer_list) + 1 42 | 43 | self.alpha = alpha 44 | self.beta = beta 45 | self.nu1 = nu1 46 | self.nu2 = nu2 47 | self.bs = batch_size 48 | self.epoch = epoch 49 | self.max_iter = (epoch * self.node_size) // batch_size 50 | 51 | self.lr = learning_rate 52 | if self.lr is None: 53 | self.lr = tf.train.inverse_time_decay(0.03, self.max_iter, decay_steps=1, decay_rate=0.9999) 54 | 55 | self.sess = tf.Session() 56 | self.vectors = {} 57 | 58 | self.adj_mat = self.getAdj() 59 | self.embeddings = self.train() 60 | 61 | look_back = self.g.look_back_list 62 | 63 | for i, embedding in enumerate(self.embeddings): 64 | self.vectors[look_back[i]] = embedding 65 | 66 | def getAdj(self): 67 | node_size = self.g.node_size 68 | look_up = self.g.look_up_dict 69 | adj = np.zeros((node_size, node_size)) 70 | for edge in self.g.G.edges(): 71 | adj[look_up[edge[0]]][look_up[edge[1]]] = self.g.G[edge[0]][edge[1]]['weight'] 72 | return adj 73 | 74 | def train(self): 75 | adj_mat = self.adj_mat 76 | 77 | AdjBatch = tf.placeholder(tf.float32, [None, self.node_size], name='adj_batch') 78 | Adj = tf.placeholder(tf.float32, [None, None], name='adj_mat') 79 | B = tf.placeholder(tf.float32, [None, self.node_size], name='b_mat') 80 | 81 | fc = AdjBatch 82 | scope_name = 'encoder' 83 | layer_collector = [] 84 | 85 | with tf.name_scope(scope_name): 86 | for i in range(1, self.encoder_layer_num): 87 | fc = fc_op(fc, 88 | name=scope_name + str(i), 89 | n_out=self.encoder_layer_list[i], 90 | layer_collector=layer_collector) 91 | 92 | _embeddings = fc 93 | 94 | scope_name = 'decoder' 95 | with tf.name_scope(scope_name): 96 | for i in range(self.encoder_layer_num - 2, 0, -1): 97 | fc = fc_op(fc, 98 | name=scope_name + str(i), 99 | n_out=self.encoder_layer_list[i], 100 | layer_collector=layer_collector) 101 | fc = fc_op(fc, 102 | name=scope_name + str(0), 103 | n_out=self.encoder_layer_list[0], 104 | layer_collector=layer_collector, ) 105 | 106 | _embeddings_norm = tf.reduce_sum(tf.square(_embeddings), 1, keepdims=True) 107 | 108 | L_1st = tf.reduce_sum( 109 | Adj * ( 110 | _embeddings_norm - 2 * tf.matmul( 111 | _embeddings, tf.transpose(_embeddings) 112 | ) + tf.transpose(_embeddings_norm) 113 | ) 114 | ) 115 | 116 | L_2nd = tf.reduce_sum(tf.square((AdjBatch - fc) * B)) 117 | 118 | L = L_2nd + self.alpha * L_1st 119 | 120 | for param in layer_collector: 121 | L += self.nu1 * tf.reduce_sum(tf.abs(param[0])) + self.nu2 * tf.reduce_sum(tf.square(param[0])) 122 | 123 | optimizer = tf.train.AdamOptimizer(self.lr) 124 | 125 | train_op = optimizer.minimize(L) 126 | 127 | init = tf.global_variables_initializer() 128 | self.sess.run(init) 129 | 130 | print("total iter: %i" % self.max_iter) 131 | for step in range(self.max_iter): 132 | index = np.random.randint(self.node_size, size=self.bs) 133 | adj_batch_train = adj_mat[index, :] 134 | adj_mat_train = adj_batch_train[:, index] 135 | b_mat_train = np.ones_like(adj_batch_train) 136 | b_mat_train[adj_batch_train != 0] = self.beta 137 | 138 | self.sess.run(train_op, feed_dict={AdjBatch: adj_batch_train, 139 | Adj: adj_mat_train, 140 | B: b_mat_train}) 141 | if step % 50 == 0: 142 | l, l1, l2 = self.sess.run((L, L_1st, L_2nd), 
143 | feed_dict={AdjBatch: adj_batch_train, 144 | Adj: adj_mat_train, 145 | B: b_mat_train}) 146 | print("step %i: total loss: %s, l1 loss: %s, l2 loss: %s" % (step, l, l1, l2)) 147 | 148 | return self.sess.run(_embeddings, feed_dict={AdjBatch: adj_mat}) 149 | 150 | def save_embeddings(self, filename): 151 | fout = open(filename, 'w') 152 | node_num = len(self.vectors) 153 | fout.write("{} {}\n".format(node_num, self.dim)) 154 | for node, vec in self.vectors.items(): 155 | fout.write("{} {}\n".format(node, ' '.join([str(x) for x in vec]))) 156 | fout.close() 157 | 158 | 159 | class SDNE2(object): 160 | def __init__(self, graph, encoder_layer_list, alpha=1e-6, beta=5., nu1=1e-5, nu2=1e-5, 161 | batch_size=100, max_iter=2000, learning_rate=None): 162 | 163 | self.g = graph 164 | 165 | self.node_size = self.g.G.number_of_nodes() 166 | self.rep_size = encoder_layer_list[-1] 167 | 168 | self.encoder_layer_list = [self.node_size] + encoder_layer_list 169 | self.encoder_layer_num = len(encoder_layer_list) + 1 170 | 171 | self.alpha = alpha 172 | self.beta = beta 173 | self.nu1 = nu1 174 | self.nu2 = nu2 175 | self.bs = batch_size 176 | self.max_iter = max_iter 177 | self.lr = learning_rate 178 | if self.lr is None: 179 | self.lr = tf.train.inverse_time_decay(0.1, self.max_iter, decay_steps=1, decay_rate=0.9999) 180 | 181 | self.sess = tf.Session() 182 | self.vectors = {} 183 | 184 | self.adj_mat = self.getAdj() 185 | self.deg_vec = np.sum(self.adj_mat, axis=1) 186 | self.embeddings = self.get_train() 187 | 188 | look_back = self.g.look_back_list 189 | 190 | for i, embedding in enumerate(self.embeddings): 191 | self.vectors[look_back[i]] = embedding 192 | 193 | def getAdj(self): 194 | node_size = self.g.node_size 195 | look_up = self.g.look_up_dict 196 | adj = np.zeros((node_size, node_size)) 197 | for edge in self.g.G.edges(): 198 | adj[look_up[edge[0]]][look_up[edge[1]]] = self.g.G[edge[0]][edge[1]]['weight'] 199 | return adj 200 | 201 | def model(self, node, layer_collector, scope_name): 202 | fc = node 203 | with tf.name_scope(scope_name + 'encoder'): 204 | for i in range(1, self.encoder_layer_num): 205 | fc = fc_op(fc, 206 | name=scope_name + str(i), 207 | n_out=self.encoder_layer_list[i], 208 | layer_collector=layer_collector) 209 | 210 | _embeddings = fc 211 | 212 | with tf.name_scope(scope_name + 'decoder'): 213 | for i in range(self.encoder_layer_num - 2, -1, -1): 214 | fc = fc_op(fc, 215 | name=scope_name + str(i), 216 | n_out=self.encoder_layer_list[i], 217 | layer_collector=layer_collector) 218 | 219 | return _embeddings, fc 220 | 221 | def generate_batch(self, shuffle=True): 222 | adj = self.adj_mat 223 | 224 | row_indices, col_indices = adj.nonzero() 225 | sample_index = np.arange(row_indices.shape[0]) 226 | num_of_batches = row_indices.shape[0] // self.bs 227 | counter = 0 228 | if shuffle: 229 | np.random.shuffle(sample_index) 230 | 231 | while True: 232 | batch_index = sample_index[self.bs * counter:self.bs * (counter + 1)] 233 | 234 | nodes_a = adj[row_indices[batch_index], :] 235 | nodes_b = adj[col_indices[batch_index], :] 236 | weights = adj[row_indices[batch_index], col_indices[batch_index]] 237 | weights = np.reshape(weights, [-1, 1]) 238 | 239 | beta_mask_a = np.ones_like(nodes_a) 240 | beta_mask_a[nodes_a != 0] = self.beta 241 | beta_mask_b = np.ones_like(nodes_b) 242 | beta_mask_b[nodes_b != 0] = self.beta 243 | 244 | if counter == num_of_batches: 245 | counter = 0 246 | np.random.shuffle(sample_index) 247 | else: 248 | counter += 1 249 | 250 | yield (nodes_a, nodes_b, 
beta_mask_a, beta_mask_b, weights) 251 | 252 | def get_train(self): 253 | 254 | NodeA = tf.placeholder(tf.float32, [None, self.node_size], name='node_a') 255 | BmaskA = tf.placeholder(tf.float32, [None, self.node_size], name='beta_mask_a') 256 | NodeB = tf.placeholder(tf.float32, [None, self.node_size], name='node_b') 257 | BmaskB = tf.placeholder(tf.float32, [None, self.node_size], name='beta_mask_b') 258 | Weights = tf.placeholder(tf.float32, [None, 1], name='adj_weights') 259 | 260 | layer_collector = [] 261 | nodes = tf.concat([NodeA, NodeB], axis=0) 262 | bmasks = tf.concat([BmaskA, BmaskB], axis=0) 263 | emb, recons = self.model(nodes, layer_collector, 'reconstructor') 264 | embs = tf.split(emb, num_or_size_splits=2, axis=0) 265 | 266 | L_1st = tf.reduce_sum(Weights * (tf.reduce_sum(tf.square(embs[0] - embs[1]), axis=1))) 267 | 268 | L_2nd = tf.reduce_sum(tf.square((nodes - recons) * bmasks)) 269 | 270 | L = L_2nd + self.alpha * L_1st 271 | 272 | for param in layer_collector: 273 | L += self.nu1 * tf.reduce_sum(tf.abs(param[0])) + self.nu2 * tf.reduce_sum(tf.square(param[0])) 274 | 275 | # lr = tf.train.exponential_decay(1e-6, self.max_iter, decay_steps=1, decay_rate=0.9999) 276 | # optimizer = tf.train.MomentumOptimizer(lr, 0.99, use_nesterov=True) 277 | 278 | optimizer = tf.train.AdamOptimizer(self.lr) 279 | train_op = optimizer.minimize(L) 280 | 281 | init = tf.global_variables_initializer() 282 | self.sess.run(init) 283 | 284 | generator = self.generate_batch() 285 | 286 | for step in range(self.max_iter + 1): 287 | nodes_a, nodes_b, beta_mask_a, beta_mask_b, weights = generator.__next__() 288 | 289 | feed_dict = {NodeA: nodes_a, 290 | NodeB: nodes_b, 291 | BmaskA: beta_mask_a, 292 | BmaskB: beta_mask_b, 293 | Weights: weights} 294 | 295 | self.sess.run(train_op, feed_dict=feed_dict) 296 | if step % 50 == 0: 297 | print("step %i: %s" % (step, self.sess.run([L, L_1st, L_2nd], feed_dict=feed_dict))) 298 | 299 | return self.sess.run(emb, feed_dict={NodeA: self.adj_mat[0:1, :], NodeB: self.adj_mat[1:, :]}) 300 | 301 | def save_embeddings(self, filename): 302 | fout = open(filename, 'w') 303 | node_num = len(self.vectors) 304 | fout.write("{} {}\n".format(node_num, self.rep_size)) 305 | for node, vec in self.vectors.items(): 306 | fout.write("{} {}\n".format(node, ' '.join([str(x) for x in vec]))) 307 | fout.close() 308 | -------------------------------------------------------------------------------- /DGI/utils/process.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pickle as pkl 3 | import networkx as nx 4 | import scipy.sparse as sp 5 | from scipy.sparse.linalg.eigen.arpack import eigsh 6 | import sys 7 | import torch 8 | import torch.nn as nn 9 | 10 | def parse_skipgram(fname): 11 | with open(fname) as f: 12 | toks = list(f.read().split()) 13 | nb_nodes = int(toks[0]) 14 | nb_features = int(toks[1]) 15 | ret = np.empty((nb_nodes, nb_features)) 16 | it = 2 17 | for i in range(nb_nodes): 18 | cur_nd = int(toks[it]) - 1 19 | it += 1 20 | for j in range(nb_features): 21 | cur_ft = float(toks[it]) 22 | ret[cur_nd][j] = cur_ft 23 | it += 1 24 | return ret 25 | 26 | # Process a (subset of) a TU dataset into standard form 27 | def process_tu(data, nb_nodes): 28 | nb_graphs = len(data) 29 | ft_size = data.num_features 30 | 31 | features = np.zeros((nb_graphs, nb_nodes, ft_size)) 32 | adjacency = np.zeros((nb_graphs, nb_nodes, nb_nodes)) 33 | labels = np.zeros(nb_graphs) 34 | sizes = np.zeros(nb_graphs, dtype=np.int32) 
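    # Each graph is padded to nb_nodes rows: `sizes` stores the true node count per
    # graph and `masks` (filled in the loop below) flags which padded rows are real nodes.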
35 | masks = np.zeros((nb_graphs, nb_nodes)) 36 | 37 | for g in range(nb_graphs): 38 | sizes[g] = data[g].x.shape[0] 39 | features[g, :sizes[g]] = data[g].x 40 | labels[g] = data[g].y[0] 41 | masks[g, :sizes[g]] = 1.0 42 | e_ind = data[g].edge_index 43 | coo = sp.coo_matrix((np.ones(e_ind.shape[1]), (e_ind[0, :], e_ind[1, :])), shape=(nb_nodes, nb_nodes)) 44 | adjacency[g] = coo.todense() 45 | 46 | return features, adjacency, labels, sizes, masks 47 | 48 | def micro_f1(logits, labels): 49 | # Compute predictions 50 | preds = torch.round(nn.Sigmoid()(logits)) 51 | 52 | # Cast to avoid trouble 53 | preds = preds.long() 54 | labels = labels.long() 55 | 56 | # Count true positives, true negatives, false positives, false negatives 57 | tp = torch.nonzero(preds * labels).shape[0] * 1.0 58 | tn = torch.nonzero((preds - 1) * (labels - 1)).shape[0] * 1.0 59 | fp = torch.nonzero(preds * (labels - 1)).shape[0] * 1.0 60 | fn = torch.nonzero((preds - 1) * labels).shape[0] * 1.0 61 | 62 | # Compute micro-f1 score 63 | prec = tp / (tp + fp) 64 | rec = tp / (tp + fn) 65 | f1 = (2 * prec * rec) / (prec + rec) 66 | return f1 67 | 68 | """ 69 | Prepare adjacency matrix by expanding up to a given neighbourhood. 70 | This will insert loops on every node. 71 | Finally, the matrix is converted to bias vectors. 72 | Expected shape: [graph, nodes, nodes] 73 | """ 74 | def adj_to_bias(adj, sizes, nhood=1): 75 | nb_graphs = adj.shape[0] 76 | mt = np.empty(adj.shape) 77 | for g in range(nb_graphs): 78 | mt[g] = np.eye(adj.shape[1]) 79 | for _ in range(nhood): 80 | mt[g] = np.matmul(mt[g], (adj[g] + np.eye(adj.shape[1]))) 81 | for i in range(sizes[g]): 82 | for j in range(sizes[g]): 83 | if mt[g][i][j] > 0.0: 84 | mt[g][i][j] = 1.0 85 | return -1e9 * (1.0 - mt) 86 | 87 | 88 | ############################################### 89 | # This section of code adapted from tkipf/gcn # 90 | ############################################### 91 | 92 | def parse_index_file(filename): 93 | """Parse index file.""" 94 | index = [] 95 | for line in open(filename): 96 | index.append(int(line.strip())) 97 | return index 98 | 99 | def sample_mask(idx, l): 100 | """Create mask.""" 101 | mask = np.zeros(l) 102 | mask[idx] = 1 103 | return np.array(mask, dtype=np.bool) 104 | 105 | def load_data(dataset_str): # {'pubmed', 'citeseer', 'cora'} 106 | """Load data.""" 107 | names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph'] 108 | objects = [] 109 | for i in range(len(names)): 110 | with open("data/ind.{}.{}".format(dataset_str, names[i]), 'rb') as f: 111 | if sys.version_info > (3, 0): 112 | objects.append(pkl.load(f, encoding='latin1')) 113 | else: 114 | objects.append(pkl.load(f)) 115 | 116 | x, y, tx, ty, allx, ally, graph = tuple(objects) 117 | test_idx_reorder = parse_index_file("data/ind.{}.test.index".format(dataset_str)) 118 | test_idx_range = np.sort(test_idx_reorder) 119 | 120 | if dataset_str == 'citeseer': 121 | # Fix citeseer dataset (there are some isolated nodes in the graph) 122 | # Find isolated nodes, add them as zero-vecs into the right position 123 | test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder)+1) 124 | tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1])) 125 | tx_extended[test_idx_range-min(test_idx_range), :] = tx 126 | tx = tx_extended 127 | ty_extended = np.zeros((len(test_idx_range_full), y.shape[1])) 128 | ty_extended[test_idx_range-min(test_idx_range), :] = ty 129 | ty = ty_extended 130 | 131 | features = sp.vstack((allx, tx)).tolil() 132 | features[test_idx_reorder, :] = 
features[test_idx_range, :] 133 | adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph)) 134 | 135 | labels = np.vstack((ally, ty)) 136 | labels[test_idx_reorder, :] = labels[test_idx_range, :] 137 | 138 | idx_test = test_idx_range.tolist() 139 | idx_train = range(len(y)) 140 | idx_val = range(len(y), len(y)+500) 141 | 142 | return adj, features, labels, idx_train, idx_val, idx_test 143 | 144 | def sparse_to_tuple(sparse_mx, insert_batch=False): 145 | """Convert sparse matrix to tuple representation.""" 146 | """Set insert_batch=True if you want to insert a batch dimension.""" 147 | def to_tuple(mx): 148 | if not sp.isspmatrix_coo(mx): 149 | mx = mx.tocoo() 150 | if insert_batch: 151 | coords = np.vstack((np.zeros(mx.row.shape[0]), mx.row, mx.col)).transpose() 152 | values = mx.data 153 | shape = (1,) + mx.shape 154 | else: 155 | coords = np.vstack((mx.row, mx.col)).transpose() 156 | values = mx.data 157 | shape = mx.shape 158 | return coords, values, shape 159 | 160 | if isinstance(sparse_mx, list): 161 | for i in range(len(sparse_mx)): 162 | sparse_mx[i] = to_tuple(sparse_mx[i]) 163 | else: 164 | sparse_mx = to_tuple(sparse_mx) 165 | 166 | return sparse_mx 167 | 168 | def standardize_data(f, train_mask): 169 | """Standardize feature matrix and convert to tuple representation""" 170 | # standardize data 171 | f = f.todense() 172 | mu = f[train_mask == True, :].mean(axis=0) 173 | sigma = f[train_mask == True, :].std(axis=0) 174 | f = f[:, np.squeeze(np.array(sigma > 0))] 175 | mu = f[train_mask == True, :].mean(axis=0) 176 | sigma = f[train_mask == True, :].std(axis=0) 177 | f = (f - mu) / sigma 178 | return f 179 | 180 | def preprocess_features(features): 181 | """Row-normalize feature matrix and convert to tuple representation""" 182 | rowsum = np.array(features.sum(1)) 183 | r_inv = np.power(rowsum, -1).flatten() 184 | r_inv[np.isinf(r_inv)] = 0. 185 | r_mat_inv = sp.diags(r_inv) 186 | features = r_mat_inv.dot(features) 187 | return features.todense(), sparse_to_tuple(features) 188 | 189 | def normalize_adjRA(adj): 190 | """Symmetrically normalize adjacency matrix.""" 191 | adj = sp.coo_matrix(adj) 192 | rowsum = np.array(adj.sum(1)) 193 | d_inv_sqrt = np.power(rowsum, -1).flatten() 194 | d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0. 195 | d_mat_inv_sqrt = sp.diags(d_inv_sqrt) 196 | DA = d_mat_inv_sqrt.dot(adj); 197 | 198 | return adj.dot(DA).tocoo() 199 | def normalize_adj(adj): 200 | adj = sp.coo_matrix(adj) 201 | rowsum = np.array(adj.sum(1)) 202 | d_inv_sqrt = np.power(rowsum, -0.5).flatten() 203 | d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0. 204 | d_mat_inv_sqrt = sp.diags(d_inv_sqrt) 205 | return adj.dot(d_mat_inv_sqrt).transpose().dot(d_mat_inv_sqrt).tocoo() 206 | 207 | def normalize_adjCN(adj): 208 | adj = sp.coo_matrix(adj) 209 | rowsum = np.array(adj.sum(1)) 210 | d_inv_sqrt = np.power(rowsum, -0.5).flatten() 211 | d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0. 212 | d_mat_inv_sqrt = sp.diags(d_inv_sqrt) 213 | return adj.dot(adj).tocoo() 214 | 215 | def normalize_adjAA(adj): 216 | adj = sp.coo_matrix(adj) 217 | rowsum = np.array(adj.sum(1)) 218 | d_inv_sqrt = np.power(np.log(rowsum), -1).flatten() 219 | d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0. 220 | d_mat_inv_sqrt = sp.diags(d_inv_sqrt) 221 | DA = d_mat_inv_sqrt.dot(adj); 222 | return adj.dot(DA).tocoo() 223 | 224 | def normalize_adjSalton (adj): 225 | adj = sp.coo_matrix(adj) 226 | rowsum = np.array(adj.sum(1)) 227 | d_inv_sqrt = np.power(rowsum, -0.5).flatten() 228 | d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0. 
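# --- Illustrative aside (not part of process.py): what the normalizers above compute ---
# normalize_adj is the symmetric GCN normalization D^{-1/2} A D^{-1/2};
# normalize_adjCN is the common-neighbour count A @ A;
# normalize_adjRA weights each shared neighbour by 1/deg, i.e. A @ D^{-1} @ A;
# normalize_adjAA (Adamic-Adar) uses 1/log(deg) in place of 1/deg.
# A dense NumPy check on a toy triangle graph, assuming an unweighted adjacency:
import numpy as np

A = np.array([[0., 1., 1.],
              [1., 0., 1.],
              [1., 1., 0.]])
deg = A.sum(axis=1)
d_inv_sqrt = np.diag(deg ** -0.5)
gcn_norm = d_inv_sqrt @ A @ d_inv_sqrt            # dense analogue of normalize_adj(A)
common_neighbours = A @ A                         # dense analogue of normalize_adjCN(A)
resource_allocation = A @ np.diag(1.0 / deg) @ A  # dense analogue of normalize_adjRA(A)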
229 | d_mat_inv_sqrt = sp.diags(d_inv_sqrt) 230 | 231 | Dtemp = d_mat_inv_sqrt @ d_mat_inv_sqrt.T 232 | 233 | CNmat = adj @ adj 234 | result = CNmat @ Dtemp 235 | return result 236 | 237 | def mymaximum (A, B): 238 | BisBigger = A-B 239 | BisBigger.data = np.where(BisBigger.data <= 0, 1, 0) 240 | return A - A.multiply(BisBigger) + B.multiply(BisBigger) 241 | 242 | def myminimum(A,B): 243 | BisBigger = A-B 244 | BisBigger.data = np.where(BisBigger.data >= 0, 1, 0) 245 | return A - A.multiply(BisBigger) + B.multiply(BisBigger) 246 | 247 | 248 | def normalize_adjHDI(adj): 249 | adj = sp.coo_matrix(adj) 250 | 251 | rowsum = np.array(adj.sum(1)) 252 | 253 | deg_row = np.tile(rowsum, (1,adj.shape[0])) 254 | 255 | #deg_row = deg_row.T 256 | deg_row = sp.coo_matrix(deg_row) 257 | 258 | sim = adj.dot(adj) 259 | 260 | #y = sim.copy().tocsr() 261 | #y.data.fill(1) 262 | X = sim.astype(bool).astype(int) 263 | deg_row = deg_row.multiply(X) 264 | 265 | deg_row = mymaximum(deg_row, deg_row.T) 266 | 267 | sim = sim/deg_row 268 | #sim = sp.coo_matrix(sim) 269 | whereAreNan = np.isnan(sim) 270 | whereAreInf = np.isinf(sim) 271 | sim[whereAreNan] = 0 272 | sim[whereAreInf] = 0 273 | 274 | sim = sp.coo_matrix(sim) 275 | #print(sim[0]) 276 | return sim 277 | 278 | def normalize_adjHPI(adj): 279 | adj = sp.coo_matrix(adj) 280 | 281 | rowsum = np.array(adj.sum(1)) 282 | 283 | deg_row = np.tile(rowsum, (1,adj.shape[0])) 284 | 285 | #deg_row = deg_row.T 286 | deg_row = sp.coo_matrix(deg_row) 287 | 288 | sim = adj.dot(adj) 289 | 290 | #y = sim.copy().tocsr() 291 | #y.data.fill(1) 292 | X = sim.astype(bool).astype(int) 293 | deg_row = deg_row.multiply(X) 294 | 295 | deg_row = myminimum(deg_row, deg_row.T) 296 | 297 | sim = sim/deg_row 298 | #sim = sp.coo_matrix(sim) 299 | whereAreNan = np.isnan(sim) 300 | whereAreInf = np.isinf(sim) 301 | sim[whereAreNan] = 0 302 | sim[whereAreInf] = 0 303 | 304 | sim = sp.coo_matrix(sim) 305 | #print(sim[0]) 306 | return sim 307 | 308 | def normalize_adjJaccard(adj): 309 | adj = sp.coo_matrix(adj) 310 | rowsum = np.array(adj.sum(1)) 311 | deg_row = np.tile(rowsum, (1,adj.shape[0])) 312 | deg_row = sp.coo_matrix(deg_row) 313 | 314 | sim = adj.dot(adj) 315 | X = sim.astype(bool).astype(int) 316 | deg_row = deg_row.multiply(X) 317 | deg_row = sp.triu(deg_row, k=0) + sp.triu(deg_row.T,k=0) 318 | 319 | sim = sim/(deg_row.multiply(X)-sim) 320 | whereAreNan = np.isnan(sim) 321 | whereAreInf = np.isinf(sim) 322 | sim[whereAreNan] = 0 323 | sim[whereAreInf] = 0 324 | 325 | sim = sp.coo_matrix(sim) 326 | return sim 327 | 328 | def calc_A_hat(adj_matrix: sp.spmatrix) -> sp.spmatrix: 329 | nnodes = adj_matrix.shape[0] 330 | A = adj_matrix + sp.eye(nnodes) 331 | D_vec = np.sum(A, axis=1).A1 332 | D_vec_invsqrt_corr = 1 / np.sqrt(D_vec) 333 | D_invsqrt_corr = sp.diags(D_vec_invsqrt_corr) 334 | return D_invsqrt_corr @ A @ D_invsqrt_corr 335 | def calc_ppr_exact(adj_matrix: sp.spmatrix, alpha: float) -> np.ndarray: 336 | nnodes = adj_matrix.shape[0] 337 | M = calc_A_hat(adj_matrix) 338 | A_inner = sp.eye(nnodes) - (1 - alpha) * M 339 | return alpha * np.linalg.inv(A_inner.toarray()) 340 | 341 | def normalize_adjSorenson(adj): 342 | """Symmetrically normalize adjacency matrix.""" 343 | adj = sp.coo_matrix(adj) 344 | rowsum = np.array(adj.sum(1)) 345 | d_inv_sqrt = np.power(rowsum, -1).flatten() 346 | d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0. 
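# --- Illustrative aside (not part of process.py): calc_ppr_exact above in closed form ---
# calc_A_hat builds the self-loop-augmented symmetric normalization
#     A_hat = D^{-1/2} (A + I) D^{-1/2},
# and calc_ppr_exact returns the exact personalized-PageRank diffusion
#     Pi = alpha * (I - (1 - alpha) * A_hat)^{-1}.
# A toy dense check, assuming a small unweighted path graph:
import numpy as np
import scipy.sparse as sp

A = sp.csr_matrix(np.array([[0., 1., 0.],
                            [1., 0., 1.],
                            [0., 1., 0.]]))
alpha = 0.2
A_loop = (A + sp.eye(A.shape[0])).toarray()
d_inv_sqrt = np.diag(A_loop.sum(axis=1) ** -0.5)
A_hat = d_inv_sqrt @ A_loop @ d_inv_sqrt                               # dense calc_A_hat(A)
ppr = alpha * np.linalg.inv(np.eye(A.shape[0]) - (1 - alpha) * A_hat)  # dense calc_ppr_exact(A, alpha)
# compute_pprAdj in embed_train.py applies the same formula when building the 'diff' view for SDGI.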
347 | d_mat_inv_sqrt = sp.diags(d_inv_sqrt) 348 | sim = adj @ adj 349 | sim = sp.triu(sim, k=1) 350 | Dtemp = d_mat_inv_sqrt + d_mat_inv_sqrt.T 351 | 352 | Dtemp = sp.triu(Dtemp) 353 | 354 | return 2*sim.dot(Dtemp) 355 | 356 | def linCCALap( H1, H2, outdim_size,adj,gamma): 357 | """ 358 | An implementation of linear CCA 359 | # Arguments: 360 | H1 and H2: the matrices containing the data for view 1 and view 2. Each row is a sample. 361 | outdim_size: specifies the number of new features 362 | # Returns 363 | A and B: the linear transformation matrices 364 | mean1 and mean2: the means of data for both views 365 | """ 366 | L = normalize_adj(adj) 367 | r1 = 1e-4 368 | r2 = 1e-4 369 | 370 | m = H1.shape[0] 371 | o1 = H1.shape[1] 372 | o2 = H2.shape[1] 373 | 374 | m1 = np.mean(H1, axis=0) 375 | m2 = np.mean(H2, axis=0) 376 | H1bar = H1 - np.tile(m1, (m, 1)) 377 | H2bar = H2 - np.tile(m2, (m, 1)) 378 | 379 | SigmaHat12 = (1.0 / (m - 1)) * np.dot(H1bar.T, H2bar) 380 | SigmaHat11 = (1.0 / (m - 1)) * np.dot(H1bar.T, 381 | H1bar) + r1 * np.identity(o1) 382 | SigmaHat22 = (1.0 / (m - 1)) * np.dot(H2bar.T, 383 | H2bar) + r2 * np.identity(o2) 384 | 385 | [D1, V1] = np.linalg.eigh(SigmaHat11) 386 | [D2, V2] = np.linalg.eigh(SigmaHat22) 387 | SigmaHat11RootInv = np.dot( 388 | np.dot(V1, np.diag(D1 ** -0.5)), V1.T) 389 | SigmaHat22RootInv = np.dot( 390 | np.dot(V2, np.diag(D2 ** -0.5)), V2.T) 391 | 392 | T1 = np.dot(np.dot(SigmaHat11RootInv, 393 | SigmaHat12), SigmaHat22RootInv) 394 | regulTerm = np.dot(np.dot(H1bar.T, 395 | L), H2bar) 396 | regulTerm = gamma*regulTerm 397 | T2 = np.dot(np.dot(SigmaHat11RootInv, 398 | regulTerm), SigmaHat22RootInv) 399 | 400 | # Tval = np.dot(np.dot(SigmaHat11RootInv, 401 | # SigmaHat12), SigmaHat22RootInv) 402 | Tval = T1-T2 403 | 404 | [U, D, V] = np.linalg.svd(Tval) 405 | V = V.T 406 | w1 = np.dot(SigmaHat11RootInv, U[:, 0:outdim_size]) 407 | w2 = np.dot(SigmaHat22RootInv, V[:, 0:outdim_size]) 408 | D = D[0:outdim_size] 409 | return w1,w2,m1,m2,D 410 | 411 | 412 | def preprocess_adj(adj): 413 | """Preprocessing of adjacency matrix for simple GCN model and conversion to tuple representation.""" 414 | adj_normalized = normalize_adj(adj + sp.eye(adj.shape[0])) 415 | return sparse_to_tuple(adj_normalized) 416 | 417 | def sparse_mx_to_torch_sparse_tensor(sparse_mx): 418 | """Convert a scipy sparse matrix to a torch sparse tensor.""" 419 | sparse_mx = sparse_mx.tocoo().astype(np.float32) 420 | indices = torch.from_numpy( 421 | np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64)) 422 | values = torch.from_numpy(sparse_mx.data) 423 | shape = torch.Size(sparse_mx.shape) 424 | return torch.sparse.FloatTensor(indices, values, shape) 425 | -------------------------------------------------------------------------------- /embed_train.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import ast 4 | import logging 5 | import os 6 | 7 | #from gensim.models import Word2Vec 8 | #from gensim.models.word2vec import LineSentence 9 | from sklearn.preprocessing import scale 10 | from GAE.train_model import gae_model 11 | from OpenNE import gf, grarep, hope, lap, line, node2vec, sdne,RWR 12 | #from SVD.model import SVD_embedding 13 | #from struc2vec import struc2vec 14 | from utils import * 15 | from scipy.linalg import fractional_matrix_power, inv 16 | import numpy as np 17 | import scipy.sparse as sp 18 | #import hdf5storage as hd 19 | import torch 20 | import torch.nn as nn 21 | import networkx as nx 22 | import 
pandas as pd 23 | from DGI.models import DGI, LogReg 24 | from DGI.utils import process 25 | from scipy.io import loadmat 26 | 27 | 28 | #from utils import sparse_mx_to_torch_sparse_tensor 29 | #from dataset import load 30 | 31 | 32 | # Borrowed from https://github.com/PetarV-/DGI 33 | class GCN(nn.Module): 34 | def __init__(self, in_ft, out_ft, bias=True): 35 | super(GCN, self).__init__() 36 | self.fc = nn.Linear(in_ft, out_ft, bias=False) 37 | self.act = nn.PReLU() 38 | 39 | if bias: 40 | self.bias = nn.Parameter(torch.FloatTensor(out_ft)) 41 | self.bias.data.fill_(0.0) 42 | else: 43 | self.register_parameter('bias', None) 44 | 45 | for m in self.modules(): 46 | self.weights_init(m) 47 | 48 | def weights_init(self, m): 49 | if isinstance(m, nn.Linear): 50 | torch.nn.init.xavier_uniform_(m.weight.data) 51 | if m.bias is not None: 52 | m.bias.data.fill_(0.0) 53 | 54 | # Shape of seq: (batch, nodes, features) 55 | def forward(self, seq, adj, sparse=True): 56 | seq_fts = self.fc(seq) 57 | if sparse: 58 | out = torch.unsqueeze(torch.spmm(adj, torch.squeeze(seq_fts, 0)), 0) 59 | else: 60 | out = torch.bmm(adj, seq_fts) 61 | if self.bias is not None: 62 | out += self.bias 63 | return self.act(out) 64 | 65 | 66 | # Borrowed from https://github.com/PetarV-/DGI 67 | class Readout(nn.Module): 68 | def __init__(self): 69 | super(Readout, self).__init__() 70 | 71 | def forward(self, seq, msk): 72 | if msk is None: 73 | return torch.mean(seq, 1) 74 | else: 75 | msk = torch.unsqueeze(msk, -1) 76 | return torch.mean(seq * msk, 1) / torch.sum(msk) 77 | 78 | 79 | # Borrowed from https://github.com/PetarV-/DGI 80 | class Discriminator(nn.Module): 81 | def __init__(self, n_h): 82 | super(Discriminator, self).__init__() 83 | self.f_k = nn.Bilinear(n_h, n_h, 1) 84 | 85 | for m in self.modules(): 86 | self.weights_init(m) 87 | 88 | def weights_init(self, m): 89 | if isinstance(m, nn.Bilinear): 90 | torch.nn.init.xavier_uniform_(m.weight.data) 91 | if m.bias is not None: 92 | m.bias.data.fill_(0.0) 93 | 94 | def forward(self, c1, c2, h1, h2, h3, h4, s_bias1=None, s_bias2=None): 95 | c_x1 = torch.unsqueeze(c1, 1) 96 | c_x1 = c_x1.expand_as(h1).contiguous() 97 | c_x2 = torch.unsqueeze(c2, 1) 98 | c_x2 = c_x2.expand_as(h2).contiguous() 99 | 100 | # positive 101 | sc_1 = torch.squeeze(self.f_k(h2, c_x1), 2) 102 | sc_2 = torch.squeeze(self.f_k(h1, c_x2), 2) 103 | 104 | # negetive 105 | sc_3 = torch.squeeze(self.f_k(h4, c_x1), 2) 106 | sc_4 = torch.squeeze(self.f_k(h3, c_x2), 2) 107 | 108 | logits = torch.cat((sc_1, sc_2, sc_3, sc_4), 1) 109 | return logits 110 | 111 | 112 | class Model(nn.Module): 113 | def __init__(self, n_in, n_h): 114 | super(Model, self).__init__() 115 | self.gcn1 = GCN(n_in, n_h) 116 | self.gcn2 = GCN(n_in, n_h) 117 | self.read = Readout() 118 | 119 | self.sigm = nn.Sigmoid() 120 | 121 | self.disc = Discriminator(n_h) 122 | 123 | def forward(self, seq1, seq2, adj, diff, sparse, msk, samp_bias1, samp_bias2): 124 | h_1 = self.gcn1(seq1, adj, sparse) 125 | c_1 = self.read(h_1, msk) 126 | c_1 = self.sigm(c_1) 127 | 128 | h_2 = self.gcn2(seq1, diff, sparse) 129 | c_2 = self.read(h_2, msk) 130 | c_2 = self.sigm(c_2) 131 | 132 | h_3 = self.gcn1(seq2, adj, sparse) 133 | h_4 = self.gcn2(seq2, diff, sparse) 134 | 135 | ret = self.disc(c_1, c_2, h_1, h_2, h_3, h_4, samp_bias1, samp_bias2) 136 | 137 | return ret, h_1, h_2 138 | 139 | def embed(self, seq, adj, diff, sparse, msk): 140 | h_1 = self.gcn1(seq, adj, sparse) 141 | c = self.read(h_1, msk) 142 | 143 | h_2 = self.gcn2(seq, diff, sparse) 144 | return 
(h_1 + h_2).detach(), c.detach() 145 | 146 | 147 | class LogReg(nn.Module): 148 | def __init__(self, ft_in, nb_classes): 149 | super(LogReg, self).__init__() 150 | self.fc = nn.Linear(ft_in, nb_classes) 151 | self.sigm = nn.Sigmoid() 152 | 153 | for m in self.modules(): 154 | self.weights_init(m) 155 | 156 | def weights_init(self, m): 157 | if isinstance(m, nn.Linear): 158 | torch.nn.init.xavier_uniform_(m.weight.data) 159 | if m.bias is not None: 160 | m.bias.data.fill_(0.0) 161 | 162 | def forward(self, seq): 163 | ret = torch.log_softmax(self.fc(seq), dim=-1) 164 | return ret 165 | 166 | 167 | def compute_pprAdj(adj, alpha=0.2, self_loop=True): 168 | if self_loop: 169 | adj = adj + sp.eye(adj.shape[0]) 170 | adj = sp.coo_matrix(adj) 171 | rowsum = np.array(adj.sum(1)) 172 | d_inv_sqrt = np.power(rowsum, -0.5).flatten() 173 | d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0. 174 | d_mat_inv_sqrt = sp.diags(d_inv_sqrt) 175 | at = d_mat_inv_sqrt @ adj @ d_mat_inv_sqrt 176 | return alpha * inv((np.eye(adj.shape[0]) - (1 - alpha) * at)) # a(I_n-(1-a)A~)^-1 177 | 178 | def _scaleSimMat(A): 179 | """Scale rows of similarity matrix""" 180 | A = A - np.diag(np.diag(A)) 181 | A = A + np.diag(A.sum(axis=0) == 0) 182 | col = A.sum(axis=0) 183 | A = A.astype(np.float)/col[:, None] 184 | 185 | return A 186 | 187 | def PPMI_matrix(M): 188 | """ Compute Positive Pointwise Mutual Information Matrix""" 189 | M = _scaleSimMat(M) 190 | n = M.shape[0] 191 | col = np.asarray(M.sum(axis=0), dtype=float) 192 | col = col.reshape((1, n)) 193 | row = np.asarray(M.sum(axis=1), dtype=float) 194 | row = row.reshape((n, 1)) 195 | D = np.sum(col) 196 | 197 | np.seterr(all='ignore') 198 | PPMI = np.log(np.divide(D*M, np.dot(row, col))) 199 | PPMI[np.isnan(PPMI)] = 0 200 | PPMI[PPMI < 0] = 0 201 | 202 | def embedding_training(args, train_graph_filename): 203 | if args.method == 'struc2vec': 204 | g = read_for_struc2vec(train_graph_filename) 205 | elif args.method == 'GAE': 206 | if args.input == 'YeastAdj.mat': 207 | g = load_mat_data() 208 | else: 209 | g = read_for_gae(train_graph_filename) 210 | elif args.method == 'DGI': 211 | if args.input == 'YeastAdj.mat': 212 | g = load_mat_data() 213 | else: 214 | g = read_for_gae(train_graph_filename) 215 | elif args.method == 'SDGI': 216 | if args.input == 'YeastAdj.mat': 217 | g = load_mat_data() 218 | else: 219 | g = read_for_gae(train_graph_filename) 220 | elif args.method == 'SVD': 221 | g = read_for_SVD(train_graph_filename, weighted=args.weighted) 222 | else: 223 | if args.input == 'YeastAdj.mat': 224 | g = read_for_OpenNE_from_mat(args.input) 225 | else: 226 | g = read_for_OpenNE(train_graph_filename, weighted=args.weighted) 227 | 228 | _embedding_training(args, G_=g) 229 | 230 | return 231 | 232 | 233 | def load_mat_data(): 234 | ne = loadmat('YeastAdj.mat') 235 | ne = ne['adj'] 236 | G=nx.from_numpy_matrix(ne) 237 | node_list=list(G.nodes) 238 | adj = nx.adjacency_matrix(G, nodelist=node_list) 239 | print("Graph Loaded...") 240 | return (adj,node_list) 241 | 242 | 243 | def _embedding_training(args, G_=None): 244 | seed=args.seed 245 | 246 | if args.method == 'struc2vec': 247 | logging.basicConfig(filename='./src/bionev/struc2vec/struc2vec.log', filemode='w', level=logging.DEBUG, 248 | format='%(asctime)s %(message)s') 249 | if (args.OPT3): 250 | until_layer = args.until_layer 251 | else: 252 | until_layer = None 253 | 254 | G = struc2vec.Graph(G_, args.workers, untilLayer=until_layer) 255 | 256 | if (args.OPT1): 257 | G.preprocess_neighbors_with_bfs_compact() 258 | else: 259 | 
G.preprocess_neighbors_with_bfs() 260 | 261 | if (args.OPT2): 262 | G.create_vectors() 263 | G.calc_distances(compactDegree=args.OPT1) 264 | else: 265 | G.calc_distances_all_vertices(compactDegree=args.OPT1) 266 | 267 | print('create distances network..') 268 | G.create_distances_network() 269 | print('begin random walk...') 270 | G.preprocess_parameters_random_walk() 271 | 272 | G.simulate_walks(args.number_walks, args.walk_length) 273 | print('walk finished..\nLearning embeddings...') 274 | walks = LineSentence('random_walks.txt') 275 | model = Word2Vec(walks, size=args.dimensions, window=args.window_size, min_count=0, hs=1, sg=1, 276 | workers=args.workers, seed=seed) 277 | os.remove("random_walks.txt") 278 | model.wv.save_word2vec_format(args.output) 279 | elif args.method == 'GAE': 280 | if args.input == 'STRING-EXP.mat': 281 | model = gae_model(args) 282 | G, node_list = load_mat_data() 283 | model.train(G) 284 | # save embeddings 285 | model.save_embeddings(args.output, node_list) 286 | else: 287 | 288 | model = gae_model(args) 289 | G = G_[0] 290 | node_list = G_[1] 291 | model.train(G) 292 | # save embeddings 293 | model.save_embeddings(args.output, node_list) 294 | elif args.method == 'SDGI': 295 | nb_epochs = 200 296 | patience = 20 297 | lr = 0.001 298 | l2_coef = 0.0 299 | hid_units = 100 300 | sparse = False 301 | verbose=True 302 | alpha = 0.2 303 | 304 | adj = G_[0] 305 | #diff = alpha * inv((np.eye(adj.shape[0]) - (1 - alpha) * (adj + sp.eye(adj.shape[0])))) 306 | #diff = process.normalize_adj(adj + sp.eye(adj.shape[0])) 307 | #diff = diff.todense() 308 | 309 | 310 | #diff = compute_pprAdj(adj,alpha) 311 | node_list = G_[1] 312 | # datafile = 'expression_data.tsv' 313 | # normalize = True 314 | # df = pd.read_csv(datafile, sep='\t', header=0) 315 | # df.columns = [int(x[1:]) - 1 for x in df.columns] 316 | # if normalize==True: 317 | # df = df[node_list] 318 | # df = pd.DataFrame(scale(df, axis=0)) 319 | # t_data = df.T 320 | # features = t_data.to_numpy() 321 | 322 | features = sp.identity(adj.shape[0]) 323 | features, _ = process.preprocess_features(features) 324 | 325 | 326 | if args.embTech == 'DGI': 327 | adj = process.normalize_adj(adj + sp.eye(adj.shape[0])) 328 | elif args.embTech == 'CN': 329 | adj = process.normalize_adjCN(adj + sp.eye(adj.shape[0])) 330 | elif args.embTech == 'AA': 331 | adj = process.normalize_adjAA(adj + sp.eye(adj.shape[0])) 332 | elif args.embTech == 'Jaccard': 333 | adj = process.normalize_adjJaccard(adj + sp.eye(adj.shape[0])) 334 | elif args.embTech == 'RA': 335 | adj = process.normalize_adjRA(adj + sp.eye(adj.shape[0])) 336 | elif args.embTech == 'Adj-HDI': 337 | diff = process.normalize_adjHDI(adj + sp.eye(adj.shape[0])) 338 | diff = diff.todense() 339 | adj = process.normalize_adj(adj + sp.eye(adj.shape[0])) 340 | adj = adj.todense() 341 | elif args.embTech =='Adj-Adj': 342 | diff = process.normalize_adj(adj + sp.eye(adj.shape[0])) 343 | diff = diff.todense() 344 | adj = process.normalize_adj(adj + sp.eye(adj.shape[0])) 345 | adj = adj.todense() 346 | elif args.embTech == 'Salton-Salton': 347 | diff = process.normalize_adjSalton(adj + sp.eye(adj.shape[0])) 348 | diff = diff.todense() 349 | adj = process.normalize_adjSalton(adj + sp.eye(adj.shape[0])) 350 | adj = adj.todense() 351 | elif args.embTech == 'HDI-RA': 352 | diff = process.normalize_adjHDI(adj + sp.eye(adj.shape[0])) 353 | diff = diff.todense() 354 | adj = process.normalize_adjRA(adj + sp.eye(adj.shape[0])) 355 | adj = adj.todense() 356 | elif args.embTech == 'HDI-Rwr': 357 | 
diff = compute_pprAdj(adj,alpha) 358 | #diff = process.normalize_adjHDI(adj + sp.eye(adj.shape[0])) 359 | #diff = diff.todense() 360 | adj = process.normalize_adjHDI(adj + sp.eye(adj.shape[0])) 361 | adj = adj.todense() 362 | elif args.embTech == 'HPI': 363 | adj = process.normalize_adjHPI(adj + sp.eye(adj.shape[0])) 364 | elif args.embTech == 'Sorenson': 365 | adj = process.normalize_adjSorenson(adj + sp.eye(adj.shape[0])) 366 | elif args.embTech == 'Salton': 367 | adj = process.normalize_adjSalton(adj + sp.eye(adj.shape[0])) 368 | elif args.embTech == 'Adj-Rwr': 369 | diff = compute_pprAdj(adj,alpha) 370 | #diff = process.normalize_adjHDI(adj + sp.eye(adj.shape[0])) 371 | #diff = diff.todense() 372 | adj = process.normalize_adj(adj + sp.eye(adj.shape[0])) 373 | adj = adj.todense() 374 | elif args.embTech == 'Adj-Salton': 375 | #diff = compute_pprAdj(adj,alpha) 376 | diff = process.normalize_adjSalton(adj + sp.eye(adj.shape[0])) 377 | diff = diff.todense() 378 | adj = process.normalize_adj(adj + sp.eye(adj.shape[0])) 379 | adj = adj.todense() 380 | elif args.embTech == 'Adj-RA': 381 | #diff = compute_pprAdj(adj,alpha) 382 | diff = process.normalize_adjRA(adj + sp.eye(adj.shape[0])) 383 | diff = diff.todense() 384 | adj = process.normalize_adj(adj + sp.eye(adj.shape[0])) 385 | adj = adj.todense() 386 | elif args.embTech == 'Salton-Rwr': 387 | diff = compute_pprAdj(adj,alpha) 388 | 389 | adj = process.normalize_adjSalton(adj + sp.eye(adj.shape[0])) 390 | adj = adj.todense() 391 | elif args.embTech == 'Adj-HPI': 392 | diff = process.normalize_adjHPI(adj + sp.eye(adj.shape[0])) 393 | diff = diff.todense() 394 | 395 | adj = process.normalize_adjSalton(adj + sp.eye(adj.shape[0])) 396 | adj = adj.todense() 397 | else: 398 | print("No such embedding technique \n We are calling default DGI", args.embTech) 399 | adj = process.normalize_adj(adj + sp.eye(adj.shape[0])) 400 | #adj = adj.todense() 401 | ft_size = features.shape[1] 402 | print("Size of features", ft_size) 403 | #features.tocsr() 404 | #nb_classes = np.unique(labels).shape[0] 405 | #sparse = True 406 | sample_size = 2000 407 | batch_size = 4 408 | 409 | 410 | lbl_1 = torch.ones(batch_size, sample_size * 2) 411 | lbl_2 = torch.zeros(batch_size, sample_size * 2) 412 | lbl = torch.cat((lbl_1, lbl_2), 1) 413 | 414 | model = Model(ft_size, hid_units) 415 | optimiser = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=l2_coef) 416 | 417 | 418 | b_xent = nn.BCEWithLogitsLoss() 419 | xent = nn.CrossEntropyLoss() 420 | cnt_wait = 0 421 | best = 1e9 422 | best_t = 0 423 | 424 | for epoch in range(nb_epochs): 425 | 426 | idx = np.random.randint(0, adj.shape[-1] - sample_size + 1, batch_size) 427 | ba, bd, bf = [], [], [] 428 | for i in idx: 429 | ba.append(adj[i: i + sample_size, i: i + sample_size]) 430 | bd.append(diff[i: i + sample_size, i: i + sample_size]) 431 | bf.append(features[i: i + sample_size]) 432 | 433 | ba = np.array(ba).reshape(batch_size, sample_size, sample_size) 434 | bd = np.array(bd).reshape(batch_size, sample_size, sample_size) 435 | bf = np.array(bf).reshape(batch_size, sample_size, ft_size) 436 | 437 | if sparse: 438 | ba = sparse_mx_to_torch_sparse_tensor(sp.coo_matrix(ba)) 439 | bd = sparse_mx_to_torch_sparse_tensor(sp.coo_matrix(bd)) 440 | else: 441 | ba = torch.FloatTensor(ba) 442 | bd = torch.FloatTensor(bd) 443 | 444 | bf = torch.FloatTensor(bf) 445 | idx = np.random.permutation(sample_size) 446 | shuf_fts = bf[:, idx, :] 447 | 448 | if torch.cuda.is_available(): 449 | bf = bf.cuda() 450 | ba = ba.cuda() 451 | 
bd = bd.cuda() 452 | shuf_fts = shuf_fts.cuda() 453 | 454 | model.train() 455 | optimiser.zero_grad() 456 | 457 | logits, __, __ = model(bf, shuf_fts, ba, bd, sparse, None, None, None) 458 | 459 | loss = b_xent(logits, lbl) 460 | 461 | loss.backward() 462 | optimiser.step() 463 | 464 | if verbose: 465 | print('Epoch: {0}, Loss: {1:0.4f}'.format(epoch, loss.item())) 466 | 467 | if loss < best: 468 | best = loss 469 | best_t = epoch 470 | cnt_wait = 0 471 | torch.save(model.state_dict(), 'model.pkl') 472 | else: 473 | cnt_wait += 1 474 | 475 | if cnt_wait == patience: 476 | if verbose: 477 | print('Early stopping!') 478 | break 479 | 480 | if verbose: 481 | print('Loading {}th epoch'.format(best_t)) 482 | model.load_state_dict(torch.load('model.pkl')) 483 | 484 | if sparse: 485 | adj = sparse_mx_to_torch_sparse_tensor(sp.coo_matrix(adj)) 486 | diff = sparse_mx_to_torch_sparse_tensor(sp.coo_matrix(diff)) 487 | 488 | features = torch.FloatTensor(features[np.newaxis]) 489 | adj = torch.FloatTensor(adj[np.newaxis]) 490 | diff = torch.FloatTensor(diff[np.newaxis]) 491 | #features = features.cuda() 492 | #adj = adj.cuda() 493 | #diff = diff.cuda() 494 | 495 | embeds, _ = model.embed(features, adj, diff, sparse, None) 496 | output = args.output 497 | TenToNum = embeds.numpy() 498 | newembeds = TenToNum[0] 499 | 500 | fout = open(output, 'w') 501 | fout.write("{} {}\n".format(newembeds.shape[0], newembeds.shape[1])) 502 | for idx in range(newembeds.shape[0]): 503 | fout.write("{} {}\n".format(node_list[idx], ' '.join([str(x) for x in newembeds[idx, :]]))) 504 | fout.close() 505 | 506 | elif args.method == 'DGI': 507 | # training params for DGI 508 | batch_size = 1 509 | nb_epochs = args.epochs 510 | patience = 20 511 | lr = 0.001 512 | l2_coef = 0.0 513 | drop_prob = 0.0 514 | hid_units = 100 515 | sparse = True #Small datasets make it True 516 | nonlinearity = 'prelu' # special name to separate parameters 517 | adj = G_[0] 518 | node_list = G_[1] 519 | features = sp.identity(adj.shape[0]) 520 | # datafile = 'expression_data.tsv' 521 | # normalize = True 522 | # df = pd.read_csv(datafile, sep='\t', header=0) 523 | # df.columns = [int(x[1:]) - 1 for x in df.columns] 524 | # if normalize==True: 525 | # df = pd.DataFrame(scale(df, axis=0)) 526 | # t_data = df.T 527 | # features = t_data.to_numpy() 528 | # features = features[[node_list],:] 529 | # #features = features.T 530 | # #features = sp.diags(pr) 531 | 532 | """ RWR features 3 steps """ 533 | #features = myrwr(adj, 0.15,3) 534 | 535 | features, _ = process.preprocess_features(features) 536 | 537 | nb_nodes = features.shape[0] 538 | ft_size = features.shape[1] 539 | 540 | #matlabData = hd.loadmat('CTD_DDA_HDI.mat') 541 | #adj = matlabData['adj'] 542 | 543 | 544 | """ For large file use implementation in Python""" 545 | #adj = process.normalize_adj(adj + sp.eye(adj.shape[0])) 546 | #adj = process.calc_ppr_exact(adj, 0.15) 547 | #adj = myrwr(adj + sp.eye(adj.shape[0]), 0.15, 10) 548 | if args.embTech == 'DGI': 549 | adj = process.normalize_adj(adj + sp.eye(adj.shape[0])) 550 | elif args.embTech == 'CN': 551 | adj = process.normalize_adjCN(adj + sp.eye(adj.shape[0])) 552 | elif args.embTech == 'AA': 553 | adj = process.normalize_adjAA(adj + sp.eye(adj.shape[0])) 554 | elif args.embTech == 'Jaccard': 555 | adj = process.normalize_adjJaccard(adj + sp.eye(adj.shape[0])) 556 | elif args.embTech == 'RA': 557 | adj = process.normalize_adjRA(adj + sp.eye(adj.shape[0])) 558 | elif args.embTech == 'HDI': 559 | adj = process.normalize_adjHDI(adj + 
sp.eye(adj.shape[0])) 560 | elif args.embTech == 'HPI': 561 | adj = process.normalize_adjHPI(adj + sp.eye(adj.shape[0])) 562 | elif args.embTech == 'Sorenson': 563 | adj = process.normalize_adjSorenson(adj + sp.eye(adj.shape[0])) 564 | elif args.embTech == 'Salton': 565 | adj = process.normalize_adjSalton(adj + sp.eye(adj.shape[0])) 566 | else: 567 | print("No such embedding technique \n We are calling default DGI", args.embTech) 568 | adj = process.normalize_adj(adj + sp.eye(adj.shape[0])) 569 | 570 | 571 | if sparse: 572 | sp_adj = process.sparse_mx_to_torch_sparse_tensor(adj) 573 | else: 574 | adj = (adj + sp.eye(adj.shape[0])).todense() 575 | 576 | features = torch.FloatTensor(features[np.newaxis]) 577 | if not sparse: 578 | adj = torch.FloatTensor(adj[np.newaxis]) 579 | 580 | 581 | 582 | 583 | model = DGI(ft_size, hid_units, nonlinearity) 584 | optimiser = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=l2_coef) 585 | 586 | if torch.cuda.is_available(): 587 | print('Using CUDA') 588 | model.cuda() 589 | features = features.cuda() 590 | if sparse: 591 | sp_adj = sp_adj.cuda() 592 | else: 593 | adj = adj.cuda() 594 | labels = labels.cuda() 595 | idx_train = idx_train.cuda() 596 | idx_val = idx_val.cuda() 597 | idx_test = idx_test.cuda() 598 | 599 | b_xent = nn.BCEWithLogitsLoss() 600 | xent = nn.CrossEntropyLoss() 601 | cnt_wait = 0 602 | best = 1e9 603 | best_t = 0 604 | 605 | for epoch in range(nb_epochs): 606 | model.train() 607 | optimiser.zero_grad() 608 | 609 | idx = np.random.permutation(nb_nodes) 610 | shuf_fts = features[:, idx, :] 611 | 612 | lbl_1 = torch.ones(batch_size, nb_nodes) 613 | lbl_2 = torch.zeros(batch_size, nb_nodes) 614 | lbl = torch.cat((lbl_1, lbl_2), 1) 615 | 616 | if torch.cuda.is_available(): 617 | shuf_fts = shuf_fts.cuda() 618 | lbl = lbl.cuda() 619 | 620 | logits = model(features, shuf_fts, sp_adj if sparse else adj, sparse, None, None, None) 621 | 622 | loss = b_xent(logits, lbl) 623 | 624 | #print('Loss:', loss) 625 | 626 | if loss < best: 627 | best = loss 628 | best_t = epoch 629 | cnt_wait = 0 630 | torch.save(model.state_dict(), 'best_dgi.pkl') 631 | else: 632 | cnt_wait += 1 633 | 634 | if cnt_wait == patience: 635 | print('Early stopping!') 636 | break 637 | 638 | loss.backward() 639 | optimiser.step() 640 | 641 | print('Loading {}th epoch'.format(best_t)) 642 | model.load_state_dict(torch.load('best_dgi.pkl')) 643 | 644 | embeds, _ = model.embed(features, sp_adj if sparse else adj, sparse, None) 645 | 646 | output = args.output 647 | TenToNum = embeds.numpy() 648 | newembeds = TenToNum[0] 649 | 650 | fout = open(output, 'w') 651 | fout.write("{} {}\n".format(newembeds.shape[0], newembeds.shape[1])) 652 | for idx in range(newembeds.shape[0]): 653 | fout.write("{} {}\n".format(node_list[idx], ' '.join([str(x) for x in newembeds[idx, :]]))) 654 | fout.close() 655 | 656 | elif args.method == 'SVD': 657 | SVD_embedding(G_, args.output, size=args.dimensions) 658 | else: 659 | if args.method == 'Laplacian': 660 | model = lap.LaplacianEigenmaps(G_, rep_size=args.dimensions) 661 | elif args.method == 'RWR': 662 | model = RWR.RWR(G_, rep_size=100) 663 | 664 | elif args.method == 'GF': 665 | model = gf.GraphFactorization(G_, rep_size=args.dimensions, 666 | epoch=args.epochs, learning_rate=args.lr, weight_decay=args.weight_decay) 667 | 668 | elif args.method == 'HOPE': 669 | model = hope.HOPE(graph=G_, d=args.dimensions) 670 | 671 | elif args.method == 'GraRep': 672 | model = grarep.GraRep(graph=G_, Kstep=args.kstep, dim=args.dimensions) 673 | 674 | 
elif args.method == 'DeepWalk': 675 | model = node2vec.Node2vec(graph=G_, path_length=args.walk_length, 676 | num_paths=args.number_walks, dim=args.dimensions, 677 | workers=args.workers, window=args.window_size, dw=True) 678 | 679 | elif args.method == 'node2vec': 680 | model = node2vec.Node2vec(graph=G_, path_length=args.walk_length, 681 | num_paths=args.number_walks, dim=args.dimensions, 682 | workers=args.workers, p=args.p, q=args.q, window=args.window_size) 683 | 684 | elif args.method == 'LINE': 685 | model = line.LINE(G_, epoch=args.epochs, 686 | rep_size=args.dimensions, order=args.order) 687 | 688 | elif args.method == 'SDNE': 689 | encoder_layer_list = ast.literal_eval(args.encoder_list) 690 | model = sdne.SDNE(G_, encoder_layer_list=encoder_layer_list, 691 | alpha=args.alpha, beta=args.beta, nu1=args.nu1, nu2=args.nu2, 692 | batch_size=args.bs, epoch=args.epochs, learning_rate=args.lr) 693 | else: 694 | raise ValueError(f'Invalid method: {args.method}') 695 | 696 | print("Saving embeddings...") 697 | model.save_embeddings(args.output) 698 | 699 | return 700 | --------------------------------------------------------------------------------
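Every training branch above writes embeddings in the same plain-text layout: a header line with the node count and dimensionality, followed by one whitespace-separated row per node ("node_id v1 ... vd"), as seen in the save_embeddings methods and in the fout.write loops of the DGI/SDGI branches. A minimal reader for that layout is sketched below; load_embeddings is a hypothetical helper for illustration, not a function in this repository.

import numpy as np

def load_embeddings(path):
    """Read a 'node_count dim' header, then 'node_id v1 ... vd' rows, into a dict."""
    vectors = {}
    with open(path) as fin:
        node_num, dim = map(int, fin.readline().split())
        for line in fin:
            parts = line.strip().split()
            if not parts:
                continue
            vectors[parts[0]] = np.asarray(parts[1:], dtype=float)
    assert len(vectors) == node_num
    assert all(vec.shape[0] == dim for vec in vectors.values())
    return vectors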