├── .DS_Store ├── .gitattributes ├── __pycache__ ├── model.cpython-37.pyc ├── utils.cpython-37.pyc ├── run_model.cpython-37.pyc ├── dataLoader.cpython-37.pyc └── logisticRegression.cpython-37.pyc ├── run_model.sh ├── .idea ├── vcs.xml ├── .gitignore ├── inspectionProfiles │ └── profiles_settings.xml ├── misc.xml ├── modules.xml └── hierarchyClusteringTuning.iml ├── run_model.py ├── README.md ├── model.py ├── logisticRegression.py ├── utils.py └── dataLoader.py /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Todo/GraphHop/master/.DS_Store -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /__pycache__/model.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Todo/GraphHop/master/__pycache__/model.cpython-37.pyc -------------------------------------------------------------------------------- /__pycache__/utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Todo/GraphHop/master/__pycache__/utils.cpython-37.pyc -------------------------------------------------------------------------------- /__pycache__/run_model.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Todo/GraphHop/master/__pycache__/run_model.cpython-37.pyc -------------------------------------------------------------------------------- /__pycache__/dataLoader.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Todo/GraphHop/master/__pycache__/dataLoader.cpython-37.pyc -------------------------------------------------------------------------------- /run_model.sh: -------------------------------------------------------------------------------- 1 | python run_model.py --dataset cora --num_per_class 20 --batch_prop 512 --temperature .1 --alpha 10 --beta 1 2 | -------------------------------------------------------------------------------- /__pycache__/logisticRegression.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Todo/GraphHop/master/__pycache__/logisticRegression.cpython-37.pyc -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /shelf/ 3 | /workspace.xml 4 | # Datasource local storage ignored files 5 | /dataSources/ 6 | /dataSources.local.xml 7 | # Editor-based HTTP Client requests 8 | /httpRequests/ 9 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- 
/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/hierarchyClusteringTuning.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /run_model.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import model  # importing model executes model.py, which runs the full GraphHop pipeline and reads the parsed hyperparameters back from this module 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("--dataset", default='cora', type=str, help='select datasets.') 7 | parser.add_argument("--num_per_class", default=20, type=int, help='select number of labeled examples per class.') 8 | parser.add_argument("--batch_prop", default=512, type=int, help='select batch number') 9 | parser.add_argument("--temperature", default=.1, type=float, help='select temperature') 10 | parser.add_argument("--alpha", default=1., type=float, help='select alpha') 11 | parser.add_argument("--beta", default=1., type=float, help='select beta') 12 | parser.add_argument("--W1", default=.5, type=float) 13 | parser.add_argument("--W2", default=.5, type=float) 14 | args = parser.parse_args() 15 | 16 | DATASET = args.dataset 17 | TEMPERATURE = float(args.temperature) 18 | ALPHA = float(args.alpha) 19 | BETA = float(args.beta) 20 | W1 = float(args.W1) 21 | W2 = float(args.W2) 22 | NUM_PER_CLASS = int(args.num_per_class) 23 | BATCH_PROP = int(args.batch_prop) 24 | 25 | model  # no-op reference; the pipeline already ran when model was imported above 26 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # GraphHop: An Enhanced Label Propagation Method for Node Classification 2 | 3 | This repository contains the PyTorch implementation of GraphHop for semi-supervised node classification on graphs, as described in our paper: 4 | 5 | Tian Xie, Bin Wang, C.-C. Jay Kuo, GraphHop: An Enhanced Label Propagation Method for Node Classification. [[paper]](https://arxiv.org/abs/2101.02326) 6 | 7 | 8 | ## Dependencies 9 | * torch == 1.5.0 10 | * numpy == 1.18.1 11 | * scipy == 1.4.1 12 | * [pytorch geometric](https://pytorch-geometric.readthedocs.io/en/latest/notes/installation.html) 13 | 14 | ## RUN 15 | ``` 16 | sh run_model.sh 17 | ``` 18 | You may change the hyperparameters inside the shell script. 19 | 20 | ## Citation 21 | If you use this code for your research, please cite our paper. 22 | 23 | ``` 24 | @ARTICLE{9737682, 25 | author={Xie, Tian and Wang, Bin and Kuo, C.-C. 
Jay}, 26 | journal={IEEE Transactions on Neural Networks and Learning Systems}, 27 | title={GraphHop: An Enhanced Label Propagation Method for Node Classification}, 28 | year={2022}, 29 | volume={}, 30 | number={}, 31 | pages={1-15}, 32 | doi={10.1109/TNNLS.2022.3157746}} 33 | ``` 34 | -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | from logisticRegression import fit 5 | from datetime import datetime 6 | from dataLoader import load_planetoid_datasets 7 | from utils import random_walk_normalize, pure_k_hops, sparse_mx_to_torch_sparse_tensor, accuracy 8 | 9 | from run_model import W1, W2, DATASET, NUM_PER_CLASS 10 | 11 | if torch.cuda.is_available(): 12 | print("Using CUDA.") 13 | 14 | dataset = DATASET 15 | 16 | date = datetime.now() 17 | 18 | num_labels_per_class = NUM_PER_CLASS 19 | if dataset in ['cora', 'citeseer', 'pubmed']: 20 | feat, one_hot_labels, adj, y_train, y_val, y_test, train_mask, val_mask, test_mask = load_planetoid_datasets( 21 | dataset, num_labels_per_class) 22 | else: 23 | assert False, "Choose dataset from Cora, CiteSeer, or PubMed." 24 | 25 | labels = np.where(one_hot_labels == 1)[1] 26 | 27 | # k hops 28 | one_hops_adj = pure_k_hops(adj, 1) 29 | two_hops_adj = pure_k_hops(adj, 2) 30 | 31 | one_hops_adj = random_walk_normalize(one_hops_adj) 32 | two_hops_adj = random_walk_normalize(two_hops_adj) 33 | 34 | new_feat = feat 35 | 36 | pseudo_labels = np.zeros(one_hot_labels.shape) 37 | pseudo_labels[train_mask] = one_hot_labels[train_mask] 38 | y_val = one_hot_labels[val_mask] 39 | 40 | epoch = 100 41 | 42 | output = [] 43 | prev_model = [None, None] 44 | num_perturb = 0 45 | test_scores_record = [] 46 | 47 | new_feat = torch.FloatTensor(new_feat) 48 | one_hops_adj = sparse_mx_to_torch_sparse_tensor(one_hops_adj) 49 | two_hops_adj = sparse_mx_to_torch_sparse_tensor(two_hops_adj) 50 | y_val = torch.FloatTensor(y_val) 51 | pseudo_labels = torch.FloatTensor(pseudo_labels) 52 | one_hot_labels = torch.FloatTensor(one_hot_labels) 53 | 54 | if torch.cuda.is_available(): 55 | new_feat = new_feat.cuda() 56 | one_hops_adj = one_hops_adj.cuda() 57 | two_hops_adj = two_hops_adj.cuda() 58 | pseudo_labels = pseudo_labels.cuda() 59 | one_hot_labels = one_hot_labels.cuda() 60 | 61 | ave_acc = [] 62 | for i in range(epoch + 1): 63 | one_agg_feat = torch.spmm(one_hops_adj, new_feat) 64 | two_agg_feat = torch.spmm(two_hops_adj, new_feat) 65 | 66 | one_new_feat = torch.cat((new_feat, one_agg_feat), dim=1) 67 | two_new_feat = torch.cat((new_feat, one_agg_feat, two_agg_feat), dim=1) 68 | 69 | X_1 = one_new_feat 70 | y_1 = pseudo_labels 71 | X_2 = two_new_feat 72 | y_2 = pseudo_labels 73 | 74 | if torch.cuda.is_available(): 75 | X_1 = X_1.cuda() 76 | y_1 = y_1.cuda() 77 | X_2 = X_2.cuda() 78 | y_2 = y_2.cuda() 79 | y_val = y_val.cuda() 80 | 81 | clf_1 = fit(i, X_1, y_1, train_mask, val_mask, y_val, prev_model[0]) 82 | clf_2 = fit(i, X_2, y_2, train_mask, val_mask, y_val, prev_model[1]) 83 | 84 | prev_model[0] = clf_1 85 | prev_model[1] = clf_2 86 | 87 | pseudo_labels = W1 * clf_1.predict_temp_soft_labels(X_1).detach() + (1. - W1) * clf_2.predict_temp_soft_labels( 88 | X_2).detach() 89 | pseudo_labels[train_mask] = one_hot_labels[train_mask] 90 | 91 | new_feat = W2 * clf_1.predict_soft_labels(X_1).detach() + (1. 
- W2) * clf_2.predict_soft_labels(X_2).detach() 92 | 93 | # model evaluation 94 | clf_1.eval() 95 | clf_2.eval() 96 | 97 | y_train = one_hot_labels[train_mask] 98 | y_test = one_hot_labels[test_mask] 99 | if torch.cuda.is_available(): 100 | y_train = y_train.cuda() 101 | y_test = y_test.cuda() 102 | 103 | train_score = accuracy(new_feat[train_mask], y_train) 104 | test_score = accuracy(new_feat[test_mask], y_test) 105 | print('Iteration {}, train accuracy: {:.4f}, test accuracy: {:.4f}'.format(i, train_score, test_score)) 106 | ave_acc.append(test_score.item()) 107 | -------------------------------------------------------------------------------- /logisticRegression.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | import torch.optim as optim 4 | import numpy as np 5 | import copy 6 | 7 | from torch.nn import Linear 8 | from run_model import TEMPERATURE, ALPHA, BETA, BATCH_PROP 9 | 10 | learning_rate = 0.01 11 | weight_decay = 5e-4 12 | epoch = 1000 13 | early_stopping = 10 14 | 15 | temperature = TEMPERATURE 16 | alpha = ALPHA 17 | beta = BETA 18 | batch_prop = BATCH_PROP 19 | 20 | 21 | class LogisticRegression(torch.nn.Module): 22 | def __init__(self, num_feat, num_classes): 23 | super(LogisticRegression, self).__init__() 24 | self.linear = Linear(num_feat, num_classes) 25 | 26 | def forward(self, feat): 27 | return F.log_softmax(self.linear(feat), dim=1) 28 | 29 | def predict_soft_labels(self, feat): 30 | return F.softmax(self.linear(feat), dim=1) 31 | 32 | def predict_temp_soft_labels(self, feat): 33 | return F.softmax(self.linear(feat) / temperature, dim=1) 34 | 35 | def score(self, feat, labels): 36 | y_prob = F.softmax(self.linear(feat), dim=1) 37 | _accuracy = accuracy(y_prob, labels) 38 | return _accuracy 39 | 40 | 41 | def fit(step, feat, labels, train_mask, val_mask, y_val, prev_model): 42 | num_feat = feat.shape[1] 43 | num_classes = labels.shape[1] 44 | pseudo_mask = torch.zeros(train_mask.shape[0], dtype=torch.bool) 45 | pseudo_mask[train_mask == False] = True 46 | 47 | X_train = feat[train_mask] 48 | y_train = labels[train_mask] 49 | X_pseudo = feat[pseudo_mask] 50 | y_pseudo = labels[pseudo_mask] 51 | 52 | X_val = feat[val_mask] 53 | y_val = y_val 54 | 55 | new_batch_prop = float(batch_prop / feat.shape[0]) 56 | 57 | if step <= 1: 58 | model = LogisticRegression(num_feat, num_classes) 59 | if torch.cuda.is_available(): 60 | model.cuda() 61 | else: 62 | model = prev_model 63 | optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay) 64 | model.train() 65 | count = 0 66 | best_model = copy.deepcopy(model) 67 | prev_loss_val = np.inf 68 | for i in range(epoch): 69 | for j in range(0, int(1. 
/ new_batch_prop) + 1): 70 | optimizer.zero_grad() 71 | y_train_batch = y_train[ 72 | int(y_train.shape[0] * j * new_batch_prop):int(y_train.shape[0] * (j + 1) * new_batch_prop)] 73 | if y_train_batch.shape[0] == 0: 74 | break 75 | y_pseudo_batch = y_pseudo[ 76 | int(y_pseudo.shape[0] * j * new_batch_prop):int( 77 | y_pseudo.shape[0] * (j + 1) * new_batch_prop)] 78 | X_train_batch = X_train[ 79 | int(X_train.shape[0] * j * new_batch_prop):int(X_train.shape[0] * (j + 1) * new_batch_prop)] 80 | X_pseudo_batch = X_pseudo[ 81 | int(X_pseudo.shape[0] * j * new_batch_prop):int( 82 | X_pseudo.shape[0] * (j + 1) * new_batch_prop)] 83 | y_train_log_prob = model.forward(X_train_batch) 84 | y_pseudo_log_prob = model.forward(X_pseudo_batch) 85 | num_train = y_train_batch.shape[0] 86 | num_pseudo = y_pseudo_batch.shape[0] 87 | if step == 0: 88 | entropy_train = (y_train_batch * y_train_log_prob).sum() 89 | loss_train = -1.0 * entropy_train 90 | else: 91 | entropy_train = (y_train_batch * y_train_log_prob).sum() / num_train + alpha * ( 92 | y_pseudo_batch * y_pseudo_log_prob).sum() / (num_pseudo * num_classes) \ 93 | + beta * (torch.exp(y_pseudo_log_prob) * y_pseudo_log_prob).sum() / ( 94 | num_pseudo * num_classes) 95 | loss_train = -1.0 * entropy_train 96 | 97 | loss_train.backward() 98 | optimizer.step() 99 | 100 | if count == 0: 101 | best_model = copy.deepcopy(model) 102 | y_log_prob_val = model.forward(X_val) 103 | entropy_val = y_val * y_log_prob_val 104 | loss_val = -1.0 * entropy_val.sum() 105 | accuracy_val = accuracy(y_log_prob_val, y_val) 106 | if loss_val - prev_loss_val > 0 or prev_loss_val - loss_val < 1e-2: 107 | count += 1 108 | else: 109 | count = 0 110 | if count == early_stopping: 111 | break 112 | prev_loss_val = loss_val 113 | # print("epoch: {}, train loss: {:.4f}, train accuracy: {:.4f}, validation loss: {:.4f}, " 114 | # "validation accuracy: {:.4f}".format(i, loss_train, accuracy_train, loss_val, accuracy_val)) 115 | # f1.write(str(accuracy_val.item()) + ',\n') 116 | return best_model 117 | 118 | 119 | def accuracy(output, labels): 120 | labels = labels.max(1)[1] 121 | preds = output.max(1)[1].type_as(labels) 122 | correct = preds.eq(labels).double() 123 | correct = correct.sum() 124 | return correct / len(labels) 125 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.sparse as sp 3 | import torch 4 | from scipy.spatial.distance import cosine, euclidean 5 | 6 | from sklearn.metrics.pairwise import euclidean_distances, cosine_similarity 7 | 8 | # select samples that are farthest to the center 9 | from dataLoader import load_preprocess_data 10 | 11 | 12 | def maxCover(data, ratio): 13 | center = np.mean(data, axis=0).reshape(1, -1) 14 | centers = np.tile(center, (center.shape[0], 1)) 15 | distance = np.linalg.norm(data - centers, axis=1) 16 | sort_distance = np.argsort(distance)[::-1] 17 | return sort_distance[:int(len(sort_distance) * ratio)] 18 | 19 | 20 | # select the top confidence 21 | def topConfidence(data, ratio): 22 | row_max = np.max(data, axis=1) 23 | index = np.argsort(row_max)[::-1] 24 | return index[:int(len(index) * ratio)] 25 | 26 | 27 | # assign each nodes to clusters 28 | def nodesAssign(data, nodes_index, clusters): 29 | means = [] 30 | for c in clusters: 31 | means.append(np.mean(data[c], axis=0)) 32 | means = np.array(means) 33 | nodes = data[nodes_index] 34 | for i in range(len(nodes)): 35 | 
distance = np.linalg.norm(means - nodes[i], axis=1) 36 | min_c = np.argsort(distance)[0] 37 | clusters[min_c] = np.append(clusters[min_c], i) 38 | return clusters 39 | 40 | 41 | # calculate the weight matrix 42 | # w_ij = e^{d_ij / d_min} / sum_j(e^{d_ij / d_min}) 43 | # d_ij = cosine distance of i and j -- 1 - cos(a) 44 | def edge_weight(data, adj, type='euclidean'): 45 | weight_matrix = np.zeros(adj.shape) 46 | diagonal = sp.diags(adj.diagonal()) 47 | adj = adj - diagonal 48 | for i in range(data.shape[0]): 49 | adj_nodes_index = (adj[i].toarray() != 0).squeeze() 50 | adj_nodes = data[adj_nodes_index] 51 | 52 | # citeseer dataset has some isolated nodes 53 | if len(adj_nodes) == 0: 54 | continue 55 | assert (type in ['euclidean', 'cosine']) 56 | if type == 'euclidean': 57 | distance = np.array([euclidean(data[i], adj_nodes[j]) for j in range(adj_nodes.shape[0])]) 58 | elif type == 'cosine': 59 | distance = np.array([cosine(data[i], adj_nodes[j]) for j in range(adj_nodes.shape[0])]) 60 | 61 | # # there are some nodes have exactly same features. 62 | # if min(distance) == 0: 63 | # index = np.where(distance == 0) 64 | # index = np.arange(adj.shape[0])[adj_nodes_index][index] 65 | # weight_matrix[i][index] = 1 66 | # continue 67 | weight = np.exp(distance) 68 | weight = weight / np.sum(weight) 69 | weight_matrix[i][adj_nodes_index] = weight 70 | # weight_matrix += sp.eye(adj.shape[0]) 71 | return weight_matrix 72 | 73 | 74 | # calculate the feature similarity matrix 75 | def one_shot_edge_weight(data, type='euclidean'): 76 | assert (type in ['euclidean', 'cosine']) 77 | if type == 'euclidean': 78 | distance = euclidean_distances(data) 79 | elif type == 'cosine': 80 | distance = cosine_similarity(data) 81 | return distance 82 | 83 | 84 | # A = D^{-1/2} * A * D^{-1/2} 85 | def normalize(adj): 86 | adj = adj + sp.eye(adj.shape[0]) # add self-loop 87 | row_sum = np.array(adj.sum(1)) 88 | r_inv = np.power(row_sum, -0.5).flatten() 89 | r_mat_inv = sp.diags(r_inv) 90 | norm_adj = r_mat_inv.dot(adj) 91 | norm_adj = norm_adj.dot(r_mat_inv) 92 | return norm_adj 93 | 94 | 95 | def random_walk_normalize(adj): 96 | # adj = adj + sp.eye(adj.shape[0]) # add self-loop 97 | row_sum = np.array(adj.sum(1)).astype('float') 98 | r_inv = np.power(row_sum, -1).flatten() 99 | r_inv[r_inv == float('inf')] = 0 100 | r_mat_inv = sp.diags(r_inv) 101 | norm_adj = r_mat_inv.dot(adj) 102 | return norm_adj 103 | 104 | 105 | # sparse adjacency matrix 106 | def multiHops(adj, k): 107 | multi_adj = adj 108 | for i in range(k - 1): 109 | multi_adj = multi_adj.dot(adj) 110 | return multi_adj 111 | 112 | 113 | def pure_k_hops(adj, k): 114 | multi_adj = adj 115 | pre_multi_adj = [adj] 116 | for i in range(k - 1): 117 | multi_adj = multi_adj.dot(adj) 118 | multi_adj = multi_adj - sp.diags(multi_adj.diagonal()) 119 | multi_adj = multi_adj.tolil() 120 | for m in pre_multi_adj: 121 | multi_adj[m.nonzero()] = 0 122 | multi_adj = multi_adj.tocsr() 123 | pre_multi_adj.append(multi_adj) 124 | return multi_adj 125 | 126 | 127 | # calculate the prediction accuracy 128 | def accuracy(output, labels): 129 | labels = labels.max(1)[1] 130 | preds = output.max(1)[1].type_as(labels) 131 | correct = preds.eq(labels).double() 132 | correct = correct.sum() 133 | return correct / len(labels) 134 | 135 | 136 | def sparse_mx_to_torch_sparse_tensor(sparse_mx): 137 | """Convert a scipy sparse matrix to a torch sparse tensor.""" 138 | sparse_mx = sparse_mx.tocoo().astype(np.float32) 139 | indices = torch.from_numpy( 140 | np.vstack((sparse_mx.row, 
sparse_mx.col)).astype(np.int64)) 141 | values = torch.from_numpy(sparse_mx.data) 142 | shape = torch.Size(sparse_mx.shape) 143 | return torch.sparse.FloatTensor(indices, values, shape) 144 | 145 | 146 | if __name__ == '__main__': 147 | # # toy example 148 | # data = np.array([[1, 2, 3], 149 | # [4, 5, 6]]) 150 | # adj = csr_matrix([[0, 1], 151 | # [1, 0]]) 152 | # # adj = csr_matrix(np.array([[0, 1, 0, 0, 0, 0], 153 | # # [1, 0, 1, 0, 0, 0], 154 | # # [0, 1, 0, 1, 1, 0], 155 | # # [0, 0, 1, 0, 0, 0], 156 | # # [0, 0, 1, 0, 0, 1], 157 | # # [0, 0, 0, 0, 1, 0]])) 158 | # weight = np.array([[0., 1., 2., 1., 2, 3], 159 | # [1., 0., 1., 3., 4, 5], 160 | # [4., 1., 0., 1., 6, 1], 161 | # [1., 1., 1., 0., 3, 2], 162 | # [1, 2, 3, 4, 5, 0], 163 | # [4, 3, 2, 1, 6, 0]]) 164 | # 165 | # a = edge_weight(data, adj) 166 | # print(a) 167 | 168 | dataset = 'pubmed' 169 | emb_dimensions = 20 170 | feat, one_hot_labels, adj, y_train, y_val, y_test, train_mask, val_mask, test_mask = load_preprocess_data(dataset, 171 | emb_dimensions) 172 | a = edge_weight(feat, adj, 'cosine') 173 | print(a) 174 | # print(one_hot_labels.shape) 175 | # clf = LogisticRegression(solver="lbfgs", multi_class="multinomial", max_iter=1000, verbose=100) 176 | # labels = np.where(one_hot_labels == 1)[1] 177 | # 178 | # clf.fit(feat[train_mask], labels[train_mask]) 179 | # print(clf.score(feat[train_mask], labels[train_mask])) 180 | 181 | # adj = adj + sp.eye(adj.shape[0]) 182 | # weight = one_shot_edge_weight(feat, 'euclidean') 183 | # add_edge_weight(adj, weight) 184 | -------------------------------------------------------------------------------- /dataLoader.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import numpy as np 4 | import pickle as pkl 5 | import sys 6 | import scipy.sparse as sp 7 | import networkx as nx 8 | import os 9 | import warnings 10 | 11 | import torch 12 | import torch_geometric.transforms as T 13 | 14 | from torch_geometric.datasets import Planetoid 15 | 16 | warnings.filterwarnings('ignore') 17 | 18 | DATA_PATH = '../datasets' 19 | 20 | 21 | def one_hot(idx, num_class): 22 | return torch.zeros(len(idx), num_class).to(idx.device).scatter_( 23 | 1, idx.unsqueeze(1), 1.) 
24 | 25 | 26 | def parse_index_file(filename): 27 | index = [] 28 | for line in open(filename): 29 | index.append(int(line.strip())) 30 | return index 31 | 32 | 33 | def sample_mask(idx, l): 34 | """Create mask.""" 35 | mask = np.zeros(l) 36 | mask[idx] = 1 37 | return np.array(mask, dtype=np.bool) 38 | 39 | 40 | def load_data(dataset): 41 | names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph'] 42 | objects = [] 43 | for i in range(len(names)): 44 | with open(DATA_PATH + "/ind.{}.{}".format(dataset, names[i]), 'rb') as f: 45 | if sys.version_info > (3, 0): 46 | objects.append(pkl.load(f, encoding='latin1')) 47 | else: 48 | objects.append(pkl.load(f)) 49 | 50 | x, y, tx, ty, allx, ally, graph = tuple(objects) 51 | 52 | test_idx_reorder = parse_index_file(DATA_PATH + "/ind.{}.test.index".format(dataset)) 53 | test_idx_range = np.sort(test_idx_reorder) 54 | 55 | if dataset == 'citeseer': 56 | # Fix citeseer dataset (there are some isolated nodes in the graph) 57 | # Find isolated nodes, add them as zero-vecs into the right position 58 | test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder) + 1) 59 | tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1])) 60 | tx_extended[test_idx_range - min(test_idx_range), :] = tx 61 | tx = tx_extended 62 | # set zero if no label 63 | ty_extended = np.zeros((len(test_idx_range_full), y.shape[1])) 64 | ty_extended[test_idx_range - min(test_idx_range), :] = ty 65 | ty = ty_extended 66 | 67 | features = np.vstack((np.array(allx.todense()), np.array(tx.todense()))) 68 | labels = np.vstack((ally, ty)) 69 | 70 | features[test_idx_reorder, :] = features[test_idx_range, :] 71 | labels[test_idx_reorder, :] = labels[test_idx_range, :] 72 | adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph)) 73 | 74 | # from here mask the labels y for training, validation and testing, 75 | # so during training, only the labels from training dataset are used 76 | idx_test = test_idx_range.tolist() 77 | idx_train = range(len(y)) 78 | idx_val = range(len(y), len(y) + 500) 79 | 80 | train_mask = sample_mask(idx_train, labels.shape[0]) 81 | val_mask = sample_mask(idx_val, labels.shape[0]) 82 | test_mask = sample_mask(idx_test, labels.shape[0]) 83 | 84 | y_train = np.zeros(labels.shape) 85 | y_val = np.zeros(labels.shape) 86 | y_test = np.zeros(labels.shape) 87 | y_train[train_mask, :] = labels[train_mask, :] 88 | y_val[val_mask, :] = labels[val_mask, :] 89 | y_test[test_mask, :] = labels[test_mask, :] 90 | 91 | return features, labels, adj, y_train, y_val, y_test, train_mask, val_mask, test_mask 92 | 93 | 94 | def preprocess_features(features): 95 | """Row-normalize feature matrix and convert to tuple representation""" 96 | # print('Pre-processing feature by Simple Normalization') 97 | rowsum = np.array(features.sum(1)) 98 | r_inv = np.power(rowsum, -1).flatten() 99 | r_inv[np.isinf(r_inv)] = 0. 
100 | r_mat_inv = sp.diags(r_inv) 101 | features = r_mat_inv.dot(features) 102 | return features 103 | 104 | 105 | def preprocess_features_Probability(features): 106 | """Co-occurrence embedding to pre-process feature""" 107 | print('Pre-processing feature by Co-occurrence/Probability statistics') 108 | # co_occur = np.zeros((features.shape[1],features.shape[1])) 109 | 110 | # Get co-occurrence matrix 111 | co_occur = features.T.dot(features) 112 | 113 | # Normalization 114 | co_occur = preprocess_features(co_occur) 115 | features += features.dot(co_occur) 116 | features = preprocess_features(features) 117 | return features 118 | 119 | 120 | def load_preprocess_data(dataset, emb_dimensions=20): 121 | features, labels, adj, y_train, y_val, y_test, train_mask, val_mask, test_mask = load_data(dataset) 122 | 123 | # drop the non-labeled nodes 124 | if dataset == 'citeseer': 125 | mask_index = [] 126 | for i in range(len(labels)): 127 | if not (labels[i] == 0).all(): 128 | mask_index.append(i) 129 | mask_index = np.array(mask_index) 130 | features = features[mask_index] 131 | labels = labels[mask_index] 132 | temp_adj = sp.csc_matrix(adj[mask_index]).T 133 | temp_adj = temp_adj[mask_index] 134 | adj = sp.csr_matrix(temp_adj) 135 | y_train = y_train[mask_index] 136 | y_val = y_val[mask_index] 137 | y_test = y_test[mask_index] 138 | train_mask = train_mask[mask_index] 139 | val_mask = val_mask[mask_index] 140 | test_mask = test_mask[mask_index] 141 | 142 | print("{} dataset loaded.".format(dataset)) 143 | return features, labels, adj, y_train, y_val, y_test, train_mask, val_mask, test_mask 144 | 145 | 146 | def split_by_fixed_training_data(data, num_labels_per_class=20): 147 | num = data.x.shape[0] 148 | num_labels = data.num_classes 149 | labels = data.y.tolist() 150 | # the different sampling of training set requires the parameters retuning 151 | # labels = np.random.RandomState(seed=2).permutation(labels).tolist() 152 | 153 | idx_train = [] 154 | class_cnt = np.zeros(num_labels) 155 | for i in range(num): 156 | if (class_cnt >= num_labels_per_class).all(): 157 | break 158 | if class_cnt[labels[i]] == num_labels_per_class: 159 | continue 160 | idx_train.append(i) 161 | class_cnt[labels[i]] += 1 162 | 163 | idx_val = random.sample(set(range(num)) - set(idx_train), 500) # random sample 500 for validation 164 | idx_test = list(set(range(num)) - set(idx_train) - set(idx_val)) # the rest as testing 165 | 166 | train_mask = np.zeros((num,), dtype=int) 167 | train_mask[np.array(idx_train)] = 1 168 | 169 | val_mask = np.zeros((num,), dtype=int) 170 | val_mask[np.array(idx_val)] = 1 171 | 172 | test_mask = np.zeros((num,), dtype=int) 173 | test_mask[np.array(idx_test)] = 1 174 | return train_mask, val_mask, test_mask 175 | 176 | 177 | def load_planetoid_datasets(dataset, num_labels_per_class=20): 178 | name = dataset 179 | path = os.path.join("./datasets", dataset) 180 | dataset = Planetoid(root=path, name=name, transform=T.NormalizeFeatures()) 181 | data = dataset[0] 182 | data.num_classes = dataset.num_classes 183 | 184 | train_mask, val_mask, test_mask = split_by_fixed_training_data(data, num_labels_per_class) 185 | train_mask = train_mask.astype(bool) 186 | val_mask = val_mask.astype(bool) 187 | test_mask = test_mask.astype(bool) 188 | 189 | features = data.x.numpy() 190 | labels = one_hot(data.y, data.num_classes).numpy() 191 | edges = data.edge_index.numpy() 192 | ones = np.ones(edges.shape[1]) 193 | adj = sp.csr_matrix((ones, edges), shape=(data.num_nodes, data.num_nodes)) 194 | 195 | 
print("Citation-{} dataset loaded.".format(name)) 196 | return features, labels, adj, None, None, None, train_mask, val_mask, test_mask 197 | 198 | 199 | if __name__ == '__main__': 200 | features, labels, adj, y_train, y_val, y_test, train_mask, val_mask, test_mask = load_planetoid_datasets('cora') 201 | print('train: ', features[train_mask].shape) 202 | print('val: ', features[val_mask].shape) 203 | print('test: ', features[test_mask].shape) 204 | print('labels: ', labels.shape) 205 | print(np.where(val_mask == True)) 206 | --------------------------------------------------------------------------------