├── .DS_Store
├── .gitattributes
├── __pycache__
│   ├── model.cpython-37.pyc
│   ├── utils.cpython-37.pyc
│   ├── run_model.cpython-37.pyc
│   ├── dataLoader.cpython-37.pyc
│   └── logisticRegression.cpython-37.pyc
├── run_model.sh
├── .idea
│   ├── vcs.xml
│   ├── .gitignore
│   ├── inspectionProfiles
│   │   └── profiles_settings.xml
│   ├── misc.xml
│   ├── modules.xml
│   └── hierarchyClusteringTuning.iml
├── run_model.py
├── README.md
├── model.py
├── logisticRegression.py
├── utils.py
└── dataLoader.py
/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Todo/GraphHop/master/.DS_Store
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 |
--------------------------------------------------------------------------------
/__pycache__/model.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Todo/GraphHop/master/__pycache__/model.cpython-37.pyc
--------------------------------------------------------------------------------
/__pycache__/utils.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Todo/GraphHop/master/__pycache__/utils.cpython-37.pyc
--------------------------------------------------------------------------------
/__pycache__/run_model.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Todo/GraphHop/master/__pycache__/run_model.cpython-37.pyc
--------------------------------------------------------------------------------
/__pycache__/dataLoader.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Todo/GraphHop/master/__pycache__/dataLoader.cpython-37.pyc
--------------------------------------------------------------------------------
/run_model.sh:
--------------------------------------------------------------------------------
1 | python run_model.py --dataset cora --num_per_class 20 --batch_prop 512 --temperature .1 --alpha 10 --beta 1
2 |
--------------------------------------------------------------------------------
/__pycache__/logisticRegression.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Todo/GraphHop/master/__pycache__/logisticRegression.cpython-37.pyc
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 | # Datasource local storage ignored files
5 | /dataSources/
6 | /dataSources.local.xml
7 | # Editor-based HTTP Client requests
8 | /httpRequests/
9 |
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/.idea/hierarchyClusteringTuning.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/run_model.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | import model
4 |
5 | parser = argparse.ArgumentParser()
6 | parser.add_argument("--dataset", default='cora', type=str, help='select datasets.')
7 | parser.add_argument("--num_per_class", default=20, type=int, help='select number of labeled examples per class.')
8 | parser.add_argument("--batch_prop", default=512, type=int, help='select batch number')
9 | parser.add_argument("--temperature", default=.1, type=float, help='select temperature')
10 | parser.add_argument("--alpha", default=1., type=float, help='select alpha')
11 | parser.add_argument("--beta", default=1., type=float, help='select beta')
12 | parser.add_argument("--W1", default=.5, type=float)
13 | parser.add_argument("--W2", default=.5, type=float)
14 | args = parser.parse_args()
15 |
16 | DATASET = args.dataset
17 | TEMPERATURE = float(args.temperature)
18 | ALPHA = float(args.alpha)
19 | BETA = float(args.beta)
20 | W1 = float(args.W1)
21 | W2 = float(args.W2)
22 | NUM_PER_CLASS = int(args.num_per_class)
23 | BATCH_PROP = int(args.batch_prop)
24 |
25 | model  # no-op reference; model.py runs the full GraphHop pipeline at import time (triggered by `import model` above)
26 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # GraphHop: An Enhanced Label Propagation Method for Node Classification
2 |
3 | This repository contains the PyTorch implementation of GraphHop for the task of semi-supervised classification of nodes in a graph, as described in our paper:
4 |
5 | Tian Xie, Bin Wang, C.-C. Jay Kuo, GraphHop: An Enhanced Label Propagation Method for Node Classification. [[paper]](https://arxiv.org/abs/2101.02326)
6 |
7 |
8 | ## Dependencies
9 | * torch == 1.5.0
10 | * numpy == 1.18.1
11 | * scipy == 1.4.1
12 | * [pytorch geometric](https://pytorch-geometric.readthedocs.io/en/latest/notes/installation.html)
13 |
14 | ## RUN
15 | ```
16 | sh run_model.sh
17 | ```
18 | You may change the hyperparameters inside the shell script.
19 |
20 | ## Citation
21 | If you use this code for your research, please cite our paper.
22 |
23 | ```
24 | @ARTICLE{9737682,
25 | author={Xie, Tian and Wang, Bin and Kuo, C.-C. Jay},
26 | journal={IEEE Transactions on Neural Networks and Learning Systems},
27 | title={GraphHop: An Enhanced Label Propagation Method for Node Classification},
28 | year={2022},
29 | volume={},
30 | number={},
31 | pages={1-15},
32 | doi={10.1109/TNNLS.2022.3157746}}
33 | ```
34 |
--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 |
4 | from logisticRegression import fit
5 | from datetime import datetime
6 | from dataLoader import load_planetoid_datasets
7 | from utils import random_walk_normalize, pure_k_hops, sparse_mx_to_torch_sparse_tensor, accuracy
8 |
9 | from run_model import W1, W2, DATASET, NUM_PER_CLASS
10 |
11 | if torch.cuda.is_available():
12 | print("Using CUDA.")
13 |
14 | dataset = DATASET
15 |
16 | date = datetime.now()
17 |
18 | num_labels_per_class = NUM_PER_CLASS
19 | if dataset in ['cora', 'citeseer', 'pubmed']:
20 | feat, one_hot_labels, adj, y_train, y_val, y_test, train_mask, val_mask, test_mask = load_planetoid_datasets(
21 | dataset, num_labels_per_class)
22 | else:
23 | assert False, "Choose dataset from Cora, CiteSeer, or PubMed."
24 |
25 | labels = np.where(one_hot_labels == 1)[1]
26 |
27 | # k hops
28 | one_hops_adj = pure_k_hops(adj, 1)
29 | two_hops_adj = pure_k_hops(adj, 2)
30 |
31 | one_hops_adj = random_walk_normalize(one_hops_adj)
32 | two_hops_adj = random_walk_normalize(two_hops_adj)
33 |
34 | new_feat = feat
35 |
36 | pseudo_labels = np.zeros(one_hot_labels.shape)
37 | pseudo_labels[train_mask] = one_hot_labels[train_mask]
38 | y_val = one_hot_labels[val_mask]
39 |
40 | epoch = 100
41 |
42 | output = []
43 | prev_model = [None, None]
44 | num_perturb = 0
45 | test_scores_record = []
46 |
47 | new_feat = torch.FloatTensor(new_feat)
48 | one_hops_adj = sparse_mx_to_torch_sparse_tensor(one_hops_adj)
49 | two_hops_adj = sparse_mx_to_torch_sparse_tensor(two_hops_adj)
50 | y_val = torch.FloatTensor(y_val)
51 | pseudo_labels = torch.FloatTensor(pseudo_labels)
52 | one_hot_labels = torch.FloatTensor(one_hot_labels)
53 |
54 | if torch.cuda.is_available():
55 | new_feat = new_feat.cuda()
56 | one_hops_adj = one_hops_adj.cuda()
57 | two_hops_adj = two_hops_adj.cuda()
58 | pseudo_labels = pseudo_labels.cuda()
59 | one_hot_labels = one_hot_labels.cuda()
60 |
61 | ave_acc = []
62 | for i in range(epoch + 1):
63 | one_agg_feat = torch.spmm(one_hops_adj, new_feat)
64 | two_agg_feat = torch.spmm(two_hops_adj, new_feat)
65 |
66 | one_new_feat = torch.cat((new_feat, one_agg_feat), dim=1)
67 | two_new_feat = torch.cat((new_feat, one_agg_feat, two_agg_feat), dim=1)
68 |
69 | X_1 = one_new_feat
70 | y_1 = pseudo_labels
71 | X_2 = two_new_feat
72 | y_2 = pseudo_labels
73 |
74 | if torch.cuda.is_available():
75 | X_1 = X_1.cuda()
76 | y_1 = y_1.cuda()
77 | X_2 = X_2.cuda()
78 | y_2 = y_2.cuda()
79 | y_val = y_val.cuda()
80 |
81 | clf_1 = fit(i, X_1, y_1, train_mask, val_mask, y_val, prev_model[0])
82 | clf_2 = fit(i, X_2, y_2, train_mask, val_mask, y_val, prev_model[1])
83 |
84 | prev_model[0] = clf_1
85 | prev_model[1] = clf_2
86 |
87 | pseudo_labels = W1 * clf_1.predict_temp_soft_labels(X_1).detach() + (1. - W1) * clf_2.predict_temp_soft_labels(
88 | X_2).detach()
89 | pseudo_labels[train_mask] = one_hot_labels[train_mask]
90 |
91 | new_feat = W2 * clf_1.predict_soft_labels(X_1).detach() + (1. - W2) * clf_2.predict_soft_labels(X_2).detach()
92 |
93 | # model evaluation
94 | clf_1.eval()
95 | clf_2.eval()
96 |
97 | y_train = one_hot_labels[train_mask]
98 | y_test = one_hot_labels[test_mask]
99 | if torch.cuda.is_available():
100 | y_train = y_train.cuda()
101 | y_test = y_test.cuda()
102 |
103 | train_score = accuracy(new_feat[train_mask], y_train)
104 | test_score = accuracy(new_feat[test_mask], y_test)
105 | print('Iteration {}, train accuracy: {:.4f}, test accuracy: {:.4f}'.format(i, train_score, test_score))
106 | ave_acc.append(test_score.item())
107 |
--------------------------------------------------------------------------------
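
A minimal sketch (not a file in this repository) of the aggregation step above: at each GraphHop iteration, model.py multiplies the current embeddings by the normalized 1-hop and 2-hop matrices with torch.spmm and concatenates the results with the embeddings themselves before fitting the two classifiers. The 3-node graph below is a hand-built stand-in for those hop matrices.

```
import torch

# Row-normalized 1-hop adjacency of a tiny graph: node 0 is linked to nodes 1 and 2.
one_hops_adj = torch.tensor([[0.0, 0.5, 0.5],
                             [1.0, 0.0, 0.0],
                             [1.0, 0.0, 0.0]]).to_sparse()
new_feat = torch.tensor([[1.0, 0.0],
                         [0.0, 1.0],
                         [0.5, 0.5]])

# Same pattern as model.py: average the neighbors' embeddings, then concatenate
# them with each node's own embedding to form the classifier input.
one_agg_feat = torch.spmm(one_hops_adj, new_feat)
one_new_feat = torch.cat((new_feat, one_agg_feat), dim=1)
print(one_new_feat)  # shape (3, 4): own embedding followed by the neighborhood average
```
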
/logisticRegression.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn.functional as F
3 | import torch.optim as optim
4 | import numpy as np
5 | import copy
6 |
7 | from torch.nn import Linear
8 | from run_model import TEMPERATURE, ALPHA, BETA, BATCH_PROP
9 |
10 | learning_rate = 0.01
11 | weight_decay = 5e-4
12 | epoch = 1000
13 | early_stopping = 10
14 |
15 | temperature = TEMPERATURE
16 | alpha = ALPHA
17 | beta = BETA
18 | batch_prop = BATCH_PROP
19 |
20 |
21 | class LogisticRegression(torch.nn.Module):
22 | def __init__(self, num_feat, num_classes):
23 | super(LogisticRegression, self).__init__()
24 | self.linear = Linear(num_feat, num_classes)
25 |
26 | def forward(self, feat):
27 | return F.log_softmax(self.linear(feat), dim=1)
28 |
29 | def predict_soft_labels(self, feat):
30 | return F.softmax(self.linear(feat), dim=1)
31 |
32 | def predict_temp_soft_labels(self, feat):
33 | return F.softmax(self.linear(feat) / temperature, dim=1)
34 |
35 | def score(self, feat, labels):
36 | y_prob = F.softmax(self.linear(feat), dim=1)
37 | _accuracy = accuracy(y_prob, labels)
38 | return _accuracy
39 |
40 |
41 | def fit(step, feat, labels, train_mask, val_mask, y_val, prev_model):
42 | num_feat = feat.shape[1]
43 | num_classes = labels.shape[1]
44 | pseudo_mask = torch.zeros(train_mask.shape[0], dtype=torch.bool)
45 | pseudo_mask[train_mask == False] = True
46 |
47 | X_train = feat[train_mask]
48 | y_train = labels[train_mask]
49 | X_pseudo = feat[pseudo_mask]
50 | y_pseudo = labels[pseudo_mask]
51 |
52 | X_val = feat[val_mask]
53 | y_val = y_val
54 |
55 | new_batch_prop = float(batch_prop / feat.shape[0])
56 |
57 | if step <= 1:
58 | model = LogisticRegression(num_feat, num_classes)
59 | if torch.cuda.is_available():
60 | model.cuda()
61 | else:
62 | model = prev_model
63 | optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
64 | model.train()
65 | count = 0
66 | best_model = copy.deepcopy(model)
67 | prev_loss_val = np.inf
68 | for i in range(epoch):
69 | for j in range(0, int(1. / new_batch_prop) + 1):
70 | optimizer.zero_grad()
71 | y_train_batch = y_train[
72 | int(y_train.shape[0] * j * new_batch_prop):int(y_train.shape[0] * (j + 1) * new_batch_prop)]
73 | if y_train_batch.shape[0] == 0:
74 | break
75 | y_pseudo_batch = y_pseudo[
76 | int(y_pseudo.shape[0] * j * new_batch_prop):int(
77 | y_pseudo.shape[0] * (j + 1) * new_batch_prop)]
78 | X_train_batch = X_train[
79 | int(X_train.shape[0] * j * new_batch_prop):int(X_train.shape[0] * (j + 1) * new_batch_prop)]
80 | X_pseudo_batch = X_pseudo[
81 | int(X_pseudo.shape[0] * j * new_batch_prop):int(
82 | X_pseudo.shape[0] * (j + 1) * new_batch_prop)]
83 | y_train_log_prob = model.forward(X_train_batch)
84 | y_pseudo_log_prob = model.forward(X_pseudo_batch)
85 | num_train = y_train_batch.shape[0]
86 | num_pseudo = y_pseudo_batch.shape[0]
87 | if step == 0:
88 | entropy_train = (y_train_batch * y_train_log_prob).sum()
89 | loss_train = -1.0 * entropy_train
90 | else:
91 | entropy_train = (y_train_batch * y_train_log_prob).sum() / num_train + alpha * (
92 | y_pseudo_batch * y_pseudo_log_prob).sum() / (num_pseudo * num_classes) \
93 | + beta * (torch.exp(y_pseudo_log_prob) * y_pseudo_log_prob).sum() / (
94 | num_pseudo * num_classes)
95 | loss_train = -1.0 * entropy_train
96 |
97 | loss_train.backward()
98 | optimizer.step()
99 |
100 | if count == 0:
101 | best_model = copy.deepcopy(model)
102 | y_log_prob_val = model.forward(X_val)
103 | entropy_val = y_val * y_log_prob_val
104 | loss_val = -1.0 * entropy_val.sum()
105 | accuracy_val = accuracy(y_log_prob_val, y_val)
106 | if loss_val - prev_loss_val > 0 or prev_loss_val - loss_val < 1e-2:
107 | count += 1
108 | else:
109 | count = 0
110 | if count == early_stopping:
111 | break
112 | prev_loss_val = loss_val
113 | # print("epoch: {}, train loss: {:.4f}, train accuracy: {:.4f}, validation loss: {:.4f}, "
114 | # "validation accuracy: {:.4f}".format(i, loss_train, accuracy_train, loss_val, accuracy_val))
115 | # f1.write(str(accuracy_val.item()) + ',\n')
116 | return best_model
117 |
118 |
119 | def accuracy(output, labels):
120 | labels = labels.max(1)[1]
121 | preds = output.max(1)[1].type_as(labels)
122 | correct = preds.eq(labels).double()
123 | correct = correct.sum()
124 | return correct / len(labels)
125 |
--------------------------------------------------------------------------------
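
In fit() above, the labeled cross-entropy is combined with an alpha-weighted cross-entropy on the pseudo-labels and a beta-weighted entropy term on the unlabeled predictions, and predict_temp_soft_labels divides the logits by the temperature before the softmax so that the pseudo-labels fed back into the next iteration are sharpened. A small sketch (not a file in this repository) of that sharpening effect with the temperature of 0.1 used in run_model.sh:

```
import torch
import torch.nn.functional as F

logits = torch.tensor([[2.0, 1.0, 0.5]])
print(F.softmax(logits, dim=1))        # plain soft labels, roughly [0.63, 0.23, 0.14]
print(F.softmax(logits / 0.1, dim=1))  # temperature-sharpened, close to one-hot [1, 0, 0]
```
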
/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import scipy.sparse as sp
3 | import torch
4 | from scipy.spatial.distance import cosine, euclidean
5 |
6 | from sklearn.metrics.pairwise import euclidean_distances, cosine_similarity
7 |
8 | # select samples that are farthest from the center
9 | from dataLoader import load_preprocess_data
10 |
11 |
12 | def maxCover(data, ratio):
13 | center = np.mean(data, axis=0).reshape(1, -1)
14 | centers = np.tile(center, (center.shape[0], 1))
15 | distance = np.linalg.norm(data - centers, axis=1)
16 | sort_distance = np.argsort(distance)[::-1]
17 | return sort_distance[:int(len(sort_distance) * ratio)]
18 |
19 |
20 | # select the top confidence
21 | def topConfidence(data, ratio):
22 | row_max = np.max(data, axis=1)
23 | index = np.argsort(row_max)[::-1]
24 | return index[:int(len(index) * ratio)]
25 |
26 |
27 | # assign each node to a cluster
28 | def nodesAssign(data, nodes_index, clusters):
29 | means = []
30 | for c in clusters:
31 | means.append(np.mean(data[c], axis=0))
32 | means = np.array(means)
33 | nodes = data[nodes_index]
34 | for i in range(len(nodes)):
35 | distance = np.linalg.norm(means - nodes[i], axis=1)
36 | min_c = np.argsort(distance)[0]
37 | clusters[min_c] = np.append(clusters[min_c], i)
38 | return clusters
39 |
40 |
41 | # calculate the weight matrix
42 | # w_ij = e^{d_ij} / sum_j(e^{d_ij})
43 | # d_ij = Euclidean or cosine distance (1 - cos(a)) between nodes i and j
44 | def edge_weight(data, adj, type='euclidean'):
45 | weight_matrix = np.zeros(adj.shape)
46 | diagonal = sp.diags(adj.diagonal())
47 | adj = adj - diagonal
48 | for i in range(data.shape[0]):
49 | adj_nodes_index = (adj[i].toarray() != 0).squeeze()
50 | adj_nodes = data[adj_nodes_index]
51 |
52 | # citeseer dataset has some isolated nodes
53 | if len(adj_nodes) == 0:
54 | continue
55 | assert (type in ['euclidean', 'cosine'])
56 | if type == 'euclidean':
57 | distance = np.array([euclidean(data[i], adj_nodes[j]) for j in range(adj_nodes.shape[0])])
58 | elif type == 'cosine':
59 | distance = np.array([cosine(data[i], adj_nodes[j]) for j in range(adj_nodes.shape[0])])
60 |
61 | # # there are some nodes have exactly same features.
62 | # if min(distance) == 0:
63 | # index = np.where(distance == 0)
64 | # index = np.arange(adj.shape[0])[adj_nodes_index][index]
65 | # weight_matrix[i][index] = 1
66 | # continue
67 | weight = np.exp(distance)
68 | weight = weight / np.sum(weight)
69 | weight_matrix[i][adj_nodes_index] = weight
70 | # weight_matrix += sp.eye(adj.shape[0])
71 | return weight_matrix
72 |
73 |
74 | # calculate the feature similarity matrix
75 | def one_shot_edge_weight(data, type='euclidean'):
76 | assert (type in ['euclidean', 'cosine'])
77 | if type == 'euclidean':
78 | distance = euclidean_distances(data)
79 | elif type == 'cosine':
80 | distance = cosine_similarity(data)
81 | return distance
82 |
83 |
84 | # A = D^{-1/2} * A * D^{-1/2}
85 | def normalize(adj):
86 | adj = adj + sp.eye(adj.shape[0]) # add self-loop
87 | row_sum = np.array(adj.sum(1))
88 | r_inv = np.power(row_sum, -0.5).flatten()
89 | r_mat_inv = sp.diags(r_inv)
90 | norm_adj = r_mat_inv.dot(adj)
91 | norm_adj = norm_adj.dot(r_mat_inv)
92 | return norm_adj
93 |
94 |
95 | def random_walk_normalize(adj):
96 | # adj = adj + sp.eye(adj.shape[0]) # add self-loop
97 | row_sum = np.array(adj.sum(1)).astype('float')
98 | r_inv = np.power(row_sum, -1).flatten()
99 | r_inv[r_inv == float('inf')] = 0
100 | r_mat_inv = sp.diags(r_inv)
101 | norm_adj = r_mat_inv.dot(adj)
102 | return norm_adj
103 |
104 |
105 | # sparse adjacency matrix
106 | def multiHops(adj, k):
107 | multi_adj = adj
108 | for i in range(k - 1):
109 | multi_adj = multi_adj.dot(adj)
110 | return multi_adj
111 |
112 |
113 | def pure_k_hops(adj, k):
114 | multi_adj = adj
115 | pre_multi_adj = [adj]
116 | for i in range(k - 1):
117 | multi_adj = multi_adj.dot(adj)
118 | multi_adj = multi_adj - sp.diags(multi_adj.diagonal())
119 | multi_adj = multi_adj.tolil()
120 | for m in pre_multi_adj:
121 | multi_adj[m.nonzero()] = 0
122 | multi_adj = multi_adj.tocsr()
123 | pre_multi_adj.append(multi_adj)
124 | return multi_adj
125 |
126 |
127 | # calculate the prediction accuracy
128 | def accuracy(output, labels):
129 | labels = labels.max(1)[1]
130 | preds = output.max(1)[1].type_as(labels)
131 | correct = preds.eq(labels).double()
132 | correct = correct.sum()
133 | return correct / len(labels)
134 |
135 |
136 | def sparse_mx_to_torch_sparse_tensor(sparse_mx):
137 | """Convert a scipy sparse matrix to a torch sparse tensor."""
138 | sparse_mx = sparse_mx.tocoo().astype(np.float32)
139 | indices = torch.from_numpy(
140 | np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64))
141 | values = torch.from_numpy(sparse_mx.data)
142 | shape = torch.Size(sparse_mx.shape)
143 | return torch.sparse.FloatTensor(indices, values, shape)
144 |
145 |
146 | if __name__ == '__main__':
147 | # # toy example
148 | # data = np.array([[1, 2, 3],
149 | # [4, 5, 6]])
150 | # adj = csr_matrix([[0, 1],
151 | # [1, 0]])
152 | # # adj = csr_matrix(np.array([[0, 1, 0, 0, 0, 0],
153 | # # [1, 0, 1, 0, 0, 0],
154 | # # [0, 1, 0, 1, 1, 0],
155 | # # [0, 0, 1, 0, 0, 0],
156 | # # [0, 0, 1, 0, 0, 1],
157 | # # [0, 0, 0, 0, 1, 0]]))
158 | # weight = np.array([[0., 1., 2., 1., 2, 3],
159 | # [1., 0., 1., 3., 4, 5],
160 | # [4., 1., 0., 1., 6, 1],
161 | # [1., 1., 1., 0., 3, 2],
162 | # [1, 2, 3, 4, 5, 0],
163 | # [4, 3, 2, 1, 6, 0]])
164 | #
165 | # a = edge_weight(data, adj)
166 | # print(a)
167 |
168 | dataset = 'pubmed'
169 | emb_dimensions = 20
170 | feat, one_hot_labels, adj, y_train, y_val, y_test, train_mask, val_mask, test_mask = load_preprocess_data(dataset,
171 | emb_dimensions)
172 | a = edge_weight(feat, adj, 'cosine')
173 | print(a)
174 | # print(one_hot_labels.shape)
175 | # clf = LogisticRegression(solver="lbfgs", multi_class="multinomial", max_iter=1000, verbose=100)
176 | # labels = np.where(one_hot_labels == 1)[1]
177 | #
178 | # clf.fit(feat[train_mask], labels[train_mask])
179 | # print(clf.score(feat[train_mask], labels[train_mask]))
180 |
181 | # adj = adj + sp.eye(adj.shape[0])
182 | # weight = one_shot_edge_weight(feat, 'euclidean')
183 | # add_edge_weight(adj, weight)
184 |
--------------------------------------------------------------------------------
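
pure_k_hops above keeps, for k >= 2, only the nodes that are exactly k hops away (self-loops and all lower-order hop connections are zeroed out), and random_walk_normalize turns each hop matrix into a row-stochastic D^{-1}A. A toy sketch (not a file in this repository), assuming the repository's dependencies are installed so that utils.py imports cleanly:

```
import numpy as np
import scipy.sparse as sp

from utils import pure_k_hops, random_walk_normalize

# A 4-node path graph: 0 - 1 - 2 - 3.
adj = sp.csr_matrix(np.array([[0, 1, 0, 0],
                              [1, 0, 1, 0],
                              [0, 1, 0, 1],
                              [0, 0, 1, 0]], dtype=float))

one_hop = pure_k_hops(adj, 1)  # direct neighbors only
two_hop = pure_k_hops(adj, 2)  # nodes exactly two hops away

print(random_walk_normalize(one_hop).toarray())  # each row sums to 1
print(random_walk_normalize(two_hop).toarray())  # e.g. node 0 now points only at node 2
```
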
/dataLoader.py:
--------------------------------------------------------------------------------
1 | import random
2 |
3 | import numpy as np
4 | import pickle as pkl
5 | import sys
6 | import scipy.sparse as sp
7 | import networkx as nx
8 | import os
9 | import warnings
10 |
11 | import torch
12 | import torch_geometric.transforms as T
13 |
14 | from torch_geometric.datasets import Planetoid
15 |
16 | warnings.filterwarnings('ignore')
17 |
18 | DATA_PATH = '../datasets'
19 |
20 |
21 | def one_hot(idx, num_class):
22 | return torch.zeros(len(idx), num_class).to(idx.device).scatter_(
23 | 1, idx.unsqueeze(1), 1.)
24 |
25 |
26 | def parse_index_file(filename):
27 | index = []
28 | for line in open(filename):
29 | index.append(int(line.strip()))
30 | return index
31 |
32 |
33 | def sample_mask(idx, l):
34 | """Create mask."""
35 | mask = np.zeros(l)
36 | mask[idx] = 1
37 | return np.array(mask, dtype=np.bool)
38 |
39 |
40 | def load_data(dataset):
41 | names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']
42 | objects = []
43 | for i in range(len(names)):
44 | with open(DATA_PATH + "/ind.{}.{}".format(dataset, names[i]), 'rb') as f:
45 | if sys.version_info > (3, 0):
46 | objects.append(pkl.load(f, encoding='latin1'))
47 | else:
48 | objects.append(pkl.load(f))
49 |
50 | x, y, tx, ty, allx, ally, graph = tuple(objects)
51 |
52 | test_idx_reorder = parse_index_file(DATA_PATH + "/ind.{}.test.index".format(dataset))
53 | test_idx_range = np.sort(test_idx_reorder)
54 |
55 | if dataset == 'citeseer':
56 | # Fix citeseer dataset (there are some isolated nodes in the graph)
57 | # Find isolated nodes, add them as zero-vecs into the right position
58 | test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder) + 1)
59 | tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1]))
60 | tx_extended[test_idx_range - min(test_idx_range), :] = tx
61 | tx = tx_extended
62 | # set zero if no label
63 | ty_extended = np.zeros((len(test_idx_range_full), y.shape[1]))
64 | ty_extended[test_idx_range - min(test_idx_range), :] = ty
65 | ty = ty_extended
66 |
67 | features = np.vstack((np.array(allx.todense()), np.array(tx.todense())))
68 | labels = np.vstack((ally, ty))
69 |
70 | features[test_idx_reorder, :] = features[test_idx_range, :]
71 | labels[test_idx_reorder, :] = labels[test_idx_range, :]
72 | adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))
73 |
74 | # from here mask the labels y for training, validation and testing,
75 | # so that during training only the labels from the training set are used
76 | idx_test = test_idx_range.tolist()
77 | idx_train = range(len(y))
78 | idx_val = range(len(y), len(y) + 500)
79 |
80 | train_mask = sample_mask(idx_train, labels.shape[0])
81 | val_mask = sample_mask(idx_val, labels.shape[0])
82 | test_mask = sample_mask(idx_test, labels.shape[0])
83 |
84 | y_train = np.zeros(labels.shape)
85 | y_val = np.zeros(labels.shape)
86 | y_test = np.zeros(labels.shape)
87 | y_train[train_mask, :] = labels[train_mask, :]
88 | y_val[val_mask, :] = labels[val_mask, :]
89 | y_test[test_mask, :] = labels[test_mask, :]
90 |
91 | return features, labels, adj, y_train, y_val, y_test, train_mask, val_mask, test_mask
92 |
93 |
94 | def preprocess_features(features):
95 | """Row-normalize feature matrix and convert to tuple representation"""
96 | # print('Pre-processing feature by Simple Normalization')
97 | rowsum = np.array(features.sum(1))
98 | r_inv = np.power(rowsum, -1).flatten()
99 | r_inv[np.isinf(r_inv)] = 0.
100 | r_mat_inv = sp.diags(r_inv)
101 | features = r_mat_inv.dot(features)
102 | return features
103 |
104 |
105 | def preprocess_features_Probability(features):
106 | """Co-occurrence embedding to pre-process feature"""
107 | print('Pre-processing feature by Co-occurrence/Probability statistics')
108 | # co_occur = np.zeros((features.shape[1],features.shape[1]))
109 |
110 | # Get co-occurrence matrix
111 | co_occur = features.T.dot(features)
112 |
113 | # Normalization
114 | co_occur = preprocess_features(co_occur)
115 | features += features.dot(co_occur)
116 | features = preprocess_features(features)
117 | return features
118 |
119 |
120 | def load_preprocess_data(dataset, emb_dimensions=20):
121 | features, labels, adj, y_train, y_val, y_test, train_mask, val_mask, test_mask = load_data(dataset)
122 |
123 | # drop the non-labeled nodes
124 | if dataset == 'citeseer':
125 | mask_index = []
126 | for i in range(len(labels)):
127 | if not (labels[i] == 0).all():
128 | mask_index.append(i)
129 | mask_index = np.array(mask_index)
130 | features = features[mask_index]
131 | labels = labels[mask_index]
132 | temp_adj = sp.csc_matrix(adj[mask_index]).T
133 | temp_adj = temp_adj[mask_index]
134 | adj = sp.csr_matrix(temp_adj)
135 | y_train = y_train[mask_index]
136 | y_val = y_val[mask_index]
137 | y_test = y_test[mask_index]
138 | train_mask = train_mask[mask_index]
139 | val_mask = val_mask[mask_index]
140 | test_mask = test_mask[mask_index]
141 |
142 | print("{} dataset loaded.".format(dataset))
143 | return features, labels, adj, y_train, y_val, y_test, train_mask, val_mask, test_mask
144 |
145 |
146 | def split_by_fixed_training_data(data, num_labels_per_class=20):
147 | num = data.x.shape[0]
148 | num_labels = data.num_classes
149 | labels = data.y.tolist()
150 | # a different sampling of the training set requires retuning the hyperparameters
151 | # labels = np.random.RandomState(seed=2).permutation(labels).tolist()
152 |
153 | idx_train = []
154 | class_cnt = np.zeros(num_labels)
155 | for i in range(num):
156 | if (class_cnt >= num_labels_per_class).all():
157 | break
158 | if class_cnt[labels[i]] == num_labels_per_class:
159 | continue
160 | idx_train.append(i)
161 | class_cnt[labels[i]] += 1
162 |
163 | idx_val = random.sample(set(range(num)) - set(idx_train), 500) # random sample 500 for validation
164 | idx_test = list(set(range(num)) - set(idx_train) - set(idx_val)) # the rest as testing
165 |
166 | train_mask = np.zeros((num,), dtype=int)
167 | train_mask[np.array(idx_train)] = 1
168 |
169 | val_mask = np.zeros((num,), dtype=int)
170 | val_mask[np.array(idx_val)] = 1
171 |
172 | test_mask = np.zeros((num,), dtype=int)
173 | test_mask[np.array(idx_test)] = 1
174 | return train_mask, val_mask, test_mask
175 |
176 |
177 | def load_planetoid_datasets(dataset, num_labels_per_class=20):
178 | name = dataset
179 | path = os.path.join("./datasets", dataset)
180 | dataset = Planetoid(root=path, name=name, transform=T.NormalizeFeatures())
181 | data = dataset[0]
182 | data.num_classes = dataset.num_classes
183 |
184 | train_mask, val_mask, test_mask = split_by_fixed_training_data(data, num_labels_per_class)
185 | train_mask = train_mask.astype(bool)
186 | val_mask = val_mask.astype(bool)
187 | test_mask = test_mask.astype(bool)
188 |
189 | features = data.x.numpy()
190 | labels = one_hot(data.y, data.num_classes).numpy()
191 | edges = data.edge_index.numpy()
192 | ones = np.ones(edges.shape[1])
193 | adj = sp.csr_matrix((ones, edges), shape=(data.num_nodes, data.num_nodes))
194 |
195 | print("Citation-{} dataset loaded.".format(name))
196 | return features, labels, adj, None, None, None, train_mask, val_mask, test_mask
197 |
198 |
199 | if __name__ == '__main__':
200 | features, labels, adj, y_train, y_val, y_test, train_mask, val_mask, test_mask = load_planetoid_datasets('cora')
201 | print('train: ', features[train_mask].shape)
202 | print('val: ', features[val_mask].shape)
203 | print('test: ', features[test_mask].shape)
204 | print('labels: ', labels.shape)
205 | print(np.where(val_mask == True))
206 |
--------------------------------------------------------------------------------
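
split_by_fixed_training_data above implements the semi-supervised split: the first num_labels_per_class nodes encountered from each class form the labeled training set, 500 of the remaining nodes are sampled for validation, and the rest are used for testing. A toy sketch (not a file in this repository) on a fake graph object, assuming the repository's dependencies and a Python 3.7-era environment (dataLoader.py passes a set to random.sample, which Python 3.11+ rejects):

```
import torch
from types import SimpleNamespace

from dataLoader import split_by_fixed_training_data

num_nodes, num_classes = 600, 3
data = SimpleNamespace(x=torch.zeros(num_nodes, 5),                    # features (only the shape is used)
                       y=torch.randint(0, num_classes, (num_nodes,)),  # random class labels
                       num_classes=num_classes)

train_mask, val_mask, test_mask = split_by_fixed_training_data(data, num_labels_per_class=2)
print(train_mask.sum(), val_mask.sum(), test_mask.sum())  # typically 6, 500, 94
```
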