├── config.py ├── exp ├── exp.py ├── exp_attack_unlearning.py ├── exp_graph_partition.py ├── exp_node_edge_unlearning.py └── exp_unlearning.py ├── lib_aggregator ├── __init__.py ├── aggregator.py ├── opt_dataset.py └── optimal_aggregator.py ├── lib_dataset ├── __init__.py └── data_store.py ├── lib_gnn_model ├── __init__.py ├── gat │ ├── gat.py │ └── gat_net.py ├── gcn │ ├── gcn.py │ └── gcn_net.py ├── gin │ ├── gin.py │ └── gin_net.py ├── gnn_base.py ├── graphsage │ ├── graphsage.py │ └── graphsage_net.py ├── mlp │ ├── __init__.py │ ├── mlp.py │ └── mlpnet.py └── node_classifier.py ├── lib_graph_partition ├── __init__.py ├── constrained_kmeans.py ├── constrained_kmeans_base.py ├── constrained_lpa.py ├── constrained_lpa_base.py ├── graph_partition.py ├── hungarian.py ├── hungarian_1.py ├── metis_partition.py ├── partition.py ├── partition_kmeans.py ├── partition_lpa.py └── partition_random.py ├── lib_node_embedding ├── __init__.py ├── ge │ ├── __init__.py │ ├── alias.py │ ├── classify.py │ ├── models │ │ ├── __init__.py │ │ ├── deepwalk.py │ │ ├── line.py │ │ ├── node2vec.py │ │ ├── sdne.py │ │ └── struc2vec.py │ ├── utils.py │ └── walker.py └── node_embedding.py ├── lib_utils ├── logger.py └── utils.py ├── main.py ├── parameter_parser.py └── readme.md /config.py: -------------------------------------------------------------------------------- 1 | RAW_DATA_PATH = 'temp_data/raw_data/' 2 | PROCESSED_DATA_PATH = 'temp_data/processed_data/' 3 | MODEL_PATH = 'temp_data/models/' 4 | ANALYSIS_PATH = 'temp_data/analysis_data/' 5 | 6 | # database name 7 | DATABASE_NAME = "unlearning_gnn" -------------------------------------------------------------------------------- /exp/exp.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from lib_dataset.data_store import DataStore 4 | 5 | 6 | class Exp: 7 | def __init__(self, args): 8 | self.logger = logging.getLogger('exp') 9 | 10 | self.args = args 11 | self.data_store = DataStore(args) 12 | 13 | def load_data(self): 14 | pass 15 | -------------------------------------------------------------------------------- /exp/exp_attack_unlearning.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import time 3 | from collections import defaultdict 4 | 5 | import numpy as np 6 | import torch 7 | import torch_geometric as tg 8 | from torch_geometric.data import Data 9 | from scipy.spatial import distance 10 | 11 | import config 12 | from exp.exp import Exp 13 | from lib_graph_partition.graph_partition import GraphPartition 14 | from lib_gnn_model.node_classifier import NodeClassifier 15 | from lib_aggregator.aggregator import Aggregator 16 | from lib_utils import utils 17 | 18 | 19 | class ExpAttackUnlearning(Exp): 20 | def __init__(self, args): 21 | super(ExpAttackUnlearning, self).__init__(args) 22 | self.logger = logging.getLogger('exp_attack_unlearning') 23 | # 1. respond to the unlearning requests 24 | self.load_preprocessed_data() 25 | # self.graph_unlearning_request_respond() 26 | if self.args['repartition']: 27 | with open(config.MODEL_PATH + self.args['dataset_name'] + '/' + self.args['target_model']+"_unlearned_indices") as file: 28 | node_unlearning_indices = [line.rstrip() for line in file] 29 | for unlearned_node in node_unlearning_indices: 30 | self.graph_unlearning_request_respond(int(unlearned_node)) 31 | else: 32 | self.graph_unlearning_request_respond() 33 | # 2. 
evalute the attack performance 34 | self.attack_graph_unlearning() 35 | 36 | def load_preprocessed_data(self): 37 | self.shard_data = self.data_store.load_shard_data() 38 | self.raw_data = self.data_store.load_raw_data() 39 | self.train_data = self.data_store.load_train_data() 40 | self.train_graph = self.data_store.load_train_graph() 41 | self.train_indices, self.test_indices = self.data_store.load_train_test_split() 42 | self.community_to_node = self.data_store.load_community_data() 43 | num_feats = self.train_data.num_features 44 | num_classes = len(self.train_data.y.unique()) 45 | self.target_model = NodeClassifier(num_feats, num_classes, self.args) 46 | 47 | def graph_unlearning_request_respond(self, node_unlearning_request=None): 48 | # reindex the node ids 49 | node_to_com = self.data_store.c2n_to_n2c(self.community_to_node) 50 | train_indices_prune = list(node_to_com.keys()) 51 | 52 | if node_unlearning_request==None: 53 | # generate node unlearning requests 54 | node_unlearning_indices = np.random.choice(train_indices_prune, self.args['num_unlearned_nodes']) 55 | else: 56 | node_unlearning_indices = np.array([node_unlearning_request]) 57 | self.num_unlearned_edges =0 58 | unlearning_indices = defaultdict(list) 59 | for node in node_unlearning_indices: 60 | unlearning_indices[node_to_com[node]].append(node) 61 | # delete a list of revoked nodes from train_graph 62 | self.train_graph.remove_nodes_from(node_unlearning_indices) 63 | 64 | # delete the revoked nodes from train_data 65 | # by building unlearned data from unlearned train_graph 66 | self.train_data.train_mask = torch.from_numpy(np.isin(np.arange(self.train_data.num_nodes), self.train_indices)) 67 | self.train_data.test_mask = torch.from_numpy(np.isin(np.arange(self.train_data.num_nodes), np.append(self.test_indices, node_unlearning_indices))) 68 | 69 | # delete the revoked nodes from shard_data 70 | self.shard_data_after_unlearning = {} 71 | self.affected_shard=[] 72 | for shard in range(self.args["num_shards"]): 73 | train_shard_indices = list(self.community_to_node[shard]) 74 | # node unlearning 75 | train_shard_indices = np.setdiff1d(train_shard_indices, unlearning_indices[shard]) 76 | shard_indices = np.union1d(train_shard_indices, self.test_indices) 77 | 78 | x = self.train_data.x[shard_indices] 79 | y = self.train_data.y[shard_indices] 80 | edge_index = utils.filter_edge_index_1(self.train_data, shard_indices) 81 | 82 | data = Data(x=x, edge_index=torch.from_numpy(edge_index), y=y) 83 | data.train_mask = torch.from_numpy(np.isin(shard_indices, train_shard_indices)) 84 | data.test_mask = torch.from_numpy(np.isin(shard_indices, self.test_indices)) 85 | 86 | self.shard_data_after_unlearning[shard] = data 87 | self.num_unlearned_edges += self.shard_data[shard].num_edges - self.shard_data_after_unlearning[shard].num_edges 88 | 89 | # find the affected shard model 90 | if self.shard_data_after_unlearning[shard].num_nodes != self.shard_data[shard].num_nodes: 91 | self.affected_shard.append(shard) 92 | 93 | self.data_store.save_unlearned_data(self.train_graph, 'train_graph') 94 | self.data_store.save_unlearned_data(self.train_data, 'train_data') 95 | self.data_store.save_unlearned_data(self.shard_data_after_unlearning, 'shard_data') 96 | 97 | # retrain the correponding shard model 98 | if not self.args['repartition']: 99 | for shard in self.affected_shard: 100 | suffix = "unlearned_"+str(node_unlearning_indices[0]) 101 | self._train_shard_model(shard, suffix) 102 | 103 | # (if re-partition, re-partition the remaining graph) 
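# Summary of the two retraining strategies (read together with the branches around this point):
# - without repartition (branch above), only the shards that actually lost a node
#   ("affected shards") are retrained, and the retrained models are saved with the
#   suffix "unlearned_<node_id>" so they can be matched to this unlearning request;
# - with repartition (branch below), the remaining train graph is partitioned from
#   scratch and every shard model is retrained, saved with the suffix
#   "_repartition_unlearned_<node_id>".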
104 | # re-train the shard model, save model and optimal weight score 105 | if self.args['repartition']: 106 | suffix="_repartition_unlearned_" + str(node_unlearning_indices[0]) 107 | self._repartition(suffix) 108 | for shard in range(self.args["num_shards"]): 109 | self._train_shard_model(shard, suffix) 110 | 111 | def _repartition(self, suffix): 112 | # load unlearned train_graph and train_data 113 | train_graph = self.data_store.load_unlearned_data('train_graph') 114 | train_data = self.data_store.load_unlearned_data('train_data') 115 | # repartition 116 | start_time = time.time() 117 | partition = GraphPartition(self.args, train_graph, train_data) 118 | community_to_node = partition.graph_partition() 119 | partition_time = time.time() - start_time 120 | self.logger.info("Partition cost %s seconds." % partition_time) 121 | # save the new partition and shard 122 | self.data_store.save_community_data(community_to_node, suffix) 123 | self._generate_unlearned_repartitioned_shard_data(train_data, community_to_node, self.test_indices) 124 | 125 | def _generate_unlearned_repartitioned_shard_data(self, train_data, community_to_node, test_indices): 126 | self.logger.info('generating shard data') 127 | 128 | shard_data = {} 129 | for shard in range(self.args['num_shards']): 130 | train_shard_indices = list(community_to_node[shard]) 131 | shard_indices = np.union1d(train_shard_indices, test_indices) 132 | 133 | x = self.train_data.x[shard_indices] 134 | y = self.train_data.y[shard_indices] 135 | edge_index = utils.filter_edge_index_1(train_data, shard_indices) 136 | 137 | data = Data(x=x, edge_index=torch.from_numpy(edge_index), y=y) 138 | data.train_mask = torch.from_numpy(np.isin(shard_indices, train_shard_indices)) 139 | data.test_mask = torch.from_numpy(np.isin(shard_indices, test_indices)) 140 | 141 | shard_data[shard] = data 142 | 143 | # self.data_store.save_unlearned_data(shard_data, 'shard_data_repartition') 144 | return shard_data 145 | 146 | def _train_shard_model(self, shard, suffix="unlearned"): 147 | self.logger.info('training target models, shard %s' % shard) 148 | 149 | # load shard data 150 | self.target_model.data = self.shard_data_after_unlearning[shard] 151 | # retrain shard model 152 | self.target_model.train_model() 153 | # replace shard model 154 | device=torch.device("cpu") 155 | self.target_model.device = device 156 | self.data_store.save_target_model(0, self.target_model, shard, suffix) 157 | # self.data_store.save_unlearned_target_model(0, self.target_model, shard, suffix) 158 | 159 | def attack_graph_unlearning(self): 160 | 161 | # load unlearned indices 162 | with open(config.MODEL_PATH + self.args['dataset_name'] + "/" + self.args['target_model'] +"_unlearned_indices") as file: 163 | unlearned_indices = [line.rstrip() for line in file] 164 | 165 | # member sample query, label as 1 166 | positive_posteriors = self._query_target_model(unlearned_indices, unlearned_indices) 167 | # non-member sample query, label as 0 168 | negative_posteriors = self._query_target_model(unlearned_indices, self.test_indices) 169 | 170 | # evaluate attack performance, train multiple shadow models, or calculate posterior entropy, or directly calculate AUC. 
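# The evaluation below is a membership-inference-style test on the unlearned nodes:
# revoked training nodes are the positive (member) samples and held-out test nodes are
# the negative (non-member) samples. For each query node the attack feature is the
# distance between the posterior aggregated from the original shard models and the
# posterior aggregated after unlearning (and, if enabled, after repartitioning),
# e.g. dist_i = np.linalg.norm(p_before[i] - p_after[i]); the AUC of these distances
# against the membership labels is computed directly with sklearn's roc_auc_score,
# so no separate shadow or attack classifier is trained here.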
171 | self.evaluate_attack_performance(positive_posteriors, negative_posteriors) 172 | 173 | def _query_target_model(self, unlearned_indices, test_indices): 174 | # load unlearned data 175 | train_data = self.data_store.load_unlearned_data('train_data') 176 | 177 | # load optimal weight score 178 | # optimal_weight=self.data_store.load_optimal_weight(0) 179 | 180 | # calculate the final posterior, save as attack feature 181 | self.logger.info('aggregating submodels') 182 | posteriors_a, posteriors_b, posteriors_c =[],[],[] 183 | 184 | for i in unlearned_indices: 185 | community_to_node = self.data_store.load_community_data('') 186 | shard_data = self._generate_unlearned_repartitioned_shard_data(train_data, community_to_node, int(i)) 187 | 188 | posteriors_a.append(self._generate_posteriors(shard_data, '')) 189 | 190 | suffix="unlearned_" + str(i) 191 | posteriors_b.append(self._generate_posteriors_unlearned(shard_data, suffix, i)) 192 | 193 | if self.args['repartition']: 194 | suffix = "_repartition_unlearned_" + str(i) 195 | community_to_node = self.data_store.load_community_data(suffix) 196 | shard_data = self._generate_unlearned_repartitioned_shard_data(train_data, community_to_node, int(i)) 197 | suffix = "__repartition_unlearned_" + str(i) 198 | posteriors_c.append(self._generate_posteriors(shard_data, suffix)) 199 | 200 | return posteriors_a, posteriors_b, posteriors_c 201 | 202 | def _generate_posteriors_unlearned(self, shard_data, suffix, unlearned_indice): 203 | import glob 204 | model_path=glob.glob(config.MODEL_PATH+self.args['dataset_name']+"/*_1unlearned_"+str(unlearned_indice)) 205 | if not model_path: 206 | self.logger.info("No corresponding unlearned shard model for node %s" % str(unlearned_indice)) 207 | return torch.tensor([0]*6) 208 | else: 209 | affected_shard = int(model_path[0].split('/')[-1].split('_')[-4]) 210 | posteriors = [] 211 | for shard in range(self.args['num_shards']): 212 | if shard == affected_shard: 213 | # load the retrained the shard model 214 | self.data_store.load_target_model(0, self.target_model, shard, suffix) 215 | else: 216 | # self.target_model.model.reset_parameters() 217 | # load unaffected shard model 218 | self.data_store.load_target_model(0, self.target_model, shard, '') 219 | self.device = torch.device('cuda:3' if torch.cuda.is_available() else 'cpu') 220 | self.target_model.model = self.target_model.model.to(self.device) 221 | self.target_model.data = shard_data[shard].to(self.device) 222 | posteriors.append(self.target_model.posterior()) 223 | return torch.mean(torch.cat(posteriors, dim=0), dim=0) 224 | 225 | def _generate_posteriors(self, shard_data, suffix): 226 | posteriors = [] 227 | for shard in range(self.args['num_shards']): 228 | # self.target_model.model.reset_parameters() 229 | self.data_store.load_target_model(0, self.target_model, shard, suffix) 230 | self.device = torch.device('cuda:3' if torch.cuda.is_available() else 'cpu') 231 | self.target_model.model = self.target_model.model.to(self.device) 232 | self.target_model.data = shard_data[shard].to(self.device) 233 | 234 | posteriors.append(self.target_model.posterior()) 235 | return torch.mean(torch.cat(posteriors, dim=0), dim=0) 236 | 237 | def evaluate_attack_performance(self, positive_posteriors, negative_posteriors): 238 | # constrcut attack data 239 | label = torch.cat((torch.ones(len(positive_posteriors[0])), torch.zeros(len(negative_posteriors[0])))) 240 | data={} 241 | for i in range(2): 242 | data[i] = torch.cat((torch.stack(positive_posteriors[i]), 
torch.stack(negative_posteriors[i])),0) 243 | 244 | # calculate l2 distance 245 | model_b_distance = self._calculate_distance(data[0], data[1]) 246 | # directly calculate AUC with feature and labels 247 | attack_auc_b = self.evaluate_attack_with_AUC(model_b_distance, label) 248 | 249 | if self.args['repartition']: 250 | model_c_distance = self._calculate_distance(data[0], data[2]) 251 | attack_auc_c = self.evaluate_attack_with_AUC(model_c_distance, label) 252 | 253 | self.logger.info("Attack_Model_B AUC: %s | Attack_Model_C AUC: %s" % (attack_auc_b, attack_auc_c)) 254 | 255 | def evaluate_attack_with_AUC(self, data, label): 256 | from sklearn.metrics import roc_auc_score 257 | self.logger.info("Directly calculate the attack AUC") 258 | return roc_auc_score(label, data.reshape(-1, 1)) 259 | 260 | def _calculate_distance(self, data0, data1, distance='l2_norm' ): 261 | if distance == 'l2_norm': 262 | return np.array([np.linalg.norm(data0[i]-data1[i]) for i in range(len(data0))]) 263 | elif distance =='direct_diff': 264 | return data0 - data1 265 | else: 266 | raise Exception("Unsupported distance") 267 | -------------------------------------------------------------------------------- /exp/exp_graph_partition.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import time 3 | 4 | import torch 5 | from sklearn.model_selection import train_test_split 6 | import numpy as np 7 | from torch_geometric.data import Data 8 | import torch_geometric as tg 9 | import networkx as nx 10 | 11 | from exp.exp import Exp 12 | from lib_utils.utils import connected_component_subgraphs 13 | from lib_graph_partition.graph_partition import GraphPartition 14 | from lib_utils import utils 15 | 16 | 17 | class ExpGraphPartition(Exp): 18 | def __init__(self, args): 19 | super(ExpGraphPartition, self).__init__(args) 20 | 21 | self.logger = logging.getLogger('exp_graph_partition') 22 | 23 | self.load_data() 24 | self.train_test_split() 25 | self.gen_train_graph() 26 | self.graph_partition() 27 | self.generate_shard_data() 28 | 29 | def load_data(self): 30 | self.data = self.data_store.load_raw_data() 31 | 32 | def train_test_split(self): 33 | if self.args['is_split']: 34 | self.logger.info('splitting train/test data') 35 | self.train_indices, self.test_indices = train_test_split(np.arange((self.data.num_nodes)), test_size=self.args['test_ratio'], random_state=100) 36 | self.data_store.save_train_test_split(self.train_indices, self.test_indices) 37 | 38 | self.data.train_mask = torch.from_numpy(np.isin(np.arange(self.data.num_nodes), self.train_indices)) 39 | self.data.test_mask = torch.from_numpy(np.isin(np.arange(self.data.num_nodes), self.test_indices)) 40 | else: 41 | self.train_indices, self.test_indices = self.data_store.load_train_test_split() 42 | 43 | self.data.train_mask = torch.from_numpy(np.isin(np.arange(self.data.num_nodes), self.train_indices)) 44 | self.data.test_mask = torch.from_numpy(np.isin(np.arange(self.data.num_nodes), self.test_indices)) 45 | 46 | def gen_train_graph(self): 47 | # delete ratio of edges and update the train graph 48 | if self.args['ratio_deleted_edges'] != 0: 49 | self.logger.debug("Before edge deletion. train data #.Nodes: %f, #.Edges: %f" % ( 50 | self.data.num_nodes, self.data.num_edges)) 51 | 52 | # self._ratio_delete_edges() 53 | self.data.edge_index = self._ratio_delete_edges(self.data.edge_index) 54 | 55 | # decouple train test edges. 
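# An edge is treated as a test edge if either endpoint lies in the test split; only
# edges whose endpoints are both training nodes are kept when the networkx train graph
# is rebuilt below, so graph partitioning never sees train-test or test-test edges
# (e.g. with test node 5, edge (3, 5) is dropped while edge (3, 4) is kept).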
56 | edge_index = self.data.edge_index.numpy() 57 | test_edge_indices = np.logical_or(np.isin(edge_index[0], self.test_indices), 58 | np.isin(edge_index[1], self.test_indices)) 59 | train_edge_indices = np.logical_not(test_edge_indices) 60 | edge_index_train = edge_index[:, train_edge_indices] 61 | 62 | self.train_graph = nx.Graph() 63 | self.train_graph.add_nodes_from(self.train_indices) 64 | 65 | # use largest connected graph as train graph 66 | if self.args['is_prune']: 67 | self._prune_train_set() 68 | 69 | # reconstruct a networkx train graph 70 | for u, v in np.transpose(edge_index_train): 71 | self.train_graph.add_edge(u, v) 72 | 73 | self.logger.debug("After edge deletion. train graph #.Nodes: %f, #.Edges: %f" % ( 74 | self.train_graph.number_of_nodes(), self.train_graph.number_of_edges())) 75 | self.logger.debug("After edge deletion. train data #.Nodes: %f, #.Edges: %f" % ( 76 | self.data.num_nodes, self.data.num_edges)) 77 | self.data_store.save_train_data(self.data) 78 | self.data_store.save_train_graph(self.train_graph) 79 | 80 | def graph_partition(self): 81 | if self.args['is_partition']: 82 | self.logger.info('graph partitioning') 83 | 84 | start_time = time.time() 85 | partition = GraphPartition(self.args, self.train_graph, self.data) 86 | self.community_to_node = partition.graph_partition() 87 | partition_time = time.time() - start_time 88 | self.logger.info("Partition cost %s seconds." % partition_time) 89 | self.data_store.save_community_data(self.community_to_node) 90 | else: 91 | self.community_to_node = self.data_store.load_community_data() 92 | 93 | def generate_shard_data(self): 94 | self.logger.info('generating shard data') 95 | 96 | self.shard_data = {} 97 | for shard in range(self.args['num_shards']): 98 | train_shard_indices = list(self.community_to_node[shard]) 99 | shard_indices = np.union1d(train_shard_indices, self.test_indices) 100 | 101 | x = self.data.x[shard_indices] 102 | y = self.data.y[shard_indices] 103 | edge_index = utils.filter_edge_index_1(self.data, shard_indices) 104 | 105 | data = Data(x=x, edge_index=torch.from_numpy(edge_index), y=y) 106 | data.train_mask = torch.from_numpy(np.isin(shard_indices, train_shard_indices)) 107 | data.test_mask = torch.from_numpy(np.isin(shard_indices, self.test_indices)) 108 | 109 | self.shard_data[shard] = data 110 | 111 | self.data_store.save_shard_data(self.shard_data) 112 | 113 | def _prune_train_set(self): 114 | # extract the the maximum connected component 115 | self.logger.debug("Before Prune... #. of Nodes: %f, #. of Edges: %f" % ( 116 | self.train_graph.number_of_nodes(), self.train_graph.number_of_edges())) 117 | 118 | self.train_graph = max(connected_component_subgraphs(self.train_graph), key=len) 119 | 120 | self.logger.debug("After Prune... #. of Nodes: %f, #. 
of Edges: %f" % ( 121 | self.train_graph.number_of_nodes(), self.train_graph.number_of_edges())) 122 | # self.train_indices = np.array(self.train_graph.nodes) 123 | 124 | def _ratio_delete_edges(self, edge_index): 125 | edge_index = edge_index.numpy() 126 | 127 | unique_indices = np.where(edge_index[0] < edge_index[1])[0] 128 | unique_indices_not = np.where(edge_index[0] > edge_index[1])[0] 129 | remain_indices = np.random.choice(unique_indices, 130 | int(unique_indices.shape[0] * (1.0 - self.args['ratio_deleted_edges'])), 131 | replace=False) 132 | 133 | remain_encode = edge_index[0, remain_indices] * edge_index.shape[1] * 2 + edge_index[1, remain_indices] 134 | unique_encode_not = edge_index[1, unique_indices_not] * edge_index.shape[1] * 2 + edge_index[0, unique_indices_not] 135 | sort_indices = np.argsort(unique_encode_not) 136 | remain_indices_not = unique_indices_not[sort_indices[np.searchsorted(unique_encode_not, remain_encode, sorter=sort_indices)]] 137 | remain_indices = np.union1d(remain_indices, remain_indices_not) 138 | 139 | # self.data.edge_index = torch.from_numpy(edge_index[:, remain_indices]) 140 | return torch.from_numpy(edge_index[:, remain_indices]) 141 | -------------------------------------------------------------------------------- /exp/exp_node_edge_unlearning.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pickle 3 | import time 4 | from collections import defaultdict 5 | 6 | import numpy as np 7 | import torch 8 | from torch_geometric.data import Data 9 | 10 | import config 11 | from exp.exp import Exp 12 | from lib_gnn_model.graphsage.graphsage import SAGE 13 | from lib_gnn_model.gat.gat import GAT 14 | from lib_gnn_model.gin.gin import GIN 15 | from lib_gnn_model.gcn.gcn import GCN 16 | from lib_gnn_model.mlp.mlp import MLP 17 | from lib_gnn_model.node_classifier import NodeClassifier 18 | from lib_aggregator.aggregator import Aggregator 19 | from lib_utils import utils 20 | 21 | 22 | class ExpNodeEdgeUnlearning(Exp): 23 | def __init__(self, args): 24 | super(ExpNodeEdgeUnlearning, self).__init__(args) 25 | self.logger = logging.getLogger('exp_node_edge_unlearning') 26 | self.target_model_name = self.args['target_model'] 27 | 28 | self.load_data() 29 | self.determine_target_model() 30 | self.run_exp() 31 | 32 | def run_exp(self): 33 | # unlearning efficiency 34 | run_f1 = np.empty((0)) 35 | unlearning_time = np.empty((0)) 36 | for run in range(self.args['num_runs']): 37 | self.logger.info("Run %f" % run) 38 | self.train_target_models(run) 39 | aggregate_f1_score = self.aggregate(run) 40 | # node_unlearning_time = self.unlearning_time_statistic() 41 | node_unlearning_time = 0 42 | run_f1 = np.append(run_f1, aggregate_f1_score) 43 | unlearning_time = np.append(unlearning_time, node_unlearning_time) 44 | self.num_unlearned_edges = 0 45 | # model utility 46 | self.f1_score_avg = np.average(run_f1) 47 | self.f1_score_std = np.std(run_f1) 48 | self.unlearning_time_avg = np.average(unlearning_time) 49 | self.unlearning_time_std = np.std(unlearning_time) 50 | self.logger.info( 51 | "%s %s %s %s" % (self.f1_score_avg, self.f1_score_std, self.unlearning_time_avg, self.unlearning_time_std)) 52 | 53 | def load_data(self): 54 | self.shard_data = self.data_store.load_shard_data() 55 | self.raw_data = self.data_store.load_raw_data() 56 | self.train_data = self.data_store.load_train_data() 57 | 58 | self.unlearned_shard_data = self.shard_data 59 | 60 | def determine_target_model(self): 61 | num_feats = 
self.train_data.num_features 62 | num_classes = len(self.train_data.y.unique()) 63 | 64 | if not self.args['is_use_batch']: 65 | if self.target_model_name == 'SAGE': 66 | self.target_model = SAGE(num_feats, num_classes) 67 | elif self.target_model_name == 'GCN': 68 | self.target_model = GCN(num_feats, num_classes) 69 | elif self.target_model_name == 'GAT': 70 | self.target_model = GAT(num_feats, num_classes) 71 | elif self.target_model_name == 'GIN': 72 | self.target_model = GIN(num_feats, num_classes) 73 | else: 74 | raise Exception('unsupported target model') 75 | else: 76 | if self.target_model_name == 'MLP': 77 | self.target_model = MLP(num_feats, num_classes) 78 | else: 79 | self.target_model = NodeClassifier(num_feats, num_classes, self.args) 80 | 81 | def train_target_models(self, run): 82 | if self.args['is_train_target_model']: 83 | self.logger.info('training target models') 84 | 85 | self.time = {} 86 | for shard in range(self.args['num_shards']): 87 | self.time[shard] = self._train_model(run, shard) 88 | 89 | def aggregate(self, run): 90 | self.logger.info('aggregating submodels') 91 | 92 | # posteriors, true_label = self.generate_posterior() 93 | aggregator = Aggregator(run, self.target_model, self.train_data, self.unlearned_shard_data, self.args) 94 | aggregator.generate_posterior() 95 | self.aggregate_f1_score = aggregator.aggregate() 96 | 97 | self.logger.info("Final Test F1: %s" % (self.aggregate_f1_score,)) 98 | return self.aggregate_f1_score 99 | 100 | def _generate_unlearning_request(self, num_unlearned="assign"): 101 | node_list = [] 102 | for key, value in self.community_to_node.items(): 103 | # node_list.extend(value.tolist()) 104 | node_list.extend(value) 105 | if num_unlearned == "assign": 106 | num_of_unlearned_nodes = self.args['num_unlearned_nodes'] 107 | elif num_unlearned == "ratio": 108 | num_of_unlearned_nodes = int(self.args['ratio_unlearned_nodes'] * len(node_list)) 109 | 110 | if self.args['unlearning_request'] == 'random': 111 | unlearned_nodes_indices = np.random.choice(node_list, num_of_unlearned_nodes, replace=False) 112 | 113 | elif self.args['unlearning_request'] == 'top1': 114 | sorted_shards = sorted(self.community_to_node.items(), key=lambda x: len(x[1]), reverse=True) 115 | unlearned_nodes_indices = np.random.choice(sorted_shards[0][1], num_of_unlearned_nodes, replace=False) 116 | 117 | elif self.args['unlearning_request'] == 'adaptive': 118 | sorted_shards = sorted(self.community_to_node.items(), key=lambda x: len(x[1]), reverse=True) 119 | candidate_list = np.concatenate([sorted_shards[i][1] for i in range(int(self.args['num_shards']/2)+1)], axis=0) 120 | unlearned_nodes_indices = np.random.choice(candidate_list, num_of_unlearned_nodes, replace=False) 121 | 122 | elif self.args['unlearning_request'] == 'last5': 123 | sorted_shards = sorted(self.community_to_node.items(), key=lambda x: len(x[1]), reverse=False) 124 | candidate_list = np.concatenate([sorted_shards[i][1] for i in range(int(self.args['num_shards']/2)+1)], axis=0) 125 | unlearned_nodes_indices = np.random.choice(candidate_list, num_of_unlearned_nodes, replace=False) 126 | 127 | return unlearned_nodes_indices 128 | 129 | def unlearning_time_statistic(self): 130 | if self.args['is_train_target_model'] and self.args['num_shards'] != 1: 131 | # random sample 5% nodes, find their belonging communities 132 | unlearned_nodes = self._generate_unlearning_request(num_unlearned="ratio") 133 | belong_community = [] 134 | for sample_node in range(len(unlearned_nodes)): 135 | for community, node 
in self.community_to_node.items(): 136 | if np.in1d(unlearned_nodes[sample_node], node).any(): 137 | belong_community.append(community) 138 | 139 | # calculate the total unlearning time and group unlearning time 140 | group_unlearning_time = [] 141 | node_unlearning_time = [] 142 | for shard in range(self.args['num_shards']): 143 | if belong_community.count(shard) != 0: 144 | group_unlearning_time.append(self.time[shard]) 145 | node_unlearning_time.extend([float(self.time[shard]) for j in range(belong_community.count(shard))]) 146 | return node_unlearning_time 147 | 148 | elif self.args['is_train_target_model'] and self.args['num_shards'] == 1: 149 | return self.time[0] 150 | 151 | else: 152 | return 0 153 | 154 | def _train_model(self, run, shard): 155 | self.logger.info('training target models, run %s, shard %s' % (run, shard)) 156 | 157 | start_time = time.time() 158 | self.target_model.data = self.unlearned_shard_data[shard] 159 | self.target_model.train_model() 160 | train_time = time.time() - start_time 161 | 162 | self.data_store.save_target_model(run, self.target_model, shard) 163 | 164 | return train_time 165 | -------------------------------------------------------------------------------- /exp/exp_unlearning.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import time 3 | 4 | import numpy as np 5 | 6 | from exp.exp import Exp 7 | from lib_gnn_model.graphsage.graphsage import SAGE 8 | from lib_gnn_model.gat.gat import GAT 9 | from lib_gnn_model.gin.gin import GIN 10 | from lib_gnn_model.gcn.gcn import GCN 11 | from lib_gnn_model.mlp.mlp import MLP 12 | from lib_gnn_model.node_classifier import NodeClassifier 13 | from lib_aggregator.aggregator import Aggregator 14 | 15 | 16 | class ExpUnlearning(Exp): 17 | def __init__(self, args): 18 | super(ExpUnlearning, self).__init__(args) 19 | 20 | self.logger = logging.getLogger('exp_unlearning') 21 | 22 | self.target_model_name = self.args['target_model'] 23 | self.num_opt_samples = self.args['num_opt_samples'] 24 | 25 | self.load_data() 26 | self.determine_target_model() 27 | 28 | run_f1 = np.empty((0)) 29 | unlearning_time = np.empty((0)) 30 | for run in range(self.args['num_runs']): 31 | self.logger.info("Run %f" % run) 32 | self.train_target_models(run) 33 | aggregate_f1_score = self.aggregate(run) 34 | node_unlearning_time = self.unlearning_time_statistic() 35 | run_f1 = np.append(run_f1, aggregate_f1_score) 36 | unlearning_time = np.append(unlearning_time, node_unlearning_time) 37 | 38 | self.f1_score_avg = np.average(run_f1) 39 | self.f1_score_std = np.std(run_f1) 40 | self.unlearning_time_avg = np.average(unlearning_time) 41 | self.unlearning_time_std = np.std(unlearning_time) 42 | self.logger.info("%s %s %s %s" % (self.f1_score_avg, self.f1_score_std, self.unlearning_time_avg, self.unlearning_time_std)) 43 | 44 | def load_data(self): 45 | self.shard_data = self.data_store.load_shard_data() 46 | self.data = self.data_store.load_raw_data() 47 | 48 | def determine_target_model(self): 49 | num_feats = self.data.num_features 50 | num_classes = len(self.data.y.unique()) 51 | 52 | if not self.args['is_use_batch']: 53 | if self.target_model_name == 'SAGE': 54 | self.target_model = SAGE(num_feats, num_classes) 55 | elif self.target_model_name == 'GCN': 56 | self.target_model = GCN(num_feats, num_classes) 57 | elif self.target_model_name == 'GAT': 58 | self.target_model = GAT(num_feats, num_classes) 59 | elif self.target_model_name == 'GIN': 60 | self.target_model = GIN(num_feats, 
num_classes) 61 | else: 62 | raise Exception('unsupported target model') 63 | else: 64 | if self.target_model_name == 'MLP': 65 | self.target_model = MLP(num_feats, num_classes) 66 | else: 67 | self.target_model = NodeClassifier(num_feats, num_classes, self.args) 68 | 69 | def train_target_models(self, run): 70 | if self.args['is_train_target_model']: 71 | self.logger.info('training target models') 72 | 73 | self.time = {} 74 | for shard in range(self.args['num_shards']): 75 | self.time[shard] = self._train_model(run, shard) 76 | 77 | def aggregate(self, run): 78 | self.logger.info('aggregating submodels') 79 | 80 | start_time = time.time() 81 | aggregator = Aggregator(run, self.target_model, self.data, self.shard_data, self.args) 82 | aggregator.generate_posterior() 83 | self.aggregate_f1_score = aggregator.aggregate() 84 | aggregate_time = time.time() - start_time 85 | self.logger.info("Partition cost %s seconds." % aggregate_time) 86 | 87 | self.logger.info("Final Test F1: %s" % (self.aggregate_f1_score,)) 88 | return self.aggregate_f1_score 89 | 90 | def unlearning_time_statistic(self): 91 | if self.args['is_train_target_model'] and self.args['num_shards'] != 1: 92 | self.community_to_node = self.data_store.load_community_data() 93 | node_list = [] 94 | for key, value in self.community_to_node.items(): 95 | node_list.extend(value) 96 | 97 | # random sample 5% nodes, find their belonging communities 98 | sample_nodes = np.random.choice(node_list, int(0.05 * len(node_list))) 99 | belong_community = [] 100 | for sample_node in range(len(sample_nodes)): 101 | for community, node in self.community_to_node.items(): 102 | if np.in1d(sample_nodes[sample_node], node).any(): 103 | belong_community.append(community) 104 | 105 | # calculate the total unlearning time and group unlearning time 106 | group_unlearning_time = [] 107 | node_unlearning_time = [] 108 | for shard in range(self.args['num_shards']): 109 | if belong_community.count(shard) != 0: 110 | group_unlearning_time.append(self.time[shard]) 111 | node_unlearning_time.extend([float(self.time[shard]) for j in range(belong_community.count(shard))]) 112 | 113 | return node_unlearning_time 114 | 115 | elif self.args['is_train_target_model'] and self.args['num_shards'] == 1: 116 | return self.time[0] 117 | 118 | else: 119 | return 0 120 | 121 | def _train_model(self, run, shard): 122 | self.logger.info('training target models, run %s, shard %s' % (run, shard)) 123 | 124 | start_time = time.time() 125 | self.target_model.data = self.shard_data[shard] 126 | self.target_model.train_model() 127 | train_time = time.time() - start_time 128 | 129 | self.data_store.save_target_model(run, self.target_model, shard) 130 | self.logger.info("Model training time: %s" % (train_time)) 131 | 132 | return train_time 133 | -------------------------------------------------------------------------------- /lib_aggregator/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MinChen00/Graph-Unlearning/a9b942d01651c2e3d780ae12e1a1459e35120ffa/lib_aggregator/__init__.py -------------------------------------------------------------------------------- /lib_aggregator/aggregator.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import torch 3 | 4 | torch.cuda.empty_cache() 5 | 6 | from sklearn.metrics import f1_score 7 | import numpy as np 8 | 9 | from lib_aggregator.optimal_aggregator import OptimalAggregator 10 | from 
lib_dataset.data_store import DataStore 11 | 12 | 13 | class Aggregator: 14 | def __init__(self, run, target_model, data, shard_data, args): 15 | self.logger = logging.getLogger('Aggregator') 16 | self.args = args 17 | 18 | self.data_store = DataStore(self.args) 19 | 20 | self.run = run 21 | self.target_model = target_model 22 | self.data = data 23 | self.shard_data = shard_data 24 | 25 | self.num_shards = args['num_shards'] 26 | 27 | def generate_posterior(self, suffix=""): 28 | self.true_label = self.shard_data[0].y[self.shard_data[0]['test_mask']].detach().cpu().numpy() 29 | self.posteriors = {} 30 | 31 | for shard in range(self.args['num_shards']): 32 | self.target_model.data = self.shard_data[shard] 33 | self.data_store.load_target_model(self.run, self.target_model, shard, suffix) 34 | self.posteriors[shard] = self.target_model.posterior() 35 | self.logger.info("Saving posteriors.") 36 | self.data_store.save_posteriors(self.posteriors, self.run, suffix) 37 | 38 | def aggregate(self): 39 | if self.args['aggregator'] == 'mean': 40 | aggregate_f1_score = self._mean_aggregator() 41 | elif self.args['aggregator'] == 'optimal': 42 | aggregate_f1_score = self._optimal_aggregator() 43 | elif self.args['aggregator'] == 'majority': 44 | aggregate_f1_score = self._majority_aggregator() 45 | else: 46 | raise Exception("unsupported aggregator.") 47 | 48 | return aggregate_f1_score 49 | 50 | def _mean_aggregator(self): 51 | posterior = self.posteriors[0] 52 | for shard in range(1, self.num_shards): 53 | posterior += self.posteriors[shard] 54 | 55 | posterior = posterior / self.num_shards 56 | return f1_score(self.true_label, posterior.argmax(axis=1).cpu().numpy(), average="micro") 57 | 58 | def _majority_aggregator(self): 59 | pred_labels = [] 60 | for shard in range(self.num_shards): 61 | pred_labels.append(self.posteriors[shard].argmax(axis=1).cpu().numpy()) 62 | 63 | pred_labels = np.stack(pred_labels) 64 | pred_label = np.argmax( 65 | np.apply_along_axis(np.bincount, axis=0, arr=pred_labels, minlength=self.posteriors[0].shape[1]), axis=0) 66 | 67 | return f1_score(self.true_label, pred_label, average="micro") 68 | 69 | def _optimal_aggregator(self): 70 | optimal = OptimalAggregator(self.run, self.target_model, self.data, self.args) 71 | optimal.generate_train_data() 72 | weight_para = optimal.optimization() 73 | self.data_store.save_optimal_weight(weight_para, run=self.run) 74 | 75 | posterior = self.posteriors[0] * weight_para[0] 76 | for shard in range(1, self.num_shards): 77 | posterior += self.posteriors[shard] * weight_para[shard] 78 | 79 | return f1_score(self.true_label, posterior.argmax(axis=1).cpu().numpy(), average="micro") 80 | -------------------------------------------------------------------------------- /lib_aggregator/opt_dataset.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import Dataset 2 | 3 | 4 | class OptDataset(Dataset): 5 | def __init__(self, posteriors, labels): 6 | self.posteriors = posteriors 7 | self.labels = labels 8 | 9 | def __getitem__(self, index): 10 | ret_posterior = {} 11 | 12 | for shard, post in self.posteriors.items(): 13 | ret_posterior[shard] = post[index] 14 | 15 | return ret_posterior, self.labels[index] 16 | 17 | def __len__(self): 18 | return self.labels.shape[0] 19 | -------------------------------------------------------------------------------- /lib_aggregator/optimal_aggregator.py: -------------------------------------------------------------------------------- 1 | import copy 2 | 
import logging 3 | 4 | import numpy as np 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | from torch import optim 9 | from torch.optim.lr_scheduler import MultiStepLR 10 | from torch.utils.data import DataLoader 11 | from torch_geometric.data import Data 12 | 13 | from lib_aggregator.opt_dataset import OptDataset 14 | from lib_dataset.data_store import DataStore 15 | from lib_utils import utils 16 | 17 | 18 | class OptimalAggregator: 19 | def __init__(self, run, target_model, data, args): 20 | self.logger = logging.getLogger('optimal_aggregator') 21 | self.args = args 22 | 23 | self.run = run 24 | self.target_model = target_model 25 | self.data = data 26 | self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 27 | 28 | self.num_shards = args['num_shards'] 29 | 30 | def generate_train_data(self): 31 | data_store = DataStore(self.args) 32 | train_indices, _ = data_store.load_train_test_split() 33 | 34 | # sample a set of nodes from train_indices 35 | if self.args["num_opt_samples"] == 1000: 36 | train_indices = np.random.choice(train_indices, size=1000, replace=False) 37 | elif self.args["num_opt_samples"] == 10000: 38 | train_indices = np.random.choice(train_indices, size=int(train_indices.shape[0] * 0.1), replace=False) 39 | elif self.args["num_opt_samples"] == 1: 40 | train_indices = np.random.choice(train_indices, size=int(train_indices.shape[0]), replace=False) 41 | 42 | train_indices = np.sort(train_indices) 43 | self.logger.info("Using %s samples for optimization" % (int(train_indices.shape[0]))) 44 | 45 | x = self.data.x[train_indices] 46 | y = self.data.y[train_indices] 47 | edge_index = utils.filter_edge_index(self.data.edge_index, train_indices) 48 | 49 | train_data = Data(x=x, edge_index=torch.from_numpy(edge_index), y=y) 50 | train_data.train_mask = torch.zeros(train_indices.shape[0], dtype=torch.bool) 51 | train_data.test_mask = torch.ones(train_indices.shape[0], dtype=torch.bool) 52 | self.true_labels = y 53 | 54 | self.posteriors = {} 55 | for shard in range(self.num_shards): 56 | self.target_model.data = train_data 57 | data_store.load_target_model(self.run, self.target_model, shard) 58 | self.posteriors[shard] = self.target_model.posterior().to(self.device) 59 | 60 | def optimization(self): 61 | weight_para = nn.Parameter(torch.full((self.num_shards,), fill_value=1.0 / self.num_shards), requires_grad=True) 62 | optimizer = optim.Adam([weight_para], lr=self.args['opt_lr']) 63 | scheduler = MultiStepLR(optimizer, milestones=[500, 1000], gamma=self.args['opt_lr']) 64 | 65 | train_dset = OptDataset(self.posteriors, self.true_labels) 66 | train_loader = DataLoader(train_dset, batch_size=32, shuffle=True, num_workers=0) 67 | 68 | min_loss = 1000.0 69 | for epoch in range(self.args['opt_num_epochs']): 70 | loss_all = 0.0 71 | 72 | for posteriors, labels in train_loader: 73 | labels = labels.to(self.device) 74 | 75 | optimizer.zero_grad() 76 | loss = self._loss_fn(posteriors, labels, weight_para) 77 | loss.backward() 78 | loss_all += loss 79 | 80 | optimizer.step() 81 | with torch.no_grad(): 82 | weight_para[:] = torch.clamp(weight_para, min=0.0) 83 | 84 | scheduler.step() 85 | 86 | if loss_all < min_loss: 87 | ret_weight_para = copy.deepcopy(weight_para) 88 | min_loss = loss_all 89 | 90 | self.logger.info('epoch: %s, loss: %s' % (epoch, loss_all)) 91 | 92 | return ret_weight_para / torch.sum(ret_weight_para) 93 | 94 | def _loss_fn(self, posteriors, labels, weight_para): 95 | aggregate_posteriors = 
torch.zeros_like(posteriors[0]) 96 | for shard in range(self.num_shards): 97 | aggregate_posteriors += weight_para[shard] * posteriors[shard] 98 | 99 | aggregate_posteriors = F.softmax(aggregate_posteriors, dim=1) 100 | loss_1 = F.cross_entropy(aggregate_posteriors, labels) 101 | loss_2 = torch.sqrt(torch.sum(weight_para ** 2)) 102 | 103 | return loss_1 + loss_2 104 | -------------------------------------------------------------------------------- /lib_dataset/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MinChen00/Graph-Unlearning/a9b942d01651c2e3d780ae12e1a1459e35120ffa/lib_dataset/__init__.py -------------------------------------------------------------------------------- /lib_dataset/data_store.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import logging 4 | import shutil 5 | 6 | import numpy as np 7 | import torch 8 | from torch_geometric.datasets import Planetoid, Coauthor 9 | import torch_geometric.transforms as T 10 | 11 | import config 12 | 13 | 14 | class DataStore: 15 | def __init__(self, args): 16 | self.logger = logging.getLogger('data_store') 17 | self.args = args 18 | 19 | self.dataset_name = self.args['dataset_name'] 20 | self.num_features = { 21 | "cora": 1433, 22 | "pubmed": 500, 23 | "citeseer": 3703, 24 | "Coauthor_CS": 6805, 25 | "Coauthor_Phys": 8415 26 | } 27 | self.partition_method = self.args['partition_method'] 28 | self.num_shards = self.args['num_shards'] 29 | self.target_model = self.args['target_model'] 30 | 31 | self.determine_data_path() 32 | 33 | def determine_data_path(self): 34 | embedding_name = '_'.join(('embedding', self._extract_embedding_method(self.partition_method), 35 | str(self.args['ratio_deleted_edges']))) 36 | 37 | community_name = '_'.join(('community', self.partition_method, str(self.num_shards), 38 | str(self.args['ratio_deleted_edges']))) 39 | shard_name = '_'.join(('shard_data', self.partition_method, str(self.num_shards), 40 | str(self.args['shard_size_delta']), str(self.args['ratio_deleted_edges']))) 41 | target_model_name = '_'.join((self.target_model, self.partition_method, str(self.num_shards), 42 | str(self.args['shard_size_delta']), str(self.args['ratio_deleted_edges']))) 43 | optimal_weight_name = '_'.join((self.target_model, self.partition_method, str(self.num_shards), 44 | str(self.args['shard_size_delta']), str(self.args['ratio_deleted_edges']))) 45 | 46 | processed_data_prefix = config.PROCESSED_DATA_PATH + self.dataset_name + "/" 47 | self.train_test_split_file = processed_data_prefix + "train_test_split" + str(self.args['test_ratio']) 48 | self.train_data_file = processed_data_prefix + "train_data" 49 | self.train_graph_file = processed_data_prefix + "train_graph" 50 | self.embedding_file = processed_data_prefix + embedding_name 51 | self.community_file = processed_data_prefix + community_name 52 | self.shard_file = processed_data_prefix + shard_name 53 | self.unlearned_file = processed_data_prefix+ '_'.join(('unlearned', str(self.args['num_unlearned_nodes']))) 54 | 55 | self.target_model_file = config.MODEL_PATH + self.dataset_name + '/' + target_model_name 56 | self.optimal_weight_file = config.ANALYSIS_PATH + 'optimal/' + self.dataset_name + '/' + optimal_weight_name 57 | self.posteriors_file = config.ANALYSIS_PATH + 'posteriors/' + self.dataset_name + '/' + target_model_name 58 | 59 | dir_lists = [s + self.dataset_name for s in [config.PROCESSED_DATA_PATH, 60 | 
config.MODEL_PATH, 61 | config.ANALYSIS_PATH + 'optimal/', 62 | config.ANALYSIS_PATH + 'posteriors/']] 63 | for dir in dir_lists: 64 | self._check_and_create_dirs(dir) 65 | 66 | def _check_and_create_dirs(self, folder): 67 | if not os.path.exists(folder): 68 | try: 69 | self.logger.info("checking directory %s", folder) 70 | os.makedirs(folder, exist_ok=True) 71 | self.logger.info("new directory %s created", folder) 72 | except OSError as error: 73 | self.logger.info("deleting old and creating new empty %s", folder) 74 | shutil.rmtree(folder) 75 | os.mkdir(folder) 76 | self.logger.info("new empty directory %s created", folder) 77 | else: 78 | self.logger.info("folder %s exists, do not need to create again.", folder) 79 | 80 | def load_raw_data(self): 81 | self.logger.info('loading raw data') 82 | if not self.args['is_use_node_feature']: 83 | self.transform = T.Compose([ 84 | T.OneHotDegree(-2, cat=False) # use only node degree as node feature. 85 | ]) 86 | else: 87 | self.transform = None 88 | 89 | if self.dataset_name in ["cora", "pubmed", "citeseer"]: 90 | dataset = Planetoid(config.RAW_DATA_PATH, self.dataset_name, transform=T.NormalizeFeatures()) 91 | labels = np.unique(dataset.data.y.numpy()) 92 | elif self.dataset_name in ["Coauthor_CS", "Coauthor_Phys"]: 93 | if self.dataset_name == "Coauthor_Phys": 94 | dataset = Coauthor(config.RAW_DATA_PATH, name="Physics", pre_transform=self.transform) 95 | else: 96 | dataset = Coauthor(config.RAW_DATA_PATH, name="CS", pre_transform=self.transform) 97 | else: 98 | raise Exception('unsupported dataset') 99 | 100 | data = dataset[0] 101 | 102 | return data 103 | 104 | def save_train_data(self, train_data): 105 | self.logger.info('saving train data') 106 | pickle.dump(train_data, open(self.train_data_file, 'wb')) 107 | 108 | def load_train_data(self): 109 | self.logger.info('loading train data') 110 | return pickle.load(open(self.train_data_file, 'rb')) 111 | 112 | def save_train_graph(self, train_data): 113 | self.logger.info('saving train graph') 114 | pickle.dump(train_data, open(self.train_graph_file, 'wb')) 115 | 116 | def load_train_graph(self): 117 | self.logger.info('loading train graph') 118 | return pickle.load(open(self.train_graph_file, 'rb')) 119 | 120 | def save_train_test_split(self, train_indices, test_indices): 121 | self.logger.info('saving train test split data') 122 | pickle.dump((train_indices, test_indices), open(self.train_test_split_file, 'wb')) 123 | 124 | def load_train_test_split(self): 125 | self.logger.info('loading train test split data') 126 | return pickle.load(open(self.train_test_split_file, 'rb')) 127 | 128 | def save_embeddings(self, embeddings): 129 | self.logger.info('saving embedding data') 130 | pickle.dump(embeddings, open(self.embedding_file, 'wb')) 131 | 132 | def load_embeddings(self): 133 | self.logger.info('loading embedding data') 134 | return pickle.load(open(self.embedding_file, 'rb')) 135 | 136 | def save_community_data(self, community_to_node, suffix=''): 137 | self.logger.info('saving community data') 138 | pickle.dump(community_to_node, open(self.community_file + suffix, 'wb')) 139 | 140 | def load_community_data(self, suffix=''): 141 | self.logger.info('loading community data from: %s'%(self.community_file + suffix)) 142 | return pickle.load(open(self.community_file + suffix, 'rb')) 143 | 144 | def c2n_to_n2c(self, community_to_node): 145 | node_list = [] 146 | for i in range(self.num_shards): 147 | node_list.extend(list(community_to_node.values())[i]) 148 | node_to_community = {} 149 | 150 | for 
comm, nodes in dict(community_to_node).items(): 151 | for node in nodes: 152 | # Map node id back to original graph 153 | # node_to_community[node_list[node]] = comm 154 | node_to_community[node] = comm 155 | 156 | return node_to_community 157 | 158 | def save_shard_data(self, shard_data): 159 | self.logger.info('saving shard data') 160 | pickle.dump(shard_data, open(self.shard_file, 'wb')) 161 | 162 | def load_shard_data(self): 163 | self.logger.info('loading shard data') 164 | return pickle.load(open(self.shard_file, 'rb')) 165 | 166 | def load_unlearned_data(self, suffix): 167 | file_path = '_'.join((self.unlearned_file, suffix)) 168 | self.logger.info('loading unlearned data from %s' % file_path) 169 | return pickle.load(open(file_path, 'rb')) 170 | 171 | def save_unlearned_data(self, data, suffix): 172 | self.logger.info('saving unlearned data %s' % suffix) 173 | pickle.dump(data, open('_'.join((self.unlearned_file, suffix)), 'wb')) 174 | 175 | def save_target_model(self, run, model, shard, suffix=''): 176 | if self.args["exp"] in ["node_edge_unlearning", "attack_unlearning"]: 177 | model_path = '_'.join((self.target_model_file, str(shard), str(run), str(self.args['num_unlearned_nodes']))) + suffix 178 | model.save_model(model_path) 179 | else: 180 | model.save_model(self.target_model_file + '_' + str(shard) + '_' + str(run)) 181 | # model.save_model(self.target_model_file + '_' + str(shard)) 182 | 183 | def load_target_model(self, run, model, shard, suffix=''): 184 | if self.args["exp"] == "node_edge_unlearning": 185 | model.load_model( 186 | '_'.join((self.target_model_file, str(shard), str(run), str(self.args['num_unlearned_nodes'])))) 187 | elif self.args["exp"] == "attack_unlearning": 188 | model_path = '_'.join((self.target_model_file, str(shard), str(run), str(self.args['num_unlearned_nodes']))) + suffix 189 | print("loading target model from:" + model_path) 190 | device = torch.device('cpu') 191 | model.load_model(model_path) 192 | model.device=device 193 | else: 194 | # model.load_model(self.target_model_file + '_' + str(shard) + '_' + str(run)) 195 | model.load_model(self.target_model_file + '_' + str(shard) + '_' + str(0)) 196 | 197 | def save_optimal_weight(self, weight, run): 198 | torch.save(weight, self.optimal_weight_file + '_' + str(run)) 199 | 200 | def load_optimal_weight(self, run): 201 | return torch.load(self.optimal_weight_file + '_' + str(run)) 202 | 203 | def save_posteriors(self, posteriors, run, suffix=''): 204 | torch.save(posteriors, self.posteriors_file + '_' + str(run) + suffix) 205 | 206 | def load_posteriors(self, run): 207 | return torch.load(self.posteriors_file + '_' + str(run)) 208 | 209 | def _extract_embedding_method(self, partition_method): 210 | return partition_method.split('_')[0] 211 | -------------------------------------------------------------------------------- /lib_gnn_model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MinChen00/Graph-Unlearning/a9b942d01651c2e3d780ae12e1a1459e35120ffa/lib_gnn_model/__init__.py -------------------------------------------------------------------------------- /lib_gnn_model/gat/gat.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | import torch 5 | import torch.nn.functional as F 6 | import torch_geometric.transforms as T 7 | from torch_geometric.datasets import Planetoid 8 | 9 | import config 10 | from lib_gnn_model.gnn_base import GNNBase 11 | from 
lib_gnn_model.gat.gat_net import GATNet 12 | 13 | 14 | class GAT(GNNBase): 15 | def __init__(self, num_feats, num_classes, data=None): 16 | super(GAT, self).__init__() 17 | self.logger = logging.getLogger('gat') 18 | 19 | self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 20 | self.model = GATNet(num_feats, num_classes) 21 | self.data = data 22 | 23 | def train_model(self, num_epoch=100): 24 | self.model.train() 25 | self.model.reset_parameters() 26 | self.model, self.data = self.model.to(self.device), self.data.to(self.device) 27 | 28 | optimizer = torch.optim.Adam(self.model.parameters(), lr=0.005, weight_decay=0.0001) 29 | 30 | for epoch in range(num_epoch): 31 | self.logger.info('epoch %s' % (epoch,)) 32 | 33 | optimizer.zero_grad() 34 | output = self.model(self.data)[self.data.train_mask] 35 | loss = F.nll_loss(output, self.data.y[self.data.train_mask]) 36 | loss.backward() 37 | optimizer.step() 38 | 39 | train_acc, test_acc = self.evaluate_model() 40 | self.logger.info('train acc: %s, test acc: %s' % (train_acc, test_acc)) 41 | 42 | def evaluate_model(self): 43 | self.model.eval() 44 | # self.model, self.data = self.model.to(self.device), self.data.to(self.device) 45 | 46 | logits, accs = self.model(self.data), [] 47 | 48 | for _, mask in self.data('train_mask', 'test_mask'): 49 | pred = logits[mask].max(1)[1] 50 | acc = pred.eq(self.data.y[mask]).sum().item() / mask.sum().item() 51 | accs.append(acc) 52 | 53 | return accs 54 | 55 | 56 | if __name__ == '__main__': 57 | os.chdir('../../') 58 | 59 | output_file = None 60 | logging.basicConfig(filename=output_file, 61 | format='%(levelname)s:%(asctime)s: - %(name)s - : %(message)s', 62 | level=logging.DEBUG) 63 | 64 | dataset_name = 'cora' 65 | dataset = Planetoid(config.RAW_DATA_PATH, dataset_name, transform=T.NormalizeFeatures()) 66 | data = dataset[0] 67 | 68 | gat = GAT(dataset.num_features, dataset.num_classes, data) 69 | gat.train_model() 70 | # gat.evaluate_model() 71 | -------------------------------------------------------------------------------- /lib_gnn_model/gat/gat_net.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch_geometric.nn import GATConv 4 | 5 | 6 | class GATNet(torch.nn.Module): 7 | def __init__(self, num_feats, num_classes, dropout=0.6): 8 | super(GATNet, self).__init__() 9 | self.dropout = dropout 10 | 11 | self.conv1 = GATConv(num_feats, 8, heads=8, dropout=self.dropout, add_self_loops=False) 12 | # On the Pubmed dataset, use heads=8 in conv2. 
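# conv1 concatenates 8 attention heads of 8 hidden units each, so conv2 receives a
# 64-dimensional (8 * 8) input. The default output layer uses a single head; the
# commented-out alternative below averages 8 heads (concat=False), the configuration
# the GAT paper recommends for Pubmed.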
13 | self.conv2 = GATConv(8 * 8, num_classes, heads=1, concat=False, dropout=self.dropout, add_self_loops=False) 14 | # self.conv2 = GATConv(8 * 8, num_classes, heads=8, concat=False, dropout=self.dropout, add_self_loops=False) 15 | 16 | self.reset_parameters() 17 | 18 | def forward(self, data): 19 | x = F.dropout(data.x, p=self.dropout, training=self.training) 20 | x = F.elu(self.conv1(x, data.edge_index)) 21 | x = F.dropout(x, p=self.dropout, training=self.training) 22 | x = self.conv2(x, data.edge_index) 23 | 24 | return F.log_softmax(x, dim=1) 25 | 26 | def reset_parameters(self): 27 | self.conv1.reset_parameters() 28 | self.conv2.reset_parameters() 29 | -------------------------------------------------------------------------------- /lib_gnn_model/gcn/gcn.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | 4 | import torch 5 | import torch.nn.functional as F 6 | import torch_geometric.transforms as T 7 | from torch_geometric.datasets import Planetoid 8 | 9 | from lib_gnn_model.gnn_base import GNNBase 10 | from lib_gnn_model.gcn.gcn_net import GCNNet 11 | import config 12 | 13 | 14 | class GCN(GNNBase): 15 | def __init__(self, num_feats, num_classes, data=None): 16 | super(GCN, self).__init__() 17 | self.logger = logging.getLogger('gcn') 18 | 19 | self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 20 | self.model = GCNNet(num_feats, num_classes) 21 | self.data = data 22 | 23 | def train_model(self, num_epoch=100): 24 | self.model.train() 25 | self.model.reset_parameters() 26 | self.model, self.data = self.model.to(self.device), self.data.to(self.device) 27 | 28 | optimizer = torch.optim.Adam(self.model.parameters(), lr=0.01) 29 | 30 | for epoch in range(num_epoch): 31 | self.logger.info('epoch %s' % (epoch,)) 32 | 33 | optimizer.zero_grad() 34 | output = self.model(self.data)[self.data.train_mask] 35 | loss = F.nll_loss(output, self.data.y[self.data.train_mask]) 36 | loss.backward() 37 | optimizer.step() 38 | 39 | train_acc, test_acc = self.evaluate_model() 40 | self.logger.info('train acc: %s, test acc: %s' % (train_acc, test_acc)) 41 | 42 | def evaluate_model(self): 43 | self.model.eval() 44 | self.model, self.data = self.model.to(self.device), self.data.to(self.device) 45 | 46 | logits, accs = self.model(self.data), [] 47 | 48 | for _, mask in self.data('train_mask', 'test_mask'): 49 | pred = logits[mask].max(1)[1] 50 | acc = pred.eq(self.data.y[mask]).sum().item() / mask.sum().item() 51 | accs.append(acc) 52 | 53 | return accs 54 | 55 | 56 | if __name__ == '__main__': 57 | os.chdir('../../') 58 | 59 | output_file = None 60 | logging.basicConfig(filename=output_file, 61 | format='%(levelname)s:%(asctime)s: - %(name)s - : %(message)s', 62 | level=logging.DEBUG) 63 | 64 | dataset_name = 'cora' 65 | dataset = Planetoid(config.RAW_DATA_PATH, dataset_name, transform=T.NormalizeFeatures()) 66 | data = dataset[0] 67 | 68 | gcn = GCN(dataset.num_features, dataset.num_classes, data) 69 | gcn.train_model() -------------------------------------------------------------------------------- /lib_gnn_model/gcn/gcn_net.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch_geometric.nn import GCNConv 4 | 5 | 6 | class GCNNet(torch.nn.Module): 7 | def __init__(self, num_feats, num_classes): 8 | super(GCNNet, self).__init__() 9 | 10 | self.conv1 = GCNConv(num_feats, 16, cached=True, add_self_loops=False) 11 | 
self.conv2 = GCNConv(16, num_classes, cached=True, add_self_loops=False) 12 | 13 | def forward(self, data): 14 | x, edge_index, edge_weight = data.x, data.edge_index, data.edge_attr 15 | x = F.relu(self.conv1(x, edge_index, edge_weight)) 16 | x = F.dropout(x, training=self.training) 17 | x = self.conv2(x, edge_index, edge_weight) 18 | 19 | return F.log_softmax(x, dim=-1) 20 | 21 | def reset_parameters(self): 22 | self.conv1.reset_parameters() 23 | self.conv2.reset_parameters() 24 | -------------------------------------------------------------------------------- /lib_gnn_model/gin/gin.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | 4 | import torch 5 | import torch.nn.functional as F 6 | import torch_geometric.transforms as T 7 | from torch_geometric.datasets import Planetoid, Reddit 8 | 9 | from lib_gnn_model.gnn_base import GNNBase 10 | from lib_gnn_model.gin.gin_net import GINNet 11 | import config 12 | 13 | 14 | class GIN(GNNBase): 15 | def __init__(self, num_feats, num_classes, data=None): 16 | super(GIN, self).__init__() 17 | self.logger = logging.getLogger('gin') 18 | 19 | self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 20 | self.model = GINNet(num_feats, num_classes).to(self.device) 21 | self.data = data 22 | 23 | def train_model(self, num_epochs=100): 24 | self.model.train() 25 | self.model.reset_parameters() 26 | self.model, self.data = self.model.to(self.device), self.data.to(self.device) 27 | 28 | optimizer = torch.optim.Adam(self.model.parameters(), lr=0.01) 29 | 30 | for epoch in range(num_epochs): 31 | self.logger.info('epoch %s' % (epoch,)) 32 | 33 | optimizer.zero_grad() 34 | output = self.model(self.data)[self.data.train_mask] 35 | loss = F.nll_loss(output, self.data.y[self.data.train_mask]) 36 | # loss = F.nll_loss(output, self.data.y.squeeze(1)[self.data.train_mask]) 37 | loss.backward() 38 | optimizer.step() 39 | 40 | train_acc, test_acc = self.evaluate_model() 41 | self.logger.info('train acc: %s, test acc: %s' % (train_acc, test_acc)) 42 | 43 | def evaluate_model(self): 44 | self.model.eval() 45 | self.model, self.data = self.model.to(self.device), self.data.to(self.device) 46 | 47 | logits, accs = self.model(self.data), [] 48 | 49 | for _, mask in self.data('train_mask', 'test_mask'): 50 | pred = logits[mask].max(1)[1] 51 | acc = pred.eq(self.data.y[mask]).sum().item() / mask.sum().item() 52 | accs.append(acc) 53 | 54 | return accs 55 | 56 | 57 | if __name__ == '__main__': 58 | os.chdir('../../') 59 | 60 | output_file = None 61 | logging.basicConfig(filename=output_file, 62 | format='%(levelname)s:%(asctime)s: - %(name)s - : %(message)s', 63 | level=logging.DEBUG) 64 | 65 | dataset_name = 'citeseer' 66 | dataset = Planetoid(config.RAW_DATA_PATH, dataset_name, transform=T.NormalizeFeatures()) 67 | data = dataset[0] 68 | 69 | gin = GIN(dataset.num_features, dataset.num_classes, data) 70 | gin.train_model() 71 | -------------------------------------------------------------------------------- /lib_gnn_model/gin/gin_net.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch.nn import Sequential, Linear, ReLU 4 | from torch_geometric.nn import GINConv 5 | 6 | 7 | class GINNet(torch.nn.Module): 8 | def __init__(self, num_feats, num_classes): 9 | super(GINNet, self).__init__() 10 | 11 | dim = 32 12 | 13 | nn1 = Sequential(Linear(num_feats, dim), ReLU(), Linear(dim, dim)) 14 | 
self.conv1 = GINConv(nn1) 15 | self.bn1 = torch.nn.BatchNorm1d(dim) 16 | 17 | nn2 = Sequential(Linear(dim, dim), ReLU(), Linear(dim, dim)) 18 | self.conv2 = GINConv(nn2) 19 | self.bn2 = torch.nn.BatchNorm1d(dim) 20 | 21 | nn3 = Sequential(Linear(dim, dim), ReLU(), Linear(dim, dim)) 22 | self.conv3 = GINConv(nn3) 23 | self.bn3 = torch.nn.BatchNorm1d(dim) 24 | 25 | nn4 = Sequential(Linear(dim, dim), ReLU(), Linear(dim, dim)) 26 | self.conv4 = GINConv(nn4) 27 | self.bn4 = torch.nn.BatchNorm1d(dim) 28 | 29 | nn5 = Sequential(Linear(dim, dim), ReLU(), Linear(dim, dim)) 30 | self.conv5 = GINConv(nn5) 31 | self.bn5 = torch.nn.BatchNorm1d(dim) 32 | 33 | self.fc1 = Linear(dim, dim) 34 | self.fc2 = Linear(dim, num_classes) 35 | 36 | def forward(self, data, batch=None): 37 | x = F.relu(self.conv1(data.x, data.edge_index)) 38 | x = self.bn1(x) 39 | x = F.relu(self.conv2(x, data.edge_index)) 40 | x = self.bn2(x) 41 | x = F.relu(self.fc1(x)) 42 | x = F.dropout(x, p=0.5, training=self.training) 43 | x = self.fc2(x) 44 | 45 | return F.log_softmax(x, dim=1) 46 | 47 | def reset_parameters(self): 48 | self.conv1.reset_parameters() 49 | self.conv2.reset_parameters() 50 | -------------------------------------------------------------------------------- /lib_gnn_model/gnn_base.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pickle 3 | 4 | import torch 5 | 6 | 7 | class GNNBase: 8 | def __init__(self): 9 | self.logger = logging.getLogger('gnn') 10 | 11 | self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 12 | # self.device = torch.device('cpu') 13 | self.model = None 14 | self.embedding_dim = 0 15 | self.data = None 16 | self.subgraph_loader = None 17 | 18 | def save_model(self, save_path): 19 | self.logger.info('saving model') 20 | torch.save(self.model.state_dict(), save_path) 21 | 22 | def load_model(self, save_path): 23 | self.logger.info('loading model') 24 | device = torch.device('cpu') 25 | self.model.load_state_dict(torch.load(save_path, map_location=device)) 26 | 27 | def save_paras(self, save_path): 28 | self.logger.info('saving paras') 29 | self.paras = { 30 | 'embedding_dim': self.embedding_dim 31 | } 32 | pickle.dump(self.paras, open(save_path, 'wb')) 33 | 34 | def load_paras(self, save_path): 35 | self.logger.info('loading paras') 36 | return pickle.load(open(save_path, 'rb')) 37 | 38 | def count_parameters(self): 39 | return sum(p.numel() for p in self.model.parameters() if p.requires_grad) 40 | 41 | def posterior(self): 42 | self.model.eval() 43 | self.model = self.model.to(self.device) 44 | self.data = self.data.to(self.device) 45 | 46 | posteriors = self.model(self.data) 47 | for _, mask in self.data('test_mask'): 48 | posteriors = posteriors[mask] 49 | 50 | return posteriors.detach() 51 | -------------------------------------------------------------------------------- /lib_gnn_model/graphsage/graphsage.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | 4 | import torch 5 | import torch.nn.functional as F 6 | import torch_geometric.transforms as T 7 | from torch_geometric.datasets import Planetoid 8 | from torch_geometric.data import NeighborSampler 9 | 10 | from lib_gnn_model.graphsage.graphsage_net import SageNet 11 | from lib_gnn_model.gnn_base import GNNBase 12 | import config 13 | 14 | 15 | class SAGE(GNNBase): 16 | def __init__(self, num_feats, num_classes, data=None): 17 | super(SAGE, self).__init__() 18 | self.logger = 
logging.getLogger('graphsage') 19 | 20 | self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 21 | # self.device = torch.device('cpu') 22 | self.model = SageNet(num_feats, 256, num_classes).to(self.device) 23 | self.data = data 24 | 25 | def train_model(self, num_epochs=100): 26 | self.model.train() 27 | self.model.reset_parameters() 28 | self.model, self.data = self.model.to(self.device), self.data.to(self.device) 29 | self.data.y = self.data.y.squeeze().to(self.device) 30 | self._gen_train_loader() 31 | 32 | optimizer = torch.optim.Adam(self.model.parameters(), lr=0.01, weight_decay=0.001) 33 | 34 | for epoch in range(num_epochs): 35 | self.logger.info('epoch %s' % (epoch,)) 36 | 37 | for batch_size, n_id, adjs in self.train_loader: 38 | # `adjs` holds a list of `(edge_index, e_id, size)` tuples. 39 | adjs = [adj.to(self.device) for adj in adjs] 40 | 41 | optimizer.zero_grad() 42 | out = self.model(self.data.x[n_id], adjs) 43 | loss = F.nll_loss(out, self.data.y[n_id[:batch_size]]) 44 | loss.backward() 45 | optimizer.step() 46 | 47 | train_acc, test_acc = self.evaluate_model() 48 | self.logger.info(f'Train: {train_acc:.4f}, Test: {test_acc:.4f}') 49 | 50 | @torch.no_grad() 51 | def evaluate_model(self): 52 | self.model.eval() 53 | self.model, self.data = self.model.to(self.device), self.data.to(self.device) 54 | self._gen_subgraph_loader() 55 | 56 | out = self.model.inference(self.data.x, self.subgraph_loader, self.device) 57 | y_true = self.data.y.cpu().unsqueeze(-1) 58 | y_pred = out.argmax(dim=-1, keepdim=True) 59 | 60 | results = [] 61 | for mask in [self.data.train_mask, self.data.test_mask]: 62 | results += [int(y_pred[mask].eq(y_true[mask]).sum()) / int(mask.sum())] 63 | 64 | return results 65 | 66 | def posterior(self): 67 | self.model.eval() 68 | self.model, self.data = self.model.to(self.device), self.data.to(self.device) 69 | self._gen_subgraph_loader() 70 | 71 | posteriors = self.model.inference(self.data.x, self.subgraph_loader, self.device) 72 | 73 | for _, mask in self.data('test_mask'): 74 | posteriors = F.log_softmax(posteriors[mask], dim=-1) 75 | 76 | return posteriors.detach() 77 | 78 | def generate_embeddings(self): 79 | self.model.eval() 80 | self.model, self.data = self.model.to(self.device), self.data.to(self.device) 81 | self._gen_subgraph_loader() 82 | 83 | logits = self.model.inference(self.data.x, self.subgraph_loader, self.device) 84 | return logits 85 | 86 | def _gen_train_loader(self): 87 | if self.data.edge_index.shape[1] == 0: 88 | self.data.edge_index = torch.tensor([[1, 2], [2, 1]]) 89 | self.train_loader = NeighborSampler(self.data.edge_index, node_idx=self.data.train_mask, 90 | # sizes=[25, 10], batch_size=128, shuffle=True, 91 | # sizes=[25, 10], num_nodes=self.data.num_nodes, 92 | sizes=[10, 10], num_nodes=self.data.num_nodes, 93 | # sizes=[5, 5], num_nodes=self.data.num_nodes, 94 | # batch_size=128, shuffle=True, 95 | batch_size=64, shuffle=True, 96 | num_workers=0) 97 | 98 | def _gen_subgraph_loader(self): 99 | self.subgraph_loader = NeighborSampler(self.data.edge_index, node_idx=None, 100 | # sizes=[-1], num_nodes=self.data.num_nodes, 101 | sizes=[10], num_nodes=self.data.num_nodes, 102 | # batch_size=128, shuffle=False, 103 | batch_size=64, shuffle=False, 104 | num_workers=0) 105 | 106 | 107 | if __name__ == '__main__': 108 | os.chdir('../../') 109 | 110 | output_file = None 111 | logging.basicConfig(filename=output_file, 112 | format='%(levelname)s:%(asctime)s: - %(name)s - : %(message)s', 113 | level=logging.DEBUG) 114 | 
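# The smoke test below downloads Cora into config.RAW_DATA_PATH on first use and
# runs the mini-batch training loop defined above; once train_model() returns,
# the same object also exposes graphsage.posterior() for test-node posteriors and
# graphsage.generate_embeddings() for full-node embeddings.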
115 | dataset_name = 'cora' 116 | dataset = Planetoid(config.RAW_DATA_PATH, dataset_name, transform=T.NormalizeFeatures()) 117 | data = dataset[0] 118 | 119 | graphsage = SAGE(dataset.num_features, dataset.num_classes, data) 120 | graphsage.train_model() 121 | -------------------------------------------------------------------------------- /lib_gnn_model/graphsage/graphsage_net.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch_geometric.nn import SAGEConv 4 | 5 | 6 | class SageNet(torch.nn.Module): 7 | def __init__(self, in_channels, hidden_channels, out_channels): 8 | super(SageNet, self).__init__() 9 | 10 | self.num_layers = 2 11 | 12 | self.convs = torch.nn.ModuleList() 13 | self.convs.append(SAGEConv(in_channels, hidden_channels)) 14 | self.convs.append(SAGEConv(hidden_channels, out_channels)) 15 | 16 | def forward(self, x, adjs): 17 | # `train_loader` computes the k-hop neighborhood of a batch of nodes, 18 | # and returns, for each layer, a bipartite graph object, holding the 19 | # bipartite edges `edge_index`, the index `e_id` of the original edges, 20 | # and the size/shape `size` of the bipartite graph. 21 | # Target nodes are also included in the source nodes so that one can 22 | # easily apply skip-connections or add self-loops. 23 | for i, (edge_index, _, size) in enumerate(adjs): 24 | x_target = x[:size[1]] # Target nodes are always placed first. 25 | x = self.convs[i]((x, x_target), edge_index) 26 | 27 | if i != self.num_layers - 1: 28 | x = F.relu(x) 29 | x = F.dropout(x, p=0.5, training=self.training) 30 | 31 | return F.log_softmax(x, dim=-1) 32 | 33 | def inference(self, x_all, subgraph_loader, device): 34 | # Compute representations of nodes layer by layer, using *all* 35 | # available edges. This leads to faster computation in contrast to 36 | # immediately computing the final representations of each batch. 
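# Concretely: each of the two SAGE layers is evaluated exactly once over all
# nodes (batched by subgraph_loader), and the per-batch outputs are collected on
# CPU in xs before being concatenated into x_all for the next layer. This avoids
# re-expanding multi-hop sampled neighborhoods for every batch at test time.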
37 | for i in range(self.num_layers): 38 | xs = [] 39 | 40 | for batch_size, n_id, adj in subgraph_loader: 41 | edge_index, _, size = adj.to(device) 42 | x = x_all[n_id].to(device) 43 | x_target = x[:size[1]] 44 | x = self.convs[i]((x, x_target), edge_index) 45 | if i != self.num_layers - 1: 46 | x = F.relu(x) 47 | xs.append(x.cpu()) 48 | 49 | x_all = torch.cat(xs, dim=0) 50 | 51 | return x_all 52 | 53 | def reset_parameters(self): 54 | for i in range(self.num_layers): 55 | self.convs[i].reset_parameters() 56 | -------------------------------------------------------------------------------- /lib_gnn_model/mlp/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MinChen00/Graph-Unlearning/a9b942d01651c2e3d780ae12e1a1459e35120ffa/lib_gnn_model/mlp/__init__.py -------------------------------------------------------------------------------- /lib_gnn_model/mlp/mlp.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | 4 | import torch 5 | import torch.nn.functional as F 6 | import torch_geometric.transforms as T 7 | from torch_geometric.datasets import Planetoid 8 | 9 | from lib_gnn_model.gnn_base import GNNBase 10 | from lib_gnn_model.mlp.mlpnet import MLPNet 11 | import config 12 | 13 | 14 | class MLP(GNNBase): 15 | def __init__(self, num_feats, num_classes, data=None): 16 | super(MLP, self).__init__() 17 | 18 | self.logger = logging.getLogger(__name__) 19 | 20 | self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 21 | self.model = MLPNet(num_feats, num_classes) 22 | self.data = data 23 | 24 | def train_model(self, num_epoch=100): 25 | self.model.train() 26 | self.model, self.data = self.model.to(self.device), self.data.to(self.device) 27 | 28 | optimizer = torch.optim.Adam(self.model.parameters(), lr=0.01) 29 | 30 | for epoch in range(num_epoch): 31 | self.logger.info('epoch %s' % (epoch,)) 32 | 33 | optimizer.zero_grad() 34 | output = self.model(self.data.x)[self.data.train_mask] 35 | # loss = F.nll_loss(output, self.data.y[self.data.train_mask]) 36 | loss = torch.nn.CrossEntropyLoss(output, self.data.y[self.data.train_mask].squeeze()) 37 | loss.backward() 38 | optimizer.step() 39 | 40 | train_acc, test_acc = self.evaluate_model() 41 | self.logger.info('train acc: %s, test acc: %s' % (train_acc, test_acc)) 42 | 43 | def evaluate_model(self): 44 | self.model.eval() 45 | self.model, self.data = self.model.to(self.device), self.data.to(self.device) 46 | 47 | logits, accs = self.model(self.data.x), [] 48 | 49 | for _, mask in self.data('train_mask', 'test_mask'): 50 | pred = logits[mask].max(1)[1] 51 | acc = pred.eq(self.data.y[mask]).sum().item() / mask.sum().item() 52 | accs.append(acc) 53 | 54 | return accs 55 | 56 | def posterior(self): 57 | self.model.eval() 58 | posteriors = self.model(self.data.x) 59 | for _, mask in self.data('test_mask'): 60 | posteriors = posteriors[mask] 61 | 62 | return posteriors 63 | 64 | 65 | if __name__ == '__main__': 66 | os.chdir('../../') 67 | 68 | output_file = None 69 | logging.basicConfig(filename=output_file, 70 | format='%(levelname)s:%(asctime)s: - %(name)s - : %(message)s', 71 | level=logging.DEBUG) 72 | 73 | dataset_name = 'Cora' 74 | dataset = Planetoid(config.RAW_DATA_PATH + dataset_name, dataset_name, transform=T.NormalizeFeatures()) 75 | data = dataset[0] 76 | 77 | gcn = MLP(dataset.num_features, dataset.num_classes, data) 78 | gcn.train_model() 
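# Caveat for train_model() above: torch.nn.CrossEntropyLoss(output, ...) passes
# the logits and targets to the loss *constructor*, which returns a module rather
# than a loss tensor, so loss.backward() cannot work as written. A minimal fix
# (using the functional API already imported as F) would be:
#
#     loss = F.cross_entropy(output, self.data.y[self.data.train_mask].squeeze())
#
# Since MLPNet.forward applies softmax, well-behaved training would also require
# the network to return raw logits (for F.cross_entropy) or log-probabilities via
# log_softmax (for the commented-out F.nll_loss variant).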
-------------------------------------------------------------------------------- /lib_gnn_model/mlp/mlpnet.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | import torch.nn.functional as F 3 | 4 | 5 | class MLPNet(nn.Module): 6 | def __init__(self, input_size, num_classes): 7 | super(MLPNet, self).__init__() 8 | self.xent = nn.CrossEntropyLoss() 9 | 10 | self.layers = nn.Sequential( 11 | nn.Linear(input_size, 250), 12 | nn.Linear(250, 100), 13 | nn.Linear(100, num_classes) 14 | ) 15 | 16 | def forward(self, x): 17 | x = x.view(x.size(0), -1) 18 | x = self.layers(x) 19 | return F.softmax(x, dim=1) 20 | 21 | def loss(self, nodes, labels): 22 | scores = self.forward(nodes) 23 | return self.xent(scores, labels.squeeze()) 24 | 25 | def reset_parameters(self): 26 | return 0 27 | -------------------------------------------------------------------------------- /lib_gnn_model/node_classifier.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | import torch 5 | from sklearn.model_selection import train_test_split 6 | 7 | torch.cuda.empty_cache() 8 | import torch.nn.functional as F 9 | import torch_geometric.transforms as T 10 | from torch_geometric.datasets import Planetoid 11 | from torch_geometric.data import NeighborSampler 12 | from torch_geometric.nn.conv.gcn_conv import gcn_norm 13 | import numpy as np 14 | 15 | import config 16 | from lib_gnn_model.gat.gat_net_batch import GATNet 17 | from lib_gnn_model.gin.gin_net_batch import GINNet 18 | from lib_gnn_model.gcn.gcn_net_batch import GCNNet 19 | from lib_gnn_model.graphsage.graphsage_net import SageNet 20 | from lib_gnn_model.gnn_base import GNNBase 21 | from parameter_parser import parameter_parser 22 | from lib_utils import utils 23 | 24 | 25 | class NodeClassifier(GNNBase): 26 | def __init__(self, num_feats, num_classes, args, data=None): 27 | super(NodeClassifier, self).__init__() 28 | 29 | self.args = args 30 | self.logger = logging.getLogger('node_classifier') 31 | self.target_model = args['target_model'] 32 | 33 | self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 34 | # self.device = 'cpu' 35 | self.model = self.determine_model(num_feats, num_classes).to(self.device) 36 | self.data = data 37 | 38 | def determine_model(self, num_feats, num_classes): 39 | self.logger.info('target model: %s' % (self.args['target_model'],)) 40 | 41 | if self.target_model == 'SAGE': 42 | self.lr, self.decay = 0.01, 0.001 43 | return SageNet(num_feats, 256, num_classes) 44 | elif self.target_model == 'GAT': 45 | self.lr, self.decay = 0.01, 0.001 46 | return GATNet(num_feats, num_classes) 47 | elif self.target_model == 'GCN': 48 | self.lr, self.decay = 0.05, 0.0001 49 | return GCNNet(num_feats, num_classes) 50 | elif self.target_model == 'GIN': 51 | self.lr, self.decay = 0.01, 0.0001 52 | return GINNet(num_feats, num_classes) 53 | else: 54 | raise Exception('unsupported target model') 55 | 56 | def train_model(self): 57 | self.logger.info("training model") 58 | self.model.train() 59 | self.model.reset_parameters() 60 | self.model, self.data = self.model.to(self.device), self.data.to(self.device) 61 | self.data.y = self.data.y.squeeze().to(self.device) 62 | self._gen_train_loader() 63 | 64 | optimizer = torch.optim.Adam(self.model.parameters(), lr=self.lr, weight_decay=self.decay) 65 | 66 | for epoch in range(self.args['num_epochs']): 67 | self.logger.info('epoch %s' % (epoch,)) 68 | 69 | for 
batch_size, n_id, adjs in self.train_loader: 70 | # self.logger.info("batch size: %s"%(batch_size)) 71 | # `adjs` holds a list of `(edge_index, e_id, size)` tuples. 72 | adjs = [adj.to(self.device) for adj in adjs] 73 | 74 | test_node = np.nonzero(self.data.test_mask.cpu().numpy())[0] 75 | intersect = np.intersect1d(test_node, n_id.numpy()) 76 | 77 | optimizer.zero_grad() 78 | 79 | if self.target_model == 'GCN': 80 | out = self.model(self.data.x[n_id], adjs, self.edge_weight) 81 | else: 82 | out = self.model(self.data.x[n_id], adjs) 83 | 84 | loss = F.nll_loss(out, self.data.y[n_id[:batch_size]]) 85 | loss.backward() 86 | optimizer.step() 87 | 88 | train_acc, test_acc = self.evaluate_model() 89 | self.logger.info(f'Train: {train_acc:.4f}, Test: {test_acc:.4f}') 90 | 91 | @torch.no_grad() 92 | def evaluate_model(self): 93 | self.model.eval() 94 | self.model, self.data = self.model.to(self.device), self.data.to(self.device) 95 | self._gen_test_loader() 96 | 97 | if self.target_model == 'GCN': 98 | out = self.model.inference(self.data.x, self.test_loader, self.edge_weight, self.device) 99 | else: 100 | out = self.model.inference(self.data.x, self.test_loader, self.device) 101 | 102 | y_true = self.data.y.cpu().unsqueeze(-1) 103 | y_pred = out.argmax(dim=-1, keepdim=True) 104 | 105 | results = [] 106 | for mask in [self.data.train_mask, self.data.test_mask]: 107 | results += [int(y_pred[mask].eq(y_true[mask]).sum()) / int(mask.sum())] 108 | 109 | return results 110 | 111 | def posterior(self): 112 | self.logger.debug("generating posteriors") 113 | self.model, self.data = self.model.to(self.device), self.data.to(self.device) 114 | self.model.eval() 115 | 116 | self._gen_test_loader() 117 | if self.target_model == 'GCN': 118 | posteriors = self.model.inference(self.data.x, self.test_loader, self.edge_weight, self.device) 119 | else: 120 | posteriors = self.model.inference(self.data.x, self.test_loader, self.device) 121 | 122 | for _, mask in self.data('test_mask'): 123 | posteriors = F.log_softmax(posteriors[mask], dim=-1) 124 | 125 | return posteriors.detach() 126 | 127 | def generate_embeddings(self): 128 | self.model.eval() 129 | self.model, self.data = self.model.to(self.device), self.data.to(self.device) 130 | self._gen_test_loader() 131 | 132 | if self.target_model == 'GCN': 133 | logits = self.model.inference(self.data.x, self.test_loader, self.edge_weight, self.device) 134 | else: 135 | logits = self.model.inference(self.data.x, self.test_loader, self.device) 136 | return logits 137 | 138 | def _gen_train_loader(self): 139 | self.logger.info("generate train loader") 140 | train_indices = np.nonzero(self.data.train_mask.cpu().numpy())[0] 141 | edge_index = utils.filter_edge_index(self.data.edge_index, train_indices, reindex=False) 142 | if edge_index.shape[1] == 0: 143 | edge_index = torch.tensor([[1, 2], [2, 1]]) 144 | 145 | self.train_loader = NeighborSampler( 146 | edge_index, node_idx=self.data.train_mask, 147 | sizes=[5, 5], num_nodes=self.data.num_nodes, 148 | batch_size=self.args['batch_size'], shuffle=True, 149 | num_workers=0) 150 | 151 | if self.target_model == 'GCN': 152 | _, self.edge_weight = gcn_norm(self.data.edge_index, edge_weight=None, num_nodes=self.data.x.shape[0], 153 | add_self_loops=False) 154 | 155 | self.logger.info("generate train loader finish") 156 | 157 | def _gen_test_loader(self): 158 | test_indices = np.nonzero(self.data.train_mask.cpu().numpy())[0] 159 | 160 | if not self.args['use_test_neighbors']: 161 | edge_index = 
utils.filter_edge_index(self.data.edge_index, test_indices, reindex=False) 162 | else: 163 | edge_index = self.data.edge_index 164 | 165 | if edge_index.shape[1] == 0: 166 | edge_index = torch.tensor([[1, 3], [3, 1]]) 167 | 168 | self.test_loader = NeighborSampler( 169 | edge_index, node_idx=None, 170 | sizes=[-1], num_nodes=self.data.num_nodes, 171 | # sizes=[5], num_nodes=self.data.num_nodes, 172 | batch_size=self.args['test_batch_size'], shuffle=False, 173 | num_workers=0) 174 | 175 | if self.target_model == 'GCN': 176 | _, self.edge_weight = gcn_norm(self.data.edge_index, edge_weight=None, num_nodes=self.data.x.shape[0], 177 | add_self_loops=False) 178 | 179 | 180 | if __name__ == '__main__': 181 | os.chdir('../') 182 | args = parameter_parser() 183 | 184 | output_file = None 185 | logging.basicConfig(filename=output_file, 186 | format='%(levelname)s:%(asctime)s: - %(name)s - : %(message)s', 187 | level=logging.DEBUG) 188 | 189 | dataset_name = 'cora' 190 | dataset = Planetoid(config.RAW_DATA_PATH, dataset_name, transform=T.NormalizeFeatures()) 191 | data = dataset[0] 192 | 193 | train_indices, test_indices = train_test_split(np.arange((data.num_nodes)), test_size=0.2, random_state=100) 194 | data.train_mask, data.test_mask = torch.zeros(data.num_nodes, dtype=torch.bool), torch.zeros(data.num_nodes, 195 | dtype=torch.bool) 196 | data.train_mask[train_indices] = True 197 | data.test_mask[test_indices] = True 198 | 199 | graphsage = NodeClassifier(dataset.num_features, dataset.num_classes, args, data) 200 | graphsage.train_model() 201 | -------------------------------------------------------------------------------- /lib_graph_partition/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MinChen00/Graph-Unlearning/a9b942d01651c2e3d780ae12e1a1459e35120ffa/lib_graph_partition/__init__.py -------------------------------------------------------------------------------- /lib_graph_partition/constrained_kmeans.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import copy 3 | 4 | from tqdm import tqdm 5 | 6 | import numpy as np 7 | import cupy as np 8 | 9 | 10 | class ConstrainedKmeans: 11 | def __init__(self, data_feat, num_clusters, node_threshold, terminate_delta, max_iteration=20): 12 | self.logger = logging.getLogger('constrained_kmeans') 13 | 14 | self.data_feat = data_feat 15 | self.num_clusters = num_clusters 16 | self.node_threshold = node_threshold 17 | self.terminate_delta = terminate_delta 18 | self.max_iteration = max_iteration 19 | 20 | def initialization(self): 21 | centroids = np.random.choice(np.arange(self.data_feat.shape[0]), self.num_clusters, replace=False) 22 | self.centroid = {} 23 | for i in range(self.num_clusters): 24 | self.centroid[i] = self.data_feat[centroids[i].get()] 25 | 26 | def clustering(self): 27 | centroid = copy.deepcopy(self.centroid) 28 | km_delta = [] 29 | 30 | pbar = tqdm(total=self.max_iteration) 31 | pbar.set_description('Clustering') 32 | 33 | for i in range(self.max_iteration): 34 | self.logger.info('iteration %s' % (i,)) 35 | 36 | self._node_reassignment() 37 | self._centroid_updating() 38 | 39 | # record the average change of centroids, if the change is smaller than a very small value, then terminate 40 | delta = self._centroid_delta(centroid, self.centroid) 41 | km_delta.append(delta) 42 | centroid = copy.deepcopy(self.centroid) 43 | 44 | if delta <= self.terminate_delta: 45 | break 46 | self.logger.info("delta: 
%s" % delta) 47 | pbar.close() 48 | return self.clusters, km_delta 49 | 50 | def _node_reassignment(self): 51 | self.clusters = {} 52 | for i in range(self.num_clusters): 53 | self.clusters[i] = np.zeros(0, dtype=np.uint64) 54 | 55 | distance = np.zeros([self.num_clusters, self.data_feat.shape[0]]) 56 | 57 | for i in range(self.num_clusters): 58 | distance[i] = np.sum(np.power((self.data_feat - self.centroid[i]), 2), axis=1) 59 | 60 | sort_indices = np.unravel_index(np.argsort(distance, axis=None), distance.shape) 61 | clusters = sort_indices[0] 62 | users = sort_indices[1] 63 | selected_nodes = np.zeros(0, dtype=np.int64) 64 | counter = 0 65 | 66 | while len(selected_nodes) < self.data_feat.shape[0]: 67 | cluster = int(clusters[counter]) 68 | user = users[counter] 69 | if self.clusters[cluster].size < self.node_threshold: 70 | self.clusters[cluster] = np.append(self.clusters[cluster], np.array(int(user))) 71 | selected_nodes = np.append(selected_nodes, np.array(int(user))) 72 | 73 | # delete all the following pairs for the selected user 74 | user_indices = np.where(users == user)[0] 75 | a = np.arange(users.size) 76 | b = user_indices[user_indices > counter] 77 | remain_indices = a[np.where(np.logical_not(np.isin(a, b)))[0]] 78 | clusters = clusters[remain_indices] 79 | users = users[remain_indices] 80 | 81 | counter += 1 82 | 83 | def _centroid_updating(self): 84 | for i in range(self.num_clusters): 85 | self.centroid[i] = np.mean(self.data_feat[self.clusters[i].astype(int)], axis=0) 86 | 87 | def _centroid_delta(self, centroid_pre, centroid_cur): 88 | delta = 0.0 89 | for i in range(len(centroid_cur)): 90 | delta += np.sum(np.abs(centroid_cur[i] - centroid_pre[i])) 91 | 92 | return delta 93 | 94 | 95 | if __name__ == '__main__': 96 | output_file = None 97 | logging.basicConfig(filename=output_file, 98 | format='%(levelname)s:%(asctime)s: - %(name)s - : %(message)s', 99 | level=logging.DEBUG) 100 | 101 | data_feat = np.array([[1, 2], 102 | [1, 3], 103 | [1, 4], 104 | [1, 5], 105 | [10, 2], 106 | [10, 3]]) 107 | num_clusters = 2 108 | node_threshold = 3 109 | terminate_delta = 0.001 110 | 111 | cluster = ConstrainedKmeans(data_feat, num_clusters, node_threshold, terminate_delta) 112 | cluster.initialization() 113 | cluster.clustering() -------------------------------------------------------------------------------- /lib_graph_partition/constrained_kmeans_base.py: -------------------------------------------------------------------------------- 1 | # An implementation of ``Balanced K-Means for Clustering.'' (https://rdcu.be/cESzk) 2 | import logging 3 | import copy 4 | 5 | import numpy as np 6 | import seaborn as sns 7 | import matplotlib.pyplot as plt 8 | from munkres import Munkres 9 | from lib_graph_partition.hungarian import Hungarian 10 | from lib_graph_partition.hungarian_1 import KMMatcher 11 | 12 | 13 | class ConstrainedKmeansBase: 14 | def __init__(self, data_feat, num_clusters, node_threshold, terminate_delta, max_iteration=20): 15 | self.logger = logging.getLogger('constrained_kmeans_base') 16 | 17 | self.data_feat = data_feat 18 | self.num_clusters = num_clusters 19 | self.node_threshold = node_threshold 20 | self.terminate_delta = terminate_delta 21 | self.max_iteration = max_iteration 22 | 23 | def initialization(self): 24 | centroids = np.random.choice(np.arange(self.data_feat.shape[0]), self.num_clusters, replace=False) 25 | self.centroid = dict(zip(range(self.num_clusters), self.data_feat[centroids])) 26 | 27 | def clustering(self): 28 | centroid = 
copy.deepcopy(self.centroid) 29 | centroid_delta = {} 30 | km_base_delta = [] 31 | 32 | for i in range(self.max_iteration): 33 | self.logger.info('iteration %s' % (i)) 34 | self._node_reassignment() 35 | self._centroid_updating() 36 | 37 | # record the average change of centroids, if the change is smaller than a very small value, then terminate 38 | delta = self._centroid_delta(centroid, self.centroid) 39 | centroid_delta[i] = delta 40 | km_base_delta.append(delta) 41 | centroid = copy.deepcopy(self.centroid) 42 | 43 | if delta <= self.terminate_delta: 44 | break 45 | self.logger.info("delta: %s" % delta) 46 | 47 | return self.clusters, km_base_delta 48 | 49 | def _node_reassignment(self): 50 | self.logger.info('Node reassignment begins') 51 | self.clusters = dict( 52 | zip(np.arange(self.num_clusters), [np.zeros(0, dtype=np.uint64) for _ in range(self.num_clusters)])) 53 | 54 | distance = np.zeros([self.num_clusters, self.data_feat.shape[0]]) 55 | # cost_matrix = np.zeros([self.data_feat.shape[0], self.data_feat.shape[0]]) 56 | for i in range(self.num_clusters): 57 | distance[i] = np.sum((self.data_feat - self.centroid[i]) ** 2, axis=1) 58 | cost_matrix = np.tile(distance, (self.data_feat.shape[0], 1)) 59 | cost_matrix = cost_matrix[:self.data_feat.shape[0], :] 60 | 61 | # too slow 62 | # matrix = np.array(cost_matrix) 63 | # m = Munkres() 64 | # assignment = m.compute(matrix) 65 | # assignment = np.array(assignment) 66 | # assignment = assignment[:, 1] 67 | 68 | # hungarian = Hungarian(cost_matrix) 69 | # hungarian.calculate() 70 | # assignment = hungarian.get_results() 71 | # assignment = np.array(assignment) 72 | # assignment = assignment[np.argsort(assignment[:, 0])] 73 | # assignment = assignment[:, 1] 74 | 75 | matcher = KMMatcher(cost_matrix) 76 | assignment, _ = matcher.solve() 77 | 78 | partition = np.zeros(self.data_feat.shape[0]) 79 | for i in range(self.data_feat.shape[0]): 80 | partition[assignment[i]] = i % self.num_clusters 81 | 82 | for i in range(self.num_clusters): 83 | self.clusters[i] = np.where(partition == i)[0] 84 | 85 | def _centroid_updating(self): 86 | self.logger.info('Updating centroid begins') 87 | for i in range(self.num_clusters): 88 | self.centroid[i] = np.mean(self.data_feat[self.clusters[i]], axis=0) 89 | 90 | def _centroid_delta(self, centroid_pre, centroid_cur): 91 | delta = 0.0 92 | for i in range(len(centroid_cur)): 93 | delta += np.sum(np.abs(centroid_cur[i] - centroid_pre[i])) 94 | 95 | return delta 96 | 97 | 98 | if __name__ == '__main__': 99 | output_file = None 100 | logging.basicConfig(filename=output_file, 101 | format='%(levelname)s:%(asctime)s: - %(name)s - : %(message)s', 102 | level=logging.DEBUG) 103 | 104 | data_feat = np.array([[1, 2], 105 | [1, 3], 106 | [1, 4], 107 | [1, 5], 108 | [10, 2], 109 | [10, 3]]) 110 | num_clusters = 2 111 | node_threshold = 3 112 | terminate_delta = 0.001 113 | 114 | cluster = ConstrainedKmeansBase(data_feat, num_clusters, node_threshold, terminate_delta) 115 | cluster.initialization() 116 | cluster.clustering() 117 | -------------------------------------------------------------------------------- /lib_graph_partition/constrained_lpa.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import logging 3 | from collections import defaultdict 4 | 5 | import numpy as np 6 | 7 | 8 | class ConstrainedLPA: 9 | def __init__(self, adj, num_communities, node_threshold, terminate_delta): 10 | self.logger = logging.getLogger('constrained_lpa_single') 11 | 12 | self.adj 
= adj 13 | self.num_nodes = adj.shape[0] 14 | self.num_communities = num_communities 15 | self.node_threshold = node_threshold 16 | self.terminate_delta = terminate_delta 17 | 18 | def initialization(self): 19 | self.logger.info('initializing communities') 20 | 21 | random_nodes = np.arange(self.num_nodes) 22 | np.random.shuffle(random_nodes) 23 | self.communities = defaultdict(set) 24 | self.node_community = np.zeros(self.adj.shape[0]) 25 | 26 | # each node use node is as its community label 27 | for community, nodes in enumerate(np.array_split(random_nodes, self.num_communities)): 28 | self.communities[community] = set(nodes) 29 | self.node_community[nodes] = community 30 | 31 | def community_detection(self, iterations=100): 32 | self.logger.info('detecting communities') 33 | 34 | communities = copy.deepcopy(self.communities) 35 | lpa_deltas = [] 36 | 37 | # Currently, break when maximum iterations round achieves. 38 | for i in range(iterations): 39 | self.logger.info('iteration %s' % (i,)) 40 | 41 | desire_move = self._determine_desire_move() 42 | sort_indices = np.flip(np.argsort(desire_move[:, 2])) 43 | candidate_nodes = defaultdict(list) 44 | 45 | # allocate nodes' community with descending order of colocate count 46 | for node in sort_indices: 47 | src_community = desire_move[node][0] 48 | dst_community = desire_move[node][1] 49 | 50 | if src_community != dst_community: 51 | if len(self.communities[dst_community]) < self.node_threshold: 52 | self.node_community[node] = dst_community 53 | self.communities[dst_community].add(node) 54 | self.communities[src_community].remove(node) 55 | 56 | # reallocate the candidate nodes 57 | candidate_nodes_cur = candidate_nodes[src_community] 58 | while len(candidate_nodes_cur) != 0: 59 | node_cur = candidate_nodes_cur[0] 60 | src_community_cur = desire_move[node_cur][0] 61 | dst_community_cur = desire_move[node_cur][1] 62 | 63 | self.node_community[node_cur] = dst_community_cur 64 | self.communities[dst_community_cur].add(node_cur) 65 | self.communities[src_community_cur].remove(node_cur) 66 | 67 | candidate_nodes[dst_community_cur].pop(0) 68 | candidate_nodes_cur = candidate_nodes[src_community_cur] 69 | else: 70 | candidate_nodes[dst_community].append(node) 71 | # record the communities of each iteration, break the loop while communities are stable. 
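# _lpa_delta below sums, per community, the size of the symmetric difference
# between the previous and current membership sets; a node that switches
# community therefore contributes 2 to delta (it leaves one set and joins
# another). The loop terminates once delta drops to terminate_delta or below.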
72 | 73 | delta = self._lpa_delta(communities, self.communities) 74 | lpa_deltas.append(delta) 75 | self.logger.info("%d" % delta) 76 | communities = copy.deepcopy(self.communities) 77 | if delta <= self.terminate_delta: 78 | break 79 | 80 | return self.communities, lpa_deltas 81 | 82 | def _determine_desire_move(self): 83 | desire_move = np.zeros([self.num_nodes, 3]) 84 | desire_move[:, 0] = self.node_community 85 | 86 | for i in range(self.num_nodes): 87 | # neighbor_community = self.node_community[np.nonzero(self.adj[i])[0]] # for non-bool adj 88 | neighbor_community = self.node_community[self.adj[i]] # for bool adj 89 | unique_community, unique_count = np.unique(neighbor_community, return_counts=True) 90 | if unique_community.shape[0] == 0: 91 | continue 92 | max_indices = np.where(unique_count == np.max(unique_count))[0] 93 | 94 | if max_indices.size == 1: 95 | desire_move[i, 1] = unique_community[max_indices] 96 | desire_move[i, 2] = unique_count[max_indices] 97 | elif max_indices.size > 1: 98 | max_index = np.random.choice(max_indices) 99 | desire_move[i, 1] = unique_community[max_index] 100 | desire_move[i, 2] = unique_count[max_index] 101 | 102 | return desire_move 103 | 104 | def _lpa_delta(self, lpa_pre, lpa_cur): 105 | delta = 0.0 106 | for i in range(len(lpa_cur)): 107 | delta += len((lpa_cur[i] | lpa_pre[i]) - (lpa_cur[i] & lpa_pre[i])) 108 | 109 | return delta 110 | 111 | 112 | if __name__ == '__main__': 113 | output_file = None 114 | logging.basicConfig(filename=output_file, 115 | format='%(levelname)s:%(asctime)s: - %(name)s - : %(message)s', 116 | level=logging.DEBUG) 117 | 118 | adj = np.array([[0, 1, 1], 119 | [1, 0, 1], 120 | [1, 1, 0]], 121 | dtype=np.bool) 122 | 123 | num_communities = 2 124 | node_threshold = 3 125 | terminate_delta = 1 126 | 127 | lpa = ConstrainedLPA(adj, num_communities, node_threshold, terminate_delta) 128 | 129 | lpa.initialization() 130 | lpa.community_detection() 131 | -------------------------------------------------------------------------------- /lib_graph_partition/constrained_lpa_base.py: -------------------------------------------------------------------------------- 1 | # An implementation of `` Balanced Label Propagation for Partitioning MassiveGraphs'' (https://stanford.edu/~jugander/papers/wsdm13-blp.pdf) 2 | 3 | import copy 4 | import logging 5 | from collections import defaultdict 6 | 7 | import numpy as np 8 | import cvxpy as cp 9 | from scipy.stats import linregress 10 | 11 | 12 | class ConstrainedLPABase: 13 | def __init__(self, adj, num_communities, node_threshold, terminate_delta): 14 | self.logger = logging.getLogger('constrained_lpa_base') 15 | 16 | self.adj = adj 17 | self.num_nodes = adj.shape[0] 18 | self.num_communities = num_communities 19 | self.node_threshold = node_threshold 20 | self.terminate_delta = terminate_delta 21 | 22 | def initialization(self): 23 | self.logger.info('initializing communities') 24 | 25 | random_nodes = np.arange(self.num_nodes) 26 | np.random.shuffle(random_nodes) 27 | self.communities = defaultdict(set) 28 | self.node_community = np.zeros(self.adj.shape[0]) 29 | 30 | # each node use node is as its community label 31 | for community, nodes in enumerate(np.array_split(random_nodes, self.num_communities)): 32 | self.communities[community] = set(nodes) 33 | self.node_community[nodes] = community 34 | 35 | def community_detection(self, iterations=100): 36 | self.logger.info('detecting communities') 37 | 38 | communities = copy.deepcopy(self.communities) 39 | lpa_deltas = [] 40 | 41 | for i in 
range(iterations): 42 | self.logger.info('iteration %s' % (i,)) 43 | 44 | ## Step 1: calculate desired move 45 | desire_move = self._determine_desire_move() 46 | relocation = {} 47 | utility_func = {} 48 | 49 | ## Step 2: calculate parameters for linear programming problem 50 | for src_community in range(self.num_communities): 51 | for dst_community in range(self.num_communities): 52 | move_node = desire_move[np.where(np.logical_and(desire_move[:, 1] == src_community, desire_move[:, 2] == dst_community))[0]] 53 | 54 | if src_community != dst_community and move_node.size != 0: 55 | move_node = move_node[np.flip(np.argsort(move_node[:, 3]))] 56 | relocation[(src_community, dst_community)] = move_node 57 | 58 | if move_node.shape[0] == 1: 59 | utility_func[(src_community, dst_community)] = np.array([[0, move_node[0, 3]]]) 60 | else: 61 | cum_sum = np.cumsum(move_node[:, 3]) 62 | utility_func_temp = np.zeros([move_node.shape[0] - 1, 2]) 63 | for k in range(move_node.shape[0] - 1): 64 | utility_func_temp[k, 0], utility_func_temp[k, 1], _, _, _ = linregress([k, k+1], [cum_sum[k], cum_sum[k+1]]) 65 | utility_func[(src_community, dst_community)] = utility_func_temp 66 | 67 | ## Step 3: solve linear programming problem 68 | x = cp.Variable([self.num_communities, self.num_communities]) 69 | z = cp.Variable([self.num_communities, self.num_communities]) 70 | 71 | objective = cp.Maximize(cp.sum(z)) 72 | constraints = [] 73 | for src_community in range(self.num_communities): 74 | const = 0 75 | for dst_community in range(self.num_communities): 76 | if (src_community, dst_community) in relocation: 77 | if src_community == dst_community: 78 | constraints.append(x[src_community, dst_community] == 0) 79 | constraints.append(z[src_community, dst_community] == 0) 80 | else: 81 | ## Constraint 2 of Theorem 2 82 | constraints.append(x[src_community, dst_community] >= 0) 83 | constraints.append(x[src_community, dst_community] <= relocation[(src_community, dst_community)].shape[0]) 84 | 85 | ## Constraint 1 of Theorem 2 86 | if (dst_community, src_community) in relocation: 87 | const += x[src_community, dst_community] - x[dst_community, src_community] 88 | 89 | ## Constraint 3 of Theorem 2 90 | for utility_func_value in utility_func[(src_community, dst_community)]: 91 | constraints.append(- utility_func_value[0] * x[src_community, dst_community] + z[src_community, dst_community] <= utility_func_value[1]) 92 | 93 | else: 94 | constraints.append(x[src_community, dst_community] == 0) 95 | constraints.append(z[src_community, dst_community] == 0) 96 | 97 | ## Constraint 1 of Theorem 2 98 | constraints.append(len(self.communities[src_community]) + const <= self.node_threshold) 99 | 100 | problem = cp.Problem(objective, constraints) 101 | problem.solve() 102 | 103 | ## Step 4: parse linear programming problem results 104 | if problem.status == 'optimal': 105 | x_value = np.floor(np.abs(x.value)).astype(np.int64) 106 | for src_community in range(self.num_communities): 107 | for dst_community in range(self.num_communities): 108 | if (src_community, dst_community) in relocation and x_value[src_community, dst_community] != 0: 109 | # if (src_community, dst_community) in relocation: 110 | relocation_temp = relocation[(src_community, dst_community)][:, 0].astype(np.int64) 111 | move_node = relocation_temp[:x_value[src_community, dst_community] - 1] 112 | if isinstance(move_node, np.int64): 113 | self.communities[src_community].remove(move_node) 114 | self.communities[dst_community].add(move_node) 115 | 
self.node_community[move_node] = dst_community 116 | else: 117 | # move_node = set(move_node) 118 | self.communities[src_community].difference_update(move_node) 119 | self.communities[dst_community].update(move_node) 120 | for node in move_node: 121 | self.node_community[node] = dst_community 122 | else: 123 | self.logger.info("No optimal solution, break!") 124 | break 125 | 126 | ## Check the number of moved nodes 127 | delta = self._lpa_delta(communities, self.communities) 128 | lpa_deltas.append(delta) 129 | self.logger.info("%d" % delta) 130 | communities = copy.deepcopy(self.communities) 131 | if delta <= self.terminate_delta: 132 | break 133 | 134 | return self.communities, lpa_deltas 135 | 136 | def _determine_desire_move(self): 137 | desire_move = [] 138 | 139 | for i in range(self.num_nodes): 140 | # neighbor_community = self.node_community[np.nonzero(self.adj[i])[0]] # for non-bool adj 141 | neighbor_community = self.node_community[self.adj[i]] # for bool adj 142 | unique_community, unique_count = np.unique(neighbor_community, return_counts=True) 143 | 144 | src_relocation = unique_count[np.where(unique_community == self.node_community[i])[0]] 145 | for community in unique_community: 146 | if community != self.node_community[i]: 147 | dst_relocation = unique_count[np.where(unique_community == community)[0]] 148 | if dst_relocation - src_relocation >= 0: 149 | desire_move_temp = np.zeros(4) 150 | desire_move_temp[0] = i 151 | desire_move_temp[1] = self.node_community[i] 152 | desire_move_temp[2] = community 153 | desire_move_temp[3] = dst_relocation - src_relocation 154 | 155 | desire_move.append(desire_move_temp) 156 | 157 | return np.stack(desire_move) 158 | 159 | def _lpa_delta(self, lpa_pre, lpa_cur): 160 | delta = 0.0 161 | for i in range(len(lpa_cur)): 162 | delta += len((lpa_cur[i] | lpa_pre[i]) - (lpa_cur[i] & lpa_pre[i])) 163 | 164 | return delta 165 | 166 | 167 | if __name__ == '__main__': 168 | output_file = None 169 | logging.basicConfig(filename=output_file, 170 | format='%(levelname)s:%(asctime)s: - %(name)s - : %(message)s', 171 | level=logging.DEBUG) 172 | 173 | adj = np.array([[0, 1, 1], 174 | [1, 0, 1], 175 | [1, 1, 0]], 176 | dtype=np.bool) 177 | 178 | num_communities = 2 179 | node_threshold = 3 180 | terminate_delta = 1 181 | 182 | lpa = ConstrainedLPABase(adj, num_communities, node_threshold, terminate_delta) 183 | 184 | lpa.initialization() 185 | lpa.community_detection() 186 | -------------------------------------------------------------------------------- /lib_graph_partition/graph_partition.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from lib_graph_partition.partition_kmeans import PartitionKMeans 4 | from lib_graph_partition.partition_lpa import PartitionConstrainedLPA, PartitionLPA, PartitionConstrainedLPABase 5 | from lib_graph_partition.metis_partition import MetisPartition 6 | from lib_graph_partition.partition_random import PartitionRandom 7 | 8 | 9 | class GraphPartition: 10 | def __init__(self, args, graph, dataset=None): 11 | self.logger = logging.getLogger(__name__) 12 | 13 | self.args = args 14 | self.graph = graph 15 | self.dataset = dataset 16 | 17 | self.partition_method = self.args['partition_method'] 18 | self.num_shards = self.args['num_shards'] 19 | 20 | def graph_partition(self): 21 | self.logger.info('graph partition, method: %s' % self.partition_method) 22 | 23 | if self.partition_method == 'random': 24 | partition_method = PartitionRandom(self.args, self.graph) 25 | 
elif self.partition_method in ['sage_km', 'sage_km_base']: 26 | partition_method = PartitionKMeans(self.args, self.graph, self.dataset) 27 | elif self.partition_method == 'lpa' and not self.args['is_constrained']: 28 | partition_method = PartitionLPA(self.args, self.graph) 29 | elif self.partition_method == 'lpa' and self.args['is_constrained']: 30 | partition_method = PartitionConstrainedLPA(self.args, self.graph) 31 | elif self.partition_method == 'lpa_base': 32 | partition_method = PartitionConstrainedLPABase(self.args, self.graph) 33 | elif self.partition_method == 'metis': 34 | partition_method = MetisPartition(self.args, self.graph, self.dataset) 35 | else: 36 | raise Exception('Unsupported partition method') 37 | 38 | return partition_method.partition() 39 | -------------------------------------------------------------------------------- /lib_graph_partition/hungarian.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | """ 3 | Implementation of the Hungarian (Munkres) Algorithm using Python and NumPy 4 | References: http://www.ams.jhu.edu/~castello/362/Handouts/hungarian.pdf 5 | http://weber.ucsd.edu/~vcrawfor/hungar.pdf 6 | http://en.wikipedia.org/wiki/Hungarian_algorithm 7 | http://www.public.iastate.edu/~ddoty/HungarianAlgorithm.html 8 | http://www.clapper.org/software/python/munkres/ 9 | """ 10 | 11 | # Module Information. 12 | __version__ = "1.1.1" 13 | __author__ = "Thom Dedecko" 14 | __url__ = "http://github.com/tdedecko/hungarian-algorithm" 15 | __copyright__ = "(c) 2010 Thom Dedecko" 16 | __license__ = "MIT License" 17 | 18 | 19 | class HungarianError(Exception): 20 | pass 21 | 22 | # Import numpy. Error if fails 23 | try: 24 | import numpy as np 25 | except ImportError: 26 | raise HungarianError("NumPy is not installed.") 27 | 28 | 29 | class Hungarian: 30 | """ 31 | Implementation of the Hungarian (Munkres) Algorithm using np. 32 | Usage: 33 | hungarian = Hungarian(cost_matrix) 34 | hungarian.calculate() 35 | or 36 | hungarian = Hungarian() 37 | hungarian.calculate(cost_matrix) 38 | Handle Profit matrix: 39 | hungarian = Hungarian(profit_matrix, is_profit_matrix=True) 40 | or 41 | cost_matrix = Hungarian.make_cost_matrix(profit_matrix) 42 | The matrix will be automatically padded if it is not square. 43 | For that numpy's resize function is used, which automatically adds 0's to any row/column that is added 44 | Get results and total potential after calculation: 45 | hungarian.get_results() 46 | hungarian.get_total_potential() 47 | """ 48 | 49 | def __init__(self, input_matrix=None, is_profit_matrix=False): 50 | """ 51 | input_matrix is a List of Lists. 52 | input_matrix is assumed to be a cost matrix unless is_profit_matrix is True. 53 | """ 54 | if input_matrix is not None: 55 | # Save input 56 | my_matrix = np.array(input_matrix) 57 | self._input_matrix = np.array(input_matrix) 58 | self._maxColumn = my_matrix.shape[1] 59 | self._maxRow = my_matrix.shape[0] 60 | 61 | # Adds 0s if any columns/rows are added. 
Otherwise stays unaltered 62 | matrix_size = max(self._maxColumn, self._maxRow) 63 | pad_columns = matrix_size - self._maxRow 64 | pad_rows = matrix_size - self._maxColumn 65 | my_matrix = np.pad(my_matrix, ((0,pad_columns),(0,pad_rows)), 'constant', constant_values=(0)) 66 | 67 | # Convert matrix to profit matrix if necessary 68 | if is_profit_matrix: 69 | my_matrix = self.make_cost_matrix(my_matrix) 70 | 71 | self._cost_matrix = my_matrix 72 | self._size = len(my_matrix) 73 | self._shape = my_matrix.shape 74 | 75 | # Results from algorithm. 76 | self._results = [] 77 | self._totalPotential = 0 78 | else: 79 | self._cost_matrix = None 80 | 81 | def get_results(self): 82 | """Get results after calculation.""" 83 | return self._results 84 | 85 | def get_total_potential(self): 86 | """Returns expected value after calculation.""" 87 | return self._totalPotential 88 | 89 | def calculate(self, input_matrix=None, is_profit_matrix=False): 90 | """ 91 | Implementation of the Hungarian (Munkres) Algorithm. 92 | input_matrix is a List of Lists. 93 | input_matrix is assumed to be a cost matrix unless is_profit_matrix is True. 94 | """ 95 | # Handle invalid and new matrix inputs. 96 | if input_matrix is None and self._cost_matrix is None: 97 | raise HungarianError("Invalid input") 98 | elif input_matrix is not None: 99 | self.__init__(input_matrix, is_profit_matrix) 100 | 101 | result_matrix = self._cost_matrix.copy() 102 | 103 | # Step 1: Subtract row mins from each row. 104 | for index, row in enumerate(result_matrix): 105 | result_matrix[index] -= row.min() 106 | 107 | # Step 2: Subtract column mins from each column. 108 | for index, column in enumerate(result_matrix.T): 109 | result_matrix[:, index] -= column.min() 110 | 111 | # Step 3: Use minimum number of lines to cover all zeros in the matrix. 112 | # If the total covered rows+columns is not equal to the matrix size then adjust matrix and repeat. 113 | total_covered = 0 114 | while total_covered < self._size: 115 | # Find minimum number of lines to cover all zeros in the matrix and find total covered rows and columns. 116 | cover_zeros = CoverZeros(result_matrix) 117 | covered_rows = cover_zeros.get_covered_rows() 118 | covered_columns = cover_zeros.get_covered_columns() 119 | total_covered = len(covered_rows) + len(covered_columns) 120 | 121 | # if the total covered rows+columns is not equal to the matrix size then adjust it by min uncovered num (m). 122 | if total_covered < self._size: 123 | result_matrix = self._adjust_matrix_by_min_uncovered_num(result_matrix, covered_rows, covered_columns) 124 | 125 | # Step 4: Starting with the top row, work your way downwards as you make assignments. 126 | # Find single zeros in rows or columns. 127 | # Add them to final result and remove them and their associated row/column from the matrix. 128 | expected_results = min(self._maxColumn, self._maxRow) 129 | zero_locations = (result_matrix == 0) 130 | while len(self._results) != expected_results: 131 | 132 | # If number of zeros in the matrix is zero before finding all the results then an error has occurred. 133 | if not zero_locations.any(): 134 | raise HungarianError("Unable to find results. 
Algorithm has failed.") 135 | 136 | # Find results and mark rows and columns for deletion 137 | matched_rows, matched_columns = self.__find_matches(zero_locations) 138 | 139 | # Make arbitrary selection 140 | total_matched = len(matched_rows) + len(matched_columns) 141 | if total_matched == 0: 142 | matched_rows, matched_columns = self.select_arbitrary_match(zero_locations) 143 | 144 | # Delete rows and columns 145 | for row in matched_rows: 146 | zero_locations[row] = False 147 | for column in matched_columns: 148 | zero_locations[:, column] = False 149 | 150 | # Save Results 151 | self.__set_results(zip(matched_rows, matched_columns)) 152 | 153 | # Calculate total potential 154 | value = 0 155 | for row, column in self._results: 156 | value += self._input_matrix[row, column] 157 | self._totalPotential = value 158 | 159 | @staticmethod 160 | def make_cost_matrix(profit_matrix): 161 | """ 162 | Converts a profit matrix into a cost matrix. 163 | Expects NumPy objects as input. 164 | """ 165 | # subtract profit matrix from a matrix made of the max value of the profit matrix 166 | matrix_shape = profit_matrix.shape 167 | offset_matrix = np.ones(matrix_shape, dtype=int) * profit_matrix.max() 168 | cost_matrix = offset_matrix - profit_matrix 169 | return cost_matrix 170 | 171 | def _adjust_matrix_by_min_uncovered_num(self, result_matrix, covered_rows, covered_columns): 172 | """Subtract m from every uncovered number and add m to every element covered with two lines.""" 173 | # Calculate minimum uncovered number (m) 174 | elements = [] 175 | for row_index, row in enumerate(result_matrix): 176 | if row_index not in covered_rows: 177 | for index, element in enumerate(row): 178 | if index not in covered_columns: 179 | elements.append(element) 180 | min_uncovered_num = min(elements) 181 | 182 | # Add m to every covered element 183 | adjusted_matrix = result_matrix 184 | for row in covered_rows: 185 | adjusted_matrix[row] += min_uncovered_num 186 | for column in covered_columns: 187 | adjusted_matrix[:, column] += min_uncovered_num 188 | 189 | # Subtract m from every element 190 | m_matrix = np.ones(self._shape, dtype=int) * min_uncovered_num 191 | adjusted_matrix -= m_matrix 192 | 193 | return adjusted_matrix 194 | 195 | def __find_matches(self, zero_locations): 196 | """Returns rows and columns with matches in them.""" 197 | marked_rows = np.array([], dtype=int) 198 | marked_columns = np.array([], dtype=int) 199 | 200 | # Mark rows and columns with matches 201 | # Iterate over rows 202 | for index, row in enumerate(zero_locations): 203 | row_index = np.array([index]) 204 | if np.sum(row) == 1: 205 | column_index, = np.where(row) 206 | marked_rows, marked_columns = self.__mark_rows_and_columns(marked_rows, marked_columns, row_index, 207 | column_index) 208 | 209 | # Iterate over columns 210 | for index, column in enumerate(zero_locations.T): 211 | column_index = np.array([index]) 212 | if np.sum(column) == 1: 213 | row_index, = np.where(column) 214 | marked_rows, marked_columns = self.__mark_rows_and_columns(marked_rows, marked_columns, row_index, 215 | column_index) 216 | 217 | return marked_rows, marked_columns 218 | 219 | @staticmethod 220 | def __mark_rows_and_columns(marked_rows, marked_columns, row_index, column_index): 221 | """Check if column or row is marked. 
If not marked then mark it.""" 222 | new_marked_rows = marked_rows 223 | new_marked_columns = marked_columns 224 | if not (marked_rows == row_index).any() and not (marked_columns == column_index).any(): 225 | new_marked_rows = np.insert(marked_rows, len(marked_rows), row_index) 226 | new_marked_columns = np.insert(marked_columns, len(marked_columns), column_index) 227 | return new_marked_rows, new_marked_columns 228 | 229 | @staticmethod 230 | def select_arbitrary_match(zero_locations): 231 | """Selects row column combination with minimum number of zeros in it.""" 232 | # Count number of zeros in row and column combinations 233 | rows, columns = np.where(zero_locations) 234 | zero_count = [] 235 | for index, row in enumerate(rows): 236 | total_zeros = np.sum(zero_locations[row]) + np.sum(zero_locations[:, columns[index]]) 237 | zero_count.append(total_zeros) 238 | 239 | # Get the row column combination with the minimum number of zeros. 240 | indices = zero_count.index(min(zero_count)) 241 | row = np.array([rows[indices]]) 242 | column = np.array([columns[indices]]) 243 | 244 | return row, column 245 | 246 | def __set_results(self, result_lists): 247 | """Set results during calculation.""" 248 | # Check if results values are out of bound from input matrix (because of matrix being padded). 249 | # Add results to results list. 250 | for result in result_lists: 251 | row, column = result 252 | if row < self._maxRow and column < self._maxColumn: 253 | new_result = (int(row), int(column)) 254 | self._results.append(new_result) 255 | 256 | 257 | class CoverZeros: 258 | """ 259 | Use minimum number of lines to cover all zeros in the matrix. 260 | Algorithm based on: http://weber.ucsd.edu/~vcrawfor/hungar.pdf 261 | """ 262 | 263 | def __init__(self, matrix): 264 | """ 265 | Input a matrix and save it as a boolean matrix to designate zero locations. 266 | Run calculation procedure to generate results. 267 | """ 268 | # Find zeros in matrix 269 | self._zero_locations = (matrix == 0) 270 | self._shape = matrix.shape 271 | 272 | # Choices starts without any choices made. 273 | self._choices = np.zeros(self._shape, dtype=bool) 274 | 275 | self._marked_rows = [] 276 | self._marked_columns = [] 277 | 278 | # marks rows and columns 279 | self.__calculate() 280 | 281 | # Draw lines through all unmarked rows and all marked columns. 282 | self._covered_rows = list(set(range(self._shape[0])) - set(self._marked_rows)) 283 | self._covered_columns = self._marked_columns 284 | 285 | def get_covered_rows(self): 286 | """Return list of covered rows.""" 287 | return self._covered_rows 288 | 289 | def get_covered_columns(self): 290 | """Return list of covered columns.""" 291 | return self._covered_columns 292 | 293 | def __calculate(self): 294 | """ 295 | Calculates minimum number of lines necessary to cover all zeros in a matrix. 296 | Algorithm based on: http://weber.ucsd.edu/~vcrawfor/hungar.pdf 297 | """ 298 | while True: 299 | # Erase all marks. 300 | self._marked_rows = [] 301 | self._marked_columns = [] 302 | 303 | # Mark all rows in which no choice has been made. 304 | for index, row in enumerate(self._choices): 305 | if not row.any(): 306 | self._marked_rows.append(index) 307 | 308 | # If no marked rows then finish. 309 | if not self._marked_rows: 310 | return True 311 | 312 | # Mark all columns not already marked which have zeros in marked rows. 313 | num_marked_columns = self.__mark_new_columns_with_zeros_in_marked_rows() 314 | 315 | # If no new marked columns then finish. 
316 | if num_marked_columns == 0: 317 | return True 318 | 319 | # While there is some choice in every marked column. 320 | while self.__choice_in_all_marked_columns(): 321 | # Some Choice in every marked column. 322 | 323 | # Mark all rows not already marked which have choices in marked columns. 324 | num_marked_rows = self.__mark_new_rows_with_choices_in_marked_columns() 325 | 326 | # If no new marks then Finish. 327 | if num_marked_rows == 0: 328 | return True 329 | 330 | # Mark all columns not already marked which have zeros in marked rows. 331 | num_marked_columns = self.__mark_new_columns_with_zeros_in_marked_rows() 332 | 333 | # If no new marked columns then finish. 334 | if num_marked_columns == 0: 335 | return True 336 | 337 | # No choice in one or more marked columns. 338 | # Find a marked column that does not have a choice. 339 | choice_column_index = self.__find_marked_column_without_choice() 340 | 341 | while choice_column_index is not None: 342 | # Find a zero in the column indexed that does not have a row with a choice. 343 | choice_row_index = self.__find_row_without_choice(choice_column_index) 344 | 345 | # Check if an available row was found. 346 | new_choice_column_index = None 347 | if choice_row_index is None: 348 | # Find a good row to accomodate swap. Find its column pair. 349 | choice_row_index, new_choice_column_index = \ 350 | self.__find_best_choice_row_and_new_column(choice_column_index) 351 | 352 | # Delete old choice. 353 | self._choices[choice_row_index, new_choice_column_index] = False 354 | 355 | # Set zero to choice. 356 | self._choices[choice_row_index, choice_column_index] = True 357 | 358 | # Loop again if choice is added to a row with a choice already in it. 359 | choice_column_index = new_choice_column_index 360 | 361 | def __mark_new_columns_with_zeros_in_marked_rows(self): 362 | """Mark all columns not already marked which have zeros in marked rows.""" 363 | num_marked_columns = 0 364 | for index, column in enumerate(self._zero_locations.T): 365 | if index not in self._marked_columns: 366 | if column.any(): 367 | row_indices, = np.where(column) 368 | zeros_in_marked_rows = (set(self._marked_rows) & set(row_indices)) != set([]) 369 | if zeros_in_marked_rows: 370 | self._marked_columns.append(index) 371 | num_marked_columns += 1 372 | return num_marked_columns 373 | 374 | def __mark_new_rows_with_choices_in_marked_columns(self): 375 | """Mark all rows not already marked which have choices in marked columns.""" 376 | num_marked_rows = 0 377 | for index, row in enumerate(self._choices): 378 | if index not in self._marked_rows: 379 | if row.any(): 380 | column_index, = np.where(row) 381 | if column_index in self._marked_columns: 382 | self._marked_rows.append(index) 383 | num_marked_rows += 1 384 | return num_marked_rows 385 | 386 | def __choice_in_all_marked_columns(self): 387 | """Return Boolean True if there is a choice in all marked columns. Returns boolean False otherwise.""" 388 | for column_index in self._marked_columns: 389 | if not self._choices[:, column_index].any(): 390 | return False 391 | return True 392 | 393 | def __find_marked_column_without_choice(self): 394 | """Find a marked column that does not have a choice.""" 395 | for column_index in self._marked_columns: 396 | if not self._choices[:, column_index].any(): 397 | return column_index 398 | 399 | raise HungarianError( 400 | "Could not find a column without a choice. Failed to cover matrix zeros. 
Algorithm has failed.") 401 | 402 | def __find_row_without_choice(self, choice_column_index): 403 | """Find a row without a choice in it for the column indexed. If a row does not exist then return None.""" 404 | row_indices, = np.where(self._zero_locations[:, choice_column_index]) 405 | for row_index in row_indices: 406 | if not self._choices[row_index].any(): 407 | return row_index 408 | 409 | # All rows have choices. Return None. 410 | return None 411 | 412 | def __find_best_choice_row_and_new_column(self, choice_column_index): 413 | """ 414 | Find a row index to use for the choice so that the column that needs to be changed is optimal. 415 | Return a random row and column if unable to find an optimal selection. 416 | """ 417 | row_indices, = np.where(self._zero_locations[:, choice_column_index]) 418 | for row_index in row_indices: 419 | column_indices, = np.where(self._choices[row_index]) 420 | column_index = column_indices[0] 421 | if self.__find_row_without_choice(column_index) is not None: 422 | return row_index, column_index 423 | 424 | # Cannot find optimal row and column. Return a random row and column. 425 | from random import shuffle 426 | 427 | shuffle(row_indices) 428 | column_index, = np.where(self._choices[row_indices[0]]) 429 | return row_indices[0], column_index[0] 430 | 431 | 432 | if __name__ == '__main__': 433 | profit_matrix = [ 434 | [62, 75, 80, 93, 95, 97], 435 | [75, 80, 82, 85, 71, 97], 436 | [80, 75, 81, 98, 90, 97], 437 | [78, 82, 84, 80, 50, 98], 438 | [90, 85, 85, 80, 85, 99], 439 | [65, 75, 80, 75, 68, 96]] 440 | 441 | hungarian = Hungarian(profit_matrix, is_profit_matrix=True) 442 | hungarian.calculate() 443 | print("Expected value:\t\t543") 444 | print("Calculated value:\t", hungarian.get_total_potential()) # = 543 445 | print("Expected results:\n\t[(0, 4), (2, 3), (5, 5), (4, 0), (1, 1), (3, 2)]") 446 | print("Results:\n\t", hungarian.get_results()) 447 | print("-" * 80) 448 | 449 | cost_matrix = [ 450 | [4, 2, 8], 451 | [4, 3, 7], 452 | [3, 1, 6]] 453 | hungarian = Hungarian(cost_matrix) 454 | print('calculating...') 455 | hungarian.calculate() 456 | print("Expected value:\t\t12") 457 | print("Calculated value:\t", hungarian.get_total_potential()) # = 12 458 | print("Expected results:\n\t[(0, 1), (1, 0), (2, 2)]") 459 | print("Results:\n\t", hungarian.get_results()) 460 | print("-" * 80) 461 | 462 | profit_matrix = [ 463 | [62, 75, 80, 93, 0, 97], 464 | [75, 0, 82, 85, 71, 97], 465 | [80, 75, 81, 0, 90, 97], 466 | [78, 82, 0, 80, 50, 98], 467 | [0, 85, 85, 80, 85, 99], 468 | [65, 75, 80, 75, 68, 0]] 469 | hungarian = Hungarian() 470 | hungarian.calculate(profit_matrix, is_profit_matrix=True) 471 | print("Expected value:\t\t523") 472 | print("Calculated value:\t", hungarian.get_total_potential()) # = 523 473 | print("Expected results:\n\t[(0, 3), (2, 4), (3, 0), (5, 2), (1, 5), (4, 1)]") 474 | print("Results:\n\t", hungarian.get_results()) 475 | print("-" * 80) 476 | -------------------------------------------------------------------------------- /lib_graph_partition/hungarian_1.py: -------------------------------------------------------------------------------- 1 | ''' 2 | reference: https://www.topcoder.com/community/competitive-programming/tutorials/assignment-problem-and-hungarian-algorithm/ 3 | ''' 4 | 5 | import numpy as np 6 | 7 | #max weight assignment 8 | class KMMatcher: 9 | 10 | ## weights : nxm weight matrix (numpy , float), n <= m 11 | def __init__(self, weights): 12 | weights = np.array(weights).astype(np.float32) 13 | self.weights = weights 
14 | self.n, self.m = weights.shape 15 | assert self.n <= self.m 16 | # init label 17 | self.label_x = np.max(weights, axis=1) 18 | self.label_y = np.zeros((self.m, ), dtype=np.float32) 19 | 20 | self.max_match = 0 21 | self.xy = -np.ones((self.n,), dtype=int) 22 | self.yx = -np.ones((self.m,), dtype=int) 23 | 24 | def do_augment(self, x, y): # flip the augmenting path that ends at edge (x, y) 25 | self.max_match += 1 26 | while x != -2: 27 | self.yx[y] = x 28 | ty = self.xy[x] 29 | self.xy[x] = y 30 | x, y = self.prev[x], ty 31 | 32 | def find_augment_path(self): 33 | self.S = np.zeros((self.n,), dtype=bool) 34 | self.T = np.zeros((self.m,), dtype=bool) 35 | 36 | self.slack = np.zeros((self.m,), dtype=np.float32) 37 | self.slackyx = -np.ones((self.m,), dtype=int) # l[slackyx[y]] + l[y] - w[slackx[y], y] == slack[y] 38 | 39 | self.prev = -np.ones((self.n,), dtype=int) 40 | 41 | queue, st = [], 0 42 | root = -1 43 | 44 | for x in range(self.n): 45 | if self.xy[x] == -1: 46 | queue.append(x) 47 | root = x 48 | self.prev[x] = -2 49 | self.S[x] = True 50 | break 51 | 52 | self.slack = self.label_y + self.label_x[root] - self.weights[root] 53 | self.slackyx[:] = root 54 | 55 | while True: 56 | while st < len(queue): 57 | x = queue[st]; st += 1 58 | 59 | is_in_graph = np.isclose(self.weights[x], self.label_x[x] + self.label_y) 60 | nonzero_inds = np.nonzero(np.logical_and(is_in_graph, np.logical_not(self.T)))[0] 61 | 62 | for y in nonzero_inds: 63 | if self.yx[y] == -1: 64 | return x, y 65 | self.T[y] = True 66 | queue.append(self.yx[y]) 67 | self.add_to_tree(self.yx[y], x) 68 | 69 | self.update_labels() 70 | queue, st = [], 0 71 | is_in_graph = np.isclose(self.slack, 0) 72 | nonzero_inds = np.nonzero(np.logical_and(is_in_graph, np.logical_not(self.T)))[0] 73 | 74 | for y in nonzero_inds: 75 | x = self.slackyx[y] 76 | if self.yx[y] == -1: 77 | return x, y 78 | self.T[y] = True 79 | if not self.S[self.yx[y]]: 80 | queue.append(x) 81 | self.add_to_tree(self.yx[y], x) 82 | 83 | def solve(self, verbose=False): 84 | while self.max_match < self.n: 85 | x, y = self.find_augment_path() 86 | self.do_augment(x, y) 87 | 88 | total = 0. 89 | for x in range(self.n): 90 | if verbose: 91 | print('match {} to {}, weight {:.4f}'.format(x, self.xy[x], self.weights[x, self.xy[x]])) 92 | total += self.weights[x, self.xy[x]] 93 | self.best = total 94 | if verbose: 95 | print('ans: {:.4f}'.format(total)) 96 | return self.xy, total 97 | 98 | 99 | def add_to_tree(self, x, prevx): 100 | self.S[x] = True 101 | self.prev[x] = prevx 102 | 103 | better_slack_idx = self.label_x[x] + self.label_y - self.weights[x] < self.slack 104 | self.slack[better_slack_idx] = self.label_x[x] + self.label_y[better_slack_idx] - self.weights[x, better_slack_idx] 105 | self.slackyx[better_slack_idx] = x 106 | 107 | def update_labels(self): # standard dual update by the minimum slack over columns outside T 108 | delta = self.slack[np.logical_not(self.T)].min() 109 | self.label_x[self.S] -= delta 110 | self.label_y[self.T] += delta 111 | self.slack[np.logical_not(self.T)] -= delta 112 | 113 | 114 | if __name__ == '__main__': 115 | matcher = KMMatcher([ 116 | [2., 3., 0., 3.], 117 | [0., 4., 4., 0.], 118 | [5., 6., 0., 0.], 119 | [0., 0., 7., 0.]
120 | ]) 121 | best = matcher.solve(verbose=True) 122 | print(best) 123 | -------------------------------------------------------------------------------- /lib_graph_partition/metis_partition.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import networkx as nx 3 | import pymetis 4 | from torch_geometric.data import ClusterData 5 | from torch_geometric.utils import from_networkx 6 | 7 | from lib_graph_partition.partition import Partition 8 | 9 | 10 | class MetisPartition(Partition): 11 | def __init__(self, args, graph, dataset): 12 | super(MetisPartition, self).__init__(args, graph, dataset) 13 | self.graph = graph 14 | self.args = args 15 | self.data = dataset 16 | 17 | def partition(self, recursive=False): 18 | # recursive (bool, optional): If set to :obj:`True`, will use multilevel 19 | # recursive bisection instead of multilevel k-way partitioning. 20 | # (default: :obj:`False`) 21 | # only use train data, not the whole dataset 22 | self.train_data = from_networkx(self.graph) 23 | data = ClusterData(self.train_data, self.args['num_shards'], recursive=recursive) 24 | 25 | community_to_node = {} 26 | for i in range(self.args['num_shards']): 27 | community_to_node[i] = [*range(data.partptr[i], data.partptr[i+1], 1)] 28 | 29 | # map node back to original graph 30 | for com in range(self.args['num_shards']): 31 | community_to_node[com] = np.array(list(self.graph.nodes))[data.partptr.numpy()[com]:data.partptr.numpy()[com+1]] 32 | 33 | return community_to_node 34 | 35 | 36 | class PyMetisPartition(Partition): 37 | def __init__(self, args, graph, dataset): 38 | super(PyMetisPartition, self).__init__(args, graph, dataset) 39 | self.graph = graph 40 | self.args = args 41 | self.data = dataset 42 | 43 | def partition(self, recursive=False): 44 | # recursive (bool, optional): If set to :obj:`True`, will use multilevel 45 | # recursive bisection instead of multilevel k-way partitioning. 
46 | # (default: :obj:`False`) 47 | # only use train data, not the whole dataset 48 | # map graph into new graph 49 | mapping = {} 50 | for i, node in enumerate(self.graph.nodes): 51 | mapping[node] = i 52 | partition_graph = nx.relabel_nodes(self.graph, mapping=mapping) 53 | 54 | adj_list = [] 55 | for line in nx.generate_adjlist(partition_graph): 56 | line_int = list(map(int, line.split())) 57 | adj_list.append(np.array(line_int)) 58 | 59 | n_cuts, membership = pymetis.part_graph(self.args['num_shards'], adjacency=adj_list) 60 | 61 | # map node back to original graph 62 | community_to_node = {} 63 | for shard_index in range(self.args['num_shards']): 64 | community_to_node[shard_index] = np.array([node_id for node_id, node_shard_index in zip(list(mapping.keys()), membership) if node_shard_index == shard_index]) 65 | return community_to_node 66 | -------------------------------------------------------------------------------- /lib_graph_partition/partition.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class Partition: 5 | def __init__(self, args, graph, dataset=None): 6 | self.args = args 7 | self.graph = graph 8 | self.dataset = dataset 9 | 10 | self.partition_method = self.args['partition_method'] 11 | self.num_shards = self.args['num_shards'] 12 | self.dataset_name = self.args['dataset_name'] 13 | 14 | def idx2id(self, idx_dict, node_list): 15 | ret_dict = {} 16 | for com, idx in idx_dict.items(): 17 | ret_dict[com] = node_list[list(idx)] 18 | 19 | return ret_dict 20 | 21 | def id2idx(self, id_dict, node_list): 22 | ret_dict = {} 23 | for com, id in id_dict.items(): 24 | ret_dict[com] = np.searchsorted(node_list, id) 25 | 26 | return ret_dict 27 | -------------------------------------------------------------------------------- /lib_graph_partition/partition_kmeans.py: -------------------------------------------------------------------------------- 1 | import math 2 | import pickle 3 | 4 | import cupy as cp 5 | import numpy as np 6 | import logging 7 | 8 | from sklearn.cluster import KMeans 9 | 10 | import config 11 | from lib_graph_partition.constrained_kmeans_base import ConstrainedKmeansBase 12 | from lib_graph_partition.partition import Partition 13 | from lib_graph_partition.constrained_kmeans import ConstrainedKmeans 14 | from lib_node_embedding.node_embedding import NodeEmbedding 15 | 16 | 17 | class PartitionKMeans(Partition): 18 | def __init__(self, args, graph, dataset): 19 | super(PartitionKMeans, self).__init__(args, graph, dataset) 20 | 21 | self.logger = logging.getLogger('partition_kmeans') 22 | cp.cuda.Device(self.args['cuda']).use() 23 | self.load_embeddings() 24 | 25 | def load_embeddings(self): 26 | node_embedding = NodeEmbedding(self.args, self.graph, self.dataset) 27 | 28 | if self.partition_method in ["sage_km", "sage_km_base"]: 29 | self.node_to_embedding = node_embedding.sage_encoder() 30 | else: 31 | raise Exception('unsupported embedding method') 32 | 33 | def partition(self): 34 | self.logger.info("partitioning") 35 | 36 | embedding = [] 37 | for node in self.node_to_embedding.keys(): 38 | embedding.append(self.node_to_embedding[node]) 39 | 40 | if not self.args['is_constrained']: 41 | cluster = KMeans(n_clusters=self.num_shards, random_state=10) 42 | cluster_labels = cluster.fit_predict(embedding) 43 | 44 | node_to_community = {} 45 | for com, node in zip(cluster_labels, self.node_to_embedding.keys()): 46 | node_to_community[node] = com 47 | 48 | community_to_node = {} 49 | for com in 
range(len(set(node_to_community.values()))): 50 | community_to_node[com] = np.where(np.array(list(node_to_community.values())) == com)[0] 51 | community_to_node = dict(sorted(community_to_node.items())) 52 | 53 | else: 54 | # node_threshold = math.ceil(self.graph.number_of_nodes() / self.num_shards) 55 | # node_threshold = math.ceil(self.graph.number_of_nodes() / self.num_shards + 0.05*self.graph.number_of_nodes()) 56 | node_threshold = math.ceil( 57 | self.graph.number_of_nodes() / self.args['num_shards'] + self.args['shard_size_delta'] * ( 58 | self.graph.number_of_nodes() - self.graph.number_of_nodes() / self.args['num_shards'])) 59 | self.logger.info("#.nodes: %s. Shard threshold: %s." % (self.graph.number_of_nodes(), node_threshold)) 60 | 61 | if self.partition_method == 'sage_km_base': 62 | cluster = ConstrainedKmeansBase(np.array(embedding), num_clusters=self.num_shards, 63 | node_threshold=node_threshold, 64 | terminate_delta=self.args['terminate_delta']) 65 | cluster.initialization() 66 | community, km_deltas = cluster.clustering() 67 | pickle.dump(km_deltas, open(config.ANALYSIS_PATH + "partition/base_bkm_" + self.args['dataset_name'], 'wb')) 68 | 69 | community_to_node = {} 70 | for i in range(self.num_shards): 71 | community_to_node[i] = np.array(community[i]) 72 | 73 | if self.partition_method == 'sage_km': 74 | cluster = ConstrainedKmeans(cp.array(embedding), num_clusters=self.num_shards, 75 | node_threshold=node_threshold, 76 | terminate_delta=self.args['terminate_delta']) 77 | cluster.initialization() 78 | community, km_deltas = cluster.clustering() 79 | pickle.dump(km_deltas, open(config.ANALYSIS_PATH + "partition/bkm_" + self.args['dataset_name'], 'wb')) 80 | 81 | community_to_node = {} 82 | for i in range(self.num_shards): 83 | community_to_node[i] = np.array(community[i].get().astype(int)) 84 | 85 | return community_to_node 86 | 87 | -------------------------------------------------------------------------------- /lib_graph_partition/partition_lpa.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | import networkx as nx 4 | import logging 5 | import pickle 6 | 7 | from lib_graph_partition.constrained_lpa_base import ConstrainedLPABase 8 | from lib_graph_partition.partition import Partition 9 | from lib_graph_partition.constrained_lpa import ConstrainedLPA 10 | import config 11 | 12 | 13 | class PartitionLPA(Partition): 14 | def __init__(self, args, graph): 15 | super(PartitionLPA, self).__init__(args, graph) 16 | 17 | self.logger = logging.getLogger('partition_lpa') 18 | 19 | def partition(self): 20 | # implement LPA by hand, refer to https://github.com/benedekrozemberczki/LabelPropagation 21 | community_generator = nx.algorithms.community.label_propagation.label_propagation_communities(self.graph) 22 | self.logger.info("Generating LPA communities.") 23 | community_to_node = {key: c for key, c in zip(range(self.graph.number_of_nodes()), community_generator)} 24 | print("Found %s communities by unconstrained LPA", len(community_to_node.keys())) 25 | return community_to_node 26 | 27 | 28 | class PartitionConstrainedLPA(Partition): 29 | def __init__(self, args, graph): 30 | super(PartitionConstrainedLPA, self).__init__(args, graph) 31 | self.args = args 32 | 33 | self.logger = logging.getLogger('partition_constrained_lpa') 34 | 35 | def partition(self): 36 | adj_array = nx.linalg.adj_matrix(self.graph).toarray().astype(np.bool) 37 | # node_threshold = math.ceil(self.graph.number_of_nodes() / 
self.args['num_shards']) + 0.05 * self.graph.number_of_nodes() 38 | # node_threshold = math.ceil(self.graph.number_of_nodes() / self.args['num_shards']) 39 | node_threshold = math.ceil(self.graph.number_of_nodes() / self.args['num_shards'] + 40 | self.args['shard_size_delta'] * (self.graph.number_of_nodes()-self.graph.number_of_nodes() / self.args['num_shards'])) 41 | 42 | self.logger.info(" #. nodes: %s. LPA shard threshold: %s." % (self.graph.number_of_nodes(), node_threshold)) 43 | lpa = ConstrainedLPA(adj_array, self.num_shards, node_threshold, self.args['terminate_delta']) 44 | 45 | lpa.initialization() 46 | community_to_node, lpa_deltas = lpa.community_detection() 47 | 48 | pickle.dump(lpa_deltas, open(config.ANALYSIS_PATH + "partition/blpa_" + self.args['dataset_name'], 'wb')) 49 | 50 | return self.idx2id(community_to_node, np.array(self.graph.nodes)) 51 | 52 | 53 | class PartitionConstrainedLPABase(Partition): 54 | def __init__(self, args, graph): 55 | super(PartitionConstrainedLPABase, self).__init__(args, graph) 56 | self.args = args 57 | 58 | self.logger = logging.getLogger('partition_constrained_lpa') 59 | 60 | def partition(self): 61 | adj_array = nx.linalg.adj_matrix(self.graph).toarray().astype(np.bool) 62 | node_threshold = math.ceil(self.graph.number_of_nodes() / self.args['num_shards'] + self.args['shard_size_delta'] * (self.graph.number_of_nodes()-self.graph.number_of_nodes() / self.args['num_shards'])) 63 | 64 | self.logger.info(" #. nodes: %s. LPA shard threshold: %s." % (self.graph.number_of_nodes(), node_threshold)) 65 | lpa = ConstrainedLPABase(adj_array, self.num_shards, node_threshold, self.args['terminate_delta']) 66 | 67 | lpa.initialization() 68 | community_to_node, lpa_deltas = lpa.community_detection() 69 | 70 | pickle.dump(lpa_deltas, open(config.ANALYSIS_PATH + "partition/base_blpa_" + self.args['dataset_name'], 'wb')) 71 | 72 | return self.idx2id(community_to_node, np.array(self.graph.nodes)) 73 | -------------------------------------------------------------------------------- /lib_graph_partition/partition_random.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from lib_graph_partition.partition import Partition 4 | 5 | 6 | class PartitionRandom(Partition): 7 | def __init__(self, args, graph): 8 | super(PartitionRandom, self).__init__(args, graph) 9 | 10 | def partition(self): 11 | graph_nodes = np.array(self.graph.nodes) 12 | np.random.shuffle(graph_nodes) 13 | train_shard_indices = np.array_split(graph_nodes, self.args['num_shards']) 14 | 15 | return dict(zip(range(self.num_shards), train_shard_indices)) 16 | -------------------------------------------------------------------------------- /lib_node_embedding/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MinChen00/Graph-Unlearning/a9b942d01651c2e3d780ae12e1a1459e35120ffa/lib_node_embedding/__init__.py -------------------------------------------------------------------------------- /lib_node_embedding/ge/__init__.py: -------------------------------------------------------------------------------- 1 | from .models import * -------------------------------------------------------------------------------- /lib_node_embedding/ge/alias.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def create_alias_table(area_ratio): 5 | """ 6 | 7 | :param area_ratio: sum(area_ratio)=1 8 | :return: 
accept,alias 9 | """ 10 | l = len(area_ratio) 11 | accept, alias = [0] * l, [0] * l 12 | small, large = [], [] 13 | area_ratio_ = np.array(area_ratio) * l 14 | for i, prob in enumerate(area_ratio_): 15 | if prob < 1.0: 16 | small.append(i) 17 | else: 18 | large.append(i) 19 | 20 | while small and large: 21 | small_idx, large_idx = small.pop(), large.pop() 22 | accept[small_idx] = area_ratio_[small_idx] 23 | alias[small_idx] = large_idx 24 | area_ratio_[large_idx] = area_ratio_[large_idx] - \ 25 | (1 - area_ratio_[small_idx]) 26 | if area_ratio_[large_idx] < 1.0: 27 | small.append(large_idx) 28 | else: 29 | large.append(large_idx) 30 | 31 | while large: 32 | large_idx = large.pop() 33 | accept[large_idx] = 1 34 | while small: 35 | small_idx = small.pop() 36 | accept[small_idx] = 1 37 | 38 | return accept, alias 39 | 40 | 41 | def alias_sample(accept, alias): 42 | """ 43 | 44 | :param accept: 45 | :param alias: 46 | :return: sample index 47 | """ 48 | N = len(accept) 49 | i = int(np.random.random()*N) 50 | r = np.random.random() 51 | if r < accept[i]: 52 | return i 53 | else: 54 | return alias[i] 55 | -------------------------------------------------------------------------------- /lib_node_embedding/ge/classify.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | 4 | import numpy 5 | from sklearn.metrics import f1_score, accuracy_score 6 | from sklearn.multiclass import OneVsRestClassifier 7 | from sklearn.preprocessing import MultiLabelBinarizer 8 | 9 | 10 | class TopKRanker(OneVsRestClassifier): 11 | def predict(self, X, top_k_list): 12 | probs = numpy.asarray(super(TopKRanker, self).predict_proba(X)) 13 | all_labels = [] 14 | for i, k in enumerate(top_k_list): 15 | probs_ = probs[i, :] 16 | labels = self.classes_[probs_.argsort()[-k:]].tolist() 17 | probs_[:] = 0 18 | probs_[labels] = 1 19 | all_labels.append(probs_) 20 | return numpy.asarray(all_labels) 21 | 22 | 23 | class Classifier(object): 24 | 25 | def __init__(self, embeddings, clf): 26 | self.embeddings = embeddings 27 | self.clf = TopKRanker(clf) 28 | self.binarizer = MultiLabelBinarizer(sparse_output=True) 29 | 30 | def train(self, X, Y, Y_all): 31 | self.binarizer.fit(Y_all) 32 | X_train = [self.embeddings[x] for x in X] 33 | Y = self.binarizer.transform(Y) 34 | self.clf.fit(X_train, Y) 35 | 36 | def evaluate(self, X, Y): 37 | top_k_list = [len(l) for l in Y] 38 | Y_ = self.predict(X, top_k_list) 39 | Y = self.binarizer.transform(Y) 40 | averages = ["micro", "macro", "samples", "weighted"] 41 | results = {} 42 | for average in averages: 43 | results[average] = f1_score(Y, Y_, average=average) 44 | results['acc'] = accuracy_score(Y,Y_) 45 | print('-------------------') 46 | print(results) 47 | return results 48 | print('-------------------') 49 | 50 | def predict(self, X, top_k_list): 51 | X_ = numpy.asarray([self.embeddings[x] for x in X]) 52 | Y = self.clf.predict(X_, top_k_list=top_k_list) 53 | return Y 54 | 55 | def split_train_evaluate(self, X, Y, train_precent, seed=0): 56 | state = numpy.random.get_state() 57 | 58 | training_size = int(train_precent * len(X)) 59 | numpy.random.seed(seed) 60 | shuffle_indices = numpy.random.permutation(numpy.arange(len(X))) 61 | X_train = [X[shuffle_indices[i]] for i in range(training_size)] 62 | Y_train = [Y[shuffle_indices[i]] for i in range(training_size)] 63 | X_test = [X[shuffle_indices[i]] for i in range(training_size, len(X))] 64 | Y_test = [Y[shuffle_indices[i]] for i in range(training_size, len(X))] 
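# Descriptive note (added comment): train on the first `train_precent` fraction of the shuffled samples and evaluate on the remainder; the complete label list Y is passed to train() below so the MultiLabelBinarizer is fitted on every class that occurs in the data, not only on the classes present in the training split.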
65 | 66 | self.train(X_train, Y_train, Y) 67 | numpy.random.set_state(state) 68 | return self.evaluate(X_test, Y_test) 69 | 70 | 71 | def read_node_label(filename, skip_head=False): 72 | fin = open(filename, 'r') 73 | X = [] 74 | Y = [] 75 | while 1: 76 | if skip_head: 77 | fin.readline() 78 | l = fin.readline() 79 | if l == '': 80 | break 81 | vec = l.strip().split(' ') 82 | X.append(vec[0]) 83 | Y.append(vec[1:]) 84 | fin.close() 85 | return X, Y 86 | -------------------------------------------------------------------------------- /lib_node_embedding/ge/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .deepwalk import DeepWalk 2 | from .node2vec import Node2Vec 3 | from .line import LINE 4 | from .sdne import SDNE 5 | from .struc2vec import Struc2Vec 6 | 7 | 8 | __all__ = ["DeepWalk", "Node2Vec", "LINE", "SDNE", "Struc2Vec"] 9 | -------------------------------------------------------------------------------- /lib_node_embedding/ge/models/deepwalk.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | """ 4 | 5 | 6 | 7 | Author: 8 | 9 | Weichen Shen,wcshen1994@163.com 10 | 11 | 12 | 13 | Reference: 14 | 15 | [1] Perozzi B, Al-Rfou R, Skiena S. Deepwalk: Online learning of social representations[C]//Proceedings of the 20th ACM SIGKDD international conference on Knowledge discovery and data mining. ACM, 2014: 701-710.(http://www.perozzi.net/publications/14_kdd_deepwalk.pdf) 16 | 17 | 18 | 19 | """ 20 | from ..walker import RandomWalker 21 | from gensim.models import Word2Vec 22 | import pandas as pd 23 | 24 | 25 | class DeepWalk: 26 | def __init__(self, graph, walk_length, num_walks, workers=1): 27 | 28 | self.graph = graph 29 | self.w2v_model = None 30 | self._embeddings = {} 31 | 32 | self.walker = RandomWalker( 33 | graph, p=1, q=1, ) 34 | self.sentences = self.walker.simulate_walks( 35 | num_walks=num_walks, walk_length=walk_length, workers=workers, verbose=1) 36 | 37 | def train(self, embed_size=128, window_size=5, workers=3, iter=5, **kwargs): 38 | 39 | kwargs["sentences"] = self.sentences 40 | kwargs["min_count"] = kwargs.get("min_count", 0) 41 | kwargs["size"] = embed_size 42 | kwargs["sg"] = 1 # skip gram 43 | kwargs["hs"] = 1 # deepwalk use Hierarchical Softmax 44 | kwargs["workers"] = workers 45 | kwargs["window"] = window_size 46 | kwargs["iter"] = iter 47 | 48 | print("Learning embedding vectors...") 49 | model = Word2Vec(**kwargs) 50 | print("Learning embedding vectors done!") 51 | 52 | self.w2v_model = model 53 | return model 54 | 55 | def get_embeddings(self,): 56 | if self.w2v_model is None: 57 | print("model not train") 58 | return {} 59 | 60 | self._embeddings = {} 61 | for word in self.graph.nodes(): 62 | self._embeddings[word] = self.w2v_model.wv[word] 63 | 64 | return self._embeddings 65 | -------------------------------------------------------------------------------- /lib_node_embedding/ge/models/line.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | """ 4 | 5 | 6 | 7 | Author: 8 | 9 | Weichen Shen,wcshen1994@163.com 10 | 11 | 12 | 13 | Reference: 14 | 15 | [1] Tang J, Qu M, Wang M, et al. Line: Large-scale information network embedding[C]//Proceedings of the 24th International Conference on World Wide Web. 
International World Wide Web Conferences Steering Committee, 2015: 1067-1077.(https://arxiv.org/pdf/1503.03578.pdf) 16 | 17 | 18 | 19 | """ 20 | import math 21 | import random 22 | 23 | import numpy as np 24 | import tensorflow as tf 25 | from tensorflow.python.keras import backend as K 26 | from tensorflow.python.keras.layers import Embedding, Input, Lambda 27 | from tensorflow.python.keras.models import Model 28 | 29 | from ..alias import create_alias_table, alias_sample 30 | from ..utils import preprocess_nxgraph 31 | 32 | 33 | def line_loss(y_true, y_pred): 34 | return -K.mean(K.log(K.sigmoid(y_true*y_pred))) 35 | 36 | 37 | def create_model(numNodes, embedding_size, order='second'): 38 | 39 | v_i = Input(shape=(1,)) 40 | v_j = Input(shape=(1,)) 41 | 42 | first_emb = Embedding(numNodes, embedding_size, name='first_emb') 43 | second_emb = Embedding(numNodes, embedding_size, name='second_emb') 44 | context_emb = Embedding(numNodes, embedding_size, name='context_emb') 45 | 46 | v_i_emb = first_emb(v_i) 47 | v_j_emb = first_emb(v_j) 48 | 49 | v_i_emb_second = second_emb(v_i) 50 | v_j_context_emb = context_emb(v_j) 51 | 52 | first = Lambda(lambda x: tf.reduce_sum( 53 | x[0]*x[1], axis=-1, keep_dims=False), name='first_order')([v_i_emb, v_j_emb]) 54 | second = Lambda(lambda x: tf.reduce_sum( 55 | x[0]*x[1], axis=-1, keep_dims=False), name='second_order')([v_i_emb_second, v_j_context_emb]) 56 | 57 | if order == 'first': 58 | output_list = [first] 59 | elif order == 'second': 60 | output_list = [second] 61 | else: 62 | output_list = [first, second] 63 | 64 | model = Model(inputs=[v_i, v_j], outputs=output_list) 65 | 66 | return model, {'first': first_emb, 'second': second_emb} 67 | 68 | 69 | class LINE: 70 | def __init__(self, graph, embedding_size=8, negative_ratio=5, order='second',): 71 | """ 72 | 73 | :param graph: 74 | :param embedding_size: 75 | :param negative_ratio: 76 | :param order: 'first','second','all' 77 | """ 78 | if order not in ['first', 'second', 'all']: 79 | raise ValueError('mode must be fisrt,second,or all') 80 | 81 | self.graph = graph 82 | self.idx2node, self.node2idx = preprocess_nxgraph(graph) 83 | self.use_alias = True 84 | 85 | self.rep_size = embedding_size 86 | self.order = order 87 | 88 | self._embeddings = {} 89 | self.negative_ratio = negative_ratio 90 | self.order = order 91 | 92 | self.node_size = graph.number_of_nodes() 93 | self.edge_size = graph.number_of_edges() 94 | self.samples_per_epoch = self.edge_size*(1+negative_ratio) 95 | 96 | self._gen_sampling_table() 97 | self.reset_model() 98 | 99 | def reset_training_config(self, batch_size, times): 100 | self.batch_size = batch_size 101 | self.steps_per_epoch = ( 102 | (self.samples_per_epoch - 1) // self.batch_size + 1)*times 103 | 104 | def reset_model(self, opt='adam'): 105 | 106 | self.model, self.embedding_dict = create_model( 107 | self.node_size, self.rep_size, self.order) 108 | self.model.compile(opt, line_loss) 109 | self.batch_it = self.batch_iter(self.node2idx) 110 | 111 | def _gen_sampling_table(self): 112 | 113 | # create sampling table for vertex 114 | power = 0.75 115 | numNodes = self.node_size 116 | node_degree = np.zeros(numNodes) # out degree 117 | node2idx = self.node2idx 118 | 119 | for edge in self.graph.edges(): 120 | node_degree[node2idx[edge[0]] 121 | ] += self.graph[edge[0]][edge[1]].get('weight', 1.0) 122 | 123 | total_sum = sum([math.pow(node_degree[i], power) 124 | for i in range(numNodes)]) 125 | norm_prob = [float(math.pow(node_degree[j], power)) / 126 | total_sum for j in 
range(numNodes)] 127 | 128 | self.node_accept, self.node_alias = create_alias_table(norm_prob) 129 | 130 | # create sampling table for edge 131 | numEdges = self.graph.number_of_edges() 132 | total_sum = sum([self.graph[edge[0]][edge[1]].get('weight', 1.0) 133 | for edge in self.graph.edges()]) 134 | norm_prob = [self.graph[edge[0]][edge[1]].get('weight', 1.0) * 135 | numEdges / total_sum for edge in self.graph.edges()] 136 | 137 | self.edge_accept, self.edge_alias = create_alias_table(norm_prob) 138 | 139 | def batch_iter(self, node2idx): 140 | 141 | edges = [(node2idx[x[0]], node2idx[x[1]]) for x in self.graph.edges()] 142 | 143 | data_size = self.graph.number_of_edges() 144 | shuffle_indices = np.random.permutation(np.arange(data_size)) 145 | # positive or negative mod 146 | mod = 0 147 | mod_size = 1 + self.negative_ratio 148 | h = [] 149 | t = [] 150 | sign = 0 151 | count = 0 152 | start_index = 0 153 | end_index = min(start_index + self.batch_size, data_size) 154 | while True: 155 | if mod == 0: 156 | 157 | h = [] 158 | t = [] 159 | for i in range(start_index, end_index): 160 | if random.random() >= self.edge_accept[shuffle_indices[i]]: 161 | shuffle_indices[i] = self.edge_alias[shuffle_indices[i]] 162 | cur_h = edges[shuffle_indices[i]][0] 163 | cur_t = edges[shuffle_indices[i]][1] 164 | h.append(cur_h) 165 | t.append(cur_t) 166 | sign = np.ones(len(h)) 167 | else: 168 | sign = np.ones(len(h))*-1 169 | t = [] 170 | for i in range(len(h)): 171 | 172 | t.append(alias_sample( 173 | self.node_accept, self.node_alias)) 174 | 175 | if self.order == 'all': 176 | yield ([np.array(h), np.array(t)], [sign, sign]) 177 | else: 178 | yield ([np.array(h), np.array(t)], [sign]) 179 | mod += 1 180 | mod %= mod_size 181 | if mod == 0: 182 | start_index = end_index 183 | end_index = min(start_index + self.batch_size, data_size) 184 | 185 | if start_index >= data_size: 186 | count += 1 187 | mod = 0 188 | h = [] 189 | shuffle_indices = np.random.permutation(np.arange(data_size)) 190 | start_index = 0 191 | end_index = min(start_index + self.batch_size, data_size) 192 | 193 | def get_embeddings(self,): 194 | self._embeddings = {} 195 | if self.order == 'first': 196 | embeddings = self.embedding_dict['first'].get_weights()[0] 197 | elif self.order == 'second': 198 | embeddings = self.embedding_dict['second'].get_weights()[0] 199 | else: 200 | embeddings = np.hstack((self.embedding_dict['first'].get_weights()[ 201 | 0], self.embedding_dict['second'].get_weights()[0])) 202 | idx2node = self.idx2node 203 | for i, embedding in enumerate(embeddings): 204 | self._embeddings[idx2node[i]] = embedding 205 | 206 | return self._embeddings 207 | 208 | def train(self, batch_size=1024, epochs=1, initial_epoch=0, verbose=1, times=1): 209 | self.reset_training_config(batch_size, times) 210 | hist = self.model.fit_generator(self.batch_it, epochs=epochs, initial_epoch=initial_epoch, steps_per_epoch=self.steps_per_epoch, 211 | verbose=verbose) 212 | 213 | return hist 214 | -------------------------------------------------------------------------------- /lib_node_embedding/ge/models/node2vec.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | """ 4 | 5 | 6 | 7 | Author: 8 | 9 | Weichen Shen,wcshen1994@163.com 10 | 11 | 12 | 13 | Reference: 14 | 15 | [1] Grover A, Leskovec J. node2vec: Scalable feature learning for networks[C]//Proceedings of the 22nd ACM SIGKDD international conference on Knowledge discovery and data mining. 
ACM, 2016: 855-864.(https://www.kdd.org/kdd2016/papers/files/rfp0218-groverA.pdf) 16 | 17 | 18 | 19 | """ 20 | 21 | from gensim.models import Word2Vec 22 | import pandas as pd 23 | 24 | from ..walker import RandomWalker 25 | 26 | 27 | class Node2Vec: 28 | 29 | def __init__(self, graph, walk_length, num_walks, p=1.0, q=1.0, workers=1, use_rejection_sampling=0): 30 | 31 | self.graph = graph 32 | self._embeddings = {} 33 | self.walker = RandomWalker( 34 | graph, p=p, q=q, use_rejection_sampling=use_rejection_sampling) 35 | 36 | print("Preprocess transition probs...") 37 | self.walker.preprocess_transition_probs() 38 | 39 | self.sentences = self.walker.simulate_walks( 40 | num_walks=num_walks, walk_length=walk_length, workers=workers, verbose=1) 41 | 42 | def train(self, embed_size=128, window_size=5, workers=3, iter=5, **kwargs): 43 | 44 | kwargs["sentences"] = self.sentences 45 | kwargs["min_count"] = kwargs.get("min_count", 0) 46 | kwargs["size"] = embed_size 47 | kwargs["sg"] = 1 48 | kwargs["hs"] = 0 # node2vec not use Hierarchical Softmax 49 | kwargs["workers"] = workers 50 | kwargs["window"] = window_size 51 | kwargs["iter"] = iter 52 | 53 | print("Learning embedding vectors...") 54 | model = Word2Vec(**kwargs) 55 | print("Learning embedding vectors done!") 56 | 57 | self.w2v_model = model 58 | 59 | return model 60 | 61 | def get_embeddings(self,): 62 | if self.w2v_model is None: 63 | print("model not train") 64 | return {} 65 | 66 | self._embeddings = {} 67 | for word in self.graph.nodes(): 68 | self._embeddings[word] = self.w2v_model.wv[word] 69 | 70 | return self._embeddings 71 | -------------------------------------------------------------------------------- /lib_node_embedding/ge/models/sdne.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | """ 4 | 5 | 6 | 7 | Author: 8 | 9 | Weichen Shen,wcshen1994@163.com 10 | 11 | 12 | 13 | Reference: 14 | 15 | [1] Wang D, Cui P, Zhu W. Structural deep network embedding[C]//Proceedings of the 22nd ACM SIGKDD international conference on Knowledge discovery and data mining. 
ACM, 2016: 1225-1234.(https://www.kdd.org/kdd2016/papers/files/rfp0191-wangAemb.pdf) 16 | 17 | 18 | 19 | """ 20 | import time 21 | 22 | import numpy as np 23 | import scipy.sparse as sp 24 | import tensorflow as tf 25 | from tensorflow.python.keras import backend as K 26 | from tensorflow.python.keras.callbacks import History 27 | from tensorflow.python.keras.layers import Dense, Input 28 | from tensorflow.python.keras.models import Model 29 | from tensorflow.python.keras.regularizers import l1_l2 30 | 31 | from ..utils import preprocess_nxgraph 32 | 33 | 34 | def l_2nd(beta): 35 | def loss_2nd(y_true, y_pred): 36 | b_ = np.ones_like(y_true) 37 | b_[y_true != 0] = beta 38 | x = K.square((y_true - y_pred) * b_) 39 | t = K.sum(x, axis=-1, ) 40 | return K.mean(t) 41 | 42 | return loss_2nd 43 | 44 | 45 | def l_1st(alpha): 46 | def loss_1st(y_true, y_pred): 47 | L = y_true 48 | Y = y_pred 49 | batch_size = tf.to_float(K.shape(L)[0]) 50 | return alpha * 2 * tf.linalg.trace(tf.matmul(tf.matmul(Y, L, transpose_a=True), Y)) / batch_size 51 | 52 | return loss_1st 53 | 54 | 55 | def create_model(node_size, hidden_size=[256, 128], l1=1e-5, l2=1e-4): 56 | A = Input(shape=(node_size,)) 57 | L = Input(shape=(None,)) 58 | fc = A 59 | for i in range(len(hidden_size)): 60 | if i == len(hidden_size) - 1: 61 | fc = Dense(hidden_size[i], activation='relu', 62 | kernel_regularizer=l1_l2(l1, l2), name='1st')(fc) 63 | else: 64 | fc = Dense(hidden_size[i], activation='relu', 65 | kernel_regularizer=l1_l2(l1, l2))(fc) 66 | Y = fc 67 | for i in reversed(range(len(hidden_size) - 1)): 68 | fc = Dense(hidden_size[i], activation='relu', 69 | kernel_regularizer=l1_l2(l1, l2))(fc) 70 | 71 | A_ = Dense(node_size, 'relu', name='2nd')(fc) 72 | model = Model(inputs=[A, L], outputs=[A_, Y]) 73 | emb = Model(inputs=A, outputs=Y) 74 | return model, emb 75 | 76 | 77 | class SDNE(object): 78 | def __init__(self, graph, hidden_size=[32, 16], alpha=1e-6, beta=5., nu1=1e-5, nu2=1e-4, ): 79 | 80 | self.graph = graph 81 | # self.g.remove_edges_from(self.g.selfloop_edges()) 82 | self.idx2node, self.node2idx = preprocess_nxgraph(self.graph) 83 | 84 | self.node_size = self.graph.number_of_nodes() 85 | self.hidden_size = hidden_size 86 | self.alpha = alpha 87 | self.beta = beta 88 | self.nu1 = nu1 89 | self.nu2 = nu2 90 | 91 | self.A, self.L = self._create_A_L( 92 | self.graph, self.node2idx) # Adj Matrix,L Matrix 93 | self.reset_model() 94 | self.inputs = [self.A, self.L] 95 | self._embeddings = {} 96 | 97 | def reset_model(self, opt='adam'): 98 | 99 | self.model, self.emb_model = create_model(self.node_size, hidden_size=self.hidden_size, l1=self.nu1, 100 | l2=self.nu2) 101 | self.model.compile(opt, [l_2nd(self.beta), l_1st(self.alpha)]) 102 | self.get_embeddings() 103 | 104 | def train(self, batch_size=1024, epochs=1, initial_epoch=0, verbose=1): 105 | if batch_size >= self.node_size: 106 | if batch_size > self.node_size: 107 | print('batch_size({0}) > node_size({1}),set batch_size = {1}'.format( 108 | batch_size, self.node_size)) 109 | batch_size = self.node_size 110 | return self.model.fit([self.A.todense(), self.L.todense()], [self.A.todense(), self.L.todense()], 111 | batch_size=batch_size, epochs=epochs, initial_epoch=initial_epoch, verbose=verbose, 112 | shuffle=False, ) 113 | else: 114 | steps_per_epoch = (self.node_size - 1) // batch_size + 1 115 | hist = History() 116 | hist.on_train_begin() 117 | logs = {} 118 | for epoch in range(initial_epoch, epochs): 119 | start_time = time.time() 120 | losses = np.zeros(3) 121 | for i in 
range(steps_per_epoch): 122 | index = np.arange( 123 | i * batch_size, min((i + 1) * batch_size, self.node_size)) 124 | A_train = self.A[index, :].todense() 125 | L_mat_train = self.L[index][:, index].todense() 126 | inp = [A_train, L_mat_train] 127 | batch_losses = self.model.train_on_batch(inp, inp) 128 | losses += batch_losses 129 | losses = losses / steps_per_epoch 130 | 131 | logs['loss'] = losses[0] 132 | logs['2nd_loss'] = losses[1] 133 | logs['1st_loss'] = losses[2] 134 | epoch_time = int(time.time() - start_time) 135 | hist.on_epoch_end(epoch, logs) 136 | if verbose > 0: 137 | print('Epoch {0}/{1}'.format(epoch + 1, epochs)) 138 | print('{0}s - loss: {1: .4f} - 2nd_loss: {2: .4f} - 1st_loss: {3: .4f}'.format( 139 | epoch_time, losses[0], losses[1], losses[2])) 140 | return hist 141 | 142 | def evaluate(self, ): 143 | return self.model.evaluate(x=self.inputs, y=self.inputs, batch_size=self.node_size) 144 | 145 | def get_embeddings(self): 146 | self._embeddings = {} 147 | embeddings = self.emb_model.predict(self.A.todense(), batch_size=self.node_size) 148 | look_back = self.idx2node 149 | for i, embedding in enumerate(embeddings): 150 | self._embeddings[look_back[i]] = embedding 151 | 152 | return self._embeddings 153 | 154 | def _create_A_L(self, graph, node2idx): 155 | node_size = graph.number_of_nodes() 156 | A_data = [] 157 | A_row_index = [] 158 | A_col_index = [] 159 | 160 | for edge in graph.edges(): 161 | v1, v2 = edge 162 | edge_weight = graph[v1][v2].get('weight', 1) 163 | 164 | A_data.append(edge_weight) 165 | A_row_index.append(node2idx[v1]) 166 | A_col_index.append(node2idx[v2]) 167 | 168 | A = sp.csr_matrix((A_data, (A_row_index, A_col_index)), shape=(node_size, node_size)) 169 | A_ = sp.csr_matrix((A_data + A_data, (A_row_index + A_col_index, A_col_index + A_row_index)), 170 | shape=(node_size, node_size)) 171 | 172 | D = sp.diags(A_.sum(axis=1).flatten().tolist()[0]) 173 | L = D - A_ 174 | return A, L 175 | -------------------------------------------------------------------------------- /lib_node_embedding/ge/models/struc2vec.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | """ 4 | 5 | 6 | 7 | Author: 8 | 9 | Weichen Shen,wcshen1994@163.com 10 | 11 | 12 | 13 | Reference: 14 | 15 | [1] Ribeiro L F R, Saverese P H P, Figueiredo D R. struc2vec: Learning node representations from structural identity[C]//Proceedings of the 23rd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining. 
ACM, 2017: 385-394.(https://arxiv.org/pdf/1704.03165.pdf) 16 | 17 | 18 | 19 | """ 20 | 21 | import math 22 | import os 23 | import shutil 24 | from collections import ChainMap, deque 25 | 26 | import numpy as np 27 | import pandas as pd 28 | from fastdtw import fastdtw 29 | from gensim.models import Word2Vec 30 | from joblib import Parallel, delayed 31 | from tqdm import tqdm 32 | 33 | from ..alias import create_alias_table 34 | from ..utils import partition_dict, preprocess_nxgraph 35 | from ..walker import BiasedWalker 36 | 37 | 38 | class Struc2Vec(): 39 | def __init__(self, graph, walk_length=10, num_walks=100, workers=1, verbose=0, stay_prob=0.3, opt1_reduce_len=True, opt2_reduce_sim_calc=True, opt3_num_layers=None, temp_path='./temp_struc2vec/', reuse=False): 40 | self.graph = graph 41 | self.idx2node, self.node2idx = preprocess_nxgraph(graph) 42 | self.idx = list(range(len(self.idx2node))) 43 | 44 | self.opt1_reduce_len = opt1_reduce_len 45 | self.opt2_reduce_sim_calc = opt2_reduce_sim_calc 46 | self.opt3_num_layers = opt3_num_layers 47 | 48 | self.resue = reuse 49 | self.temp_path = temp_path 50 | 51 | if not os.path.exists(self.temp_path): 52 | os.mkdir(self.temp_path) 53 | if not reuse: 54 | shutil.rmtree(self.temp_path) 55 | os.mkdir(self.temp_path) 56 | 57 | self.create_context_graph(self.opt3_num_layers, workers, verbose) 58 | self.prepare_biased_walk() 59 | self.walker = BiasedWalker(self.idx2node, self.temp_path) 60 | self.sentences = self.walker.simulate_walks( 61 | num_walks, walk_length, stay_prob, workers, verbose) 62 | 63 | self._embeddings = {} 64 | 65 | def create_context_graph(self, max_num_layers, workers=1, verbose=0,): 66 | 67 | pair_distances = self._compute_structural_distance( 68 | max_num_layers, workers, verbose,) 69 | layers_adj, layers_distances = self._get_layer_rep(pair_distances) 70 | pd.to_pickle(layers_adj, self.temp_path + 'layers_adj.pkl') 71 | 72 | layers_accept, layers_alias = self._get_transition_probs( 73 | layers_adj, layers_distances) 74 | pd.to_pickle(layers_alias, self.temp_path + 'layers_alias.pkl') 75 | pd.to_pickle(layers_accept, self.temp_path + 'layers_accept.pkl') 76 | 77 | def prepare_biased_walk(self,): 78 | 79 | sum_weights = {} 80 | sum_edges = {} 81 | average_weight = {} 82 | gamma = {} 83 | layer = 0 84 | while (os.path.exists(self.temp_path+'norm_weights_distance-layer-' + str(layer)+'.pkl')): 85 | probs = pd.read_pickle( 86 | self.temp_path+'norm_weights_distance-layer-' + str(layer)+'.pkl') 87 | for v, list_weights in probs.items(): 88 | sum_weights.setdefault(layer, 0) 89 | sum_edges.setdefault(layer, 0) 90 | sum_weights[layer] += sum(list_weights) 91 | sum_edges[layer] += len(list_weights) 92 | 93 | average_weight[layer] = sum_weights[layer] / sum_edges[layer] 94 | 95 | gamma.setdefault(layer, {}) 96 | 97 | for v, list_weights in probs.items(): 98 | num_neighbours = 0 99 | for w in list_weights: 100 | if (w > average_weight[layer]): 101 | num_neighbours += 1 102 | gamma[layer][v] = num_neighbours 103 | 104 | layer += 1 105 | 106 | pd.to_pickle(average_weight, self.temp_path + 'average_weight') 107 | pd.to_pickle(gamma, self.temp_path + 'gamma.pkl') 108 | 109 | def train(self, embed_size=128, window_size=5, workers=3, iter=5): 110 | 111 | # pd.read_pickle(self.temp_path+'walks.pkl') 112 | sentences = self.sentences 113 | 114 | print("Learning representation...") 115 | model = Word2Vec(sentences, size=embed_size, window=window_size, min_count=0, hs=1, sg=1, workers=workers, 116 | iter=iter) 117 | print("Learning representation 
done!") 118 | self.w2v_model = model 119 | 120 | return model 121 | 122 | def get_embeddings(self,): 123 | if self.w2v_model is None: 124 | print("model not train") 125 | return {} 126 | 127 | self._embeddings = {} 128 | for word in self.graph.nodes(): 129 | self._embeddings[word] = self.w2v_model.wv[word] 130 | 131 | return self._embeddings 132 | 133 | def _compute_ordered_degreelist(self, max_num_layers): 134 | 135 | degreeList = {} 136 | vertices = self.idx # self.g.nodes() 137 | for v in vertices: 138 | degreeList[v] = self._get_order_degreelist_node(v, max_num_layers) 139 | return degreeList 140 | 141 | def _get_order_degreelist_node(self, root, max_num_layers=None): 142 | if max_num_layers is None: 143 | max_num_layers = float('inf') 144 | 145 | ordered_degree_sequence_dict = {} 146 | visited = [False] * len(self.graph.nodes()) 147 | queue = deque() 148 | level = 0 149 | queue.append(root) 150 | visited[root] = True 151 | 152 | while (len(queue) > 0 and level <= max_num_layers): 153 | 154 | count = len(queue) 155 | if self.opt1_reduce_len: 156 | degree_list = {} 157 | else: 158 | degree_list = [] 159 | while (count > 0): 160 | 161 | top = queue.popleft() 162 | node = self.idx2node[top] 163 | degree = len(self.graph[node]) 164 | 165 | if self.opt1_reduce_len: 166 | degree_list[degree] = degree_list.get(degree, 0) + 1 167 | else: 168 | degree_list.append(degree) 169 | 170 | for nei in self.graph[node]: 171 | nei_idx = self.node2idx[nei] 172 | if not visited[nei_idx]: 173 | visited[nei_idx] = True 174 | queue.append(nei_idx) 175 | count -= 1 176 | if self.opt1_reduce_len: 177 | orderd_degree_list = [(degree, freq) 178 | for degree, freq in degree_list.items()] 179 | orderd_degree_list.sort(key=lambda x: x[0]) 180 | else: 181 | orderd_degree_list = sorted(degree_list) 182 | ordered_degree_sequence_dict[level] = orderd_degree_list 183 | level += 1 184 | 185 | return ordered_degree_sequence_dict 186 | 187 | def _compute_structural_distance(self, max_num_layers, workers=1, verbose=0,): 188 | 189 | if os.path.exists(self.temp_path+'structural_dist.pkl'): 190 | structural_dist = pd.read_pickle( 191 | self.temp_path+'structural_dist.pkl') 192 | else: 193 | if self.opt1_reduce_len: 194 | dist_func = cost_max 195 | else: 196 | dist_func = cost 197 | 198 | if os.path.exists(self.temp_path + 'degreelist.pkl'): 199 | degreeList = pd.read_pickle(self.temp_path + 'degreelist.pkl') 200 | else: 201 | degreeList = self._compute_ordered_degreelist(max_num_layers) 202 | pd.to_pickle(degreeList, self.temp_path + 'degreelist.pkl') 203 | 204 | if self.opt2_reduce_sim_calc: 205 | degrees = self._create_vectors() 206 | degreeListsSelected = {} 207 | vertices = {} 208 | n_nodes = len(self.idx) 209 | for v in self.idx: # c:list of vertex 210 | nbs = get_vertices( 211 | v, len(self.graph[self.idx2node[v]]), degrees, n_nodes) 212 | vertices[v] = nbs # store nbs 213 | degreeListsSelected[v] = degreeList[v] # store dist 214 | for n in nbs: 215 | # store dist of nbs 216 | degreeListsSelected[n] = degreeList[n] 217 | else: 218 | vertices = {} 219 | for v in degreeList: 220 | vertices[v] = [vd for vd in degreeList.keys() if vd > v] 221 | 222 | results = Parallel(n_jobs=workers, verbose=verbose,)( 223 | delayed(compute_dtw_dist)(part_list, degreeList, dist_func) for part_list in partition_dict(vertices, workers)) 224 | dtw_dist = dict(ChainMap(*results)) 225 | 226 | structural_dist = convert_dtw_struc_dist(dtw_dist) 227 | pd.to_pickle(structural_dist, self.temp_path + 228 | 'structural_dist.pkl') 229 | 230 | return 
structural_dist 231 | 232 | def _create_vectors(self): 233 | degrees = {} # sotre v list of degree 234 | degrees_sorted = set() # store degree 235 | G = self.graph 236 | for v in self.idx: 237 | degree = len(G[self.idx2node[v]]) 238 | degrees_sorted.add(degree) 239 | if (degree not in degrees): 240 | degrees[degree] = {} 241 | degrees[degree]['vertices'] = [] 242 | degrees[degree]['vertices'].append(v) 243 | degrees_sorted = np.array(list(degrees_sorted), dtype='int') 244 | degrees_sorted = np.sort(degrees_sorted) 245 | 246 | l = len(degrees_sorted) 247 | for index, degree in enumerate(degrees_sorted): 248 | if (index > 0): 249 | degrees[degree]['before'] = degrees_sorted[index - 1] 250 | if (index < (l - 1)): 251 | degrees[degree]['after'] = degrees_sorted[index + 1] 252 | 253 | return degrees 254 | 255 | def _get_layer_rep(self, pair_distances): 256 | layer_distances = {} 257 | layer_adj = {} 258 | for v_pair, layer_dist in pair_distances.items(): 259 | for layer, distance in layer_dist.items(): 260 | vx = v_pair[0] 261 | vy = v_pair[1] 262 | 263 | layer_distances.setdefault(layer, {}) 264 | layer_distances[layer][vx, vy] = distance 265 | 266 | layer_adj.setdefault(layer, {}) 267 | layer_adj[layer].setdefault(vx, []) 268 | layer_adj[layer].setdefault(vy, []) 269 | layer_adj[layer][vx].append(vy) 270 | layer_adj[layer][vy].append(vx) 271 | 272 | return layer_adj, layer_distances 273 | 274 | def _get_transition_probs(self, layers_adj, layers_distances): 275 | layers_alias = {} 276 | layers_accept = {} 277 | 278 | for layer in layers_adj: 279 | 280 | neighbors = layers_adj[layer] 281 | layer_distances = layers_distances[layer] 282 | node_alias_dict = {} 283 | node_accept_dict = {} 284 | norm_weights = {} 285 | 286 | for v, neighbors in neighbors.items(): 287 | e_list = [] 288 | sum_w = 0.0 289 | 290 | for n in neighbors: 291 | if (v, n) in layer_distances: 292 | wd = layer_distances[v, n] 293 | else: 294 | wd = layer_distances[n, v] 295 | w = np.exp(-float(wd)) 296 | e_list.append(w) 297 | sum_w += w 298 | 299 | e_list = [x / sum_w for x in e_list] 300 | norm_weights[v] = e_list 301 | accept, alias = create_alias_table(e_list) 302 | node_alias_dict[v] = alias 303 | node_accept_dict[v] = accept 304 | 305 | pd.to_pickle( 306 | norm_weights, self.temp_path + 'norm_weights_distance-layer-' + str(layer)+'.pkl') 307 | 308 | layers_alias[layer] = node_alias_dict 309 | layers_accept[layer] = node_accept_dict 310 | 311 | return layers_accept, layers_alias 312 | 313 | 314 | def cost(a, b): 315 | ep = 0.5 316 | m = max(a, b) + ep 317 | mi = min(a, b) + ep 318 | return ((m / mi) - 1) 319 | 320 | 321 | def cost_min(a, b): 322 | ep = 0.5 323 | m = max(a[0], b[0]) + ep 324 | mi = min(a[0], b[0]) + ep 325 | return ((m / mi) - 1) * min(a[1], b[1]) 326 | 327 | 328 | def cost_max(a, b): 329 | ep = 0.5 330 | m = max(a[0], b[0]) + ep 331 | mi = min(a[0], b[0]) + ep 332 | return ((m / mi) - 1) * max(a[1], b[1]) 333 | 334 | 335 | def convert_dtw_struc_dist(distances, startLayer=1): 336 | """ 337 | 338 | :param distances: dict of dict 339 | :param startLayer: 340 | :return: 341 | """ 342 | for vertices, layers in distances.items(): 343 | keys_layers = sorted(layers.keys()) 344 | startLayer = min(len(keys_layers), startLayer) 345 | for layer in range(0, startLayer): 346 | keys_layers.pop(0) 347 | 348 | for layer in keys_layers: 349 | layers[layer] += layers[layer - 1] 350 | return distances 351 | 352 | 353 | def get_vertices(v, degree_v, degrees, n_nodes): 354 | a_vertices_selected = 2 * math.log(n_nodes, 2) 355 | 
vertices = [] 356 | try: 357 | c_v = 0 358 | 359 | for v2 in degrees[degree_v]['vertices']: 360 | if (v != v2): 361 | vertices.append(v2) # same degree 362 | c_v += 1 363 | if (c_v > a_vertices_selected): 364 | raise StopIteration 365 | 366 | if ('before' not in degrees[degree_v]): 367 | degree_b = -1 368 | else: 369 | degree_b = degrees[degree_v]['before'] 370 | if ('after' not in degrees[degree_v]): 371 | degree_a = -1 372 | else: 373 | degree_a = degrees[degree_v]['after'] 374 | if (degree_b == -1 and degree_a == -1): 375 | raise StopIteration # not anymore v 376 | degree_now = verifyDegrees(degrees, degree_v, degree_a, degree_b) 377 | # nearest valid degree 378 | while True: 379 | for v2 in degrees[degree_now]['vertices']: 380 | if (v != v2): 381 | vertices.append(v2) 382 | c_v += 1 383 | if (c_v > a_vertices_selected): 384 | raise StopIteration 385 | 386 | if (degree_now == degree_b): 387 | if ('before' not in degrees[degree_b]): 388 | degree_b = -1 389 | else: 390 | degree_b = degrees[degree_b]['before'] 391 | else: 392 | if ('after' not in degrees[degree_a]): 393 | degree_a = -1 394 | else: 395 | degree_a = degrees[degree_a]['after'] 396 | 397 | if (degree_b == -1 and degree_a == -1): 398 | raise StopIteration 399 | 400 | degree_now = verifyDegrees(degrees, degree_v, degree_a, degree_b) 401 | 402 | except StopIteration: 403 | return list(vertices) 404 | 405 | return list(vertices) 406 | 407 | 408 | def verifyDegrees(degrees, degree_v_root, degree_a, degree_b): 409 | 410 | if(degree_b == -1): 411 | degree_now = degree_a 412 | elif(degree_a == -1): 413 | degree_now = degree_b 414 | elif(abs(degree_b - degree_v_root) < abs(degree_a - degree_v_root)): 415 | degree_now = degree_b 416 | else: 417 | degree_now = degree_a 418 | 419 | return degree_now 420 | 421 | 422 | def compute_dtw_dist(part_list, degreeList, dist_func): 423 | dtw_dist = {} 424 | for v1, nbs in part_list: 425 | lists_v1 = degreeList[v1] # lists_v1 :orderd degree list of v1 426 | for v2 in nbs: 427 | lists_v2 = degreeList[v2] # lists_v1 :orderd degree list of v2 428 | max_layer = min(len(lists_v1), len(lists_v2)) # valid layer 429 | dtw_dist[v1, v2] = {} 430 | for layer in range(0, max_layer): 431 | dist, path = fastdtw( 432 | lists_v1[layer], lists_v2[layer], radius=1, dist=dist_func) 433 | dtw_dist[v1, v2][layer] = dist 434 | return dtw_dist 435 | -------------------------------------------------------------------------------- /lib_node_embedding/ge/utils.py: -------------------------------------------------------------------------------- 1 | def preprocess_nxgraph(graph): 2 | node2idx = {} 3 | idx2node = [] 4 | node_size = 0 5 | for node in graph.nodes(): 6 | node2idx[node] = node_size 7 | idx2node.append(node) 8 | node_size += 1 9 | return idx2node, node2idx 10 | 11 | 12 | def partition_dict(vertices, workers): 13 | batch_size = (len(vertices) - 1) // workers + 1 14 | part_list = [] 15 | part = [] 16 | count = 0 17 | for v1, nbs in vertices.items(): 18 | part.append((v1, nbs)) 19 | count += 1 20 | if count % batch_size == 0: 21 | part_list.append(part) 22 | part = [] 23 | if len(part) > 0: 24 | part_list.append(part) 25 | return part_list 26 | 27 | 28 | def partition_list(vertices, workers): 29 | batch_size = (len(vertices) - 1) // workers + 1 30 | part_list = [] 31 | part = [] 32 | count = 0 33 | for v1, nbs in enumerate(vertices): 34 | part.append((v1, nbs)) 35 | count += 1 36 | if count % batch_size == 0: 37 | part_list.append(part) 38 | part = [] 39 | if len(part) > 0: 40 | part_list.append(part) 41 | return 
part_list 42 | 43 | 44 | def partition_num(num, workers): 45 | if num % workers == 0: 46 | return [num//workers]*workers 47 | else: 48 | return [num//workers]*workers + [num % workers] 49 | -------------------------------------------------------------------------------- /lib_node_embedding/ge/walker.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import math 3 | import random 4 | 5 | import numpy as np 6 | import pandas as pd 7 | from joblib import Parallel, delayed 8 | from tqdm import trange 9 | 10 | from .alias import alias_sample, create_alias_table 11 | from .utils import partition_num 12 | 13 | 14 | class RandomWalker: 15 | def __init__(self, G, p=1, q=1, use_rejection_sampling=0): 16 | """ 17 | :param G: 18 | :param p: Return parameter,controls the likelihood of immediately revisiting a node in the walk. 19 | :param q: In-out parameter,allows the search to differentiate between “inward” and “outward” nodes 20 | :param use_rejection_sampling: Whether to use the rejection sampling strategy in node2vec. 21 | """ 22 | self.G = G 23 | self.p = p 24 | self.q = q 25 | self.use_rejection_sampling = use_rejection_sampling 26 | 27 | def deepwalk_walk(self, walk_length, start_node): 28 | 29 | walk = [start_node] 30 | 31 | while len(walk) < walk_length: 32 | cur = walk[-1] 33 | cur_nbrs = list(self.G.neighbors(cur)) 34 | if len(cur_nbrs) > 0: 35 | walk.append(random.choice(cur_nbrs)) 36 | else: 37 | break 38 | return walk 39 | 40 | def node2vec_walk(self, walk_length, start_node): 41 | 42 | G = self.G 43 | alias_nodes = self.alias_nodes 44 | alias_edges = self.alias_edges 45 | 46 | walk = [start_node] 47 | 48 | while len(walk) < walk_length: 49 | cur = walk[-1] 50 | cur_nbrs = list(G.neighbors(cur)) 51 | if len(cur_nbrs) > 0: 52 | if len(walk) == 1: 53 | walk.append( 54 | cur_nbrs[alias_sample(alias_nodes[cur][0], alias_nodes[cur][1])]) 55 | else: 56 | prev = walk[-2] 57 | edge = (prev, cur) 58 | next_node = cur_nbrs[alias_sample(alias_edges[edge][0], 59 | alias_edges[edge][1])] 60 | walk.append(next_node) 61 | else: 62 | break 63 | 64 | return walk 65 | 66 | def node2vec_walk2(self, walk_length, start_node): 67 | """ 68 | Reference: 69 | KnightKing: A Fast Distributed Graph Random Walk Engine 70 | http://madsys.cs.tsinghua.edu.cn/publications/SOSP19-yang.pdf 71 | """ 72 | 73 | def rejection_sample(inv_p, inv_q, nbrs_num): 74 | upper_bound = max(1.0, max(inv_p, inv_q)) 75 | lower_bound = min(1.0, min(inv_p, inv_q)) 76 | shatter = 0 77 | second_upper_bound = max(1.0, inv_q) 78 | if (inv_p > second_upper_bound): 79 | shatter = second_upper_bound / nbrs_num 80 | upper_bound = second_upper_bound + shatter 81 | return upper_bound, lower_bound, shatter 82 | 83 | G = self.G 84 | alias_nodes = self.alias_nodes 85 | inv_p = 1.0 / self.p 86 | inv_q = 1.0 / self.q 87 | walk = [start_node] 88 | while len(walk) < walk_length: 89 | cur = walk[-1] 90 | cur_nbrs = list(G.neighbors(cur)) 91 | if len(cur_nbrs) > 0: 92 | if len(walk) == 1: 93 | walk.append( 94 | cur_nbrs[alias_sample(alias_nodes[cur][0], alias_nodes[cur][1])]) 95 | else: 96 | upper_bound, lower_bound, shatter = rejection_sample( 97 | inv_p, inv_q, len(cur_nbrs)) 98 | prev = walk[-2] 99 | prev_nbrs = set(G.neighbors(prev)) 100 | while True: 101 | prob = random.random() * upper_bound 102 | if (prob + shatter >= upper_bound): 103 | next_node = prev 104 | break 105 | next_node = cur_nbrs[alias_sample( 106 | alias_nodes[cur][0], alias_nodes[cur][1])] 107 | if (prob < lower_bound): 108 | 
break 109 | if (prob < inv_p and next_node == prev): 110 | break 111 | _prob = 1.0 if next_node in prev_nbrs else inv_q 112 | if (prob < _prob): 113 | break 114 | walk.append(next_node) 115 | else: 116 | break 117 | return walk 118 | 119 | def simulate_walks(self, num_walks, walk_length, workers=1, verbose=0): 120 | 121 | G = self.G 122 | 123 | nodes = list(G.nodes()) 124 | 125 | results = Parallel(n_jobs=workers, verbose=verbose, )( 126 | delayed(self._simulate_walks)(nodes, num, walk_length) for num in 127 | partition_num(num_walks, workers)) 128 | 129 | walks = list(itertools.chain(*results)) 130 | 131 | return walks 132 | 133 | def _simulate_walks(self, nodes, num_walks, walk_length,): 134 | walks = [] 135 | for _ in range(num_walks): 136 | random.shuffle(nodes) 137 | for v in nodes: 138 | if self.p == 1 and self.q == 1: 139 | walks.append(self.deepwalk_walk( 140 | walk_length=walk_length, start_node=v)) 141 | elif self.use_rejection_sampling: 142 | walks.append(self.node2vec_walk2( 143 | walk_length=walk_length, start_node=v)) 144 | else: 145 | walks.append(self.node2vec_walk( 146 | walk_length=walk_length, start_node=v)) 147 | return walks 148 | 149 | def get_alias_edge(self, t, v): 150 | """ 151 | Compute the unnormalized transition probabilities between node v and its neighbors, given the previously visited node t. 152 | :param t: the previously visited node 153 | :param v: the current node 154 | :return: alias table (accept, alias) for sampling the next node from v 155 | """ 156 | G = self.G 157 | p = self.p 158 | q = self.q 159 | 160 | unnormalized_probs = [] 161 | for x in G.neighbors(v): 162 | weight = G[v][x].get('weight', 1.0) # w_vx 163 | if x == t: # d_tx == 0 164 | unnormalized_probs.append(weight/p) 165 | elif G.has_edge(x, t): # d_tx == 1 166 | unnormalized_probs.append(weight) 167 | else: # d_tx > 1 168 | unnormalized_probs.append(weight/q) 169 | norm_const = sum(unnormalized_probs) 170 | normalized_probs = [ 171 | float(u_prob)/norm_const for u_prob in unnormalized_probs] 172 | 173 | return create_alias_table(normalized_probs) 174 | 175 | def preprocess_transition_probs(self): 176 | """ 177 | Preprocessing of transition probabilities for guiding the random walks. 
178 | """ 179 | G = self.G 180 | alias_nodes = {} 181 | for node in G.nodes(): 182 | unnormalized_probs = [G[node][nbr].get('weight', 1.0) 183 | for nbr in G.neighbors(node)] 184 | norm_const = sum(unnormalized_probs) 185 | normalized_probs = [ 186 | float(u_prob)/norm_const for u_prob in unnormalized_probs] 187 | alias_nodes[node] = create_alias_table(normalized_probs) 188 | 189 | if not self.use_rejection_sampling: 190 | alias_edges = {} 191 | 192 | for edge in G.edges(): 193 | alias_edges[edge] = self.get_alias_edge(edge[0], edge[1]) 194 | if not G.is_directed(): 195 | alias_edges[(edge[1], edge[0])] = self.get_alias_edge(edge[1], edge[0]) 196 | self.alias_edges = alias_edges 197 | 198 | self.alias_nodes = alias_nodes 199 | return 200 | 201 | 202 | class BiasedWalker: 203 | def __init__(self, idx2node, temp_path): 204 | 205 | self.idx2node = idx2node 206 | self.idx = list(range(len(self.idx2node))) 207 | self.temp_path = temp_path 208 | pass 209 | 210 | def simulate_walks(self, num_walks, walk_length, stay_prob=0.3, workers=1, verbose=0): 211 | 212 | layers_adj = pd.read_pickle(self.temp_path+'layers_adj.pkl') 213 | layers_alias = pd.read_pickle(self.temp_path+'layers_alias.pkl') 214 | layers_accept = pd.read_pickle(self.temp_path+'layers_accept.pkl') 215 | gamma = pd.read_pickle(self.temp_path+'gamma.pkl') 216 | walks = [] 217 | initialLayer = 0 218 | 219 | nodes = self.idx # list(self.g.nodes()) 220 | 221 | results = Parallel(n_jobs=workers, verbose=verbose, )( 222 | delayed(self._simulate_walks)(nodes, num, walk_length, stay_prob, layers_adj, layers_accept, layers_alias, gamma) for num in 223 | partition_num(num_walks, workers)) 224 | 225 | walks = list(itertools.chain(*results)) 226 | return walks 227 | 228 | def _simulate_walks(self, nodes, num_walks, walk_length, stay_prob, layers_adj, layers_accept, layers_alias, gamma): 229 | walks = [] 230 | for _ in range(num_walks): 231 | random.shuffle(nodes) 232 | for v in nodes: 233 | walks.append(self._exec_random_walk(layers_adj, layers_accept, layers_alias, 234 | v, walk_length, gamma, stay_prob)) 235 | return walks 236 | 237 | def _exec_random_walk(self, graphs, layers_accept, layers_alias, v, walk_length, gamma, stay_prob=0.3): 238 | initialLayer = 0 239 | layer = initialLayer 240 | 241 | path = [] 242 | path.append(self.idx2node[v]) 243 | 244 | while len(path) < walk_length: 245 | r = random.random() 246 | if(r < stay_prob): # same layer 247 | v = chooseNeighbor(v, graphs, layers_alias, 248 | layers_accept, layer) 249 | path.append(self.idx2node[v]) 250 | else: # different layer 251 | r = random.random() 252 | try: 253 | x = math.log(gamma[layer][v] + math.e) 254 | p_moveup = (x / (x + 1)) 255 | except: 256 | print(layer, v) 257 | raise ValueError() 258 | 259 | if(r > p_moveup): 260 | if(layer > initialLayer): 261 | layer = layer - 1 262 | else: 263 | if((layer + 1) in graphs and v in graphs[layer + 1]): 264 | layer = layer + 1 265 | 266 | return path 267 | 268 | 269 | def chooseNeighbor(v, graphs, layers_alias, layers_accept, layer): 270 | 271 | v_list = graphs[layer][v] 272 | 273 | idx = alias_sample(layers_accept[layer][v], layers_alias[layer][v]) 274 | v = v_list[idx] 275 | 276 | return v 277 | -------------------------------------------------------------------------------- /lib_node_embedding/node_embedding.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import config 4 | from lib_gnn_model.graphsage.graphsage import SAGE 5 | from lib_dataset.data_store import 
DataStore 6 | 7 | 8 | class NodeEmbedding: 9 | def __init__(self, args, graph, data): 10 | super(NodeEmbedding, self) 11 | 12 | self.logger = logging.getLogger(__name__) 13 | self.args = args 14 | self.graph = graph 15 | self.data = data 16 | 17 | self.data_store = DataStore(self.args) 18 | 19 | def sage_encoder(self): 20 | if self.args['is_gen_embedding']: 21 | self.logger.info("generating node embeddings with GraphSage...") 22 | 23 | node_to_embedding = {} 24 | # run sage 25 | self.target_model = SAGE(self.data.num_features, len(self.data.y.unique()), self.data) 26 | 27 | # self.target_model.train_model(50) 28 | 29 | # load a pretrained GNN model for generating node embeddings 30 | target_model_name = '_'.join((self.args['target_model'], 'random_1', 31 | str(self.args['shard_size_delta']), 32 | str(self.args['ratio_deleted_edges']), '0_0_1')) 33 | target_model_file = config.MODEL_PATH + self.args['dataset_name'] + '/' + target_model_name 34 | self.target_model.load_model(target_model_file) 35 | 36 | logits = self.target_model.generate_embeddings().detach().cpu().numpy() 37 | for node in self.graph.nodes: 38 | node_to_embedding[node] = logits[node] 39 | 40 | self.data_store.save_embeddings(node_to_embedding) 41 | else: 42 | node_to_embedding = self.data_store.load_embeddings() 43 | 44 | return node_to_embedding 45 | -------------------------------------------------------------------------------- /lib_utils/logger.py: -------------------------------------------------------------------------------- 1 | from texttable import Texttable 2 | 3 | def tab_printer(args): 4 | """ 5 | Function to print the logs in a nice tabular format. 6 | :param args: Parameters used for the model. 7 | """ 8 | # args = vars(args) 9 | keys = sorted(args.keys()) 10 | t = Texttable() 11 | t.add_rows([["Parameter", "Value"]] + [[k.replace("_"," ").capitalize(),args[k]] for k in keys]) 12 | print(t.draw()) -------------------------------------------------------------------------------- /lib_utils/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import errno 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import networkx as nx 7 | import torch 8 | from scipy.sparse import coo_matrix 9 | from tqdm import tqdm 10 | 11 | 12 | def graph_reader(path): 13 | """ 14 | Function to read the graph from the path. 15 | :param path: Path to the edge list. 16 | :return graph: NetworkX object returned. 17 | """ 18 | graph = nx.from_edgelist(pd.read_csv(path).values.tolist()) 19 | return graph 20 | 21 | 22 | def feature_reader(path): 23 | """ 24 | Reading the sparse feature matrix stored as csv from the disk. 25 | :param path: Path to the csv file. 26 | :return features: Dense matrix of features. 27 | """ 28 | features = pd.read_csv(path) 29 | node_index = features["node_id"].values.tolist() 30 | feature_index = features["feature_id"].values.tolist() 31 | feature_values = features["value"].values.tolist() 32 | node_count = max(node_index) + 1 33 | feature_count = max(feature_index) + 1 34 | features = coo_matrix((feature_values, (node_index, feature_index)), shape=(node_count, feature_count)).toarray() 35 | return features 36 | 37 | 38 | def target_reader(path): 39 | """ 40 | Reading the target vector from disk. 41 | :param path: Path to the target. 42 | :return target: Target vector. 
43 | """ 44 | target = np.array(pd.read_csv(path)["target"]).reshape(-1, 1) 45 | return target 46 | 47 | 48 | def make_adjacency(graph, max_degree, sel=None): 49 | all_nodes = np.array(graph.nodes()) 50 | 51 | # Initialize w/ links to a dummy node 52 | n_nodes = len(all_nodes) 53 | adj = (np.zeros((n_nodes + 1, max_degree)) + n_nodes).astype(int) 54 | 55 | if sel is not None: 56 | # only look at nodes in training set 57 | all_nodes = all_nodes[sel] 58 | 59 | for node in tqdm(all_nodes): 60 | neibs = np.array(list(graph.neighbors(node))) 61 | 62 | if sel is not None: 63 | neibs = neibs[sel[neibs]] 64 | 65 | if len(neibs) > 0: 66 | if len(neibs) > max_degree: 67 | neibs = np.random.choice(neibs, max_degree, replace=False) 68 | elif len(neibs) < max_degree: 69 | extra = np.random.choice(neibs, max_degree - neibs.shape[0], replace=True) 70 | neibs = np.concatenate([neibs, extra]) 71 | adj[node, :] = neibs 72 | 73 | return adj 74 | 75 | 76 | def connected_component_subgraphs(graph): 77 | """ 78 | Find all connected subgraphs in a networkx Graph 79 | 80 | Args: 81 | graph (Graph): A networkx Graph 82 | 83 | Yields: 84 | generator: A subgraph generator 85 | """ 86 | for c in nx.connected_components(graph): 87 | yield graph.subgraph(c) 88 | 89 | 90 | def check_exist(file_name): 91 | if not os.path.exists(os.path.dirname(file_name)): 92 | try: 93 | os.makedirs(os.path.dirname(file_name)) 94 | except OSError as exc: # Guard against race condition 95 | if exc.errno != errno.EEXIST: 96 | raise 97 | 98 | 99 | def filter_edge_index(edge_index, node_indices, reindex=True): 100 | assert np.all(np.diff(node_indices) >= 0), 'node_indices must be sorted' 101 | if isinstance(edge_index, torch.Tensor): 102 | edge_index = edge_index.cpu() 103 | 104 | node_index = np.isin(edge_index, node_indices) 105 | col_index = np.nonzero(np.logical_and(node_index[0], node_index[1]))[0] 106 | edge_index = edge_index[:, col_index] 107 | 108 | if reindex: 109 | return np.searchsorted(node_indices, edge_index) 110 | else: 111 | return edge_index 112 | 113 | 114 | def pyg_to_nx(data): 115 | """ 116 | Convert a torch geometric Data to networkx Graph. 117 | 118 | Args: 119 | data (Data): A torch geometric Data. 120 | 121 | Returns: 122 | Graph: A networkx Graph. 123 | """ 124 | graph = nx.Graph() 125 | graph.add_nodes_from(np.arange(data.num_nodes)) 126 | edge_index = data.edge_index.numpy() 127 | 128 | for u, v in np.transpose(edge_index): 129 | graph.add_edge(u, v) 130 | 131 | return graph 132 | 133 | 134 | def edge_index_to_nx(edge_index, num_nodes): 135 | """ 136 | Convert a torch geometric Data to networkx Graph by edge_index. 137 | Args: 138 | edge_index (Data.edge_index): A torch geometric Data. 139 | num_nodes (int): Number of nodes in a graph. 140 | Returns: 141 | Graph: networkx Graph 142 | """ 143 | graph = nx.Graph() 144 | graph.add_nodes_from(np.arange(num_nodes)) 145 | edge_index = edge_index.numpy() 146 | 147 | for u, v in np.transpose(edge_index): 148 | graph.add_edge(u, v) 149 | 150 | return graph 151 | 152 | 153 | def filter_edge_index_1(data, node_indices): 154 | """ 155 | Remove unnecessary edges from a torch geometric Data, only keep the edges between node_indices. 156 | Args: 157 | data (Data): A torch geometric Data. 158 | node_indices (list): A list of nodes to be deleted from data. 159 | 160 | Returns: 161 | data.edge_index: The new edge_index after removing the node_indices. 
162 | """ 163 | if isinstance(data.edge_index, torch.Tensor): 164 | data.edge_index = data.edge_index.cpu() 165 | 166 | edge_index = data.edge_index 167 | node_index = np.isin(edge_index, node_indices) 168 | 169 | col_index = np.nonzero(np.logical_and(node_index[0], node_index[1]))[0] 170 | edge_index = data.edge_index[:, col_index] 171 | 172 | return np.searchsorted(node_indices, edge_index) 173 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import torch 4 | 5 | from exp.exp_graph_partition import ExpGraphPartition 6 | from exp.exp_node_edge_unlearning import ExpNodeEdgeUnlearning 7 | from exp.exp_unlearning import ExpUnlearning 8 | from exp.exp_attack_unlearning import ExpAttackUnlearning 9 | from parameter_parser import parameter_parser 10 | 11 | 12 | def config_logger(save_name): 13 | # create logger 14 | logger = logging.getLogger() 15 | logger.setLevel(logging.DEBUG) 16 | formatter = logging.Formatter('%(levelname)s:%(asctime)s: - %(name)s - : %(message)s') 17 | 18 | # create console handler 19 | ch = logging.StreamHandler() 20 | ch.setLevel(logging.DEBUG) 21 | ch.setFormatter(formatter) 22 | logger.addHandler(ch) 23 | 24 | 25 | def main(args, exp): 26 | # config the logger 27 | logger_name = "_".join((exp, args['dataset_name'], args['partition_method'], str(args['num_shards']), str(args['test_ratio']))) 28 | config_logger(logger_name) 29 | logging.info(logger_name) 30 | 31 | torch.set_num_threads(args["num_threads"]) 32 | torch.cuda.set_device(args["cuda"]) 33 | os.environ["CUDA_VISIBLE_DEVICES"] = str(args["cuda"]) 34 | 35 | # subroutine entry for different methods 36 | if exp == 'partition': 37 | ExpGraphPartition(args) 38 | elif exp == 'unlearning': 39 | ExpUnlearning(args) 40 | elif exp == 'node_edge_unlearning': 41 | ExpNodeEdgeUnlearning(args) 42 | elif exp == 'attack_unlearning': 43 | ExpAttackUnlearning(args) 44 | else: 45 | raise Exception('unsupported attack') 46 | 47 | 48 | if __name__ == "__main__": 49 | args = parameter_parser() 50 | 51 | main(args, args['exp']) 52 | -------------------------------------------------------------------------------- /parameter_parser.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | 4 | def str2bool(v): 5 | if isinstance(v, bool): 6 | return v 7 | if v.lower() in ('yes', 'true', 't', 'y', '1'): 8 | return True 9 | elif v.lower() in ('no', 'false', 'f', 'n', '0'): 10 | return False 11 | else: 12 | raise argparse.ArgumentTypeError('Boolean value expected.') 13 | 14 | 15 | def parameter_parser(): 16 | """ 17 | A method to parse up command line parameters. 18 | The default hyper-parameters give a good quality representation without grid search. 
19 | """ 20 | parser = argparse.ArgumentParser() 21 | 22 | ######################### general parameters ################################ 23 | parser.add_argument('--is_vary', type=bool, default=False, help='control whether to use multiprocess') 24 | parser.add_argument('--dataset_name', type=str, default='citeseer', 25 | choices=["cora", "citeseer", "pubmed", "Coauthor_CS", "Coauthor_Phys"]) 26 | 27 | parser.add_argument('--exp', type=str, default='attack_unlearning', 28 | choices=["partition", "unlearning", "node_edge_unlearning", "attack_unlearning"]) 29 | parser.add_argument('--cuda', type=int, default=3, help='specify gpu') 30 | parser.add_argument('--num_threads', type=int, default=1) 31 | 32 | parser.add_argument('--is_upload', type=str2bool, default=True) 33 | parser.add_argument('--database_name', type=str, default="unlearning_dependant", 34 | choices=['unlearning_dependant', 'unlearning_adaptive', 35 | 'unlearning_graph_structure', 'gnn_unlearning_shards', 36 | 'unlearning_delta_plot', 'gnn_unlearning_utility', 37 | 'unlearning_ratio', 'unlearning_partition_baseline', 38 | 'unlearning_ratio', 'attack_unlearning']) 39 | 40 | ########################## graph partition parameters ###################### 41 | parser.add_argument('--is_split', type=str2bool, default=True) 42 | parser.add_argument('--test_ratio', type=float, default=0.1) 43 | parser.add_argument('--use_test_neighbors', type=str2bool, default=True) 44 | parser.add_argument('--is_partition', type=str2bool, default=True) 45 | parser.add_argument('--is_prune', type=str2bool, default=False) 46 | parser.add_argument('--num_shards', type=int, default=10) 47 | parser.add_argument('--is_constrained', type=str2bool, default=True) 48 | parser.add_argument('--is_gen_embedding', type=str2bool, default=True) 49 | 50 | parser.add_argument('--partition_method', type=str, default='sage_km', 51 | choices=["sage_km", "random", "lpa", "metis", "lpa_base", "sage_km_base"]) 52 | parser.add_argument('--terminate_delta', type=int, default=0) 53 | parser.add_argument('--shard_size_delta', type=float, default=0.005) 54 | 55 | ########################## unlearning parameters ########################### 56 | parser.add_argument('--repartition', type=str2bool, default=False) 57 | 58 | ########################## training parameters ########################### 59 | parser.add_argument('--is_train_target_model', type=str2bool, default=True) 60 | parser.add_argument('--is_use_node_feature', type=str2bool, default=False) 61 | parser.add_argument('--is_use_batch', type=str2bool, default=True, help="Use batch train GNN models.") 62 | parser.add_argument('--target_model', type=str, default='GAT', choices=["SAGE", "GAT", 'MLP', "GCN", "GIN"]) 63 | parser.add_argument('--train_lr', type=float, default=0.01) 64 | parser.add_argument('--train_weight_decay', type=float, default=0) 65 | parser.add_argument('--num_epochs', type=int, default=100) 66 | parser.add_argument('--num_runs', type=int, default=1) 67 | parser.add_argument('--batch_size', type=int, default=512) 68 | parser.add_argument('--test_batch_size', type=int, default=64) 69 | parser.add_argument('--aggregator', type=str, default='mean', choices=['mean', 'majority', 'optimal']) 70 | 71 | parser.add_argument('--opt_lr', type=float, default=0.001) 72 | parser.add_argument('--opt_decay', type=float, default=0.0001) 73 | parser.add_argument('--opt_num_epochs', type=int, default=50) 74 | parser.add_argument('--unlearning_request', type=str, default='random', choices=['random', 'adaptive', 'dependant', 
'top1', 'last5']) 75 | 76 | ########################## analysis parameters ################################### 77 | parser.add_argument('--num_unlearned_nodes', type=int, default=1) 78 | parser.add_argument('--ratio_unlearned_nodes', type=float, default=0.005) 79 | parser.add_argument('--num_unlearned_edges', type=int, default=1) 80 | parser.add_argument('--ratio_deleted_edges', type=float, default=0.9) 81 | parser.add_argument('--num_opt_samples', type=int, default=1000) 82 | 83 | args = vars(parser.parse_args()) 84 | 85 | return args 86 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | ## Introduction 2 | 3 | This code is an implementation of graph unlearning (the GraphEraser framework). 4 | 5 | #### Code Structure 6 | 7 | ``` 8 | . 9 | ├── config.py 10 | ├── exp 11 | ├── lib_aggregator 12 | ├── lib_dataset 13 | ├── lib_gnn_model 14 | ├── lib_graph_partition 15 | ├── lib_node_embedding 16 | ├── lib_utils 17 | ├── main.py 18 | ├── parameter_parser.py 19 | └── readme.md 20 | ``` 21 | 22 | #### Environment Preparation 23 | 24 | ```bash 25 | conda create --name graph_unlearning python=3.6.10 26 | conda activate graph_unlearning 27 | pip install scikit-learn ogb infomap seaborn munkres gensim fastdtw leidenalg cvxpy pymetis mysqlclient MulticoreTSNE cupy-cuda111 tensorflow 28 | pip install torch==1.9.0+cu111 torchvision==0.10.0+cu111 torchaudio==0.9.0 -f https://download.pytorch.org/whl/torch_stable.html 29 | TORCH="1.9.0" 30 | CUDA="cu111" 31 | pip install torch-scatter -f https://data.pyg.org/whl/torch-${TORCH}+${CUDA}.html 32 | pip install torch-sparse -f https://data.pyg.org/whl/torch-${TORCH}+${CUDA}.html 33 | pip install torch-cluster -f https://data.pyg.org/whl/torch-${TORCH}+${CUDA}.html 34 | pip install torch-spline-conv -f https://data.pyg.org/whl/torch-${TORCH}+${CUDA}.html 35 | pip install torch-geometric 36 | ``` 37 | 38 | #### GraphEraser Framework 39 | 40 | ###### Graph Partition 41 | 42 | See more parameter settings in parameter_parser.py at ***##graph partition parameters##***. 43 | 44 | ```bash 45 | $ python main.py --exp partition --is_partition true --partition_method lpa --is_constrained true 46 | 47 | $ python main.py --exp partition --is_partition true --partition_method sage_km --is_constrained true 48 | ``` 49 | 50 | ###### Aggregation 51 | 52 | See more parameter settings in parameter_parser.py at ***##training parameters##***. 53 | 54 | ```bash 55 | Use '--aggregator' to choose the desired aggregation method from ['mean', 'majority', 'optimal']. 56 | 57 | ``` 58 | 59 | ###### Unlearning 60 | 61 | See more parameter settings in parameter_parser.py at ***##unlearning parameters##***. 62 | 63 | ```bash 64 | Use '--repartition' to decide whether to re-partition the graph when responding to unlearning requests. 65 | 66 | Use '--unlearning_request' to choose the unlearning request distribution from ['random', 'adaptive', 'dependant', 'top1', 'last5']. 67 | ``` 68 | --------------------------------------------------------------------------------
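###### Example: end-to-end run (sketch)

The commands below are a minimal sketch of how the three stages can be chained together via `main.py`, using only flags defined in `parameter_parser.py`. The dataset name, shard count, and GPU id are placeholder values chosen for illustration, and the exact flag combination may need adjusting for your setup.

```bash
# 1. partition the training graph into shards (dispatches to ExpGraphPartition)
python main.py --exp partition --dataset_name cora --partition_method sage_km --num_shards 10 --cuda 0

# 2. train the shard models and aggregate them (dispatches to ExpUnlearning)
python main.py --exp unlearning --dataset_name cora --partition_method sage_km --num_shards 10 --aggregator optimal --cuda 0

# 3. respond to unlearning requests and evaluate the attack (dispatches to ExpAttackUnlearning)
python main.py --exp attack_unlearning --dataset_name cora --partition_method sage_km --num_shards 10 --repartition false --cuda 0
```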