├── config.py ├── exp ├── exp.py ├── exp_attack_unlearning.py ├── exp_graph_partition.py ├── exp_node_edge_unlearning.py └── exp_unlearning.py ├── lib_aggregator ├── __init__.py ├── aggregator.py ├── opt_dataset.py └── optimal_aggregator.py ├── lib_dataset ├── __init__.py └── data_store.py ├── lib_gnn_model ├── __init__.py ├── gat │ ├── gat.py │ └── gat_net.py ├── gcn │ ├── gcn.py │ └── gcn_net.py ├── gin │ ├── gin.py │ └── gin_net.py ├── gnn_base.py ├── graphsage │ ├── graphsage.py │ └── graphsage_net.py ├── mlp │ ├── __init__.py │ ├── mlp.py │ └── mlpnet.py └── node_classifier.py ├── lib_graph_partition ├── __init__.py ├── constrained_kmeans.py ├── constrained_kmeans_base.py ├── constrained_lpa.py ├── constrained_lpa_base.py ├── graph_partition.py ├── hungarian.py ├── hungarian_1.py ├── metis_partition.py ├── partition.py ├── partition_kmeans.py ├── partition_lpa.py └── partition_random.py ├── lib_node_embedding ├── __init__.py ├── ge │ ├── __init__.py │ ├── alias.py │ ├── classify.py │ ├── models │ │ ├── __init__.py │ │ ├── deepwalk.py │ │ ├── line.py │ │ ├── node2vec.py │ │ ├── sdne.py │ │ └── struc2vec.py │ ├── utils.py │ └── walker.py └── node_embedding.py ├── lib_utils ├── logger.py └── utils.py ├── main.py ├── parameter_parser.py └── readme.md /config.py: -------------------------------------------------------------------------------- 1 | RAW_DATA_PATH = 'temp_data/raw_data/' 2 | PROCESSED_DATA_PATH = 'temp_data/processed_data/' 3 | MODEL_PATH = 'temp_data/models/' 4 | ANALYSIS_PATH = 'temp_data/analysis_data/' 5 | 6 | # database name 7 | DATABASE_NAME = "unlearning_gnn" -------------------------------------------------------------------------------- /exp/exp.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from lib_dataset.data_store import DataStore 4 | 5 | 6 | class Exp: 7 | def __init__(self, args): 8 | self.logger = logging.getLogger('exp') 9 | 10 | self.args = args 11 | self.data_store = DataStore(args) 12 | 13 | def load_data(self): 14 | pass 15 | -------------------------------------------------------------------------------- /exp/exp_attack_unlearning.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import time 3 | from collections import defaultdict 4 | 5 | import numpy as np 6 | import torch 7 | import torch_geometric as tg 8 | from torch_geometric.data import Data 9 | from scipy.spatial import distance 10 | 11 | import config 12 | from exp.exp import Exp 13 | from lib_graph_partition.graph_partition import GraphPartition 14 | from lib_gnn_model.node_classifier import NodeClassifier 15 | from lib_aggregator.aggregator import Aggregator 16 | from lib_utils import utils 17 | 18 | 19 | class ExpAttackUnlearning(Exp): 20 | def __init__(self, args): 21 | super(ExpAttackUnlearning, self).__init__(args) 22 | self.logger = logging.getLogger('exp_attack_unlearning') 23 | # 1. respond to the unlearning requests 24 | self.load_preprocessed_data() 25 | # self.graph_unlearning_request_respond() 26 | if self.args['repartition']: 27 | with open(config.MODEL_PATH + self.args['dataset_name'] + '/' + self.args['target_model']+"_unlearned_indices") as file: 28 | node_unlearning_indices = [line.rstrip() for line in file] 29 | for unlearned_node in node_unlearning_indices: 30 | self.graph_unlearning_request_respond(int(unlearned_node)) 31 | else: 32 | self.graph_unlearning_request_respond() 33 | # 2. 
evalute the attack performance 34 | self.attack_graph_unlearning() 35 | 36 | def load_preprocessed_data(self): 37 | self.shard_data = self.data_store.load_shard_data() 38 | self.raw_data = self.data_store.load_raw_data() 39 | self.train_data = self.data_store.load_train_data() 40 | self.train_graph = self.data_store.load_train_graph() 41 | self.train_indices, self.test_indices = self.data_store.load_train_test_split() 42 | self.community_to_node = self.data_store.load_community_data() 43 | num_feats = self.train_data.num_features 44 | num_classes = len(self.train_data.y.unique()) 45 | self.target_model = NodeClassifier(num_feats, num_classes, self.args) 46 | 47 | def graph_unlearning_request_respond(self, node_unlearning_request=None): 48 | # reindex the node ids 49 | node_to_com = self.data_store.c2n_to_n2c(self.community_to_node) 50 | train_indices_prune = list(node_to_com.keys()) 51 | 52 | if node_unlearning_request==None: 53 | # generate node unlearning requests 54 | node_unlearning_indices = np.random.choice(train_indices_prune, self.args['num_unlearned_nodes']) 55 | else: 56 | node_unlearning_indices = np.array([node_unlearning_request]) 57 | self.num_unlearned_edges =0 58 | unlearning_indices = defaultdict(list) 59 | for node in node_unlearning_indices: 60 | unlearning_indices[node_to_com[node]].append(node) 61 | # delete a list of revoked nodes from train_graph 62 | self.train_graph.remove_nodes_from(node_unlearning_indices) 63 | 64 | # delete the revoked nodes from train_data 65 | # by building unlearned data from unlearned train_graph 66 | self.train_data.train_mask = torch.from_numpy(np.isin(np.arange(self.train_data.num_nodes), self.train_indices)) 67 | self.train_data.test_mask = torch.from_numpy(np.isin(np.arange(self.train_data.num_nodes), np.append(self.test_indices, node_unlearning_indices))) 68 | 69 | # delete the revoked nodes from shard_data 70 | self.shard_data_after_unlearning = {} 71 | self.affected_shard=[] 72 | for shard in range(self.args["num_shards"]): 73 | train_shard_indices = list(self.community_to_node[shard]) 74 | # node unlearning 75 | train_shard_indices = np.setdiff1d(train_shard_indices, unlearning_indices[shard]) 76 | shard_indices = np.union1d(train_shard_indices, self.test_indices) 77 | 78 | x = self.train_data.x[shard_indices] 79 | y = self.train_data.y[shard_indices] 80 | edge_index = utils.filter_edge_index_1(self.train_data, shard_indices) 81 | 82 | data = Data(x=x, edge_index=torch.from_numpy(edge_index), y=y) 83 | data.train_mask = torch.from_numpy(np.isin(shard_indices, train_shard_indices)) 84 | data.test_mask = torch.from_numpy(np.isin(shard_indices, self.test_indices)) 85 | 86 | self.shard_data_after_unlearning[shard] = data 87 | self.num_unlearned_edges += self.shard_data[shard].num_edges - self.shard_data_after_unlearning[shard].num_edges 88 | 89 | # find the affected shard model 90 | if self.shard_data_after_unlearning[shard].num_nodes != self.shard_data[shard].num_nodes: 91 | self.affected_shard.append(shard) 92 | 93 | self.data_store.save_unlearned_data(self.train_graph, 'train_graph') 94 | self.data_store.save_unlearned_data(self.train_data, 'train_data') 95 | self.data_store.save_unlearned_data(self.shard_data_after_unlearning, 'shard_data') 96 | 97 | # retrain the correponding shard model 98 | if not self.args['repartition']: 99 | for shard in self.affected_shard: 100 | suffix = "unlearned_"+str(node_unlearning_indices[0]) 101 | self._train_shard_model(shard, suffix) 102 | 103 | # (if re-partition, re-partition the remaining graph) 
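# Summary of the two retraining strategies (read together with the branches around this point):
# - without repartition (branch above), only the shards that actually lost a node
#   ("affected shards") are retrained, and the retrained models are saved with the
#   suffix "unlearned_<node_id>" so they can be matched to this unlearning request;
# - with repartition (branch below), the remaining train graph is partitioned from
#   scratch and every shard model is retrained, saved with the suffix
#   "_repartition_unlearned_<node_id>".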
104 | # re-train the shard model, save model and optimal weight score 105 | if self.args['repartition']: 106 | suffix="_repartition_unlearned_" + str(node_unlearning_indices[0]) 107 | self._repartition(suffix) 108 | for shard in range(self.args["num_shards"]): 109 | self._train_shard_model(shard, suffix) 110 | 111 | def _repartition(self, suffix): 112 | # load unlearned train_graph and train_data 113 | train_graph = self.data_store.load_unlearned_data('train_graph') 114 | train_data = self.data_store.load_unlearned_data('train_data') 115 | # repartition 116 | start_time = time.time() 117 | partition = GraphPartition(self.args, train_graph, train_data) 118 | community_to_node = partition.graph_partition() 119 | partition_time = time.time() - start_time 120 | self.logger.info("Partition cost %s seconds." % partition_time) 121 | # save the new partition and shard 122 | self.data_store.save_community_data(community_to_node, suffix) 123 | self._generate_unlearned_repartitioned_shard_data(train_data, community_to_node, self.test_indices) 124 | 125 | def _generate_unlearned_repartitioned_shard_data(self, train_data, community_to_node, test_indices): 126 | self.logger.info('generating shard data') 127 | 128 | shard_data = {} 129 | for shard in range(self.args['num_shards']): 130 | train_shard_indices = list(community_to_node[shard]) 131 | shard_indices = np.union1d(train_shard_indices, test_indices) 132 | 133 | x = self.train_data.x[shard_indices] 134 | y = self.train_data.y[shard_indices] 135 | edge_index = utils.filter_edge_index_1(train_data, shard_indices) 136 | 137 | data = Data(x=x, edge_index=torch.from_numpy(edge_index), y=y) 138 | data.train_mask = torch.from_numpy(np.isin(shard_indices, train_shard_indices)) 139 | data.test_mask = torch.from_numpy(np.isin(shard_indices, test_indices)) 140 | 141 | shard_data[shard] = data 142 | 143 | # self.data_store.save_unlearned_data(shard_data, 'shard_data_repartition') 144 | return shard_data 145 | 146 | def _train_shard_model(self, shard, suffix="unlearned"): 147 | self.logger.info('training target models, shard %s' % shard) 148 | 149 | # load shard data 150 | self.target_model.data = self.shard_data_after_unlearning[shard] 151 | # retrain shard model 152 | self.target_model.train_model() 153 | # replace shard model 154 | device=torch.device("cpu") 155 | self.target_model.device = device 156 | self.data_store.save_target_model(0, self.target_model, shard, suffix) 157 | # self.data_store.save_unlearned_target_model(0, self.target_model, shard, suffix) 158 | 159 | def attack_graph_unlearning(self): 160 | 161 | # load unlearned indices 162 | with open(config.MODEL_PATH + self.args['dataset_name'] + "/" + self.args['target_model'] +"_unlearned_indices") as file: 163 | unlearned_indices = [line.rstrip() for line in file] 164 | 165 | # member sample query, label as 1 166 | positive_posteriors = self._query_target_model(unlearned_indices, unlearned_indices) 167 | # non-member sample query, label as 0 168 | negative_posteriors = self._query_target_model(unlearned_indices, self.test_indices) 169 | 170 | # evaluate attack performance, train multiple shadow models, or calculate posterior entropy, or directly calculate AUC. 
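# The evaluation below is a membership-inference-style test on the unlearned nodes:
# revoked training nodes are the positive (member) samples and held-out test nodes are
# the negative (non-member) samples. For each query node the attack feature is the
# distance between the posterior aggregated from the original shard models and the
# posterior aggregated after unlearning (and, if enabled, after repartitioning),
# e.g. dist_i = np.linalg.norm(p_before[i] - p_after[i]); the AUC of these distances
# against the membership labels is computed directly with sklearn's roc_auc_score,
# so no separate shadow or attack classifier is trained here.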
171 | self.evaluate_attack_performance(positive_posteriors, negative_posteriors) 172 | 173 | def _query_target_model(self, unlearned_indices, test_indices): 174 | # load unlearned data 175 | train_data = self.data_store.load_unlearned_data('train_data') 176 | 177 | # load optimal weight score 178 | # optimal_weight=self.data_store.load_optimal_weight(0) 179 | 180 | # calculate the final posterior, save as attack feature 181 | self.logger.info('aggregating submodels') 182 | posteriors_a, posteriors_b, posteriors_c =[],[],[] 183 | 184 | for i in unlearned_indices: 185 | community_to_node = self.data_store.load_community_data('') 186 | shard_data = self._generate_unlearned_repartitioned_shard_data(train_data, community_to_node, int(i)) 187 | 188 | posteriors_a.append(self._generate_posteriors(shard_data, '')) 189 | 190 | suffix="unlearned_" + str(i) 191 | posteriors_b.append(self._generate_posteriors_unlearned(shard_data, suffix, i)) 192 | 193 | if self.args['repartition']: 194 | suffix = "_repartition_unlearned_" + str(i) 195 | community_to_node = self.data_store.load_community_data(suffix) 196 | shard_data = self._generate_unlearned_repartitioned_shard_data(train_data, community_to_node, int(i)) 197 | suffix = "__repartition_unlearned_" + str(i) 198 | posteriors_c.append(self._generate_posteriors(shard_data, suffix)) 199 | 200 | return posteriors_a, posteriors_b, posteriors_c 201 | 202 | def _generate_posteriors_unlearned(self, shard_data, suffix, unlearned_indice): 203 | import glob 204 | model_path=glob.glob(config.MODEL_PATH+self.args['dataset_name']+"/*_1unlearned_"+str(unlearned_indice)) 205 | if not model_path: 206 | self.logger.info("No corresponding unlearned shard model for node %s" % str(unlearned_indice)) 207 | return torch.tensor([0]*6) 208 | else: 209 | affected_shard = int(model_path[0].split('/')[-1].split('_')[-4]) 210 | posteriors = [] 211 | for shard in range(self.args['num_shards']): 212 | if shard == affected_shard: 213 | # load the retrained the shard model 214 | self.data_store.load_target_model(0, self.target_model, shard, suffix) 215 | else: 216 | # self.target_model.model.reset_parameters() 217 | # load unaffected shard model 218 | self.data_store.load_target_model(0, self.target_model, shard, '') 219 | self.device = torch.device('cuda:3' if torch.cuda.is_available() else 'cpu') 220 | self.target_model.model = self.target_model.model.to(self.device) 221 | self.target_model.data = shard_data[shard].to(self.device) 222 | posteriors.append(self.target_model.posterior()) 223 | return torch.mean(torch.cat(posteriors, dim=0), dim=0) 224 | 225 | def _generate_posteriors(self, shard_data, suffix): 226 | posteriors = [] 227 | for shard in range(self.args['num_shards']): 228 | # self.target_model.model.reset_parameters() 229 | self.data_store.load_target_model(0, self.target_model, shard, suffix) 230 | self.device = torch.device('cuda:3' if torch.cuda.is_available() else 'cpu') 231 | self.target_model.model = self.target_model.model.to(self.device) 232 | self.target_model.data = shard_data[shard].to(self.device) 233 | 234 | posteriors.append(self.target_model.posterior()) 235 | return torch.mean(torch.cat(posteriors, dim=0), dim=0) 236 | 237 | def evaluate_attack_performance(self, positive_posteriors, negative_posteriors): 238 | # constrcut attack data 239 | label = torch.cat((torch.ones(len(positive_posteriors[0])), torch.zeros(len(negative_posteriors[0])))) 240 | data={} 241 | for i in range(2): 242 | data[i] = torch.cat((torch.stack(positive_posteriors[i]), 
torch.stack(negative_posteriors[i])),0) 243 | 244 | # calculate l2 distance 245 | model_b_distance = self._calculate_distance(data[0], data[1]) 246 | # directly calculate AUC with feature and labels 247 | attack_auc_b = self.evaluate_attack_with_AUC(model_b_distance, label) 248 | 249 | if self.args['repartition']: 250 | model_c_distance = self._calculate_distance(data[0], data[2]) 251 | attack_auc_c = self.evaluate_attack_with_AUC(model_c_distance, label) 252 | 253 | self.logger.info("Attack_Model_B AUC: %s | Attack_Model_C AUC: %s" % (attack_auc_b, attack_auc_c)) 254 | 255 | def evaluate_attack_with_AUC(self, data, label): 256 | from sklearn.metrics import roc_auc_score 257 | self.logger.info("Directly calculate the attack AUC") 258 | return roc_auc_score(label, data.reshape(-1, 1)) 259 | 260 | def _calculate_distance(self, data0, data1, distance='l2_norm' ): 261 | if distance == 'l2_norm': 262 | return np.array([np.linalg.norm(data0[i]-data1[i]) for i in range(len(data0))]) 263 | elif distance =='direct_diff': 264 | return data0 - data1 265 | else: 266 | raise Exception("Unsupported distance") 267 | -------------------------------------------------------------------------------- /exp/exp_graph_partition.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import time 3 | 4 | import torch 5 | from sklearn.model_selection import train_test_split 6 | import numpy as np 7 | from torch_geometric.data import Data 8 | import torch_geometric as tg 9 | import networkx as nx 10 | 11 | from exp.exp import Exp 12 | from lib_utils.utils import connected_component_subgraphs 13 | from lib_graph_partition.graph_partition import GraphPartition 14 | from lib_utils import utils 15 | 16 | 17 | class ExpGraphPartition(Exp): 18 | def __init__(self, args): 19 | super(ExpGraphPartition, self).__init__(args) 20 | 21 | self.logger = logging.getLogger('exp_graph_partition') 22 | 23 | self.load_data() 24 | self.train_test_split() 25 | self.gen_train_graph() 26 | self.graph_partition() 27 | self.generate_shard_data() 28 | 29 | def load_data(self): 30 | self.data = self.data_store.load_raw_data() 31 | 32 | def train_test_split(self): 33 | if self.args['is_split']: 34 | self.logger.info('splitting train/test data') 35 | self.train_indices, self.test_indices = train_test_split(np.arange((self.data.num_nodes)), test_size=self.args['test_ratio'], random_state=100) 36 | self.data_store.save_train_test_split(self.train_indices, self.test_indices) 37 | 38 | self.data.train_mask = torch.from_numpy(np.isin(np.arange(self.data.num_nodes), self.train_indices)) 39 | self.data.test_mask = torch.from_numpy(np.isin(np.arange(self.data.num_nodes), self.test_indices)) 40 | else: 41 | self.train_indices, self.test_indices = self.data_store.load_train_test_split() 42 | 43 | self.data.train_mask = torch.from_numpy(np.isin(np.arange(self.data.num_nodes), self.train_indices)) 44 | self.data.test_mask = torch.from_numpy(np.isin(np.arange(self.data.num_nodes), self.test_indices)) 45 | 46 | def gen_train_graph(self): 47 | # delete ratio of edges and update the train graph 48 | if self.args['ratio_deleted_edges'] != 0: 49 | self.logger.debug("Before edge deletion. train data #.Nodes: %f, #.Edges: %f" % ( 50 | self.data.num_nodes, self.data.num_edges)) 51 | 52 | # self._ratio_delete_edges() 53 | self.data.edge_index = self._ratio_delete_edges(self.data.edge_index) 54 | 55 | # decouple train test edges. 
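# An edge is treated as a test edge if either endpoint lies in the test split; only
# edges whose endpoints are both training nodes are kept when the networkx train graph
# is rebuilt below, so graph partitioning never sees train-test or test-test edges
# (e.g. with test node 5, edge (3, 5) is dropped while edge (3, 4) is kept).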
56 | edge_index = self.data.edge_index.numpy() 57 | test_edge_indices = np.logical_or(np.isin(edge_index[0], self.test_indices), 58 | np.isin(edge_index[1], self.test_indices)) 59 | train_edge_indices = np.logical_not(test_edge_indices) 60 | edge_index_train = edge_index[:, train_edge_indices] 61 | 62 | self.train_graph = nx.Graph() 63 | self.train_graph.add_nodes_from(self.train_indices) 64 | 65 | # use largest connected graph as train graph 66 | if self.args['is_prune']: 67 | self._prune_train_set() 68 | 69 | # reconstruct a networkx train graph 70 | for u, v in np.transpose(edge_index_train): 71 | self.train_graph.add_edge(u, v) 72 | 73 | self.logger.debug("After edge deletion. train graph #.Nodes: %f, #.Edges: %f" % ( 74 | self.train_graph.number_of_nodes(), self.train_graph.number_of_edges())) 75 | self.logger.debug("After edge deletion. train data #.Nodes: %f, #.Edges: %f" % ( 76 | self.data.num_nodes, self.data.num_edges)) 77 | self.data_store.save_train_data(self.data) 78 | self.data_store.save_train_graph(self.train_graph) 79 | 80 | def graph_partition(self): 81 | if self.args['is_partition']: 82 | self.logger.info('graph partitioning') 83 | 84 | start_time = time.time() 85 | partition = GraphPartition(self.args, self.train_graph, self.data) 86 | self.community_to_node = partition.graph_partition() 87 | partition_time = time.time() - start_time 88 | self.logger.info("Partition cost %s seconds." % partition_time) 89 | self.data_store.save_community_data(self.community_to_node) 90 | else: 91 | self.community_to_node = self.data_store.load_community_data() 92 | 93 | def generate_shard_data(self): 94 | self.logger.info('generating shard data') 95 | 96 | self.shard_data = {} 97 | for shard in range(self.args['num_shards']): 98 | train_shard_indices = list(self.community_to_node[shard]) 99 | shard_indices = np.union1d(train_shard_indices, self.test_indices) 100 | 101 | x = self.data.x[shard_indices] 102 | y = self.data.y[shard_indices] 103 | edge_index = utils.filter_edge_index_1(self.data, shard_indices) 104 | 105 | data = Data(x=x, edge_index=torch.from_numpy(edge_index), y=y) 106 | data.train_mask = torch.from_numpy(np.isin(shard_indices, train_shard_indices)) 107 | data.test_mask = torch.from_numpy(np.isin(shard_indices, self.test_indices)) 108 | 109 | self.shard_data[shard] = data 110 | 111 | self.data_store.save_shard_data(self.shard_data) 112 | 113 | def _prune_train_set(self): 114 | # extract the the maximum connected component 115 | self.logger.debug("Before Prune... #. of Nodes: %f, #. of Edges: %f" % ( 116 | self.train_graph.number_of_nodes(), self.train_graph.number_of_edges())) 117 | 118 | self.train_graph = max(connected_component_subgraphs(self.train_graph), key=len) 119 | 120 | self.logger.debug("After Prune... #. of Nodes: %f, #. 
of Edges: %f" % ( 121 | self.train_graph.number_of_nodes(), self.train_graph.number_of_edges())) 122 | # self.train_indices = np.array(self.train_graph.nodes) 123 | 124 | def _ratio_delete_edges(self, edge_index): 125 | edge_index = edge_index.numpy() 126 | 127 | unique_indices = np.where(edge_index[0] < edge_index[1])[0] 128 | unique_indices_not = np.where(edge_index[0] > edge_index[1])[0] 129 | remain_indices = np.random.choice(unique_indices, 130 | int(unique_indices.shape[0] * (1.0 - self.args['ratio_deleted_edges'])), 131 | replace=False) 132 | 133 | remain_encode = edge_index[0, remain_indices] * edge_index.shape[1] * 2 + edge_index[1, remain_indices] 134 | unique_encode_not = edge_index[1, unique_indices_not] * edge_index.shape[1] * 2 + edge_index[0, unique_indices_not] 135 | sort_indices = np.argsort(unique_encode_not) 136 | remain_indices_not = unique_indices_not[sort_indices[np.searchsorted(unique_encode_not, remain_encode, sorter=sort_indices)]] 137 | remain_indices = np.union1d(remain_indices, remain_indices_not) 138 | 139 | # self.data.edge_index = torch.from_numpy(edge_index[:, remain_indices]) 140 | return torch.from_numpy(edge_index[:, remain_indices]) 141 | -------------------------------------------------------------------------------- /exp/exp_node_edge_unlearning.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pickle 3 | import time 4 | from collections import defaultdict 5 | 6 | import numpy as np 7 | import torch 8 | from torch_geometric.data import Data 9 | 10 | import config 11 | from exp.exp import Exp 12 | from lib_gnn_model.graphsage.graphsage import SAGE 13 | from lib_gnn_model.gat.gat import GAT 14 | from lib_gnn_model.gin.gin import GIN 15 | from lib_gnn_model.gcn.gcn import GCN 16 | from lib_gnn_model.mlp.mlp import MLP 17 | from lib_gnn_model.node_classifier import NodeClassifier 18 | from lib_aggregator.aggregator import Aggregator 19 | from lib_utils import utils 20 | 21 | 22 | class ExpNodeEdgeUnlearning(Exp): 23 | def __init__(self, args): 24 | super(ExpNodeEdgeUnlearning, self).__init__(args) 25 | self.logger = logging.getLogger('exp_node_edge_unlearning') 26 | self.target_model_name = self.args['target_model'] 27 | 28 | self.load_data() 29 | self.determine_target_model() 30 | self.run_exp() 31 | 32 | def run_exp(self): 33 | # unlearning efficiency 34 | run_f1 = np.empty((0)) 35 | unlearning_time = np.empty((0)) 36 | for run in range(self.args['num_runs']): 37 | self.logger.info("Run %f" % run) 38 | self.train_target_models(run) 39 | aggregate_f1_score = self.aggregate(run) 40 | # node_unlearning_time = self.unlearning_time_statistic() 41 | node_unlearning_time = 0 42 | run_f1 = np.append(run_f1, aggregate_f1_score) 43 | unlearning_time = np.append(unlearning_time, node_unlearning_time) 44 | self.num_unlearned_edges = 0 45 | # model utility 46 | self.f1_score_avg = np.average(run_f1) 47 | self.f1_score_std = np.std(run_f1) 48 | self.unlearning_time_avg = np.average(unlearning_time) 49 | self.unlearning_time_std = np.std(unlearning_time) 50 | self.logger.info( 51 | "%s %s %s %s" % (self.f1_score_avg, self.f1_score_std, self.unlearning_time_avg, self.unlearning_time_std)) 52 | 53 | def load_data(self): 54 | self.shard_data = self.data_store.load_shard_data() 55 | self.raw_data = self.data_store.load_raw_data() 56 | self.train_data = self.data_store.load_train_data() 57 | 58 | self.unlearned_shard_data = self.shard_data 59 | 60 | def determine_target_model(self): 61 | num_feats = 
self.train_data.num_features 62 | num_classes = len(self.train_data.y.unique()) 63 | 64 | if not self.args['is_use_batch']: 65 | if self.target_model_name == 'SAGE': 66 | self.target_model = SAGE(num_feats, num_classes) 67 | elif self.target_model_name == 'GCN': 68 | self.target_model = GCN(num_feats, num_classes) 69 | elif self.target_model_name == 'GAT': 70 | self.target_model = GAT(num_feats, num_classes) 71 | elif self.target_model_name == 'GIN': 72 | self.target_model = GIN(num_feats, num_classes) 73 | else: 74 | raise Exception('unsupported target model') 75 | else: 76 | if self.target_model_name == 'MLP': 77 | self.target_model = MLP(num_feats, num_classes) 78 | else: 79 | self.target_model = NodeClassifier(num_feats, num_classes, self.args) 80 | 81 | def train_target_models(self, run): 82 | if self.args['is_train_target_model']: 83 | self.logger.info('training target models') 84 | 85 | self.time = {} 86 | for shard in range(self.args['num_shards']): 87 | self.time[shard] = self._train_model(run, shard) 88 | 89 | def aggregate(self, run): 90 | self.logger.info('aggregating submodels') 91 | 92 | # posteriors, true_label = self.generate_posterior() 93 | aggregator = Aggregator(run, self.target_model, self.train_data, self.unlearned_shard_data, self.args) 94 | aggregator.generate_posterior() 95 | self.aggregate_f1_score = aggregator.aggregate() 96 | 97 | self.logger.info("Final Test F1: %s" % (self.aggregate_f1_score,)) 98 | return self.aggregate_f1_score 99 | 100 | def _generate_unlearning_request(self, num_unlearned="assign"): 101 | node_list = [] 102 | for key, value in self.community_to_node.items(): 103 | # node_list.extend(value.tolist()) 104 | node_list.extend(value) 105 | if num_unlearned == "assign": 106 | num_of_unlearned_nodes = self.args['num_unlearned_nodes'] 107 | elif num_unlearned == "ratio": 108 | num_of_unlearned_nodes = int(self.args['ratio_unlearned_nodes'] * len(node_list)) 109 | 110 | if self.args['unlearning_request'] == 'random': 111 | unlearned_nodes_indices = np.random.choice(node_list, num_of_unlearned_nodes, replace=False) 112 | 113 | elif self.args['unlearning_request'] == 'top1': 114 | sorted_shards = sorted(self.community_to_node.items(), key=lambda x: len(x[1]), reverse=True) 115 | unlearned_nodes_indices = np.random.choice(sorted_shards[0][1], num_of_unlearned_nodes, replace=False) 116 | 117 | elif self.args['unlearning_request'] == 'adaptive': 118 | sorted_shards = sorted(self.community_to_node.items(), key=lambda x: len(x[1]), reverse=True) 119 | candidate_list = np.concatenate([sorted_shards[i][1] for i in range(int(self.args['num_shards']/2)+1)], axis=0) 120 | unlearned_nodes_indices = np.random.choice(candidate_list, num_of_unlearned_nodes, replace=False) 121 | 122 | elif self.args['unlearning_request'] == 'last5': 123 | sorted_shards = sorted(self.community_to_node.items(), key=lambda x: len(x[1]), reverse=False) 124 | candidate_list = np.concatenate([sorted_shards[i][1] for i in range(int(self.args['num_shards']/2)+1)], axis=0) 125 | unlearned_nodes_indices = np.random.choice(candidate_list, num_of_unlearned_nodes, replace=False) 126 | 127 | return unlearned_nodes_indices 128 | 129 | def unlearning_time_statistic(self): 130 | if self.args['is_train_target_model'] and self.args['num_shards'] != 1: 131 | # random sample 5% nodes, find their belonging communities 132 | unlearned_nodes = self._generate_unlearning_request(num_unlearned="ratio") 133 | belong_community = [] 134 | for sample_node in range(len(unlearned_nodes)): 135 | for community, node 
in self.community_to_node.items(): 136 | if np.in1d(unlearned_nodes[sample_node], node).any(): 137 | belong_community.append(community) 138 | 139 | # calculate the total unlearning time and group unlearning time 140 | group_unlearning_time = [] 141 | node_unlearning_time = [] 142 | for shard in range(self.args['num_shards']): 143 | if belong_community.count(shard) != 0: 144 | group_unlearning_time.append(self.time[shard]) 145 | node_unlearning_time.extend([float(self.time[shard]) for j in range(belong_community.count(shard))]) 146 | return node_unlearning_time 147 | 148 | elif self.args['is_train_target_model'] and self.args['num_shards'] == 1: 149 | return self.time[0] 150 | 151 | else: 152 | return 0 153 | 154 | def _train_model(self, run, shard): 155 | self.logger.info('training target models, run %s, shard %s' % (run, shard)) 156 | 157 | start_time = time.time() 158 | self.target_model.data = self.unlearned_shard_data[shard] 159 | self.target_model.train_model() 160 | train_time = time.time() - start_time 161 | 162 | self.data_store.save_target_model(run, self.target_model, shard) 163 | 164 | return train_time 165 | -------------------------------------------------------------------------------- /exp/exp_unlearning.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import time 3 | 4 | import numpy as np 5 | 6 | from exp.exp import Exp 7 | from lib_gnn_model.graphsage.graphsage import SAGE 8 | from lib_gnn_model.gat.gat import GAT 9 | from lib_gnn_model.gin.gin import GIN 10 | from lib_gnn_model.gcn.gcn import GCN 11 | from lib_gnn_model.mlp.mlp import MLP 12 | from lib_gnn_model.node_classifier import NodeClassifier 13 | from lib_aggregator.aggregator import Aggregator 14 | 15 | 16 | class ExpUnlearning(Exp): 17 | def __init__(self, args): 18 | super(ExpUnlearning, self).__init__(args) 19 | 20 | self.logger = logging.getLogger('exp_unlearning') 21 | 22 | self.target_model_name = self.args['target_model'] 23 | self.num_opt_samples = self.args['num_opt_samples'] 24 | 25 | self.load_data() 26 | self.determine_target_model() 27 | 28 | run_f1 = np.empty((0)) 29 | unlearning_time = np.empty((0)) 30 | for run in range(self.args['num_runs']): 31 | self.logger.info("Run %f" % run) 32 | self.train_target_models(run) 33 | aggregate_f1_score = self.aggregate(run) 34 | node_unlearning_time = self.unlearning_time_statistic() 35 | run_f1 = np.append(run_f1, aggregate_f1_score) 36 | unlearning_time = np.append(unlearning_time, node_unlearning_time) 37 | 38 | self.f1_score_avg = np.average(run_f1) 39 | self.f1_score_std = np.std(run_f1) 40 | self.unlearning_time_avg = np.average(unlearning_time) 41 | self.unlearning_time_std = np.std(unlearning_time) 42 | self.logger.info("%s %s %s %s" % (self.f1_score_avg, self.f1_score_std, self.unlearning_time_avg, self.unlearning_time_std)) 43 | 44 | def load_data(self): 45 | self.shard_data = self.data_store.load_shard_data() 46 | self.data = self.data_store.load_raw_data() 47 | 48 | def determine_target_model(self): 49 | num_feats = self.data.num_features 50 | num_classes = len(self.data.y.unique()) 51 | 52 | if not self.args['is_use_batch']: 53 | if self.target_model_name == 'SAGE': 54 | self.target_model = SAGE(num_feats, num_classes) 55 | elif self.target_model_name == 'GCN': 56 | self.target_model = GCN(num_feats, num_classes) 57 | elif self.target_model_name == 'GAT': 58 | self.target_model = GAT(num_feats, num_classes) 59 | elif self.target_model_name == 'GIN': 60 | self.target_model = GIN(num_feats, 
num_classes) 61 | else: 62 | raise Exception('unsupported target model') 63 | else: 64 | if self.target_model_name == 'MLP': 65 | self.target_model = MLP(num_feats, num_classes) 66 | else: 67 | self.target_model = NodeClassifier(num_feats, num_classes, self.args) 68 | 69 | def train_target_models(self, run): 70 | if self.args['is_train_target_model']: 71 | self.logger.info('training target models') 72 | 73 | self.time = {} 74 | for shard in range(self.args['num_shards']): 75 | self.time[shard] = self._train_model(run, shard) 76 | 77 | def aggregate(self, run): 78 | self.logger.info('aggregating submodels') 79 | 80 | start_time = time.time() 81 | aggregator = Aggregator(run, self.target_model, self.data, self.shard_data, self.args) 82 | aggregator.generate_posterior() 83 | self.aggregate_f1_score = aggregator.aggregate() 84 | aggregate_time = time.time() - start_time 85 | self.logger.info("Partition cost %s seconds." % aggregate_time) 86 | 87 | self.logger.info("Final Test F1: %s" % (self.aggregate_f1_score,)) 88 | return self.aggregate_f1_score 89 | 90 | def unlearning_time_statistic(self): 91 | if self.args['is_train_target_model'] and self.args['num_shards'] != 1: 92 | self.community_to_node = self.data_store.load_community_data() 93 | node_list = [] 94 | for key, value in self.community_to_node.items(): 95 | node_list.extend(value) 96 | 97 | # random sample 5% nodes, find their belonging communities 98 | sample_nodes = np.random.choice(node_list, int(0.05 * len(node_list))) 99 | belong_community = [] 100 | for sample_node in range(len(sample_nodes)): 101 | for community, node in self.community_to_node.items(): 102 | if np.in1d(sample_nodes[sample_node], node).any(): 103 | belong_community.append(community) 104 | 105 | # calculate the total unlearning time and group unlearning time 106 | group_unlearning_time = [] 107 | node_unlearning_time = [] 108 | for shard in range(self.args['num_shards']): 109 | if belong_community.count(shard) != 0: 110 | group_unlearning_time.append(self.time[shard]) 111 | node_unlearning_time.extend([float(self.time[shard]) for j in range(belong_community.count(shard))]) 112 | 113 | return node_unlearning_time 114 | 115 | elif self.args['is_train_target_model'] and self.args['num_shards'] == 1: 116 | return self.time[0] 117 | 118 | else: 119 | return 0 120 | 121 | def _train_model(self, run, shard): 122 | self.logger.info('training target models, run %s, shard %s' % (run, shard)) 123 | 124 | start_time = time.time() 125 | self.target_model.data = self.shard_data[shard] 126 | self.target_model.train_model() 127 | train_time = time.time() - start_time 128 | 129 | self.data_store.save_target_model(run, self.target_model, shard) 130 | self.logger.info("Model training time: %s" % (train_time)) 131 | 132 | return train_time 133 | -------------------------------------------------------------------------------- /lib_aggregator/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MinChen00/Graph-Unlearning/a9b942d01651c2e3d780ae12e1a1459e35120ffa/lib_aggregator/__init__.py -------------------------------------------------------------------------------- /lib_aggregator/aggregator.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import torch 3 | 4 | torch.cuda.empty_cache() 5 | 6 | from sklearn.metrics import f1_score 7 | import numpy as np 8 | 9 | from lib_aggregator.optimal_aggregator import OptimalAggregator 10 | from 
lib_dataset.data_store import DataStore 11 | 12 | 13 | class Aggregator: 14 | def __init__(self, run, target_model, data, shard_data, args): 15 | self.logger = logging.getLogger('Aggregator') 16 | self.args = args 17 | 18 | self.data_store = DataStore(self.args) 19 | 20 | self.run = run 21 | self.target_model = target_model 22 | self.data = data 23 | self.shard_data = shard_data 24 | 25 | self.num_shards = args['num_shards'] 26 | 27 | def generate_posterior(self, suffix=""): 28 | self.true_label = self.shard_data[0].y[self.shard_data[0]['test_mask']].detach().cpu().numpy() 29 | self.posteriors = {} 30 | 31 | for shard in range(self.args['num_shards']): 32 | self.target_model.data = self.shard_data[shard] 33 | self.data_store.load_target_model(self.run, self.target_model, shard, suffix) 34 | self.posteriors[shard] = self.target_model.posterior() 35 | self.logger.info("Saving posteriors.") 36 | self.data_store.save_posteriors(self.posteriors, self.run, suffix) 37 | 38 | def aggregate(self): 39 | if self.args['aggregator'] == 'mean': 40 | aggregate_f1_score = self._mean_aggregator() 41 | elif self.args['aggregator'] == 'optimal': 42 | aggregate_f1_score = self._optimal_aggregator() 43 | elif self.args['aggregator'] == 'majority': 44 | aggregate_f1_score = self._majority_aggregator() 45 | else: 46 | raise Exception("unsupported aggregator.") 47 | 48 | return aggregate_f1_score 49 | 50 | def _mean_aggregator(self): 51 | posterior = self.posteriors[0] 52 | for shard in range(1, self.num_shards): 53 | posterior += self.posteriors[shard] 54 | 55 | posterior = posterior / self.num_shards 56 | return f1_score(self.true_label, posterior.argmax(axis=1).cpu().numpy(), average="micro") 57 | 58 | def _majority_aggregator(self): 59 | pred_labels = [] 60 | for shard in range(self.num_shards): 61 | pred_labels.append(self.posteriors[shard].argmax(axis=1).cpu().numpy()) 62 | 63 | pred_labels = np.stack(pred_labels) 64 | pred_label = np.argmax( 65 | np.apply_along_axis(np.bincount, axis=0, arr=pred_labels, minlength=self.posteriors[0].shape[1]), axis=0) 66 | 67 | return f1_score(self.true_label, pred_label, average="micro") 68 | 69 | def _optimal_aggregator(self): 70 | optimal = OptimalAggregator(self.run, self.target_model, self.data, self.args) 71 | optimal.generate_train_data() 72 | weight_para = optimal.optimization() 73 | self.data_store.save_optimal_weight(weight_para, run=self.run) 74 | 75 | posterior = self.posteriors[0] * weight_para[0] 76 | for shard in range(1, self.num_shards): 77 | posterior += self.posteriors[shard] * weight_para[shard] 78 | 79 | return f1_score(self.true_label, posterior.argmax(axis=1).cpu().numpy(), average="micro") 80 | -------------------------------------------------------------------------------- /lib_aggregator/opt_dataset.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import Dataset 2 | 3 | 4 | class OptDataset(Dataset): 5 | def __init__(self, posteriors, labels): 6 | self.posteriors = posteriors 7 | self.labels = labels 8 | 9 | def __getitem__(self, index): 10 | ret_posterior = {} 11 | 12 | for shard, post in self.posteriors.items(): 13 | ret_posterior[shard] = post[index] 14 | 15 | return ret_posterior, self.labels[index] 16 | 17 | def __len__(self): 18 | return self.labels.shape[0] 19 | -------------------------------------------------------------------------------- /lib_aggregator/optimal_aggregator.py: -------------------------------------------------------------------------------- 1 | import copy 2 | 
import logging 3 | 4 | import numpy as np 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | from torch import optim 9 | from torch.optim.lr_scheduler import MultiStepLR 10 | from torch.utils.data import DataLoader 11 | from torch_geometric.data import Data 12 | 13 | from lib_aggregator.opt_dataset import OptDataset 14 | from lib_dataset.data_store import DataStore 15 | from lib_utils import utils 16 | 17 | 18 | class OptimalAggregator: 19 | def __init__(self, run, target_model, data, args): 20 | self.logger = logging.getLogger('optimal_aggregator') 21 | self.args = args 22 | 23 | self.run = run 24 | self.target_model = target_model 25 | self.data = data 26 | self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 27 | 28 | self.num_shards = args['num_shards'] 29 | 30 | def generate_train_data(self): 31 | data_store = DataStore(self.args) 32 | train_indices, _ = data_store.load_train_test_split() 33 | 34 | # sample a set of nodes from train_indices 35 | if self.args["num_opt_samples"] == 1000: 36 | train_indices = np.random.choice(train_indices, size=1000, replace=False) 37 | elif self.args["num_opt_samples"] == 10000: 38 | train_indices = np.random.choice(train_indices, size=int(train_indices.shape[0] * 0.1), replace=False) 39 | elif self.args["num_opt_samples"] == 1: 40 | train_indices = np.random.choice(train_indices, size=int(train_indices.shape[0]), replace=False) 41 | 42 | train_indices = np.sort(train_indices) 43 | self.logger.info("Using %s samples for optimization" % (int(train_indices.shape[0]))) 44 | 45 | x = self.data.x[train_indices] 46 | y = self.data.y[train_indices] 47 | edge_index = utils.filter_edge_index(self.data.edge_index, train_indices) 48 | 49 | train_data = Data(x=x, edge_index=torch.from_numpy(edge_index), y=y) 50 | train_data.train_mask = torch.zeros(train_indices.shape[0], dtype=torch.bool) 51 | train_data.test_mask = torch.ones(train_indices.shape[0], dtype=torch.bool) 52 | self.true_labels = y 53 | 54 | self.posteriors = {} 55 | for shard in range(self.num_shards): 56 | self.target_model.data = train_data 57 | data_store.load_target_model(self.run, self.target_model, shard) 58 | self.posteriors[shard] = self.target_model.posterior().to(self.device) 59 | 60 | def optimization(self): 61 | weight_para = nn.Parameter(torch.full((self.num_shards,), fill_value=1.0 / self.num_shards), requires_grad=True) 62 | optimizer = optim.Adam([weight_para], lr=self.args['opt_lr']) 63 | scheduler = MultiStepLR(optimizer, milestones=[500, 1000], gamma=self.args['opt_lr']) 64 | 65 | train_dset = OptDataset(self.posteriors, self.true_labels) 66 | train_loader = DataLoader(train_dset, batch_size=32, shuffle=True, num_workers=0) 67 | 68 | min_loss = 1000.0 69 | for epoch in range(self.args['opt_num_epochs']): 70 | loss_all = 0.0 71 | 72 | for posteriors, labels in train_loader: 73 | labels = labels.to(self.device) 74 | 75 | optimizer.zero_grad() 76 | loss = self._loss_fn(posteriors, labels, weight_para) 77 | loss.backward() 78 | loss_all += loss 79 | 80 | optimizer.step() 81 | with torch.no_grad(): 82 | weight_para[:] = torch.clamp(weight_para, min=0.0) 83 | 84 | scheduler.step() 85 | 86 | if loss_all < min_loss: 87 | ret_weight_para = copy.deepcopy(weight_para) 88 | min_loss = loss_all 89 | 90 | self.logger.info('epoch: %s, loss: %s' % (epoch, loss_all)) 91 | 92 | return ret_weight_para / torch.sum(ret_weight_para) 93 | 94 | def _loss_fn(self, posteriors, labels, weight_para): 95 | aggregate_posteriors = 
torch.zeros_like(posteriors[0]) 96 | for shard in range(self.num_shards): 97 | aggregate_posteriors += weight_para[shard] * posteriors[shard] 98 | 99 | aggregate_posteriors = F.softmax(aggregate_posteriors, dim=1) 100 | loss_1 = F.cross_entropy(aggregate_posteriors, labels) 101 | loss_2 = torch.sqrt(torch.sum(weight_para ** 2)) 102 | 103 | return loss_1 + loss_2 104 | -------------------------------------------------------------------------------- /lib_dataset/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MinChen00/Graph-Unlearning/a9b942d01651c2e3d780ae12e1a1459e35120ffa/lib_dataset/__init__.py -------------------------------------------------------------------------------- /lib_dataset/data_store.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import logging 4 | import shutil 5 | 6 | import numpy as np 7 | import torch 8 | from torch_geometric.datasets import Planetoid, Coauthor 9 | import torch_geometric.transforms as T 10 | 11 | import config 12 | 13 | 14 | class DataStore: 15 | def __init__(self, args): 16 | self.logger = logging.getLogger('data_store') 17 | self.args = args 18 | 19 | self.dataset_name = self.args['dataset_name'] 20 | self.num_features = { 21 | "cora": 1433, 22 | "pubmed": 500, 23 | "citeseer": 3703, 24 | "Coauthor_CS": 6805, 25 | "Coauthor_Phys": 8415 26 | } 27 | self.partition_method = self.args['partition_method'] 28 | self.num_shards = self.args['num_shards'] 29 | self.target_model = self.args['target_model'] 30 | 31 | self.determine_data_path() 32 | 33 | def determine_data_path(self): 34 | embedding_name = '_'.join(('embedding', self._extract_embedding_method(self.partition_method), 35 | str(self.args['ratio_deleted_edges']))) 36 | 37 | community_name = '_'.join(('community', self.partition_method, str(self.num_shards), 38 | str(self.args['ratio_deleted_edges']))) 39 | shard_name = '_'.join(('shard_data', self.partition_method, str(self.num_shards), 40 | str(self.args['shard_size_delta']), str(self.args['ratio_deleted_edges']))) 41 | target_model_name = '_'.join((self.target_model, self.partition_method, str(self.num_shards), 42 | str(self.args['shard_size_delta']), str(self.args['ratio_deleted_edges']))) 43 | optimal_weight_name = '_'.join((self.target_model, self.partition_method, str(self.num_shards), 44 | str(self.args['shard_size_delta']), str(self.args['ratio_deleted_edges']))) 45 | 46 | processed_data_prefix = config.PROCESSED_DATA_PATH + self.dataset_name + "/" 47 | self.train_test_split_file = processed_data_prefix + "train_test_split" + str(self.args['test_ratio']) 48 | self.train_data_file = processed_data_prefix + "train_data" 49 | self.train_graph_file = processed_data_prefix + "train_graph" 50 | self.embedding_file = processed_data_prefix + embedding_name 51 | self.community_file = processed_data_prefix + community_name 52 | self.shard_file = processed_data_prefix + shard_name 53 | self.unlearned_file = processed_data_prefix+ '_'.join(('unlearned', str(self.args['num_unlearned_nodes']))) 54 | 55 | self.target_model_file = config.MODEL_PATH + self.dataset_name + '/' + target_model_name 56 | self.optimal_weight_file = config.ANALYSIS_PATH + 'optimal/' + self.dataset_name + '/' + optimal_weight_name 57 | self.posteriors_file = config.ANALYSIS_PATH + 'posteriors/' + self.dataset_name + '/' + target_model_name 58 | 59 | dir_lists = [s + self.dataset_name for s in [config.PROCESSED_DATA_PATH, 60 | 
config.MODEL_PATH, 61 | config.ANALYSIS_PATH + 'optimal/', 62 | config.ANALYSIS_PATH + 'posteriors/']] 63 | for dir in dir_lists: 64 | self._check_and_create_dirs(dir) 65 | 66 | def _check_and_create_dirs(self, folder): 67 | if not os.path.exists(folder): 68 | try: 69 | self.logger.info("checking directory %s", folder) 70 | os.makedirs(folder, exist_ok=True) 71 | self.logger.info("new directory %s created", folder) 72 | except OSError as error: 73 | self.logger.info("deleting old and creating new empty %s", folder) 74 | shutil.rmtree(folder) 75 | os.mkdir(folder) 76 | self.logger.info("new empty directory %s created", folder) 77 | else: 78 | self.logger.info("folder %s exists, do not need to create again.", folder) 79 | 80 | def load_raw_data(self): 81 | self.logger.info('loading raw data') 82 | if not self.args['is_use_node_feature']: 83 | self.transform = T.Compose([ 84 | T.OneHotDegree(-2, cat=False) # use only node degree as node feature. 85 | ]) 86 | else: 87 | self.transform = None 88 | 89 | if self.dataset_name in ["cora", "pubmed", "citeseer"]: 90 | dataset = Planetoid(config.RAW_DATA_PATH, self.dataset_name, transform=T.NormalizeFeatures()) 91 | labels = np.unique(dataset.data.y.numpy()) 92 | elif self.dataset_name in ["Coauthor_CS", "Coauthor_Phys"]: 93 | if self.dataset_name == "Coauthor_Phys": 94 | dataset = Coauthor(config.RAW_DATA_PATH, name="Physics", pre_transform=self.transform) 95 | else: 96 | dataset = Coauthor(config.RAW_DATA_PATH, name="CS", pre_transform=self.transform) 97 | else: 98 | raise Exception('unsupported dataset') 99 | 100 | data = dataset[0] 101 | 102 | return data 103 | 104 | def save_train_data(self, train_data): 105 | self.logger.info('saving train data') 106 | pickle.dump(train_data, open(self.train_data_file, 'wb')) 107 | 108 | def load_train_data(self): 109 | self.logger.info('loading train data') 110 | return pickle.load(open(self.train_data_file, 'rb')) 111 | 112 | def save_train_graph(self, train_data): 113 | self.logger.info('saving train graph') 114 | pickle.dump(train_data, open(self.train_graph_file, 'wb')) 115 | 116 | def load_train_graph(self): 117 | self.logger.info('loading train graph') 118 | return pickle.load(open(self.train_graph_file, 'rb')) 119 | 120 | def save_train_test_split(self, train_indices, test_indices): 121 | self.logger.info('saving train test split data') 122 | pickle.dump((train_indices, test_indices), open(self.train_test_split_file, 'wb')) 123 | 124 | def load_train_test_split(self): 125 | self.logger.info('loading train test split data') 126 | return pickle.load(open(self.train_test_split_file, 'rb')) 127 | 128 | def save_embeddings(self, embeddings): 129 | self.logger.info('saving embedding data') 130 | pickle.dump(embeddings, open(self.embedding_file, 'wb')) 131 | 132 | def load_embeddings(self): 133 | self.logger.info('loading embedding data') 134 | return pickle.load(open(self.embedding_file, 'rb')) 135 | 136 | def save_community_data(self, community_to_node, suffix=''): 137 | self.logger.info('saving community data') 138 | pickle.dump(community_to_node, open(self.community_file + suffix, 'wb')) 139 | 140 | def load_community_data(self, suffix=''): 141 | self.logger.info('loading community data from: %s'%(self.community_file + suffix)) 142 | return pickle.load(open(self.community_file + suffix, 'rb')) 143 | 144 | def c2n_to_n2c(self, community_to_node): 145 | node_list = [] 146 | for i in range(self.num_shards): 147 | node_list.extend(list(community_to_node.values())[i]) 148 | node_to_community = {} 149 | 150 | for 
comm, nodes in dict(community_to_node).items(): 151 | for node in nodes: 152 | # Map node id back to original graph 153 | # node_to_community[node_list[node]] = comm 154 | node_to_community[node] = comm 155 | 156 | return node_to_community 157 | 158 | def save_shard_data(self, shard_data): 159 | self.logger.info('saving shard data') 160 | pickle.dump(shard_data, open(self.shard_file, 'wb')) 161 | 162 | def load_shard_data(self): 163 | self.logger.info('loading shard data') 164 | return pickle.load(open(self.shard_file, 'rb')) 165 | 166 | def load_unlearned_data(self, suffix): 167 | file_path = '_'.join((self.unlearned_file, suffix)) 168 | self.logger.info('loading unlearned data from %s' % file_path) 169 | return pickle.load(open(file_path, 'rb')) 170 | 171 | def save_unlearned_data(self, data, suffix): 172 | self.logger.info('saving unlearned data %s' % suffix) 173 | pickle.dump(data, open('_'.join((self.unlearned_file, suffix)), 'wb')) 174 | 175 | def save_target_model(self, run, model, shard, suffix=''): 176 | if self.args["exp"] in ["node_edge_unlearning", "attack_unlearning"]: 177 | model_path = '_'.join((self.target_model_file, str(shard), str(run), str(self.args['num_unlearned_nodes']))) + suffix 178 | model.save_model(model_path) 179 | else: 180 | model.save_model(self.target_model_file + '_' + str(shard) + '_' + str(run)) 181 | # model.save_model(self.target_model_file + '_' + str(shard)) 182 | 183 | def load_target_model(self, run, model, shard, suffix=''): 184 | if self.args["exp"] == "node_edge_unlearning": 185 | model.load_model( 186 | '_'.join((self.target_model_file, str(shard), str(run), str(self.args['num_unlearned_nodes'])))) 187 | elif self.args["exp"] == "attack_unlearning": 188 | model_path = '_'.join((self.target_model_file, str(shard), str(run), str(self.args['num_unlearned_nodes']))) + suffix 189 | print("loading target model from:" + model_path) 190 | device = torch.device('cpu') 191 | model.load_model(model_path) 192 | model.device=device 193 | else: 194 | # model.load_model(self.target_model_file + '_' + str(shard) + '_' + str(run)) 195 | model.load_model(self.target_model_file + '_' + str(shard) + '_' + str(0)) 196 | 197 | def save_optimal_weight(self, weight, run): 198 | torch.save(weight, self.optimal_weight_file + '_' + str(run)) 199 | 200 | def load_optimal_weight(self, run): 201 | return torch.load(self.optimal_weight_file + '_' + str(run)) 202 | 203 | def save_posteriors(self, posteriors, run, suffix=''): 204 | torch.save(posteriors, self.posteriors_file + '_' + str(run) + suffix) 205 | 206 | def load_posteriors(self, run): 207 | return torch.load(self.posteriors_file + '_' + str(run)) 208 | 209 | def _extract_embedding_method(self, partition_method): 210 | return partition_method.split('_')[0] 211 | -------------------------------------------------------------------------------- /lib_gnn_model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MinChen00/Graph-Unlearning/a9b942d01651c2e3d780ae12e1a1459e35120ffa/lib_gnn_model/__init__.py -------------------------------------------------------------------------------- /lib_gnn_model/gat/gat.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | import torch 5 | import torch.nn.functional as F 6 | import torch_geometric.transforms as T 7 | from torch_geometric.datasets import Planetoid 8 | 9 | import config 10 | from lib_gnn_model.gnn_base import GNNBase 11 | from 
lib_gnn_model.gat.gat_net import GATNet 12 | 13 | 14 | class GAT(GNNBase): 15 | def __init__(self, num_feats, num_classes, data=None): 16 | super(GAT, self).__init__() 17 | self.logger = logging.getLogger('gat') 18 | 19 | self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 20 | self.model = GATNet(num_feats, num_classes) 21 | self.data = data 22 | 23 | def train_model(self, num_epoch=100): 24 | self.model.train() 25 | self.model.reset_parameters() 26 | self.model, self.data = self.model.to(self.device), self.data.to(self.device) 27 | 28 | optimizer = torch.optim.Adam(self.model.parameters(), lr=0.005, weight_decay=0.0001) 29 | 30 | for epoch in range(num_epoch): 31 | self.logger.info('epoch %s' % (epoch,)) 32 | 33 | optimizer.zero_grad() 34 | output = self.model(self.data)[self.data.train_mask] 35 | loss = F.nll_loss(output, self.data.y[self.data.train_mask]) 36 | loss.backward() 37 | optimizer.step() 38 | 39 | train_acc, test_acc = self.evaluate_model() 40 | self.logger.info('train acc: %s, test acc: %s' % (train_acc, test_acc)) 41 | 42 | def evaluate_model(self): 43 | self.model.eval() 44 | # self.model, self.data = self.model.to(self.device), self.data.to(self.device) 45 | 46 | logits, accs = self.model(self.data), [] 47 | 48 | for _, mask in self.data('train_mask', 'test_mask'): 49 | pred = logits[mask].max(1)[1] 50 | acc = pred.eq(self.data.y[mask]).sum().item() / mask.sum().item() 51 | accs.append(acc) 52 | 53 | return accs 54 | 55 | 56 | if __name__ == '__main__': 57 | os.chdir('../../') 58 | 59 | output_file = None 60 | logging.basicConfig(filename=output_file, 61 | format='%(levelname)s:%(asctime)s: - %(name)s - : %(message)s', 62 | level=logging.DEBUG) 63 | 64 | dataset_name = 'cora' 65 | dataset = Planetoid(config.RAW_DATA_PATH, dataset_name, transform=T.NormalizeFeatures()) 66 | data = dataset[0] 67 | 68 | gat = GAT(dataset.num_features, dataset.num_classes, data) 69 | gat.train_model() 70 | # gat.evaluate_model() 71 | -------------------------------------------------------------------------------- /lib_gnn_model/gat/gat_net.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch_geometric.nn import GATConv 4 | 5 | 6 | class GATNet(torch.nn.Module): 7 | def __init__(self, num_feats, num_classes, dropout=0.6): 8 | super(GATNet, self).__init__() 9 | self.dropout = dropout 10 | 11 | self.conv1 = GATConv(num_feats, 8, heads=8, dropout=self.dropout, add_self_loops=False) 12 | # On the Pubmed dataset, use heads=8 in conv2. 
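# conv1 concatenates 8 attention heads of 8 hidden units each, so conv2 receives a
# 64-dimensional (8 * 8) input. The default output layer uses a single head; the
# commented-out alternative below averages 8 heads (concat=False), the configuration
# the GAT paper recommends for Pubmed.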
13 | self.conv2 = GATConv(8 * 8, num_classes, heads=1, concat=False, dropout=self.dropout, add_self_loops=False) 14 | # self.conv2 = GATConv(8 * 8, num_classes, heads=8, concat=False, dropout=self.dropout, add_self_loops=False) 15 | 16 | self.reset_parameters() 17 | 18 | def forward(self, data): 19 | x = F.dropout(data.x, p=self.dropout, training=self.training) 20 | x = F.elu(self.conv1(x, data.edge_index)) 21 | x = F.dropout(x, p=self.dropout, training=self.training) 22 | x = self.conv2(x, data.edge_index) 23 | 24 | return F.log_softmax(x, dim=1) 25 | 26 | def reset_parameters(self): 27 | self.conv1.reset_parameters() 28 | self.conv2.reset_parameters() 29 | -------------------------------------------------------------------------------- /lib_gnn_model/gcn/gcn.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | 4 | import torch 5 | import torch.nn.functional as F 6 | import torch_geometric.transforms as T 7 | from torch_geometric.datasets import Planetoid 8 | 9 | from lib_gnn_model.gnn_base import GNNBase 10 | from lib_gnn_model.gcn.gcn_net import GCNNet 11 | import config 12 | 13 | 14 | class GCN(GNNBase): 15 | def __init__(self, num_feats, num_classes, data=None): 16 | super(GCN, self).__init__() 17 | self.logger = logging.getLogger('gcn') 18 | 19 | self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 20 | self.model = GCNNet(num_feats, num_classes) 21 | self.data = data 22 | 23 | def train_model(self, num_epoch=100): 24 | self.model.train() 25 | self.model.reset_parameters() 26 | self.model, self.data = self.model.to(self.device), self.data.to(self.device) 27 | 28 | optimizer = torch.optim.Adam(self.model.parameters(), lr=0.01) 29 | 30 | for epoch in range(num_epoch): 31 | self.logger.info('epoch %s' % (epoch,)) 32 | 33 | optimizer.zero_grad() 34 | output = self.model(self.data)[self.data.train_mask] 35 | loss = F.nll_loss(output, self.data.y[self.data.train_mask]) 36 | loss.backward() 37 | optimizer.step() 38 | 39 | train_acc, test_acc = self.evaluate_model() 40 | self.logger.info('train acc: %s, test acc: %s' % (train_acc, test_acc)) 41 | 42 | def evaluate_model(self): 43 | self.model.eval() 44 | self.model, self.data = self.model.to(self.device), self.data.to(self.device) 45 | 46 | logits, accs = self.model(self.data), [] 47 | 48 | for _, mask in self.data('train_mask', 'test_mask'): 49 | pred = logits[mask].max(1)[1] 50 | acc = pred.eq(self.data.y[mask]).sum().item() / mask.sum().item() 51 | accs.append(acc) 52 | 53 | return accs 54 | 55 | 56 | if __name__ == '__main__': 57 | os.chdir('../../') 58 | 59 | output_file = None 60 | logging.basicConfig(filename=output_file, 61 | format='%(levelname)s:%(asctime)s: - %(name)s - : %(message)s', 62 | level=logging.DEBUG) 63 | 64 | dataset_name = 'cora' 65 | dataset = Planetoid(config.RAW_DATA_PATH, dataset_name, transform=T.NormalizeFeatures()) 66 | data = dataset[0] 67 | 68 | gcn = GCN(dataset.num_features, dataset.num_classes, data) 69 | gcn.train_model() -------------------------------------------------------------------------------- /lib_gnn_model/gcn/gcn_net.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch_geometric.nn import GCNConv 4 | 5 | 6 | class GCNNet(torch.nn.Module): 7 | def __init__(self, num_feats, num_classes): 8 | super(GCNNet, self).__init__() 9 | 10 | self.conv1 = GCNConv(num_feats, 16, cached=True, add_self_loops=False) 11 | 
self.conv2 = GCNConv(16, num_classes, cached=True, add_self_loops=False) 12 | 13 | def forward(self, data): 14 | x, edge_index, edge_weight = data.x, data.edge_index, data.edge_attr 15 | x = F.relu(self.conv1(x, edge_index, edge_weight)) 16 | x = F.dropout(x, training=self.training) 17 | x = self.conv2(x, edge_index, edge_weight) 18 | 19 | return F.log_softmax(x, dim=-1) 20 | 21 | def reset_parameters(self): 22 | self.conv1.reset_parameters() 23 | self.conv2.reset_parameters() 24 | -------------------------------------------------------------------------------- /lib_gnn_model/gin/gin.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | 4 | import torch 5 | import torch.nn.functional as F 6 | import torch_geometric.transforms as T 7 | from torch_geometric.datasets import Planetoid, Reddit 8 | 9 | from lib_gnn_model.gnn_base import GNNBase 10 | from lib_gnn_model.gin.gin_net import GINNet 11 | import config 12 | 13 | 14 | class GIN(GNNBase): 15 | def __init__(self, num_feats, num_classes, data=None): 16 | super(GIN, self).__init__() 17 | self.logger = logging.getLogger('gin') 18 | 19 | self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 20 | self.model = GINNet(num_feats, num_classes).to(self.device) 21 | self.data = data 22 | 23 | def train_model(self, num_epochs=100): 24 | self.model.train() 25 | self.model.reset_parameters() 26 | self.model, self.data = self.model.to(self.device), self.data.to(self.device) 27 | 28 | optimizer = torch.optim.Adam(self.model.parameters(), lr=0.01) 29 | 30 | for epoch in range(num_epochs): 31 | self.logger.info('epoch %s' % (epoch,)) 32 | 33 | optimizer.zero_grad() 34 | output = self.model(self.data)[self.data.train_mask] 35 | loss = F.nll_loss(output, self.data.y[self.data.train_mask]) 36 | # loss = F.nll_loss(output, self.data.y.squeeze(1)[self.data.train_mask]) 37 | loss.backward() 38 | optimizer.step() 39 | 40 | train_acc, test_acc = self.evaluate_model() 41 | self.logger.info('train acc: %s, test acc: %s' % (train_acc, test_acc)) 42 | 43 | def evaluate_model(self): 44 | self.model.eval() 45 | self.model, self.data = self.model.to(self.device), self.data.to(self.device) 46 | 47 | logits, accs = self.model(self.data), [] 48 | 49 | for _, mask in self.data('train_mask', 'test_mask'): 50 | pred = logits[mask].max(1)[1] 51 | acc = pred.eq(self.data.y[mask]).sum().item() / mask.sum().item() 52 | accs.append(acc) 53 | 54 | return accs 55 | 56 | 57 | if __name__ == '__main__': 58 | os.chdir('../../') 59 | 60 | output_file = None 61 | logging.basicConfig(filename=output_file, 62 | format='%(levelname)s:%(asctime)s: - %(name)s - : %(message)s', 63 | level=logging.DEBUG) 64 | 65 | dataset_name = 'citeseer' 66 | dataset = Planetoid(config.RAW_DATA_PATH, dataset_name, transform=T.NormalizeFeatures()) 67 | data = dataset[0] 68 | 69 | gin = GIN(dataset.num_features, dataset.num_classes, data) 70 | gin.train_model() 71 | -------------------------------------------------------------------------------- /lib_gnn_model/gin/gin_net.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch.nn import Sequential, Linear, ReLU 4 | from torch_geometric.nn import GINConv 5 | 6 | 7 | class GINNet(torch.nn.Module): 8 | def __init__(self, num_feats, num_classes): 9 | super(GINNet, self).__init__() 10 | 11 | dim = 32 12 | 13 | nn1 = Sequential(Linear(num_feats, dim), ReLU(), Linear(dim, dim)) 14 | 
self.conv1 = GINConv(nn1) 15 | self.bn1 = torch.nn.BatchNorm1d(dim) 16 | 17 | nn2 = Sequential(Linear(dim, dim), ReLU(), Linear(dim, dim)) 18 | self.conv2 = GINConv(nn2) 19 | self.bn2 = torch.nn.BatchNorm1d(dim) 20 | 21 | nn3 = Sequential(Linear(dim, dim), ReLU(), Linear(dim, dim)) 22 | self.conv3 = GINConv(nn3) 23 | self.bn3 = torch.nn.BatchNorm1d(dim) 24 | 25 | nn4 = Sequential(Linear(dim, dim), ReLU(), Linear(dim, dim)) 26 | self.conv4 = GINConv(nn4) 27 | self.bn4 = torch.nn.BatchNorm1d(dim) 28 | 29 | nn5 = Sequential(Linear(dim, dim), ReLU(), Linear(dim, dim)) 30 | self.conv5 = GINConv(nn5) 31 | self.bn5 = torch.nn.BatchNorm1d(dim) 32 | 33 | self.fc1 = Linear(dim, dim) 34 | self.fc2 = Linear(dim, num_classes) 35 | 36 | def forward(self, data, batch=None): 37 | x = F.relu(self.conv1(data.x, data.edge_index)) 38 | x = self.bn1(x) 39 | x = F.relu(self.conv2(x, data.edge_index)) 40 | x = self.bn2(x) 41 | x = F.relu(self.fc1(x)) 42 | x = F.dropout(x, p=0.5, training=self.training) 43 | x = self.fc2(x) 44 | 45 | return F.log_softmax(x, dim=1) 46 | 47 | def reset_parameters(self): 48 | self.conv1.reset_parameters() 49 | self.conv2.reset_parameters() 50 | -------------------------------------------------------------------------------- /lib_gnn_model/gnn_base.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pickle 3 | 4 | import torch 5 | 6 | 7 | class GNNBase: 8 | def __init__(self): 9 | self.logger = logging.getLogger('gnn') 10 | 11 | self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 12 | # self.device = torch.device('cpu') 13 | self.model = None 14 | self.embedding_dim = 0 15 | self.data = None 16 | self.subgraph_loader = None 17 | 18 | def save_model(self, save_path): 19 | self.logger.info('saving model') 20 | torch.save(self.model.state_dict(), save_path) 21 | 22 | def load_model(self, save_path): 23 | self.logger.info('loading model') 24 | device = torch.device('cpu') 25 | self.model.load_state_dict(torch.load(save_path, map_location=device)) 26 | 27 | def save_paras(self, save_path): 28 | self.logger.info('saving paras') 29 | self.paras = { 30 | 'embedding_dim': self.embedding_dim 31 | } 32 | pickle.dump(self.paras, open(save_path, 'wb')) 33 | 34 | def load_paras(self, save_path): 35 | self.logger.info('loading paras') 36 | return pickle.load(open(save_path, 'rb')) 37 | 38 | def count_parameters(self): 39 | return sum(p.numel() for p in self.model.parameters() if p.requires_grad) 40 | 41 | def posterior(self): 42 | self.model.eval() 43 | self.model = self.model.to(self.device) 44 | self.data = self.data.to(self.device) 45 | 46 | posteriors = self.model(self.data) 47 | for _, mask in self.data('test_mask'): 48 | posteriors = posteriors[mask] 49 | 50 | return posteriors.detach() 51 | -------------------------------------------------------------------------------- /lib_gnn_model/graphsage/graphsage.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | 4 | import torch 5 | import torch.nn.functional as F 6 | import torch_geometric.transforms as T 7 | from torch_geometric.datasets import Planetoid 8 | from torch_geometric.data import NeighborSampler 9 | 10 | from lib_gnn_model.graphsage.graphsage_net import SageNet 11 | from lib_gnn_model.gnn_base import GNNBase 12 | import config 13 | 14 | 15 | class SAGE(GNNBase): 16 | def __init__(self, num_feats, num_classes, data=None): 17 | super(SAGE, self).__init__() 18 | self.logger = 
logging.getLogger('graphsage') 19 | 20 | self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 21 | # self.device = torch.device('cpu') 22 | self.model = SageNet(num_feats, 256, num_classes).to(self.device) 23 | self.data = data 24 | 25 | def train_model(self, num_epochs=100): 26 | self.model.train() 27 | self.model.reset_parameters() 28 | self.model, self.data = self.model.to(self.device), self.data.to(self.device) 29 | self.data.y = self.data.y.squeeze().to(self.device) 30 | self._gen_train_loader() 31 | 32 | optimizer = torch.optim.Adam(self.model.parameters(), lr=0.01, weight_decay=0.001) 33 | 34 | for epoch in range(num_epochs): 35 | self.logger.info('epoch %s' % (epoch,)) 36 | 37 | for batch_size, n_id, adjs in self.train_loader: 38 | # `adjs` holds a list of `(edge_index, e_id, size)` tuples. 39 | adjs = [adj.to(self.device) for adj in adjs] 40 | 41 | optimizer.zero_grad() 42 | out = self.model(self.data.x[n_id], adjs) 43 | loss = F.nll_loss(out, self.data.y[n_id[:batch_size]]) 44 | loss.backward() 45 | optimizer.step() 46 | 47 | train_acc, test_acc = self.evaluate_model() 48 | self.logger.info(f'Train: {train_acc:.4f}, Test: {test_acc:.4f}') 49 | 50 | @torch.no_grad() 51 | def evaluate_model(self): 52 | self.model.eval() 53 | self.model, self.data = self.model.to(self.device), self.data.to(self.device) 54 | self._gen_subgraph_loader() 55 | 56 | out = self.model.inference(self.data.x, self.subgraph_loader, self.device) 57 | y_true = self.data.y.cpu().unsqueeze(-1) 58 | y_pred = out.argmax(dim=-1, keepdim=True) 59 | 60 | results = [] 61 | for mask in [self.data.train_mask, self.data.test_mask]: 62 | results += [int(y_pred[mask].eq(y_true[mask]).sum()) / int(mask.sum())] 63 | 64 | return results 65 | 66 | def posterior(self): 67 | self.model.eval() 68 | self.model, self.data = self.model.to(self.device), self.data.to(self.device) 69 | self._gen_subgraph_loader() 70 | 71 | posteriors = self.model.inference(self.data.x, self.subgraph_loader, self.device) 72 | 73 | for _, mask in self.data('test_mask'): 74 | posteriors = F.log_softmax(posteriors[mask], dim=-1) 75 | 76 | return posteriors.detach() 77 | 78 | def generate_embeddings(self): 79 | self.model.eval() 80 | self.model, self.data = self.model.to(self.device), self.data.to(self.device) 81 | self._gen_subgraph_loader() 82 | 83 | logits = self.model.inference(self.data.x, self.subgraph_loader, self.device) 84 | return logits 85 | 86 | def _gen_train_loader(self): 87 | if self.data.edge_index.shape[1] == 0: 88 | self.data.edge_index = torch.tensor([[1, 2], [2, 1]]) 89 | self.train_loader = NeighborSampler(self.data.edge_index, node_idx=self.data.train_mask, 90 | # sizes=[25, 10], batch_size=128, shuffle=True, 91 | # sizes=[25, 10], num_nodes=self.data.num_nodes, 92 | sizes=[10, 10], num_nodes=self.data.num_nodes, 93 | # sizes=[5, 5], num_nodes=self.data.num_nodes, 94 | # batch_size=128, shuffle=True, 95 | batch_size=64, shuffle=True, 96 | num_workers=0) 97 | 98 | def _gen_subgraph_loader(self): 99 | self.subgraph_loader = NeighborSampler(self.data.edge_index, node_idx=None, 100 | # sizes=[-1], num_nodes=self.data.num_nodes, 101 | sizes=[10], num_nodes=self.data.num_nodes, 102 | # batch_size=128, shuffle=False, 103 | batch_size=64, shuffle=False, 104 | num_workers=0) 105 | 106 | 107 | if __name__ == '__main__': 108 | os.chdir('../../') 109 | 110 | output_file = None 111 | logging.basicConfig(filename=output_file, 112 | format='%(levelname)s:%(asctime)s: - %(name)s - : %(message)s', 113 | level=logging.DEBUG) 114 | 
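# The smoke test below downloads Cora into config.RAW_DATA_PATH on first use and
# runs the mini-batch training loop defined above; once train_model() returns,
# the same object also exposes graphsage.posterior() for test-node posteriors and
# graphsage.generate_embeddings() for full-node embeddings.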
115 | dataset_name = 'cora' 116 | dataset = Planetoid(config.RAW_DATA_PATH, dataset_name, transform=T.NormalizeFeatures()) 117 | data = dataset[0] 118 | 119 | graphsage = SAGE(dataset.num_features, dataset.num_classes, data) 120 | graphsage.train_model() 121 | -------------------------------------------------------------------------------- /lib_gnn_model/graphsage/graphsage_net.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch_geometric.nn import SAGEConv 4 | 5 | 6 | class SageNet(torch.nn.Module): 7 | def __init__(self, in_channels, hidden_channels, out_channels): 8 | super(SageNet, self).__init__() 9 | 10 | self.num_layers = 2 11 | 12 | self.convs = torch.nn.ModuleList() 13 | self.convs.append(SAGEConv(in_channels, hidden_channels)) 14 | self.convs.append(SAGEConv(hidden_channels, out_channels)) 15 | 16 | def forward(self, x, adjs): 17 | # `train_loader` computes the k-hop neighborhood of a batch of nodes, 18 | # and returns, for each layer, a bipartite graph object, holding the 19 | # bipartite edges `edge_index`, the index `e_id` of the original edges, 20 | # and the size/shape `size` of the bipartite graph. 21 | # Target nodes are also included in the source nodes so that one can 22 | # easily apply skip-connections or add self-loops. 23 | for i, (edge_index, _, size) in enumerate(adjs): 24 | x_target = x[:size[1]] # Target nodes are always placed first. 25 | x = self.convs[i]((x, x_target), edge_index) 26 | 27 | if i != self.num_layers - 1: 28 | x = F.relu(x) 29 | x = F.dropout(x, p=0.5, training=self.training) 30 | 31 | return F.log_softmax(x, dim=-1) 32 | 33 | def inference(self, x_all, subgraph_loader, device): 34 | # Compute representations of nodes layer by layer, using *all* 35 | # available edges. This leads to faster computation in contrast to 36 | # immediately computing the final representations of each batch. 
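# Concretely: each of the two SAGE layers is evaluated exactly once over all
# nodes (batched by subgraph_loader), and the per-batch outputs are collected on
# CPU in xs before being concatenated into x_all for the next layer. This avoids
# re-expanding multi-hop sampled neighborhoods for every batch at test time.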
37 | for i in range(self.num_layers): 38 | xs = [] 39 | 40 | for batch_size, n_id, adj in subgraph_loader: 41 | edge_index, _, size = adj.to(device) 42 | x = x_all[n_id].to(device) 43 | x_target = x[:size[1]] 44 | x = self.convs[i]((x, x_target), edge_index) 45 | if i != self.num_layers - 1: 46 | x = F.relu(x) 47 | xs.append(x.cpu()) 48 | 49 | x_all = torch.cat(xs, dim=0) 50 | 51 | return x_all 52 | 53 | def reset_parameters(self): 54 | for i in range(self.num_layers): 55 | self.convs[i].reset_parameters() 56 | -------------------------------------------------------------------------------- /lib_gnn_model/mlp/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MinChen00/Graph-Unlearning/a9b942d01651c2e3d780ae12e1a1459e35120ffa/lib_gnn_model/mlp/__init__.py -------------------------------------------------------------------------------- /lib_gnn_model/mlp/mlp.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | 4 | import torch 5 | import torch.nn.functional as F 6 | import torch_geometric.transforms as T 7 | from torch_geometric.datasets import Planetoid 8 | 9 | from lib_gnn_model.gnn_base import GNNBase 10 | from lib_gnn_model.mlp.mlpnet import MLPNet 11 | import config 12 | 13 | 14 | class MLP(GNNBase): 15 | def __init__(self, num_feats, num_classes, data=None): 16 | super(MLP, self).__init__() 17 | 18 | self.logger = logging.getLogger(__name__) 19 | 20 | self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 21 | self.model = MLPNet(num_feats, num_classes) 22 | self.data = data 23 | 24 | def train_model(self, num_epoch=100): 25 | self.model.train() 26 | self.model, self.data = self.model.to(self.device), self.data.to(self.device) 27 | 28 | optimizer = torch.optim.Adam(self.model.parameters(), lr=0.01) 29 | 30 | for epoch in range(num_epoch): 31 | self.logger.info('epoch %s' % (epoch,)) 32 | 33 | optimizer.zero_grad() 34 | output = self.model(self.data.x)[self.data.train_mask] 35 | # loss = F.nll_loss(output, self.data.y[self.data.train_mask]) 36 | loss = torch.nn.CrossEntropyLoss(output, self.data.y[self.data.train_mask].squeeze()) 37 | loss.backward() 38 | optimizer.step() 39 | 40 | train_acc, test_acc = self.evaluate_model() 41 | self.logger.info('train acc: %s, test acc: %s' % (train_acc, test_acc)) 42 | 43 | def evaluate_model(self): 44 | self.model.eval() 45 | self.model, self.data = self.model.to(self.device), self.data.to(self.device) 46 | 47 | logits, accs = self.model(self.data.x), [] 48 | 49 | for _, mask in self.data('train_mask', 'test_mask'): 50 | pred = logits[mask].max(1)[1] 51 | acc = pred.eq(self.data.y[mask]).sum().item() / mask.sum().item() 52 | accs.append(acc) 53 | 54 | return accs 55 | 56 | def posterior(self): 57 | self.model.eval() 58 | posteriors = self.model(self.data.x) 59 | for _, mask in self.data('test_mask'): 60 | posteriors = posteriors[mask] 61 | 62 | return posteriors 63 | 64 | 65 | if __name__ == '__main__': 66 | os.chdir('../../') 67 | 68 | output_file = None 69 | logging.basicConfig(filename=output_file, 70 | format='%(levelname)s:%(asctime)s: - %(name)s - : %(message)s', 71 | level=logging.DEBUG) 72 | 73 | dataset_name = 'Cora' 74 | dataset = Planetoid(config.RAW_DATA_PATH + dataset_name, dataset_name, transform=T.NormalizeFeatures()) 75 | data = dataset[0] 76 | 77 | gcn = MLP(dataset.num_features, dataset.num_classes, data) 78 | gcn.train_model() 
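# Caveat for train_model() above: torch.nn.CrossEntropyLoss(output, ...) passes
# the logits and targets to the loss *constructor*, which returns a module rather
# than a loss tensor, so loss.backward() cannot work as written. A minimal fix
# (using the functional API already imported as F) would be:
#
#     loss = F.cross_entropy(output, self.data.y[self.data.train_mask].squeeze())
#
# Since MLPNet.forward applies softmax, well-behaved training would also require
# the network to return raw logits (for F.cross_entropy) or log-probabilities via
# log_softmax (for the commented-out F.nll_loss variant).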
-------------------------------------------------------------------------------- /lib_gnn_model/mlp/mlpnet.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | import torch.nn.functional as F 3 | 4 | 5 | class MLPNet(nn.Module): 6 | def __init__(self, input_size, num_classes): 7 | super(MLPNet, self).__init__() 8 | self.xent = nn.CrossEntropyLoss() 9 | 10 | self.layers = nn.Sequential( 11 | nn.Linear(input_size, 250), 12 | nn.Linear(250, 100), 13 | nn.Linear(100, num_classes) 14 | ) 15 | 16 | def forward(self, x): 17 | x = x.view(x.size(0), -1) 18 | x = self.layers(x) 19 | return F.softmax(x, dim=1) 20 | 21 | def loss(self, nodes, labels): 22 | scores = self.forward(nodes) 23 | return self.xent(scores, labels.squeeze()) 24 | 25 | def reset_parameters(self): 26 | return 0 27 | -------------------------------------------------------------------------------- /lib_gnn_model/node_classifier.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | import torch 5 | from sklearn.model_selection import train_test_split 6 | 7 | torch.cuda.empty_cache() 8 | import torch.nn.functional as F 9 | import torch_geometric.transforms as T 10 | from torch_geometric.datasets import Planetoid 11 | from torch_geometric.data import NeighborSampler 12 | from torch_geometric.nn.conv.gcn_conv import gcn_norm 13 | import numpy as np 14 | 15 | import config 16 | from lib_gnn_model.gat.gat_net_batch import GATNet 17 | from lib_gnn_model.gin.gin_net_batch import GINNet 18 | from lib_gnn_model.gcn.gcn_net_batch import GCNNet 19 | from lib_gnn_model.graphsage.graphsage_net import SageNet 20 | from lib_gnn_model.gnn_base import GNNBase 21 | from parameter_parser import parameter_parser 22 | from lib_utils import utils 23 | 24 | 25 | class NodeClassifier(GNNBase): 26 | def __init__(self, num_feats, num_classes, args, data=None): 27 | super(NodeClassifier, self).__init__() 28 | 29 | self.args = args 30 | self.logger = logging.getLogger('node_classifier') 31 | self.target_model = args['target_model'] 32 | 33 | self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 34 | # self.device = 'cpu' 35 | self.model = self.determine_model(num_feats, num_classes).to(self.device) 36 | self.data = data 37 | 38 | def determine_model(self, num_feats, num_classes): 39 | self.logger.info('target model: %s' % (self.args['target_model'],)) 40 | 41 | if self.target_model == 'SAGE': 42 | self.lr, self.decay = 0.01, 0.001 43 | return SageNet(num_feats, 256, num_classes) 44 | elif self.target_model == 'GAT': 45 | self.lr, self.decay = 0.01, 0.001 46 | return GATNet(num_feats, num_classes) 47 | elif self.target_model == 'GCN': 48 | self.lr, self.decay = 0.05, 0.0001 49 | return GCNNet(num_feats, num_classes) 50 | elif self.target_model == 'GIN': 51 | self.lr, self.decay = 0.01, 0.0001 52 | return GINNet(num_feats, num_classes) 53 | else: 54 | raise Exception('unsupported target model') 55 | 56 | def train_model(self): 57 | self.logger.info("training model") 58 | self.model.train() 59 | self.model.reset_parameters() 60 | self.model, self.data = self.model.to(self.device), self.data.to(self.device) 61 | self.data.y = self.data.y.squeeze().to(self.device) 62 | self._gen_train_loader() 63 | 64 | optimizer = torch.optim.Adam(self.model.parameters(), lr=self.lr, weight_decay=self.decay) 65 | 66 | for epoch in range(self.args['num_epochs']): 67 | self.logger.info('epoch %s' % (epoch,)) 68 | 69 | for 
batch_size, n_id, adjs in self.train_loader: 70 | # self.logger.info("batch size: %s"%(batch_size)) 71 | # `adjs` holds a list of `(edge_index, e_id, size)` tuples. 72 | adjs = [adj.to(self.device) for adj in adjs] 73 | 74 | test_node = np.nonzero(self.data.test_mask.cpu().numpy())[0] 75 | intersect = np.intersect1d(test_node, n_id.numpy()) 76 | 77 | optimizer.zero_grad() 78 | 79 | if self.target_model == 'GCN': 80 | out = self.model(self.data.x[n_id], adjs, self.edge_weight) 81 | else: 82 | out = self.model(self.data.x[n_id], adjs) 83 | 84 | loss = F.nll_loss(out, self.data.y[n_id[:batch_size]]) 85 | loss.backward() 86 | optimizer.step() 87 | 88 | train_acc, test_acc = self.evaluate_model() 89 | self.logger.info(f'Train: {train_acc:.4f}, Test: {test_acc:.4f}') 90 | 91 | @torch.no_grad() 92 | def evaluate_model(self): 93 | self.model.eval() 94 | self.model, self.data = self.model.to(self.device), self.data.to(self.device) 95 | self._gen_test_loader() 96 | 97 | if self.target_model == 'GCN': 98 | out = self.model.inference(self.data.x, self.test_loader, self.edge_weight, self.device) 99 | else: 100 | out = self.model.inference(self.data.x, self.test_loader, self.device) 101 | 102 | y_true = self.data.y.cpu().unsqueeze(-1) 103 | y_pred = out.argmax(dim=-1, keepdim=True) 104 | 105 | results = [] 106 | for mask in [self.data.train_mask, self.data.test_mask]: 107 | results += [int(y_pred[mask].eq(y_true[mask]).sum()) / int(mask.sum())] 108 | 109 | return results 110 | 111 | def posterior(self): 112 | self.logger.debug("generating posteriors") 113 | self.model, self.data = self.model.to(self.device), self.data.to(self.device) 114 | self.model.eval() 115 | 116 | self._gen_test_loader() 117 | if self.target_model == 'GCN': 118 | posteriors = self.model.inference(self.data.x, self.test_loader, self.edge_weight, self.device) 119 | else: 120 | posteriors = self.model.inference(self.data.x, self.test_loader, self.device) 121 | 122 | for _, mask in self.data('test_mask'): 123 | posteriors = F.log_softmax(posteriors[mask], dim=-1) 124 | 125 | return posteriors.detach() 126 | 127 | def generate_embeddings(self): 128 | self.model.eval() 129 | self.model, self.data = self.model.to(self.device), self.data.to(self.device) 130 | self._gen_test_loader() 131 | 132 | if self.target_model == 'GCN': 133 | logits = self.model.inference(self.data.x, self.test_loader, self.edge_weight, self.device) 134 | else: 135 | logits = self.model.inference(self.data.x, self.test_loader, self.device) 136 | return logits 137 | 138 | def _gen_train_loader(self): 139 | self.logger.info("generate train loader") 140 | train_indices = np.nonzero(self.data.train_mask.cpu().numpy())[0] 141 | edge_index = utils.filter_edge_index(self.data.edge_index, train_indices, reindex=False) 142 | if edge_index.shape[1] == 0: 143 | edge_index = torch.tensor([[1, 2], [2, 1]]) 144 | 145 | self.train_loader = NeighborSampler( 146 | edge_index, node_idx=self.data.train_mask, 147 | sizes=[5, 5], num_nodes=self.data.num_nodes, 148 | batch_size=self.args['batch_size'], shuffle=True, 149 | num_workers=0) 150 | 151 | if self.target_model == 'GCN': 152 | _, self.edge_weight = gcn_norm(self.data.edge_index, edge_weight=None, num_nodes=self.data.x.shape[0], 153 | add_self_loops=False) 154 | 155 | self.logger.info("generate train loader finish") 156 | 157 | def _gen_test_loader(self): 158 | test_indices = np.nonzero(self.data.train_mask.cpu().numpy())[0] 159 | 160 | if not self.args['use_test_neighbors']: 161 | edge_index = 
utils.filter_edge_index(self.data.edge_index, test_indices, reindex=False) 162 | else: 163 | edge_index = self.data.edge_index 164 | 165 | if edge_index.shape[1] == 0: 166 | edge_index = torch.tensor([[1, 3], [3, 1]]) 167 | 168 | self.test_loader = NeighborSampler( 169 | edge_index, node_idx=None, 170 | sizes=[-1], num_nodes=self.data.num_nodes, 171 | # sizes=[5], num_nodes=self.data.num_nodes, 172 | batch_size=self.args['test_batch_size'], shuffle=False, 173 | num_workers=0) 174 | 175 | if self.target_model == 'GCN': 176 | _, self.edge_weight = gcn_norm(self.data.edge_index, edge_weight=None, num_nodes=self.data.x.shape[0], 177 | add_self_loops=False) 178 | 179 | 180 | if __name__ == '__main__': 181 | os.chdir('../') 182 | args = parameter_parser() 183 | 184 | output_file = None 185 | logging.basicConfig(filename=output_file, 186 | format='%(levelname)s:%(asctime)s: - %(name)s - : %(message)s', 187 | level=logging.DEBUG) 188 | 189 | dataset_name = 'cora' 190 | dataset = Planetoid(config.RAW_DATA_PATH, dataset_name, transform=T.NormalizeFeatures()) 191 | data = dataset[0] 192 | 193 | train_indices, test_indices = train_test_split(np.arange((data.num_nodes)), test_size=0.2, random_state=100) 194 | data.train_mask, data.test_mask = torch.zeros(data.num_nodes, dtype=torch.bool), torch.zeros(data.num_nodes, 195 | dtype=torch.bool) 196 | data.train_mask[train_indices] = True 197 | data.test_mask[test_indices] = True 198 | 199 | graphsage = NodeClassifier(dataset.num_features, dataset.num_classes, args, data) 200 | graphsage.train_model() 201 | -------------------------------------------------------------------------------- /lib_graph_partition/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MinChen00/Graph-Unlearning/a9b942d01651c2e3d780ae12e1a1459e35120ffa/lib_graph_partition/__init__.py -------------------------------------------------------------------------------- /lib_graph_partition/constrained_kmeans.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import copy 3 | 4 | from tqdm import tqdm 5 | 6 | import numpy as np 7 | import cupy as np 8 | 9 | 10 | class ConstrainedKmeans: 11 | def __init__(self, data_feat, num_clusters, node_threshold, terminate_delta, max_iteration=20): 12 | self.logger = logging.getLogger('constrained_kmeans') 13 | 14 | self.data_feat = data_feat 15 | self.num_clusters = num_clusters 16 | self.node_threshold = node_threshold 17 | self.terminate_delta = terminate_delta 18 | self.max_iteration = max_iteration 19 | 20 | def initialization(self): 21 | centroids = np.random.choice(np.arange(self.data_feat.shape[0]), self.num_clusters, replace=False) 22 | self.centroid = {} 23 | for i in range(self.num_clusters): 24 | self.centroid[i] = self.data_feat[centroids[i].get()] 25 | 26 | def clustering(self): 27 | centroid = copy.deepcopy(self.centroid) 28 | km_delta = [] 29 | 30 | pbar = tqdm(total=self.max_iteration) 31 | pbar.set_description('Clustering') 32 | 33 | for i in range(self.max_iteration): 34 | self.logger.info('iteration %s' % (i,)) 35 | 36 | self._node_reassignment() 37 | self._centroid_updating() 38 | 39 | # record the average change of centroids, if the change is smaller than a very small value, then terminate 40 | delta = self._centroid_delta(centroid, self.centroid) 41 | km_delta.append(delta) 42 | centroid = copy.deepcopy(self.centroid) 43 | 44 | if delta <= self.terminate_delta: 45 | break 46 | self.logger.info("delta: 
%s" % delta) 47 | pbar.close() 48 | return self.clusters, km_delta 49 | 50 | def _node_reassignment(self): 51 | self.clusters = {} 52 | for i in range(self.num_clusters): 53 | self.clusters[i] = np.zeros(0, dtype=np.uint64) 54 | 55 | distance = np.zeros([self.num_clusters, self.data_feat.shape[0]]) 56 | 57 | for i in range(self.num_clusters): 58 | distance[i] = np.sum(np.power((self.data_feat - self.centroid[i]), 2), axis=1) 59 | 60 | sort_indices = np.unravel_index(np.argsort(distance, axis=None), distance.shape) 61 | clusters = sort_indices[0] 62 | users = sort_indices[1] 63 | selected_nodes = np.zeros(0, dtype=np.int64) 64 | counter = 0 65 | 66 | while len(selected_nodes) < self.data_feat.shape[0]: 67 | cluster = int(clusters[counter]) 68 | user = users[counter] 69 | if self.clusters[cluster].size < self.node_threshold: 70 | self.clusters[cluster] = np.append(self.clusters[cluster], np.array(int(user))) 71 | selected_nodes = np.append(selected_nodes, np.array(int(user))) 72 | 73 | # delete all the following pairs for the selected user 74 | user_indices = np.where(users == user)[0] 75 | a = np.arange(users.size) 76 | b = user_indices[user_indices > counter] 77 | remain_indices = a[np.where(np.logical_not(np.isin(a, b)))[0]] 78 | clusters = clusters[remain_indices] 79 | users = users[remain_indices] 80 | 81 | counter += 1 82 | 83 | def _centroid_updating(self): 84 | for i in range(self.num_clusters): 85 | self.centroid[i] = np.mean(self.data_feat[self.clusters[i].astype(int)], axis=0) 86 | 87 | def _centroid_delta(self, centroid_pre, centroid_cur): 88 | delta = 0.0 89 | for i in range(len(centroid_cur)): 90 | delta += np.sum(np.abs(centroid_cur[i] - centroid_pre[i])) 91 | 92 | return delta 93 | 94 | 95 | if __name__ == '__main__': 96 | output_file = None 97 | logging.basicConfig(filename=output_file, 98 | format='%(levelname)s:%(asctime)s: - %(name)s - : %(message)s', 99 | level=logging.DEBUG) 100 | 101 | data_feat = np.array([[1, 2], 102 | [1, 3], 103 | [1, 4], 104 | [1, 5], 105 | [10, 2], 106 | [10, 3]]) 107 | num_clusters = 2 108 | node_threshold = 3 109 | terminate_delta = 0.001 110 | 111 | cluster = ConstrainedKmeans(data_feat, num_clusters, node_threshold, terminate_delta) 112 | cluster.initialization() 113 | cluster.clustering() -------------------------------------------------------------------------------- /lib_graph_partition/constrained_kmeans_base.py: -------------------------------------------------------------------------------- 1 | # An implementation of ``Balanced K-Means for Clustering.'' (https://rdcu.be/cESzk) 2 | import logging 3 | import copy 4 | 5 | import numpy as np 6 | import seaborn as sns 7 | import matplotlib.pyplot as plt 8 | from munkres import Munkres 9 | from lib_graph_partition.hungarian import Hungarian 10 | from lib_graph_partition.hungarian_1 import KMMatcher 11 | 12 | 13 | class ConstrainedKmeansBase: 14 | def __init__(self, data_feat, num_clusters, node_threshold, terminate_delta, max_iteration=20): 15 | self.logger = logging.getLogger('constrained_kmeans_base') 16 | 17 | self.data_feat = data_feat 18 | self.num_clusters = num_clusters 19 | self.node_threshold = node_threshold 20 | self.terminate_delta = terminate_delta 21 | self.max_iteration = max_iteration 22 | 23 | def initialization(self): 24 | centroids = np.random.choice(np.arange(self.data_feat.shape[0]), self.num_clusters, replace=False) 25 | self.centroid = dict(zip(range(self.num_clusters), self.data_feat[centroids])) 26 | 27 | def clustering(self): 28 | centroid = 
copy.deepcopy(self.centroid) 29 | centroid_delta = {} 30 | km_base_delta = [] 31 | 32 | for i in range(self.max_iteration): 33 | self.logger.info('iteration %s' % (i)) 34 | self._node_reassignment() 35 | self._centroid_updating() 36 | 37 | # record the average change of centroids, if the change is smaller than a very small value, then terminate 38 | delta = self._centroid_delta(centroid, self.centroid) 39 | centroid_delta[i] = delta 40 | km_base_delta.append(delta) 41 | centroid = copy.deepcopy(self.centroid) 42 | 43 | if delta <= self.terminate_delta: 44 | break 45 | self.logger.info("delta: %s" % delta) 46 | 47 | return self.clusters, km_base_delta 48 | 49 | def _node_reassignment(self): 50 | self.logger.info('Node reassignment begins') 51 | self.clusters = dict( 52 | zip(np.arange(self.num_clusters), [np.zeros(0, dtype=np.uint64) for _ in range(self.num_clusters)])) 53 | 54 | distance = np.zeros([self.num_clusters, self.data_feat.shape[0]]) 55 | # cost_matrix = np.zeros([self.data_feat.shape[0], self.data_feat.shape[0]]) 56 | for i in range(self.num_clusters): 57 | distance[i] = np.sum((self.data_feat - self.centroid[i]) ** 2, axis=1) 58 | cost_matrix = np.tile(distance, (self.data_feat.shape[0], 1)) 59 | cost_matrix = cost_matrix[:self.data_feat.shape[0], :] 60 | 61 | # too slow 62 | # matrix = np.array(cost_matrix) 63 | # m = Munkres() 64 | # assignment = m.compute(matrix) 65 | # assignment = np.array(assignment) 66 | # assignment = assignment[:, 1] 67 | 68 | # hungarian = Hungarian(cost_matrix) 69 | # hungarian.calculate() 70 | # assignment = hungarian.get_results() 71 | # assignment = np.array(assignment) 72 | # assignment = assignment[np.argsort(assignment[:, 0])] 73 | # assignment = assignment[:, 1] 74 | 75 | matcher = KMMatcher(cost_matrix) 76 | assignment, _ = matcher.solve() 77 | 78 | partition = np.zeros(self.data_feat.shape[0]) 79 | for i in range(self.data_feat.shape[0]): 80 | partition[assignment[i]] = i % self.num_clusters 81 | 82 | for i in range(self.num_clusters): 83 | self.clusters[i] = np.where(partition == i)[0] 84 | 85 | def _centroid_updating(self): 86 | self.logger.info('Updating centroid begins') 87 | for i in range(self.num_clusters): 88 | self.centroid[i] = np.mean(self.data_feat[self.clusters[i]], axis=0) 89 | 90 | def _centroid_delta(self, centroid_pre, centroid_cur): 91 | delta = 0.0 92 | for i in range(len(centroid_cur)): 93 | delta += np.sum(np.abs(centroid_cur[i] - centroid_pre[i])) 94 | 95 | return delta 96 | 97 | 98 | if __name__ == '__main__': 99 | output_file = None 100 | logging.basicConfig(filename=output_file, 101 | format='%(levelname)s:%(asctime)s: - %(name)s - : %(message)s', 102 | level=logging.DEBUG) 103 | 104 | data_feat = np.array([[1, 2], 105 | [1, 3], 106 | [1, 4], 107 | [1, 5], 108 | [10, 2], 109 | [10, 3]]) 110 | num_clusters = 2 111 | node_threshold = 3 112 | terminate_delta = 0.001 113 | 114 | cluster = ConstrainedKmeansBase(data_feat, num_clusters, node_threshold, terminate_delta) 115 | cluster.initialization() 116 | cluster.clustering() 117 | -------------------------------------------------------------------------------- /lib_graph_partition/constrained_lpa.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import logging 3 | from collections import defaultdict 4 | 5 | import numpy as np 6 | 7 | 8 | class ConstrainedLPA: 9 | def __init__(self, adj, num_communities, node_threshold, terminate_delta): 10 | self.logger = logging.getLogger('constrained_lpa_single') 11 | 12 | self.adj 
= adj 13 | self.num_nodes = adj.shape[0] 14 | self.num_communities = num_communities 15 | self.node_threshold = node_threshold 16 | self.terminate_delta = terminate_delta 17 | 18 | def initialization(self): 19 | self.logger.info('initializing communities') 20 | 21 | random_nodes = np.arange(self.num_nodes) 22 | np.random.shuffle(random_nodes) 23 | self.communities = defaultdict(set) 24 | self.node_community = np.zeros(self.adj.shape[0]) 25 | 26 | # each node use node is as its community label 27 | for community, nodes in enumerate(np.array_split(random_nodes, self.num_communities)): 28 | self.communities[community] = set(nodes) 29 | self.node_community[nodes] = community 30 | 31 | def community_detection(self, iterations=100): 32 | self.logger.info('detecting communities') 33 | 34 | communities = copy.deepcopy(self.communities) 35 | lpa_deltas = [] 36 | 37 | # Currently, break when maximum iterations round achieves. 38 | for i in range(iterations): 39 | self.logger.info('iteration %s' % (i,)) 40 | 41 | desire_move = self._determine_desire_move() 42 | sort_indices = np.flip(np.argsort(desire_move[:, 2])) 43 | candidate_nodes = defaultdict(list) 44 | 45 | # allocate nodes' community with descending order of colocate count 46 | for node in sort_indices: 47 | src_community = desire_move[node][0] 48 | dst_community = desire_move[node][1] 49 | 50 | if src_community != dst_community: 51 | if len(self.communities[dst_community]) < self.node_threshold: 52 | self.node_community[node] = dst_community 53 | self.communities[dst_community].add(node) 54 | self.communities[src_community].remove(node) 55 | 56 | # reallocate the candidate nodes 57 | candidate_nodes_cur = candidate_nodes[src_community] 58 | while len(candidate_nodes_cur) != 0: 59 | node_cur = candidate_nodes_cur[0] 60 | src_community_cur = desire_move[node_cur][0] 61 | dst_community_cur = desire_move[node_cur][1] 62 | 63 | self.node_community[node_cur] = dst_community_cur 64 | self.communities[dst_community_cur].add(node_cur) 65 | self.communities[src_community_cur].remove(node_cur) 66 | 67 | candidate_nodes[dst_community_cur].pop(0) 68 | candidate_nodes_cur = candidate_nodes[src_community_cur] 69 | else: 70 | candidate_nodes[dst_community].append(node) 71 | # record the communities of each iteration, break the loop while communities are stable. 
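# _lpa_delta below sums, per community, the size of the symmetric difference
# between the previous and current membership sets; a node that switches
# community therefore contributes 2 to delta (it leaves one set and joins
# another). The loop terminates once delta drops to terminate_delta or below.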
72 | 73 | delta = self._lpa_delta(communities, self.communities) 74 | lpa_deltas.append(delta) 75 | self.logger.info("%d" % delta) 76 | communities = copy.deepcopy(self.communities) 77 | if delta <= self.terminate_delta: 78 | break 79 | 80 | return self.communities, lpa_deltas 81 | 82 | def _determine_desire_move(self): 83 | desire_move = np.zeros([self.num_nodes, 3]) 84 | desire_move[:, 0] = self.node_community 85 | 86 | for i in range(self.num_nodes): 87 | # neighbor_community = self.node_community[np.nonzero(self.adj[i])[0]] # for non-bool adj 88 | neighbor_community = self.node_community[self.adj[i]] # for bool adj 89 | unique_community, unique_count = np.unique(neighbor_community, return_counts=True) 90 | if unique_community.shape[0] == 0: 91 | continue 92 | max_indices = np.where(unique_count == np.max(unique_count))[0] 93 | 94 | if max_indices.size == 1: 95 | desire_move[i, 1] = unique_community[max_indices] 96 | desire_move[i, 2] = unique_count[max_indices] 97 | elif max_indices.size > 1: 98 | max_index = np.random.choice(max_indices) 99 | desire_move[i, 1] = unique_community[max_index] 100 | desire_move[i, 2] = unique_count[max_index] 101 | 102 | return desire_move 103 | 104 | def _lpa_delta(self, lpa_pre, lpa_cur): 105 | delta = 0.0 106 | for i in range(len(lpa_cur)): 107 | delta += len((lpa_cur[i] | lpa_pre[i]) - (lpa_cur[i] & lpa_pre[i])) 108 | 109 | return delta 110 | 111 | 112 | if __name__ == '__main__': 113 | output_file = None 114 | logging.basicConfig(filename=output_file, 115 | format='%(levelname)s:%(asctime)s: - %(name)s - : %(message)s', 116 | level=logging.DEBUG) 117 | 118 | adj = np.array([[0, 1, 1], 119 | [1, 0, 1], 120 | [1, 1, 0]], 121 | dtype=np.bool) 122 | 123 | num_communities = 2 124 | node_threshold = 3 125 | terminate_delta = 1 126 | 127 | lpa = ConstrainedLPA(adj, num_communities, node_threshold, terminate_delta) 128 | 129 | lpa.initialization() 130 | lpa.community_detection() 131 | -------------------------------------------------------------------------------- /lib_graph_partition/constrained_lpa_base.py: -------------------------------------------------------------------------------- 1 | # An implementation of `` Balanced Label Propagation for Partitioning MassiveGraphs'' (https://stanford.edu/~jugander/papers/wsdm13-blp.pdf) 2 | 3 | import copy 4 | import logging 5 | from collections import defaultdict 6 | 7 | import numpy as np 8 | import cvxpy as cp 9 | from scipy.stats import linregress 10 | 11 | 12 | class ConstrainedLPABase: 13 | def __init__(self, adj, num_communities, node_threshold, terminate_delta): 14 | self.logger = logging.getLogger('constrained_lpa_base') 15 | 16 | self.adj = adj 17 | self.num_nodes = adj.shape[0] 18 | self.num_communities = num_communities 19 | self.node_threshold = node_threshold 20 | self.terminate_delta = terminate_delta 21 | 22 | def initialization(self): 23 | self.logger.info('initializing communities') 24 | 25 | random_nodes = np.arange(self.num_nodes) 26 | np.random.shuffle(random_nodes) 27 | self.communities = defaultdict(set) 28 | self.node_community = np.zeros(self.adj.shape[0]) 29 | 30 | # each node use node is as its community label 31 | for community, nodes in enumerate(np.array_split(random_nodes, self.num_communities)): 32 | self.communities[community] = set(nodes) 33 | self.node_community[nodes] = community 34 | 35 | def community_detection(self, iterations=100): 36 | self.logger.info('detecting communities') 37 | 38 | communities = copy.deepcopy(self.communities) 39 | lpa_deltas = [] 40 | 41 | for i in 
range(iterations): 42 | self.logger.info('iteration %s' % (i,)) 43 | 44 | ## Step 1: calculate desired move 45 | desire_move = self._determine_desire_move() 46 | relocation = {} 47 | utility_func = {} 48 | 49 | ## Step 2: calculate parameters for linear programming problem 50 | for src_community in range(self.num_communities): 51 | for dst_community in range(self.num_communities): 52 | move_node = desire_move[np.where(np.logical_and(desire_move[:, 1] == src_community, desire_move[:, 2] == dst_community))[0]] 53 | 54 | if src_community != dst_community and move_node.size != 0: 55 | move_node = move_node[np.flip(np.argsort(move_node[:, 3]))] 56 | relocation[(src_community, dst_community)] = move_node 57 | 58 | if move_node.shape[0] == 1: 59 | utility_func[(src_community, dst_community)] = np.array([[0, move_node[0, 3]]]) 60 | else: 61 | cum_sum = np.cumsum(move_node[:, 3]) 62 | utility_func_temp = np.zeros([move_node.shape[0] - 1, 2]) 63 | for k in range(move_node.shape[0] - 1): 64 | utility_func_temp[k, 0], utility_func_temp[k, 1], _, _, _ = linregress([k, k+1], [cum_sum[k], cum_sum[k+1]]) 65 | utility_func[(src_community, dst_community)] = utility_func_temp 66 | 67 | ## Step 3: solve linear programming problem 68 | x = cp.Variable([self.num_communities, self.num_communities]) 69 | z = cp.Variable([self.num_communities, self.num_communities]) 70 | 71 | objective = cp.Maximize(cp.sum(z)) 72 | constraints = [] 73 | for src_community in range(self.num_communities): 74 | const = 0 75 | for dst_community in range(self.num_communities): 76 | if (src_community, dst_community) in relocation: 77 | if src_community == dst_community: 78 | constraints.append(x[src_community, dst_community] == 0) 79 | constraints.append(z[src_community, dst_community] == 0) 80 | else: 81 | ## Constraint 2 of Theorem 2 82 | constraints.append(x[src_community, dst_community] >= 0) 83 | constraints.append(x[src_community, dst_community] <= relocation[(src_community, dst_community)].shape[0]) 84 | 85 | ## Constraint 1 of Theorem 2 86 | if (dst_community, src_community) in relocation: 87 | const += x[src_community, dst_community] - x[dst_community, src_community] 88 | 89 | ## Constraint 3 of Theorem 2 90 | for utility_func_value in utility_func[(src_community, dst_community)]: 91 | constraints.append(- utility_func_value[0] * x[src_community, dst_community] + z[src_community, dst_community] <= utility_func_value[1]) 92 | 93 | else: 94 | constraints.append(x[src_community, dst_community] == 0) 95 | constraints.append(z[src_community, dst_community] == 0) 96 | 97 | ## Constraint 1 of Theorem 2 98 | constraints.append(len(self.communities[src_community]) + const <= self.node_threshold) 99 | 100 | problem = cp.Problem(objective, constraints) 101 | problem.solve() 102 | 103 | ## Step 4: parse linear programming problem results 104 | if problem.status == 'optimal': 105 | x_value = np.floor(np.abs(x.value)).astype(np.int64) 106 | for src_community in range(self.num_communities): 107 | for dst_community in range(self.num_communities): 108 | if (src_community, dst_community) in relocation and x_value[src_community, dst_community] != 0: 109 | # if (src_community, dst_community) in relocation: 110 | relocation_temp = relocation[(src_community, dst_community)][:, 0].astype(np.int64) 111 | move_node = relocation_temp[:x_value[src_community, dst_community] - 1] 112 | if isinstance(move_node, np.int64): 113 | self.communities[src_community].remove(move_node) 114 | self.communities[dst_community].add(move_node) 115 | 
self.node_community[move_node] = dst_community 116 | else: 117 | # move_node = set(move_node) 118 | self.communities[src_community].difference_update(move_node) 119 | self.communities[dst_community].update(move_node) 120 | for node in move_node: 121 | self.node_community[node] = dst_community 122 | else: 123 | self.logger.info("No optimal solution, break!") 124 | break 125 | 126 | ## Check the number of moved nodes 127 | delta = self._lpa_delta(communities, self.communities) 128 | lpa_deltas.append(delta) 129 | self.logger.info("%d" % delta) 130 | communities = copy.deepcopy(self.communities) 131 | if delta <= self.terminate_delta: 132 | break 133 | 134 | return self.communities, lpa_deltas 135 | 136 | def _determine_desire_move(self): 137 | desire_move = [] 138 | 139 | for i in range(self.num_nodes): 140 | # neighbor_community = self.node_community[np.nonzero(self.adj[i])[0]] # for non-bool adj 141 | neighbor_community = self.node_community[self.adj[i]] # for bool adj 142 | unique_community, unique_count = np.unique(neighbor_community, return_counts=True) 143 | 144 | src_relocation = unique_count[np.where(unique_community == self.node_community[i])[0]] 145 | for community in unique_community: 146 | if community != self.node_community[i]: 147 | dst_relocation = unique_count[np.where(unique_community == community)[0]] 148 | if dst_relocation - src_relocation >= 0: 149 | desire_move_temp = np.zeros(4) 150 | desire_move_temp[0] = i 151 | desire_move_temp[1] = self.node_community[i] 152 | desire_move_temp[2] = community 153 | desire_move_temp[3] = dst_relocation - src_relocation 154 | 155 | desire_move.append(desire_move_temp) 156 | 157 | return np.stack(desire_move) 158 | 159 | def _lpa_delta(self, lpa_pre, lpa_cur): 160 | delta = 0.0 161 | for i in range(len(lpa_cur)): 162 | delta += len((lpa_cur[i] | lpa_pre[i]) - (lpa_cur[i] & lpa_pre[i])) 163 | 164 | return delta 165 | 166 | 167 | if __name__ == '__main__': 168 | output_file = None 169 | logging.basicConfig(filename=output_file, 170 | format='%(levelname)s:%(asctime)s: - %(name)s - : %(message)s', 171 | level=logging.DEBUG) 172 | 173 | adj = np.array([[0, 1, 1], 174 | [1, 0, 1], 175 | [1, 1, 0]], 176 | dtype=np.bool) 177 | 178 | num_communities = 2 179 | node_threshold = 3 180 | terminate_delta = 1 181 | 182 | lpa = ConstrainedLPABase(adj, num_communities, node_threshold, terminate_delta) 183 | 184 | lpa.initialization() 185 | lpa.community_detection() 186 | -------------------------------------------------------------------------------- /lib_graph_partition/graph_partition.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from lib_graph_partition.partition_kmeans import PartitionKMeans 4 | from lib_graph_partition.partition_lpa import PartitionConstrainedLPA, PartitionLPA, PartitionConstrainedLPABase 5 | from lib_graph_partition.metis_partition import MetisPartition 6 | from lib_graph_partition.partition_random import PartitionRandom 7 | 8 | 9 | class GraphPartition: 10 | def __init__(self, args, graph, dataset=None): 11 | self.logger = logging.getLogger(__name__) 12 | 13 | self.args = args 14 | self.graph = graph 15 | self.dataset = dataset 16 | 17 | self.partition_method = self.args['partition_method'] 18 | self.num_shards = self.args['num_shards'] 19 | 20 | def graph_partition(self): 21 | self.logger.info('graph partition, method: %s' % self.partition_method) 22 | 23 | if self.partition_method == 'random': 24 | partition_method = PartitionRandom(self.args, self.graph) 25 | 
elif self.partition_method in ['sage_km', 'sage_km_base']: 26 | partition_method = PartitionKMeans(self.args, self.graph, self.dataset) 27 | elif self.partition_method == 'lpa' and not self.args['is_constrained']: 28 | partition_method = PartitionLPA(self.args, self.graph) 29 | elif self.partition_method == 'lpa' and self.args['is_constrained']: 30 | partition_method = PartitionConstrainedLPA(self.args, self.graph) 31 | elif self.partition_method == 'lpa_base': 32 | partition_method = PartitionConstrainedLPABase(self.args, self.graph) 33 | elif self.partition_method == 'metis': 34 | partition_method = MetisPartition(self.args, self.graph, self.dataset) 35 | else: 36 | raise Exception('Unsupported partition method') 37 | 38 | return partition_method.partition() 39 | -------------------------------------------------------------------------------- /lib_graph_partition/hungarian.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | """ 3 | Implementation of the Hungarian (Munkres) Algorithm using Python and NumPy 4 | References: http://www.ams.jhu.edu/~castello/362/Handouts/hungarian.pdf 5 | http://weber.ucsd.edu/~vcrawfor/hungar.pdf 6 | http://en.wikipedia.org/wiki/Hungarian_algorithm 7 | http://www.public.iastate.edu/~ddoty/HungarianAlgorithm.html 8 | http://www.clapper.org/software/python/munkres/ 9 | """ 10 | 11 | # Module Information. 12 | __version__ = "1.1.1" 13 | __author__ = "Thom Dedecko" 14 | __url__ = "http://github.com/tdedecko/hungarian-algorithm" 15 | __copyright__ = "(c) 2010 Thom Dedecko" 16 | __license__ = "MIT License" 17 | 18 | 19 | class HungarianError(Exception): 20 | pass 21 | 22 | # Import numpy. Error if fails 23 | try: 24 | import numpy as np 25 | except ImportError: 26 | raise HungarianError("NumPy is not installed.") 27 | 28 | 29 | class Hungarian: 30 | """ 31 | Implementation of the Hungarian (Munkres) Algorithm using np. 32 | Usage: 33 | hungarian = Hungarian(cost_matrix) 34 | hungarian.calculate() 35 | or 36 | hungarian = Hungarian() 37 | hungarian.calculate(cost_matrix) 38 | Handle Profit matrix: 39 | hungarian = Hungarian(profit_matrix, is_profit_matrix=True) 40 | or 41 | cost_matrix = Hungarian.make_cost_matrix(profit_matrix) 42 | The matrix will be automatically padded if it is not square. 43 | For that numpy's resize function is used, which automatically adds 0's to any row/column that is added 44 | Get results and total potential after calculation: 45 | hungarian.get_results() 46 | hungarian.get_total_potential() 47 | """ 48 | 49 | def __init__(self, input_matrix=None, is_profit_matrix=False): 50 | """ 51 | input_matrix is a List of Lists. 52 | input_matrix is assumed to be a cost matrix unless is_profit_matrix is True. 53 | """ 54 | if input_matrix is not None: 55 | # Save input 56 | my_matrix = np.array(input_matrix) 57 | self._input_matrix = np.array(input_matrix) 58 | self._maxColumn = my_matrix.shape[1] 59 | self._maxRow = my_matrix.shape[0] 60 | 61 | # Adds 0s if any columns/rows are added. 
Otherwise stays unaltered 62 | matrix_size = max(self._maxColumn, self._maxRow) 63 | pad_columns = matrix_size - self._maxRow 64 | pad_rows = matrix_size - self._maxColumn 65 | my_matrix = np.pad(my_matrix, ((0,pad_columns),(0,pad_rows)), 'constant', constant_values=(0)) 66 | 67 | # Convert matrix to profit matrix if necessary 68 | if is_profit_matrix: 69 | my_matrix = self.make_cost_matrix(my_matrix) 70 | 71 | self._cost_matrix = my_matrix 72 | self._size = len(my_matrix) 73 | self._shape = my_matrix.shape 74 | 75 | # Results from algorithm. 76 | self._results = [] 77 | self._totalPotential = 0 78 | else: 79 | self._cost_matrix = None 80 | 81 | def get_results(self): 82 | """Get results after calculation.""" 83 | return self._results 84 | 85 | def get_total_potential(self): 86 | """Returns expected value after calculation.""" 87 | return self._totalPotential 88 | 89 | def calculate(self, input_matrix=None, is_profit_matrix=False): 90 | """ 91 | Implementation of the Hungarian (Munkres) Algorithm. 92 | input_matrix is a List of Lists. 93 | input_matrix is assumed to be a cost matrix unless is_profit_matrix is True. 94 | """ 95 | # Handle invalid and new matrix inputs. 96 | if input_matrix is None and self._cost_matrix is None: 97 | raise HungarianError("Invalid input") 98 | elif input_matrix is not None: 99 | self.__init__(input_matrix, is_profit_matrix) 100 | 101 | result_matrix = self._cost_matrix.copy() 102 | 103 | # Step 1: Subtract row mins from each row. 104 | for index, row in enumerate(result_matrix): 105 | result_matrix[index] -= row.min() 106 | 107 | # Step 2: Subtract column mins from each column. 108 | for index, column in enumerate(result_matrix.T): 109 | result_matrix[:, index] -= column.min() 110 | 111 | # Step 3: Use minimum number of lines to cover all zeros in the matrix. 112 | # If the total covered rows+columns is not equal to the matrix size then adjust matrix and repeat. 113 | total_covered = 0 114 | while total_covered < self._size: 115 | # Find minimum number of lines to cover all zeros in the matrix and find total covered rows and columns. 116 | cover_zeros = CoverZeros(result_matrix) 117 | covered_rows = cover_zeros.get_covered_rows() 118 | covered_columns = cover_zeros.get_covered_columns() 119 | total_covered = len(covered_rows) + len(covered_columns) 120 | 121 | # if the total covered rows+columns is not equal to the matrix size then adjust it by min uncovered num (m). 122 | if total_covered < self._size: 123 | result_matrix = self._adjust_matrix_by_min_uncovered_num(result_matrix, covered_rows, covered_columns) 124 | 125 | # Step 4: Starting with the top row, work your way downwards as you make assignments. 126 | # Find single zeros in rows or columns. 127 | # Add them to final result and remove them and their associated row/column from the matrix. 128 | expected_results = min(self._maxColumn, self._maxRow) 129 | zero_locations = (result_matrix == 0) 130 | while len(self._results) != expected_results: 131 | 132 | # If number of zeros in the matrix is zero before finding all the results then an error has occurred. 133 | if not zero_locations.any(): 134 | raise HungarianError("Unable to find results. 
Algorithm has failed.") 135 | 136 | # Find results and mark rows and columns for deletion 137 | matched_rows, matched_columns = self.__find_matches(zero_locations) 138 | 139 | # Make arbitrary selection 140 | total_matched = len(matched_rows) + len(matched_columns) 141 | if total_matched == 0: 142 | matched_rows, matched_columns = self.select_arbitrary_match(zero_locations) 143 | 144 | # Delete rows and columns 145 | for row in matched_rows: 146 | zero_locations[row] = False 147 | for column in matched_columns: 148 | zero_locations[:, column] = False 149 | 150 | # Save Results 151 | self.__set_results(zip(matched_rows, matched_columns)) 152 | 153 | # Calculate total potential 154 | value = 0 155 | for row, column in self._results: 156 | value += self._input_matrix[row, column] 157 | self._totalPotential = value 158 | 159 | @staticmethod 160 | def make_cost_matrix(profit_matrix): 161 | """ 162 | Converts a profit matrix into a cost matrix. 163 | Expects NumPy objects as input. 164 | """ 165 | # subtract profit matrix from a matrix made of the max value of the profit matrix 166 | matrix_shape = profit_matrix.shape 167 | offset_matrix = np.ones(matrix_shape, dtype=int) * profit_matrix.max() 168 | cost_matrix = offset_matrix - profit_matrix 169 | return cost_matrix 170 | 171 | def _adjust_matrix_by_min_uncovered_num(self, result_matrix, covered_rows, covered_columns): 172 | """Subtract m from every uncovered number and add m to every element covered with two lines.""" 173 | # Calculate minimum uncovered number (m) 174 | elements = [] 175 | for row_index, row in enumerate(result_matrix): 176 | if row_index not in covered_rows: 177 | for index, element in enumerate(row): 178 | if index not in covered_columns: 179 | elements.append(element) 180 | min_uncovered_num = min(elements) 181 | 182 | # Add m to every covered element 183 | adjusted_matrix = result_matrix 184 | for row in covered_rows: 185 | adjusted_matrix[row] += min_uncovered_num 186 | for column in covered_columns: 187 | adjusted_matrix[:, column] += min_uncovered_num 188 | 189 | # Subtract m from every element 190 | m_matrix = np.ones(self._shape, dtype=int) * min_uncovered_num 191 | adjusted_matrix -= m_matrix 192 | 193 | return adjusted_matrix 194 | 195 | def __find_matches(self, zero_locations): 196 | """Returns rows and columns with matches in them.""" 197 | marked_rows = np.array([], dtype=int) 198 | marked_columns = np.array([], dtype=int) 199 | 200 | # Mark rows and columns with matches 201 | # Iterate over rows 202 | for index, row in enumerate(zero_locations): 203 | row_index = np.array([index]) 204 | if np.sum(row) == 1: 205 | column_index, = np.where(row) 206 | marked_rows, marked_columns = self.__mark_rows_and_columns(marked_rows, marked_columns, row_index, 207 | column_index) 208 | 209 | # Iterate over columns 210 | for index, column in enumerate(zero_locations.T): 211 | column_index = np.array([index]) 212 | if np.sum(column) == 1: 213 | row_index, = np.where(column) 214 | marked_rows, marked_columns = self.__mark_rows_and_columns(marked_rows, marked_columns, row_index, 215 | column_index) 216 | 217 | return marked_rows, marked_columns 218 | 219 | @staticmethod 220 | def __mark_rows_and_columns(marked_rows, marked_columns, row_index, column_index): 221 | """Check if column or row is marked. 
If not marked then mark it.""" 222 | new_marked_rows = marked_rows 223 | new_marked_columns = marked_columns 224 | if not (marked_rows == row_index).any() and not (marked_columns == column_index).any(): 225 | new_marked_rows = np.insert(marked_rows, len(marked_rows), row_index) 226 | new_marked_columns = np.insert(marked_columns, len(marked_columns), column_index) 227 | return new_marked_rows, new_marked_columns 228 | 229 | @staticmethod 230 | def select_arbitrary_match(zero_locations): 231 | """Selects row column combination with minimum number of zeros in it.""" 232 | # Count number of zeros in row and column combinations 233 | rows, columns = np.where(zero_locations) 234 | zero_count = [] 235 | for index, row in enumerate(rows): 236 | total_zeros = np.sum(zero_locations[row]) + np.sum(zero_locations[:, columns[index]]) 237 | zero_count.append(total_zeros) 238 | 239 | # Get the row column combination with the minimum number of zeros. 240 | indices = zero_count.index(min(zero_count)) 241 | row = np.array([rows[indices]]) 242 | column = np.array([columns[indices]]) 243 | 244 | return row, column 245 | 246 | def __set_results(self, result_lists): 247 | """Set results during calculation.""" 248 | # Check if results values are out of bound from input matrix (because of matrix being padded). 249 | # Add results to results list. 250 | for result in result_lists: 251 | row, column = result 252 | if row < self._maxRow and column < self._maxColumn: 253 | new_result = (int(row), int(column)) 254 | self._results.append(new_result) 255 | 256 | 257 | class CoverZeros: 258 | """ 259 | Use minimum number of lines to cover all zeros in the matrix. 260 | Algorithm based on: http://weber.ucsd.edu/~vcrawfor/hungar.pdf 261 | """ 262 | 263 | def __init__(self, matrix): 264 | """ 265 | Input a matrix and save it as a boolean matrix to designate zero locations. 266 | Run calculation procedure to generate results. 267 | """ 268 | # Find zeros in matrix 269 | self._zero_locations = (matrix == 0) 270 | self._shape = matrix.shape 271 | 272 | # Choices starts without any choices made. 273 | self._choices = np.zeros(self._shape, dtype=bool) 274 | 275 | self._marked_rows = [] 276 | self._marked_columns = [] 277 | 278 | # marks rows and columns 279 | self.__calculate() 280 | 281 | # Draw lines through all unmarked rows and all marked columns. 282 | self._covered_rows = list(set(range(self._shape[0])) - set(self._marked_rows)) 283 | self._covered_columns = self._marked_columns 284 | 285 | def get_covered_rows(self): 286 | """Return list of covered rows.""" 287 | return self._covered_rows 288 | 289 | def get_covered_columns(self): 290 | """Return list of covered columns.""" 291 | return self._covered_columns 292 | 293 | def __calculate(self): 294 | """ 295 | Calculates minimum number of lines necessary to cover all zeros in a matrix. 296 | Algorithm based on: http://weber.ucsd.edu/~vcrawfor/hungar.pdf 297 | """ 298 | while True: 299 | # Erase all marks. 300 | self._marked_rows = [] 301 | self._marked_columns = [] 302 | 303 | # Mark all rows in which no choice has been made. 304 | for index, row in enumerate(self._choices): 305 | if not row.any(): 306 | self._marked_rows.append(index) 307 | 308 | # If no marked rows then finish. 309 | if not self._marked_rows: 310 | return True 311 | 312 | # Mark all columns not already marked which have zeros in marked rows. 313 | num_marked_columns = self.__mark_new_columns_with_zeros_in_marked_rows() 314 | 315 | # If no new marked columns then finish. 
316 | if num_marked_columns == 0: 317 | return True 318 | 319 | # While there is some choice in every marked column. 320 | while self.__choice_in_all_marked_columns(): 321 | # Some Choice in every marked column. 322 | 323 | # Mark all rows not already marked which have choices in marked columns. 324 | num_marked_rows = self.__mark_new_rows_with_choices_in_marked_columns() 325 | 326 | # If no new marks then Finish. 327 | if num_marked_rows == 0: 328 | return True 329 | 330 | # Mark all columns not already marked which have zeros in marked rows. 331 | num_marked_columns = self.__mark_new_columns_with_zeros_in_marked_rows() 332 | 333 | # If no new marked columns then finish. 334 | if num_marked_columns == 0: 335 | return True 336 | 337 | # No choice in one or more marked columns. 338 | # Find a marked column that does not have a choice. 339 | choice_column_index = self.__find_marked_column_without_choice() 340 | 341 | while choice_column_index is not None: 342 | # Find a zero in the column indexed that does not have a row with a choice. 343 | choice_row_index = self.__find_row_without_choice(choice_column_index) 344 | 345 | # Check if an available row was found. 346 | new_choice_column_index = None 347 | if choice_row_index is None: 348 | # Find a good row to accomodate swap. Find its column pair. 349 | choice_row_index, new_choice_column_index = \ 350 | self.__find_best_choice_row_and_new_column(choice_column_index) 351 | 352 | # Delete old choice. 353 | self._choices[choice_row_index, new_choice_column_index] = False 354 | 355 | # Set zero to choice. 356 | self._choices[choice_row_index, choice_column_index] = True 357 | 358 | # Loop again if choice is added to a row with a choice already in it. 359 | choice_column_index = new_choice_column_index 360 | 361 | def __mark_new_columns_with_zeros_in_marked_rows(self): 362 | """Mark all columns not already marked which have zeros in marked rows.""" 363 | num_marked_columns = 0 364 | for index, column in enumerate(self._zero_locations.T): 365 | if index not in self._marked_columns: 366 | if column.any(): 367 | row_indices, = np.where(column) 368 | zeros_in_marked_rows = (set(self._marked_rows) & set(row_indices)) != set([]) 369 | if zeros_in_marked_rows: 370 | self._marked_columns.append(index) 371 | num_marked_columns += 1 372 | return num_marked_columns 373 | 374 | def __mark_new_rows_with_choices_in_marked_columns(self): 375 | """Mark all rows not already marked which have choices in marked columns.""" 376 | num_marked_rows = 0 377 | for index, row in enumerate(self._choices): 378 | if index not in self._marked_rows: 379 | if row.any(): 380 | column_index, = np.where(row) 381 | if column_index in self._marked_columns: 382 | self._marked_rows.append(index) 383 | num_marked_rows += 1 384 | return num_marked_rows 385 | 386 | def __choice_in_all_marked_columns(self): 387 | """Return Boolean True if there is a choice in all marked columns. Returns boolean False otherwise.""" 388 | for column_index in self._marked_columns: 389 | if not self._choices[:, column_index].any(): 390 | return False 391 | return True 392 | 393 | def __find_marked_column_without_choice(self): 394 | """Find a marked column that does not have a choice.""" 395 | for column_index in self._marked_columns: 396 | if not self._choices[:, column_index].any(): 397 | return column_index 398 | 399 | raise HungarianError( 400 | "Could not find a column without a choice. Failed to cover matrix zeros. 
Algorithm has failed.") 401 | 402 | def __find_row_without_choice(self, choice_column_index): 403 | """Find a row without a choice in it for the column indexed. If a row does not exist then return None.""" 404 | row_indices, = np.where(self._zero_locations[:, choice_column_index]) 405 | for row_index in row_indices: 406 | if not self._choices[row_index].any(): 407 | return row_index 408 | 409 | # All rows have choices. Return None. 410 | return None 411 | 412 | def __find_best_choice_row_and_new_column(self, choice_column_index): 413 | """ 414 | Find a row index to use for the choice so that the column that needs to be changed is optimal. 415 | Return a random row and column if unable to find an optimal selection. 416 | """ 417 | row_indices, = np.where(self._zero_locations[:, choice_column_index]) 418 | for row_index in row_indices: 419 | column_indices, = np.where(self._choices[row_index]) 420 | column_index = column_indices[0] 421 | if self.__find_row_without_choice(column_index) is not None: 422 | return row_index, column_index 423 | 424 | # Cannot find optimal row and column. Return a random row and column. 425 | from random import shuffle 426 | 427 | shuffle(row_indices) 428 | column_index, = np.where(self._choices[row_indices[0]]) 429 | return row_indices[0], column_index[0] 430 | 431 | 432 | if __name__ == '__main__': 433 | profit_matrix = [ 434 | [62, 75, 80, 93, 95, 97], 435 | [75, 80, 82, 85, 71, 97], 436 | [80, 75, 81, 98, 90, 97], 437 | [78, 82, 84, 80, 50, 98], 438 | [90, 85, 85, 80, 85, 99], 439 | [65, 75, 80, 75, 68, 96]] 440 | 441 | hungarian = Hungarian(profit_matrix, is_profit_matrix=True) 442 | hungarian.calculate() 443 | print("Expected value:\t\t543") 444 | print("Calculated value:\t", hungarian.get_total_potential()) # = 543 445 | print("Expected results:\n\t[(0, 4), (2, 3), (5, 5), (4, 0), (1, 1), (3, 2)]") 446 | print("Results:\n\t", hungarian.get_results()) 447 | print("-" * 80) 448 | 449 | cost_matrix = [ 450 | [4, 2, 8], 451 | [4, 3, 7], 452 | [3, 1, 6]] 453 | hungarian = Hungarian(cost_matrix) 454 | print('calculating...') 455 | hungarian.calculate() 456 | print("Expected value:\t\t12") 457 | print("Calculated value:\t", hungarian.get_total_potential()) # = 12 458 | print("Expected results:\n\t[(0, 1), (1, 0), (2, 2)]") 459 | print("Results:\n\t", hungarian.get_results()) 460 | print("-" * 80) 461 | 462 | profit_matrix = [ 463 | [62, 75, 80, 93, 0, 97], 464 | [75, 0, 82, 85, 71, 97], 465 | [80, 75, 81, 0, 90, 97], 466 | [78, 82, 0, 80, 50, 98], 467 | [0, 85, 85, 80, 85, 99], 468 | [65, 75, 80, 75, 68, 0]] 469 | hungarian = Hungarian() 470 | hungarian.calculate(profit_matrix, is_profit_matrix=True) 471 | print("Expected value:\t\t523") 472 | print("Calculated value:\t", hungarian.get_total_potential()) # = 523 473 | print("Expected results:\n\t[(0, 3), (2, 4), (3, 0), (5, 2), (1, 5), (4, 1)]") 474 | print("Results:\n\t", hungarian.get_results()) 475 | print("-" * 80) 476 | -------------------------------------------------------------------------------- /lib_graph_partition/hungarian_1.py: -------------------------------------------------------------------------------- 1 | ''' 2 | reference: https://www.topcoder.com/community/competitive-programming/tutorials/assignment-problem-and-hungarian-algorithm/ 3 | ''' 4 | 5 | import numpy as np 6 | 7 | #max weight assignment 8 | class KMMatcher: 9 | 10 | ## weights : nxm weight matrix (numpy , float), n <= m 11 | def __init__(self, weights): 12 | weights = np.array(weights).astype(np.float32) 13 | self.weights = weights 
14 | self.n, self.m = weights.shape 15 | assert self.n <= self.m 16 | # init label 17 | self.label_x = np.max(weights, axis=1) 18 | self.label_y = np.zeros((self.m, ), dtype=np.float32) 19 | 20 | self.max_match = 0 21 | self.xy = -np.ones((self.n,), dtype=int) 22 | self.yx = -np.ones((self.m,), dtype=int) 23 | 24 | def do_augment(self, x, y): # flip the augmenting path that ends at edge (x, y) 25 | self.max_match += 1 26 | while x != -2: 27 | self.yx[y] = x 28 | ty = self.xy[x] 29 | self.xy[x] = y 30 | x, y = self.prev[x], ty 31 | 32 | def find_augment_path(self): 33 | self.S = np.zeros((self.n,), dtype=bool) 34 | self.T = np.zeros((self.m,), dtype=bool) 35 | 36 | self.slack = np.zeros((self.m,), dtype=np.float32) 37 | self.slackyx = -np.ones((self.m,), dtype=int) # l[slackyx[y]] + l[y] - w[slackx[y], y] == slack[y] 38 | 39 | self.prev = -np.ones((self.n,), dtype=int) 40 | 41 | queue, st = [], 0 42 | root = -1 43 | 44 | for x in range(self.n): 45 | if self.xy[x] == -1: 46 | queue.append(x) 47 | root = x 48 | self.prev[x] = -2 49 | self.S[x] = True 50 | break 51 | 52 | self.slack = self.label_y + self.label_x[root] - self.weights[root] 53 | self.slackyx[:] = root 54 | 55 | while True: 56 | while st < len(queue): 57 | x = queue[st]; st += 1 58 | 59 | is_in_graph = np.isclose(self.weights[x], self.label_x[x] + self.label_y) 60 | nonzero_inds = np.nonzero(np.logical_and(is_in_graph, np.logical_not(self.T)))[0] 61 | 62 | for y in nonzero_inds: 63 | if self.yx[y] == -1: 64 | return x, y 65 | self.T[y] = True 66 | queue.append(self.yx[y]) 67 | self.add_to_tree(self.yx[y], x) 68 | 69 | self.update_labels() 70 | queue, st = [], 0 71 | is_in_graph = np.isclose(self.slack, 0) 72 | nonzero_inds = np.nonzero(np.logical_and(is_in_graph, np.logical_not(self.T)))[0] 73 | 74 | for y in nonzero_inds: 75 | x = self.slackyx[y] 76 | if self.yx[y] == -1: 77 | return x, y 78 | self.T[y] = True 79 | if not self.S[self.yx[y]]: 80 | queue.append(x) 81 | self.add_to_tree(self.yx[y], x) 82 | 83 | def solve(self, verbose=False): 84 | while self.max_match < self.n: 85 | x, y = self.find_augment_path() 86 | self.do_augment(x, y) 87 | 88 | total = 0. 89 | for x in range(self.n): 90 | if verbose: 91 | print('match {} to {}, weight {:.4f}'.format(x, self.xy[x], self.weights[x, self.xy[x]])) 92 | total += self.weights[x, self.xy[x]] 93 | self.best = total 94 | if verbose: 95 | print('ans: {:.4f}'.format(total)) 96 | return self.xy, total 97 | 98 | 99 | def add_to_tree(self, x, prevx): 100 | self.S[x] = True 101 | self.prev[x] = prevx 102 | 103 | better_slack_idx = self.label_x[x] + self.label_y - self.weights[x] < self.slack 104 | self.slack[better_slack_idx] = self.label_x[x] + self.label_y[better_slack_idx] - self.weights[x, better_slack_idx] 105 | self.slackyx[better_slack_idx] = x 106 | 107 | def update_labels(self): # standard dual update by the minimum slack over columns outside T 108 | delta = self.slack[np.logical_not(self.T)].min() 109 | self.label_x[self.S] -= delta 110 | self.label_y[self.T] += delta 111 | self.slack[np.logical_not(self.T)] -= delta 112 | 113 | 114 | if __name__ == '__main__': 115 | matcher = KMMatcher([ 116 | [2., 3., 0., 3.], 117 | [0., 4., 4., 0.], 118 | [5., 6., 0., 0.], 119 | [0., 0., 7., 0.]
120 | ]) 121 | best = matcher.solve(verbose=True) 122 | print(best) 123 | -------------------------------------------------------------------------------- /lib_graph_partition/metis_partition.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import networkx as nx 3 | import pymetis 4 | from torch_geometric.data import ClusterData 5 | from torch_geometric.utils import from_networkx 6 | 7 | from lib_graph_partition.partition import Partition 8 | 9 | 10 | class MetisPartition(Partition): 11 | def __init__(self, args, graph, dataset): 12 | super(MetisPartition, self).__init__(args, graph, dataset) 13 | self.graph = graph 14 | self.args = args 15 | self.data = dataset 16 | 17 | def partition(self, recursive=False): 18 | # recursive (bool, optional): If set to :obj:`True`, will use multilevel 19 | # recursive bisection instead of multilevel k-way partitioning. 20 | # (default: :obj:`False`) 21 | # only use train data, not the whole dataset 22 | self.train_data = from_networkx(self.graph) 23 | data = ClusterData(self.train_data, self.args['num_shards'], recursive=recursive) 24 | 25 | community_to_node = {} 26 | for i in range(self.args['num_shards']): 27 | community_to_node[i] = [*range(data.partptr[i], data.partptr[i+1], 1)] 28 | 29 | # map node back to original graph 30 | for com in range(self.args['num_shards']): 31 | community_to_node[com] = np.array(list(self.graph.nodes))[data.partptr.numpy()[com]:data.partptr.numpy()[com+1]] 32 | 33 | return community_to_node 34 | 35 | 36 | class PyMetisPartition(Partition): 37 | def __init__(self, args, graph, dataset): 38 | super(PyMetisPartition, self).__init__(args, graph, dataset) 39 | self.graph = graph 40 | self.args = args 41 | self.data = dataset 42 | 43 | def partition(self, recursive=False): 44 | # recursive (bool, optional): If set to :obj:`True`, will use multilevel 45 | # recursive bisection instead of multilevel k-way partitioning. 
46 | # (default: :obj:`False`) 47 | # only use train data, not the whole dataset 48 | # map graph into new graph 49 | mapping = {} 50 | for i, node in enumerate(self.graph.nodes): 51 | mapping[node] = i 52 | partition_graph = nx.relabel_nodes(self.graph, mapping=mapping) 53 | 54 | adj_list = [] 55 | for line in nx.generate_adjlist(partition_graph): 56 | line_int = list(map(int, line.split())) 57 | adj_list.append(np.array(line_int)) 58 | 59 | n_cuts, membership = pymetis.part_graph(self.args['num_shards'], adjacency=adj_list) 60 | 61 | # map node back to original graph 62 | community_to_node = {} 63 | for shard_index in range(self.args['num_shards']): 64 | community_to_node[shard_index] = np.array([node_id for node_id, node_shard_index in zip(list(mapping.keys()), membership) if node_shard_index == shard_index]) 65 | return community_to_node 66 | -------------------------------------------------------------------------------- /lib_graph_partition/partition.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class Partition: 5 | def __init__(self, args, graph, dataset=None): 6 | self.args = args 7 | self.graph = graph 8 | self.dataset = dataset 9 | 10 | self.partition_method = self.args['partition_method'] 11 | self.num_shards = self.args['num_shards'] 12 | self.dataset_name = self.args['dataset_name'] 13 | 14 | def idx2id(self, idx_dict, node_list): 15 | ret_dict = {} 16 | for com, idx in idx_dict.items(): 17 | ret_dict[com] = node_list[list(idx)] 18 | 19 | return ret_dict 20 | 21 | def id2idx(self, id_dict, node_list): 22 | ret_dict = {} 23 | for com, id in id_dict.items(): 24 | ret_dict[com] = np.searchsorted(node_list, id) 25 | 26 | return ret_dict 27 | -------------------------------------------------------------------------------- /lib_graph_partition/partition_kmeans.py: -------------------------------------------------------------------------------- 1 | import math 2 | import pickle 3 | 4 | import cupy as cp 5 | import numpy as np 6 | import logging 7 | 8 | from sklearn.cluster import KMeans 9 | 10 | import config 11 | from lib_graph_partition.constrained_kmeans_base import ConstrainedKmeansBase 12 | from lib_graph_partition.partition import Partition 13 | from lib_graph_partition.constrained_kmeans import ConstrainedKmeans 14 | from lib_node_embedding.node_embedding import NodeEmbedding 15 | 16 | 17 | class PartitionKMeans(Partition): 18 | def __init__(self, args, graph, dataset): 19 | super(PartitionKMeans, self).__init__(args, graph, dataset) 20 | 21 | self.logger = logging.getLogger('partition_kmeans') 22 | cp.cuda.Device(self.args['cuda']).use() 23 | self.load_embeddings() 24 | 25 | def load_embeddings(self): 26 | node_embedding = NodeEmbedding(self.args, self.graph, self.dataset) 27 | 28 | if self.partition_method in ["sage_km", "sage_km_base"]: 29 | self.node_to_embedding = node_embedding.sage_encoder() 30 | else: 31 | raise Exception('unsupported embedding method') 32 | 33 | def partition(self): 34 | self.logger.info("partitioning") 35 | 36 | embedding = [] 37 | for node in self.node_to_embedding.keys(): 38 | embedding.append(self.node_to_embedding[node]) 39 | 40 | if not self.args['is_constrained']: 41 | cluster = KMeans(n_clusters=self.num_shards, random_state=10) 42 | cluster_labels = cluster.fit_predict(embedding) 43 | 44 | node_to_community = {} 45 | for com, node in zip(cluster_labels, self.node_to_embedding.keys()): 46 | node_to_community[node] = com 47 | 48 | community_to_node = {} 49 | for com in 
range(len(set(node_to_community.values()))): 50 | community_to_node[com] = np.where(np.array(list(node_to_community.values())) == com)[0] 51 | community_to_node = dict(sorted(community_to_node.items())) 52 | 53 | else: 54 | # node_threshold = math.ceil(self.graph.number_of_nodes() / self.num_shards) 55 | # node_threshold = math.ceil(self.graph.number_of_nodes() / self.num_shards + 0.05*self.graph.number_of_nodes()) 56 | node_threshold = math.ceil( 57 | self.graph.number_of_nodes() / self.args['num_shards'] + self.args['shard_size_delta'] * ( 58 | self.graph.number_of_nodes() - self.graph.number_of_nodes() / self.args['num_shards'])) 59 | self.logger.info("#.nodes: %s. Shard threshold: %s." % (self.graph.number_of_nodes(), node_threshold)) 60 | 61 | if self.partition_method == 'sage_km_base': 62 | cluster = ConstrainedKmeansBase(np.array(embedding), num_clusters=self.num_shards, 63 | node_threshold=node_threshold, 64 | terminate_delta=self.args['terminate_delta']) 65 | cluster.initialization() 66 | community, km_deltas = cluster.clustering() 67 | pickle.dump(km_deltas, open(config.ANALYSIS_PATH + "partition/base_bkm_" + self.args['dataset_name'], 'wb')) 68 | 69 | community_to_node = {} 70 | for i in range(self.num_shards): 71 | community_to_node[i] = np.array(community[i]) 72 | 73 | if self.partition_method == 'sage_km': 74 | cluster = ConstrainedKmeans(cp.array(embedding), num_clusters=self.num_shards, 75 | node_threshold=node_threshold, 76 | terminate_delta=self.args['terminate_delta']) 77 | cluster.initialization() 78 | community, km_deltas = cluster.clustering() 79 | pickle.dump(km_deltas, open(config.ANALYSIS_PATH + "partition/bkm_" + self.args['dataset_name'], 'wb')) 80 | 81 | community_to_node = {} 82 | for i in range(self.num_shards): 83 | community_to_node[i] = np.array(community[i].get().astype(int)) 84 | 85 | return community_to_node 86 | 87 | -------------------------------------------------------------------------------- /lib_graph_partition/partition_lpa.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | import networkx as nx 4 | import logging 5 | import pickle 6 | 7 | from lib_graph_partition.constrained_lpa_base import ConstrainedLPABase 8 | from lib_graph_partition.partition import Partition 9 | from lib_graph_partition.constrained_lpa import ConstrainedLPA 10 | import config 11 | 12 | 13 | class PartitionLPA(Partition): 14 | def __init__(self, args, graph): 15 | super(PartitionLPA, self).__init__(args, graph) 16 | 17 | self.logger = logging.getLogger('partition_lpa') 18 | 19 | def partition(self): 20 | # implement LPA by hand, refer to https://github.com/benedekrozemberczki/LabelPropagation 21 | community_generator = nx.algorithms.community.label_propagation.label_propagation_communities(self.graph) 22 | self.logger.info("Generating LPA communities.") 23 | community_to_node = {key: c for key, c in zip(range(self.graph.number_of_nodes()), community_generator)} 24 | print("Found %s communities by unconstrained LPA", len(community_to_node.keys())) 25 | return community_to_node 26 | 27 | 28 | class PartitionConstrainedLPA(Partition): 29 | def __init__(self, args, graph): 30 | super(PartitionConstrainedLPA, self).__init__(args, graph) 31 | self.args = args 32 | 33 | self.logger = logging.getLogger('partition_constrained_lpa') 34 | 35 | def partition(self): 36 | adj_array = nx.linalg.adj_matrix(self.graph).toarray().astype(np.bool) 37 | # node_threshold = math.ceil(self.graph.number_of_nodes() / 
self.args['num_shards']) + 0.05 * self.graph.number_of_nodes() 38 | # node_threshold = math.ceil(self.graph.number_of_nodes() / self.args['num_shards']) 39 | node_threshold = math.ceil(self.graph.number_of_nodes() / self.args['num_shards'] + 40 | self.args['shard_size_delta'] * (self.graph.number_of_nodes()-self.graph.number_of_nodes() / self.args['num_shards'])) 41 | 42 | self.logger.info(" #. nodes: %s. LPA shard threshold: %s." % (self.graph.number_of_nodes(), node_threshold)) 43 | lpa = ConstrainedLPA(adj_array, self.num_shards, node_threshold, self.args['terminate_delta']) 44 | 45 | lpa.initialization() 46 | community_to_node, lpa_deltas = lpa.community_detection() 47 | 48 | pickle.dump(lpa_deltas, open(config.ANALYSIS_PATH + "partition/blpa_" + self.args['dataset_name'], 'wb')) 49 | 50 | return self.idx2id(community_to_node, np.array(self.graph.nodes)) 51 | 52 | 53 | class PartitionConstrainedLPABase(Partition): 54 | def __init__(self, args, graph): 55 | super(PartitionConstrainedLPABase, self).__init__(args, graph) 56 | self.args = args 57 | 58 | self.logger = logging.getLogger('partition_constrained_lpa') 59 | 60 | def partition(self): 61 | adj_array = nx.linalg.adj_matrix(self.graph).toarray().astype(np.bool) 62 | node_threshold = math.ceil(self.graph.number_of_nodes() / self.args['num_shards'] + self.args['shard_size_delta'] * (self.graph.number_of_nodes()-self.graph.number_of_nodes() / self.args['num_shards'])) 63 | 64 | self.logger.info(" #. nodes: %s. LPA shard threshold: %s." % (self.graph.number_of_nodes(), node_threshold)) 65 | lpa = ConstrainedLPABase(adj_array, self.num_shards, node_threshold, self.args['terminate_delta']) 66 | 67 | lpa.initialization() 68 | community_to_node, lpa_deltas = lpa.community_detection() 69 | 70 | pickle.dump(lpa_deltas, open(config.ANALYSIS_PATH + "partition/base_blpa_" + self.args['dataset_name'], 'wb')) 71 | 72 | return self.idx2id(community_to_node, np.array(self.graph.nodes)) 73 | -------------------------------------------------------------------------------- /lib_graph_partition/partition_random.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from lib_graph_partition.partition import Partition 4 | 5 | 6 | class PartitionRandom(Partition): 7 | def __init__(self, args, graph): 8 | super(PartitionRandom, self).__init__(args, graph) 9 | 10 | def partition(self): 11 | graph_nodes = np.array(self.graph.nodes) 12 | np.random.shuffle(graph_nodes) 13 | train_shard_indices = np.array_split(graph_nodes, self.args['num_shards']) 14 | 15 | return dict(zip(range(self.num_shards), train_shard_indices)) 16 | -------------------------------------------------------------------------------- /lib_node_embedding/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MinChen00/Graph-Unlearning/a9b942d01651c2e3d780ae12e1a1459e35120ffa/lib_node_embedding/__init__.py -------------------------------------------------------------------------------- /lib_node_embedding/ge/__init__.py: -------------------------------------------------------------------------------- 1 | from .models import * -------------------------------------------------------------------------------- /lib_node_embedding/ge/alias.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def create_alias_table(area_ratio): 5 | """ 6 | 7 | :param area_ratio: sum(area_ratio)=1 8 | :return: 
accept,alias 9 | """ 10 | l = len(area_ratio) 11 | accept, alias = [0] * l, [0] * l 12 | small, large = [], [] 13 | area_ratio_ = np.array(area_ratio) * l 14 | for i, prob in enumerate(area_ratio_): 15 | if prob < 1.0: 16 | small.append(i) 17 | else: 18 | large.append(i) 19 | 20 | while small and large: 21 | small_idx, large_idx = small.pop(), large.pop() 22 | accept[small_idx] = area_ratio_[small_idx] 23 | alias[small_idx] = large_idx 24 | area_ratio_[large_idx] = area_ratio_[large_idx] - \ 25 | (1 - area_ratio_[small_idx]) 26 | if area_ratio_[large_idx] < 1.0: 27 | small.append(large_idx) 28 | else: 29 | large.append(large_idx) 30 | 31 | while large: 32 | large_idx = large.pop() 33 | accept[large_idx] = 1 34 | while small: 35 | small_idx = small.pop() 36 | accept[small_idx] = 1 37 | 38 | return accept, alias 39 | 40 | 41 | def alias_sample(accept, alias): 42 | """ 43 | 44 | :param accept: 45 | :param alias: 46 | :return: sample index 47 | """ 48 | N = len(accept) 49 | i = int(np.random.random()*N) 50 | r = np.random.random() 51 | if r < accept[i]: 52 | return i 53 | else: 54 | return alias[i] 55 | -------------------------------------------------------------------------------- /lib_node_embedding/ge/classify.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | 4 | import numpy 5 | from sklearn.metrics import f1_score, accuracy_score 6 | from sklearn.multiclass import OneVsRestClassifier 7 | from sklearn.preprocessing import MultiLabelBinarizer 8 | 9 | 10 | class TopKRanker(OneVsRestClassifier): 11 | def predict(self, X, top_k_list): 12 | probs = numpy.asarray(super(TopKRanker, self).predict_proba(X)) 13 | all_labels = [] 14 | for i, k in enumerate(top_k_list): 15 | probs_ = probs[i, :] 16 | labels = self.classes_[probs_.argsort()[-k:]].tolist() 17 | probs_[:] = 0 18 | probs_[labels] = 1 19 | all_labels.append(probs_) 20 | return numpy.asarray(all_labels) 21 | 22 | 23 | class Classifier(object): 24 | 25 | def __init__(self, embeddings, clf): 26 | self.embeddings = embeddings 27 | self.clf = TopKRanker(clf) 28 | self.binarizer = MultiLabelBinarizer(sparse_output=True) 29 | 30 | def train(self, X, Y, Y_all): 31 | self.binarizer.fit(Y_all) 32 | X_train = [self.embeddings[x] for x in X] 33 | Y = self.binarizer.transform(Y) 34 | self.clf.fit(X_train, Y) 35 | 36 | def evaluate(self, X, Y): 37 | top_k_list = [len(l) for l in Y] 38 | Y_ = self.predict(X, top_k_list) 39 | Y = self.binarizer.transform(Y) 40 | averages = ["micro", "macro", "samples", "weighted"] 41 | results = {} 42 | for average in averages: 43 | results[average] = f1_score(Y, Y_, average=average) 44 | results['acc'] = accuracy_score(Y,Y_) 45 | print('-------------------') 46 | print(results) 47 | return results 48 | print('-------------------') 49 | 50 | def predict(self, X, top_k_list): 51 | X_ = numpy.asarray([self.embeddings[x] for x in X]) 52 | Y = self.clf.predict(X_, top_k_list=top_k_list) 53 | return Y 54 | 55 | def split_train_evaluate(self, X, Y, train_precent, seed=0): 56 | state = numpy.random.get_state() 57 | 58 | training_size = int(train_precent * len(X)) 59 | numpy.random.seed(seed) 60 | shuffle_indices = numpy.random.permutation(numpy.arange(len(X))) 61 | X_train = [X[shuffle_indices[i]] for i in range(training_size)] 62 | Y_train = [Y[shuffle_indices[i]] for i in range(training_size)] 63 | X_test = [X[shuffle_indices[i]] for i in range(training_size, len(X))] 64 | Y_test = [Y[shuffle_indices[i]] for i in range(training_size, len(X))] 
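# Descriptive note (added comment): train on the first `train_precent` fraction of the shuffled samples and evaluate on the remainder; the complete label list Y is passed to train() below so the MultiLabelBinarizer is fitted on every class that occurs in the data, not only on the classes present in the training split.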
65 | 66 | self.train(X_train, Y_train, Y) 67 | numpy.random.set_state(state) 68 | return self.evaluate(X_test, Y_test) 69 | 70 | 71 | def read_node_label(filename, skip_head=False): 72 | fin = open(filename, 'r') 73 | X = [] 74 | Y = [] 75 | while 1: 76 | if skip_head: 77 | fin.readline() 78 | l = fin.readline() 79 | if l == '': 80 | break 81 | vec = l.strip().split(' ') 82 | X.append(vec[0]) 83 | Y.append(vec[1:]) 84 | fin.close() 85 | return X, Y 86 | -------------------------------------------------------------------------------- /lib_node_embedding/ge/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .deepwalk import DeepWalk 2 | from .node2vec import Node2Vec 3 | from .line import LINE 4 | from .sdne import SDNE 5 | from .struc2vec import Struc2Vec 6 | 7 | 8 | __all__ = ["DeepWalk", "Node2Vec", "LINE", "SDNE", "Struc2Vec"] 9 | -------------------------------------------------------------------------------- /lib_node_embedding/ge/models/deepwalk.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | """ 4 | 5 | 6 | 7 | Author: 8 | 9 | Weichen Shen,wcshen1994@163.com 10 | 11 | 12 | 13 | Reference: 14 | 15 | [1] Perozzi B, Al-Rfou R, Skiena S. Deepwalk: Online learning of social representations[C]//Proceedings of the 20th ACM SIGKDD international conference on Knowledge discovery and data mining. ACM, 2014: 701-710.(http://www.perozzi.net/publications/14_kdd_deepwalk.pdf) 16 | 17 | 18 | 19 | """ 20 | from ..walker import RandomWalker 21 | from gensim.models import Word2Vec 22 | import pandas as pd 23 | 24 | 25 | class DeepWalk: 26 | def __init__(self, graph, walk_length, num_walks, workers=1): 27 | 28 | self.graph = graph 29 | self.w2v_model = None 30 | self._embeddings = {} 31 | 32 | self.walker = RandomWalker( 33 | graph, p=1, q=1, ) 34 | self.sentences = self.walker.simulate_walks( 35 | num_walks=num_walks, walk_length=walk_length, workers=workers, verbose=1) 36 | 37 | def train(self, embed_size=128, window_size=5, workers=3, iter=5, **kwargs): 38 | 39 | kwargs["sentences"] = self.sentences 40 | kwargs["min_count"] = kwargs.get("min_count", 0) 41 | kwargs["size"] = embed_size 42 | kwargs["sg"] = 1 # skip gram 43 | kwargs["hs"] = 1 # deepwalk use Hierarchical Softmax 44 | kwargs["workers"] = workers 45 | kwargs["window"] = window_size 46 | kwargs["iter"] = iter 47 | 48 | print("Learning embedding vectors...") 49 | model = Word2Vec(**kwargs) 50 | print("Learning embedding vectors done!") 51 | 52 | self.w2v_model = model 53 | return model 54 | 55 | def get_embeddings(self,): 56 | if self.w2v_model is None: 57 | print("model not train") 58 | return {} 59 | 60 | self._embeddings = {} 61 | for word in self.graph.nodes(): 62 | self._embeddings[word] = self.w2v_model.wv[word] 63 | 64 | return self._embeddings 65 | -------------------------------------------------------------------------------- /lib_node_embedding/ge/models/line.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | """ 4 | 5 | 6 | 7 | Author: 8 | 9 | Weichen Shen,wcshen1994@163.com 10 | 11 | 12 | 13 | Reference: 14 | 15 | [1] Tang J, Qu M, Wang M, et al. Line: Large-scale information network embedding[C]//Proceedings of the 24th International Conference on World Wide Web. 
International World Wide Web Conferences Steering Committee, 2015: 1067-1077.(https://arxiv.org/pdf/1503.03578.pdf) 16 | 17 | 18 | 19 | """ 20 | import math 21 | import random 22 | 23 | import numpy as np 24 | import tensorflow as tf 25 | from tensorflow.python.keras import backend as K 26 | from tensorflow.python.keras.layers import Embedding, Input, Lambda 27 | from tensorflow.python.keras.models import Model 28 | 29 | from ..alias import create_alias_table, alias_sample 30 | from ..utils import preprocess_nxgraph 31 | 32 | 33 | def line_loss(y_true, y_pred): 34 | return -K.mean(K.log(K.sigmoid(y_true*y_pred))) 35 | 36 | 37 | def create_model(numNodes, embedding_size, order='second'): 38 | 39 | v_i = Input(shape=(1,)) 40 | v_j = Input(shape=(1,)) 41 | 42 | first_emb = Embedding(numNodes, embedding_size, name='first_emb') 43 | second_emb = Embedding(numNodes, embedding_size, name='second_emb') 44 | context_emb = Embedding(numNodes, embedding_size, name='context_emb') 45 | 46 | v_i_emb = first_emb(v_i) 47 | v_j_emb = first_emb(v_j) 48 | 49 | v_i_emb_second = second_emb(v_i) 50 | v_j_context_emb = context_emb(v_j) 51 | 52 | first = Lambda(lambda x: tf.reduce_sum( 53 | x[0]*x[1], axis=-1, keep_dims=False), name='first_order')([v_i_emb, v_j_emb]) 54 | second = Lambda(lambda x: tf.reduce_sum( 55 | x[0]*x[1], axis=-1, keep_dims=False), name='second_order')([v_i_emb_second, v_j_context_emb]) 56 | 57 | if order == 'first': 58 | output_list = [first] 59 | elif order == 'second': 60 | output_list = [second] 61 | else: 62 | output_list = [first, second] 63 | 64 | model = Model(inputs=[v_i, v_j], outputs=output_list) 65 | 66 | return model, {'first': first_emb, 'second': second_emb} 67 | 68 | 69 | class LINE: 70 | def __init__(self, graph, embedding_size=8, negative_ratio=5, order='second',): 71 | """ 72 | 73 | :param graph: 74 | :param embedding_size: 75 | :param negative_ratio: 76 | :param order: 'first','second','all' 77 | """ 78 | if order not in ['first', 'second', 'all']: 79 | raise ValueError('mode must be fisrt,second,or all') 80 | 81 | self.graph = graph 82 | self.idx2node, self.node2idx = preprocess_nxgraph(graph) 83 | self.use_alias = True 84 | 85 | self.rep_size = embedding_size 86 | self.order = order 87 | 88 | self._embeddings = {} 89 | self.negative_ratio = negative_ratio 90 | self.order = order 91 | 92 | self.node_size = graph.number_of_nodes() 93 | self.edge_size = graph.number_of_edges() 94 | self.samples_per_epoch = self.edge_size*(1+negative_ratio) 95 | 96 | self._gen_sampling_table() 97 | self.reset_model() 98 | 99 | def reset_training_config(self, batch_size, times): 100 | self.batch_size = batch_size 101 | self.steps_per_epoch = ( 102 | (self.samples_per_epoch - 1) // self.batch_size + 1)*times 103 | 104 | def reset_model(self, opt='adam'): 105 | 106 | self.model, self.embedding_dict = create_model( 107 | self.node_size, self.rep_size, self.order) 108 | self.model.compile(opt, line_loss) 109 | self.batch_it = self.batch_iter(self.node2idx) 110 | 111 | def _gen_sampling_table(self): 112 | 113 | # create sampling table for vertex 114 | power = 0.75 115 | numNodes = self.node_size 116 | node_degree = np.zeros(numNodes) # out degree 117 | node2idx = self.node2idx 118 | 119 | for edge in self.graph.edges(): 120 | node_degree[node2idx[edge[0]] 121 | ] += self.graph[edge[0]][edge[1]].get('weight', 1.0) 122 | 123 | total_sum = sum([math.pow(node_degree[i], power) 124 | for i in range(numNodes)]) 125 | norm_prob = [float(math.pow(node_degree[j], power)) / 126 | total_sum for j in 
range(numNodes)] 127 | 128 | self.node_accept, self.node_alias = create_alias_table(norm_prob) 129 | 130 | # create sampling table for edge 131 | numEdges = self.graph.number_of_edges() 132 | total_sum = sum([self.graph[edge[0]][edge[1]].get('weight', 1.0) 133 | for edge in self.graph.edges()]) 134 | norm_prob = [self.graph[edge[0]][edge[1]].get('weight', 1.0) * 135 | numEdges / total_sum for edge in self.graph.edges()] 136 | 137 | self.edge_accept, self.edge_alias = create_alias_table(norm_prob) 138 | 139 | def batch_iter(self, node2idx): 140 | 141 | edges = [(node2idx[x[0]], node2idx[x[1]]) for x in self.graph.edges()] 142 | 143 | data_size = self.graph.number_of_edges() 144 | shuffle_indices = np.random.permutation(np.arange(data_size)) 145 | # positive or negative mod 146 | mod = 0 147 | mod_size = 1 + self.negative_ratio 148 | h = [] 149 | t = [] 150 | sign = 0 151 | count = 0 152 | start_index = 0 153 | end_index = min(start_index + self.batch_size, data_size) 154 | while True: 155 | if mod == 0: 156 | 157 | h = [] 158 | t = [] 159 | for i in range(start_index, end_index): 160 | if random.random() >= self.edge_accept[shuffle_indices[i]]: 161 | shuffle_indices[i] = self.edge_alias[shuffle_indices[i]] 162 | cur_h = edges[shuffle_indices[i]][0] 163 | cur_t = edges[shuffle_indices[i]][1] 164 | h.append(cur_h) 165 | t.append(cur_t) 166 | sign = np.ones(len(h)) 167 | else: 168 | sign = np.ones(len(h))*-1 169 | t = [] 170 | for i in range(len(h)): 171 | 172 | t.append(alias_sample( 173 | self.node_accept, self.node_alias)) 174 | 175 | if self.order == 'all': 176 | yield ([np.array(h), np.array(t)], [sign, sign]) 177 | else: 178 | yield ([np.array(h), np.array(t)], [sign]) 179 | mod += 1 180 | mod %= mod_size 181 | if mod == 0: 182 | start_index = end_index 183 | end_index = min(start_index + self.batch_size, data_size) 184 | 185 | if start_index >= data_size: 186 | count += 1 187 | mod = 0 188 | h = [] 189 | shuffle_indices = np.random.permutation(np.arange(data_size)) 190 | start_index = 0 191 | end_index = min(start_index + self.batch_size, data_size) 192 | 193 | def get_embeddings(self,): 194 | self._embeddings = {} 195 | if self.order == 'first': 196 | embeddings = self.embedding_dict['first'].get_weights()[0] 197 | elif self.order == 'second': 198 | embeddings = self.embedding_dict['second'].get_weights()[0] 199 | else: 200 | embeddings = np.hstack((self.embedding_dict['first'].get_weights()[ 201 | 0], self.embedding_dict['second'].get_weights()[0])) 202 | idx2node = self.idx2node 203 | for i, embedding in enumerate(embeddings): 204 | self._embeddings[idx2node[i]] = embedding 205 | 206 | return self._embeddings 207 | 208 | def train(self, batch_size=1024, epochs=1, initial_epoch=0, verbose=1, times=1): 209 | self.reset_training_config(batch_size, times) 210 | hist = self.model.fit_generator(self.batch_it, epochs=epochs, initial_epoch=initial_epoch, steps_per_epoch=self.steps_per_epoch, 211 | verbose=verbose) 212 | 213 | return hist 214 | -------------------------------------------------------------------------------- /lib_node_embedding/ge/models/node2vec.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | """ 4 | 5 | 6 | 7 | Author: 8 | 9 | Weichen Shen,wcshen1994@163.com 10 | 11 | 12 | 13 | Reference: 14 | 15 | [1] Grover A, Leskovec J. node2vec: Scalable feature learning for networks[C]//Proceedings of the 22nd ACM SIGKDD international conference on Knowledge discovery and data mining. 
ACM, 2016: 855-864.(https://www.kdd.org/kdd2016/papers/files/rfp0218-groverA.pdf) 16 | 17 | 18 | 19 | """ 20 | 21 | from gensim.models import Word2Vec 22 | import pandas as pd 23 | 24 | from ..walker import RandomWalker 25 | 26 | 27 | class Node2Vec: 28 | 29 | def __init__(self, graph, walk_length, num_walks, p=1.0, q=1.0, workers=1, use_rejection_sampling=0): 30 | 31 | self.graph = graph 32 | self._embeddings = {} 33 | self.walker = RandomWalker( 34 | graph, p=p, q=q, use_rejection_sampling=use_rejection_sampling) 35 | 36 | print("Preprocess transition probs...") 37 | self.walker.preprocess_transition_probs() 38 | 39 | self.sentences = self.walker.simulate_walks( 40 | num_walks=num_walks, walk_length=walk_length, workers=workers, verbose=1) 41 | 42 | def train(self, embed_size=128, window_size=5, workers=3, iter=5, **kwargs): 43 | 44 | kwargs["sentences"] = self.sentences 45 | kwargs["min_count"] = kwargs.get("min_count", 0) 46 | kwargs["size"] = embed_size 47 | kwargs["sg"] = 1 48 | kwargs["hs"] = 0 # node2vec not use Hierarchical Softmax 49 | kwargs["workers"] = workers 50 | kwargs["window"] = window_size 51 | kwargs["iter"] = iter 52 | 53 | print("Learning embedding vectors...") 54 | model = Word2Vec(**kwargs) 55 | print("Learning embedding vectors done!") 56 | 57 | self.w2v_model = model 58 | 59 | return model 60 | 61 | def get_embeddings(self,): 62 | if self.w2v_model is None: 63 | print("model not train") 64 | return {} 65 | 66 | self._embeddings = {} 67 | for word in self.graph.nodes(): 68 | self._embeddings[word] = self.w2v_model.wv[word] 69 | 70 | return self._embeddings 71 | -------------------------------------------------------------------------------- /lib_node_embedding/ge/models/sdne.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | """ 4 | 5 | 6 | 7 | Author: 8 | 9 | Weichen Shen,wcshen1994@163.com 10 | 11 | 12 | 13 | Reference: 14 | 15 | [1] Wang D, Cui P, Zhu W. Structural deep network embedding[C]//Proceedings of the 22nd ACM SIGKDD international conference on Knowledge discovery and data mining. 
ACM, 2016: 1225-1234.(https://www.kdd.org/kdd2016/papers/files/rfp0191-wangAemb.pdf) 16 | 17 | 18 | 19 | """ 20 | import time 21 | 22 | import numpy as np 23 | import scipy.sparse as sp 24 | import tensorflow as tf 25 | from tensorflow.python.keras import backend as K 26 | from tensorflow.python.keras.callbacks import History 27 | from tensorflow.python.keras.layers import Dense, Input 28 | from tensorflow.python.keras.models import Model 29 | from tensorflow.python.keras.regularizers import l1_l2 30 | 31 | from ..utils import preprocess_nxgraph 32 | 33 | 34 | def l_2nd(beta): 35 | def loss_2nd(y_true, y_pred): 36 | b_ = np.ones_like(y_true) 37 | b_[y_true != 0] = beta 38 | x = K.square((y_true - y_pred) * b_) 39 | t = K.sum(x, axis=-1, ) 40 | return K.mean(t) 41 | 42 | return loss_2nd 43 | 44 | 45 | def l_1st(alpha): 46 | def loss_1st(y_true, y_pred): 47 | L = y_true 48 | Y = y_pred 49 | batch_size = tf.to_float(K.shape(L)[0]) 50 | return alpha * 2 * tf.linalg.trace(tf.matmul(tf.matmul(Y, L, transpose_a=True), Y)) / batch_size 51 | 52 | return loss_1st 53 | 54 | 55 | def create_model(node_size, hidden_size=[256, 128], l1=1e-5, l2=1e-4): 56 | A = Input(shape=(node_size,)) 57 | L = Input(shape=(None,)) 58 | fc = A 59 | for i in range(len(hidden_size)): 60 | if i == len(hidden_size) - 1: 61 | fc = Dense(hidden_size[i], activation='relu', 62 | kernel_regularizer=l1_l2(l1, l2), name='1st')(fc) 63 | else: 64 | fc = Dense(hidden_size[i], activation='relu', 65 | kernel_regularizer=l1_l2(l1, l2))(fc) 66 | Y = fc 67 | for i in reversed(range(len(hidden_size) - 1)): 68 | fc = Dense(hidden_size[i], activation='relu', 69 | kernel_regularizer=l1_l2(l1, l2))(fc) 70 | 71 | A_ = Dense(node_size, 'relu', name='2nd')(fc) 72 | model = Model(inputs=[A, L], outputs=[A_, Y]) 73 | emb = Model(inputs=A, outputs=Y) 74 | return model, emb 75 | 76 | 77 | class SDNE(object): 78 | def __init__(self, graph, hidden_size=[32, 16], alpha=1e-6, beta=5., nu1=1e-5, nu2=1e-4, ): 79 | 80 | self.graph = graph 81 | # self.g.remove_edges_from(self.g.selfloop_edges()) 82 | self.idx2node, self.node2idx = preprocess_nxgraph(self.graph) 83 | 84 | self.node_size = self.graph.number_of_nodes() 85 | self.hidden_size = hidden_size 86 | self.alpha = alpha 87 | self.beta = beta 88 | self.nu1 = nu1 89 | self.nu2 = nu2 90 | 91 | self.A, self.L = self._create_A_L( 92 | self.graph, self.node2idx) # Adj Matrix,L Matrix 93 | self.reset_model() 94 | self.inputs = [self.A, self.L] 95 | self._embeddings = {} 96 | 97 | def reset_model(self, opt='adam'): 98 | 99 | self.model, self.emb_model = create_model(self.node_size, hidden_size=self.hidden_size, l1=self.nu1, 100 | l2=self.nu2) 101 | self.model.compile(opt, [l_2nd(self.beta), l_1st(self.alpha)]) 102 | self.get_embeddings() 103 | 104 | def train(self, batch_size=1024, epochs=1, initial_epoch=0, verbose=1): 105 | if batch_size >= self.node_size: 106 | if batch_size > self.node_size: 107 | print('batch_size({0}) > node_size({1}),set batch_size = {1}'.format( 108 | batch_size, self.node_size)) 109 | batch_size = self.node_size 110 | return self.model.fit([self.A.todense(), self.L.todense()], [self.A.todense(), self.L.todense()], 111 | batch_size=batch_size, epochs=epochs, initial_epoch=initial_epoch, verbose=verbose, 112 | shuffle=False, ) 113 | else: 114 | steps_per_epoch = (self.node_size - 1) // batch_size + 1 115 | hist = History() 116 | hist.on_train_begin() 117 | logs = {} 118 | for epoch in range(initial_epoch, epochs): 119 | start_time = time.time() 120 | losses = np.zeros(3) 121 | for i in 
range(steps_per_epoch): 122 | index = np.arange( 123 | i * batch_size, min((i + 1) * batch_size, self.node_size)) 124 | A_train = self.A[index, :].todense() 125 | L_mat_train = self.L[index][:, index].todense() 126 | inp = [A_train, L_mat_train] 127 | batch_losses = self.model.train_on_batch(inp, inp) 128 | losses += batch_losses 129 | losses = losses / steps_per_epoch 130 | 131 | logs['loss'] = losses[0] 132 | logs['2nd_loss'] = losses[1] 133 | logs['1st_loss'] = losses[2] 134 | epoch_time = int(time.time() - start_time) 135 | hist.on_epoch_end(epoch, logs) 136 | if verbose > 0: 137 | print('Epoch {0}/{1}'.format(epoch + 1, epochs)) 138 | print('{0}s - loss: {1: .4f} - 2nd_loss: {2: .4f} - 1st_loss: {3: .4f}'.format( 139 | epoch_time, losses[0], losses[1], losses[2])) 140 | return hist 141 | 142 | def evaluate(self, ): 143 | return self.model.evaluate(x=self.inputs, y=self.inputs, batch_size=self.node_size) 144 | 145 | def get_embeddings(self): 146 | self._embeddings = {} 147 | embeddings = self.emb_model.predict(self.A.todense(), batch_size=self.node_size) 148 | look_back = self.idx2node 149 | for i, embedding in enumerate(embeddings): 150 | self._embeddings[look_back[i]] = embedding 151 | 152 | return self._embeddings 153 | 154 | def _create_A_L(self, graph, node2idx): 155 | node_size = graph.number_of_nodes() 156 | A_data = [] 157 | A_row_index = [] 158 | A_col_index = [] 159 | 160 | for edge in graph.edges(): 161 | v1, v2 = edge 162 | edge_weight = graph[v1][v2].get('weight', 1) 163 | 164 | A_data.append(edge_weight) 165 | A_row_index.append(node2idx[v1]) 166 | A_col_index.append(node2idx[v2]) 167 | 168 | A = sp.csr_matrix((A_data, (A_row_index, A_col_index)), shape=(node_size, node_size)) 169 | A_ = sp.csr_matrix((A_data + A_data, (A_row_index + A_col_index, A_col_index + A_row_index)), 170 | shape=(node_size, node_size)) 171 | 172 | D = sp.diags(A_.sum(axis=1).flatten().tolist()[0]) 173 | L = D - A_ 174 | return A, L 175 | -------------------------------------------------------------------------------- /lib_node_embedding/ge/models/struc2vec.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | """ 4 | 5 | 6 | 7 | Author: 8 | 9 | Weichen Shen,wcshen1994@163.com 10 | 11 | 12 | 13 | Reference: 14 | 15 | [1] Ribeiro L F R, Saverese P H P, Figueiredo D R. struc2vec: Learning node representations from structural identity[C]//Proceedings of the 23rd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining. 
ACM, 2017: 385-394.(https://arxiv.org/pdf/1704.03165.pdf) 16 | 17 | 18 | 19 | """ 20 | 21 | import math 22 | import os 23 | import shutil 24 | from collections import ChainMap, deque 25 | 26 | import numpy as np 27 | import pandas as pd 28 | from fastdtw import fastdtw 29 | from gensim.models import Word2Vec 30 | from joblib import Parallel, delayed 31 | from tqdm import tqdm 32 | 33 | from ..alias import create_alias_table 34 | from ..utils import partition_dict, preprocess_nxgraph 35 | from ..walker import BiasedWalker 36 | 37 | 38 | class Struc2Vec(): 39 | def __init__(self, graph, walk_length=10, num_walks=100, workers=1, verbose=0, stay_prob=0.3, opt1_reduce_len=True, opt2_reduce_sim_calc=True, opt3_num_layers=None, temp_path='./temp_struc2vec/', reuse=False): 40 | self.graph = graph 41 | self.idx2node, self.node2idx = preprocess_nxgraph(graph) 42 | self.idx = list(range(len(self.idx2node))) 43 | 44 | self.opt1_reduce_len = opt1_reduce_len 45 | self.opt2_reduce_sim_calc = opt2_reduce_sim_calc 46 | self.opt3_num_layers = opt3_num_layers 47 | 48 | self.resue = reuse 49 | self.temp_path = temp_path 50 | 51 | if not os.path.exists(self.temp_path): 52 | os.mkdir(self.temp_path) 53 | if not reuse: 54 | shutil.rmtree(self.temp_path) 55 | os.mkdir(self.temp_path) 56 | 57 | self.create_context_graph(self.opt3_num_layers, workers, verbose) 58 | self.prepare_biased_walk() 59 | self.walker = BiasedWalker(self.idx2node, self.temp_path) 60 | self.sentences = self.walker.simulate_walks( 61 | num_walks, walk_length, stay_prob, workers, verbose) 62 | 63 | self._embeddings = {} 64 | 65 | def create_context_graph(self, max_num_layers, workers=1, verbose=0,): 66 | 67 | pair_distances = self._compute_structural_distance( 68 | max_num_layers, workers, verbose,) 69 | layers_adj, layers_distances = self._get_layer_rep(pair_distances) 70 | pd.to_pickle(layers_adj, self.temp_path + 'layers_adj.pkl') 71 | 72 | layers_accept, layers_alias = self._get_transition_probs( 73 | layers_adj, layers_distances) 74 | pd.to_pickle(layers_alias, self.temp_path + 'layers_alias.pkl') 75 | pd.to_pickle(layers_accept, self.temp_path + 'layers_accept.pkl') 76 | 77 | def prepare_biased_walk(self,): 78 | 79 | sum_weights = {} 80 | sum_edges = {} 81 | average_weight = {} 82 | gamma = {} 83 | layer = 0 84 | while (os.path.exists(self.temp_path+'norm_weights_distance-layer-' + str(layer)+'.pkl')): 85 | probs = pd.read_pickle( 86 | self.temp_path+'norm_weights_distance-layer-' + str(layer)+'.pkl') 87 | for v, list_weights in probs.items(): 88 | sum_weights.setdefault(layer, 0) 89 | sum_edges.setdefault(layer, 0) 90 | sum_weights[layer] += sum(list_weights) 91 | sum_edges[layer] += len(list_weights) 92 | 93 | average_weight[layer] = sum_weights[layer] / sum_edges[layer] 94 | 95 | gamma.setdefault(layer, {}) 96 | 97 | for v, list_weights in probs.items(): 98 | num_neighbours = 0 99 | for w in list_weights: 100 | if (w > average_weight[layer]): 101 | num_neighbours += 1 102 | gamma[layer][v] = num_neighbours 103 | 104 | layer += 1 105 | 106 | pd.to_pickle(average_weight, self.temp_path + 'average_weight') 107 | pd.to_pickle(gamma, self.temp_path + 'gamma.pkl') 108 | 109 | def train(self, embed_size=128, window_size=5, workers=3, iter=5): 110 | 111 | # pd.read_pickle(self.temp_path+'walks.pkl') 112 | sentences = self.sentences 113 | 114 | print("Learning representation...") 115 | model = Word2Vec(sentences, size=embed_size, window=window_size, min_count=0, hs=1, sg=1, workers=workers, 116 | iter=iter) 117 | print("Learning representation 
done!") 118 | self.w2v_model = model 119 | 120 | return model 121 | 122 | def get_embeddings(self,): 123 | if self.w2v_model is None: 124 | print("model not train") 125 | return {} 126 | 127 | self._embeddings = {} 128 | for word in self.graph.nodes(): 129 | self._embeddings[word] = self.w2v_model.wv[word] 130 | 131 | return self._embeddings 132 | 133 | def _compute_ordered_degreelist(self, max_num_layers): 134 | 135 | degreeList = {} 136 | vertices = self.idx # self.g.nodes() 137 | for v in vertices: 138 | degreeList[v] = self._get_order_degreelist_node(v, max_num_layers) 139 | return degreeList 140 | 141 | def _get_order_degreelist_node(self, root, max_num_layers=None): 142 | if max_num_layers is None: 143 | max_num_layers = float('inf') 144 | 145 | ordered_degree_sequence_dict = {} 146 | visited = [False] * len(self.graph.nodes()) 147 | queue = deque() 148 | level = 0 149 | queue.append(root) 150 | visited[root] = True 151 | 152 | while (len(queue) > 0 and level <= max_num_layers): 153 | 154 | count = len(queue) 155 | if self.opt1_reduce_len: 156 | degree_list = {} 157 | else: 158 | degree_list = [] 159 | while (count > 0): 160 | 161 | top = queue.popleft() 162 | node = self.idx2node[top] 163 | degree = len(self.graph[node]) 164 | 165 | if self.opt1_reduce_len: 166 | degree_list[degree] = degree_list.get(degree, 0) + 1 167 | else: 168 | degree_list.append(degree) 169 | 170 | for nei in self.graph[node]: 171 | nei_idx = self.node2idx[nei] 172 | if not visited[nei_idx]: 173 | visited[nei_idx] = True 174 | queue.append(nei_idx) 175 | count -= 1 176 | if self.opt1_reduce_len: 177 | orderd_degree_list = [(degree, freq) 178 | for degree, freq in degree_list.items()] 179 | orderd_degree_list.sort(key=lambda x: x[0]) 180 | else: 181 | orderd_degree_list = sorted(degree_list) 182 | ordered_degree_sequence_dict[level] = orderd_degree_list 183 | level += 1 184 | 185 | return ordered_degree_sequence_dict 186 | 187 | def _compute_structural_distance(self, max_num_layers, workers=1, verbose=0,): 188 | 189 | if os.path.exists(self.temp_path+'structural_dist.pkl'): 190 | structural_dist = pd.read_pickle( 191 | self.temp_path+'structural_dist.pkl') 192 | else: 193 | if self.opt1_reduce_len: 194 | dist_func = cost_max 195 | else: 196 | dist_func = cost 197 | 198 | if os.path.exists(self.temp_path + 'degreelist.pkl'): 199 | degreeList = pd.read_pickle(self.temp_path + 'degreelist.pkl') 200 | else: 201 | degreeList = self._compute_ordered_degreelist(max_num_layers) 202 | pd.to_pickle(degreeList, self.temp_path + 'degreelist.pkl') 203 | 204 | if self.opt2_reduce_sim_calc: 205 | degrees = self._create_vectors() 206 | degreeListsSelected = {} 207 | vertices = {} 208 | n_nodes = len(self.idx) 209 | for v in self.idx: # c:list of vertex 210 | nbs = get_vertices( 211 | v, len(self.graph[self.idx2node[v]]), degrees, n_nodes) 212 | vertices[v] = nbs # store nbs 213 | degreeListsSelected[v] = degreeList[v] # store dist 214 | for n in nbs: 215 | # store dist of nbs 216 | degreeListsSelected[n] = degreeList[n] 217 | else: 218 | vertices = {} 219 | for v in degreeList: 220 | vertices[v] = [vd for vd in degreeList.keys() if vd > v] 221 | 222 | results = Parallel(n_jobs=workers, verbose=verbose,)( 223 | delayed(compute_dtw_dist)(part_list, degreeList, dist_func) for part_list in partition_dict(vertices, workers)) 224 | dtw_dist = dict(ChainMap(*results)) 225 | 226 | structural_dist = convert_dtw_struc_dist(dtw_dist) 227 | pd.to_pickle(structural_dist, self.temp_path + 228 | 'structural_dist.pkl') 229 | 230 | return 
structural_dist 231 | 232 | def _create_vectors(self): 233 | degrees = {} # sotre v list of degree 234 | degrees_sorted = set() # store degree 235 | G = self.graph 236 | for v in self.idx: 237 | degree = len(G[self.idx2node[v]]) 238 | degrees_sorted.add(degree) 239 | if (degree not in degrees): 240 | degrees[degree] = {} 241 | degrees[degree]['vertices'] = [] 242 | degrees[degree]['vertices'].append(v) 243 | degrees_sorted = np.array(list(degrees_sorted), dtype='int') 244 | degrees_sorted = np.sort(degrees_sorted) 245 | 246 | l = len(degrees_sorted) 247 | for index, degree in enumerate(degrees_sorted): 248 | if (index > 0): 249 | degrees[degree]['before'] = degrees_sorted[index - 1] 250 | if (index < (l - 1)): 251 | degrees[degree]['after'] = degrees_sorted[index + 1] 252 | 253 | return degrees 254 | 255 | def _get_layer_rep(self, pair_distances): 256 | layer_distances = {} 257 | layer_adj = {} 258 | for v_pair, layer_dist in pair_distances.items(): 259 | for layer, distance in layer_dist.items(): 260 | vx = v_pair[0] 261 | vy = v_pair[1] 262 | 263 | layer_distances.setdefault(layer, {}) 264 | layer_distances[layer][vx, vy] = distance 265 | 266 | layer_adj.setdefault(layer, {}) 267 | layer_adj[layer].setdefault(vx, []) 268 | layer_adj[layer].setdefault(vy, []) 269 | layer_adj[layer][vx].append(vy) 270 | layer_adj[layer][vy].append(vx) 271 | 272 | return layer_adj, layer_distances 273 | 274 | def _get_transition_probs(self, layers_adj, layers_distances): 275 | layers_alias = {} 276 | layers_accept = {} 277 | 278 | for layer in layers_adj: 279 | 280 | neighbors = layers_adj[layer] 281 | layer_distances = layers_distances[layer] 282 | node_alias_dict = {} 283 | node_accept_dict = {} 284 | norm_weights = {} 285 | 286 | for v, neighbors in neighbors.items(): 287 | e_list = [] 288 | sum_w = 0.0 289 | 290 | for n in neighbors: 291 | if (v, n) in layer_distances: 292 | wd = layer_distances[v, n] 293 | else: 294 | wd = layer_distances[n, v] 295 | w = np.exp(-float(wd)) 296 | e_list.append(w) 297 | sum_w += w 298 | 299 | e_list = [x / sum_w for x in e_list] 300 | norm_weights[v] = e_list 301 | accept, alias = create_alias_table(e_list) 302 | node_alias_dict[v] = alias 303 | node_accept_dict[v] = accept 304 | 305 | pd.to_pickle( 306 | norm_weights, self.temp_path + 'norm_weights_distance-layer-' + str(layer)+'.pkl') 307 | 308 | layers_alias[layer] = node_alias_dict 309 | layers_accept[layer] = node_accept_dict 310 | 311 | return layers_accept, layers_alias 312 | 313 | 314 | def cost(a, b): 315 | ep = 0.5 316 | m = max(a, b) + ep 317 | mi = min(a, b) + ep 318 | return ((m / mi) - 1) 319 | 320 | 321 | def cost_min(a, b): 322 | ep = 0.5 323 | m = max(a[0], b[0]) + ep 324 | mi = min(a[0], b[0]) + ep 325 | return ((m / mi) - 1) * min(a[1], b[1]) 326 | 327 | 328 | def cost_max(a, b): 329 | ep = 0.5 330 | m = max(a[0], b[0]) + ep 331 | mi = min(a[0], b[0]) + ep 332 | return ((m / mi) - 1) * max(a[1], b[1]) 333 | 334 | 335 | def convert_dtw_struc_dist(distances, startLayer=1): 336 | """ 337 | 338 | :param distances: dict of dict 339 | :param startLayer: 340 | :return: 341 | """ 342 | for vertices, layers in distances.items(): 343 | keys_layers = sorted(layers.keys()) 344 | startLayer = min(len(keys_layers), startLayer) 345 | for layer in range(0, startLayer): 346 | keys_layers.pop(0) 347 | 348 | for layer in keys_layers: 349 | layers[layer] += layers[layer - 1] 350 | return distances 351 | 352 | 353 | def get_vertices(v, degree_v, degrees, n_nodes): 354 | a_vertices_selected = 2 * math.log(n_nodes, 2) 355 | 
vertices = [] 356 | try: 357 | c_v = 0 358 | 359 | for v2 in degrees[degree_v]['vertices']: 360 | if (v != v2): 361 | vertices.append(v2) # same degree 362 | c_v += 1 363 | if (c_v > a_vertices_selected): 364 | raise StopIteration 365 | 366 | if ('before' not in degrees[degree_v]): 367 | degree_b = -1 368 | else: 369 | degree_b = degrees[degree_v]['before'] 370 | if ('after' not in degrees[degree_v]): 371 | degree_a = -1 372 | else: 373 | degree_a = degrees[degree_v]['after'] 374 | if (degree_b == -1 and degree_a == -1): 375 | raise StopIteration # not anymore v 376 | degree_now = verifyDegrees(degrees, degree_v, degree_a, degree_b) 377 | # nearest valid degree 378 | while True: 379 | for v2 in degrees[degree_now]['vertices']: 380 | if (v != v2): 381 | vertices.append(v2) 382 | c_v += 1 383 | if (c_v > a_vertices_selected): 384 | raise StopIteration 385 | 386 | if (degree_now == degree_b): 387 | if ('before' not in degrees[degree_b]): 388 | degree_b = -1 389 | else: 390 | degree_b = degrees[degree_b]['before'] 391 | else: 392 | if ('after' not in degrees[degree_a]): 393 | degree_a = -1 394 | else: 395 | degree_a = degrees[degree_a]['after'] 396 | 397 | if (degree_b == -1 and degree_a == -1): 398 | raise StopIteration 399 | 400 | degree_now = verifyDegrees(degrees, degree_v, degree_a, degree_b) 401 | 402 | except StopIteration: 403 | return list(vertices) 404 | 405 | return list(vertices) 406 | 407 | 408 | def verifyDegrees(degrees, degree_v_root, degree_a, degree_b): 409 | 410 | if(degree_b == -1): 411 | degree_now = degree_a 412 | elif(degree_a == -1): 413 | degree_now = degree_b 414 | elif(abs(degree_b - degree_v_root) < abs(degree_a - degree_v_root)): 415 | degree_now = degree_b 416 | else: 417 | degree_now = degree_a 418 | 419 | return degree_now 420 | 421 | 422 | def compute_dtw_dist(part_list, degreeList, dist_func): 423 | dtw_dist = {} 424 | for v1, nbs in part_list: 425 | lists_v1 = degreeList[v1] # lists_v1 :orderd degree list of v1 426 | for v2 in nbs: 427 | lists_v2 = degreeList[v2] # lists_v1 :orderd degree list of v2 428 | max_layer = min(len(lists_v1), len(lists_v2)) # valid layer 429 | dtw_dist[v1, v2] = {} 430 | for layer in range(0, max_layer): 431 | dist, path = fastdtw( 432 | lists_v1[layer], lists_v2[layer], radius=1, dist=dist_func) 433 | dtw_dist[v1, v2][layer] = dist 434 | return dtw_dist 435 | -------------------------------------------------------------------------------- /lib_node_embedding/ge/utils.py: -------------------------------------------------------------------------------- 1 | def preprocess_nxgraph(graph): 2 | node2idx = {} 3 | idx2node = [] 4 | node_size = 0 5 | for node in graph.nodes(): 6 | node2idx[node] = node_size 7 | idx2node.append(node) 8 | node_size += 1 9 | return idx2node, node2idx 10 | 11 | 12 | def partition_dict(vertices, workers): 13 | batch_size = (len(vertices) - 1) // workers + 1 14 | part_list = [] 15 | part = [] 16 | count = 0 17 | for v1, nbs in vertices.items(): 18 | part.append((v1, nbs)) 19 | count += 1 20 | if count % batch_size == 0: 21 | part_list.append(part) 22 | part = [] 23 | if len(part) > 0: 24 | part_list.append(part) 25 | return part_list 26 | 27 | 28 | def partition_list(vertices, workers): 29 | batch_size = (len(vertices) - 1) // workers + 1 30 | part_list = [] 31 | part = [] 32 | count = 0 33 | for v1, nbs in enumerate(vertices): 34 | part.append((v1, nbs)) 35 | count += 1 36 | if count % batch_size == 0: 37 | part_list.append(part) 38 | part = [] 39 | if len(part) > 0: 40 | part_list.append(part) 41 | return 
part_list 42 | 43 | 44 | def partition_num(num, workers): 45 | if num % workers == 0: 46 | return [num//workers]*workers 47 | else: 48 | return [num//workers]*workers + [num % workers] 49 | -------------------------------------------------------------------------------- /lib_node_embedding/ge/walker.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import math 3 | import random 4 | 5 | import numpy as np 6 | import pandas as pd 7 | from joblib import Parallel, delayed 8 | from tqdm import trange 9 | 10 | from .alias import alias_sample, create_alias_table 11 | from .utils import partition_num 12 | 13 | 14 | class RandomWalker: 15 | def __init__(self, G, p=1, q=1, use_rejection_sampling=0): 16 | """ 17 | :param G: 18 | :param p: Return parameter,controls the likelihood of immediately revisiting a node in the walk. 19 | :param q: In-out parameter,allows the search to differentiate between “inward” and “outward” nodes 20 | :param use_rejection_sampling: Whether to use the rejection sampling strategy in node2vec. 21 | """ 22 | self.G = G 23 | self.p = p 24 | self.q = q 25 | self.use_rejection_sampling = use_rejection_sampling 26 | 27 | def deepwalk_walk(self, walk_length, start_node): 28 | 29 | walk = [start_node] 30 | 31 | while len(walk) < walk_length: 32 | cur = walk[-1] 33 | cur_nbrs = list(self.G.neighbors(cur)) 34 | if len(cur_nbrs) > 0: 35 | walk.append(random.choice(cur_nbrs)) 36 | else: 37 | break 38 | return walk 39 | 40 | def node2vec_walk(self, walk_length, start_node): 41 | 42 | G = self.G 43 | alias_nodes = self.alias_nodes 44 | alias_edges = self.alias_edges 45 | 46 | walk = [start_node] 47 | 48 | while len(walk) < walk_length: 49 | cur = walk[-1] 50 | cur_nbrs = list(G.neighbors(cur)) 51 | if len(cur_nbrs) > 0: 52 | if len(walk) == 1: 53 | walk.append( 54 | cur_nbrs[alias_sample(alias_nodes[cur][0], alias_nodes[cur][1])]) 55 | else: 56 | prev = walk[-2] 57 | edge = (prev, cur) 58 | next_node = cur_nbrs[alias_sample(alias_edges[edge][0], 59 | alias_edges[edge][1])] 60 | walk.append(next_node) 61 | else: 62 | break 63 | 64 | return walk 65 | 66 | def node2vec_walk2(self, walk_length, start_node): 67 | """ 68 | Reference: 69 | KnightKing: A Fast Distributed Graph Random Walk Engine 70 | http://madsys.cs.tsinghua.edu.cn/publications/SOSP19-yang.pdf 71 | """ 72 | 73 | def rejection_sample(inv_p, inv_q, nbrs_num): 74 | upper_bound = max(1.0, max(inv_p, inv_q)) 75 | lower_bound = min(1.0, min(inv_p, inv_q)) 76 | shatter = 0 77 | second_upper_bound = max(1.0, inv_q) 78 | if (inv_p > second_upper_bound): 79 | shatter = second_upper_bound / nbrs_num 80 | upper_bound = second_upper_bound + shatter 81 | return upper_bound, lower_bound, shatter 82 | 83 | G = self.G 84 | alias_nodes = self.alias_nodes 85 | inv_p = 1.0 / self.p 86 | inv_q = 1.0 / self.q 87 | walk = [start_node] 88 | while len(walk) < walk_length: 89 | cur = walk[-1] 90 | cur_nbrs = list(G.neighbors(cur)) 91 | if len(cur_nbrs) > 0: 92 | if len(walk) == 1: 93 | walk.append( 94 | cur_nbrs[alias_sample(alias_nodes[cur][0], alias_nodes[cur][1])]) 95 | else: 96 | upper_bound, lower_bound, shatter = rejection_sample( 97 | inv_p, inv_q, len(cur_nbrs)) 98 | prev = walk[-2] 99 | prev_nbrs = set(G.neighbors(prev)) 100 | while True: 101 | prob = random.random() * upper_bound 102 | if (prob + shatter >= upper_bound): 103 | next_node = prev 104 | break 105 | next_node = cur_nbrs[alias_sample( 106 | alias_nodes[cur][0], alias_nodes[cur][1])] 107 | if (prob < lower_bound): 108 | 
break 109 | if (prob < inv_p and next_node == prev): 110 | break 111 | _prob = 1.0 if next_node in prev_nbrs else inv_q 112 | if (prob < _prob): 113 | break 114 | walk.append(next_node) 115 | else: 116 | break 117 | return walk 118 | 119 | def simulate_walks(self, num_walks, walk_length, workers=1, verbose=0): 120 | 121 | G = self.G 122 | 123 | nodes = list(G.nodes()) 124 | 125 | results = Parallel(n_jobs=workers, verbose=verbose, )( 126 | delayed(self._simulate_walks)(nodes, num, walk_length) for num in 127 | partition_num(num_walks, workers)) 128 | 129 | walks = list(itertools.chain(*results)) 130 | 131 | return walks 132 | 133 | def _simulate_walks(self, nodes, num_walks, walk_length,): 134 | walks = [] 135 | for _ in range(num_walks): 136 | random.shuffle(nodes) 137 | for v in nodes: 138 | if self.p == 1 and self.q == 1: 139 | walks.append(self.deepwalk_walk( 140 | walk_length=walk_length, start_node=v)) 141 | elif self.use_rejection_sampling: 142 | walks.append(self.node2vec_walk2( 143 | walk_length=walk_length, start_node=v)) 144 | else: 145 | walks.append(self.node2vec_walk( 146 | walk_length=walk_length, start_node=v)) 147 | return walks 148 | 149 | def get_alias_edge(self, t, v): 150 | """ 151 | Compute the unnormalized transition probabilities between node v and its neighbors, given the previously visited node t. 152 | :param t: the previously visited node 153 | :param v: the current node 154 | :return: alias table (accept, alias) for sampling the next node from v 155 | """ 156 | G = self.G 157 | p = self.p 158 | q = self.q 159 | 160 | unnormalized_probs = [] 161 | for x in G.neighbors(v): 162 | weight = G[v][x].get('weight', 1.0) # w_vx 163 | if x == t: # d_tx == 0 164 | unnormalized_probs.append(weight/p) 165 | elif G.has_edge(x, t): # d_tx == 1 166 | unnormalized_probs.append(weight) 167 | else: # d_tx > 1 168 | unnormalized_probs.append(weight/q) 169 | norm_const = sum(unnormalized_probs) 170 | normalized_probs = [ 171 | float(u_prob)/norm_const for u_prob in unnormalized_probs] 172 | 173 | return create_alias_table(normalized_probs) 174 | 175 | def preprocess_transition_probs(self): 176 | """ 177 | Preprocessing of transition probabilities for guiding the random walks. 
178 | """ 179 | G = self.G 180 | alias_nodes = {} 181 | for node in G.nodes(): 182 | unnormalized_probs = [G[node][nbr].get('weight', 1.0) 183 | for nbr in G.neighbors(node)] 184 | norm_const = sum(unnormalized_probs) 185 | normalized_probs = [ 186 | float(u_prob)/norm_const for u_prob in unnormalized_probs] 187 | alias_nodes[node] = create_alias_table(normalized_probs) 188 | 189 | if not self.use_rejection_sampling: 190 | alias_edges = {} 191 | 192 | for edge in G.edges(): 193 | alias_edges[edge] = self.get_alias_edge(edge[0], edge[1]) 194 | if not G.is_directed(): 195 | alias_edges[(edge[1], edge[0])] = self.get_alias_edge(edge[1], edge[0]) 196 | self.alias_edges = alias_edges 197 | 198 | self.alias_nodes = alias_nodes 199 | return 200 | 201 | 202 | class BiasedWalker: 203 | def __init__(self, idx2node, temp_path): 204 | 205 | self.idx2node = idx2node 206 | self.idx = list(range(len(self.idx2node))) 207 | self.temp_path = temp_path 208 | pass 209 | 210 | def simulate_walks(self, num_walks, walk_length, stay_prob=0.3, workers=1, verbose=0): 211 | 212 | layers_adj = pd.read_pickle(self.temp_path+'layers_adj.pkl') 213 | layers_alias = pd.read_pickle(self.temp_path+'layers_alias.pkl') 214 | layers_accept = pd.read_pickle(self.temp_path+'layers_accept.pkl') 215 | gamma = pd.read_pickle(self.temp_path+'gamma.pkl') 216 | walks = [] 217 | initialLayer = 0 218 | 219 | nodes = self.idx # list(self.g.nodes()) 220 | 221 | results = Parallel(n_jobs=workers, verbose=verbose, )( 222 | delayed(self._simulate_walks)(nodes, num, walk_length, stay_prob, layers_adj, layers_accept, layers_alias, gamma) for num in 223 | partition_num(num_walks, workers)) 224 | 225 | walks = list(itertools.chain(*results)) 226 | return walks 227 | 228 | def _simulate_walks(self, nodes, num_walks, walk_length, stay_prob, layers_adj, layers_accept, layers_alias, gamma): 229 | walks = [] 230 | for _ in range(num_walks): 231 | random.shuffle(nodes) 232 | for v in nodes: 233 | walks.append(self._exec_random_walk(layers_adj, layers_accept, layers_alias, 234 | v, walk_length, gamma, stay_prob)) 235 | return walks 236 | 237 | def _exec_random_walk(self, graphs, layers_accept, layers_alias, v, walk_length, gamma, stay_prob=0.3): 238 | initialLayer = 0 239 | layer = initialLayer 240 | 241 | path = [] 242 | path.append(self.idx2node[v]) 243 | 244 | while len(path) < walk_length: 245 | r = random.random() 246 | if(r < stay_prob): # same layer 247 | v = chooseNeighbor(v, graphs, layers_alias, 248 | layers_accept, layer) 249 | path.append(self.idx2node[v]) 250 | else: # different layer 251 | r = random.random() 252 | try: 253 | x = math.log(gamma[layer][v] + math.e) 254 | p_moveup = (x / (x + 1)) 255 | except: 256 | print(layer, v) 257 | raise ValueError() 258 | 259 | if(r > p_moveup): 260 | if(layer > initialLayer): 261 | layer = layer - 1 262 | else: 263 | if((layer + 1) in graphs and v in graphs[layer + 1]): 264 | layer = layer + 1 265 | 266 | return path 267 | 268 | 269 | def chooseNeighbor(v, graphs, layers_alias, layers_accept, layer): 270 | 271 | v_list = graphs[layer][v] 272 | 273 | idx = alias_sample(layers_accept[layer][v], layers_alias[layer][v]) 274 | v = v_list[idx] 275 | 276 | return v 277 | -------------------------------------------------------------------------------- /lib_node_embedding/node_embedding.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import config 4 | from lib_gnn_model.graphsage.graphsage import SAGE 5 | from lib_dataset.data_store import 
DataStore 6 | 7 | 8 | class NodeEmbedding: 9 | def __init__(self, args, graph, data): 10 | super(NodeEmbedding, self) 11 | 12 | self.logger = logging.getLogger(__name__) 13 | self.args = args 14 | self.graph = graph 15 | self.data = data 16 | 17 | self.data_store = DataStore(self.args) 18 | 19 | def sage_encoder(self): 20 | if self.args['is_gen_embedding']: 21 | self.logger.info("generating node embeddings with GraphSage...") 22 | 23 | node_to_embedding = {} 24 | # run sage 25 | self.target_model = SAGE(self.data.num_features, len(self.data.y.unique()), self.data) 26 | 27 | # self.target_model.train_model(50) 28 | 29 | # load a pretrained GNN model for generating node embeddings 30 | target_model_name = '_'.join((self.args['target_model'], 'random_1', 31 | str(self.args['shard_size_delta']), 32 | str(self.args['ratio_deleted_edges']), '0_0_1')) 33 | target_model_file = config.MODEL_PATH + self.args['dataset_name'] + '/' + target_model_name 34 | self.target_model.load_model(target_model_file) 35 | 36 | logits = self.target_model.generate_embeddings().detach().cpu().numpy() 37 | for node in self.graph.nodes: 38 | node_to_embedding[node] = logits[node] 39 | 40 | self.data_store.save_embeddings(node_to_embedding) 41 | else: 42 | node_to_embedding = self.data_store.load_embeddings() 43 | 44 | return node_to_embedding 45 | -------------------------------------------------------------------------------- /lib_utils/logger.py: -------------------------------------------------------------------------------- 1 | from texttable import Texttable 2 | 3 | def tab_printer(args): 4 | """ 5 | Function to print the logs in a nice tabular format. 6 | :param args: Parameters used for the model. 7 | """ 8 | # args = vars(args) 9 | keys = sorted(args.keys()) 10 | t = Texttable() 11 | t.add_rows([["Parameter", "Value"]] + [[k.replace("_"," ").capitalize(),args[k]] for k in keys]) 12 | print(t.draw()) -------------------------------------------------------------------------------- /lib_utils/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import errno 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import networkx as nx 7 | import torch 8 | from scipy.sparse import coo_matrix 9 | from tqdm import tqdm 10 | 11 | 12 | def graph_reader(path): 13 | """ 14 | Function to read the graph from the path. 15 | :param path: Path to the edge list. 16 | :return graph: NetworkX object returned. 17 | """ 18 | graph = nx.from_edgelist(pd.read_csv(path).values.tolist()) 19 | return graph 20 | 21 | 22 | def feature_reader(path): 23 | """ 24 | Reading the sparse feature matrix stored as csv from the disk. 25 | :param path: Path to the csv file. 26 | :return features: Dense matrix of features. 27 | """ 28 | features = pd.read_csv(path) 29 | node_index = features["node_id"].values.tolist() 30 | feature_index = features["feature_id"].values.tolist() 31 | feature_values = features["value"].values.tolist() 32 | node_count = max(node_index) + 1 33 | feature_count = max(feature_index) + 1 34 | features = coo_matrix((feature_values, (node_index, feature_index)), shape=(node_count, feature_count)).toarray() 35 | return features 36 | 37 | 38 | def target_reader(path): 39 | """ 40 | Reading the target vector from disk. 41 | :param path: Path to the target. 42 | :return target: Target vector. 
43 | """ 44 | target = np.array(pd.read_csv(path)["target"]).reshape(-1, 1) 45 | return target 46 | 47 | 48 | def make_adjacency(graph, max_degree, sel=None): 49 | all_nodes = np.array(graph.nodes()) 50 | 51 | # Initialize w/ links to a dummy node 52 | n_nodes = len(all_nodes) 53 | adj = (np.zeros((n_nodes + 1, max_degree)) + n_nodes).astype(int) 54 | 55 | if sel is not None: 56 | # only look at nodes in training set 57 | all_nodes = all_nodes[sel] 58 | 59 | for node in tqdm(all_nodes): 60 | neibs = np.array(list(graph.neighbors(node))) 61 | 62 | if sel is not None: 63 | neibs = neibs[sel[neibs]] 64 | 65 | if len(neibs) > 0: 66 | if len(neibs) > max_degree: 67 | neibs = np.random.choice(neibs, max_degree, replace=False) 68 | elif len(neibs) < max_degree: 69 | extra = np.random.choice(neibs, max_degree - neibs.shape[0], replace=True) 70 | neibs = np.concatenate([neibs, extra]) 71 | adj[node, :] = neibs 72 | 73 | return adj 74 | 75 | 76 | def connected_component_subgraphs(graph): 77 | """ 78 | Find all connected subgraphs in a networkx Graph 79 | 80 | Args: 81 | graph (Graph): A networkx Graph 82 | 83 | Yields: 84 | generator: A subgraph generator 85 | """ 86 | for c in nx.connected_components(graph): 87 | yield graph.subgraph(c) 88 | 89 | 90 | def check_exist(file_name): 91 | if not os.path.exists(os.path.dirname(file_name)): 92 | try: 93 | os.makedirs(os.path.dirname(file_name)) 94 | except OSError as exc: # Guard against race condition 95 | if exc.errno != errno.EEXIST: 96 | raise 97 | 98 | 99 | def filter_edge_index(edge_index, node_indices, reindex=True): 100 | assert np.all(np.diff(node_indices) >= 0), 'node_indices must be sorted' 101 | if isinstance(edge_index, torch.Tensor): 102 | edge_index = edge_index.cpu() 103 | 104 | node_index = np.isin(edge_index, node_indices) 105 | col_index = np.nonzero(np.logical_and(node_index[0], node_index[1]))[0] 106 | edge_index = edge_index[:, col_index] 107 | 108 | if reindex: 109 | return np.searchsorted(node_indices, edge_index) 110 | else: 111 | return edge_index 112 | 113 | 114 | def pyg_to_nx(data): 115 | """ 116 | Convert a torch geometric Data to networkx Graph. 117 | 118 | Args: 119 | data (Data): A torch geometric Data. 120 | 121 | Returns: 122 | Graph: A networkx Graph. 123 | """ 124 | graph = nx.Graph() 125 | graph.add_nodes_from(np.arange(data.num_nodes)) 126 | edge_index = data.edge_index.numpy() 127 | 128 | for u, v in np.transpose(edge_index): 129 | graph.add_edge(u, v) 130 | 131 | return graph 132 | 133 | 134 | def edge_index_to_nx(edge_index, num_nodes): 135 | """ 136 | Convert a torch geometric Data to networkx Graph by edge_index. 137 | Args: 138 | edge_index (Data.edge_index): A torch geometric Data. 139 | num_nodes (int): Number of nodes in a graph. 140 | Returns: 141 | Graph: networkx Graph 142 | """ 143 | graph = nx.Graph() 144 | graph.add_nodes_from(np.arange(num_nodes)) 145 | edge_index = edge_index.numpy() 146 | 147 | for u, v in np.transpose(edge_index): 148 | graph.add_edge(u, v) 149 | 150 | return graph 151 | 152 | 153 | def filter_edge_index_1(data, node_indices): 154 | """ 155 | Remove unnecessary edges from a torch geometric Data, only keep the edges between node_indices. 156 | Args: 157 | data (Data): A torch geometric Data. 158 | node_indices (list): A list of nodes to be deleted from data. 159 | 160 | Returns: 161 | data.edge_index: The new edge_index after removing the node_indices. 
162 | """ 163 | if isinstance(data.edge_index, torch.Tensor): 164 | data.edge_index = data.edge_index.cpu() 165 | 166 | edge_index = data.edge_index 167 | node_index = np.isin(edge_index, node_indices) 168 | 169 | col_index = np.nonzero(np.logical_and(node_index[0], node_index[1]))[0] 170 | edge_index = data.edge_index[:, col_index] 171 | 172 | return np.searchsorted(node_indices, edge_index) 173 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import torch 4 | 5 | from exp.exp_graph_partition import ExpGraphPartition 6 | from exp.exp_node_edge_unlearning import ExpNodeEdgeUnlearning 7 | from exp.exp_unlearning import ExpUnlearning 8 | from exp.exp_attack_unlearning import ExpAttackUnlearning 9 | from parameter_parser import parameter_parser 10 | 11 | 12 | def config_logger(save_name): 13 | # create logger 14 | logger = logging.getLogger() 15 | logger.setLevel(logging.DEBUG) 16 | formatter = logging.Formatter('%(levelname)s:%(asctime)s: - %(name)s - : %(message)s') 17 | 18 | # create console handler 19 | ch = logging.StreamHandler() 20 | ch.setLevel(logging.DEBUG) 21 | ch.setFormatter(formatter) 22 | logger.addHandler(ch) 23 | 24 | 25 | def main(args, exp): 26 | # config the logger 27 | logger_name = "_".join((exp, args['dataset_name'], args['partition_method'], str(args['num_shards']), str(args['test_ratio']))) 28 | config_logger(logger_name) 29 | logging.info(logger_name) 30 | 31 | torch.set_num_threads(args["num_threads"]) 32 | torch.cuda.set_device(args["cuda"]) 33 | os.environ["CUDA_VISIBLE_DEVICES"] = str(args["cuda"]) 34 | 35 | # subroutine entry for different methods 36 | if exp == 'partition': 37 | ExpGraphPartition(args) 38 | elif exp == 'unlearning': 39 | ExpUnlearning(args) 40 | elif exp == 'node_edge_unlearning': 41 | ExpNodeEdgeUnlearning(args) 42 | elif exp == 'attack_unlearning': 43 | ExpAttackUnlearning(args) 44 | else: 45 | raise Exception('unsupported attack') 46 | 47 | 48 | if __name__ == "__main__": 49 | args = parameter_parser() 50 | 51 | main(args, args['exp']) 52 | -------------------------------------------------------------------------------- /parameter_parser.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | 4 | def str2bool(v): 5 | if isinstance(v, bool): 6 | return v 7 | if v.lower() in ('yes', 'true', 't', 'y', '1'): 8 | return True 9 | elif v.lower() in ('no', 'false', 'f', 'n', '0'): 10 | return False 11 | else: 12 | raise argparse.ArgumentTypeError('Boolean value expected.') 13 | 14 | 15 | def parameter_parser(): 16 | """ 17 | A method to parse up command line parameters. 18 | The default hyper-parameters give a good quality representation without grid search. 
19 | """ 20 | parser = argparse.ArgumentParser() 21 | 22 | ######################### general parameters ################################ 23 | parser.add_argument('--is_vary', type=bool, default=False, help='control whether to use multiprocess') 24 | parser.add_argument('--dataset_name', type=str, default='citeseer', 25 | choices=["cora", "citeseer", "pubmed", "Coauthor_CS", "Coauthor_Phys"]) 26 | 27 | parser.add_argument('--exp', type=str, default='attack_unlearning', 28 | choices=["partition", "unlearning", "node_edge_unlearning", "attack_unlearning"]) 29 | parser.add_argument('--cuda', type=int, default=3, help='specify gpu') 30 | parser.add_argument('--num_threads', type=int, default=1) 31 | 32 | parser.add_argument('--is_upload', type=str2bool, default=True) 33 | parser.add_argument('--database_name', type=str, default="unlearning_dependant", 34 | choices=['unlearning_dependant', 'unlearning_adaptive', 35 | 'unlearning_graph_structure', 'gnn_unlearning_shards', 36 | 'unlearning_delta_plot', 'gnn_unlearning_utility', 37 | 'unlearning_ratio', 'unlearning_partition_baseline', 38 | 'unlearning_ratio', 'attack_unlearning']) 39 | 40 | ########################## graph partition parameters ###################### 41 | parser.add_argument('--is_split', type=str2bool, default=True) 42 | parser.add_argument('--test_ratio', type=float, default=0.1) 43 | parser.add_argument('--use_test_neighbors', type=str2bool, default=True) 44 | parser.add_argument('--is_partition', type=str2bool, default=True) 45 | parser.add_argument('--is_prune', type=str2bool, default=False) 46 | parser.add_argument('--num_shards', type=int, default=10) 47 | parser.add_argument('--is_constrained', type=str2bool, default=True) 48 | parser.add_argument('--is_gen_embedding', type=str2bool, default=True) 49 | 50 | parser.add_argument('--partition_method', type=str, default='sage_km', 51 | choices=["sage_km", "random", "lpa", "metis", "lpa_base", "sage_km_base"]) 52 | parser.add_argument('--terminate_delta', type=int, default=0) 53 | parser.add_argument('--shard_size_delta', type=float, default=0.005) 54 | 55 | ########################## unlearning parameters ########################### 56 | parser.add_argument('--repartition', type=str2bool, default=False) 57 | 58 | ########################## training parameters ########################### 59 | parser.add_argument('--is_train_target_model', type=str2bool, default=True) 60 | parser.add_argument('--is_use_node_feature', type=str2bool, default=False) 61 | parser.add_argument('--is_use_batch', type=str2bool, default=True, help="Use batch train GNN models.") 62 | parser.add_argument('--target_model', type=str, default='GAT', choices=["SAGE", "GAT", 'MLP', "GCN", "GIN"]) 63 | parser.add_argument('--train_lr', type=float, default=0.01) 64 | parser.add_argument('--train_weight_decay', type=float, default=0) 65 | parser.add_argument('--num_epochs', type=int, default=100) 66 | parser.add_argument('--num_runs', type=int, default=1) 67 | parser.add_argument('--batch_size', type=int, default=512) 68 | parser.add_argument('--test_batch_size', type=int, default=64) 69 | parser.add_argument('--aggregator', type=str, default='mean', choices=['mean', 'majority', 'optimal']) 70 | 71 | parser.add_argument('--opt_lr', type=float, default=0.001) 72 | parser.add_argument('--opt_decay', type=float, default=0.0001) 73 | parser.add_argument('--opt_num_epochs', type=int, default=50) 74 | parser.add_argument('--unlearning_request', type=str, default='random', choices=['random', 'adaptive', 'dependant', 
'top1', 'last5']) 75 | 76 | ########################## analysis parameters ################################### 77 | parser.add_argument('--num_unlearned_nodes', type=int, default=1) 78 | parser.add_argument('--ratio_unlearned_nodes', type=float, default=0.005) 79 | parser.add_argument('--num_unlearned_edges', type=int, default=1) 80 | parser.add_argument('--ratio_deleted_edges', type=float, default=0.9) 81 | parser.add_argument('--num_opt_samples', type=int, default=1000) 82 | 83 | args = vars(parser.parse_args()) 84 | 85 | return args 86 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | ## Introduction 2 | 3 | This code is an implementation of graph unlearning (the GraphEraser framework). 4 | 5 | #### Code Structure 6 | 7 | ``` 8 | . 9 | ├── config.py 10 | ├── exp 11 | ├── lib_aggregator 12 | ├── lib_dataset 13 | ├── lib_gnn_model 14 | ├── lib_graph_partition 15 | ├── lib_node_embedding 16 | ├── lib_utils 17 | ├── main.py 18 | ├── parameter_parser.py 19 | └── readme.md 20 | ``` 21 | 22 | #### Environment Preparation 23 | 24 | ```bash 25 | conda create --name graph_unlearning python=3.6.10 26 | conda activate graph_unlearning 27 | pip install scikit-learn ogb infomap seaborn munkres gensim fastdtw leidenalg cvxpy pymetis mysqlclient MulticoreTSNE cupy-cuda111 tensorflow 28 | pip install torch==1.9.0+cu111 torchvision==0.10.0+cu111 torchaudio==0.9.0 -f https://download.pytorch.org/whl/torch_stable.html 29 | TORCH="1.9.0" 30 | CUDA="cu111" 31 | pip install torch-scatter -f https://data.pyg.org/whl/torch-${TORCH}+${CUDA}.html 32 | pip install torch-sparse -f https://data.pyg.org/whl/torch-${TORCH}+${CUDA}.html 33 | pip install torch-cluster -f https://data.pyg.org/whl/torch-${TORCH}+${CUDA}.html 34 | pip install torch-spline-conv -f https://data.pyg.org/whl/torch-${TORCH}+${CUDA}.html 35 | pip install torch-geometric 36 | ``` 37 | 38 | #### GraphEraser Framework 39 | 40 | ###### Graph Partition 41 | 42 | See more parameter settings in parameter_parser.py at ***##graph partition parameters##***. 43 | 44 | ```bash 45 | $ python main.py --exp partition --is_partition true --partition_method lpa --is_constrained true 46 | 47 | $ python main.py --exp partition --is_partition true --partition_method sage_km --is_constrained true 48 | ``` 49 | 50 | ###### Aggregation 51 | 52 | See more parameter settings in parameter_parser.py at ***##training parameters##***. 53 | 54 | ```bash 55 | Use '--aggregator' to choose the desired aggregation method from ['mean', 'majority', 'optimal']. 56 | 57 | ``` 58 | 59 | ###### Unlearning 60 | 61 | See more parameter settings in parameter_parser.py at ***##unlearning parameters##***. 62 | 63 | ```bash 64 | Use '--repartition' to decide whether to re-partition the graph when responding to unlearning requests. 65 | 66 | Use '--unlearning_request' to choose the unlearning request distribution from ['random', 'adaptive', 'dependant', 'top1', 'last5']. 67 | ``` 68 | --------------------------------------------------------------------------------
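###### Example: end-to-end run (sketch)

The commands below are a minimal sketch of how the three stages can be chained together via `main.py`, using only flags defined in `parameter_parser.py`. The dataset name, shard count, and GPU id are placeholder values chosen for illustration, and the exact flag combination may need adjusting for your setup.

```bash
# 1. partition the training graph into shards (dispatches to ExpGraphPartition)
python main.py --exp partition --dataset_name cora --partition_method sage_km --num_shards 10 --cuda 0

# 2. train the shard models and aggregate them (dispatches to ExpUnlearning)
python main.py --exp unlearning --dataset_name cora --partition_method sage_km --num_shards 10 --aggregator optimal --cuda 0

# 3. respond to unlearning requests and evaluate the attack (dispatches to ExpAttackUnlearning)
python main.py --exp attack_unlearning --dataset_name cora --partition_method sage_km --num_shards 10 --repartition false --cuda 0
```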