├── .gitignore
├── .gitmodules
├── Makefile
├── readme.md
├── requirements.txt
└── src
    ├── main.py
    └── utils.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
data/
result/
embeddings/
logs/

*.pyc
*.swp
*~
*.log
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
[submodule "embedding_test"]
	path = embedding_test
	url = git@github.com:tadpole/embedding_test.git
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
#dataset = pubmed
#method = gcn
#task = classification

dataset = BlogCatalog   ### can be ['BlogCatalog', 'Wikipedia', 'pubmed']
method = deepwalk       ### can be ['deepwalk', 'AROPE', 'gcn']
task = link_predict     ### can be ['link_predict', 'classification']
ms = mle                ### can be ['mle', 'random_search', 'b_opt']
ms_name = $(shell echo $(ms) | sed "s/ /_/g")
#log_file = logs/l_$(dataset)_$(method)_$(task).log
log_file = logs/l_$(dataset)_$(method)_$(task)_$(ms_name).log
log_pid = logs/pid_$(dataset)_$(method)_$(task)

sample:
	python3 -u src/main.py $(dataset) sample $(task) $(ms)

run:
	python3 -u src/main.py $(dataset) $(method) $(task) $(ms)

server_run:
	nohup make run > $(log_file) 2>&1 & echo $$! > $(log_pid)

log:
	tail -n 20 $(log_file)

pid:
	cat $(log_pid)

kill:
	kill -9 `cat $(log_pid)`

check:
	ps -ef | grep "python3 -u src/main.py"

test:
	echo $(ms_name)

.PHONY: clean
clean:
	rm -f */.*swp*
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
# AutoNE
The implementation of "[AutoNE: Hyperparameter Optimization for Massive Network Embedding](https://tadpole.github.io/files/2019_KDD_AutoNE.pdf)" (KDD 2019).

### Requirements
- Python 3
```
$ pip3 install -r requirements.txt
```
This code also relies on the [embedding_test](https://github.com/tadpole/embedding_test) submodule. Please use the following command to download the code:
```
$ git clone --recursive git@github.com:tadpole/AutoNE.git
```

### Usage
The dataset can be downloaded from [here](https://cloud.tsinghua.edu.cn/f/73d0675acf134f259bf4/?dl=1).

You can change the `dataset`, `method`, `task`, and `ms` variables in the Makefile to select the data and model:

```
dataset : [BlogCatalog | Wikipedia | pubmed]
method  : [deepwalk | AROPE | gcn]
task    : [link_predict | classification]
ms      : [mle | random_search | b_opt]
```
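These are ordinary `make` variables, so you can also override them for a single run without editing the Makefile (standard GNU make command-line overrides):

```
$ make run dataset=Wikipedia method=AROPE task=classification ms=random_search
```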
#### Sampling dataset
```
$ make sample
```

#### Run the model
```
$ make run
```

### Cite
If you find this code useful, please cite our paper:
```
@inproceedings{tu2019autone,
  title={AutoNE: Hyperparameter Optimization for Massive Network Embedding},
  author={Tu, Ke and Ma, Jianxin and Cui, Peng and Pei, Jian and Zhu, Wenwu},
  booktitle={Proceedings of the 25th ACM SIGKDD International Conference on Knowledge Discovery \& Data Mining},
  year={2019},
  organization={ACM}
}
```
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
networkx==2.2
numpy==1.15.2
scipy==1.3.0
NetLSD==1.0.2
bayesian-optimization==1.0.0
deepwalk==1.0.3
--------------------------------------------------------------------------------
/src/main.py:
--------------------------------------------------------------------------------
import os, sys
import random
import time
import copy

import numpy as np
import networkx as nx
import netlsd
from scipy import sparse
from bayes_opt import BayesianOptimization

import utils

embedding_test_dir = 'embedding_test'
debug = True
cache = True

def split_graph(G, output_dir, radio=0.8):
    # Split G into train/test edge sets ('radio' is the train ratio). An edge
    # goes to the test set only if both endpoints are already covered by the
    # train graph, so every node appears in the train graph.
    t_dir = output_dir
    Gs = G
    file_path = os.path.join(t_dir, 'graph.edgelist')
    file_test_path = os.path.join(t_dir, 'graph_test.edgelist')
    G_train = nx.Graph()
    G_test = nx.Graph()
    edges = np.random.permutation(list(Gs.edges()))
    nodes = set()
    for a, b in edges:
        if a not in nodes or b not in nodes:
            G_train.add_edge(a, b)
            nodes.add(a)
            nodes.add(b)
        else:
            G_test.add_edge(a, b)
    assert len(nodes) == Gs.number_of_nodes()
    assert len(nodes) == G_train.number_of_nodes()
    num_test_edges = int((1-radio)*Gs.number_of_edges())
    now_number = G_test.number_of_edges()
    if num_test_edges < now_number:
        test_edges = list(G_test.edges())
        G_train.add_edges_from(test_edges[:now_number-num_test_edges])
        G_test.remove_edges_from(test_edges[:now_number-num_test_edges])
    print("sample graph, origin: {} {}, train: {} {}, test: {} {}".format(Gs.number_of_nodes(), Gs.number_of_edges(), G_train.number_of_nodes(), G_train.number_of_edges(), G_test.number_of_nodes(), G_test.number_of_edges()))
    with utils.write_with_create(file_path) as f:
        for i, j in G_train.edges():
            print(i, j, file=f)
    with utils.write_with_create(file_test_path) as f:
        for i, j in G_test.edges():
            print(i, j, file=f)
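# A minimal usage sketch for split_graph (hypothetical dataset path; this
# mirrors the 'sample' branch of main() below):
#   G = utils.load_graph('data/BlogCatalog/graph.edgelist', 'data/BlogCatalog/label.txt')
#   split_graph(G, 'data/BlogCatalog_0.8', radio=0.8)
# It writes graph.edgelist (train edges) and graph_test.edgelist (held-out
# edges) under data/BlogCatalog_0.8/.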
def sample_graph(G, output_dir, s_n, times=10, with_test=False, radio=0.8, feature_path=None):
    # Sample `times` subgraphs of roughly s_n nodes via random-walk induced
    # sampling, relabel nodes to 0..n-1, and save each into output_dir/s{t}/.
    if s_n is None:
        s_n = int(np.sqrt(G.number_of_nodes()))
    for t in range(times):
        t_dir = os.path.join(output_dir, 's{}'.format(t))
        n = random.randint(int(s_n/2), 2*s_n)
        Gs = utils.random_walk_induced_graph_sampling(G, n)
        mapping = dict(zip(Gs.nodes(), range(Gs.number_of_nodes())))
        if feature_path is not None:
            # Restrict the feature matrix to the sampled nodes, in relabeled order.
            feats = sparse.load_npz(feature_path)
            row = []
            col = []
            data = []
            fr, fc = feats.nonzero()
            for i, j in zip(fr, fc):
                if i in mapping:
                    row.append(mapping[i])
                    col.append(j)
                    data.append(feats[i, j])
            feats = sparse.csr_matrix((data, (row, col)), shape=(len(mapping), feats.shape[1]))
        Gs = nx.relabel_nodes(Gs, mapping)
        file_path = os.path.join(t_dir, 'graph.edgelist')
        file_test_path = os.path.join(t_dir, 'graph_test.edgelist')
        label_path = os.path.join(t_dir, 'label.txt')
        feature_save_path = os.path.join(t_dir, 'features.npz')
        if feature_path is not None:
            utils.write_with_create(feature_save_path).close()  # just creates t_dir
            sparse.save_npz(feature_save_path, feats)
        if not with_test:
            print("sample graph, nodes: {}, edges: {}, save into {}".format(Gs.number_of_nodes(), Gs.number_of_edges(), t_dir))
            with utils.write_with_create(file_path) as f:
                for i, j in Gs.edges():
                    print(i, j, file=f)
            with utils.write_with_create(label_path) as f:
                for i, data in Gs.nodes(data=True):
                    if 'label' in data:
                        for j in data['label']:
                            print(i, j, file=f)
        else:
            G_train = nx.Graph()
            G_test = nx.Graph()
            edges = np.random.permutation(list(Gs.edges()))
            nodes = set()
            for a, b in edges:
                if a not in nodes or b not in nodes:
                    G_train.add_edge(a, b)
                    nodes.add(a)
                    nodes.add(b)
                else:
                    G_test.add_edge(a, b)
            assert len(nodes) == Gs.number_of_nodes()
            assert len(nodes) == G_train.number_of_nodes()
            num_test_edges = int((1-radio)*Gs.number_of_edges())
            now_number = G_test.number_of_edges()
            if num_test_edges < now_number:
                test_edges = list(G_test.edges())
                G_train.add_edges_from(test_edges[:now_number-num_test_edges])
                G_test.remove_edges_from(test_edges[:now_number-num_test_edges])
            print("sample graph, origin: {} {}, train: {} {}, test: {} {}".format(Gs.number_of_nodes(), Gs.number_of_edges(), G_train.number_of_nodes(), G_train.number_of_edges(), G_test.number_of_nodes(), G_test.number_of_edges()))
            with utils.write_with_create(file_path) as f:
                for i, j in G_train.edges():
                    print(i, j, file=f)
            with utils.write_with_create(file_test_path) as f:
                for i, j in G_test.edges():
                    print(i, j, file=f)
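# Note: with s_n=1000 and times=5 (as used in main() below), this produces
# data/<dataset>/sampled/s0 ... s4, each holding a relabeled subgraph
# (graph.edgelist, label.txt, and optionally graph_test.edgelist/features.npz).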
def get_result(dataset_name, target_model, task, kargs, sampled_dir='', debug=debug, cache=cache):
    # Train target_model with hyperparameters kargs and return its score on
    # `task`. Embeddings and scores are cached on disk and recomputed only
    # when missing or stale.
    rs = utils.RandomState()
    rs.save_state()
    rs.set_seed(0)
    embedding_filename = utils.get_names(target_model, **kargs)
    if task == 'classification':
        cf = os.path.abspath(os.path.join('result/{}'.format(dataset_name), sampled_dir, 'cf', embedding_filename))
    elif task == 'link_predict':
        cf = os.path.abspath(os.path.join('result/{}'.format(dataset_name), sampled_dir, 'lp', embedding_filename))
    embedding_filename = os.path.abspath(os.path.join('embeddings/{}'.format(dataset_name), sampled_dir, embedding_filename))
    dataset_filename = os.path.abspath(os.path.join('data/{}'.format(dataset_name), sampled_dir, 'graph.edgelist'))
    if target_model != 'gcn':
        if (not cache) or (not os.path.exists(embedding_filename)) or (os.path.getmtime(embedding_filename) < os.path.getmtime(dataset_filename)):
            utils.run_target_model(target_model, dataset_filename, os.path.dirname(embedding_filename), embedding_test_dir=embedding_test_dir, debug=debug, **kargs)
        if (not cache) or (not os.path.exists(cf)) or (os.path.getmtime(cf) < os.path.getmtime(embedding_filename)):
            if task == 'classification':
                labels = os.path.abspath(os.path.join(os.path.dirname(dataset_filename), 'label.txt'))
            elif task == 'link_predict':
                labels = os.path.abspath(os.path.dirname(dataset_filename))
            utils.run_test(task, dataset_name, [embedding_filename], labels, cf, embedding_test_dir=embedding_test_dir)
    else:
        if (not cache) or (not os.path.exists(cf)):
            data_path = os.path.abspath(os.path.join('data/{}'.format(dataset_name)))
            with utils.cd(os.path.join(embedding_test_dir, 'src/baseline/gcn/gcn')):
                # (note: 'dropout' and 'weight_decay' are sampled in Params but
                # not forwarded to the gcn script here)
                cmd = ('python3 main.py'
                       ' --epochs {} --hidden1 {} --learning_rate {}'
                       ' --output_filename {} --debug {} --dataset {} --input_dir {}').format(kargs['epochs'], kargs['hidden1'], kargs['learning_rate'], cf, debug, dataset_name, data_path)
                if debug:
                    print(cmd)
                else:
                    cmd += ' > /dev/null 2>&1'
                os.system(cmd)
    rs.load_state()
    res = np.loadtxt(cf, dtype=float)
    if len(res.shape) != 0:
        res = res[0]
    return res

def get_wne(dataset_name, sampled_dir='', cache=True):
    # Compute (and cache) the NetLSD heat-trace signature of a graph; this is
    # the whole-network embedding ("wne") used as the graph part of the GP input.
    dataset_filename = os.path.abspath(os.path.join('data/{}'.format(dataset_name), sampled_dir, 'graph.edgelist'))
    save_path = os.path.abspath(os.path.join('embeddings/{}'.format(dataset_name), sampled_dir, 'wme.embeddings'))
    if (not cache) or (not os.path.exists(save_path)) or (os.path.getmtime(save_path) < os.path.getmtime(dataset_filename)):
        G = utils.load_graph(dataset_filename, label_name=None)
        do_full = (G.number_of_nodes() < 10000)
        eigenvalues = 'full' if do_full else 'auto'
        wne = netlsd.heat(G, timescales=np.logspace(-2, 2, 10), eigenvalues=eigenvalues)
        with utils.write_with_create(save_path) as f:
            print(" ".join(map(str, wne)), file=f)
    return np.loadtxt(save_path)
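# A quick sanity-check sketch for the signature itself (toy graph, assuming the
# pinned NetLSD==1.0.2 API from requirements.txt):
#   import netlsd, networkx as nx, numpy as np
#   sig = netlsd.heat(nx.karate_club_graph(), timescales=np.logspace(-2, 2, 10))
#   print(sig.shape)  # -> (10,), one heat-trace value per timescale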
def _get_mle_result(gp, dataset_name, target_model, task, without_wne, params, ps, s, X, y):
    # One transfer step: use the fitted GP to propose hyperparameters for the
    # full graph, evaluate them, and (for s rounds) feed the result back in.
    wne = get_wne(dataset_name, '', cache=True) if not without_wne else None
    X_b_t, res_t = None, -1.0
    X_t = copy.deepcopy(X)
    y_t = copy.deepcopy(y)
    for i in range(s):
        X_b, y_b = gp.predict(ps, params.get_bound(ps), params.get_type(ps), wne)
        X_b = params.convert(X_b, ps)
        args = params.random_args(ps=ps, known_args=dict(zip(ps, X_b)))
        res = get_result(dataset_name, target_model, task, args, '')
        if res_t < res:
            res_t = res
            X_b_t = X_b
        if without_wne:
            X_b = [X_b]
        else:
            X_b = np.hstack((X_b, wne))
        X_t = np.vstack((X_t, X_b))
        y_t.append(res)
        gp.fit(X_t, y_t)
    X_b, y_b = gp.predict(ps, params.get_bound(ps), params.get_type(ps), wne)
    X_b = params.convert(X_b, ps)
    args = params.random_args(ps=ps, known_args=dict(zip(ps, X_b)))
    res = get_result(dataset_name, target_model, task, args, '')
    if res_t < res:
        res_t = res
        X_b_t = X_b
    return X_b_t, res_t

def mle_k(dataset_name, target_model, task='classification', sampled_number=10, without_wne=False, k=16, s=0, print_iter=10, debug=False):
    # AutoNE meta-learning: evaluate k random hyperparameter settings on each
    # of `sampled_number` subgraphs, fit a GP on (hyperparameters [+ NetLSD
    # signature]) -> score, then run s transfer iterations on the full graph.
    X = []
    y = []
    params = utils.Params(target_model)
    ps = params.arg_names
    total_t = 0.0
    info = []
    X_t, res_t = None, -1.0
    if without_wne:
        gp = utils.GaussianProcessRegressor()
    else:
        K = utils.K(len(ps))
        gp = utils.GaussianProcessRegressor(K)
    for t in range(sampled_number):
        b_t = time.time()
        wne = get_wne(dataset_name, 'sampled/s{}'.format(t), cache=True)
        for v in range(k):
            kargs = params.random_args(ps)
            res = get_result(dataset_name, target_model, task, kargs, 'sampled/s{}'.format(t))
            if without_wne:
                X.append([kargs[p] for p in ps])
            else:
                X.append(np.hstack(([kargs[p] for p in ps], wne)))
            if debug:
                print('sample {}, {}/{}, kargs: {}, res: {}, time: {:.4f}s'.format(t, v, k, [kargs[p] for p in ps], res, time.time()-b_t))
            y.append(res)

    for t in range(s):
        b_t = time.time()
        gp.fit(np.vstack(X), y)
        X_temp, res_temp = _get_mle_result(gp, dataset_name, target_model, task, without_wne, params, ps, 0, X, y)
        if without_wne:
            X.append(X_temp)
        else:
            # note: `wne` here is still the signature of the last sampled subgraph
            X.append(np.hstack((X_temp, wne)))
        y.append(res_temp)
        if res_t < res_temp:
            res_t = res_temp
            X_t = X_temp
        e_t = time.time()
        total_t += e_t-b_t
        info.append([res_temp, total_t])
        print('iters: {}/{}, params: {}, res: {}, time: {:.4f}s'.format(t, s, X_temp, res_temp, total_t))
    if debug:
        return X_t, res_t, info
    return X_t, res_t
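# Example invocation (mirrors the 'mle' branch of main() below):
#   X_best, y_best, info = mle_k('BlogCatalog', 'deepwalk', 'link_predict',
#                                sampled_number=5, k=5, s=10, debug=True)
# X_best holds the best hyperparameter vector found on the full graph and
# y_best its score.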
def random_search(dataset_name, target_model, task, k=16, debug=False, sampled_dir=''):
    # Baseline: evaluate k uniformly random hyperparameter settings and return
    # the best one.
    X = []
    y = []
    params = utils.Params(target_model)
    ps = params.arg_names
    b_t = time.time()
    info = []
    for v in range(k):
        kargs = params.random_args(ps)
        if debug:
            print(kargs)
        res = get_result(dataset_name, target_model, task, kargs, sampled_dir)
        X.append([kargs[p] for p in ps])
        y.append(res)
        ind = np.argmax(y)
        total_t = time.time()-b_t
        if debug:
            info.append([y[ind], total_t])
            print('iters: {}/{}, params: {}, res: {}, time: {:.4f}s'.format(v, k, X[ind], y[ind], total_t))
    X = np.array(X)
    y = np.array(y)
    ind = np.argmax(y)
    if debug:
        return X[ind], y[ind], info
    return X[ind], y[ind]


def b_opt(dataset_name, target_model, task, k=16, debug=False, n_inits=0, inits=None, sampled_dir=''):
    # Baseline: vanilla Bayesian optimization over the hyperparameter space
    # using the bayes_opt package (no transfer from sampled subgraphs).
    params = utils.Params(target_model)
    ps = params.arg_names
    p_bound = dict(zip(ps, params.get_bound(ps)))
    def black_box_function(**kargs):
        b_t = time.time()
        x = [kargs[p] for p in ps]
        args = params.convert(x, ps)
        kargs = dict(zip(ps, args))
        kargs['emd_size'] = 64
        if target_model == 'AROPE':
            kargs['order'] = 3
        res = get_result(dataset_name, target_model, task, kargs, sampled_dir)
        e_t = time.time()
        print("############## params: {}, time: {}s".format(kargs, e_t-b_t))
        return res
    opt = BayesianOptimization(
        f=black_box_function,
        pbounds=p_bound,
        verbose=2)
    #opt.set_gp_params(normalize_y=False)
    if inits is not None:
        for d in inits:
            dd = dict(zip(ps, d))
            target = black_box_function(**dd)
            print(dd, target)
            opt.register(params=dd, target=target)
    opt.maximize(init_points=n_inits, n_iter=k)
    X = [opt.max['params'][p] for p in ps]
    y = opt.max['target']
    if debug:
        info = [res['target'] for res in opt.res]
        return X, y, info
    return X, y
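# b_opt drives the bayesian-optimization==1.0.0 API pinned in requirements.txt.
# A bare-bones equivalent sketch (the pbounds value is illustrative, taken from
# the deepwalk bounds in utils.Params):
#   opt = BayesianOptimization(f=black_box_function,
#                              pbounds={'number-walks': (2, 20)}, verbose=2)
#   opt.maximize(init_points=5, n_iter=5)
#   opt.max  # -> {'target': best_score, 'params': {...}}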
def test_1(dataset_name, target_model, task):
    # Ad-hoc smoke test: evaluate one random hyperparameter setting on the
    # first sampled subgraph and print it.
    params = utils.Params(target_model)
    ps = params.arg_names
    sampled_dir = 'sampled/s0'
    temp_args = params.random_args(ps)
    res = get_result(dataset_name, target_model, task, temp_args, sampled_dir, cache=True)
    print([temp_args[p] for p in ps], res)
    return 0

def main(args):
    seed = None
    random.seed(seed)
    np.random.seed(seed)
    if len(args) == 0:
        dataset_name = 'pubmed'
        target_model = 'gcn'
        task = 'classification'
        ms = ['mle', 'random_search', 'b_opt']
    else:
        dataset_name = args[0]
        target_model = args[1]
        task = args[2]
        ms = args[3:]
    with_test = False
    dataset_path = 'data/{}/graph.edgelist'.format(dataset_name)
    label_path = 'data/{}/label.txt'.format(dataset_name)
    feature_path = None
    if task == 'link_predict':
        dataset_name = dataset_name+'_0.8'
        label_path = None
        with_test = True
    if target_model == 'gcn':
        feature_path = 'data/{}/features.npz'.format(dataset_name)
    if target_model == 'sample':
        G = utils.load_graph(dataset_path, label_path)
        split_graph(G, 'data/{}_0.8'.format(dataset_name), radio=0.8)
        sample_graph(G, 'data/{}/sampled'.format(dataset_name), s_n=1000, times=5, with_test=with_test, feature_path=feature_path)
        return 0
    ks = 5
    sampled_dir = ''

    for m in ms:
        res = []
        for i in range(ks):
            info = []
            if m == 'mle':
                X, y, info = mle_k(dataset_name, target_model, task, sampled_number=5, without_wne=False, k=5, s=10, debug=True)
            elif m == 'mle_w':
                X, y, info = mle_k(dataset_name, target_model, task, sampled_number=5, without_wne=True, k=5, s=10, debug=True)
            elif m == 'random_search':
                X, y, info = random_search(dataset_name, target_model, task, k=10, debug=True, sampled_dir=sampled_dir)
            elif m == 'random_search_l':
                X, y, info = random_search(dataset_name, target_model, task, k=5, debug=True, sampled_dir=sampled_dir)
            elif m == 'b_opt':
                b_t = time.time()
                X, y, info_t = b_opt(dataset_name, target_model, task, k=5, n_inits=5, debug=True, sampled_dir=sampled_dir)
                e_t = time.time()
                info = [[j, (e_t-b_t)/len(info_t)*(i+1)] for i, j in enumerate(info_t)]
            elif m == 'b_opt_l':
                b_t = time.time()
                X, y, info_t = b_opt(dataset_name, target_model, task, k=5, n_inits=1, debug=True, sampled_dir=sampled_dir)
                e_t = time.time()
                info = [[j, (e_t-b_t)/len(info_t)*(i+1)] for i, j in enumerate(info_t)]
            res.append(info)
            print(m, i, res)
        ts = 'lp' if task == 'link_predict' else 'cf'
        if sampled_dir == '':
            save_filename = 'result/{}/res_{}_{}_{}.npz'.format(dataset_name, ts, m, target_model)
        else:
            save_filename = 'result/{}/res_{}_{}_{}_{}.npz'.format(dataset_name, os.path.basename(sampled_dir), ts, m, target_model)
        np.savez(save_filename, res=res)

if __name__ == '__main__':
    main(sys.argv[1:])
--------------------------------------------------------------------------------
/src/utils.py:
--------------------------------------------------------------------------------
import random
import os, sys
import collections
import pickle as pkl
import scipy.sparse as sp

import networkx as nx
import numpy as np
from sklearn.cluster import SpectralClustering
from sklearn import gaussian_process
from scipy.optimize import minimize
from bayes_opt import BayesianOptimization

def rand(size, a, b, decimals=4):
    # Uniform samples in [a, b), optionally rounded.
    res = np.random.random_sample(size)*(b-a)+a
    if decimals is not None:
        return np.around(res, decimals=decimals)
    return res

def softmax(x):
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum()

def split_network(G, N):
    sc = SpectralClustering(N, affinity='precomputed')
    return sc.fit_predict(nx.adjacency_matrix(G))

def random_walk_induced_graph_sampling(G, N, T=100, growth_size=2, n_starts=5):
    # Random-walk induced subgraph sampling; refer to
    # https://github.com/Ashish7129/Graph-Sampling
    # A walk restarts from a random node whenever fewer than growth_size new
    # nodes were collected in the last T steps.
    G = nx.convert_node_labels_to_integers(G, 0, 'default', True)
    for n, data in G.nodes(data=True):
        G.nodes[n]['id'] = n
    n_node = G.number_of_nodes()
    labels = nx.get_node_attributes(G, 'label')
    candidate_set = set()
    if len(labels) > 0:
        # Prefer start nodes that together cover as many distinct labels as possible.
        candidate_set_value = set()
        l = np.random.permutation(list(labels.keys()))
        for i in l:
            for j in labels[i]:
                if j not in candidate_set_value:
                    candidate_set.add(i)
                    candidate_set_value.add(j)
    while len(candidate_set) < n_starts:
        candidate_set.add(np.random.randint(n_node))
    candidate_set = list(candidate_set)[:n_starts]
    sampled_nodes = set()
    for i, temp_node in enumerate(candidate_set):
        sampled_nodes.add(G.nodes[temp_node]['id'])
        iter_ = 1
        nodes_before_t_iter = 0
        curr_node = temp_node
        N_t = int(N/n_starts)*(i+1)
        while len(sampled_nodes) < N_t:
            edges = [n for n in G.neighbors(curr_node)]
            index_of_edge = random.randint(0, len(edges)-1)
            chosen_node = edges[index_of_edge]
            sampled_nodes.add(G.nodes[chosen_node]['id'])
            curr_node = chosen_node
            iter_ += 1
            if iter_ % T == 0:
                if (len(sampled_nodes)-nodes_before_t_iter < growth_size):
                    curr_node = random.randint(0, n_node-1)
                nodes_before_t_iter = len(sampled_nodes)
    sampled_graph = G.subgraph(sampled_nodes)
    return sampled_graph
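# A small self-check sketch for the sampler (toy graph, assuming the pinned
# networkx==2.2 from requirements.txt):
#   import networkx as nx
#   Gs = random_walk_induced_graph_sampling(nx.karate_club_graph(), 15)
#   print(Gs.number_of_nodes())  # roughly 15 nodes, biased toward well-connected regions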
def generate_mask(dataset_path, radio=0.8):
    # Write train/test node-id masks ('radio' is the train ratio).
    G = load_graph(os.path.join(dataset_path, 'graph.edgelist'), os.path.join(dataset_path, 'label.txt'))
    labels = nx.get_node_attributes(G, 'label')
    l = np.random.permutation(list(labels.keys()))
    n = int(radio*len(l))
    with open(os.path.join(dataset_path, 'label_mask_train'), 'w') as f:
        for i in l[:n]:
            print(i, file=f)
    with open(os.path.join(dataset_path, 'label_mask_test'), 'w') as f:
        for i in l[n:]:
            print(i, file=f)


def write_with_create(path):
    # open(path, 'w'), creating parent directories as needed.
    dirpath = os.path.dirname(path)
    if not os.path.exists(dirpath):
        os.makedirs(dirpath)
    return open(path, 'w')

def load_graph(edgelist_filename, label_name=None):
    G = nx.read_edgelist(edgelist_filename, nodetype=int)
    if label_name is not None:
        labels = np.loadtxt(label_name, dtype=int)
        ### multi-label
        l = collections.defaultdict(list)
        for i, j in labels:
            l[i].append(j)
        ### Warning: the call order of the `values` and `name` arguments switched between networkx v1.x and v2.x.
        nx.set_node_attributes(G, l, 'label')
    print("load graph", G.number_of_nodes(), G.number_of_edges())
    return G

def run_target_model(method, input_filename, output_dir, embedding_test_dir, debug=True, **kargs):
    sys.path.append(embedding_test_dir)
    from src.baseline import baseline
    with cd(embedding_test_dir):
        baseline(method, None, kargs['emd_size'], input_filename, output_dir, debug=debug, **kargs)

def run_test(task, dataset_name, models, labels, save_filename, embedding_test_dir):
    sys.path.append(embedding_test_dir)
    from src.test import test
    args = {}
    if task == 'classification':
        args['radio'] = [0.8]
        args['label_name'] = labels
        evaluation = None
    elif task == 'link_predict':
        evaluation = 'AUC'
        args['data_dir'] = labels
        args['sampling_mapping'] = {'Flickr': 100000, 'wiki': 1000000}

    with cd(embedding_test_dir):
        test(task, evaluation, dataset_name, models, save_filename=save_filename, **args)

def get_names(method, **args):
    # Deterministic embedding filename for a (method, hyperparameters) pair,
    # used as the on-disk cache key.
    kargs = args
    if method == 'node2vec':
        embedding_filename = "{}_{:d}_{:d}_{:d}_{:d}_{:.4f}_{:.4f}".format(method, kargs['emd_size'], kargs['num-walks'], kargs['walk-length'], kargs['window-size'], kargs['p'], kargs['q'])
    elif method == 'deepwalk':
        embedding_filename = "{}_{:d}_{:d}_{:d}_{:d}".format(method, kargs['emd_size'], kargs['number-walks'], kargs['walk-length'], kargs['window-size'])
    elif method == 'gcn':
        embedding_filename = "{}_{:d}_{:d}_{:.4f}_{:.4f}_{:.4f}".format(method, kargs['epochs'], kargs['hidden1'], kargs['learning_rate'], kargs['dropout'], kargs['weight_decay'])
    elif method == 'AROPE':
        embedding_filename = "{}_{}_".format(method, kargs['emd_size'])+'_'.join(['{:.4f}'.format(kargs['w{}'.format(i+1)]) for i in range(kargs['order'])])
    return embedding_filename

def random_with_bound_type(bound, type_):
    # Draw one random value per (bound, type) pair.
    res = []
    for b, t in zip(bound, type_):
        if t == int:
            res.append(random.randint(*b))
        elif t == float:
            res.append(rand(1, *b)[0])
        else:
            res.append(None)
    return res


def find_b_opt_max(gp, ps, p_bound, p_type, w=None, n_warmup=100000, n_iter=100):
    """
    Refer to acq_max: https://github.com/fmfn/BayesianOptimization/blob/master/bayes_opt/util.py
    """
    X = []
    for k in range(n_warmup):
        X.append(random_with_bound_type(p_bound, p_type))
    if w is not None:
        X = np.hstack((X, np.tile(w, (len(X), 1))))
    y = gp.predict(X)
    ind = np.argmax(y)
    x_max, y_max = X[ind][:len(ps)], y[ind]
    temp_w = [] if w is None else w
    def utility(x, kappa=2.576):
        # UCB acquisition: posterior mean + kappa * posterior std.
        mean, std = gp.predict([list(x)+list(temp_w)], return_std=True)
        return (mean + kappa*std)[0]
    for i in range(n_iter):
        x_try = random_with_bound_type(p_bound, p_type)
        res = minimize(lambda x: -utility(x),
                       x_try,
                       bounds=p_bound,
                       method='L-BFGS-B')
        if not res.success:
            continue
        if -res.fun >= y_max:
            x_max = res.x
            y_max = -res.fun

    return x_max, y_max
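# find_b_opt_max approximates acquisition maximization: it seeds x_max with the
# best posterior mean over n_warmup random draws, then runs n_iter random
# restarts of L-BFGS-B on the UCB utility mean + kappa*std, kappa = 2.576 (the
# two-sided 99% normal quantile). A hand-check with toy numbers, not from the
# paper:
#   mean = 0.80, std = 0.05  ->  UCB = 0.80 + 2.576*0.05 = 0.9288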
class cd:
    """Context manager for changing the current working directory"""
    def __init__(self, newPath):
        self.newPath = os.path.expanduser(newPath)

    def __enter__(self):
        self.savedPath = os.getcwd()
        os.chdir(self.newPath)

    def __exit__(self, etype, value, traceback):
        os.chdir(self.savedPath)

class K(gaussian_process.kernels.Kernel):
    def __init__(self, n=3):
        self.n = n
        self.kernels = [gaussian_process.kernels.Matern(nu=2.5), gaussian_process.kernels.Matern(nu=2.5)]

    def __call__(self, X, Y=None):
        n = self.n
        if Y is None:
            Y = X
        return self.kernels[0](X[:, :n], Y[:, :n])*self.kernels[1](X[:, n:], Y[:, n:])

    def diag(self, X):
        n = self.n
        return self.kernels[0].diag(X[:, :n])*self.kernels[1].diag(X[:, n:])

    def is_stationary(self):
        return np.all([kernel.is_stationary() for kernel in self.kernels])
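# The K kernel above factorizes similarity into a hyperparameter part and a
# graph part: with x the first n input columns (hyperparameters) and w the
# remaining columns (the NetLSD signature),
#   k((x, w), (x', w')) = Matern_2.5(x, x') * Matern_2.5(w, w')
# so two observations only correlate strongly when both their hyperparameters
# and their source graphs are close.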
class GaussianProcessRegressor(object):
    def __init__(self, kernel=None):
        if kernel is None:
            kernel = gaussian_process.kernels.Matern(nu=2.5)
        self.gp = gaussian_process.GaussianProcessRegressor(
            kernel=kernel,
            alpha=1e-6,
            normalize_y=True,
            n_restarts_optimizer=10)

    def fit(self, X, y):
        self.gp.fit(X, y)

    def predict(self, ps, p_bound, type_, w=None):
        # Despite the name, this maximizes the acquisition function over the
        # hyperparameter space (with the graph signature w held fixed).
        return find_b_opt_max(self.gp, ps, p_bound, type_, w)

class meta_learner(BayesianOptimization):
    def set_kernel(self, kernel):
        self._gp.kernel = kernel

class RandomState(object):
    # Save/restore the global RNG state so the fixed seed used inside
    # get_result does not disturb the outer experiment's randomness.
    def __init__(self):
        self.state = None

    def set_seed(self, seed):
        random.seed(seed)
        np.random.seed(seed)

    def save_state(self):
        self.state = (random.getstate(), np.random.get_state())

    def load_state(self):
        random.setstate(self.state[0])
        np.random.set_state(self.state[1])

class Params(object):
    # Hyperparameter names, types, and search bounds for each target model.
    def __init__(self, method):
        self.method = method
        if method == 'node2vec':
            self.arg_names = ['num-walks', 'walk-length', 'window-size', 'p', 'q']
            self.type_ = [int, int, int, float, float]
            self.bound = [(2, 20), (2, 80), (2, 10), (0.0001, 2), (0.0001, 2)]
        elif method == 'deepwalk':
            self.arg_names = ['number-walks', 'walk-length', 'window-size']
            self.type_ = [int, int, int]
            self.bound = [(2, 20), (2, 80), (2, 20)]
        elif method == 'gcn':
            self.arg_names = ['epochs', 'hidden1', 'learning_rate', 'dropout', 'weight_decay']
            self.type_ = [int, int, float, float, float]
            self.bound = [(10, 300), (2, 64), (0.0001, 0.1), (0.1, 0.9), (1e-4, 100e-4)]
        elif method == 'AROPE':
            n = 3
            self.arg_names = ['w{}'.format(i+1) for i in range(n)]
            self.type_ = [float for _ in range(n)]
            self.bound = [(0, 3) for _ in range(n)]
        self.ind = dict(zip(self.arg_names, range(len(self.arg_names))))

    def get_type(self, ps=None):
        if ps is None:
            return self.type_
        return [self.type_[self.ind[p]] for p in ps]

    def get_bound(self, ps=None):
        if ps is None:
            return self.bound
        return [self.bound[self.ind[p]] for p in ps]

    def convert(self, X, ps=None):
        # Clip X into bounds and cast each entry to its declared type.
        type_ = self.get_type(ps)
        bound = np.array(self.get_bound(ps))
        X = np.clip(X, bound[:, 0], bound[:, 1])
        res = []
        for x, t in zip(X, type_):
            if t == int:
                res.append(int(round(x, 0)))
            elif t == float:
                res.append(round(x, 4))
        return res

    def convert_dict(self, d, ps=None):
        for p in ps:
            x = np.clip(d[p], self.bound[self.ind[p]][0], self.bound[self.ind[p]][1])
            t = self.type_[self.ind[p]]
            if t == int:
                x = int(round(x, 0))
            elif t == float:
                x = round(x, 4)
            d[p] = x
        return d

    def random_args(self, ps=None, emd_size=64, known_args={}):
        if ps is None:
            ps = self.arg_names
        type_ = self.get_type(ps)
        bound = self.get_bound(ps)
        res = random_with_bound_type(bound, type_)
        d = dict(zip(ps, res))
        for arg in known_args:
            d[arg] = known_args[arg]
        if self.method != 'gcn':
            d['emd_size'] = emd_size
        if self.method == 'AROPE':
            d['order'] = 3
        return d
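# Usage sketch for Params (the deepwalk values below are illustrative, not
# actual output):
#   p = Params('deepwalk')
#   p.random_args()              # -> e.g. {'number-walks': 7, 'walk-length': 23,
#                                #          'window-size': 4, 'emd_size': 64}
#   p.convert([7.3, 90.0, 4.2])  # -> [7, 80, 4], clipped to bounds and cast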
def analysis_result(data_dir):
    fs = os.listdir(data_dir)
    fs = np.array([np.loadtxt(os.path.join(data_dir, f)) for f in fs if not f.endswith('names')])
    print(fs.shape)
    scale = 100
    d = (fs[:, 0]*scale).astype(int)
    for k, v in collections.Counter(d).most_common():
        print(k*1.0/scale, v, "{:.2f}".format(v*1.0/fs.shape[0]))

def check_label(data_dir):
    fn = os.path.join(data_dir, 'label.txt')
    d = np.loadtxt(fn, dtype=int)
    c = collections.Counter(d[:, 1])
    for k, v in c.most_common():
        print(k, v)

def convert_gcn_data(dataset_name, input_dir, output_dir):
    def parse_index_file(filename):
        """Parse index file."""
        index = []
        for line in open(filename):
            index.append(int(line.strip()))
        return index
    names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']
    objects = []
    for i in range(len(names)):
        with open(os.path.join(input_dir, "ind.{}.{}".format(dataset_name, names[i])), 'rb') as f:
            if sys.version_info > (3, 0):
                objects.append(pkl.load(f, encoding='latin1'))
            else:
                objects.append(pkl.load(f))

    x, y, tx, ty, allx, ally, graph = tuple(objects)
    test_idx_reorder = parse_index_file(os.path.join(input_dir, "ind.{}.test.index".format(dataset_name)))
    test_idx_range = np.sort(test_idx_reorder)

    if dataset_name == 'citeseer':
        # Fix citeseer dataset (there are some isolated nodes in the graph)
        # Find isolated nodes, add them as zero-vecs into the right position
        test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder)+1)
        tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1]))
        tx_extended[test_idx_range-min(test_idx_range), :] = tx
        tx = tx_extended
        ty_extended = np.zeros((len(test_idx_range_full), y.shape[1]))
        ty_extended[test_idx_range-min(test_idx_range), :] = ty
        ty = ty_extended

    features = sp.vstack((allx, tx)).tolil()
    features[test_idx_reorder, :] = features[test_idx_range, :]
    G = nx.from_dict_of_lists(graph)
    adj = nx.adjacency_matrix(G)

    labels = np.vstack((ally, ty))
    labels[test_idx_reorder, :] = labels[test_idx_range, :]
    print(G.number_of_nodes(), G.number_of_edges())
    with open('data/{}/label.txt'.format(dataset_name), 'w') as f:
        for i in range(len(labels)):
            assert sum(labels[i]) > 0
            for j, k in enumerate(labels[i]):
                if k == 1:
                    print(i, j, file=f)

    with open('data/{}/graph.edgelist'.format(dataset_name), 'w') as f:
        for i, j in G.edges():
            print(i, j, file=f)

    sp.save_npz('data/{}/features.npz'.format(dataset_name), features.tocsr())

if __name__ == '__main__':
    #analysis_result('result/BlogCatalog/cf/')
    #check_label('data/citeseer/sampled/s1/')
    for i in range(5):
        generate_mask('data/pubmed/sampled/s{}'.format(i))
    #convert_gcn_data('pubmed', 'embedding_test/src/baseline/gcn/gcn/data/', 'data/')
--------------------------------------------------------------------------------