├── .gitattributes ├── .gitignore ├── README.md ├── data └── contact-high-school │ ├── README.txt │ ├── contact-high-school-nverts.txt │ ├── contact-high-school-simplices.txt │ └── contact-high-school-times.txt ├── readme.md.rtf └── src ├── HyperCI.py ├── Model.py ├── check_hypergraph.py ├── data_preprocessing.py ├── data_simulation.py ├── readme └── utils.py /.gitattributes: -------------------------------------------------------------------------------- 1 | .mat filter=lfs diff=lfs merge=lfs -text 2 | *.mat filter=lfs diff=lfs merge=lfs -text 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jma712/HyperSCI/ba33ef07b16e2110faabd127bb510aed63e29ea1/.gitignore -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # HyperSCI-KDD22: Learning Causal Effects on Hypergraphs 2 | 3 | Code for the KDD 2022 paper [*Learning Causal Effects on Hypergraphs*.](https://arxiv.org/pdf/2207.04049.pdf) 4 | 5 | ## Environment 6 | ``` 7 | Python 3.6 8 | Pytorch 1.2.0 9 | Scipy 1.3.1 10 | Numpy 1.17.2 11 | ``` 12 | 13 | ## Dataset 14 | Demo datasets with simulation can be found in [link](https://drive.google.com/drive/folders/1Jey0eanNrv7YkzUGpnv64sfeY4iGZsEv?usp=sharing). 15 | 16 | ## Run Experiment 17 | ### HyperSCI 18 | ``` 19 | python HyperSCI.py --dataset 'contact' --path '../data/contact.mat' 20 | ``` 21 | With the demo ```contact.mat``` dataset and default parameter settings, the mean results ($\sqrt{\epsilon_{PEHE}}$ and $\epsilon_{ATE}$) of three runs for our method should be $12.16/9.55$. 22 | 23 | ``` 24 | python HyperSCI.py --dataset 'GoodReads' --path '../data/GoodReads.mat' 25 | ``` 26 | With the demo ```GoodReads.mat``` dataset and default parameter settings, the mean results ($\sqrt{\epsilon_{PEHE}}$ and $\epsilon_{ATE}$) of three runs for our method should be $33.30/4.73$. 27 | 28 | The data preprocessing from raw data and simulation is in: 29 | ### Data Preprocessing 30 | ``` 31 | python data_preprocessing.py 32 | ``` 33 | ### Data Simulation 34 | ``` 35 | python data_simulation.py 36 | ``` 37 | 38 | ### References 39 | Jing Ma, Mengting Wan, Longqi Yang, Jundong Li, Brent Hecht, Jaime Teevan, “Learning Causal Effects on Hypergraphs”, ACM SIGKDD Conference on Knowledge Discovery and Data Mining (KDD), 2022. 40 | 41 | -------------------------------------------------------------------------------- /data/contact-high-school/README.txt: -------------------------------------------------------------------------------- 1 | This dataset is constructed from a contact network amongst high school students 2 | in Marseilles, France, in December 2013. The contact network was downloaded from 3 | http://www.sociopatterns.org/datasets/high-school-contact-and-friendship-networks/ 4 | 5 | We form simplices through cliques of simultaneous contacts. Specifically, for 6 | every unique timestamp in the dataset, we construct a simplex for every maximal 7 | clique amongst the contact edges that exist for that timestamp. Timestamps were 8 | recorded in 20 second intervals. 
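A minimal sketch of how the three files fit together, assuming the layout described above (the helper name and default path are illustrative, not part of the repository; the repository's own pipeline is preprocess_contact() in src/data_preprocessing.py, which additionally removes duplicate simplices):

```python
import numpy as np

def load_contact_hyperedges(root='../data/contact-high-school/'):
    # nverts: one integer per simplex, giving its size
    sizes = np.loadtxt(root + 'contact-high-school-nverts.txt', dtype=int)
    # simplices: the member node ids of all simplices, concatenated in order
    nodes = np.loadtxt(root + 'contact-high-school-simplices.txt', dtype=int)
    # repeat each hyperedge id once per member node
    edge_ids = np.repeat(np.arange(len(sizes)), sizes)
    # 2 x (total memberships) incidence index; node ids shifted to start at 0
    return np.stack([nodes - 1, edge_ids])
```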
9 |
--------------------------------------------------------------------------------
/readme.md.rtf:
--------------------------------------------------------------------------------
1 | {\rtf1\ansi\ansicpg1252\cocoartf2638
2 | \cocoatextscaling0\cocoaplatform0{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
3 | {\colortbl;\red255\green255\blue255;}
4 | {\*\expandedcolortbl;;}
5 | \margl1440\margr1440\vieww24700\viewh13720\viewkind0
6 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0
7 |
8 | \f0\fs24 \cf0 # HyperSCI-KDD22: Learning Causal Effects on Hypergraphs\
9 | \
10 | Code for the KDD 2022 paper [*Learning Causal Effects on Hypergraphs*.](https://arxiv.org/pdf/2207.04049.pdf)\
11 | \
12 | ## Environment\
13 | ```\
14 | Python 3.6\
15 | Pytorch 1.2.0\
16 | Scipy 1.3.1\
17 | Numpy 1.17.2\
18 | ```\
19 | \
20 | ## Dataset\
21 | Datasets can be found in ```./data```\
22 | \
23 | ## Run Experiment\
24 | ### Data Preprocessing\
25 | ```\
26 | python data_preprocessing.py\
27 | ```\
28 | ### Data Simulation\
29 | ```\
30 | python data_simulation.py\
31 | ```\
32 | ### HyperSCI\
33 | ```\
34 | python HyperSCI.py --path '../../data/Simulation/GR/GoodReads.mat'\
35 | ```\
36 | \
37 | ### References\
38 | Jing Ma, Mengting Wan, Longqi Yang, Jundong Li, Brent Hecht, Jaime Teevan, \'93Learning Causal Effects on Hypergraphs\'94, ACM SIGKDD Conference on Knowledge Discovery and Data Mining (KDD), 2022. \
39 | \
40 | }
--------------------------------------------------------------------------------
/src/HyperCI.py:
--------------------------------------------------------------------------------
1 | '''
2 | main for HyperCI
3 | '''
4 |
5 | import time
6 | import argparse
7 | import numpy as np
8 | import random
9 | import math
10 |
11 | import torch
12 | import torch.nn as nn
13 | import torch.nn.functional as F
14 | import torch.optim as optim
15 |
16 | from Model import HyperSCI, GraphSCI
17 | import utils
18 | import scipy.io as sio
19 | from sklearn.linear_model import LinearRegression, Ridge
20 | import data_preprocessing as dpp
21 | import data_simulation as dsim
22 |
23 | from scipy import sparse as sp
24 | import scipy.io as sio
25 | import csv
26 | import torch_geometric.nn as gnn
27 | import pickle
28 | import json
29 | import matplotlib.pyplot as plt
30 | from matplotlib import rc
31 | rc('mathtext', default='regular')
32 | import matplotlib
33 |
34 | font_sz = 28
35 | matplotlib.rcParams['font.family'] = 'sans-serif'
36 | matplotlib.rcParams['font.sans-serif'] = 'NSimSun,Times New Roman'
37 | matplotlib.rcParams.update({'font.size': font_sz})
38 |
39 | # Parameters
40 | parser = argparse.ArgumentParser()
41 | parser.add_argument('--nocuda', type=int, default=0, help='Disables CUDA training.')
42 | parser.add_argument('--dataset', type=str, default='contact') # contact, GoodReads Microsoft
43 |
44 | parser.add_argument('--weight_decay', type=float, default=1e-2, help='Weight decay (L2 loss on parameters).')
45 | parser.add_argument('--seed', type=int, default=42, help='Random seed.')
46 | parser.add_argument('--epochs', type=int, default=1601, help='Number of epochs to train.')
47 | parser.add_argument('--lr', type=float, default=1e-4, help='Initial learning rate.')
48 | parser.add_argument('--h_dim', type=int, default=25, help='dim of hidden units.')
49 | parser.add_argument('--g_dim', type=int, default=32, help='dim of interference representation.')
50 | parser.add_argument('--clip', type=float, default=1.,
help='gradient clipping') 51 | parser.add_argument('--activate', type=int, default=0) 52 | parser.add_argument('--normy', type=int, default=0) 53 | parser.add_argument('--num_gnn_layer', type=int, default=1, help='layer of gnn') 54 | parser.add_argument('--n_out', type=int, default=0, help='layer of prediction') 55 | parser.add_argument('--dropout', type=float, default=0.5) 56 | parser.add_argument('--phi_layer', type=int, default=1, help='layer of phi(x)') 57 | parser.add_argument('--skip', type=str, default='23', choices=['123', '23']) 58 | parser.add_argument('--graph_model', type=str, default='hypergraph', choices=['hypergraph', 'graph']) # hypergraph: our model; graph: gcn based baseline 59 | parser.add_argument('--graph_type', type=str, default='hypergraph', choices=['hypergraph', 'projected']) # use hypergraph or projected graph 60 | parser.add_argument('--index_type', type=str, default='hyper_index', choices=['hyper_index', 'graph_index']) # graph_index for baseline 61 | parser.add_argument('--path', type=str, default='../data/contact.mat') 62 | parser.add_argument('--encoder', type=str, default='gcn', choices=['gcn', 'gat']) 63 | parser.add_argument('--exp_name', type=str, default='ITE', choices=['ITE', 'LR', 'case', 'hypersize']) 64 | parser.add_argument('--LR_name', type=str, default='S', choices=['S', 'T', 'T_agg']) # linear regression: S-Learner, T-learner 65 | parser.add_argument('--max_hyperedge_size', type=int, default=50, 66 | help='only keep hyperedges with size no more than this value (only valid in hypersize experiment)') 67 | parser.add_argument('--wass', type=float, default=1e-2) 68 | 69 | args = parser.parse_args() 70 | args.cuda = not args.nocuda and torch.cuda.is_available() 71 | device = torch.device("cuda:0" if args.cuda else "cpu") 72 | 73 | print('using device: ', device) 74 | 75 | # seed 76 | np.random.seed(args.seed) 77 | torch.manual_seed(args.seed) 78 | 79 | def compute_loss(Y_true, treatments, results, idx_trn, idx_select): 80 | # binary 81 | y1_true = Y_true[1] 82 | y0_true = Y_true[0] 83 | rep = results['rep'] 84 | y1_pred = results['y1_pred'] 85 | y0_pred = results['y0_pred'] 86 | yf_pred = torch.where(treatments > 0, y1_pred, y0_pred) 87 | 88 | # balancing 89 | num_balance_max = 2000 # max num of instances used for balancing 90 | idx_balance = idx_select if len(idx_select) < num_balance_max else idx_select[: num_balance_max] 91 | rep_t1, rep_t0 = rep[idx_balance][(treatments[idx_balance] > 0).nonzero()], rep[idx_balance][(treatments[idx_balance] < 1).nonzero()] 92 | 93 | # wass1 distance 94 | dist, _ = utils.wasserstein(rep_t1, rep_t0, device, cuda=True) 95 | 96 | # potential outcome prediction 97 | YF = torch.where(treatments > 0, y1_true, y0_true) 98 | 99 | # norm y 100 | if args.normy: 101 | ym, ys = torch.mean(YF[idx_trn]), torch.std(YF[idx_trn]) 102 | YF_select = (YF[idx_select] - ym) / ys 103 | else: 104 | YF_select = YF[idx_select] 105 | 106 | # loss: (Y-Y_hat)^2 + alpha * w-dist 107 | loss_mse = torch.nn.MSELoss() 108 | loss_y = loss_mse(yf_pred[idx_select], YF_select) 109 | 110 | loss = loss_y + args.wass * dist 111 | 112 | loss_result = { 113 | 'loss': loss, 'loss_y': loss_y, 'loss_b': dist 114 | } 115 | 116 | return loss_result 117 | 118 | def evaluate(Y_true, treatments, results, idx_trn, idx_select, keep_orin_ite=False): 119 | y1_true, y0_true = Y_true[1], Y_true[0] 120 | 121 | y1_pred = results['y1_pred'] 122 | y0_pred = results['y0_pred'] 123 | 124 | # potential outcome prediction 125 | YF = torch.where(treatments > 0, y1_true, y0_true) 
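    # The metrics computed below match the quantities reported in the README:
    #   pehe = sqrt( mean_i (ITE_pred_i - ITE_true_i)^2 )  over idx_select, i.e. sqrt(epsilon_PEHE)
    #   ate  = | mean_i (ITE_pred_i - ITE_true_i) | = |ATE_pred - ATE_true|, i.e. epsilon_ATE
    # RMSE_Y1 / RMSE_Y0 are the RMSEs of the two predicted potential outcomes on idx_select.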
126 | 127 | # norm y 128 | if args.normy: 129 | ym, ys = torch.mean(YF[idx_trn]), torch.std(YF[idx_trn]) 130 | y1_pred, y0_pred = y1_pred * ys + ym, y0_pred * ys + ym 131 | 132 | ITE_pred = y1_pred - y0_pred 133 | ITE_true = y1_true - y0_true 134 | 135 | # metrics 136 | n_select = len(idx_select) 137 | ate = (torch.abs((ITE_pred[idx_select] - ITE_true[idx_select]).mean())).item() 138 | pehe = math.sqrt(((ITE_pred[idx_select] - ITE_true[idx_select]) * (ITE_pred[idx_select] - ITE_true[idx_select])).sum().data / n_select) 139 | 140 | RMSE_Y1 = torch.sqrt(torch.mean(torch.pow(y1_true[idx_select] - y1_pred[idx_select], 2))).item() 141 | RMSE_Y0 = torch.sqrt(torch.mean(torch.pow(y0_true[idx_select] - y0_pred[idx_select], 2))).item() 142 | 143 | eval_results = {'pehe': pehe, 'ate': ate, 'RMSE_Y1': RMSE_Y1, 'RMSE_Y0': RMSE_Y0} 144 | if keep_orin_ite: 145 | eval_results['ITE_pred'] = ITE_pred 146 | 147 | return eval_results 148 | 149 | def report_info(epoch, time_begin, loss_results_train, eval_results_val, eval_results_tst): 150 | loss_train = loss_results_train['loss'] 151 | loss_y = loss_results_train['loss_y'] 152 | loss_b = loss_results_train['loss_b'] 153 | pehe_val, ate_val = eval_results_val['pehe'], eval_results_val['ate'] 154 | pehe_tst, ate_tst, RMSE_Y1_tst, RMSE_Y0_tst = eval_results_tst['pehe'], eval_results_tst['ate'], eval_results_tst['RMSE_Y1'], eval_results_tst['RMSE_Y0'] 155 | 156 | print('Epoch: {:04d}'.format(epoch + 1), 157 | 'loss_train: {:.4f}'.format(loss_train.item()), 158 | 'pehe_tst: {:.4f}'.format(pehe_tst), 159 | 'ate_tst: {:.4f} '.format(ate_tst), 160 | 'time: {:.4f}s'.format(time.time() - time_begin) 161 | ) 162 | 163 | def train(epochs, model, optimizer, features, treatments, hyperedge_index, Y_true, idx_trn, idx_val, idx_tst): 164 | time_begin = time.time() 165 | print("start training!") 166 | 167 | for k in range(epochs): # epoch 168 | model.train() 169 | optimizer.zero_grad() 170 | 171 | # forward 172 | results = model(features, treatments, hyperedge_index) 173 | 174 | # loss 175 | loss_results_train = compute_loss(Y_true, treatments, results, idx_trn, idx_trn) 176 | loss_train = loss_results_train['loss'] 177 | 178 | loss_train.backward() 179 | optimizer.step() 180 | 181 | nn.utils.clip_grad_norm(model.parameters(), args.clip) 182 | 183 | if k % 100 == 0: 184 | # evaluate 185 | model.eval() 186 | results = model(features, treatments, hyperedge_index) 187 | eval_results_val = evaluate(Y_true, treatments, results, idx_trn, idx_val) 188 | eval_results_tst = evaluate(Y_true, treatments, results, idx_trn, idx_tst) 189 | 190 | report_info(k, time_begin, loss_results_train, eval_results_val, eval_results_tst) 191 | return 192 | 193 | def test(model, features, treatments, hyperedge_index, Y_true, idx_trn, idx_select, keep_orin_ite=False): 194 | model.eval() 195 | 196 | results = model(features, treatments, hyperedge_index) 197 | eval_results = evaluate(Y_true, treatments, results, idx_trn, idx_select, keep_orin_ite) 198 | 199 | pehe = eval_results['pehe'] 200 | ate = eval_results['ate'] 201 | RMSE_Y1_tst, RMSE_Y0_tst = eval_results['RMSE_Y1'], eval_results['RMSE_Y0'] 202 | 203 | print('test results: ', 204 | 'RMSE_Y1_tst: {:.4f}'.format(RMSE_Y1_tst), 205 | 'RMSE_Y0_tst: {:.4f} '.format(RMSE_Y0_tst), 206 | 'pehe_tst: {:.4f}'.format(pehe), 207 | 'ate_tst: {:.4f} '.format(ate)) 208 | 209 | return eval_results 210 | 211 | def load_data(dataset, path, num_exp=10, graph_type='hypergraph', index_type='hyper_index', hyper_form_type='processed'): 212 | trn_rate = 0.6 213 | 
tst_rate = 0.2 214 | 215 | data = sio.loadmat(path) 216 | features, treatments, outcomes, Y_true, hyperedge_index = data['features'], data['treatments'][0], data['outcomes'][0], data['Y_true'], data['hyperedge_index'] 217 | 218 | standarlize = True 219 | if standarlize: 220 | from sklearn import preprocessing 221 | scaler = preprocessing.StandardScaler().fit(features) 222 | features = scaler.transform(features) 223 | 224 | print('loaded data from ', path) 225 | # print(dpp.hypergraph_stats(hyperedge_index, features.shape[0])) 226 | 227 | show_hyperedge_size = False 228 | if show_hyperedge_size: 229 | unique, frequency = np.unique(hyperedge_index[1], return_counts=True) 230 | print('hyperedge size: ', np.sort(frequency)[::-1][:100]) # top 100 hyperedge size 231 | dpp.draw_freq(frequency) 232 | 233 | if hyper_form_type == 'processed' and graph_type == 'projected' and args.exp_name != 'hypersize': 234 | hyperedge_index = utils.project_hypergraph(features.shape[0], hyperedge_index, type=index_type) 235 | 236 | idx_trn_list, idx_val_list, idx_tst_list = [], [], [] 237 | idx_treated = np.where(treatments == 1)[0] 238 | idx_control = np.where(treatments == 0)[0] 239 | for i in range(num_exp): 240 | idx_treated_cur = idx_treated.copy() 241 | idx_control_cur = idx_control.copy() 242 | np.random.shuffle(idx_treated_cur) 243 | np.random.shuffle(idx_control_cur) 244 | 245 | idx_treated_trn = idx_treated_cur[: int(len(idx_treated) * trn_rate)] 246 | idx_control_trn = idx_control_cur[: int(len(idx_control) * trn_rate)] 247 | idx_trn_cur = np.concatenate([idx_treated_trn, idx_control_trn]) 248 | idx_trn_cur = np.sort(idx_trn_cur) 249 | idx_trn_list.append(idx_trn_cur) 250 | 251 | idx_treated_tst = idx_treated_cur[int(len(idx_treated) * trn_rate): int(len(idx_treated) * trn_rate) + int(len(idx_treated) * tst_rate)] 252 | idx_control_tst = idx_control_cur[int(len(idx_control) * trn_rate): int(len(idx_control) * trn_rate) + int(len(idx_control) * tst_rate)] 253 | idx_tst_cur = np.concatenate([idx_treated_tst, idx_control_tst]) 254 | idx_tst_cur = np.sort(idx_tst_cur) 255 | idx_tst_list.append(idx_tst_cur) 256 | idx_treated_val = idx_treated_cur[int(len(idx_treated) * trn_rate) + int(len(idx_treated) * tst_rate):] 257 | idx_control_val = idx_control_cur[int(len(idx_control) * trn_rate) + int(len(idx_control) * tst_rate):] 258 | idx_val_cur = np.concatenate([idx_treated_val, idx_control_val]) 259 | idx_val_cur = np.sort(idx_val_cur) 260 | idx_val_list.append(idx_val_cur) 261 | 262 | # tensor 263 | features = torch.FloatTensor(features) 264 | treatments = torch.FloatTensor(treatments) 265 | Y_true = torch.FloatTensor(Y_true) 266 | outcomes = torch.FloatTensor(outcomes) 267 | 268 | if hyper_form_type == 'processed' and graph_type == 'projected' and index_type == 'graph_index': 269 | hyperedge_index = hyperedge_index.nonzero() # sparse adjacency matrix -> edge index 270 | if hyper_form_type == 'processed': 271 | hyperedge_index = torch.LongTensor(hyperedge_index) 272 | idx_trn_list = [torch.LongTensor(id) for id in idx_trn_list] 273 | idx_val_list = [torch.LongTensor(id) for id in idx_val_list] 274 | idx_tst_list = [torch.LongTensor(id) for id in idx_tst_list] 275 | 276 | return features, treatments, outcomes, Y_true, hyperedge_index, idx_trn_list, idx_val_list, idx_tst_list 277 | 278 | 279 | def baseline_LR(features, treatment, outcome, Y_true, idx_trn, idx_val, idx_tst, hyperedge_index=None): 280 | # t-leaner 281 | if args.LR_name == 'T_agg': 282 | import data_simulation as sim 283 | features_agg = 
sim.agg_features(features, hyperedge_index, treatment, alpha=1.0) 284 | features = np.concatenate([features, features_agg], axis=1) 285 | 286 | if args.LR_name == 'T' or args.LR_name == 'T_agg': 287 | model_1 = LinearRegression() 288 | model_0 = LinearRegression() 289 | idx_treated_trn = np.where(treatment[idx_trn] == 1) 290 | idx_control_trn = np.where(treatment[idx_trn] == 0) 291 | 292 | model_1.fit(features[idx_trn[idx_treated_trn]], outcome[idx_trn[idx_treated_trn]]) 293 | model_0.fit(features[idx_trn[idx_control_trn]], outcome[idx_trn[idx_control_trn]]) 294 | 295 | y_pred1_tst = model_1.predict(features[idx_tst]) 296 | y_pred0_tst = model_0.predict(features[idx_tst]) 297 | 298 | # s-learner 299 | elif args.LR_name == 'S': 300 | model_t = LinearRegression() 301 | features_t = np.concatenate([features, treatment.reshape(-1, 1)], axis=1) 302 | model_t.fit(features_t[idx_trn], outcome[idx_trn]) 303 | 304 | y_pred1_tst = model_t.predict(np.concatenate([features[idx_tst], np.ones((len(idx_tst), 1))], axis=1)) 305 | y_pred0_tst = model_t.predict(np.concatenate([features[idx_tst], np.zeros((len(idx_tst), 1))], axis=1)) 306 | 307 | y1_true_tst = Y_true[1][idx_tst] 308 | y0_true_tst = Y_true[0][idx_tst] 309 | 310 | # test 311 | ITE_pred_tst = y_pred1_tst - y_pred0_tst 312 | ITE_true_tst = y1_true_tst - y0_true_tst 313 | 314 | n_select = len(idx_tst) 315 | ate = np.abs((ITE_pred_tst - ITE_true_tst).mean()) 316 | pehe = math.sqrt(((ITE_pred_tst - ITE_true_tst) * ( 317 | ITE_pred_tst - ITE_true_tst)).sum() / n_select) 318 | RMSE_Y1 = math.sqrt(np.mean(np.power(y_pred1_tst - y1_true_tst, 2))) 319 | RMSE_Y0 = math.sqrt(np.mean(np.power(y_pred0_tst - y0_true_tst, 2))) 320 | 321 | eval_results = {'pehe': pehe, 'ate': ate, 'RMSE_Y1': RMSE_Y1, 'RMSE_Y0': RMSE_Y0} 322 | 323 | return eval_results 324 | 325 | def data_statistics(features, treatments, outcomes, Y_true): 326 | y_obs = torch.where(treatments > 0, Y_true[1], Y_true[0]) 327 | print('ITE ', torch.mean(Y_true[1]-Y_true[0]), torch.std(Y_true[1]-Y_true[0])) 328 | print('y_obs ', torch.mean(y_obs), torch.std(y_obs)) 329 | print('outcomes ',torch.mean(outcomes), torch.std(outcomes)) 330 | return 331 | 332 | def experiment_LR(features, treatment, outcome, Y_true, hyperedge_index, idx_trn_list, idx_val_list, idx_tst_list, exp_num=3): 333 | t_begin = time.time() 334 | results_all = {'pehe': [], 'ate': []} 335 | 336 | for i_exp in range(0, exp_num): # 10 runs of experiments 337 | print("============== Experiment ", str(i_exp), " =========================") 338 | idx_trn = idx_trn_list[i_exp] 339 | idx_val = idx_val_list[i_exp] 340 | idx_tst = idx_tst_list[i_exp] 341 | 342 | eval_results_tst = baseline_LR(features.numpy(), treatment.numpy(), outcome.numpy(), Y_true.numpy(), idx_trn.numpy(), idx_val.numpy(), idx_tst.numpy(), hyperedge_index=hyperedge_index.numpy()) 343 | 344 | results_all['pehe'].append(eval_results_tst['pehe']) 345 | results_all['ate'].append(eval_results_tst['ate']) 346 | 347 | results_all['average_pehe'] = np.mean(np.array(results_all['pehe'], dtype=np.float)) 348 | results_all['std_pehe'] = np.std(np.array(results_all['pehe'], dtype=np.float)) 349 | results_all['average_ate'] = np.mean(np.array(results_all['ate'], dtype=np.float)) 350 | results_all['std_ate'] = np.std(np.array(results_all['ate'], dtype=np.float)) 351 | 352 | 353 | print("============== Overall experiment results =========================") 354 | for k in results_all: 355 | if isinstance(results_all[k], list): 356 | print(k, ": ", results_all[k]) 357 | else: 358 | 
print(k, f": {results_all[k]:.4f}") 359 | print("Total time elapsed: {:.4f}s".format(time.time() - t_begin)) 360 | 361 | return 362 | 363 | 364 | def experiment_ite(args, features, treatments, outcomes, Y_true, hyperedge_index, idx_trn_list, idx_val_list, idx_tst_list, exp_num=3): 365 | t_begin = time.time() 366 | 367 | results_all = {'pehe': [], 'ate': []} 368 | 369 | for i_exp in range(0, exp_num): # runs of experiments 370 | print("============== Experiment ", str(i_exp), " =========================") 371 | idx_trn = idx_trn_list[i_exp] 372 | idx_val = idx_val_list[i_exp] 373 | idx_tst = idx_tst_list[i_exp] 374 | 375 | # set model 376 | if args.graph_model == 'hypergraph': 377 | model = HyperSCI(args, x_dim=features.shape[1]) 378 | elif args.graph_model == 'graph': 379 | model = GraphSCI(args, x_dim=features.shape[1]) 380 | optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay) 381 | 382 | # cuda 383 | if args.cuda: 384 | model = model.to(device) 385 | features = features.to(device) 386 | treatments = treatments.to(device) 387 | outcomes = outcomes.to(device) 388 | Y_true = Y_true.to(device) 389 | hyperedge_index = hyperedge_index.to(device) 390 | # if hyperedge_attr is not None: 391 | # hyperedge_attr = hyperedge_attr.to(device) 392 | idx_trn_list = [id.to(device) for id in idx_trn_list] 393 | idx_val_list = [id.to(device) for id in idx_val_list] 394 | idx_tst_list = [id.to(device) for id in idx_tst_list] 395 | 396 | # training 397 | train(args.epochs, model, optimizer, features, treatments, hyperedge_index, Y_true, idx_trn, idx_val, idx_tst) 398 | eval_results_tst = test(model, features, treatments, hyperedge_index, Y_true, idx_trn, idx_tst) 399 | 400 | results_all['pehe'].append(eval_results_tst['pehe']) 401 | results_all['ate'].append(eval_results_tst['ate']) 402 | # break # if you just need one run 403 | 404 | results_all['average_pehe'] = np.mean(np.array(results_all['pehe'], dtype=np.float)) 405 | results_all['std_pehe'] = np.std(np.array(results_all['pehe'], dtype=np.float)) 406 | results_all['average_ate'] = np.mean(np.array(results_all['ate'], dtype=np.float)) 407 | results_all['std_ate'] = np.std(np.array(results_all['ate'], dtype=np.float)) 408 | 409 | print("============== Overall experiment results =========================") 410 | for k in results_all: 411 | if isinstance(results_all[k], list): 412 | print(k, ": ", results_all[k]) 413 | else: 414 | print(k, f": {results_all[k]:.4f}") 415 | print("Total time elapsed: {:.4f}s".format(time.time() - t_begin)) 416 | 417 | return 418 | 419 | # only keep hyperedges with size < max_hyperedge_size 420 | def modify_hypergraph(hyperedge_index, max_hyperedge_size): 421 | idx_delete = [] 422 | j = 0 423 | while j < len(hyperedge_index[1]): 424 | u = j 425 | while u < len(hyperedge_index[1]) and hyperedge_index[1][u] == hyperedge_index[1][j]: 426 | u += 1 427 | edge_size = u - j 428 | if edge_size > max_hyperedge_size: 429 | idx_delete += [i for i in range(j, j+edge_size)] 430 | j += edge_size 431 | 432 | # delete 433 | idx_select = list(set(range(len(hyperedge_index[1]))) - set(idx_delete)) 434 | hyperedge_index = hyperedge_index[:, idx_select] 435 | 436 | # update edge index 437 | j = 0 438 | last = -1 439 | while j < len(hyperedge_index[1]): 440 | while j < len(hyperedge_index[1]) and hyperedge_index[1][j] == last + 1: 441 | j += 1 442 | if j != len(hyperedge_index[1]): # not the end 443 | start = j 444 | new = hyperedge_index[1][j] 445 | while j < len(hyperedge_index[1]) and 
hyperedge_index[1][j] == new: 446 | j += 1 447 | hyperedge_index[1][start: j] = last + 1 448 | last += 1 449 | return hyperedge_index 450 | 451 | def experiment_hypersize(args, features, treatments, outcomes, Y_true, hyperedge_index, idx_trn_list, idx_val_list, idx_tst_list, max_hyperedge_size=2, exp_num=3): 452 | print('running experiment on hypergraph which removes higheredge with size more than ', max_hyperedge_size) 453 | t_begin = time.time() 454 | results_all = {'pehe': [], 'ate': []} 455 | 456 | # only keep hyperedges with size no more than k 457 | hyperedge_index = modify_hypergraph(hyperedge_index, max_hyperedge_size) 458 | 459 | if args.graph_model == 'graph': 460 | hyperedge_index = utils.project_hypergraph(features.shape[0], hyperedge_index, type=args.index_type) # hypergraph->graph 461 | hyperedge_index = hyperedge_index.nonzero() # sparse adjacency matrix -> edge index 462 | hyperedge_index = torch.LongTensor(hyperedge_index) 463 | 464 | for i_exp in range(0, exp_num): # 10 runs of experiments 465 | print("============== Experiment ", str(i_exp), " =========================") 466 | idx_trn = idx_trn_list[i_exp] 467 | idx_val = idx_val_list[i_exp] 468 | idx_tst = idx_tst_list[i_exp] 469 | 470 | # set model 471 | if args.graph_model == 'hypergraph': 472 | model = HyperSCI(args, x_dim=features.shape[1]) 473 | elif args.graph_model == 'graph': 474 | model = GraphSCI(args, x_dim=features.shape[1]) 475 | optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay) 476 | 477 | # cuda 478 | if args.cuda: 479 | model = model.to(device) 480 | features = features.to(device) 481 | treatments = treatments.to(device) 482 | outcomes = outcomes.to(device) 483 | Y_true = Y_true.to(device) 484 | hyperedge_index = hyperedge_index.to(device) 485 | # if hyperedge_attr is not None: 486 | # hyperedge_attr = hyperedge_attr.to(device) 487 | idx_trn_list = [id.to(device) for id in idx_trn_list] 488 | idx_val_list = [id.to(device) for id in idx_val_list] 489 | idx_tst_list = [id.to(device) for id in idx_tst_list] 490 | 491 | # training 492 | train(args.epochs, model, optimizer, features, treatments, hyperedge_index, Y_true, idx_trn, idx_val, idx_tst) 493 | eval_results_tst = test(model, features, treatments, hyperedge_index, Y_true, idx_trn, idx_tst) 494 | 495 | results_all['pehe'].append(eval_results_tst['pehe']) 496 | results_all['ate'].append(eval_results_tst['ate']) 497 | 498 | results_all['average_pehe'] = np.mean(np.array(results_all['pehe'], dtype=np.float)) 499 | results_all['std_pehe'] = np.std(np.array(results_all['pehe'], dtype=np.float)) 500 | results_all['average_ate'] = np.mean(np.array(results_all['ate'], dtype=np.float)) 501 | results_all['std_ate'] = np.std(np.array(results_all['ate'], dtype=np.float)) 502 | 503 | print("============== Overall experiment results =========================") 504 | for k in results_all: 505 | if isinstance(results_all[k], list): 506 | print(k, ": ", results_all[k]) 507 | else: 508 | print(k, f": {results_all[k]:.4f}") 509 | print("Total time elapsed: {:.4f}s".format(time.time() - t_begin)) 510 | 511 | return 512 | 513 | def compare_ite_diff(args, features, treatments, outcomes, Y_true, hyperedge_index, idx_trn_list, idx_val_list, idx_tst_list, exp_num=3, type='none', postfix = ''): 514 | t_begin = time.time() 515 | 516 | results_all = {'ITE_diff': []} 517 | 518 | assert type == 'none' or type == 'projected' 519 | if type == 'none': 520 | hyperedge_index_weak = torch.LongTensor([range(len(features)), range(len(features))]) # 
no/weak structure information 521 | elif type == 'projected': 522 | hyperedge_index_weak = utils.project_hypergraph(features.shape[0], hyperedge_index, type='hyper_index') # projected 523 | hyperedge_index_weak = torch.LongTensor(hyperedge_index_weak) 524 | 525 | idx_all = torch.LongTensor(range(len(features))) 526 | if args.cuda: 527 | idx_all = idx_all.to(device) 528 | hyperedge_index_weak = hyperedge_index_weak.to(device) 529 | 530 | for i_exp in range(0, exp_num): # 10 runs of experiments 531 | print("============== Experiment ", str(i_exp), " =========================") 532 | idx_trn = idx_trn_list[i_exp] 533 | idx_val = idx_val_list[i_exp] 534 | idx_tst = idx_tst_list[i_exp] 535 | 536 | # set model 537 | if args.graph_model == 'hypergraph': 538 | model = HyperSCI(args, x_dim=features.shape[1]) 539 | elif args.graph_model == 'graph': 540 | model = GraphSCI(args, x_dim=features.shape[1]) 541 | optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay) 542 | 543 | # cuda 544 | if args.cuda: 545 | model = model.to(device) 546 | features = features.to(device) 547 | treatments = treatments.to(device) 548 | outcomes = outcomes.to(device) 549 | Y_true = Y_true.to(device) 550 | hyperedge_index = hyperedge_index.to(device) 551 | # if hyperedge_attr is not None: 552 | # hyperedge_attr = hyperedge_attr.to(device) 553 | idx_trn_list = [id.to(device) for id in idx_trn_list] 554 | idx_val_list = [id.to(device) for id in idx_val_list] 555 | idx_tst_list = [id.to(device) for id in idx_tst_list] 556 | 557 | # training 558 | train(args.epochs, model, optimizer, features, treatments, hyperedge_index, Y_true, idx_trn, idx_val, idx_tst) 559 | eval_results_all = test(model, features, treatments, hyperedge_index, Y_true, idx_trn, idx_all, 560 | keep_orin_ite=True) 561 | eval_results_all_weak = test(model, features, treatments, hyperedge_index_weak, Y_true, idx_trn, idx_all, 562 | keep_orin_ite=True) 563 | 564 | results_all['ITE_diff'].append( 565 | (eval_results_all['ITE_pred'] - eval_results_all_weak['ITE_pred']).view(1, -1)) # 1 x n 566 | 567 | # break # !!!!!!!!!!!!! 
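    # Each tensor appended above is 1 x n: the ITE predicted with the full hypergraph minus the ITE
    # predicted with the weakened structure (singleton hyperedges for type 'none', the projected graph
    # for type 'projected'). The aggregation below stacks them into an exp_num x n matrix and averages
    # over runs to obtain the per-node mean difference used in the case-study analysis.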
568 | 569 | ite_diff = torch.cat(results_all['ITE_diff'], dim=0) # exp_num x n 570 | results_all['average_ITE_diff'] = torch.mean(ite_diff, dim=0).cpu().detach().numpy().reshape(-1) # n 571 | 572 | print('mean ite diff: ', np.mean(results_all['average_ITE_diff']), ' std: ', 573 | np.std(results_all['average_ITE_diff'])) 574 | print("Total time elapsed: {:.4f}s".format(time.time() - t_begin)) 575 | 576 | dpp.draw_freq(results_all['average_ITE_diff']) 577 | 578 | # save into files 579 | save_flag = True 580 | if save_flag: 581 | filename = '../data/goodreads_ite_diff_'+type+postfix+'.pickle' 582 | 583 | data_save = {'ite_diff': results_all['average_ITE_diff']} 584 | with open(filename, 'wb') as f: 585 | pickle.dump(data_save, f) 586 | print('saved file ', filename) 587 | 588 | return results_all['average_ITE_diff'] 589 | 590 | def query_hyper_statistics(features, treatments, outcomes, Y_true, hyperedge_index, types): 591 | hyperedge_index_np = hyperedge_index.cpu().detach().numpy() 592 | results = {} 593 | if 'treated_ratio' in types: 594 | results['treated_ratio'] = [] 595 | if 'neighbor_num' in types: 596 | results['neighbor_num'] = [] 597 | 598 | for i in range(features.shape[0]): 599 | neighbors_i = dsim.search_neighbor_hypergraph(i, hyperedge_index_np) # not include itself 600 | if 'treated_ratio' in types: 601 | if len(neighbors_i) > 0: 602 | ti = treatments[i] 603 | t_neighbor = treatments[neighbors_i] 604 | equal_num = torch.where(t_neighbor == ti, 1.0, 0.0).sum() 605 | ratio = float(equal_num + 1) / (len(t_neighbor) + 1) # +1, itself 606 | results['treated_ratio'].append(ratio) 607 | else: 608 | results['treated_ratio'].append(1) 609 | 610 | if 'neighbor_num' in types: 611 | results['neighbor_num'].append(len(neighbors_i)) 612 | 613 | return results 614 | 615 | def toDiscreteAxis(values, numOfBins=10, min_value=None, max_value=None): 616 | if min_value is None: 617 | min_value = min(values) 618 | if max_value is None: 619 | max_value = max(values) 620 | axis = [] 621 | for i in range(numOfBins): 622 | axis.append(min_value + i * (max_value-min_value)/numOfBins) 623 | return axis, min_value, max_value 624 | 625 | def experiment_case(args, features, treatments, outcomes, Y_true, hyperedge_index, idx_trn_list, idx_val_list, idx_tst_list, exp_num=3, type_case='None', postfix=''): 626 | # heatmap 627 | with open('../data/goodreads_case_'+type_case+postfix+'.pickle', 'rb') as f: 628 | data_save = pickle.load(f) 629 | average_ITE_diff = np.sqrt(np.square(data_save['ite_diff'])) 630 | treated_ratio = data_save['treated_ratio'] # list of 631 | neighbor_num = data_save['neighbor_num'] 632 | 633 | #dpp.draw_freq(treated_ratio) 634 | #dpp.draw_freq(neighbor_num) 635 | 636 | bin_x = 6 637 | bin_y = 6 638 | 639 | ax_t, min_t, max_t = toDiscreteAxis(treated_ratio, bin_x) 640 | ax_n, min_n, max_n = toDiscreteAxis(neighbor_num, bin_y, max_value=30) 641 | 642 | n = len(average_ITE_diff) 643 | data_num_matrix = np.zeros((bin_x, bin_y)) 644 | data_diff_matrix = np.zeros((bin_x, bin_y)) 645 | idx_matrix = {str(i)+'_'+str(j): [] for i in range(bin_x) for j in range(bin_y)} 646 | 647 | for i in range(n): 648 | if treated_ratio[i] >= max_t: 649 | idx_x = bin_x - 1 650 | else: 651 | idx_x = int((treated_ratio[i] - min_t) / ((max_t - min_t) / bin_x)) 652 | 653 | if neighbor_num[i] >= max_n: 654 | idx_y = bin_y - 1 655 | else: 656 | idx_y = int((neighbor_num[i] - min_n) / ((max_n - min_n) / bin_y)) 657 | 658 | idx_matrix[str(idx_x)+'_'+str(idx_y)].append(i) 659 | data_num_matrix[idx_x][idx_y] += 1 660 | 
data_diff_matrix[idx_x][idx_y] += average_ITE_diff[i] 661 | 662 | norm_diff = data_diff_matrix / (data_num_matrix + 1) 663 | norm_diff_draw = norm_diff.copy() 664 | for i in range(len(norm_diff_draw)): 665 | norm_diff_draw[i] = norm_diff[len(norm_diff_draw) -1 - i] 666 | plt.imshow(norm_diff_draw, cmap='viridis') 667 | xlist = [round(((i+1) * (max_n - min_n) / bin_x)) for i in range(bin_x)] 668 | plt.xticks(np.arange(bin_x), xlist) 669 | ylist_orin = [round(((i+0.5) * (max_t - min_t) / bin_y), 1) for i in range(bin_y)] 670 | ylist = ylist_orin.copy() 671 | ylist = [ylist_orin[len(ylist_orin) - 1 - i] for i in range(len(ylist_orin))] 672 | plt.yticks(np.arange(bin_y), ylist) 673 | plt.xlabel(r"$|\mathcal{N}_{(i)}|$") 674 | plt.ylabel(r"$r(i)$") 675 | 676 | cbar=plt.colorbar() 677 | cbar.ax.locator_params(nbins=5) 678 | plt.savefig("./" + 'case_' + type_case + '_' + postfix + '.pdf', bbox_inches='tight') 679 | plt.show() 680 | 681 | # book_select, meta info 682 | update_book_meta = False 683 | if update_book_meta: 684 | with open('../data/goodreads_select.pickle', 'rb') as f: 685 | data_select = pickle.load(f) 686 | books_select, authors_select = data_select['books_select'], data_select['authors_select'] 687 | 688 | meta_result = dpp.load_goodreads_select_meta('../data/goodreads_books_children.json', books_select, authors_select) 689 | titles = meta_result['title'] 690 | authors = meta_result['authors'] 691 | for i in range(bin_x): 692 | for j in range(bin_y): 693 | idx_ij = idx_matrix[str(i)+'_'+str(j)] 694 | data_ij = [] 695 | for book_idx in idx_ij: 696 | data_ij.append({'id': book_idx, 'asin': books_select[book_idx], 'title': titles[book_idx], 'authors': authors[book_idx], 'treated_ratio': treated_ratio[book_idx], 'neighbor_num': neighbor_num[book_idx], 'ite_diff': str(average_ITE_diff[book_idx])}) 697 | with open('../data/GoodReads_meta_'+str(i)+'_'+str(j)+'.json', 'w') as outfile: 698 | json.dump(data_ij, outfile) 699 | 700 | return 701 | 702 | 703 | if __name__ == '__main__': 704 | exp_num = 3 705 | if args.graph_model == 'graph': 706 | args.graph_type = 'projected' 707 | args.index_type = 'graph_index' 708 | 709 | print('exp_name: ', args.exp_name, ' graph_model: ', args.graph_model, ' encoder:', args.encoder, ' graph_type: ', args.graph_type, ' index_type: ', args.index_type) 710 | if args.exp_name == 'hypersize' and args.graph_model == 'graph': 711 | features, treatments, outcomes, Y_true, hyperedge_index, idx_trn_list, idx_val_list, idx_tst_list = load_data( 712 | args.dataset, args.path, graph_type=args.graph_type, index_type=args.index_type, hyper_form_type='old') 713 | else: 714 | features, treatments, outcomes, Y_true, hyperedge_index, idx_trn_list, idx_val_list, idx_tst_list = load_data(args.dataset, args.path, graph_type=args.graph_type, index_type=args.index_type) # return tensors 715 | 716 | # ========= Experiment 1: compare with baselines ============ 717 | if args.exp_name == 'LR': 718 | experiment_LR(features, treatments, outcomes, Y_true, hyperedge_index, idx_trn_list, idx_val_list, idx_tst_list, exp_num=exp_num) 719 | elif args.exp_name == 'ITE': 720 | experiment_ite(args, features, treatments, outcomes, Y_true, hyperedge_index, idx_trn_list, idx_val_list, idx_tst_list, exp_num=exp_num) 721 | elif args.exp_name == 'hypersize': 722 | # ========== Experiment 2: only keep hyperedges with size no more than k ======== 723 | max_hyperedge_size = args.max_hyperedge_size 724 | experiment_hypersize(args, features, treatments, outcomes, Y_true, hyperedge_index, idx_trn_list, 
idx_val_list, idx_tst_list, max_hyperedge_size=max_hyperedge_size, exp_num=exp_num) 725 | elif args.exp_name == 'case': 726 | # ========= Experimet 3: case study ============== 727 | type_case = 'projected' # 'none', 'projected' 728 | postfix = '' # _realY, '' 729 | experiment_case(args, features, treatments, outcomes, Y_true, hyperedge_index, idx_trn_list, idx_val_list, idx_tst_list, exp_num=3, type_case=type_case, postfix=postfix) 730 | 731 | -------------------------------------------------------------------------------- /src/Model.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import math 3 | import numpy as np 4 | import torch 5 | from torch import nn 6 | from torch.nn import Module, Parameter 7 | import torch.nn.functional as F 8 | from sklearn import metrics 9 | import torch_geometric.nn as gnn 10 | from torch_geometric.nn import GCNConv, GATConv 11 | import utils 12 | 13 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 14 | 15 | class HyperSCI(nn.Module): 16 | def __init__(self, args, x_dim): 17 | super(HyperSCI, self).__init__() 18 | self.h_dim = args.h_dim 19 | self.n_out = args.n_out 20 | self.g_dim = args.g_dim 21 | self.dropout = args.dropout 22 | self.graph_model = args.graph_model 23 | self.encoder = args.encoder 24 | self.skip_type = args.skip 25 | self.num_gnn_layer = args.num_gnn_layer 26 | self.phi_layer = args.phi_layer 27 | self.activate = args.activate 28 | 29 | self.phi_x = nn.Sequential(nn.Linear(x_dim, self.h_dim).to(device)) if self.phi_layer == 1 else \ 30 | nn.Sequential(nn.Linear(x_dim, self.h_dim).to(device), nn.ReLU().to(device), nn.Linear(self.h_dim, self.h_dim).to(device)) 31 | self.phi_x_p = nn.Sequential(nn.Linear(x_dim, self.h_dim).to(device), nn.ReLU().to(device), nn.Linear(self.h_dim, self.h_dim).to(device)) 32 | 33 | if self.encoder == 'gat': 34 | self.hgnn = gnn.HypergraphConv(self.h_dim, self.g_dim, use_attention=True, heads=2, concat=False, dropout=0.5).to(device) 35 | self.hgnn_more = [gnn.HypergraphConv(self.g_dim, self.g_dim, use_attention=True, heads=2, concat=False, dropout=0.5).to(device) for i in range(self.num_gnn_layer-1)] 36 | elif self.encoder == 'gcn': 37 | self.hgnn = gnn.HypergraphConv(self.h_dim, self.g_dim).to(device) 38 | self.hgnn_more = [gnn.HypergraphConv(self.g_dim, self.g_dim).to(device) for i in range(self.num_gnn_layer-1)] 39 | else: 40 | self.hgnn = nn.Sequential(nn.Linear(self.h_dim, self.g_dim).to(device)) # MLP 41 | self.hgnn_more = [nn.Sequential(nn.Linear(self.g_dim, self.g_dim).to(device)) for i in range(self.num_gnn_layer-1)] 42 | 43 | if self.skip_type == '123': 44 | self.y_rep_dim = x_dim + self.h_dim + self.g_dim 45 | elif self.skip_type == '23': # phi_x + g_rep 46 | self.y_rep_dim = self.h_dim + self.g_dim 47 | 48 | # prediction for potential outcome 49 | self.out_t00 = [nn.Linear(self.y_rep_dim, self.y_rep_dim).to(device) for i in range(self.n_out)] 50 | self.out_t10 = [nn.Linear(self.y_rep_dim, self.y_rep_dim).to(device) for i in range(self.n_out)] 51 | self.out_t01 = nn.Linear(self.y_rep_dim, 1).to(device) 52 | self.out_t11 = nn.Linear(self.y_rep_dim, 1).to(device) 53 | 54 | def forward(self, features, treatments, hyperedge_index): 55 | phi_x = self.phi_x(features) 56 | phi_x_t = torch.mul(treatments.view(-1,1), phi_x) 57 | phi_x_p = phi_x # if use phi_x_p, set self.phi_x_p(features) 58 | 59 | hyperedge_attr = None 60 | if self.encoder == 'gat': 61 | hyperedge_attr = utils.get_hyperedge_attr(phi_x_t, hyperedge_index, 
type='mean') 62 | if self.encoder == 'gat' or self.encoder == 'gcn': 63 | rep_hgnn = self.hgnn(x=phi_x_t, hyperedge_index=hyperedge_index, hyperedge_attr=hyperedge_attr) # hypergnn 64 | for i in range(self.num_gnn_layer-1): 65 | if self.activate: 66 | rep_hgnn = F.relu(rep_hgnn) 67 | if self.encoder == 'gat': 68 | hyperedge_attr = utils.get_hyperedge_attr(rep_hgnn, hyperedge_index, type='mean') 69 | rep_hgnn = self.hgnn_more[i](x=rep_hgnn, hyperedge_index=hyperedge_index, hyperedge_attr=hyperedge_attr) 70 | else: # mlp 71 | rep_hgnn = self.hgnn(phi_x_t) 72 | for i in range(self.num_gnn_layer - 1): 73 | if self.activate: 74 | rep_hgnn = F.relu(rep_hgnn) 75 | rep_hgnn = self.hgnn_more[i](rep_hgnn) 76 | if self.activate: 77 | rep_hgnn = F.relu(rep_hgnn) 78 | rep_hgnn = F.dropout(rep_hgnn, self.dropout, training=self.training) 79 | 80 | if self.skip_type == '123': 81 | rep_post_0 = torch.cat([features, torch.zeros_like(phi_x_p), rep_hgnn], dim=1) 82 | rep_post_1 = torch.cat([features, phi_x_p, rep_hgnn], dim=1) 83 | elif self.skip_type == '23': 84 | rep_post_0 = torch.cat([torch.zeros_like(phi_x_p), rep_hgnn], dim=1) 85 | rep_post_1 = torch.cat([phi_x_p, rep_hgnn], dim=1) 86 | 87 | # potential outcome 88 | if self.n_out == 0: 89 | y00 = rep_post_0 90 | y10 = rep_post_1 91 | for i in range(self.n_out): 92 | y00 = F.relu(self.out_t00[i](rep_post_0)) 93 | #y00 = F.dropout(y00, self.dropout, training=self.training) 94 | y10 = F.relu(self.out_t10[i](rep_post_1)) 95 | #y10 = F.dropout(y10, self.dropout, training=self.training) 96 | 97 | y0_pred = self.out_t01(y00).view(-1) 98 | y1_pred = self.out_t11(y10).view(-1) 99 | 100 | results = {'y1_pred': y1_pred, 'y0_pred': y0_pred, 'rep': phi_x} 101 | 102 | return results 103 | 104 | 105 | class GraphSCI(nn.Module): 106 | def __init__(self, args, x_dim): 107 | super(GraphSCI, self).__init__() 108 | 109 | self.h_dim = args.h_dim 110 | self.n_out = args.n_out 111 | self.g_dim = args.g_dim 112 | self.dropout = args.dropout 113 | self.encoder = args.encoder 114 | self.skip_type = args.skip 115 | self.num_gnn_layer = args.num_gnn_layer 116 | self.phi_layer = args.phi_layer 117 | self.activate = args.activate 118 | 119 | self.phi_x = nn.Sequential(nn.Linear(x_dim, self.h_dim).to(device)) if self.phi_layer == 1 else \ 120 | nn.Sequential(nn.Linear(x_dim, self.h_dim).to(device), nn.ReLU().to(device),nn.Linear(self.h_dim, self.h_dim).to(device)) 121 | 122 | self.phi_x_p = nn.Sequential(nn.Linear(x_dim, self.h_dim).to(device), nn.ReLU().to(device), 123 | nn.Linear(self.h_dim, self.h_dim).to(device)) 124 | 125 | if self.encoder == 'gcn': 126 | self.gnn = GCNConv(self.h_dim, self.g_dim).to(device) 127 | self.gnn_more = [GCNConv(self.g_dim, self.g_dim).to(device) for i in range(self.num_gnn_layer - 1)] 128 | else: 129 | self.gnn = nn.Sequential(nn.Linear(self.h_dim, self.g_dim).to(device)) # MLP 130 | self.gnn_more = [nn.Sequential(nn.Linear(self.g_dim, self.g_dim).to(device)) for i in range(self.num_gnn_layer - 1)] 131 | 132 | if self.skip_type == '123': 133 | self.y_rep_dim = x_dim + self.h_dim + self.g_dim 134 | elif self.skip_type == '23': # phi_x + g_rep 135 | self.y_rep_dim = self.h_dim + self.g_dim 136 | 137 | # potential outcome 138 | self.out_t00 = [nn.Linear(self.y_rep_dim, self.y_rep_dim).to(device) for i in range(self.n_out)] 139 | self.out_t10 = [nn.Linear(self.y_rep_dim, self.y_rep_dim).to(device) for i in range(self.n_out)] 140 | self.out_t01 = nn.Linear(self.y_rep_dim, 1).to(device) 141 | self.out_t11 = nn.Linear(self.y_rep_dim, 1).to(device) 142 | 143 
| def forward(self, features, treatments, edge_index):
144 | phi_x = self.phi_x(features)
145 | phi_x_p = phi_x # self.phi_x_p(features)
146 | phi_x_t = torch.mul(treatments.view(-1, 1), phi_x)
147 |
148 | if self.encoder == 'gcn':
149 | rep_gnn = self.gnn(x=phi_x_t, edge_index=edge_index) # gnn encoder
150 | for i in range(self.num_gnn_layer - 1):
151 | if self.activate:
152 | rep_gnn = F.relu(rep_gnn)
153 | rep_gnn = self.gnn_more[i](x=rep_gnn, edge_index=edge_index)
154 | else:
155 | rep_gnn = self.gnn(phi_x_t)
156 | for i in range(self.num_gnn_layer - 1):
157 | if self.activate:
158 | rep_gnn = F.relu(rep_gnn)
159 | rep_gnn = self.gnn_more[i](rep_gnn)
160 | if self.activate:
161 | rep_gnn = F.relu(rep_gnn)
162 | rep_gnn = F.dropout(rep_gnn, self.dropout, training=self.training)
163 | rep_post_0 = torch.cat([torch.zeros_like(phi_x_p), rep_gnn], dim=1)
164 | rep_post_1 = torch.cat([phi_x_p, rep_gnn], dim=1)
165 |
166 | # prediction for potential outcome
167 | if self.n_out == 0:
168 | y00 = rep_post_0
169 | y10 = rep_post_1
170 | for i in range(self.n_out):
171 | y00 = F.relu(self.out_t00[i](rep_post_0))
172 | #y00 = F.dropout(y00, self.dropout, training=self.training)
173 | y10 = F.relu(self.out_t10[i](rep_post_1))
174 | #y10 = F.dropout(y10, self.dropout, training=self.training)
175 |
176 | y0_pred = self.out_t01(y00).view(-1)
177 | y1_pred = self.out_t11(y10).view(-1)
178 |
179 | results = {'y1_pred': y1_pred, 'y0_pred': y0_pred, 'rep': phi_x}
180 |
181 | return results
182 |
183 |
--------------------------------------------------------------------------------
/src/check_hypergraph.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import scipy.io as sio
3 |
4 | def hypergraph_stats(hyperedge_index, n):
5 | # hyperedge size
6 | unique_edge, counts_edge = np.unique(hyperedge_index[1], return_counts=True) # edgeid, size
7 | ave_hyperedge_size = np.mean(counts_edge)
8 | max_hyperedge_size = np.max(counts_edge)
9 | min_hyperedge_size = np.min(counts_edge)
10 | m = len(unique_edge)
11 |
12 | sz, ct = np.unique(counts_edge, return_counts=True) # hyperedgesize, count
13 | counts_edge_2 = ct[np.where(sz==2)][0]
14 |
15 | # node degree
16 | unique_node, counts_node = np.unique(hyperedge_index[0], return_counts=True) # nodeid, degree
17 | ave_degree = np.mean(counts_node)
18 | max_degree = np.max(counts_node)
19 | min_degree = np.min(counts_node)
20 | statistics = {'n': n, 'm': m, 'm>2': m-counts_edge_2,
21 | 'average_hyperedge_size': ave_hyperedge_size, 'min_hyperedge_size': min_hyperedge_size, 'max_hyperedge_size': max_hyperedge_size,
22 | 'average_degree': ave_degree, 'max_degree': max_degree, 'min_degree': min_degree}
23 | return statistics
24 |
25 | if __name__ == '__main__':
26 | path = '/data/Simulation/MS/Microsoft_sim_quadratic_alpha1.0_beta1.0_node.mat'
27 | data = sio.loadmat(path)
28 | features, hyperedge_index = data['features'], data['hyperedge_index']
29 |
30 | print('loaded data from ', path)
31 | print(hypergraph_stats(hyperedge_index, features.shape[0]))
32 |
--------------------------------------------------------------------------------
/src/data_preprocessing.py:
--------------------------------------------------------------------------------
1 | '''
2 | Generate processed data: filtering, combining ...
3 | ''' 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import csv 8 | import matplotlib.pyplot as plt 9 | import collections 10 | import os 11 | 12 | from sklearn import preprocessing 13 | from sklearn.linear_model import LinearRegression 14 | import category_encoders as ce 15 | from sklearn.pipeline import Pipeline 16 | import datetime 17 | import json 18 | import pickle 19 | from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer 20 | 21 | # from nltk import word_tokenize 22 | # from nltk.stem import PorterStemmer, WordNetLemmatizer 23 | # 24 | # import scipy.io as sio 25 | # class LemmaTokenizer: 26 | # def __init__(self): 27 | # self.wnl = WordNetLemmatizer() 28 | # def __call__(self, doc): 29 | # return [self.wnl.lemmatize(t) for t in word_tokenize(doc)] 30 | 31 | 32 | def draw_bar(x, y, x_label, y_label=None): 33 | fig = plt.figure() 34 | ax = fig.add_subplot(1, 1, 1) 35 | 36 | ax.bar(x, y) 37 | plt.xlabel(x_label) 38 | plt.ylabel(y_label) 39 | plt.show() 40 | 41 | def draw_freq(data, x_label=None, bool_discrete = False): 42 | fig = plt.figure() 43 | plt.hist(data, bins=50) 44 | 45 | plt.xlabel(x_label) 46 | plt.ylabel("Frequency") 47 | 48 | ax = fig.add_subplot(1, 1, 1) 49 | 50 | # Find at most 10 ticks on the y-axis 51 | if not bool_discrete: 52 | max_xticks = 10 53 | xloc = plt.MaxNLocator(max_xticks) 54 | ax.xaxis.set_major_locator(xloc) 55 | 56 | plt.show() 57 | 58 | def filter_goodreads(path, save_flag=True): 59 | ''' 60 | : filter: 1. review>=3; 2. author's book number >= 2 and <=50 61 | ''' 62 | 63 | # read review/rating data 64 | rating_dict = {} 65 | review_num_dict = {} 66 | book_authors_dict = {} 67 | authors_book_dict = {} 68 | with open(path) as f: 69 | for line in f: 70 | data_line = json.loads(line) 71 | asin = data_line['isbn'] 72 | if asin == "": 73 | continue 74 | if asin in rating_dict: 75 | print('repeated books!') 76 | 77 | rating_dict[asin] = float(data_line['average_rating']) 78 | review_num_dict[asin] = int(data_line['text_reviews_count']) 79 | book_authors_dict[asin] = data_line['authors'] 80 | for author_info in data_line['authors']: 81 | author = author_info['author_id'] 82 | if author in authors_book_dict: 83 | authors_book_dict[author].append(asin) 84 | else: 85 | authors_book_dict[author] = [asin] 86 | 87 | # filter: review>=3, author's book number >= 2 88 | print('all books: ', len(rating_dict)) 89 | books_select = set([asin for asin in review_num_dict if review_num_dict[asin] >= 3]) 90 | print('books with review >= 3: ', len(books_select)) 91 | 92 | authors_booknum_dict = {author: len([book for book in authors_book_dict[author] if book in books_select]) for author 93 | in authors_book_dict} # all authors' number of books (books are filtered) 94 | 95 | # filter: author's book number >= 2 96 | books_select_2 = [] 97 | for book in books_select: 98 | for author_info in book_authors_dict[book]: 99 | author = author_info['author_id'] 100 | if authors_booknum_dict[author] >= 2 and authors_booknum_dict[author] <= 50: 101 | books_select_2.append(book) 102 | break 103 | 104 | authors_select = [author for author in authors_book_dict if 105 | len(authors_book_dict[author]) > 2 and len(authors_book_dict[author]) <= 50] 106 | print('real hyperedges: authors with book > 2 and <= 50: ', len(authors_select)) 107 | authors_select = set([author for author in authors_book_dict if 108 | len(authors_book_dict[author]) >= 2 and len(authors_book_dict[author]) <= 50]) 109 | print('hyperedges: authors with book >= 2 and <=50: ', 
len(authors_select)) 110 | 111 | # book_select_2 = set(books_select_2) 112 | print('books with authors who wrote books num >= 2: ', len(books_select_2)) 113 | 114 | books_select = books_select.intersection(books_select_2) 115 | print('books selected: ', len(books_select)) 116 | 117 | authors_book_dict = {author: list(set(authors_book_dict[author]).intersection(books_select)) for author in 118 | authors_book_dict} # update with select books 119 | 120 | max_hyperedge_size = 0 121 | for author in authors_select: 122 | if len(authors_book_dict[author]) > max_hyperedge_size: 123 | max_hyperedge_size = len(authors_book_dict[author]) 124 | print('max_hyperedge_size: ', max_hyperedge_size) 125 | 126 | max_degree = 0 127 | book_authors_num_dict = { 128 | book: len([author['author_id'] for author in book_authors_dict[book] if author['author_id'] in authors_select]) 129 | for book in book_authors_dict} # all books' authors (authors are filtered) 130 | for book in books_select: 131 | if book_authors_num_dict[book] > max_degree: 132 | max_degree = book_authors_num_dict[book] 133 | print('max_degree: ', max_degree) 134 | 135 | # ratings 136 | all_ratings = np.array([int(rating_dict[asin]) for asin in books_select]) 137 | unique, frequency = np.unique(all_ratings, 138 | return_counts=True) 139 | sort_index = np.argsort(frequency)[::-1] 140 | unique = unique[sort_index] 141 | frequency = frequency[sort_index] 142 | 143 | for k, v in zip(unique, frequency): 144 | print('rating: ', k, v) 145 | 146 | books_select = list(books_select) 147 | books_select.sort() 148 | authors_select = list(authors_select) 149 | authors_select.sort() 150 | # save into files 151 | if save_flag: 152 | data_save = {'books_select': books_select, 'authors_select': authors_select} 153 | with open('../data/goodreads_select.pickle', 'wb') as f: 154 | pickle.dump(data_save, f) 155 | return books_select, authors_select 156 | 157 | def load_goodreads_select_meta(path, books_select, authors_select, save_flag=False): 158 | books_select_set = set(books_select) 159 | authors_select_set = set(authors_select) 160 | 161 | # read data 162 | rating_dict = {} 163 | book_authors_dict = {} # book asin: [author1_ID, author2_ID...] 164 | authors_book_dict = {} 165 | book_descriptions = {} 166 | book_title = {} 167 | book_review_count = {} 168 | 169 | with open(path) as f: 170 | for line in f: 171 | data_line = json.loads(line) 172 | asin = data_line['isbn'] 173 | if asin == "" or asin not in books_select_set: # only focus on selected books! 
174 | continue 175 | if asin in rating_dict: 176 | print('repeated books!') 177 | 178 | rating_dict[asin] = float(data_line['average_rating']) 179 | book_authors_dict[asin] = [] 180 | for author_info in data_line['authors']: 181 | author = author_info['author_id'] 182 | if author in authors_select_set: # only focus on selected authors 183 | book_authors_dict[asin].append(author) 184 | 185 | if author in authors_book_dict: 186 | authors_book_dict[author].append(asin) 187 | else: 188 | authors_book_dict[author] = [asin] 189 | 190 | book_descriptions[asin] = data_line['description'] + ' ' + data_line['title'] 191 | book_title[asin] = data_line['title'] 192 | book_review_count[asin] = int(data_line['text_reviews_count']) 193 | 194 | author_st = [] 195 | title_st = [] 196 | review_count_st = [] 197 | 198 | for i in range(len(books_select)): 199 | author_st.append(book_authors_dict[books_select[i]]) 200 | title_st.append(book_title[books_select[i]]) 201 | review_count_st.append(book_review_count[books_select[i]]) 202 | data_meta = {'title': title_st, 'authors': author_st, 'review_count': np.array(review_count_st)} 203 | return data_meta 204 | 205 | 206 | 207 | def load_goodreads_select(path, books_select, authors_select, save_flag=False): 208 | books_select_set = set(books_select) 209 | authors_select_set = set(authors_select) 210 | 211 | # read data 212 | rating_dict = {} 213 | book_authors_dict = {} # book asin: [author1_ID, author2_ID...] 214 | authors_book_dict = {} 215 | book_descriptions = {} 216 | 217 | with open(path) as f: 218 | for line in f: 219 | data_line = json.loads(line) 220 | asin = data_line['isbn'] 221 | if asin == "" or asin not in books_select_set: # only focus on selected books 222 | continue 223 | if asin in rating_dict: 224 | print('repeated books!') 225 | 226 | rating_dict[asin] = float(data_line['average_rating']) 227 | book_authors_dict[asin] = [] 228 | for author_info in data_line['authors']: 229 | author = author_info['author_id'] 230 | if author in authors_select_set: # only focus on selected authors 231 | book_authors_dict[asin].append(author) 232 | 233 | if author in authors_book_dict: 234 | authors_book_dict[author].append(asin) 235 | else: 236 | authors_book_dict[author] = [asin] 237 | 238 | book_descriptions[asin] = data_line['description'] + ' ' + data_line['title'] 239 | 240 | # authors who have at least one books in book_select 241 | author_with_books = [] 242 | for author in authors_select: 243 | if author in authors_book_dict and len(authors_book_dict[author])>1: # authors with at least one book 244 | author_with_books.append(author) 245 | print('size: ', len(author_with_books)) 246 | authors_select = author_with_books 247 | 248 | # bag of words 249 | corpus = [book_descriptions[asin] for asin in books_select] 250 | cv = CountVectorizer(lowercase=True, stop_words='english', ngram_range=(1, 1), max_features=500) # tokenizer=LemmaTokenizer() 251 | cv_fit = cv.fit_transform(corpus) # top 500 x word num 252 | word_name = cv.get_feature_names() # dictionary 253 | print('word num: ', len(word_name), ' ', word_name) 254 | # normalize 255 | cv_fit = cv_fit.toarray() 256 | features = preprocessing.normalize(cv_fit) 257 | features += np.random.normal(0, 1, size=(features.shape[0], features.shape[1])) 258 | print('feature mean/std: ', np.mean(features), np.std(features)) 259 | 260 | # treatment 261 | treatment = np.array([int(rating_dict[asin]) for asin in books_select]) 262 | treatment[np.where(treatment <=3)] = 0 263 | treatment[np.where(treatment > 3)] = 1 264 | 
print('t=0 and t=1: ', (treatment==0).sum(), (treatment==1).sum()) 265 | 266 | # hypergraph 267 | book2idx = {books_select[id]: id for id in range(len(books_select))} 268 | edge_idx_node = [] 269 | edge_idx_edge = [] 270 | for aid in range(len(authors_select)): 271 | books_cur = [] 272 | for book in authors_book_dict[authors_select[aid]]: 273 | if book in book2idx: 274 | books_cur.append(book2idx[book]) 275 | edge_idx_node = edge_idx_node + books_cur 276 | edge_idx_edge = edge_idx_edge + [aid for i in range(len(books_cur))] 277 | 278 | hyperedge_index = np.array([edge_idx_node, edge_idx_edge]) 279 | 280 | 281 | data_save = {'features': features, 'treatment': treatment, 'hyper_index': hyperedge_index} 282 | 283 | if save_flag: 284 | with open('../data/goodreads_processed.pickle', 'wb') as f: 285 | pickle.dump(data_save, f) 286 | return data_save 287 | 288 | def preprocess_goodreads(path): 289 | # books_select, authors_select = filter_goodreads(path, False) 290 | with open('../data/goodreads_select.pickle', 'rb') as f: 291 | data_select = pickle.load(f) 292 | books_select, authors_select = data_select['books_select'], data_select['authors_select'] 293 | 294 | # load book features 295 | load_goodreads_select(path, books_select, authors_select, True) 296 | 297 | return 298 | 299 | def hypergraph_stats(hyperedge_index, n): 300 | # hyperedge size 301 | unique_edge, counts_edge = np.unique(hyperedge_index[1], return_counts=True) # edgeid, size 302 | ave_hyperedge_size = np.mean(counts_edge) 303 | max_hyperedge_size = np.max(counts_edge) 304 | min_hyperedge_size = np.min(counts_edge) 305 | m = len(unique_edge) 306 | 307 | sz, ct = np.unique(counts_edge, return_counts=True) # hyperedgesize, count 308 | counts_edge_2 = ct[np.where(sz==2)][0] 309 | 310 | # node degree 311 | unique_node, counts_node = np.unique(hyperedge_index[0], return_counts=True) # nodeid, degree 312 | ave_degree = np.mean(counts_node) 313 | max_degree = np.max(counts_node) 314 | min_degree = np.min(counts_node) 315 | statistics = {'n': n, 'm': m, 'm>2': m-counts_edge_2, 316 | 'average_hyperedge_size': ave_hyperedge_size, 'min_hyperedge_size': min_hyperedge_size, 'max_hyperedge_size': max_hyperedge_size, 317 | 'average_degree': ave_degree, 'max_degree': max_degree, 'min_degree': min_degree} 318 | return statistics 319 | 320 | def preprocess_contact(path_root): 321 | path_nverts = path_root+'contact-high-school-nverts.txt' 322 | path_simplices = path_root+'contact-high-school-simplices.txt' 323 | 324 | # size of each hyperedge 325 | with open(path_nverts) as f: 326 | sizeOfEdge = f.readlines() 327 | f.close() 328 | sizeOfEdge = [int(i) for i in sizeOfEdge] 329 | m = len(sizeOfEdge) 330 | 331 | idx_start = [] 332 | sum_size = 0 333 | for i in range(m): 334 | idx_start.append(sum_size) 335 | sum_size += sizeOfEdge[i] 336 | 337 | # nodes in each hyperedge 338 | with open(path_simplices) as f: 339 | edge_idx_node = f.readlines() 340 | f.close() 341 | edge_idx_node = [int(i) for i in edge_idx_node] 342 | # edge_idx_edge = [i for i in range(m) for j in range(sizeOfEdge[i])] 343 | 344 | # remove redundant hyperedges 345 | unique_edges = {} 346 | edge_idx_node_unique = [] 347 | edge_idx_edge_unique = [] 348 | for i in range(m): 349 | key_nodes = edge_idx_node[idx_start[i]: idx_start[i] + sizeOfEdge[i]] 350 | key_nodes.sort() 351 | key = '' 352 | for k in key_nodes: 353 | key += '_'+str(k) 354 | if key not in unique_edges: 355 | edge_idx_edge_unique += [len(unique_edges) for j in range(sizeOfEdge[i])] 356 | edge_idx_node_unique += 
edge_idx_node[idx_start[i]: idx_start[i] + sizeOfEdge[i]] 357 | unique_edges[key] = 1 358 | 359 | edge_idx_node_unique = [i-1 for i in edge_idx_node_unique] # start from 0 360 | hyperedge_index = np.array([edge_idx_node_unique, edge_idx_edge_unique]) 361 | 362 | # statistics 363 | n = np.max(hyperedge_index[0]) + 1 364 | statistics = hypergraph_stats(hyperedge_index, n) 365 | print(statistics) 366 | 367 | # record_data 368 | data_save = {'hyper_index': hyperedge_index} 369 | 370 | save_flag = True 371 | if save_flag: 372 | with open('../data/contact_hypergraph.pickle', 'wb') as f: 373 | pickle.dump(data_save, f) 374 | return data_save 375 | 376 | 377 | if __name__ == '__main__': 378 | dataset = 'contact' 379 | if dataset == 'GoodReads': 380 | path = '../data/goodreads_books_children.json' 381 | preprocess_goodreads(path) 382 | elif dataset == 'contact': 383 | path_root = '../data/contact-high-school/' 384 | preprocess_contact(path_root) 385 | 386 | 387 | 388 | 389 | 390 | 391 | -------------------------------------------------------------------------------- /src/data_simulation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import csv 4 | import matplotlib.pyplot as plt 5 | import collections 6 | import os 7 | 8 | from sklearn import preprocessing 9 | from sklearn.linear_model import LinearRegression 10 | import category_encoders as ce 11 | from sklearn.pipeline import Pipeline 12 | import data_preprocessing as dpp 13 | import json 14 | import pickle 15 | import math 16 | 17 | import scipy.io as sio 18 | 19 | def sigmoid(x): 20 | return 1 / (1 + np.exp(-x)) 21 | 22 | def search_neighbor_hypergraph(node, hyperedge_index): 23 | # return a list containing all neighbors (exist K times if in K same hyperedges) of node (NOT include itself) 24 | arg_idx = np.argwhere(hyperedge_index[0] == node).reshape(-1) 25 | edges = hyperedge_index[1][arg_idx] # edges which include the node 26 | neighbors_idx = np.argwhere(np.isin(hyperedge_index[1], edges)) # all the neighbors on the edge 27 | neighbors_idx_noself =np.argwhere(np.isin(neighbors_idx, arg_idx).reshape(-1)).reshape(-1) 28 | neighbors_idx = np.delete(neighbors_idx, neighbors_idx_noself) # remove node itself 29 | neighbors = hyperedge_index[0][neighbors_idx] 30 | return neighbors 31 | 32 | # return a dictionary: {hyperedge_id: nodes} 33 | def search_neighbor_hyperedge(node, hyperedge_index): 34 | arg_idx = np.argwhere(hyperedge_index[0] == node) 35 | edges = hyperedge_index[1][arg_idx] # edges which include the node 36 | edges = np.unique(edges) 37 | 38 | hyperedge_neighbors = {} 39 | for ei in range(len(edges)): 40 | edge_id = edges[ei] 41 | neighbors_idx = np.argwhere(np.isin(hyperedge_index[1], [edge_id])).reshape(-1) # all the neighbors on the edge, including itself 42 | neighbors = hyperedge_index[0][neighbors_idx] 43 | hyperedge_neighbors[edge_id] = neighbors 44 | return hyperedge_neighbors 45 | 46 | def non_linear(x, type='raw'): 47 | if type == 'sigmoid': 48 | nl_x = 1 / (1 + np.exp(-x)) - 0.5 49 | if type == 'tanh': 50 | nl_x = np.tanh(x) 51 | elif type == 'raw': 52 | nl_x = x 53 | elif type == 'leaky_relu': 54 | nl_x = np.where(x > 0, x, x * 0.01) 55 | return nl_x 56 | 57 | 58 | def simulate_outcome_2(features, hyperedge_index, treatments, type='linear', alpha=1.0, beta=1.0, nonlinear_type='raw'): 59 | n = features.shape[0] 60 | dim_size = features.shape[1] 61 | norm_type = 'size' # size, no, fix, size_log, pow 62 | norm_value = 5 63 | norm_pow = 2 
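# Summary sketch of the linear outcome model implemented below, written in informal
# notation and assuming the defaults used in this function (norm_type='size',
# agg_type='mean', agg_value=50, y_C=1.0, nonlinear_type='raw'):
#     tau(x_i) = W_t . x_i + c_ft                                  (individual treatment effect)
#     s_i = (1/|E_i|) * sum_{e in E_i} non_linear( agg_value * mean_{j in e, j != i} t_j * tau(x_j) )
#     y_i = ( f_y00 + alpha * t_i * tau(x_i) + beta * s_i + w_noise * eps_i * t_i ) / (1 + alpha + beta)
# where E_i is the set of hyperedges containing node i, eps_i ~ N(0, 1), and f_y00 is
# fixed to 0 in the linear branch. For instance, a node that belongs to a single hyperedge
# with exactly two treated neighbors j and k receives the spillover term
# s_i = 50 * (tau(x_j) + tau(x_k)) / 2, since non_linear(.) is the identity for 'raw'.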
64 | 65 | agg_type = 'mean' 66 | agg_value = 50 67 | y_C = 1.0 68 | 69 | if type == 'linear': 70 | # 1. f_y00 71 | eps = np.random.normal(0.0, 0.001, size=1) 72 | W0 = np.random.normal(loc=0.0, scale=1.0, size=dim_size) 73 | #f_y00 = np.dot(features, W0) + eps # f_{𝑦, 0}(x_𝑖) = W0 x_i; (n x d) x (d x 1) -> n x 1 74 | f_y00 = 0.0 # 75 | # 2. ITE: f_t = t(\delta W x X) 76 | c_ft = 5 77 | W_t = np.random.normal(loc=0.05, scale=1.0, size=dim_size) # contact: 0.05, GR: 1.0, 0.05 78 | ites = np.dot(features, W_t).reshape(-1) + c_ft # (n x d) x (d x 1) -> n 79 | f_t = np.multiply(treatments, ites) # n 80 | 81 | # 3. Spillover effect: s = aggregate(t_j * \tau(x_j)) = aggregate(t_j * (W_t x X_j)). aggregator: mean 82 | f_s = np.zeros(n, dtype=np.float) 83 | for i in range(n): # each node 84 | if i % 1000 == 0: 85 | print("dealing with i: ", i) 86 | hyperedge_neighbors_i = search_neighbor_hyperedge(i, hyperedge_index) 87 | if len(hyperedge_neighbors_i) == 0: 88 | continue 89 | for ei in hyperedge_neighbors_i: # every hyperedge 90 | neighbors_ei = hyperedge_neighbors_i[ei] 91 | 92 | # not include itself 93 | neighbors_idx_self = np.argwhere(neighbors_ei == i) 94 | neighbors_ei = np.delete(neighbors_ei, neighbors_idx_self) # remove node itself 95 | 96 | edge_size = len(neighbors_ei) 97 | if edge_size <= 0: 98 | continue 99 | if agg_type == 'mean': 100 | f_s_i = np.mean(np.multiply(treatments[neighbors_ei], ites[neighbors_ei].reshape(-1))) # (n x 1) * (n x 1), mean over neighbors 101 | f_s_i *= agg_value 102 | elif agg_type == 'fix': 103 | f_s_i = np.sum(np.multiply(treatments[neighbors_ei], ites[neighbors_ei].reshape(-1))) 104 | f_s_i = f_s_i * edge_size / agg_value 105 | f_s_i = non_linear(f_s_i, nonlinear_type) 106 | f_s[i] += f_s_i 107 | 108 | num_ei = len(hyperedge_neighbors_i) 109 | if norm_type == 'size': 110 | f_s[i] /= num_ei # normalize with hyperedge num 111 | elif norm_type == 'fix': 112 | f_s[i] /= norm_value 113 | elif norm_type == 'size_log': 114 | f_s[i] /= (1 + math.log(num_ei)) 115 | elif norm_type == 'pow': 116 | f_s[i] /= (math.pow(num_ei, norm_pow)) 117 | elif norm_type == 'try': 118 | f_s[i] /= ( 10 / math.pow(num_ei, 2)) 119 | 120 | # observed y 121 | noise = np.random.normal(0, 1, size=n) 122 | w_noise = 20.0 123 | y = y_C * (f_y00 + alpha * f_t + beta * f_s + w_noise * noise * treatments) # n 124 | y_0 = y_C * (f_y00 + 0 + beta * f_s) 125 | y_1 = y_C * (f_y00 + alpha * ites + beta * f_s + w_noise * noise) 126 | 127 | y = y * (1.0/(1+alpha+beta)) 128 | y_0 = y_0 * (1.0/(1+alpha+beta)) 129 | y_1 = y_1 * (1.0/(1+alpha+beta)) 130 | 131 | y_0 = y_0.reshape(1, -1) 132 | y_1 = y_1.reshape(1, -1) 133 | Y_true = np.concatenate([y_0, y_1], axis=0) 134 | 135 | print('noise:', np.mean(w_noise * noise), np.std(w_noise * noise)) 136 | 137 | elif type == 'quadratic': 138 | y_C = 0.03 139 | agg_value = 10.0 140 | 141 | # 1. f_y00 142 | W0 = np.random.normal(loc=0.0, scale=1.0, size=dim_size) 143 | f_y00 = np.dot(features, W0) # f_{𝑦, 0}(x_𝑖) = W0 x_i 144 | # 2. ITE: f_t = t(X \delta W X^T) 145 | c_ft = 5 146 | W_t = np.random.normal(loc=0.5, scale=3.0, size=(dim_size, dim_size)) 147 | ites = np.diag(np.dot(np.dot(features, W_t), features.T)).reshape( 148 | -1) + c_ft # diag((n x d) x (d x d) x (d x n)) -> n 149 | f_t = np.multiply(treatments, ites) # n 150 | 151 | # 3. Spillover effect: s = aggregate(t_j * \tau(x_j)) = aggregate(t_j * (W_t x X_j)). 
aggregator: mean 152 | f_s = np.zeros(n, dtype=np.float) 153 | for i in range(n): # each node 154 | if i % 10000 == 0: 155 | print("dealing with i: ", i) 156 | hyperedge_neighbors_i = search_neighbor_hyperedge(i, hyperedge_index) # {hyperedge_id: nodes} 157 | if len(hyperedge_neighbors_i) == 0: 158 | continue 159 | for ei in hyperedge_neighbors_i: # every hyperedge 160 | neighbors_ei = hyperedge_neighbors_i[ei] 161 | 162 | # not include itself 163 | neighbors_idx_self = np.argwhere(neighbors_ei == i) 164 | neighbors_ei = np.delete(neighbors_ei, neighbors_idx_self) # remove node itself 165 | 166 | edge_size = len(neighbors_ei) 167 | if edge_size <= 0: 168 | print(1) 169 | continue 170 | 171 | masked_features = treatments[neighbors_ei].reshape(-1,1) * features[neighbors_ei] # n x d 172 | f_s_i = np.matmul(np.matmul(masked_features, W_t), masked_features.T) # (n x d) x (d x d) x (d x n) -> n x n 173 | f_s_i = np.mean(f_s_i) 174 | f_s_i = non_linear(f_s_i, nonlinear_type) # activate 175 | 176 | f_s_i *= agg_value 177 | f_s[i] += f_s_i 178 | 179 | num_ei = len(hyperedge_neighbors_i) 180 | if norm_type == 'size': 181 | f_s[i] /= num_ei # normalize with hyperedge num 182 | elif norm_type == 'fix': 183 | f_s[i] /= norm_value 184 | elif norm_type == 'size_log': 185 | f_s[i] /= (1 + math.log(num_ei)) 186 | elif norm_type == 'pow': 187 | f_s[i] /= (math.pow(num_ei, norm_pow)) 188 | 189 | # observed y 190 | noise = np.random.normal(0, 1, size=n) 191 | w_noise = 20.0 # 2.0 192 | y = y_C * (f_y00 + alpha * f_t + beta * f_s) + w_noise * noise * treatments # n 193 | y_0 = y_C * (f_y00 + 0 + beta * f_s) 194 | y_1 = y_C * (f_y00 + alpha * ites + beta * f_s) + w_noise * noise 195 | 196 | y_0 = y_0.reshape(1, -1) 197 | y_1 = y_1.reshape(1, -1) 198 | Y_true = np.concatenate([y_0, y_1], axis=0) 199 | 200 | print('noise:', np.mean(w_noise * noise), np.std(w_noise * noise)) 201 | 202 | simulate_outcome_results = {'outcomes': y, 'Y_true': Y_true} 203 | return simulate_outcome_results 204 | 205 | 206 | def simulate_goodreads(type='linear', alpha=1.0, beta=1.0, path_save=None, nonlinear_type='raw'): 207 | with open('../data/goodreads_processed.pickle', 'rb') as f: 208 | data_processed = pickle.load(f) 209 | features, treatment_binary, hyperedge_index = data_processed['features'], data_processed['treatment'], data_processed['hyper_index'] 210 | 211 | feat_noise = np.random.normal(0, 0.05, features.shape) 212 | features += feat_noise 213 | 214 | # simulation begins 215 | simulate_outcome_results = simulate_outcome_2(features, hyperedge_index, treatment_binary, type, alpha, beta, nonlinear_type) 216 | 217 | simulation_data = { 218 | 'parameter': {'alpha': alpha, 'beta': beta, 'type': type, 'nonlinear_type': nonlinear_type}, 219 | 'features': features, 220 | 'treatments': treatment_binary, 221 | 'outcomes': simulate_outcome_results['outcomes'], # observed 222 | 'Y_true': simulate_outcome_results['Y_true'], # potential 223 | 'hyperedge_index': hyperedge_index 224 | } 225 | if path_save is None: 226 | if type == 'observed': 227 | path_save = '../data/Simulation/GR/GoodReads_obsY.mat' 228 | else: 229 | path_save = '../data/Simulation/GR/GoodReads_sim_' + type + '_alpha' + str(alpha) + '_beta' + str( 230 | beta) + '.mat' 231 | save_flag = True 232 | if save_flag: 233 | sio.savemat(path_save, simulation_data) 234 | print('Data saved! 
Path: ', path_save) 235 | if type != 'observed': 236 | print('type=', type, ' alpha=', alpha, ' beta=',beta) 237 | return simulation_data 238 | 239 | def agg_features(features, hyperedge_index, alpha=0.5): 240 | features_new = features.copy() 241 | num_of_neighbors = [] 242 | for i in range(features.shape[0]): 243 | hyperedge_neighbors_i = search_neighbor_hyperedge(i, hyperedge_index) # {hyperedge_id: nodes} 244 | if len(hyperedge_neighbors_i) == 0: 245 | continue 246 | feature_agg_ei_all = 0.0 247 | for ei in hyperedge_neighbors_i: # every hyperedge 248 | neighbors_ei = hyperedge_neighbors_i[ei] 249 | 250 | # not include itself 251 | neighbors_idx_self = np.argwhere(neighbors_ei == i) 252 | neighbors_ei = np.delete(neighbors_ei, neighbors_idx_self) # remove node itself 253 | 254 | feature_agg_ei = np.mean(features[neighbors_ei], axis=0) # mean 255 | feature_agg_ei_all = feature_agg_ei_all + feature_agg_ei 256 | feature_agg_ei_all /= len(hyperedge_neighbors_i) 257 | 258 | features_new[i] = (1 - alpha) * features[i] + alpha * feature_agg_ei_all 259 | 260 | return features_new 261 | 262 | def simulate_contact(type='linear', alpha=1.0, beta=1.0, path_save=None, nonlinear_type='raw'): 263 | # load hypergraph 264 | with open('../data/contact_hypergraph.pickle', 'rb') as f: 265 | data_processed = pickle.load(f) 266 | hyperedge_index = data_processed['hyper_index'] 267 | n = np.max(hyperedge_index[0]) + 1 268 | 269 | # simulate features 270 | d_x = 50 271 | features = np.random.normal(loc=0.2, scale=1, size=(n, d_x)) # 10 272 | print('feature std: ', np.mean(np.std(features, axis=0))) 273 | 274 | # simulate treatments 275 | W = np.random.normal(0,1,size=(d_x,1)) 276 | treatment_orin = sigmoid(0.01 * np.matmul(features, W)) # N X 1 277 | treatment_orin = treatment_orin.reshape(-1) 278 | treated_ratio = 0.49 279 | thresh_t = np.sort(treatment_orin)[::-1][int(treated_ratio * len(treatment_orin))] 280 | treatment = np.zeros_like(treatment_orin) 281 | treatment[np.where(treatment_orin >= thresh_t)] = 1.0 282 | treatment[np.where(treatment_orin < thresh_t)] = 0.0 283 | treated_ratio = float(np.count_nonzero(treatment)) / n 284 | print('treatment ratio: ', treated_ratio) 285 | 286 | # simulate outcomes 287 | simulate_outcome_results = simulate_outcome_2(features, hyperedge_index, treatment, type, alpha, beta, nonlinear_type) 288 | 289 | simulation_data = { 290 | 'parameter': {'alpha':alpha, 'beta': beta, 'type': type, 'nonlinear_type': nonlinear_type}, 291 | 'features': features, 292 | 'treatments': treatment, 293 | 'outcomes': simulate_outcome_results['outcomes'], 294 | 'Y_true': simulate_outcome_results['Y_true'], 295 | 'hyperedge_index': hyperedge_index 296 | } 297 | if path_save is None: 298 | path_save = '../data/Simulation/contact/contact_sim_' + type + '_alpha' + str(alpha) + '_beta' + str( 299 | beta) + '.mat' 300 | save_flag = True 301 | if save_flag: 302 | sio.savemat(path_save, simulation_data) 303 | print('Data saved! 
Path: ', path_save) 304 | print('type=', type, ' alpha=', alpha, ' beta=', beta) 305 | return simulation_data 306 | 307 | if __name__ == '__main__': 308 | dataset = 'contact' # Microsoft, GoodReads, contact 309 | 310 | if dataset == 'GoodReads': 311 | type = 'linear' # linear, quadratic 312 | nonlinear_type = 'raw' 313 | alpha = 1.0 314 | beta = 1.0 315 | path_save = '../data/Simulation/GR/GoodReads_sim_' + type + '_alpha' + str(alpha) + '_beta' + str(beta) +\ 316 | '_nonlinear_' + nonlinear_type + '.mat' 317 | simulate_goodreads(type=type, alpha=alpha, beta=beta, path_save=path_save, nonlinear_type=nonlinear_type) 318 | 319 | elif dataset == 'contact': 320 | type = 'linear' # linear, quadratic 321 | nonlinear_type = 'raw' 322 | alpha = 1.0 323 | beta = 1.0 324 | path_save = '../data/Simulation/contact/contact_sim_' + type + '_alpha' + str(alpha) + '_beta' + str( 325 | beta) + '_nonlinear_' + nonlinear_type + '.mat' 326 | simulate_contact(type=type, alpha=alpha, beta=beta, path_save=path_save, nonlinear_type=nonlinear_type) -------------------------------------------------------------------------------- /src/readme: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /src/utils.py: -------------------------------------------------------------------------------- 1 | import scipy.io as sio 2 | import torch 3 | import scipy.sparse as sp 4 | import numpy as np 5 | import random 6 | import torch.nn.functional as F 7 | from scipy.sparse import csc_matrix 8 | import scipy.sparse as sparse 9 | import pandas as pd 10 | 11 | device = torch.device("cuda:0") 12 | 13 | def pdist(sample_1, sample_2, norm=2, eps=1e-5): 14 | """Compute the matrix of all squared pairwise distances. 15 | Arguments 16 | --------- 17 | sample_1 : torch.Tensor or Variable 18 | The first sample, should be of shape ``(n_1, d)``. 19 | sample_2 : torch.Tensor or Variable 20 | The second sample, should be of shape ``(n_2, d)``. 21 | norm : float 22 | The l_p norm to be used. 23 | Returns 24 | ------- 25 | torch.Tensor or Variable 26 | Matrix of shape (n_1, n_2). The [i, j]-th entry is equal to 27 | ``|| sample_1[i, :] - sample_2[j, :] ||_p``.""" 28 | n_1, n_2 = sample_1.size(0), sample_2.size(0) 29 | norm = float(norm) 30 | if norm == 2.: 31 | norms_1 = torch.sum(sample_1**2, dim=1, keepdim=True) 32 | norms_2 = torch.sum(sample_2**2, dim=1, keepdim=True) 33 | norms = (norms_1.expand(n_1, n_2) + 34 | norms_2.transpose(0, 1).expand(n_1, n_2)) 35 | distances_squared = norms - 2 * sample_1.mm(sample_2.t()) 36 | return torch.sqrt(eps + torch.abs(distances_squared)) 37 | else: 38 | dim = sample_1.size(1) 39 | expanded_1 = sample_1.unsqueeze(1).expand(n_1, n_2, dim) 40 | expanded_2 = sample_2.unsqueeze(0).expand(n_1, n_2, dim) 41 | differences = torch.abs(expanded_1 - expanded_2) ** norm 42 | inner = torch.sum(differences, dim=2, keepdim=False) 43 | return (eps + inner) ** (1. 
/ norm) 44 | 45 | def wasserstein(x, y, device, p=0.5, lam=10, its=10, sq=False, backpropT=False, cuda=False): 46 | """return W dist between x and y""" 47 | '''distance matrix M''' 48 | nx = x.shape[0] 49 | ny = y.shape[0] 50 | 51 | x = x.squeeze() 52 | y = y.squeeze() 53 | 54 | # pdist = torch.nn.PairwiseDistance(p=2) 55 | 56 | M = pdist(x, y) # distance_matrix(x,y,p=2) 57 | 58 | '''estimate lambda and delta''' 59 | M_mean = torch.mean(M) 60 | M_drop = F.dropout(M, 10.0 / (nx * ny)) 61 | delta = torch.max(M_drop).cpu().detach() 62 | eff_lam = (lam / M_mean).cpu().detach() 63 | 64 | '''compute new distance matrix''' 65 | Mt = M 66 | row = delta * torch.ones(M[0:1, :].shape) 67 | col = torch.cat([delta * torch.ones(M[:, 0:1].shape), torch.zeros((1, 1))], 0) 68 | if cuda: 69 | #row = row.cuda() 70 | #col = col.cuda() 71 | row = row.to(device) 72 | col = col.to(device) 73 | Mt = torch.cat([M, row], 0) 74 | Mt = torch.cat([Mt, col], 1) 75 | 76 | '''compute marginal''' 77 | a = torch.cat([p * torch.ones((nx, 1)) / nx, (1 - p) * torch.ones((1, 1))], 0) 78 | b = torch.cat([(1 - p) * torch.ones((ny, 1)) / ny, p * torch.ones((1, 1))], 0) 79 | 80 | '''compute kernel''' 81 | Mlam = eff_lam * Mt 82 | temp_term = torch.ones(1) * 1e-6 83 | if cuda: 84 | #temp_term = temp_term.cuda() 85 | #a = a.cuda() 86 | #b = b.cuda() 87 | temp_term = temp_term.to(device) 88 | a = a.to(device) 89 | b = b.to(device) 90 | K = torch.exp(-Mlam) + temp_term 91 | U = K * Mt 92 | ainvK = K / a 93 | 94 | u = a 95 | 96 | for i in range(its): 97 | u = 1.0 / (ainvK.matmul(b / torch.t(torch.t(u).matmul(K)))) 98 | if cuda: 99 | #u = u.cuda() 100 | u = u.to(device) 101 | v = b / (torch.t(torch.t(u).matmul(K))) 102 | if cuda: 103 | #v = v.cuda() 104 | v = v.to(device) 105 | 106 | upper_t = u * (torch.t(v) * K).detach() 107 | 108 | E = upper_t * Mt 109 | D = 2 * torch.sum(E) 110 | 111 | if cuda: 112 | #D = D.cuda() 113 | D = D.to(device) 114 | 115 | return D, Mlam 116 | 117 | def pdist2sq(x_t, x_cf): 118 | C = -2 * torch.matmul(x_t,torch.t(x_cf)) 119 | n_t = torch.sum(x_t * x_t, 1, True) 120 | n_cf = torch.sum(x_cf * x_cf, 1, True) 121 | D = (C + torch.t(n_cf)) + n_t 122 | return D 123 | 124 | def mmd2_rbf(Xt, Xc, p,sig): 125 | """ Computes the l2-RBF MMD for X given t """ 126 | 127 | Kcc = torch.exp(-pdist2sq(Xc,Xc)/(sig)**2) 128 | Kct = torch.exp(-pdist2sq(Xc,Xt)/(sig)**2) 129 | Ktt = torch.exp(-pdist2sq(Xt,Xt)/(sig)**2) 130 | 131 | m = Xc.shape[0] 132 | n = Xt.shape[0] 133 | 134 | mmd = (1.0-p)**2/(m*(m-1.0))*(torch.sum(Kcc)-m) 135 | mmd = mmd + (p) ** 2/(n*(n-1.0))*(torch.sum(Ktt)-n) 136 | mmd = mmd - 2.0*p*(1.0-p)/(m*n)*torch.sum(Kct) 137 | mmd = 4.0*mmd 138 | 139 | return mmd 140 | 141 | def mmd2_lin(Xt, Xc,p): 142 | ''' Linear MMD ''' 143 | mean_control = torch.mean(Xc,0) 144 | mean_treated = torch.mean(Xt,0) 145 | 146 | mmd = torch.sum((2.0*p*mean_treated - 2.0*(1.0-p)*mean_control) ** 2) 147 | 148 | return mmd 149 | 150 | def safe_sqrt(x, lbound=1e-10): 151 | ''' Numerically safe version of pytorch sqrt ''' 152 | return torch.sqrt(torch.clamp(x, lbound, np.inf)) 153 | 154 | def get_hyperedge_attr(features, hyperedge_index, type='mean'): 155 | # input: features: tensor N x F; hyperedge_index: 2 x |sum of all hyperedge size| 156 | # return hyperedge_attr: tensor, M x F 157 | #features = torch.FloatTensor([[0, 0.1, 0.2], [1.1, 1.2, 1.3], [2., 2.1, 2.2], [3.1,3.2,3.3], [4, 4.1, 4.2], [5,5,5]]) 158 | #hyperedge_index = torch.LongTensor([[0,1,0,3,4,5,1],[0,0,1,1,1,2,2]]) 159 | if type == 'mean': 160 | # hyperedge_attr = 
features[hyperedge_index[0]] # |sum of all hyperedge size| x F 161 | # index_start = # M, the start index of every hyperedge 162 | # hyperedge_attr = torch.tensor_split(hyperedge_attr, index_start) # 163 | hyperedge_attr = None 164 | samples = features[hyperedge_index[0]] 165 | labels = hyperedge_index[1] 166 | 167 | labels = labels.view(labels.size(0), 1).expand(-1, samples.size(1)) 168 | unique_labels, labels_count = labels.unique(dim=0, return_counts=True) 169 | 170 | hyperedge_attr = torch.zeros_like(unique_labels, dtype=torch.float).scatter_add_(0, labels, samples) 171 | hyperedge_attr = hyperedge_attr / labels_count.float().unsqueeze(1) 172 | return hyperedge_attr 173 | 174 | def sparse_mx_to_torch_sparse_tensor(sparse_mx): 175 | """Convert a scipy sparse matrix to a torch sparse tensor.""" 176 | sparse_mx = sparse_mx.tocoo().astype(np.float32) 177 | indices = torch.from_numpy( 178 | np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64)) 179 | values = torch.from_numpy(sparse_mx.data) 180 | shape = torch.Size(sparse_mx.shape) 181 | return torch.sparse.FloatTensor(indices, values, shape) 182 | 183 | def project_hypergraph(n, hyperedge_index, type='hyper_index'): 184 | # inner product: uid, tid + uid, tid => uid - tid - uid 185 | # hyperedge_index = torch.LongTensor([[0], [0]]) 186 | # hyperedge_index = sparse.eye(n, dtype=np.int8) 187 | # return hyperedge_index 188 | 189 | df = pd.DataFrame(data={'uid': hyperedge_index[0], 'tid': hyperedge_index[1]}) 190 | df = pd.merge(df, df, on='tid') 191 | 192 | df_team_num = df[df['uid_x'] < df['uid_y']] # u 1] 196 | print('num of high-order repeat: ', df_team_num.shape[0], ' highest', df_team_num['Count'].max()) 197 | 198 | df = df.loc[:, ['uid_x', 'uid_y']].drop_duplicates() # (uid, uid) with no repeat, (u, v) and (v, u) are both in 199 | 200 | if type == 'graph_index': 201 | df_self = pd.DataFrame(data={'uid_x': np.arange(n), 'uid_y': np.arange(n)}) 202 | df = df.append(df_self, ignore_index=True).drop_duplicates() 203 | df = df.sort_values(by=['uid_x', 'uid_y'], ascending=True) # add edge (i,i) 204 | 205 | edge_num = df.shape[0] 206 | rows = df.loc[:, 'uid_x'].values.reshape(-1) 207 | cols = df.loc[:, 'uid_y'].values.reshape(-1) 208 | 209 | data = np.ones(edge_num) 210 | adj_sparse = csc_matrix((data, (rows, cols)), shape=(n, n)) 211 | projected_graph = adj_sparse 212 | 213 | print('projected the hypergraph into a plain graph with edge num: ', (edge_num-n)/2) 214 | 215 | elif type == 'hyper_index': 216 | df = df.drop(df[df.uid_x == df.uid_y].index) # remove self loop 217 | df = df.drop(df[df.uid_x >= df.uid_y].index) # just keep (u, v), u