├── Process
│   ├── process.py
│   ├── rand5fold.py
│   └── dataset.py
├── README.md
├── others
│   ├── earlystopping.py
│   └── evaluate.py
└── main.py
--------------------------------------------------------------------------------
/Process/process.py:
--------------------------------------------------------------------------------
import os

from Process.dataset import GraphDataset, test_GraphDataset

cwd = os.getcwd()


def loadData(dataname, fold_x_train, fold_x_test, droprate):
    # Graph augmentation (edge dropping/adding/misplacing and feature masking)
    # is only applied to the training set; the test set is always loaded with
    # droprate=0.
    print("loading train set")
    traindata_list = GraphDataset(fold_x_train, droprate=droprate)
    print("train no:", len(traindata_list))
    print("loading test set")
    testdata_list = test_GraphDataset(fold_x_test, droprate=0)
    print("test no:", len(testdata_list))
    return traindata_list, testdata_list
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# GACL

Tiening Sun, Zhong Qian, Sujun Dong, Peifeng Li, and Qiaoming Zhu. Rumor Detection on Social Media with Graph Adversarial Contrastive Learning. In Proceedings of the ACM Web Conference 2022 (WWW '22), April 25–29, 2022, Virtual Event, Lyon, France. ACM, New York, NY, USA, 9 pages. https://doi.org/10.1145/3485447.3511999


## Run

First, download the datasets and the corresponding pre-trained word vectors from https://1drv.ms/u/s!AiEW2lmZS3GShCqqIDQ2bQ1vu-SY?e=JQAnXk.

Then, follow the instructions (README.txt) to place the datasets and word vectors in the correct folders.

Finally, run the main.py file.
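
For reference, the paths hard-coded in `Process/dataset.py` and `main.py` expect a layout like the following (Twitter16 shown; `<eid>` stands for an event id, and the per-file notes are our reading of the code):

```
./data/twitter16/<eid>/after_tweets.pkl      # ordered tweet ids of the thread
./data/twitter16/<eid>/after_structure.pkl   # reply pairs; the first entry involves ROOT
./data/label_16.json                         # eid -> label string
./data/Twitter16_label_All.txt               # tab-separated: label ... eid
./bert_w2c/T16/t16_mask_00/<eid>.json        # 768-d node features, no masking
./bert_w2c/T16/t16_mask_015/<eid>.json       # 768-d node features, 15% masked
```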

## Other

The raw datasets can be downloaded from https://www.dropbox.com/s/7ewzdrbelpmrnxu/rumdetect2017.zip?dl=0 and https://figshare.com/search?q=pheme

--------------------------------------------------------------------------------
/others/earlystopping.py:
--------------------------------------------------------------------------------
import numpy as np
import torch


class EarlyStopping:

    def __init__(self, patience=7, verbose=False):
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.accs = 0
        self.F1 = 0
        self.F2 = 0
        self.F3 = 0
        self.F4 = 0
        self.val_loss_min = np.inf

    def __call__(self, val_loss, accs, F1, F2, F3, F4, model, modelname, dataname):

        # The tracked score is the mean of accuracy and the four per-class F1s.
        score = (accs + F1 + F2 + F3 + F4) / 5

        if self.best_score is None:
            self.best_score = score
            self.accs = accs
            self.F1 = F1
            self.F2 = F2
            self.F3 = F3
            self.F4 = F4
            self.save_checkpoint(val_loss, model, modelname, dataname)
        elif score < self.best_score:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
                print("BEST Accuracy: {:.3f}|NR F1: {:.3f}|FR F1: {:.3f}|TR F1: {:.3f}|UR F1: {:.3f}"
                      .format(self.accs, self.F1, self.F2, self.F3, self.F4))
        else:
            self.best_score = score
            self.accs = accs
            self.F1 = F1
            self.F2 = F2
            self.F3 = F3
            self.F4 = F4
            self.save_checkpoint(val_loss, model, modelname, dataname)
            self.counter = 0

    def save_checkpoint(self, val_loss, model, modelname, dataname):
        # Minimal checkpoint saver; the checkpoint file name is an assumption.
        if self.verbose:
            print('Validation loss decreased ({:.6f} --> {:.6f}). Saving model ...'
                  .format(self.val_loss_min, val_loss))
        torch.save(model.state_dict(), '{}_{}.pt'.format(modelname, dataname))
        self.val_loss_min = val_loss
--------------------------------------------------------------------------------
/others/evaluate.py:
--------------------------------------------------------------------------------
def _class_metrics(TP, FP, FN, TN):
    # Accuracy, precision, recall and F1 for one class (one-vs-rest),
    # guarding every ratio against a zero denominator.
    Acc = round(float(TP + TN) / float(TP + TN + FN + FP), 4)
    Prec = round(float(TP) / float(TP + FP), 4) if (TP + FP) > 0 else 0
    Recall = round(float(TP) / float(TP + FN), 4) if (TP + FN) > 0 else 0
    F = round(2 * Prec * Recall / (Prec + Recall), 4) if (Prec + Recall) > 0 else 0
    return Acc, Prec, Recall, F


def _count_confusion(prediction, y, num_classes):
    # Per-class confusion counts: counts[c] = [TP, FP, FN, TN].
    counts = [[0, 0, 0, 0] for _ in range(num_classes)]
    for Act, Pre in zip(y, prediction):
        for c in range(num_classes):
            if Act == c and Pre == c:
                counts[c][0] += 1  # TP
            elif Act != c and Pre == c:
                counts[c][1] += 1  # FP
            elif Act == c and Pre != c:
                counts[c][2] += 1  # FN
            else:
                counts[c][3] += 1  # TN
    return counts


def evaluation4class(prediction, y):  # 4 classes
    counts = _count_confusion(prediction, y, 4)
    Acc_all = round(float(sum(c[0] for c in counts)) / float(len(y)), 4)
    res = [Acc_all]
    for c in counts:
        res.extend(_class_metrics(*c))
    # Acc_all, then (Acc, Prec, Recall, F1) for each of the four classes.
    return tuple(res)


def evaluationclass(prediction, y):  # 2 classes
    counts = _count_confusion(prediction, y, 2)
    Acc_all = round(float(sum(c[0] for c in counts)) / float(len(y)), 4)
    res = [Acc_all]
    for c in counts:
        res.extend(_class_metrics(*c))
    return tuple(res)
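

# A quick self-check (a hypothetical example, not part of the evaluation API):
# with y = [0, 1, 2, 3] and prediction = [0, 1, 2, 2], overall accuracy is
# 0.75, class 2 ("true") gets precision 0.5, and class 3 ("false") gets
# recall 0.
if __name__ == '__main__':
    print(evaluation4class([0, 1, 2, 2], [0, 1, 2, 3]))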
--------------------------------------------------------------------------------
/Process/rand5fold.py:
--------------------------------------------------------------------------------
import random
from random import shuffle
import os


def load5foldData(obj, data_path, label_path):
    path = data_path
    if 'Twitter' in obj:
        labelPath = os.path.join(label_path)
        labelset_nonR, labelset_f, labelset_t, labelset_u = ['non-rumor'], ['false'], ['true'], ['unverified']
        t_path = path
        file_list = os.listdir(t_path)
        print('The len of file_list: ', len(file_list))

        NR, F, T, U = [], [], [], []
        l1 = l2 = l3 = l4 = 0
        labelDic = {}
        for line in open(labelPath):
            line = line.rstrip()
            label, eid = line.split('\t')[0], line.split('\t')[2]

            if eid in file_list:
                labelDic[eid] = label.lower()

                if labelDic[eid] in labelset_nonR:  # NR
                    NR.append(eid)
                    l1 += 1
                if labelDic[eid] in labelset_f:  # F
                    F.append(eid)
                    l2 += 1
                if labelDic[eid] in labelset_t:  # T
                    T.append(eid)
                    l3 += 1
                if labelDic[eid] in labelset_u:  # U
                    U.append(eid)
                    l4 += 1
        print(len(labelDic))
        print(l1, l2, l3, l4)
        random.shuffle(NR)
        random.shuffle(F)
        random.shuffle(T)
        random.shuffle(U)

        # 20% of each class goes to the test split of every fold, so the
        # folds stay class-balanced.
        fold0_x_test, fold1_x_test, fold2_x_test, fold3_x_test, fold4_x_test = [], [], [], [], []
        fold0_x_train, fold1_x_train, fold2_x_train, fold3_x_train, fold4_x_train = [], [], [], [], []
        leng1 = int(l1 * 0.2)
        leng2 = int(l2 * 0.2)
        leng3 = int(l3 * 0.2)
        leng4 = int(l4 * 0.2)

        fold0_x_test.extend(NR[0:leng1])
        fold0_x_test.extend(F[0:leng2])
        fold0_x_test.extend(T[0:leng3])
        fold0_x_test.extend(U[0:leng4])
        fold0_x_train.extend(NR[leng1:])
        fold0_x_train.extend(F[leng2:])
        fold0_x_train.extend(T[leng3:])
        fold0_x_train.extend(U[leng4:])

        fold1_x_train.extend(NR[0:leng1])
        fold1_x_train.extend(NR[leng1 * 2:])
        fold1_x_train.extend(F[0:leng2])
        fold1_x_train.extend(F[leng2 * 2:])
        fold1_x_train.extend(T[0:leng3])
        fold1_x_train.extend(T[leng3 * 2:])
        fold1_x_train.extend(U[0:leng4])
        fold1_x_train.extend(U[leng4 * 2:])
        fold1_x_test.extend(NR[leng1:leng1 * 2])
        fold1_x_test.extend(F[leng2:leng2 * 2])
        fold1_x_test.extend(T[leng3:leng3 * 2])
        fold1_x_test.extend(U[leng4:leng4 * 2])

        fold2_x_train.extend(NR[0:leng1 * 2])
        fold2_x_train.extend(NR[leng1 * 3:])
        fold2_x_train.extend(F[0:leng2 * 2])
        fold2_x_train.extend(F[leng2 * 3:])
        fold2_x_train.extend(T[0:leng3 * 2])
        fold2_x_train.extend(T[leng3 * 3:])
        fold2_x_train.extend(U[0:leng4 * 2])
        fold2_x_train.extend(U[leng4 * 3:])
        fold2_x_test.extend(NR[leng1 * 2:leng1 * 3])
        fold2_x_test.extend(F[leng2 * 2:leng2 * 3])
        fold2_x_test.extend(T[leng3 * 2:leng3 * 3])
        fold2_x_test.extend(U[leng4 * 2:leng4 * 3])

        fold3_x_train.extend(NR[0:leng1 * 3])
        fold3_x_train.extend(NR[leng1 * 4:])
        fold3_x_train.extend(F[0:leng2 * 3])
        fold3_x_train.extend(F[leng2 * 4:])
        fold3_x_train.extend(T[0:leng3 * 3])
        fold3_x_train.extend(T[leng3 * 4:])
        fold3_x_train.extend(U[0:leng4 * 3])
        fold3_x_train.extend(U[leng4 * 4:])
        fold3_x_test.extend(NR[leng1 * 3:leng1 * 4])
        fold3_x_test.extend(F[leng2 * 3:leng2 * 4])
        fold3_x_test.extend(T[leng3 * 3:leng3 * 4])
        fold3_x_test.extend(U[leng4 * 3:leng4 * 4])

        fold4_x_train.extend(NR[0:leng1 * 4])
        fold4_x_train.extend(NR[leng1 * 5:])
        fold4_x_train.extend(F[0:leng2 * 4])
        fold4_x_train.extend(F[leng2 * 5:])
        fold4_x_train.extend(T[0:leng3 * 4])
        fold4_x_train.extend(T[leng3 * 5:])
        fold4_x_train.extend(U[0:leng4 * 4])
        fold4_x_train.extend(U[leng4 * 5:])
        fold4_x_test.extend(NR[leng1 * 4:leng1 * 5])
        fold4_x_test.extend(F[leng2 * 4:leng2 * 5])
        fold4_x_test.extend(T[leng3 * 4:leng3 * 5])
        fold4_x_test.extend(U[leng4 * 4:leng4 * 5])

        fold0_test = list(fold0_x_test)
        shuffle(fold0_test)
        fold0_train = list(fold0_x_train)
        shuffle(fold0_train)
        fold1_test = list(fold1_x_test)
        shuffle(fold1_test)
        fold1_train = list(fold1_x_train)
        shuffle(fold1_train)
        fold2_test = list(fold2_x_test)
        shuffle(fold2_test)
        fold2_train = list(fold2_x_train)
        shuffle(fold2_train)
        fold3_test = list(fold3_x_test)
        shuffle(fold3_test)
        fold3_train = list(fold3_x_train)
        shuffle(fold3_train)
        fold4_test = list(fold4_x_test)
        shuffle(fold4_test)
        fold4_train = list(fold4_x_train)
        shuffle(fold4_train)

        return list(fold0_test), list(fold0_train), \
               list(fold1_test), list(fold1_train), \
               list(fold2_test), list(fold2_train), \
               list(fold3_test), list(fold3_train), \
               list(fold4_test), list(fold4_train)
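

# A minimal sketch of the rotation used above (toy ids, not real event ids):
# for fold k, the test slice of each class list X is X[leng*k : leng*(k+1)]
# and the train split is everything outside that window.
if __name__ == '__main__':
    X = ['eid%d' % i for i in range(10)]
    leng = int(len(X) * 0.2)
    for k in range(5):
        test = X[leng * k: leng * (k + 1)]
        train = X[:leng * k] + X[leng * (k + 1):]
        print('fold', k, 'test:', test, 'train size:', len(train))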
--------------------------------------------------------------------------------
/Process/dataset.py:
--------------------------------------------------------------------------------
import os
import numpy as np
import torch
import random
from torch.utils.data import Dataset
from torch_geometric.data import Data
import pickle
#from transformers import *
import json
from torch.utils.data import DataLoader


# global label -> id mapping
label2id = {
    "unverified": 0,
    "non-rumor": 1,
    "true": 2,
    "false": 3,
}


def random_pick(options, probabilities):
    # Draw one item from `options` according to `probabilities`
    # (a discrete distribution sampled via its CDF).
    x = random.uniform(0, 1)
    cumulative_probability = 0.0
    for item, item_probability in zip(options, probabilities):
        cumulative_probability += item_probability
        if x < cumulative_probability:
            break
    return item
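
# For example, random_pick([1, 2, 3], [0.7, 0.2, 0.1]) returns 1 with
# probability 0.7, 2 with probability 0.2 and 3 with probability 0.1.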

class RumorDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


class GraphDataset(Dataset):
    def __init__(self, fold_x, droprate):
        self.fold_x = fold_x
        self.droprate = droprate

    def __len__(self):
        return len(self.fold_x)

    def __getitem__(self, index):
        eid = self.fold_x[index]

        # ==================================== edgeindex ====================================
        with open('./data/twitter16/' + eid + '/after_tweets.pkl', 'rb') as t:
            tweets = pickle.load(t)
        tweet2idx = {}
        for i, tweet in enumerate(tweets):
            tweet2idx[tweet] = i

        with open('./data/twitter16/' + eid + '/after_structure.pkl', 'rb') as f:
            inf = pickle.load(f)

        # Skip the first pair and drop any pair involving the ROOT marker;
        # the rest are mapped from tweet ids to node indices.
        inf = inf[1:]
        new_inf = []
        for pair in inf:
            new_pair = []
            for E in pair:
                if E == 'ROOT':
                    break
                E = tweet2idx[E]
                new_pair.append(E)
            if E != 'ROOT':
                new_inf.append(new_pair)
        new_inf = np.array(new_inf).T
        edgeindex = new_inf

        # Make the graph undirected by concatenating both edge directions.
        init_row = list(edgeindex[0])
        init_col = list(edgeindex[1])
        burow = list(edgeindex[1])
        bucol = list(edgeindex[0])
        row = init_row + burow
        col = init_col + bucol

        new_edgeindex = [row, col]

        # ============================ dropping + adding + misplacing ============================
        choose_list = [1, 2, 3]  # 1-drop 2-add 3-misplace
        probabilities = [0.7, 0.2, 0.1]  # T15: probabilities = [0.5, 0.3, 0.2]
        choose_num = random_pick(choose_list, probabilities)

        if self.droprate > 0:
            if choose_num == 1:
                # Drop edges: keep a random (1 - droprate) fraction of the entries.
                length = len(row)
                poslist = random.sample(range(length), int(length * (1 - self.droprate)))
                poslist = sorted(poslist)
                row2 = list(np.array(row)[poslist])
                col2 = list(np.array(col)[poslist])
                new_edgeindex2 = [row2, col2]
            elif choose_num == 2:
                # Add edges: connect random node pairs, inserted symmetrically.
                length = len(list(set(sorted(row))))
                add_row = random.sample(range(length), int(length * self.droprate))
                add_col = random.sample(range(length), int(length * self.droprate))
                row2 = row + add_row + add_col
                col2 = col + add_col + add_row
                new_edgeindex2 = [row2, col2]
            elif choose_num == 3:
                # Misplace edges: rewire a random subset of source indices
                # of the original directed edges.
                length = len(init_row)
                mis_index_list = random.sample(range(length), int(length * self.droprate))
                Sort_len = len(list(set(sorted(row))))
                if Sort_len > int(length * self.droprate):
                    mis_value_list = random.sample(range(Sort_len), int(length * self.droprate))
                    for i, item in enumerate(init_row):
                        for mis_i, mis_item in enumerate(mis_index_list):
                            if i == mis_item and mis_value_list[mis_i] != item:
                                init_row[i] = mis_value_list[mis_i]
                row2 = init_row + init_col
                col2 = init_col + init_row
                new_edgeindex2 = [row2, col2]
            else:
                # Fallback: behave like edge dropping.
                length = len(row)
                poslist = random.sample(range(length), int(length * (1 - self.droprate)))
                poslist = sorted(poslist)
                row2 = list(np.array(row)[poslist])
                col2 = list(np.array(col)[poslist])
                new_edgeindex2 = [row2, col2]
        else:
            new_edgeindex = [row, col]
            new_edgeindex2 = [row, col]

        # ========================================= X =========================================
        # Two views of the node features: x0 from the unmasked BERT
        # embeddings, x from the 15%-masked embeddings.
        with open('./bert_w2c/T16/t16_mask_00/' + eid + '.json', 'r') as j_f0:
            json_inf0 = json.load(j_f0)
        x0 = json_inf0[eid]
        x0 = np.array(x0)

        with open('./bert_w2c/T16/t16_mask_015/' + eid + '.json', 'r') as j_f:
            json_inf = json.load(j_f)
        x_list = json_inf[eid]
        x = np.array(x_list)

        with open('./data/label_16.json', 'r') as j_tags:
            tags = json.load(j_tags)

        y = label2id[tags[eid]]

        if self.droprate > 0:
            if choose_num == 1:
                # Feature masking: zero out a random droprate fraction of the
                # node vectors of the second view.
                zero_list = [0] * 768
                x_length = len(x_list)
                r_list = random.sample(range(x_length), int(x_length * self.droprate))
                for r in r_list:
                    x_list[r] = zero_list
                x = np.array(x_list)

        return Data(x0=torch.tensor(x0, dtype=torch.float32),
                    x=torch.tensor(x, dtype=torch.float32),
                    edge_index=torch.LongTensor(new_edgeindex),
                    edge_index2=torch.LongTensor(new_edgeindex2),
                    y1=torch.LongTensor([y]),
                    y2=torch.LongTensor([y]))
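
# Toy illustration of the three perturbations above (hypothetical numbers):
# with droprate = 0.4 and 3 directed reply edges (6 entries after adding the
# reverse direction), "drop" keeps int(6 * 0.6) = 3 random entries, "add"
# appends int(n_nodes * 0.4) random symmetric edges, and "misplace" rewires
# int(3 * 0.4) = 1 randomly chosen source index of the original edges.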

class test_GraphDataset(Dataset):
    def __init__(self, fold_x, droprate):
        self.fold_x = fold_x
        self.droprate = droprate

    def __len__(self):
        return len(self.fold_x)

    def __getitem__(self, index):
        eid = self.fold_x[index]

        # ==================================== edgeindex ====================================
        with open('./data/twitter16/' + eid + '/after_tweets.pkl', 'rb') as t:
            tweets = pickle.load(t)
        tweet2idx = {}
        for i, tweet in enumerate(tweets):
            tweet2idx[tweet] = i

        with open('./data/twitter16/' + eid + '/after_structure.pkl', 'rb') as f:
            inf = pickle.load(f)

        inf = inf[1:]
        new_inf = []
        for pair in inf:
            new_pair = []
            for E in pair:
                if E == 'ROOT':
                    break
                E = tweet2idx[E]
                new_pair.append(E)
            if E != 'ROOT':
                new_inf.append(new_pair)
        new_inf = np.array(new_inf).T
        edgeindex = new_inf

        row = list(edgeindex[0])
        col = list(edgeindex[1])
        burow = list(edgeindex[1])
        bucol = list(edgeindex[0])
        row.extend(burow)
        col.extend(bucol)

        # No augmentation at test time: both views share the same graph
        # and the same unmasked features.
        new_edgeindex = [row, col]
        new_edgeindex2 = [row, col]

        # ========================================= X =========================================
        with open('./bert_w2c/T16/t16_mask_00/' + eid + '.json', 'r') as j_f:
            json_inf = json.load(j_f)
        x = json_inf[eid]
        x = np.array(x)

        with open('./data/label_16.json', 'r') as j_tags:
            tags = json.load(j_tags)

        y = label2id[tags[eid]]

        return Data(x0=torch.tensor(x, dtype=torch.float32),
                    x=torch.tensor(x, dtype=torch.float32),
                    edge_index=torch.LongTensor(new_edgeindex),
                    edge_index2=torch.LongTensor(new_edgeindex2),
                    y1=torch.LongTensor([y]),
                    y2=torch.LongTensor([y]))
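
# Usage sketch (the fold ids are hypothetical): both datasets are consumed by
# torch_geometric's DataLoader, which batches Data objects into one big graph
# with a `batch` vector, e.g.
#     loader = DataLoader(GraphDataset(['eid1', 'eid2'], droprate=0.4), batch_size=2)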
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
# --------- GACL method ------------
# Time 2022

import sys, os
sys.path.append(os.getcwd())
from Process.process import *
#from Process.process_user import *
import torch as th
import torch.nn as nn
from torch_scatter import scatter_mean
import torch.nn.functional as F
import numpy as np
from others.earlystopping import EarlyStopping
from torch_geometric.data import DataLoader
from tqdm import tqdm
from Process.rand5fold import *
from others.evaluate import *
from torch_geometric.nn import GCNConv
import copy
import random


def setup_seed(seed):
    th.manual_seed(seed)
    th.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    th.backends.cudnn.deterministic = True

setup_seed(2022)


class hard_fc(th.nn.Module):
    # Position-wise feed-forward block with a residual connection and
    # LayerNorm; its parameters are the target of the FGM perturbation.
    def __init__(self, d_in, d_hid, DroPout=0):
        super().__init__()
        self.w_1 = nn.Linear(d_in, d_hid)
        self.w_2 = nn.Linear(d_hid, d_in)
        self.layer_norm = nn.LayerNorm(d_in, eps=1e-6)
        self.dropout = nn.Dropout(DroPout)

    def forward(self, x):
        residual = x
        x = self.w_2(F.relu(self.w_1(x)))
        x = self.dropout(x)
        x += residual
        x = self.layer_norm(x)
        return x


class FGM():
    # Fast Gradient Method: perturb selected parameters along their gradient
    # direction, then restore the saved weights after the adversarial pass.
    def __init__(self, model):
        self.model = model
        self.backup = {}

    def attack(self, epsilon=0.3, emb_name='hard_fc1.'):  # T15: epsilon = 0.2
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name:
                self.backup[name] = param.data.clone()
                norm = th.norm(param.grad)
                if norm != 0 and not th.isnan(norm):
                    r_at = epsilon * param.grad / norm
                    param.data.add_(r_at)

    def restore(self, emb_name='hard_fc1.'):
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name:
                assert name in self.backup
                param.data = self.backup[name]
        self.backup = {}
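
# In one training step the attack amounts to w <- w + epsilon * g / ||g||_2
# for every parameter w of hard_fc1 with gradient g; a second forward/backward
# pass on the perturbed model accumulates the adversarial gradient before
# restore() puts the original weights back (see the S2 block in train_GCN).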

class GCN_Net(th.nn.Module):
    def __init__(self, in_feats, hid_feats, out_feats):
        super(GCN_Net, self).__init__()
        self.conv1 = GCNConv(in_feats, hid_feats)
        self.conv2 = GCNConv(hid_feats, out_feats)
        self.fc = th.nn.Linear(2 * out_feats, 4)
        self.hard_fc1 = hard_fc(out_feats, out_feats)
        self.hard_fc2 = hard_fc(out_feats, out_feats)  # optional

    def forward(self, data):
        init_x0, init_x, edge_index1, edge_index2 = data.x0, data.x, data.edge_index, data.edge_index2

        # View 1: unmasked features + original graph.
        x1 = self.conv1(init_x0, edge_index1)
        x1 = F.relu(x1)
        x1 = self.conv2(x1, edge_index1)
        x1 = F.relu(x1)
        x1 = scatter_mean(x1, data.batch, dim=0)  # graph-level mean pooling
        x1_g = x1
        x1 = self.hard_fc1(x1)
        x1_t = x1
        x1 = th.cat((x1_g, x1_t), 1)

        # View 2: masked features + perturbed graph.
        x2 = self.conv1(init_x, edge_index2)
        x2 = F.relu(x2)
        x2 = self.conv2(x2, edge_index2)
        x2 = F.relu(x2)
        x2 = scatter_mean(x2, data.batch, dim=0)
        x2_g = x2
        x2 = self.hard_fc1(x2)
        x2_t = x2
        x2 = th.cat((x2_g, x2_t), 1)

        x = th.cat((x1, x2), 0)
        y = th.cat((data.y1, data.y2), 0)

        # Supervised contrastive loss over the 2B graph embeddings.
        x_T = x.t()
        dot_matrix = th.mm(x, x_T)
        x_norm = th.norm(x, p=2, dim=1)
        x_norm = x_norm.unsqueeze(1)
        norm_matrix = th.mm(x_norm, x_norm.t())

        t = 0.3  # temperature; pheme: t = 0.6
        cos_matrix = (dot_matrix / norm_matrix) / t
        cos_matrix = th.exp(cos_matrix)
        diag = th.diag(cos_matrix)
        cos_matrix_diag = th.diag_embed(diag)
        cos_matrix = cos_matrix - cos_matrix_diag  # zero out self-similarities
        y_matrix_T = y.expand(len(y), len(y))
        y_matrix = y_matrix_T.t()
        y_matrix = th.ne(y_matrix, y_matrix_T).float()  # 1 where labels differ
        neg_matrix = cos_matrix * y_matrix
        pos_y_matrix = y_matrix * (-1) + 1  # 1 where labels match
        pos_matrix_list = (cos_matrix * pos_y_matrix).chunk(2, dim=0)
        pos_matrix = pos_matrix_list[0]
        neg_matrix = (th.sum(neg_matrix, dim=1)).unsqueeze(1)
        sum_neg_matrix_list = neg_matrix.chunk(2, dim=0)
        neg_matrix = sum_neg_matrix_list[0]
        div = pos_matrix / neg_matrix
        div = (th.sum(div, dim=1)).unsqueeze(1)
        div = div / batchsize  # uses the global batch size
        log = th.log(div)
        SUM = th.sum(log)
        cl_loss = -SUM

        x = self.fc(x)
        x = F.log_softmax(x, dim=1)

        return x, cl_loss, y
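
# Shape sketch for the loss above: with batch size B, x is (2B, 2*out_feats)
# (B weak-view graphs stacked on B augmented-view graphs), cos_matrix is
# (2B, 2B) with its diagonal zeroed, and only the first B rows are kept, so
# cl_loss = -sum_i log( (sum_j pos_ij / sum_j neg_ij) / batchsize ).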

def train_GCN(x_test, x_train, lr, weight_decay, patience, n_epochs, batchsize, dataname):
    model = GCN_Net(768, 64, 64).to(device)
    fgm = FGM(model)
    for para in model.hard_fc1.parameters():
        para.requires_grad = False
    for para in model.hard_fc2.parameters():
        para.requires_grad = False
    optimizer = th.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=lr, weight_decay=weight_decay)

    # optional ------ S1 ----------
    for para in model.hard_fc1.parameters():
        para.requires_grad = True
    for para in model.hard_fc2.parameters():
        para.requires_grad = True
    optimizer_hard = th.optim.SGD([{'params': model.hard_fc1.parameters()},
                                   {'params': model.hard_fc2.parameters()}], lr=0.001)

    train_losses = []
    val_losses = []
    train_accs = []
    val_accs = []
    early_stopping = EarlyStopping(patience=patience, verbose=True)

    for epoch in range(n_epochs):
        # Re-sample the graph augmentations every epoch.
        traindata_list, testdata_list = loadData(dataname, x_train, x_test, droprate=0.4)  # T15: droprate = 0.1
        train_loader = DataLoader(traindata_list, batch_size=batchsize, shuffle=True, num_workers=5)
        test_loader = DataLoader(testdata_list, batch_size=batchsize, shuffle=True, num_workers=5)
        model.train()
        avg_loss = []
        avg_acc = []
        batch_idx = 0
        tqdm_train_loader = tqdm(train_loader)
        beta = 0.001
        for Batch_data in tqdm_train_loader:
            Batch_data.to(device)
            out_labels, cl_loss, y = model(Batch_data)
            finalloss = F.nll_loss(out_labels, y)
            loss = finalloss + beta * cl_loss
            avg_loss.append(loss.item())

            ## ------------- S1 (alternative adversarial schedule) ------------- ##
            '''
            optimizer.zero_grad()
            loss.backward(retain_graph=True)
            optimizer.step()
            epsilon = 3
            loss_ad = epsilon / (finalloss + 0.001 * cl_loss)
            print('loss_ad: ', loss_ad)
            optimizer_hard.zero_grad()
            loss_ad.backward()
            optimizer_hard.step()
            '''
            ## ------------------------------------------------------------------ ##

            ## ------------- S2: FGM adversarial training ------------- ##
            optimizer.zero_grad()
            loss.backward()
            fgm.attack()
            out_labels, cl_loss, y = model(Batch_data)
            finalloss = F.nll_loss(out_labels, y)
            loss_adv = finalloss + beta * cl_loss
            loss_adv.backward()
            fgm.restore()
            optimizer.step()
            ## ---------------------------------------------------------- ##

            _, pred = out_labels.max(dim=-1)
            correct = pred.eq(y).sum().item()
            train_acc = correct / len(y)
            avg_acc.append(train_acc)
            print("Epoch {:05d} | Batch{:02d} | Train_Loss {:.4f}| Train_Accuracy {:.4f}".format(
                epoch, batch_idx, loss.item(), train_acc))
            batch_idx = batch_idx + 1
        train_losses.append(np.mean(avg_loss))
        train_accs.append(np.mean(avg_acc))
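
        # Each batch above performs two backward passes -- one on the clean
        # loss and one on the adversarial loss computed while hard_fc1 is
        # perturbed -- and their gradients accumulate before a single
        # optimizer.step().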

        temp_val_losses = []
        temp_val_accs = []
        temp_val_Acc_all, temp_val_Acc1, temp_val_Prec1, temp_val_Recll1, temp_val_F1, \
        temp_val_Acc2, temp_val_Prec2, temp_val_Recll2, temp_val_F2, \
        temp_val_Acc3, temp_val_Prec3, temp_val_Recll3, temp_val_F3, \
        temp_val_Acc4, temp_val_Prec4, temp_val_Recll4, temp_val_F4 = [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], []
        model.eval()
        tqdm_test_loader = tqdm(test_loader)
        for Batch_data in tqdm_test_loader:
            Batch_data.to(device)
            val_out, val_cl_loss, y = model(Batch_data)
            val_loss = F.nll_loss(val_out, y)
            temp_val_losses.append(val_loss.item())
            _, val_pred = val_out.max(dim=1)
            correct = val_pred.eq(y).sum().item()
            val_acc = correct / len(y)
            Acc_all, Acc1, Prec1, Recll1, F1, Acc2, Prec2, Recll2, F2, Acc3, Prec3, Recll3, F3, Acc4, Prec4, Recll4, F4 = evaluation4class(
                val_pred, y)
            temp_val_Acc_all.append(Acc_all), temp_val_Acc1.append(Acc1), temp_val_Prec1.append(
                Prec1), temp_val_Recll1.append(Recll1), temp_val_F1.append(F1), \
            temp_val_Acc2.append(Acc2), temp_val_Prec2.append(Prec2), temp_val_Recll2.append(
                Recll2), temp_val_F2.append(F2), \
            temp_val_Acc3.append(Acc3), temp_val_Prec3.append(Prec3), temp_val_Recll3.append(
                Recll3), temp_val_F3.append(F3), \
            temp_val_Acc4.append(Acc4), temp_val_Prec4.append(Prec4), temp_val_Recll4.append(
                Recll4), temp_val_F4.append(F4)
            temp_val_accs.append(val_acc)
        val_losses.append(np.mean(temp_val_losses))
        val_accs.append(np.mean(temp_val_accs))
        print("Epoch {:05d} | Val_Loss {:.4f} | Val_Accuracy {:.4f}".format(
            epoch, np.mean(temp_val_losses), np.mean(temp_val_accs)))

        res = ['acc:{:.4f}'.format(np.mean(temp_val_Acc_all)),
               'C1:{:.4f},{:.4f},{:.4f},{:.4f}'.format(np.mean(temp_val_Acc1), np.mean(temp_val_Prec1),
                                                       np.mean(temp_val_Recll1), np.mean(temp_val_F1)),
               'C2:{:.4f},{:.4f},{:.4f},{:.4f}'.format(np.mean(temp_val_Acc2), np.mean(temp_val_Prec2),
                                                       np.mean(temp_val_Recll2), np.mean(temp_val_F2)),
               'C3:{:.4f},{:.4f},{:.4f},{:.4f}'.format(np.mean(temp_val_Acc3), np.mean(temp_val_Prec3),
                                                       np.mean(temp_val_Recll3), np.mean(temp_val_F3)),
               'C4:{:.4f},{:.4f},{:.4f},{:.4f}'.format(np.mean(temp_val_Acc4), np.mean(temp_val_Prec4),
                                                       np.mean(temp_val_Recll4), np.mean(temp_val_F4))]
        print('results:', res)

        # Only start early-stopping bookkeeping after a warm-up of 25 epochs.
        if epoch > 25:
            early_stopping(np.mean(temp_val_losses), np.mean(temp_val_accs), np.mean(temp_val_F1), np.mean(temp_val_F2),
                           np.mean(temp_val_F3), np.mean(temp_val_F4), model, 'GACL', dataname)
        accs = np.mean(temp_val_accs)
        F1 = np.mean(temp_val_F1)
        F2 = np.mean(temp_val_F2)
        F3 = np.mean(temp_val_F3)
        F4 = np.mean(temp_val_F4)
        if early_stopping.early_stop:
            print("Early stopping")
            accs = early_stopping.accs
            F1 = early_stopping.F1
            F2 = early_stopping.F2
            F3 = early_stopping.F3
            F4 = early_stopping.F4
            break
    return accs, F1, F2, F3, F4


## --------------------------------- main ---------------------------------
scale = 1
lr = 0.0005 * scale
weight_decay = 1e-4
patience = 10
n_epochs = 200
batchsize = 120
datasetname = 'Twitter16'  # (1)Twitter15 (2)pheme (3)weibo
device = th.device('cuda:4' if th.cuda.is_available() else 'cpu')
test_accs = []
NR_F1 = []  # NR
FR_F1 = []  # FR
TR_F1 = []  # TR
UR_F1 = []  # UR

data_path = './data/twitter16/'
label_path = './data/Twitter16_label_All.txt'

fold0_x_test, fold0_x_train, \
fold1_x_test, fold1_x_train, \
fold2_x_test, fold2_x_train, \
fold3_x_test, fold3_x_train, \
fold4_x_test, fold4_x_train = load5foldData(datasetname, data_path, label_path)

print('fold0 shape: ', len(fold0_x_test), len(fold0_x_train))
print('fold1 shape: ', len(fold1_x_test), len(fold1_x_train))
print('fold2 shape: ', len(fold2_x_test), len(fold2_x_train))
print('fold3 shape: ', len(fold3_x_test), len(fold3_x_train))
print('fold4 shape: ', len(fold4_x_test), len(fold4_x_train))

accs0, F1_0, F2_0, F3_0, F4_0 = train_GCN(fold0_x_test, fold0_x_train, lr, weight_decay, patience, n_epochs, batchsize, datasetname)
accs1, F1_1, F2_1, F3_1, F4_1 = train_GCN(fold1_x_test, fold1_x_train, lr, weight_decay, patience, n_epochs, batchsize, datasetname)
accs2, F1_2, F2_2, F3_2, F4_2 = train_GCN(fold2_x_test, fold2_x_train, lr, weight_decay, patience, n_epochs, batchsize, datasetname)
accs3, F1_3, F2_3, F3_3, F4_3 = train_GCN(fold3_x_test, fold3_x_train, lr, weight_decay, patience, n_epochs, batchsize, datasetname)
accs4, F1_4, F2_4, F3_4, F4_4 = train_GCN(fold4_x_test, fold4_x_train, lr, weight_decay, patience, n_epochs, batchsize, datasetname)
test_accs.append((accs0 + accs1 + accs2 + accs3 + accs4) / 5)
NR_F1.append((F1_0 + F1_1 + F1_2 + F1_3 + F1_4) / 5)
FR_F1.append((F2_0 + F2_1 + F2_2 + F2_3 + F2_4) / 5)
TR_F1.append((F3_0 + F3_1 + F3_2 + F3_3 + F3_4) / 5)
UR_F1.append((F4_0 + F4_1 + F4_2 + F4_3 + F4_4) / 5)
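# Each list above holds a single 5-fold average, so the sums below are just
# those averages.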
{:.4f}".format(sum(test_accs), sum(NR_F1), sum(FR_F1), sum(TR_F1), sum(UR_F1))) 339 | --------------------------------------------------------------------------------