├── README.md
└── mine_next
    ├── functions
    │   ├── __pycache__
    │   │   ├── dataset.cpython-37.pyc
    │   │   ├── main_function2.cpython-37.pyc
    │   │   ├── sent_to_graph.cpython-37.pyc
    │   │   └── stance_main_func.cpython-37.pyc
    │   ├── dataset.py
    │   ├── dev_error.json
    │   ├── gcn_test2.py
    │   ├── heterograph.py
    │   ├── homograph.py
    │   ├── main_function.py
    │   ├── main_function2.py
    │   ├── make_graph.py
    │   ├── pos_analy.py
    │   ├── save_graph.py
    │   ├── sent2_to_graph.py
    │   ├── sent_to_graph.py
    │   ├── stance_main_func.py
    │   ├── test.py
    │   ├── test_error.json
    │   ├── textrank.py
    │   ├── txt2json.py
    │   ├── use_bertopic.py
    │   ├── use_bertopic2.py
    │   ├── use_bertopic3.py
    │   └── use_firstsent.py
    ├── model
    │   ├── __pycache__
    │   │   └── modeling.cpython-37.pyc
    │   └── modeling.py
    ├── run_base.py
    ├── run_debug.py
    ├── run_grad1.py
    ├── run_grand.py
    ├── run_grand2
    ├── run_grand2.py
    ├── run_grand3_test.py
    ├── run_one.py
    └── run_stance.py
/README.md:
--------------------------------------------------------------------------------
 1 | How to run training
 2 | 
 3 | python run.py --train_file TRAIN_FILE_PATH --save_dir SAVE_DIRECTORY_NAME --do_train True --init_weight True
 4 | 
 5 | How to run testing
 6 | 
 7 | python run.py --predict_file PREDICT_FILE_PATH --output_dir MODEL_DIRECTORY_NAME --checkpoint MODEL_CHECKPOINT --do_eval True
 8 | 
 9 | Concrete example
10 | 
11 | python run.py --predict_file extractive_summary_mrc_test_4.0.json --output_dir ./ --checkpoint 16000 --do_eval True
12 | 
13 | --output_dir : directory from which the saved model is loaded. Used together with --checkpoint.
14 | 
15 | ex)
16 | --output_dir : ./
17 | --checkpoint : 16000
18 | loads the model stored in ./checkpoint-16000
19 | 
20 | 
--------------------------------------------------------------------------------
/mine_next/functions/__pycache__/dataset.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KUNLP/Argument_Classifier/a44f3764516e0a3450e545fbc9e8caa12f8ee466/mine_next/functions/__pycache__/dataset.cpython-37.pyc
--------------------------------------------------------------------------------
/mine_next/functions/__pycache__/main_function2.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KUNLP/Argument_Classifier/a44f3764516e0a3450e545fbc9e8caa12f8ee466/mine_next/functions/__pycache__/main_function2.cpython-37.pyc
--------------------------------------------------------------------------------
/mine_next/functions/__pycache__/sent_to_graph.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KUNLP/Argument_Classifier/a44f3764516e0a3450e545fbc9e8caa12f8ee466/mine_next/functions/__pycache__/sent_to_graph.cpython-37.pyc
--------------------------------------------------------------------------------
/mine_next/functions/__pycache__/stance_main_func.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KUNLP/Argument_Classifier/a44f3764516e0a3450e545fbc9e8caa12f8ee466/mine_next/functions/__pycache__/stance_main_func.cpython-37.pyc
--------------------------------------------------------------------------------
/mine_next/functions/dataset.py:
--------------------------------------------------------------------------------
 1 | import json, ast
 2 | import benepar
 3 | import dgl.frame
 4 | from torch.utils.data import TensorDataset, Dataset
 5 | import torch
 6 | from transformers import AutoTokenizer
 7 | import pandas as pd
 8 | import argparse
 9 | from tqdm import tqdm
10 | import spacy
11 | from mine_next.functions.sent_to_graph import constituent_to_tree, get_cons_tag_vocab, final_graph, all_process_graph
12 | import os, string
13 | 
14 | nlp = spacy.load('en_core_web_sm')
15 | nlp.add_pipe('benepar', config={'model': 'benepar_en3'})
16 | 
17 | 
18 | def convert_only_sentence2tensordataset(dataset, pseudo, tokenizer, max_length, mode):
19 |     printable = set(string.printable)
20 |     total_idx = []
21 |     total_input_ids = []
22 |     total_attention_mask = []
23 |     total_label = []
24 |     total_token_type_ids = []
25 |     total_sim_label = []
26 |     claim_sentences = dataset['claim_sentence'].tolist()
27 |     claim_labels = dataset['claim_label'].tolist()
28 |     claim_article_id = dataset['article_id'].tolist()
29 |     gold_topic_sentences = dataset['topic_sentence'].tolist()
30 | 
31 |     claim_labels = [0 if label == 'O' else 1 for label in claim_labels]
32 |     # This block is the usual path for loading precomputed graphs (single-graph version, kept for reference).
33 |     # total_graph = {}
34 |     # max_constituent_length = 600
35 |     # total_constituent_labels = []
36 |     # with open('../data/IAM/claims/graphs/{}_constituent.txt'.format(mode), 'r', encoding='utf-8') as f:
37 |     #     constituents = f.readlines()
38 |     # # test
39 |     # for constituent in constituents:
40 |     #     constituent = ast.literal_eval(constituent.replace('\n', ''))
41 |     #     total_constituent_labels.append(constituent+[-1]*(max_constituent_length-len(constituent)))
42 |     # graphs = os.listdir('../data/IAM/claims/graphs')
43 |     # graphs_list = [os.path.join('../data/IAM/claims/graphs', file) for file in graphs if ('dgl' in file and mode in file)]  # only the train or dev dgl files
44 |     # for graph in graphs_list:
45 |     #     (g,), _ = dgl.load_graphs(graph)
46 |     #     idx = graph.split('/')[-1].split('_')[-1].split('.')[0]
47 |     #     total_graph[int(idx)] = g
48 | 
49 |     # Usual path: load the precomputed first/second-order graphs from disk.
50 |     total_graph_first = {}
51 |     total_graph_second = {}
52 |     max_constituent_length = 600
53 |     total_constituent_label_first = []
54 |     total_constituent_label_second = []
55 |     with open('../data/IAM/claims/graphs/{}_constituent_first_second.txt'.format(mode), 'r', encoding='utf-8') as f:
56 |         constituents = f.readlines()
57 |     for constituent in constituents:
58 |         constituent = ast.literal_eval(constituent.replace('\n', ''))
59 |         total_constituent_label_first.append(constituent[0]+[-1]*(max_constituent_length-len(constituent[0])))
60 |         total_constituent_label_second.append(constituent[1]+[-1]*(max_constituent_length-len(constituent[1])))
61 |     graphs = os.listdir('../data/IAM/claims/graphs')
62 |     graphs_list = [os.path.join('../data/IAM/claims/graphs', file) for file in graphs if ('dgl' in file and mode in file and 'first' in file)]  # only the train or dev dgl files
63 |     for graph in graphs_list:
64 |         (g,), _ = dgl.load_graphs(graph)
65 |         idx = graph.split('/')[-1].split('_')[-1].split('.')[0]
66 |         total_graph_first[int(idx)] = g
67 |     graphs_list = [os.path.join('../data/IAM/claims/graphs', file) for file in graphs if ('dgl' in file and mode in file and 'second' in file)]  # only the train or dev dgl files
68 |     for graph in graphs_list:
69 |         (g,), _ = dgl.load_graphs(graph)
70 |         idx = graph.split('/')[-1].split('_')[-1].split('.')[0]
71 |         total_graph_second[int(idx)] = g
72 | 
73 |     for idx, (topic, claim_sentence, claim_label, article_id) in tqdm(enumerate(zip(gold_topic_sentences, claim_sentences, claim_labels, claim_article_id)), desc='convert to data to tensordataset', total=len(claim_labels)):
74 |         claim_sentence = claim_sentence.lower().replace('“', '"').replace('”', '"')
75 |         claim_sentence = "".join(filter(lambda x : x in printable, claim_sentence))
76 | 
77 |         # claim_graph_first, claim_graph_second,
constituent_label_first, constituent_label_second = all_process_graph(nlp, tokenizer, claim_sentence) 78 | # total_graph_first[idx] = claim_graph_first 79 | # total_graph_second[idx] = claim_graph_second 80 | # constituent_label_first = constituent_label_first.tolist() + [-1]*(max_constituent_length-len(constituent_label_first.tolist())) 81 | # constituent_label_second = constituent_label_second.tolist() + [-1]*(max_constituent_length-len(constituent_label_second.tolist())) 82 | # total_constituent_label_first.append(constituent_label_first) 83 | # total_constituent_label_second.append(constituent_label_second) 84 | 85 | # 슈도 토픽 할때 86 | #process_sentence = tokenizer(pseudo[article_id], claim_sentence, max_length=max_length, padding='max_length', truncation=True) 87 | #process_sentence = tokenizer(claim_sentence, max_length=max_length, padding='max_length', truncation=True) 88 | process_sentence = tokenizer(topic, claim_sentence, max_length=max_length, padding='max_length', truncation=True) 89 | input_ids = process_sentence['input_ids'] 90 | attention_mask = process_sentence['attention_mask'] 91 | # 주제에 대한부분만 1로 하고 나머진 0 으로 하는 식 92 | sep_index = [idx for idx, ids in enumerate(input_ids) if ids == 2] 93 | try: 94 | second_sep_index = sep_index[1] 95 | token_type_ids = [0] * second_sep_index 96 | token_type_ids += [1] * (len(input_ids)-len(token_type_ids)) 97 | except IndexError: 98 | token_type_ids = [0] * max_length 99 | # 주장일 때 100 | if claim_label == 1: 101 | sim_label = 1 102 | # 주장이 아닐때 103 | elif claim_label == 0: 104 | sim_label = -1 105 | 106 | total_idx.append(idx) 107 | total_input_ids.append(input_ids) 108 | total_attention_mask.append(attention_mask) 109 | total_token_type_ids.append(token_type_ids) 110 | total_label.append(claim_label) 111 | total_sim_label.append(sim_label) 112 | #total_graph[idx] = claim_graph 113 | #total_constituent_labels.append(constituent_label_list) 114 | if idx < 3: 115 | print() 116 | print("****EXAMPLE****") 117 | print("topic sentence : {}".format(topic)) 118 | print("claim sentence : {}".format(claim_sentence)) 119 | print("claim sentence input ids : {}".format(input_ids)) 120 | print("claim sentence attention mask : {}".format(attention_mask)) 121 | print("claim sentence token type ids : {}".format(token_type_ids)) 122 | print("label : {}".format(claim_label)) 123 | print("sim label : {}".format(sim_label)) 124 | 125 | 126 | total_idx = torch.tensor(total_idx, dtype=torch.long) 127 | total_input_ids = torch.tensor(total_input_ids, dtype=torch.long) 128 | total_attention_mask = torch.tensor(total_attention_mask, dtype=torch.long) 129 | total_token_type_ids = torch.tensor(total_token_type_ids, dtype=torch.long) 130 | total_label = torch.tensor(total_label, dtype=torch.long) 131 | total_sim_label = torch.tensor(total_sim_label, dtype=torch.long) 132 | total_constituent_label_first = torch.tensor(total_constituent_label_first, dtype=torch.long) 133 | total_constituent_label_second = torch.tensor(total_constituent_label_second, dtype=torch.long) 134 | dataset = TensorDataset(total_idx, total_input_ids, total_attention_mask, total_token_type_ids, total_label, total_sim_label, 135 | total_constituent_label_first, total_constituent_label_second) 136 | 137 | return dataset, total_graph_first, total_graph_second 138 | 139 | 140 | def convert_only_sentence2tensordataset(dataset, pseudo, tokenizer, max_length, mode): 141 | printable = set(string.printable) 142 | total_idx = [] 143 | total_input_ids = [] 144 | total_attention_mask = [] 145 | total_label = [] 
146 | total_token_type_ids = [] 147 | total_sim_label = [] 148 | claim_sentences = dataset['claim_sentence'].tolist() 149 | claim_labels = dataset['claim_label'].tolist() 150 | claim_article_id = dataset['article_id'].tolist() 151 | gold_topic_sentences = dataset['topic_sentence'].tolist() 152 | 153 | claim_labels = [0 if label is 'O' else 1 for label in claim_labels] 154 | # 여기부분은 평소에 불러다 쓸때 사용하는 부분 155 | # total_graph = {} 156 | # max_constituent_length = 600 157 | # total_constituent_labels = [] 158 | # with open('../data/IAM/claims/graphs/{}_constituent.txt'.format(mode), 'r', encoding='utf-8') as f: 159 | # constituents = f.readlines() 160 | # #테스트 161 | # for constituent in constituents: 162 | # constituent = ast.literal_eval(constituent.replace('\n', '')) 163 | # total_constituent_labels.append(constituent+[-1]*(max_constituent_length-len(constituent))) 164 | # graphs = os.listdir('../data/IAM/claims/graphs') 165 | # graphs_list = [os.path.join('../data/IAM/claims/graphs', file) for file in graphs if ('dgl' in file and mode in file)] #train이나 dev 의 dgl만 166 | # for graph in graphs_list: 167 | # (g,), _ = dgl.load_graphs(graph) 168 | # idx = graph.split('/')[-1].split('_')[-1].split('.')[0] 169 | # total_graph[int(idx)] = g 170 | 171 | #평소 불러쓸때 172 | total_graph_first = {} 173 | total_graph_second = {} 174 | max_constituent_length = 600 175 | total_constituent_label_first = [] 176 | total_constituent_label_second = [] 177 | with open('../data/IAM/claims/graphs/{}_constituent_first_second.txt'.format(mode), 'r', encoding='utf-8') as f: 178 | constituents = f.readlines() 179 | for constituent in constituents: 180 | constituent = ast.literal_eval(constituent.replace('\n', '')) 181 | total_constituent_label_first.append(constituent[0]+[-1]*(max_constituent_length-len(constituent[0]))) 182 | total_constituent_label_second.append(constituent[1]+[-1]*(max_constituent_length-len(constituent[1]))) 183 | graphs = os.listdir('../data/IAM/claims/graphs') 184 | graphs_list = [os.path.join('../data/IAM/claims/graphs', file) for file in graphs if ('dgl' in file and mode in file and 'first' in file)] #train이나 dev 의 dgl만 185 | for graph in graphs_list: 186 | (g,), _ = dgl.load_graphs(graph) 187 | idx = graph.split('/')[-1].split('_')[-1].split('.')[0] 188 | total_graph_first[int(idx)] = g 189 | graphs_list = [os.path.join('../data/IAM/claims/graphs', file) for file in graphs if ('dgl' in file and mode in file and 'second' in file)] #train이나 dev 의 dgl만 190 | for graph in graphs_list: 191 | (g,), _ = dgl.load_graphs(graph) 192 | idx = graph.split('/')[-1].split('_')[-1].split('.')[0] 193 | total_graph_second[int(idx)] = g 194 | 195 | for idx, (topic, claim_sentence, claim_label, article_id) in tqdm(enumerate(zip(gold_topic_sentences, claim_sentences, claim_labels, claim_article_id)), desc='convert to data to tensordataset', total=len(claim_labels)): 196 | claim_sentence = claim_sentence.lower().replace('“', '"').replace('”', '"') 197 | claim_sentence = "".join(filter(lambda x : x in printable, claim_sentence)) 198 | 199 | # claim_graph_first, claim_graph_second, constituent_label_first, constituent_label_second = all_process_graph(nlp, tokenizer, claim_sentence) 200 | # total_graph_first[idx] = claim_graph_first 201 | # total_graph_second[idx] = claim_graph_second 202 | # constituent_label_first = constituent_label_first.tolist() + [-1]*(max_constituent_length-len(constituent_label_first.tolist())) 203 | # constituent_label_second = constituent_label_second.tolist() + 
[-1]*(max_constituent_length-len(constituent_label_second.tolist())) 204 | # total_constituent_label_first.append(constituent_label_first) 205 | # total_constituent_label_second.append(constituent_label_second) 206 | 207 | # 슈도 토픽 할때 208 | process_sentence = tokenizer(pseudo[article_id], claim_sentence, max_length=max_length, padding='max_length', truncation=True) 209 | # 그냥 문장 하나만 할 때 210 | # process_sentence = tokenizer(claim_sentence, max_length=max_length, padding='max_length', truncation=True) 211 | # 골든 토픽과 문장 하나 212 | # process_sentence = tokenizer(topic, claim_sentence, max_length=max_length, padding='max_length', truncation=True) 213 | 214 | input_ids = process_sentence['input_ids'] 215 | attention_mask = process_sentence['attention_mask'] 216 | # 주제에 대한부분만 1로 하고 나머진 0 으로 하는 식 217 | sep_index = [idx for idx, ids in enumerate(input_ids) if ids == 2] 218 | try: 219 | second_sep_index = sep_index[1] 220 | token_type_ids = [0] * second_sep_index 221 | token_type_ids += [1] * (len(input_ids)-len(token_type_ids)) 222 | except IndexError: 223 | token_type_ids = [0] * max_length 224 | # 주장일 때 225 | if claim_label == 1: 226 | sim_label = 1 227 | # 주장이 아닐때 228 | elif claim_label == 0: 229 | sim_label = -1 230 | 231 | total_idx.append(idx) 232 | total_input_ids.append(input_ids) 233 | total_attention_mask.append(attention_mask) 234 | total_token_type_ids.append(token_type_ids) 235 | total_label.append(claim_label) 236 | total_sim_label.append(sim_label) 237 | #total_graph[idx] = claim_graph 238 | #total_constituent_labels.append(constituent_label_list) 239 | if idx < 3: 240 | print() 241 | print("****EXAMPLE****") 242 | print("topic sentence : {}".format(topic)) 243 | print("pseudo topic sentence : {}".format(pseudo[article_id])) 244 | print("claim sentence : {}".format(claim_sentence)) 245 | print("claim sentence input ids : {}".format(input_ids)) 246 | print("claim sentence attention mask : {}".format(attention_mask)) 247 | print("claim sentence token type ids : {}".format(token_type_ids)) 248 | print("label : {}".format(claim_label)) 249 | print("sim label : {}".format(sim_label)) 250 | 251 | 252 | total_idx = torch.tensor(total_idx, dtype=torch.long) 253 | total_input_ids = torch.tensor(total_input_ids, dtype=torch.long) 254 | total_attention_mask = torch.tensor(total_attention_mask, dtype=torch.long) 255 | total_token_type_ids = torch.tensor(total_token_type_ids, dtype=torch.long) 256 | total_label = torch.tensor(total_label, dtype=torch.long) 257 | total_sim_label = torch.tensor(total_sim_label, dtype=torch.long) 258 | total_constituent_label_first = torch.tensor(total_constituent_label_first, dtype=torch.long) 259 | total_constituent_label_second = torch.tensor(total_constituent_label_second, dtype=torch.long) 260 | dataset = TensorDataset(total_idx, total_input_ids, total_attention_mask, total_token_type_ids, total_label, total_sim_label, 261 | total_constituent_label_first, total_constituent_label_second) 262 | 263 | return dataset, total_graph_first, total_graph_second 264 | 265 | 266 | def convert_data2tensordataset(dataset, tokenizer, max_length, mode): 267 | total_input_ids = [] 268 | total_attention_mask = [] 269 | total_label = [] 270 | total_token_type_ids = [] 271 | total_sim_label = [] 272 | total_idx = [] 273 | claim_sentences = dataset['claim_sentence'].tolist() 274 | claim_labels = dataset['claim_label'].tolist() 275 | claim_labels = [0 if label is 'O' else 1 for label in claim_labels] 276 | topic_sentences = dataset['topic_sentence'].tolist() 277 | for idx, 
(topic_sentence, claim_sentence, claim_label) in tqdm(enumerate(zip(topic_sentences, claim_sentences, claim_labels)), desc='convert to data to tensordataset', total=len(claim_labels)): 278 | process_sentence = tokenizer(topic_sentence, claim_sentence, max_length=max_length, padding='max_length', truncation=True) 279 | input_ids = process_sentence['input_ids'] 280 | attention_mask = process_sentence['attention_mask'] 281 | # 주제에 대한부분만 1로 하고 나머진 0 으로 하는 식 282 | sep_index = [idx for idx, ids in enumerate(input_ids) if ids == 2] 283 | second_sep_index = sep_index[1] 284 | token_type_ids = [0] * second_sep_index 285 | token_type_ids += [1] * (len(input_ids)-len(token_type_ids)) 286 | # 주장일 때 287 | if claim_label == 1: 288 | sim_label = 1 289 | # 주장이 아닐때 290 | elif claim_label == 0: 291 | sim_label = -1 292 | total_idx.append(idx) 293 | total_input_ids.append(input_ids) 294 | total_attention_mask.append(attention_mask) 295 | total_token_type_ids.append(token_type_ids) 296 | total_label.append(claim_label) 297 | total_sim_label.append(sim_label) 298 | if idx < 3: 299 | print() 300 | print("****EXAMPLE****") 301 | print("topic sentence : {}".format(topic_sentence)) 302 | print("claim sentence : {}".format(claim_sentence)) 303 | print("topic, claim sentence input ids : {}".format(input_ids)) 304 | print("topic, claim sentence attention mask : {}".format(attention_mask)) 305 | print("topic, claim sentence token type ids : {}".format(token_type_ids)) 306 | print("label : {}".format(claim_label)) 307 | print("sim label : {}".format(sim_label)) 308 | total_idx = torch.tensor(total_idx, dtype=torch.long) 309 | total_input_ids = torch.tensor(total_input_ids, dtype=torch.long) 310 | total_attention_mask = torch.tensor(total_attention_mask, dtype=torch.long) 311 | total_token_type_ids = torch.tensor(total_token_type_ids, dtype=torch.long) 312 | total_label = torch.tensor(total_label, dtype=torch.long) 313 | total_sim_label = torch.tensor(total_sim_label, dtype=torch.long) 314 | dataset = TensorDataset(total_idx, total_input_ids, total_attention_mask, total_token_type_ids, total_label, total_sim_label) 315 | return dataset 316 | 317 | 318 | def convert_stance_data2tensordataset(dataset, tokenizer, max_length, mode=None): 319 | total_idx = [] 320 | total_input_ids = [] 321 | total_attention_mask = [] 322 | total_label = [] 323 | total_token_type_ids = [] 324 | total_sim_label = [] 325 | total_stance_label = [] 326 | #dataset = dataset[dataset['claim_label'] == 'C'] 327 | 328 | claim_sentences = dataset['claim_sentence'].tolist() 329 | topic_sentences = dataset['topic_sentence'].tolist() 330 | stance_labels = dataset['stance_labels'].tolist() 331 | for idx, (topic_sentence, claim_sentence, stance_label) in tqdm(enumerate(zip(topic_sentences, claim_sentences, stance_labels)), desc='convert to data to tensordataset', total=len(stance_labels)): 332 | process_sentence = tokenizer(topic_sentence, claim_sentence, max_length=max_length, padding='max_length', truncation=True) 333 | input_ids = process_sentence['input_ids'] 334 | attention_mask = process_sentence['attention_mask'] 335 | # 주제에 대한부분만 1로 하고 나머진 0 으로 하는 식 336 | try: 337 | sep_index = [idx for idx, ids in enumerate(input_ids) if ids == 2] 338 | second_sep_index = sep_index[1] 339 | token_type_ids = [0] * second_sep_index 340 | token_type_ids += [1] * (len(input_ids)-len(token_type_ids)) 341 | except IndexError: 342 | token_type_ids = [0] * max_length 343 | #sent_attention_mask = (1-token_type_ids) * attention_mask 344 | total_idx.append(idx) 345 | 
total_input_ids.append(input_ids) 346 | total_attention_mask.append(attention_mask) 347 | total_token_type_ids.append(token_type_ids) 348 | if stance_label == -1: 349 | total_stance_label.append(0) 350 | else: 351 | total_stance_label.append(1) 352 | #total_stance_label.append(stance_label) 353 | if idx < 3: 354 | print() 355 | print("****EXAMPLE****") 356 | print("topic sentence : {}".format(topic_sentence)) 357 | print("claim sentence : {}".format(claim_sentence)) 358 | print("topic, claim sentence input ids : {}".format(input_ids)) 359 | print("topic, claim sentence attention mask : {}".format(attention_mask)) 360 | print("topic, claim sentence token type ids : {}".format(token_type_ids)) 361 | print("stance label : {}".format(stance_label)) 362 | 363 | total_idx = torch.tensor(total_idx, dtype=torch.long) 364 | total_input_ids = torch.tensor(total_input_ids, dtype=torch.long) 365 | total_attention_mask = torch.tensor(total_attention_mask, dtype=torch.long) 366 | total_token_type_ids = torch.tensor(total_token_type_ids, dtype=torch.long) 367 | total_stance_label = torch.tensor(total_stance_label, dtype=torch.long) 368 | dataset = TensorDataset(total_idx, total_input_ids, total_attention_mask, total_token_type_ids, total_stance_label) 369 | return dataset 370 | 371 | # with open('../../../data/train_claim.json', 'r', encoding='utf-8') as reader: 372 | # dataset = json.load(reader)['data'] 373 | # 374 | # total_title = [] 375 | # total_input_ids = [] 376 | # total_attention_mask = [] 377 | # total_label = [] 378 | # for data in dataset: 379 | # title = data['title'] 380 | # total_title.append(title) 381 | # paragraphs = data['paragraphs'] 382 | # for para in paragraphs: 383 | # answers = para['qas'][0]['answers'] 384 | # context = para['context'] 385 | # result = tokenizer(context, padding='max_length', max_length=4096, truncation=True) 386 | # # cls idx 2 / sep idx 3 387 | # total_input_ids.append(result['input_ids']) 388 | # total_attention_mask.append(result['attention_mask']) 389 | # context_list = context.split('[SEP]') 390 | # each_label = [0] * len(context_list) 391 | # # 첫 sep는 첫번째 문장에 대한 표현. 문장의 오른쪽에 있는 sep를 기준으로 한다. 392 | # for answer in answers: 393 | # text = answer['text'] 394 | # for idx, ctx in enumerate(context_list): 395 | # if text in ctx: 396 | # print(idx+1) 397 | 398 | 399 | 400 | # if __name__ == '__main__': 401 | # parser = argparse.ArgumentParser(description='dataset creating') 402 | # parser.add_argument('--train_data', type=str, default='../../../data/train_claim.json') 403 | -------------------------------------------------------------------------------- /mine_next/functions/gcn_test2.py: -------------------------------------------------------------------------------- 1 | import dgl.nn.pytorch as dglnn 2 | import torch.nn as nn 3 | import dgl.data 4 | import torch.nn.functional as F 5 | from dgl.dataloading import GraphDataLoader 6 | import torch 7 | 8 | 9 | ### 10 | # 이건 그래프 통쨰로 분류하는 코드 11 | ### 12 | 13 | 14 | dataset = dgl.data.GINDataset('MUTAG', False) 15 | 16 | dataloader = GraphDataLoader( 17 | dataset, 18 | batch_size=1024, 19 | drop_last=False, 20 | shuffle=True) 21 | 22 | 23 | class Classifier(nn.Module): 24 | def __init__(self, in_dim, hidden_dim, n_classes): 25 | super(Classifier, self).__init__() 26 | self.conv1 = dglnn.GraphConv(in_dim, hidden_dim) 27 | self.conv2 = dglnn.GraphConv(hidden_dim, hidden_dim) 28 | self.classify = nn.Linear(hidden_dim, n_classes) 29 | 30 | def forward(self, g, h): 31 | # Apply graph convolution and activation. 
32 | h = F.relu(self.conv1(g, h)) 33 | h = F.relu(self.conv2(g, h)) 34 | with g.local_scope(): 35 | g.ndata['h'] = h 36 | # Calculate graph representation by average readout. 37 | # (batch size, 20(아마 히든사이즈)) 38 | hg = dgl.mean_nodes(g, 'h') 39 | return self.classify(hg) 40 | 41 | 42 | model = Classifier(7, 20, 5) 43 | opt = torch.optim.Adam(model.parameters()) 44 | for epoch in range(20): 45 | for batched_graph, labels in dataloader: 46 | # (num nodes, 7) 아마 라벨 개수가 7개인듯 47 | feats = batched_graph.ndata['attr'] 48 | logits = model(batched_graph, feats) 49 | loss = F.cross_entropy(logits, labels) 50 | opt.zero_grad() 51 | loss.backward() 52 | opt.step() -------------------------------------------------------------------------------- /mine_next/functions/heterograph.py: -------------------------------------------------------------------------------- 1 | """ 2 | .. currentmodule:: dgl 3 | 4 | Working with Heterogeneous Graphs 5 | ================================= 6 | 7 | **Author**: Quan Gan, `Minjie Wang `_, Mufei Li, 8 | George Karypis, Zheng Zhang 9 | 10 | In this tutorial, you learn about: 11 | 12 | * Examples of heterogenous graph data and typical applications. 13 | 14 | * Creating and manipulating a heterogenous graph in DGL. 15 | 16 | * Implementing `Relational-GCN `_, a popular GNN model, 17 | for heterogenous graph input. 18 | 19 | * Training a model to solve a node classification task. 20 | 21 | Heterogeneous graphs, or *heterographs* for short, are graphs that contain 22 | different types of nodes and edges. The different types of nodes and edges tend 23 | to have different types of attributes that are designed to capture the 24 | characteristics of each node and edge type. Within the context of 25 | graph neural networks, depending on their complexity, certain node and edge types 26 | might need to be modeled with representations that have a different number of dimensions. 27 | 28 | DGL supports graph neural network computations on such heterogeneous graphs, by 29 | using the heterograph class and its associated API. 30 | 31 | """ 32 | 33 | ############################################################################### 34 | # Examples of heterographs 35 | # ----------------------- 36 | # Many graph datasets represent relationships among various types of entities. 37 | # This section provides an overview for several graph use-cases that show such relationships 38 | # and can have their data represented as heterographs. 39 | # 40 | # Citation graph 41 | # ~~~~~~~~~~~~~~~ 42 | # The Association for Computing Machinery publishes an `ACM dataset `_ that contains two 43 | # million papers, their authors, publication venues, and the other papers 44 | # that were cited. This information can be represented as a heterogeneous graph. 45 | # 46 | # The following diagram shows several entities in the ACM dataset and the relationships among them 47 | # (taken from `Shi et al., 2015 `_). 48 | # 49 | # .. figure:: https://data.dgl.ai/tutorial/hetero/acm-example.png# 50 | # 51 | # This graph has three types of entities that correspond to papers, authors, and publication venues. 
52 | # It also contains three types of edges that connect the following: 53 | # 54 | # * Authors with papers corresponding to *written-by* relationships 55 | # 56 | # * Papers with publication venues corresponding to *published-in* relationships 57 | # 58 | # * Papers with other papers corresponding to *cited-by* relationships 59 | # 60 | # 61 | # Recommender systems 62 | # ~~~~~~~~~~~~~~~~~~~~ 63 | # The datasets used in recommender systems often contain 64 | # interactions between users and items. For example, the data could include the 65 | # ratings that users have provided to movies. Such interactions can be modeled 66 | # as heterographs. 67 | # 68 | # The nodes in these heterographs will have two types, *users* and *movies*. The edges 69 | # will correspond to the user-movie interactions. Furthermore, if an interaction is 70 | # marked with a rating, then each rating value could correspond to a different edge type. 71 | # The following diagram shows an example of user-item interactions as a heterograph. 72 | # 73 | # .. figure:: https://data.dgl.ai/tutorial/hetero/recsys-example.png 74 | # 75 | # 76 | # Knowledge graph 77 | # ~~~~~~~~~~~~~~~~ 78 | # Knowledge graphs are inherently heterogenous. For example, in 79 | # Wikidata, Barack Obama (item Q76) is an instance of a human, which could be viewed as 80 | # the entity class, whose spouse (item P26) is Michelle Obama (item Q13133) and 81 | # occupation (item P106) is politician (item Q82955). The relationships are shown in the following. 82 | # diagram. 83 | # 84 | # .. figure:: https://data.dgl.ai/tutorial/hetero/kg-example.png 85 | # 86 | 87 | ############################################################################### 88 | # Creating a heterograph in DGL 89 | # ----------------------------- 90 | # You can create a heterograph in DGL using the :func:`dgl.heterograph` API. 91 | # The argument to :func:`dgl.heterograph` is a dictionary. The keys are tuples 92 | # in the form of ``(srctype, edgetype, dsttype)`` specifying the relation name 93 | # and the two entity types it connects. Such tuples are called *canonical edge types* 94 | # The values are data to initialize the graph structures, that is, which 95 | # nodes the edges actually connect. 96 | # 97 | # For instance, the following code creates the user-item interactions heterograph shown earlier. 98 | 99 | # Each value of the dictionary is a pair of source and destination arrays. 100 | # Nodes are integer IDs starting from zero. Nodes IDs of different types have 101 | # separate countings. 102 | import dgl 103 | import numpy as np 104 | 105 | ratings = dgl.heterograph( 106 | {('user', '+1', 'movie') : (np.array([0, 0, 1]), np.array([0, 1, 0])), 107 | ('user', '-1', 'movie') : (np.array([2]), np.array([1]))}) 108 | 109 | ############################################################################### 110 | # Manipulating heterograph 111 | # ------------------------ 112 | # You can create a more realistic heterograph using the ACM dataset. 
To do this, first 113 | # download the dataset as follows: 114 | 115 | import scipy.io 116 | import urllib.request 117 | 118 | data_url = 'https://data.dgl.ai/dataset/ACM.mat' 119 | data_file_path = '/tmp/ACM.mat' 120 | 121 | urllib.request.urlretrieve(data_url, data_file_path) 122 | data = scipy.io.loadmat(data_file_path) 123 | print(list(data.keys())) 124 | 125 | ############################################################################### 126 | # The dataset stores node information by their types: ``P`` for paper, ``A`` 127 | # for author, ``C`` for conference, ``L`` for subject code, and so on. The relationships 128 | # are stored as SciPy sparse matrix under key ``XvsY``, where ``X`` and ``Y`` 129 | # could be any of the node type code. 130 | # 131 | # The following code prints out some statistics about the paper-author relationships. 132 | 133 | print(type(data['PvsA'])) 134 | print('#Papers:', data['PvsA'].shape[0]) 135 | print('#Authors:', data['PvsA'].shape[1]) 136 | print('#Links:', data['PvsA'].nnz) 137 | 138 | ############################################################################### 139 | # Converting this SciPy matrix to a heterograph in DGL is straightforward. 140 | 141 | pa_g = dgl.heterograph({('paper', 'written-by', 'author') : data['PvsA'].nonzero()}) 142 | 143 | ############################################################################### 144 | # You can easily print out the type names and other structural information. 145 | 146 | print('Node types:', pa_g.ntypes) 147 | print('Edge types:', pa_g.etypes) 148 | print('Canonical edge types:', pa_g.canonical_etypes) 149 | 150 | # Nodes and edges are assigned integer IDs starting from zero and each type has its own counting. 151 | # To distinguish the nodes and edges of different types, specify the type name as the argument. 152 | print(pa_g.number_of_nodes('paper')) 153 | # Canonical edge type name can be shortened to only one edge type name if it is 154 | # uniquely distinguishable. 155 | print(pa_g.number_of_edges(('paper', 'written-by', 'author'))) 156 | print(pa_g.number_of_edges('written-by')) 157 | print(pa_g.successors(1, etype='written-by')) # get the authors that write paper #1 158 | 159 | # Type name argument could be omitted whenever the behavior is unambiguous. 160 | print(pa_g.number_of_edges()) # Only one edge type, the edge type argument could be omitted 161 | 162 | ############################################################################### 163 | # A homogeneous graph is just a special case of a heterograph with only one type 164 | # of node and edge. 165 | 166 | # Paper-citing-paper graph is a homogeneous graph 167 | pp_g = dgl.heterograph({('paper', 'citing', 'paper') : data['PvsP'].nonzero()}) 168 | # equivalent (shorter) API for creating homogeneous graph 169 | pp_g = dgl.from_scipy(data['PvsP']) 170 | 171 | # All the ntype and etype arguments could be omitted because the behavior is unambiguous. 172 | print(pp_g.number_of_nodes()) 173 | print(pp_g.number_of_edges()) 174 | print(pp_g.successors(3)) 175 | 176 | ############################################################################### 177 | # Create a subset of the ACM graph using the paper-author, paper-paper, 178 | # and paper-subject relationships. Meanwhile, also add the reverse 179 | # relationship to prepare for the later sections. 
180 | 181 | G = dgl.heterograph({ 182 | ('paper', 'written-by', 'author') : data['PvsA'].nonzero(), 183 | ('author', 'writing', 'paper') : data['PvsA'].transpose().nonzero(), 184 | ('paper', 'citing', 'paper') : data['PvsP'].nonzero(), 185 | ('paper', 'cited', 'paper') : data['PvsP'].transpose().nonzero(), 186 | ('paper', 'is-about', 'subject') : data['PvsL'].nonzero(), 187 | ('subject', 'has', 'paper') : data['PvsL'].transpose().nonzero(), 188 | }) 189 | 190 | print(G) 191 | 192 | ############################################################################### 193 | # **Metagraph** (or network schema) is a useful summary of a heterograph. 194 | # Serving as a template for a heterograph, it tells how many types of objects 195 | # exist in the network and where the possible links exist. 196 | # 197 | # DGL provides easy access to the metagraph, which could be visualized using 198 | # external tools. 199 | 200 | # Draw the metagraph using graphviz. 201 | # import pygraphviz as pgv 202 | # def plot_graph(nxg): 203 | # ag = pgv.AGraph(strict=False, directed=True) 204 | # for u, v, k in nxg.edges(keys=True): 205 | # ag.add_edge(u, v, label=k) 206 | # ag.layout('dot') 207 | # ag.draw('graph.png') 208 | # 209 | # plot_graph(G.metagraph()) 210 | 211 | ############################################################################### 212 | # Learning tasks associated with heterographs 213 | # ------------------------------------------- 214 | # Some of the typical learning tasks that involve heterographs include: 215 | # 216 | # * *Node classification and regression* to predict the class of each node or 217 | # estimate a value associated with it. 218 | # 219 | # * *Link prediction* to predict if there is an edge of a certain 220 | # type between a pair of nodes, or predict which other nodes a particular 221 | # node is connected with (and optionally the edge types of such connections). 222 | # 223 | # * *Graph classification/regression* to assign an entire 224 | # heterograph into one of the target classes or to estimate a numerical 225 | # value associated with it. 226 | # 227 | # In this tutorial, we designed a simple example for the first task. 228 | # 229 | # A semi-supervised node classification example 230 | # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 231 | # Our goal is to predict the publishing conference of a paper using the ACM 232 | # academic graph we just created. To further simplify the task, we only focus 233 | # on papers published in three conferences: *KDD*, *ICML*, and *VLDB*. All 234 | # the other papers are not labeled, making it a semi-supervised setting. 235 | # 236 | # The following code extracts those papers from the raw dataset and prepares 237 | # the training, validation, testing split. 
238 | 239 | import numpy as np 240 | import torch 241 | import torch.nn as nn 242 | import torch.nn.functional as F 243 | 244 | pvc = data['PvsC'].tocsr() 245 | # find all papers published in KDD, ICML, VLDB 246 | c_selected = [0, 11, 13] # KDD, ICML, VLDB 247 | p_selected = pvc[:, c_selected].tocoo() 248 | # generate labels 249 | labels = pvc.indices 250 | labels[labels == 11] = 1 251 | labels[labels == 13] = 2 252 | labels = torch.tensor(labels).long() 253 | 254 | # generate train/val/test split 255 | pid = p_selected.row 256 | shuffle = np.random.permutation(pid) 257 | train_idx = torch.tensor(shuffle[0:800]).long() 258 | val_idx = torch.tensor(shuffle[800:900]).long() 259 | test_idx = torch.tensor(shuffle[900:]).long() 260 | 261 | ############################################################################### 262 | # Relational-GCN on heterograph 263 | # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 264 | # We use `Relational-GCN `_ to learn the 265 | # representation of nodes in the graph. Its message-passing equation is as 266 | # follows: 267 | # 268 | # .. math:: 269 | # 270 | # h_i^{(l+1)} = \sigma\left(\sum_{r\in \mathcal{R}} 271 | # \sum_{j\in\mathcal{N}_r(i)}W_r^{(l)}h_j^{(l)}\right) 272 | # 273 | # Breaking down the equation, you see that there are two parts in the 274 | # computation. 275 | # 276 | # (i) Message computation and aggregation within each relation :math:`r` 277 | # 278 | # (ii) Reduction that merges the results from multiple relationships 279 | # 280 | # Following this intuition, perform message passing on a heterograph in 281 | # two steps. 282 | # 283 | # (i) Per-edge-type message passing 284 | # 285 | # (ii) Type wise reduction 286 | 287 | import dgl.function as fn 288 | 289 | class HeteroRGCNLayer(nn.Module): 290 | def __init__(self, in_size, out_size, etypes): 291 | super(HeteroRGCNLayer, self).__init__() 292 | # W_r for each relation 293 | self.weight = nn.ModuleDict({ 294 | name : nn.Linear(in_size, out_size) for name in etypes 295 | }) 296 | 297 | def forward(self, G, feat_dict): 298 | # The input is a dictionary of node features for each type 299 | funcs = {} 300 | for srctype, etype, dsttype in G.canonical_etypes: 301 | # Compute W_r * h 302 | Wh = self.weight[etype](feat_dict[srctype]) 303 | # Save it in graph for message passing 304 | G.nodes[srctype].data['Wh_%s' % etype] = Wh 305 | # Specify per-relation message passing functions: (message_func, reduce_func). 306 | # Note that the results are saved to the same destination feature 'h', which 307 | # hints the type wise reducer for aggregation. 308 | funcs[etype] = (fn.copy_u('Wh_%s' % etype, 'm'), fn.mean('m', 'h')) 309 | # Trigger message passing of multiple types. 310 | # The first argument is the message passing functions for each relation. 311 | # The second one is the type wise reducer, could be "sum", "max", 312 | # "min", "mean", "stack" 313 | G.multi_update_all(funcs, 'sum') 314 | # return the updated node feature dictionary 315 | return {ntype : G.nodes[ntype].data['h'] for ntype in G.ntypes} 316 | 317 | ############################################################################### 318 | # Create a simple GNN by stacking two ``HeteroRGCNLayer``. Since the 319 | # nodes do not have input features, make their embeddings trainable. 320 | 321 | class HeteroRGCN(nn.Module): 322 | def __init__(self, G, in_size, hidden_size, out_size): 323 | super(HeteroRGCN, self).__init__() 324 | # Use trainable node embeddings as featureless inputs. 
325 | embed_dict = {ntype : nn.Parameter(torch.Tensor(G.number_of_nodes(ntype), in_size)) 326 | for ntype in G.ntypes} 327 | for key, embed in embed_dict.items(): 328 | nn.init.xavier_uniform_(embed) 329 | self.embed = nn.ParameterDict(embed_dict) 330 | # create layers 331 | self.layer1 = HeteroRGCNLayer(in_size, hidden_size, G.etypes) 332 | self.layer2 = HeteroRGCNLayer(hidden_size, out_size, G.etypes) 333 | 334 | def forward(self, G): 335 | h_dict = self.layer1(G, self.embed) 336 | h_dict = {k : F.leaky_relu(h) for k, h in h_dict.items()} 337 | h_dict = self.layer2(G, h_dict) 338 | # get paper logits 339 | return h_dict['paper'] 340 | 341 | ############################################################################### 342 | # Train and evaluate 343 | # ~~~~~~~~~~~~~~~~~~ 344 | # Train and evaluate this network. 345 | 346 | # Create the model. The output has three logits for three classes. 347 | model = HeteroRGCN(G, 10, 10, 3) 348 | 349 | opt = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4) 350 | 351 | best_val_acc = 0 352 | best_test_acc = 0 353 | 354 | for epoch in range(100): 355 | logits = model(G) 356 | # The loss is computed only for labeled nodes. 357 | loss = F.cross_entropy(logits[train_idx], labels[train_idx]) 358 | 359 | pred = logits.argmax(1) 360 | train_acc = (pred[train_idx] == labels[train_idx]).float().mean() 361 | val_acc = (pred[val_idx] == labels[val_idx]).float().mean() 362 | test_acc = (pred[test_idx] == labels[test_idx]).float().mean() 363 | 364 | if best_val_acc < val_acc: 365 | best_val_acc = val_acc 366 | best_test_acc = test_acc 367 | 368 | opt.zero_grad() 369 | loss.backward() 370 | opt.step() 371 | 372 | if epoch % 5 == 0: 373 | print('Loss %.4f, Train Acc %.4f, Val Acc %.4f (Best %.4f), Test Acc %.4f (Best %.4f)' % ( 374 | loss.item(), 375 | train_acc.item(), 376 | val_acc.item(), 377 | best_val_acc.item(), 378 | test_acc.item(), 379 | best_test_acc.item(), 380 | )) 381 | 382 | ############################################################################### 383 | # What's next? 384 | # ------------ 385 | # * Check out our full implementation in PyTorch 386 | # `here `_. 387 | # 388 | # * We also provide the following model examples: 389 | # 390 | # * `Graph Convolutional Matrix Completion _`, 391 | # which we implement in MXNet 392 | # `here `_. 393 | # 394 | # * `Heterogeneous Graph Attention Network `_ 395 | # requires transforming a heterograph into a homogeneous graph according to 396 | # a given metapath (i.e. a path template consisting of edge types). We 397 | # provide :func:`dgl.transform.metapath_reachable_graph` to do this. See full 398 | # implementation 399 | # `here `_. 400 | # 401 | # * `Metapath2vec `_ requires 402 | # generating random walk paths according to a given metapath. Please 403 | # refer to the full metapath2vec implementation 404 | # `here `_. 405 | # 406 | # * :doc:`Full heterograph API reference <../../api/python/heterograph>`. 
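###############################################################################
# A minimal, self-contained sketch (not part of the original DGL tutorial) of
# how a ``HeteroRGCNLayer`` like the one defined above can be exercised on a
# toy graph, without downloading ACM.mat. The toy relations, node counts, and
# feature sizes here are illustrative assumptions only, not values used
# elsewhere in this repository.

# Two node types (user, movie) with edges in both directions, so every node
# type has incoming messages and receives an updated representation.
toy_g = dgl.heterograph({
    ('user', 'rates', 'movie'): (torch.tensor([0, 0, 1]), torch.tensor([0, 1, 1])),
    ('movie', 'rated-by', 'user'): (torch.tensor([0, 1, 1]), torch.tensor([0, 0, 1])),
})
# Random 8-dimensional input features per node type.
toy_feats = {'user': torch.randn(2, 8), 'movie': torch.randn(2, 8)}
# One relational layer mapping 8 -> 4 dimensions, one weight matrix per etype.
toy_layer = HeteroRGCNLayer(8, 4, toy_g.etypes)
toy_out = toy_layer(toy_g, toy_feats)
print({ntype: h.shape for ntype, h in toy_out.items()})  # user/movie -> (2, 4)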
407 | -------------------------------------------------------------------------------- /mine_next/functions/homograph.py: -------------------------------------------------------------------------------- 1 | import spacy 2 | import dgl 3 | import dgl.frame 4 | import torch 5 | import os, csv 6 | import pandas as pd 7 | from tqdm import tqdm 8 | import benepar 9 | from transformers import AutoTokenizer 10 | import string 11 | import dgl.nn.pytorch as dglnn 12 | import torch.nn as nn 13 | import torch.nn.functional as F 14 | 15 | 16 | class Classifier(nn.Module): 17 | def __init__(self, in_dim, hidden_dim, n_classes): 18 | super(Classifier, self).__init__() 19 | self.conv1 = dglnn.GraphConv(in_dim, hidden_dim) 20 | self.conv2 = dglnn.GraphConv(hidden_dim, hidden_dim) 21 | self.classify = nn.Linear(hidden_dim, n_classes) 22 | #self.cons_type_embeddings = nn.Embedding(82, 300) 23 | 24 | def forward(self, g, h, edge_type=None): 25 | # Apply graph convolution and activation. 26 | # cons_node_ids = g.filter(lambda nodes:nodes.data['dtype'] == 1 ) 27 | # cc_edge_id = g.filter(lambda edges : edges.data['dtype'] == edge_type) 28 | # self_edge_id = g.filter(lambda edges : edges.data['dtype'] == 4) 29 | # cc_edge_id = torch.cat([cc_edge_id, self_edge_id], dim=0) 30 | 31 | h = F.relu(self.conv1(g, h)) 32 | h = F.relu(self.conv2(g, h)) 33 | with g.local_scope(): 34 | g.ndata['h'] = h 35 | # Calculate graph representation by average readout. 36 | # (batch size, 20(아마 히든사이즈)) 37 | hg = dgl.mean_nodes(g, 'h') 38 | return self.classify(hg) 39 | 40 | 41 | class RGCN(nn.Module): 42 | def __init__(self, in_feats, hid_feats, out_feats, rel_names): 43 | super().__init__() 44 | 45 | self.conv1 = dglnn.HeteroGraphConv({ 46 | rel: dglnn.GraphConv(in_feats, hid_feats) 47 | for rel in rel_names}, aggregate='sum') 48 | self.conv2 = dglnn.HeteroGraphConv({ 49 | rel: dglnn.GraphConv(hid_feats, out_feats) 50 | for rel in rel_names}, aggregate='sum') 51 | 52 | def forward(self, graph, inputs): 53 | # inputs is features of nodes 54 | h = self.conv1(graph, inputs) 55 | h = {k: F.relu(v) for k, v in h.items()} 56 | h = self.conv2(graph, h) 57 | return h 58 | 59 | 60 | class HeteroClassifier(nn.Module): 61 | def __init__(self, in_dim, hidden_dim, n_classes, rel_names): 62 | super().__init__() 63 | 64 | self.rgcn = RGCN(in_dim, hidden_dim, hidden_dim, rel_names) 65 | self.classify = nn.Linear(hidden_dim, n_classes) 66 | 67 | def forward(self, g, h): 68 | #h = g.ndata['feat'] 69 | h = self.rgcn(g, h) 70 | with g.local_scope(): 71 | g.ndata['h'] = h 72 | # Calculate graph representation by average readout. 73 | hg = 0 74 | for ntype in g.ntypes: 75 | hg = hg + dgl.mean_nodes(g, 'h', ntype=ntype) 76 | return self.classify(hg) 77 | 78 | 79 | class Tree(object): 80 | def __init__(self, type): 81 | # self.start, self.end 단어 기준으로 세는것. one 이 0번째, . 
이 27번쨰라 root 노드가 start=0, end=27을 가지고 있는거임 82 | self.parent = None 83 | self.num_children = 0 84 | self.children = list() 85 | self.type = type 86 | self.is_leaf = False 87 | self.start = -1 88 | self.end = -1 89 | self.idx = -1 90 | 91 | def add_child(self, child): 92 | child.parent = self 93 | self.num_children += 1 94 | self.children.append(child) 95 | 96 | def size(self): 97 | count = 1 98 | for i in range(self.num_children): 99 | count += self.children[i].size() 100 | return count 101 | 102 | def __str__(self): 103 | return self.type 104 | 105 | def __iter__(self): 106 | yield self 107 | for c in self.children: 108 | for x in c: 109 | yield x 110 | 111 | def get_cons_tag_vocab(data_path): 112 | tag2id = {} 113 | with open(data_path) as f: 114 | for line in f.readlines(): 115 | tag, idx = line.strip().split('\t') 116 | tag2id[tag] = int(idx) 117 | return tag2id 118 | 119 | def span_starts_ends(node: Tree): 120 | if len(node.children) == 0: 121 | return 122 | for child in node.children: 123 | span_starts_ends(child) 124 | 125 | node.start = node.children[0].start 126 | node.end = node.children[-1].end 127 | 128 | def constituent_to_tree(tokenizer, constituent_string, sentence, word_offset, node_offset, num_orders=2): 129 | constituents = [] 130 | temp_str = "" 131 | # 괄호 ['(', 'S', '(', 'NP', '(', 'NP', '(', 'CD', 'one', ')', '(', 'ADJP', '(', 'RB', 'long', ')' ... ] 이런식으로 (,)나 단어, constituent 단위로 분리 132 | for i, char in enumerate(constituent_string): 133 | if char == "(" or char == ")" or char == " ": 134 | if len(temp_str) != 0: 135 | constituents.append(temp_str) 136 | temp_str = "" 137 | if char != " ": 138 | constituents.append(char) 139 | else: 140 | temp_str += char 141 | # NP, PP등 노드 단위로 stack 142 | stack = [] 143 | for cons in constituents: 144 | if cons != ")": 145 | stack.append(cons) 146 | else: 147 | tail = stack.pop() 148 | temp_constituents = [] 149 | while tail != "(": 150 | temp_constituents.append(tail) 151 | tail = stack.pop() 152 | 153 | parent = Tree(temp_constituents[-1]) 154 | for i in range(len(temp_constituents) - 2, -1, -1): 155 | if isinstance(temp_constituents[i], Tree): 156 | parent.add_child(temp_constituents[i]) 157 | else: 158 | child = Tree(temp_constituents[i]) 159 | parent.add_child(child) 160 | stack.append(parent) 161 | root = stack[-1] 162 | for node in root: 163 | if len(node.children) == 0: 164 | node.is_leaf = True 165 | 166 | for node in root: 167 | if node.is_leaf: 168 | node.start = word_offset 169 | node.end = word_offset 170 | word_offset += 1 171 | span_starts_ends(root) 172 | 173 | node_sequence = [] 174 | # internal nodes 는 S, NP VP, PP 와같은 노드만. 
one, lone과같은 노드는 없음 175 | internal_nodes = [] 176 | for node in root: 177 | if not node.is_leaf: 178 | internal_nodes.append(node) 179 | node_sequence.append(node) 180 | 181 | node_offset_original = node_offset 182 | for node in root: 183 | if node.is_leaf: 184 | continue 185 | node.idx = node_offset 186 | node_offset += 1 187 | 188 | constituent_sequence = [] # [(node idx, node start, node end, node type, parent idx)] 189 | num_internal_nodes = len(internal_nodes) 190 | # constituent_edge 191 | constituent_edge = [[0] * num_internal_nodes for _ in range(num_internal_nodes)] 192 | for i, node in enumerate(internal_nodes): 193 | parent_idx = node.parent.idx if node.parent else -1 194 | constituent_sequence.append((node.idx, node.start, node.end, node.type, parent_idx)) 195 | if parent_idx != -1: 196 | constituent_edge[node.idx - node_offset_original][parent_idx - node_offset_original] = 1 # 바로 아래 코드랑 보면 양방향 엣지 포함하는거임 197 | constituent_edge[parent_idx - node_offset_original][node.idx - node_offset_original] = 1 198 | # 이부분은 한계층 건너 뛰어서 엣지 이어 주는 식임. 원래 S랑 PP는 안이어져있는데 여기서 이어줌 199 | high_order_sequence = [constituent_sequence] 200 | for i in range(1, num_orders): 201 | new_constituent_sequence = [] 202 | for idx, start, end, type, parent_idx in high_order_sequence[-1]: 203 | if parent_idx == -1: 204 | continue 205 | parent_node = constituent_sequence[parent_idx - node_offset_original] 206 | if parent_node[-1] == -1: 207 | continue 208 | new_constituent_sequence.append((idx, start, end, type, parent_node[-1])) 209 | constituent_edge[idx - node_offset_original][parent_node[-1] - node_offset_original] = 1 210 | constituent_edge[parent_node[-1] - node_offset_original][idx - node_offset_original] = 1 211 | high_order_sequence.append(new_constituent_sequence) 212 | return high_order_sequence, word_offset, node_offset 213 | 214 | def final_graph(constituent_list, graph): 215 | cons_tag2id = get_cons_tag_vocab('../../data/IAM/constituent_gold_vocab.txt') 216 | forward_edge_type, backward_edge_type = 0, 2 217 | 218 | prev_root_node_id = None 219 | constituent_labels = [] 220 | for high_order_sent_cons in constituent_list: 221 | for i, sent_cons in enumerate(high_order_sent_cons): 222 | for idx, start, end, label, parent_idx in sent_cons: 223 | idx_nodeid = idx # 원래는 constituent_start_idx = 0, node_id_offset = 406(token id까지였음. 1063중 406이 토큰이고 이후가 node 였었음.) 
224 | # parent 없는 노드 225 | if parent_idx == -1: 226 | if prev_root_node_id is not None: 227 | # graph.add_edges(prev_root_node_id, idx_nodeid, 228 | # data={'cc_link': torch.tensor([forward_edge_type + i]), 229 | # 'dtype': torch.tensor([forward_edge_type + i])}) 230 | # # dual GAT 231 | # graph.add_edges(idx_nodeid, prev_root_node_id, 232 | # data={'cc_link': torch.tensor([backward_edge_type + i]), 233 | # 'dtype': torch.tensor([backward_edge_type + i])}) 234 | graph.add_edges(prev_root_node_id, idx_nodeid, 235 | data={'cc_link': torch.tensor([1]), 236 | 'dtype': torch.tensor([1])}) 237 | # dual GAT 238 | graph.add_edges(idx_nodeid, prev_root_node_id, 239 | data={'cc_link': torch.tensor([1]), 240 | 'dtype': torch.tensor([1])}) 241 | prev_root_node_id = idx_nodeid 242 | # parent 있는 노드들 243 | if parent_idx != -1: 244 | parent_idx_nodeid = parent_idx 245 | # graph.add_edges(parent_idx_nodeid, idx_nodeid, 246 | # data={'cc_link': torch.tensor([forward_edge_type + i]), 247 | # 'dtype': torch.tensor([forward_edge_type + i])}) 248 | # graph.add_edges(idx_nodeid, parent_idx_nodeid, 249 | # data={'cc_link': torch.tensor([backward_edge_type + i]), 250 | # 'dtype': torch.tensor([backward_edge_type + i])}) 251 | graph.add_edges(parent_idx_nodeid, idx_nodeid, 252 | data={'cc_link': torch.tensor([1]), 253 | 'dtype': torch.tensor([1])}) 254 | graph.add_edges(idx_nodeid, parent_idx_nodeid, 255 | data={'cc_link': torch.tensor([1]), 256 | 'dtype': torch.tensor([1])}) 257 | if i == 0: 258 | # self-loop edge 259 | # graph.add_edges(idx_nodeid, idx_nodeid, data={'cc_link': torch.tensor([4]), 260 | # 'dtype': torch.tensor([4])}) 261 | graph.add_edges(idx_nodeid, idx_nodeid, data={'cc_link': torch.tensor([1]), 262 | 'dtype': torch.tensor([1])}) 263 | constituent_labels.append(cons_tag2id[label]) 264 | 265 | constituent_labels = torch.tensor(constituent_labels,dtype=torch.long) 266 | return graph, constituent_labels 267 | 268 | def all_process_graph(nlp, tokenizer, sentence): 269 | sentence_doc = nlp(sentence) 270 | sentence_sent = list(sentence_doc.sents)[0] 271 | parse_string = sentence_sent._.parse_string 272 | word_offset, node_offset = 0, 0 273 | constituent = [] 274 | constituent_sequence, word_offset, node_offset = \ 275 | constituent_to_tree(tokenizer, parse_string, sentence, word_offset, node_offset) 276 | constituent.append(constituent_sequence) 277 | 278 | graph = dgl.graph([]) 279 | graph.set_n_initializer(dgl.frame.zero_initializer) 280 | num_cons = sum([len(sent_cons[0]) for sent_cons in constituent]) 281 | 282 | graph.add_nodes(num_cons) 283 | graph.ndata['unit'] = torch.ones(num_cons) 284 | graph.ndata['dtype'] = torch.ones(num_cons) 285 | 286 | claim_graph, constituent_labels = \ 287 | final_graph(constituent, graph) 288 | return claim_graph, constituent_labels 289 | 290 | 291 | if __name__ == "__main__": 292 | nlp = spacy.load('en_core_web_sm') 293 | nlp.add_pipe('benepar', config={'model': 'benepar_en3'}) 294 | tokenizer = AutoTokenizer.from_pretrained('roberta-base', do_lower_case=False, use_fast=False) 295 | 296 | dev_data = pd.read_csv('../../data/IAM/claims/dev.txt', sep='\t', header=None, quoting=csv.QUOTE_NONE) 297 | dev_data.columns = ['claim_label', 'topic_sentence', 'claim_sentence', 'article_id', 'stance_label'] 298 | dev_data = dev_data.dropna(axis=0) 299 | 300 | dev_sentences = dev_data['claim_sentence'].tolist()[:10] 301 | total_dev_constituent_label = [] 302 | printable = set(string.printable) 303 | total_graph = {} 304 | cons_type_embeddings = nn.Embedding(82, 300) 305 | model = 
Classifier(300, 300, 2) # homo graph 테스트용 306 | 307 | for idx, dev in tqdm(enumerate(dev_sentences), total=len(dev_sentences)): 308 | dev = dev.lower().replace('“', '"').replace('”', '"') 309 | dev = "".join(filter(lambda x: x in printable, dev)) 310 | 311 | dev_graph, dev_constituent_label = all_process_graph(nlp, tokenizer, dev) 312 | total_dev_constituent_label.append([dev_constituent_label]) 313 | total_graph[idx] = dev_graph 314 | cons_node_feat = cons_type_embeddings(dev_constituent_label) 315 | #etypes = ['0', '1', '2', '3', '4'] 316 | #model = HeteroClassifier(300, 300, 2, etypes) 317 | #print(dev_graph.edges(form='all')) 318 | logits = model(dev_graph, cons_node_feat) # homo 319 | #logits = model(dev_graph, cons_node_feat) # hetero 320 | print(logits) 321 | 322 | 323 | -------------------------------------------------------------------------------- /mine_next/functions/main_function.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | import os 3 | from tqdm import tqdm 4 | import torch 5 | from torch.utils.data import DataLoader, RandomSampler, SequentialSampler 6 | from transformers.optimization import AdamW, get_linear_schedule_with_warmup 7 | from transformers import AutoConfig, AutoTokenizer 8 | from sklearn.metrics import classification_report, accuracy_score 9 | from sklearn.utils import resample 10 | 11 | import csv 12 | import numpy as np 13 | import pandas as pd 14 | import json 15 | 16 | # from source.claim_classification.model.modeling import KoElectraForClaimClassification 17 | # from source.claim_classification.func.dataset import convert_data2tensordataset 18 | 19 | from mine_next.model.modeling import RobertaForClassification 20 | from mine_next.functions.dataset import ( 21 | convert_data2tensordataset, 22 | convert_stance_data2tensordataset, 23 | convert_only_sentence2tensordataset 24 | ) 25 | 26 | 27 | def random_downsampling(dataset): 28 | major = dataset[dataset['claim_label'] == 'O'] 29 | minor = dataset[dataset['claim_label'] == 'C'] 30 | sampling_data = resample(major, replace=True, n_samples=len(minor)*5, random_state=42) 31 | train_data = pd.concat([sampling_data, minor]) 32 | return train_data 33 | 34 | 35 | def random_upsampling(dataset): 36 | major = dataset[dataset['claim_label'] == 'O'] 37 | minor = dataset[dataset['claim_label'] == 'C'] 38 | sampling_data = resample(minor, replace=True, n_samples=len(major), random_state=42) 39 | train_data = pd.concat([sampling_data, major]) 40 | return train_data 41 | 42 | def do_train(config, model, optimizer, scheduler, train_dataloader, epoch, global_step, total_graph): 43 | losses = [] 44 | total_predicts, total_corrects = [], [] 45 | for step, batch in tqdm(enumerate(train_dataloader), desc='do_train(epoch_{})'.format(epoch), total=len(train_dataloader)): 46 | batch = tuple(t.cuda() for t in batch) 47 | # graph 같이 학습할 경우 48 | # idx, input_ids, attention_mask, token_type_ids, labels, sim_labels = batch[0], batch[1], batch[2], batch[3], batch[4], batch[5] 49 | # constituent_labels = batch[6] 50 | # loss, predicts = model( 51 | # idx=idx, 52 | # input_ids=input_ids, 53 | # attention_mask=attention_mask, 54 | # token_type_ids=token_type_ids, 55 | # labels=labels, 56 | # sim_labels=sim_labels, 57 | # all_graph=total_graph, 58 | # constituent_labels=constituent_labels 59 | # ) 60 | # base 61 | idx, input_ids, attention_mask, token_type_ids, labels, sim_labels = batch[0], batch[1], batch[2], batch[3], \ 62 | batch[4], batch[5] 63 | loss, predicts = model( 64 | 
idx=idx, 65 | input_ids=input_ids, 66 | attention_mask=attention_mask, 67 | token_type_ids=token_type_ids, 68 | labels=labels, 69 | sim_labels=sim_labels, 70 | ) 71 | predicts = predicts.argmax(dim=-1) 72 | predicts = predicts.cpu().detach().numpy().tolist() 73 | labels = labels.cpu().detach().numpy().tolist() 74 | 75 | total_predicts.extend(predicts) 76 | total_corrects.extend(labels) 77 | 78 | if config.gradient_accumulation_steps > 1: 79 | loss = loss / config.gradient_accumulation_steps 80 | # 원래는 tensor(0.7255)이런식 81 | loss.backward() 82 | losses.append(loss.data.item()) 83 | if (step + 1) % config.gradient_accumulation_steps == 0 or \ 84 | (len(train_dataloader) <= config.gradient_accumulation_steps and (step + 1) == len( 85 | train_dataloader)): 86 | torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_grad_norm) 87 | optimizer.step() 88 | scheduler.step() 89 | 90 | model.zero_grad() 91 | global_step += 1 92 | target_names = ['class 0', 'class 1'] 93 | print(classification_report(total_corrects, total_predicts, target_names=target_names, digits=4)) 94 | accuracy = accuracy_score(total_corrects, total_predicts) 95 | return accuracy, np.mean(losses), global_step 96 | 97 | 98 | def do_evaluate(model, dev_dataloader, total_graph): 99 | total_predicts, total_corrects = [], [] 100 | for step, batch in tqdm(enumerate(dev_dataloader), desc="do_evaluate", total=len(dev_dataloader)): 101 | batch = tuple(t.cuda() for t in batch) 102 | # graph 학습할 경우 103 | # idx, input_ids, attention_mask, token_type_ids, labels, sim_labels = batch[0], batch[1], batch[2], batch[3], batch[4], batch[5] 104 | # constituent_labels = batch[6] 105 | # predicts = model( 106 | # idx=idx, 107 | # input_ids=input_ids, 108 | # attention_mask=attention_mask, 109 | # token_type_ids=token_type_ids, 110 | # all_graph=total_graph, 111 | # constituent_labels=constituent_labels 112 | # ) 113 | # base 114 | idx, input_ids, attention_mask, token_type_ids, labels, sim_labels = batch[0], batch[1], batch[2], batch[3], \ 115 | batch[4], batch[5] 116 | predicts = model( 117 | idx=idx, 118 | input_ids=input_ids, 119 | attention_mask=attention_mask, 120 | token_type_ids=token_type_ids, 121 | ) 122 | predicts = predicts.argmax(dim=-1) 123 | predicts = predicts.detach().cpu().tolist() 124 | labels = labels.detach().cpu().tolist() 125 | total_predicts.extend(predicts) 126 | total_corrects.extend(labels) 127 | target_names = ['class 0', 'class 1'] 128 | print(classification_report(total_corrects, total_predicts, target_names=target_names, digits=4)) 129 | accuracy = accuracy_score(total_corrects, total_predicts) 130 | return accuracy, total_predicts 131 | 132 | 133 | def train(config, model, tokenizer): 134 | 135 | # 데이터셋 로드 136 | train_data = pd.read_csv(config.claim_train, sep='\t', header=None, quoting=csv.QUOTE_NONE) 137 | train_data.columns = ['claim_label', 'topic_sentence', 'claim_sentence', 'article_id', 'stance_label'] 138 | train_data = train_data.dropna(axis=0) 139 | # train_data = train_data[:100] 140 | dev_data = pd.read_csv(config.claim_dev, sep='\t', header=None, quoting=csv.QUOTE_NONE) 141 | dev_data.columns = ['claim_label', 'topic_sentence', 'claim_sentence', 'article_id', 'stance_label'] 142 | dev_data = dev_data.dropna(axis=0) 143 | # dev_data = dev_data[:100] 144 | 145 | #train_data = random_upsampling(train_data) 146 | train_dataset, train_total_graph = convert_only_sentence2tensordataset(train_data, tokenizer, config.max_length, 'train') 147 | dev_dataset, dev_total_graph = 
convert_only_sentence2tensordataset(dev_data, tokenizer, config.max_length, 'dev') 148 | 149 | 150 | train_sampler = RandomSampler(train_dataset) 151 | train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=config.batch_size) 152 | dev_sampler = SequentialSampler(dev_dataset) 153 | dev_dataloader = DataLoader(dev_dataset, sampler=dev_sampler, batch_size=config.batch_size) 154 | 155 | t_total = len(train_dataloader) // config.gradient_accumulation_steps * config.epoch 156 | optimizer = AdamW(model.parameters(), lr=config.learning_rate) 157 | scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=config.warmup_steps, num_training_steps=t_total) 158 | 159 | global_step = 0 160 | max_test_accuracy = 0 161 | model.zero_grad() 162 | for epoch in range(config.epoch): 163 | model.train() 164 | train_accuracy, average_loss, global_step = do_train( 165 | config=config, model=model, 166 | optimizer=optimizer, scheduler=scheduler, 167 | train_dataloader=train_dataloader, epoch=epoch+1, global_step=global_step, total_graph=train_total_graph) 168 | print("train_accuracy : {}\taverage_loss : {}\n".format(round(train_accuracy, 4), round(average_loss, 4))) 169 | 170 | model.eval() 171 | test_accuracy, _ = do_evaluate(model=model, dev_dataloader=dev_dataloader, total_graph=dev_total_graph) 172 | print("test_accuracy : {}\n".format(round(test_accuracy, 4))) 173 | output_dir = os.path.join(config.save_dir, "checkpoint-{}".format(epoch)) 174 | if not os.path.exists(output_dir): 175 | os.makedirs(output_dir) 176 | model_to_save = model.module if hasattr(model, "module") else model 177 | model_to_save.save_pretrained(output_dir) 178 | tokenizer.save_pretrained(output_dir) 179 | torch.save(config, os.path.join(output_dir, "training_args.bin")) 180 | 181 | 182 | def evaluate(config, model, tokenizer): 183 | dev_data = pd.read_csv(config.claim_dev, sep='\t', header=None, quoting=csv.QUOTE_NONE) 184 | dev_data.columns = ['claim_label', 'topic_sentence', 'claim_sentence', 'article_id', 'stance_label'] 185 | dev_data = dev_data.dropna(axis=0) 186 | # dev_data = dev_data[:10] 187 | # dev_dataset = convert_stance_data2tensordataset(dev_data, tokenizer, config.max_length) 188 | dev_dataset, total_graph = convert_only_sentence2tensordataset(dev_data, tokenizer, config.max_length, 'dev') 189 | 190 | dev_sampler = SequentialSampler(dev_dataset) 191 | dev_dataloader = DataLoader(dev_dataset, sampler=dev_sampler, batch_size=config.batch_size) 192 | 193 | test_accuracy, total_predicts = do_evaluate(model=model, dev_dataloader=dev_dataloader, total_graph=total_graph) 194 | print("test accuracy : {}".format(round(test_accuracy,4))) 195 | total_corrects = dev_data['claim_label'].tolist() 196 | total_corrects = [1 if correct == 'C' else 0 for correct in total_corrects] 197 | totaL_claim_sentence = dev_data['claim_sentence'].tolist() 198 | error_list = [] 199 | for predict, correct, claim in zip(total_predicts, total_corrects, totaL_claim_sentence): 200 | if predict != correct: 201 | error = {} 202 | error['predict'] = predict 203 | error['correct'] = correct 204 | error['claim_sentence'] = claim 205 | error_list.append(error) 206 | 207 | with open('../mine/functions/dev_error.json', 'w', encoding='utf-8') as f: 208 | json.dump(error_list, f, indent=4) 209 | -------------------------------------------------------------------------------- /mine_next/functions/main_function2.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | import os 3 
| from tqdm import tqdm 4 | import torch 5 | from torch.utils.data import DataLoader, RandomSampler, SequentialSampler 6 | from transformers.optimization import AdamW, get_linear_schedule_with_warmup 7 | from transformers import AutoConfig, AutoTokenizer 8 | from sklearn.metrics import classification_report, accuracy_score 9 | from sklearn.utils import resample 10 | 11 | import csv 12 | import numpy as np 13 | import pandas as pd 14 | import json 15 | 16 | # from source.claim_classification.model.modeling import KoElectraForClaimClassification 17 | # from source.claim_classification.func.dataset import convert_data2tensordataset 18 | 19 | from mine_next.model.modeling import RobertaForClassification 20 | from mine_next.functions.dataset import ( 21 | convert_data2tensordataset, 22 | convert_stance_data2tensordataset, 23 | convert_only_sentence2tensordataset 24 | ) 25 | 26 | 27 | def random_downsampling(dataset): 28 | major = dataset[dataset['claim_label'] == 'O'] 29 | minor = dataset[dataset['claim_label'] == 'C'] 30 | sampling_data = resample(major, replace=True, n_samples=len(minor)*5, random_state=42) 31 | train_data = pd.concat([sampling_data, minor]) 32 | return train_data 33 | 34 | def random_upsampling(dataset): 35 | major = dataset[dataset['claim_label'] == 'O'] 36 | minor = dataset[dataset['claim_label'] == 'C'] 37 | sampling_data = resample(minor, replace=True, n_samples=len(major), random_state=42) 38 | train_data = pd.concat([sampling_data, major]) 39 | return train_data 40 | 41 | def do_train(config, model, optimizer, scheduler, train_dataloader, epoch, global_step, total_graph): 42 | losses = [] 43 | total_predicts, total_corrects = [], [] 44 | for step, batch in tqdm(enumerate(train_dataloader), desc='do_train(epoch_{})'.format(epoch), total=len(train_dataloader)): 45 | batch = tuple(t.cuda() for t in batch) 46 | # graph 같이 학습할 경우 47 | idx, input_ids, attention_mask, token_type_ids, labels, sim_labels = batch[0], batch[1], batch[2], batch[3], batch[4], batch[5] 48 | constituent_labels_first, constituent_labels_second = batch[6], batch[7] 49 | loss, predicts = model( 50 | idx=idx, 51 | input_ids=input_ids, 52 | attention_mask=attention_mask, 53 | token_type_ids=token_type_ids, 54 | labels=labels, 55 | sim_labels=sim_labels, 56 | all_graph=total_graph, 57 | constituent_labels_first=constituent_labels_first, 58 | constituent_labels_second=constituent_labels_second 59 | ) 60 | # base 61 | # idx, input_ids, attention_mask, token_type_ids, labels, sim_labels = batch[0], batch[1], batch[2], batch[3], \ 62 | # batch[4], batch[5] 63 | # loss, predicts = model( 64 | # idx=idx, 65 | # input_ids=input_ids, 66 | # attention_mask=attention_mask, 67 | # token_type_ids=token_type_ids, 68 | # labels=labels, 69 | # sim_labels=sim_labels, 70 | # ) 71 | predicts = predicts.argmax(dim=-1) 72 | predicts = predicts.cpu().detach().numpy().tolist() 73 | labels = labels.cpu().detach().numpy().tolist() 74 | 75 | total_predicts.extend(predicts) 76 | total_corrects.extend(labels) 77 | 78 | if config.gradient_accumulation_steps > 1: 79 | loss = loss / config.gradient_accumulation_steps 80 | # 원래는 tensor(0.7255)이런식 81 | loss.backward() 82 | losses.append(loss.data.item()) 83 | if (step + 1) % config.gradient_accumulation_steps == 0 or \ 84 | (len(train_dataloader) <= config.gradient_accumulation_steps and (step + 1) == len( 85 | train_dataloader)): 86 | torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_grad_norm) 87 | optimizer.step() 88 | scheduler.step() 89 | 90 | model.zero_grad() 91 | 
global_step += 1 92 | target_names = ['class 0', 'class 1'] 93 | print(classification_report(total_corrects, total_predicts, target_names=target_names, digits=4)) 94 | accuracy = accuracy_score(total_corrects, total_predicts) 95 | return accuracy, np.mean(losses), global_step 96 | 97 | def do_evaluate(model, dev_dataloader, total_graph): 98 | total_predicts, total_corrects = [], [] 99 | for step, batch in tqdm(enumerate(dev_dataloader), desc="do_evaluate", total=len(dev_dataloader)): 100 | batch = tuple(t.cuda() for t in batch) 101 | # graph 학습할 경우 102 | idx, input_ids, attention_mask, token_type_ids, labels, sim_labels = batch[0], batch[1], batch[2], batch[3], batch[4], batch[5] 103 | constituent_labels_first, constituent_labels_second = batch[6], batch[7] 104 | predicts = model( 105 | idx=idx, 106 | input_ids=input_ids, 107 | attention_mask=attention_mask, 108 | token_type_ids=token_type_ids, 109 | all_graph=total_graph, 110 | constituent_labels_first=constituent_labels_first, 111 | constituent_labels_second= constituent_labels_second 112 | ) 113 | # base 114 | # idx, input_ids, attention_mask, token_type_ids, labels, sim_labels = batch[0], batch[1], batch[2], batch[3], \ 115 | # batch[4], batch[5] 116 | # predicts = model( 117 | # idx=idx, 118 | # input_ids=input_ids, 119 | # attention_mask=attention_mask, 120 | # token_type_ids=token_type_ids, 121 | # ) 122 | predicts = predicts.argmax(dim=-1) 123 | predicts = predicts.detach().cpu().tolist() 124 | labels = labels.detach().cpu().tolist() 125 | total_predicts.extend(predicts) 126 | total_corrects.extend(labels) 127 | target_names = ['class 0', 'class 1'] 128 | result = classification_report(total_corrects, total_predicts, target_names=target_names, digits=4, output_dict=True) 129 | print(classification_report(total_corrects, total_predicts, target_names=target_names, digits=4)) 130 | accuracy = accuracy_score(total_corrects, total_predicts) 131 | return accuracy, total_predicts, result['class 1']['f1-score'] 132 | 133 | def train(config, model, tokenizer): 134 | 135 | # 데이터셋 로드 136 | train_data = pd.read_csv(config.claim_train, sep='\t', header=None, quoting=csv.QUOTE_NONE) 137 | train_data.columns = ['claim_label', 'topic_sentence', 'claim_sentence', 'article_id', 'stance_label'] 138 | train_data = train_data.dropna(axis=0) 139 | dev_data = pd.read_csv(config.claim_dev, sep='\t', header=None, quoting=csv.QUOTE_NONE) 140 | dev_data.columns = ['claim_label', 'topic_sentence', 'claim_sentence', 'article_id', 'stance_label'] 141 | dev_data = dev_data.dropna(axis=0) 142 | 143 | pseudo_train = json.load(open(config.train_pseudo_topic, encoding='utf-8')) 144 | pseudo_dev = json.load(open(config.dev_pseudo_topic, encoding='utf-8')) 145 | #train_data = random_upsampling(train_data) 146 | 147 | train_dataset, train_total_graph_first, train_total_graph_second = convert_only_sentence2tensordataset( 148 | train_data, pseudo_train, tokenizer, config.max_length, 'train') 149 | dev_dataset, dev_total_graph_first, dev_total_graph_second = convert_only_sentence2tensordataset(dev_data, pseudo_dev, tokenizer, config.max_length, 'dev') 150 | 151 | train_sampler = RandomSampler(train_dataset) 152 | train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=config.batch_size) 153 | dev_sampler = SequentialSampler(dev_dataset) 154 | dev_dataloader = DataLoader(dev_dataset, sampler=dev_sampler, batch_size=config.batch_size) 155 | 156 | t_total = len(train_dataloader) // config.gradient_accumulation_steps * config.epoch 157 | optimizer = 
AdamW(model.parameters(), lr=config.learning_rate) 158 | scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=config.warmup_steps, num_training_steps=t_total) 159 | 160 | global_step = 0 161 | max_test_accuracy = 0 162 | max_claim_f1 = 0 163 | model.zero_grad() 164 | for epoch in range(config.epoch): 165 | model.train() 166 | train_accuracy, average_loss, global_step = do_train( 167 | config=config, model=model, 168 | optimizer=optimizer, scheduler=scheduler, 169 | train_dataloader=train_dataloader, epoch=epoch, global_step=global_step, total_graph=[train_total_graph_first, train_total_graph_second]) 170 | print("train_accuracy : {}\taverage_loss : {}\n".format(round(train_accuracy, 4), round(average_loss, 4))) 171 | 172 | model.eval() 173 | test_accuracy, _, claim_f1 = do_evaluate(model=model, dev_dataloader=dev_dataloader, total_graph=[dev_total_graph_first, dev_total_graph_second]) 174 | print("test_accuracy : {}\n".format(round(test_accuracy, 4))) 175 | if max_claim_f1 < claim_f1: 176 | output_dir = os.path.join(config.save_dir, "checkpoint-{}".format(epoch)) 177 | if not os.path.exists(output_dir): 178 | os.makedirs(output_dir) 179 | model_to_save = model.module if hasattr(model, "module") else model 180 | model_to_save.save_pretrained(output_dir) 181 | tokenizer.save_pretrained(output_dir) 182 | torch.save(config, os.path.join(output_dir, "training_args.bin")) 183 | max_claim_f1 = claim_f1 184 | 185 | def evaluate(config, model, tokenizer): 186 | dev_data = pd.read_csv(config.claim_dev, sep='\t', header=None, quoting=csv.QUOTE_NONE) 187 | dev_data.columns = ['claim_label', 'topic_sentence', 'claim_sentence', 'article_id', 'stance_label'] 188 | dev_data = dev_data.dropna(axis=0) 189 | # dev_data = dev_data[:10] 190 | # dev_dataset = convert_stance_data2tensordataset(dev_data, tokenizer, config.max_length) 191 | pseudo_dev = json.load(open(config.dev_pseudo_topic, encoding='utf-8')) 192 | dev_dataset, dev_total_graph_first, dev_total_graph_second = convert_only_sentence2tensordataset(dev_data, pseudo_dev, tokenizer, config.max_length, 'dev') 193 | 194 | dev_sampler = SequentialSampler(dev_dataset) 195 | dev_dataloader = DataLoader(dev_dataset, sampler=dev_sampler, batch_size=config.batch_size) 196 | 197 | test_accuracy, total_predicts, claim_f1 = do_evaluate(model=model, dev_dataloader=dev_dataloader, total_graph=[dev_total_graph_first, dev_total_graph_second]) 198 | print("test accuracy : {}".format(round(test_accuracy,4))) 199 | total_corrects = dev_data['claim_label'].tolist() 200 | total_corrects = [1 if correct == 'C' else 0 for correct in total_corrects] 201 | assert len(total_corrects) == len(total_predicts) 202 | totaL_claim_sentence = dev_data['claim_sentence'].tolist() 203 | error_list = [] 204 | for predict, correct, claim in zip(total_predicts, total_corrects, totaL_claim_sentence): 205 | error = {} 206 | error['predict'] = predict 207 | error['correct'] = correct 208 | error['claim_sentence'] = claim 209 | error_list.append(error) 210 | 211 | with open('../mine_next/functions/dev_error.json', 'w', encoding='utf-8') as f: 212 | json.dump(error_list, f, indent=4) 213 | 214 | 215 | def test(config, model, tokenizer): 216 | test_data = pd.read_csv(config.claim_test, sep='\t', header=None, quoting=csv.QUOTE_NONE) 217 | test_data.columns = ['claim_label', 'topic_sentence', 'claim_sentence', 'article_id', 'stance_label'] 218 | test_data = test_data.dropna(axis=0) 219 | pseudo_test = json.load(open(config.test_pseudo_topic, encoding='utf-8')) 220 | test_dataset, 
test_total_graph_first, test_total_graph_second = convert_only_sentence2tensordataset(test_data, pseudo_test, tokenizer, config.max_length, 'test') 221 | 222 | test_sampler = SequentialSampler(test_dataset) 223 | test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=config.batch_size) 224 | 225 | test_accuracy, total_predicts, claim_f1 = do_evaluate(model=model, dev_dataloader=test_dataloader, total_graph=[test_total_graph_first, test_total_graph_second]) 226 | print("test accuracy : {}".format(round(test_accuracy,4))) 227 | total_corrects = test_data['claim_label'].tolist() 228 | total_corrects = [1 if correct == 'C' else 0 for correct in total_corrects] 229 | assert len(total_corrects) == len(total_predicts) 230 | totaL_claim_sentence = test_data['claim_sentence'].tolist() 231 | error_list = [] 232 | for predict, correct, claim in zip(total_predicts, total_corrects, totaL_claim_sentence): 233 | error = {} 234 | error['predict'] = predict 235 | error['correct'] = correct 236 | error['claim_sentence'] = claim 237 | error_list.append(error) 238 | 239 | with open('../mine_next/functions/test_error.json', 'w', encoding='utf-8') as f: 240 | json.dump(error_list, f, indent=4) 241 | -------------------------------------------------------------------------------- /mine_next/functions/make_graph.py: -------------------------------------------------------------------------------- 1 | import benepar, spacy 2 | from nltk.tree import Tree as nltk_tree 3 | from nltk.treeprettyprinter import TreePrettyPrinter 4 | from nltk.draw.tree import TreeView 5 | import os, csv 6 | import pandas as pd 7 | from tqdm import tqdm 8 | import dgl 9 | from dgl import save_graphs, load_graphs 10 | from dgl.data.utils import makedirs, save_info, load_info 11 | 12 | import torch 13 | from transformers import BertTokenizer 14 | 15 | tokenizer = BertTokenizer.from_pretrained('bert-base-cased') 16 | data = pd.read_csv('../../data/IAM/claims/train.txt', sep='\t', header=None, quoting=csv.QUOTE_NONE) 17 | data.columns = ['claim_label', 'topic_sentence', 'claim_sentence', 'article_id', 'stance_label'] 18 | data = data.dropna(axis=0) 19 | data = data[data['claim_label'] == 'C'] 20 | claims = data['claim_sentence'].tolist() 21 | 22 | def get_cons_tag_vocab(data_path): 23 | tag2id = {} 24 | with open(data_path) as f: 25 | for line in f.readlines(): 26 | tag, idx = line.strip().split('\t') 27 | tag2id[tag] = int(idx) 28 | return tag2id 29 | 30 | 31 | 32 | class Tree(object): 33 | def __init__(self, type): 34 | # self.start, self.end 단어 기준으로 세는것. one 이 0번째, . 
이 27번쨰라 root 노드가 start=0, end=27을 가지고 있는거임 35 | self.parent = None 36 | self.num_children = 0 37 | self.children = list() 38 | self.type = type 39 | self.is_leaf = False 40 | self.start = -1 41 | self.end = -1 42 | self.idx = -1 43 | 44 | def add_child(self, child): 45 | child.parent = self 46 | self.num_children += 1 47 | self.children.append(child) 48 | 49 | def size(self): 50 | count = 1 51 | for i in range(self.num_children): 52 | count += self.children[i].size() 53 | 54 | return count 55 | 56 | def __str__(self): 57 | return self.type 58 | 59 | def __iter__(self): 60 | yield self 61 | for c in self.children: 62 | for x in c: 63 | yield x 64 | 65 | def span_starts_ends(node: Tree): 66 | if len(node.children) == 0: 67 | return 68 | for child in node.children: 69 | span_starts_ends(child) 70 | 71 | node.start = node.children[0].start 72 | node.end = node.children[-1].end 73 | 74 | def constituent_to_tree(constituent_string, word_offset, node_offset, num_orders=2): 75 | constituents = [] 76 | temp_str = "" 77 | words = [] 78 | subtokens = [] 79 | subtoken_map = [] 80 | # 괄호 ['(', 'S', '(', 'NP', '(', 'NP', '(', 'CD', 'one', ')', '(', 'ADJP', '(', 'RB', 'long', ')' ... ] 이런식으로 (,)나 단어, constituent 단위로 분리 81 | for i, char in enumerate(constituent_string): 82 | if char == "(" or char == ")" or char == " ": 83 | if len(temp_str) != 0: 84 | constituents.append(temp_str) 85 | temp_str = "" 86 | if char != " ": 87 | constituents.append(char) 88 | else: 89 | temp_str += char 90 | # NP, PP등 노드 단위로 stack 91 | stack = [] 92 | for cons in constituents: 93 | if cons != ")": 94 | stack.append(cons) 95 | else: 96 | tail = stack.pop() 97 | temp_constituents = [] 98 | while tail != "(": 99 | temp_constituents.append(tail) 100 | tail = stack.pop() 101 | 102 | parent = Tree(temp_constituents[-1]) 103 | for i in range(len(temp_constituents) - 2, -1, -1): 104 | if isinstance(temp_constituents[i], Tree): 105 | parent.add_child(temp_constituents[i]) 106 | else: 107 | # parent에 붙일때 parent의 leaf를 true로 바꿔주는 형식으로 해줄것 108 | child = Tree(temp_constituents[i]) 109 | parent.add_child(child) 110 | stack.append(parent) 111 | root = stack[-1] 112 | # 노드 방문하면서 잎인지 체크해야함 113 | map_count = 0 114 | for node in root: 115 | if len(node.children) == 0: 116 | node.is_leaf = True 117 | words.append(str(node)) 118 | node_token = tokenizer.tokenize(str(node)) 119 | subtokens.extend(node_token) 120 | subtoken_map.extend([map_count]*len(node_token)) 121 | map_count += 1 122 | 123 | word_offset_original = word_offset 124 | for node in root: 125 | if node.is_leaf: 126 | node.start = word_offset 127 | node.end = word_offset 128 | word_offset += 1 129 | span_starts_ends(root) 130 | 131 | node_sequence = [] 132 | # internal nodes 는 S, NP VP, PP 와같은 노드만. 
one, lone과같은 노드는 없음 133 | internal_nodes = [] 134 | for node in root: 135 | if not node.is_leaf: 136 | internal_nodes.append(node) 137 | node_sequence.append(node) 138 | node_offset_original = node_offset 139 | for node in root: 140 | if node.is_leaf: 141 | # or node.type in [":", "``", ".", ",", "XX", "X", "-LRB-", "-RRB-", "''", "HYPH"] 142 | continue 143 | node.idx = node_offset 144 | node_offset += 1 145 | constituent_sequence = [] # [(idx, start, end, type, parent idx)] 146 | num_internal_nodes = len(internal_nodes) 147 | # constituent_edge 148 | constituent_edge = [[0] * num_internal_nodes for _ in range(num_internal_nodes)] 149 | for i, node in enumerate(internal_nodes): 150 | # if node.type in [":", "``", ".", ",", "XX", "X", "-LRB-", "-RRB-", "''", "HYPH"]: 151 | # continue 152 | parent_idx = node.parent.idx if node.parent else -1 153 | constituent_sequence.append((node.idx, node.start, node.end, node.type, parent_idx)) 154 | if parent_idx != -1: 155 | constituent_edge[node.idx - node_offset_original][parent_idx - node_offset_original] = 1 #바로 아래 코드랑 보면 양방향 엣지 포함하는거임 156 | constituent_edge[parent_idx - node_offset_original][node.idx - node_offset_original] = 1 157 | # 이부분은 한계층 건너 뛰어서 엣지 ㅇ이어 주는 식임. 원래 S랑 PP는 안이어져있는데 여기서 이어줌 158 | high_order_sequence = [constituent_sequence] 159 | for i in range(1, num_orders): 160 | new_constituent_sequence = [] 161 | for idx, start, end, type, parent_idx in high_order_sequence[-1]: 162 | if parent_idx == -1: 163 | continue 164 | parent_node = constituent_sequence[parent_idx - node_offset_original] 165 | if parent_node[-1] == -1: 166 | continue 167 | new_constituent_sequence.append((idx, start, end, type, parent_node[-1])) 168 | constituent_edge[idx - node_offset_original][parent_node[-1] - node_offset_original] = 1 169 | constituent_edge[parent_node[-1] - node_offset_original][idx - node_offset_original] = 1 170 | high_order_sequence.append(new_constituent_sequence) 171 | return high_order_sequence, word_offset, node_offset, subtoken_map, subtokens 172 | 173 | 174 | def print_parse_string(claim_list): 175 | for claim in claim_list: 176 | input_string = claim.lower() 177 | doc = nlp(input_string) 178 | sent = list(doc.sents)[0] 179 | print(sent) 180 | parse_string = sent._.parse_string 181 | print(parse_string) 182 | 183 | 184 | def save(self): 185 | # save graphs and labels 186 | self.save_path = '.' 187 | self.mode = 'test' 188 | graph_path = os.path.join(self.save_path, self.mode + '_dgl_graph.bin') 189 | save_graphs(graph_path, self.graphs, {'labels': self.labels}) 190 | # save other information in python dict 191 | info_path = os.path.join(self.save_path, self.mode + '_info.pkl') 192 | save_info(info_path, {'num_classes': self.num_classes}) 193 | 194 | def load(self): 195 | # load processed data from directory `self.save_path` 196 | self.save_path = '.' 197 | self.mode = 'test' 198 | graph_path = os.path.join(self.save_path, self.mode + '_dgl_graph.bin') 199 | self.graphs, label_dict = load_graphs(graph_path) 200 | self.labels = label_dict['labels'] 201 | info_path = os.path.join(self.save_path, self.mode + '_info.pkl') 202 | self.num_classes = load_info(info_path)['num_classes'] 203 | 204 | def has_cache(self): 205 | # check whether there are processed data in `self.save_path` 206 | self.save_path = '.' 
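# (added note) save(), load(), and this has_cache() follow DGL's standard dataset caching pattern: save_graphs/load_graphs for the graph binaries and save_info/load_info for the metadata dict. They take `self` because they are written as methods of a dgl.data.DGLDataset subclass; here they sit at module level only as a scratch copy of that pattern. A minimal sketch of the intended wrapper (the class name is illustrative, not part of this repo):
# class IAMClaimGraphDataset(dgl.data.DGLDataset):
#     def process(self):
#         # build self.graphs, self.labels and self.num_classes from the parsed claim sentences here
#         ...
#     # save(), load(), and has_cache() would then be exactly the three functions above, as methods.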
207 | self.mode = 'test' 208 | graph_path = os.path.join(self.save_path, self.mode + '_dgl_graph.bin') 209 | info_path = os.path.join(self.save_path, self.mode + '_info.pkl') 210 | return os.path.exists(graph_path) and os.path.exists(info_path) 211 | 212 | 213 | #print_parse_string(claims) 214 | 215 | 216 | 217 | 218 | # doc = nlp(input_string) 219 | # 220 | # sent = list(doc.sents)[0] 221 | # print(sent) 222 | # parse_string = sent._.parse_string 223 | # print(parse_string) 224 | # 225 | # # 원랜 read_constituents 파트. tree.py 226 | # constituents = [] 227 | # word_offset, node_offset = 0, 0 228 | # constituent = [] 229 | # constituent_sequence, word_offset, node_offset, subtoken_map, subtokens = constituent_to_tree(parse_string, word_offset, node_offset) 230 | # subtoken_map = torch.tensor(subtoken_map, dtype=torch.int64) 231 | # print('constitutuent sequence : ', constituent_sequence) # constituent sequence 0번째가 원래 노드, 1번째가 grand parent와 grand child 관련 232 | # print('word offset , node offset', word_offset, node_offset) 233 | # constituent.append(constituent_sequence) 234 | # constituents.append(constituent) 235 | # 236 | # # 그래프 만들기 237 | # num_tokens = subtoken_map.size()[0] # 문장 토크나이즈 해서 나온 토큰 개수 238 | # num_cons = sum([len(sent_cons[0]) for sent_cons in constituent]) # cons 노드 개수 239 | # graph = dgl.graph([]) 240 | # graph.set_n_initializer(dgl.frame.zero_initializer) 241 | # print(graph) 242 | # 243 | # # 그래프에 토큰 관련 추가 244 | # graph.add_nodes(num_tokens) 245 | # graph.ndata['unit'] = torch.zeros(num_tokens) 246 | # graph.ndata['dtype'] = torch.zeros(num_tokens) 247 | # 248 | # # constituent tree 그래프 249 | # graph.add_nodes(num_cons) 250 | # graph.ndata['unit'][num_tokens:] = torch.ones(num_cons) 251 | # graph.ndata['dtype'][num_tokens:] = torch.ones(num_cons) 252 | # 253 | # 254 | # constituent_starts = [] 255 | # constituent_ends = [] 256 | # constituent_labels = [] 257 | # prev_root_node_id = None 258 | # forward_edge_type, backward_edge_type = 0, 2 259 | # constituent_start_idx = 0 260 | # node_id_offset = 0 261 | # num_tokens = len(subtoken_map) 262 | # token_range = torch.arange(0, num_tokens, dtype=torch.int64) 263 | # cons_tag2id = get_cons_tag_vocab('../../data/IAM/constituent_gold_vocab.txt') 264 | # 265 | # 266 | # for high_order_sent_cons in constituent: 267 | # for i, sent_cons in enumerate(high_order_sent_cons): 268 | # for idx, start, end, label, parent_idx in sent_cons: 269 | # idx_nodeid = idx - constituent_start_idx + node_id_offset # 원래는 constituent_start_idx = 0, node_id_offset = 406(token id까지였음. 1063중 406이 토큰이고 이후가 node 였었음.) 
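# (added note) In the commented walkthrough above, token nodes occupy graph ids [0, num_tokens) and constituent nodes are appended after them, so a constituent whose tree index is `idx` is mapped to graph node id `idx - constituent_start_idx + node_id_offset`. With the numbers mentioned in the comment (node_id_offset = 406 token nodes, constituent_start_idx = 0), constituent 0 becomes graph node 406, constituent 1 becomes 407, and so on. The per-sentence graphs built in sent_to_graph.py / sent2_to_graph.py contain no token nodes, which is why idx_nodeid is simply idx there.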
270 | # # parent 없는 노드 271 | # if parent_idx == -1: 272 | # if prev_root_node_id is not None: 273 | # graph.add_edges(prev_root_node_id, idx_nodeid, 274 | # data={'cc_link': torch.tensor([forward_edge_type + i]), 275 | # 'dtype': torch.tensor([forward_edge_type + i])}) 276 | # # dual GAT 277 | # graph.add_edges(idx_nodeid, prev_root_node_id, 278 | # data={'cc_link': torch.tensor([backward_edge_type + i]), 279 | # 'dtype': torch.tensor([backward_edge_type + i])}) 280 | # prev_root_node_id = idx_nodeid 281 | # # parent 없는 노드들 282 | # if parent_idx != -1: 283 | # parent_idx_nodeid = parent_idx - constituent_start_idx + node_id_offset 284 | # graph.add_edges(parent_idx_nodeid, idx_nodeid, 285 | # data={'cc_link': torch.tensor([forward_edge_type + i]), 286 | # 'dtype': torch.tensor([forward_edge_type + i])}) 287 | # graph.add_edges(idx_nodeid, parent_idx_nodeid, 288 | # data={'cc_link': torch.tensor([backward_edge_type + i]), 289 | # 'dtype': torch.tensor([backward_edge_type + i])}) 290 | # 291 | # if i == 0: 292 | # # self-loop edge 293 | # graph.add_edges(idx_nodeid, idx_nodeid, data={'cc_link': torch.tensor([4]), 294 | # 'dtype': torch.tensor([4])}) 295 | # # constituent -> token 296 | # token_start = token_range[subtoken_map == start][0] 297 | # token_end = token_range[subtoken_map == end][-1] 298 | # graph.add_edges(idx_nodeid, token_start, data={'ct_link': torch.tensor([5]), 299 | # 'dtype': torch.tensor([5])}) 300 | # graph.add_edges(idx_nodeid, token_end, data={'ct_link': torch.tensor([5]), 301 | # 'dtype': torch.tensor([5])}) 302 | # constituent_starts.append(token_start) 303 | # constituent_ends.append(token_end) 304 | # constituent_labels.append(cons_tag2id[label]) 305 | # 306 | # print(graph) 307 | # # ndata 308 | # # unit 0이 token 노드, 1이 cons 노드 309 | # #print(graph.ndata) 310 | # print('graph ndata unit',graph.ndata['unit']) 311 | # print('graph ndata dtype', graph.ndata['dtype']) 312 | # 313 | # # edata 314 | # # cc link : 4(self loop edge), node-token : 5(constituent token edge) -> 이건 grand 이런거 아니고 그냥 일반적인 parent, child 트리 315 | # # forward edge type(cc link) : 0, backward edge type(cc link) : 2 -> 일반적인 parent child 트리 316 | # # forward edge type(cc link) : 1, backward edge type(cc link) : 3 -> grand parent child 트리 317 | # #print(graph.edata) 318 | # print('graph edata cc link', graph.edata['cc_link']) 319 | # print('graph edata ct link', graph.edata['ct_link']) 320 | # print('graph edata dtype', graph.edata['dtype']) 321 | # 322 | # dgl.save_graphs('graph.dgl', graph) 323 | # (g,), _ = dgl.load_graphs('graph.dgl') 324 | 325 | 326 | 327 | nlp = spacy.load('en_core_web_sm') 328 | nlp.add_pipe('benepar', config={'model':'benepar_en3'}) 329 | input_string = 'Effects in the classroom' 330 | input_string = input_string.lower() 331 | doc = nlp(input_string) 332 | 333 | sent = list(doc.sents)[0] 334 | # print(sent) 335 | # parse_string = sent._.parse_string 336 | # print(parse_string) 337 | # # 338 | # # for tok in doc: 339 | # # 340 | # # print() 341 | # t = nltk_tree.fromstring(sent._.parse_string) 342 | # TreeView(t)._cframe.print_to_file('output1.ps') 343 | # os.system('convert output1.ps output1.png') 344 | # 345 | # t = nltk_tree.fromstring(sent._.parse_string) 346 | # print(TreePrettyPrinter(t).text()) 347 | 348 | from nltk import Tree 349 | from nltk.draw.util import CanvasFrame 350 | from nltk.draw import TreeWidget 351 | 352 | cf = CanvasFrame() 353 | t = Tree.fromstring(sent._.parse_string) 354 | tc = TreeWidget(cf.canvas(),t) 355 | tc['node_font'] = 'arial 14 bold' 356 | 
tc['leaf_font'] = 'arial 14' 357 | tc['node_color'] = '#005990' 358 | tc['leaf_color'] = '#3F8F57' 359 | tc['line_color'] = '#175252' 360 | cf.add_widget(tc,10,10) # (10,10) offsets 361 | cf.print_to_file('tree1.ps') 362 | cf.destroy() 363 | os.system('convert tree1.ps tree1.png') 364 | -------------------------------------------------------------------------------- /mine_next/functions/pos_analy.py: -------------------------------------------------------------------------------- 1 | import spacy 2 | from spacy import displacy 3 | from collections import Counter 4 | import pandas as pd 5 | import os 6 | import csv 7 | from pathlib import Path 8 | from nltk.tree import Tree 9 | from nltk.parse.corenlp import CoreNLPParser 10 | from nltk.parse.stanford import StanfordParser 11 | nlp = spacy.load("en_core_web_sm") 12 | data = pd.read_csv('../../data/IAM/claims/train.txt', sep='\t', header=None, quoting=csv.QUOTE_NONE) 13 | data.columns = ['claim_label', 'topic_sentence', 'claim_sentence', 'article_id', 'stance_label'] 14 | data = data.dropna(axis=0) 15 | data = data[data['claim_label'] == 'C'] 16 | topics = data['topic_sentence'].tolist() 17 | claims = data['claim_sentence'].tolist() 18 | # with open('../../data/IAM/all_claim_sentence.txt', 'r', encoding='utf-8') as txt_file: 19 | # all_claims = txt_file.readlines() 20 | 21 | counter = Counter() 22 | claim_dep = [] 23 | claim_pos = [] 24 | 25 | # doc = nlp(claims[0]) 26 | # 27 | # 28 | # def token_format(token): 29 | # return "_".join([token.orth_, token.tag_, token.dep_]) 30 | # 31 | # def to_nltk_tree(node): 32 | # if node.n_lefts + node.n_rights > 0: 33 | # return Tree(token_format(node), 34 | # [to_nltk_tree(child) 35 | # for child in node.children] 36 | # ) 37 | # else: 38 | # return token_format(node) 39 | # tree = [to_nltk_tree(sent.root) for sent in doc.sents] 40 | # # The first item in the list is the full tree 41 | # tree[0].draw() 42 | 43 | # # os.environ['CLASSPATH'] = '../../stanford/*' 44 | parser = CoreNLPParser(url='http://localhost:9000') 45 | #parser = StanfordParser(model_path="../../stanford/edu/stanford/nlp/models/lexparser/englishPCFG.caseless.ser.gz") 46 | def nltk_spacy_tree(sent): 47 | doc = nlp(sent) 48 | def token_format(token): 49 | return "_".join([token.orth_, token.tag_, token.dep_]) 50 | def to_nltk_tree(node): 51 | if node.n_lefts + node.n_rights > 0: 52 | return Tree(token_format(node), [to_nltk_tree(child) for child in node.children]) 53 | else: 54 | return token_format(node) 55 | tree = [to_nltk_tree(sent.root) for sent in doc.sents] 56 | print(tree[0]) 57 | nltk_spacy_tree(claims[0]) 58 | 59 | def nltk_stanford_tree(sent): 60 | parse = parser.raw_parse(sent) 61 | tree = list(parse) 62 | print(tree[0].draw()) 63 | 64 | #nltk_stanford_tree(claims[0]) 65 | 66 | # nlp = stanfordnlp.Pipeline(processors='tokenize,pos') 67 | # doc = nlp(claims[0]) 68 | # print(doc) 69 | 70 | ''' 71 | 디펜던스 파서 트리 그려주는 코드 72 | ''' 73 | # for idx, claim in enumerate(claims[:1]): 74 | # doc = nlp(claim) 75 | # sentence_spans = list(doc.sents) 76 | # #displacy.serve(doc, style='dep') 77 | # 78 | # svg = displacy.render(sentence_spans, style='dep') 79 | # output_path = Path('../../data/IAM/dep_claim_img/sentence_{}.svg'.format(idx)) 80 | # output_path.open('w', encoding='utf-8').write(svg) 81 | # for tok in doc: 82 | 83 | 84 | 85 | 86 | # sentence_dep = [] 87 | # sentence_pos = [] 88 | # lemma = [] 89 | # for tok in doc: 90 | # sentence_dep.append(tok.dep_) 91 | # sentence_pos.append(tok.pos_) 92 | # if tok.pos_ == 'VERB': 93 | # 
lemma.append(tok.lemma_) 94 | # claim_dep.append(sentence_dep) 95 | # claim_pos.append(sentence_pos) 96 | # counter.update(lemma) 97 | # print(counter) 98 | 99 | # with open('../../data/IAM/train_claim_pos.txt', 'w', encoding='utf-8') as pos_file: 100 | # for pos in claim_pos: 101 | # pos_file.write(' '.join(pos)) 102 | # pos_file.write('\n') 103 | # with open('../../data/IAM/train_claim_dep.txt', 'w', encoding='utf-8') as dep_file: 104 | # for dep in claim_dep: 105 | # dep_file.write(' '.join(dep)) 106 | # dep_file.write('\n') 107 | 108 | # for tok in doc: 109 | # print(tok.text, tok.lemma_, tok.pos_, tok.tag_, tok.dep_) 110 | # print() -------------------------------------------------------------------------------- /mine_next/functions/save_graph.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KUNLP/Argument_Classifier/a44f3764516e0a3450e545fbc9e8caa12f8ee466/mine_next/functions/save_graph.py -------------------------------------------------------------------------------- /mine_next/functions/sent2_to_graph.py: -------------------------------------------------------------------------------- 1 | import spacy 2 | import dgl 3 | import dgl.frame 4 | import torch 5 | import os, csv 6 | import pandas as pd 7 | from tqdm import tqdm 8 | import benepar 9 | from transformers import AutoTokenizer 10 | import string 11 | 12 | 13 | class Tree(object): 14 | def __init__(self, type): 15 | self.parent = None 16 | self.num_children = 0 17 | self.children = list() 18 | self.type = type 19 | self.is_leaf = False 20 | self.start = -1 21 | self.end = -1 22 | self.idx = -1 23 | 24 | def add_child(self, child): 25 | child.parent = self 26 | self.num_children += 1 27 | self.children.append(child) 28 | 29 | def size(self): 30 | count = 1 31 | for i in range(self.num_children): 32 | count += self.children[i].size() 33 | return count 34 | 35 | def __str__(self): 36 | return self.type 37 | 38 | def __iter__(self): 39 | yield self 40 | for c in self.children: 41 | for x in c: 42 | yield x 43 | 44 | def get_cons_tag_vocab(data_path): 45 | tag2id = {} 46 | with open(data_path) as f: 47 | for line in f.readlines(): 48 | tag, idx = line.strip().split('\t') 49 | tag2id[tag] = int(idx) 50 | return tag2id 51 | 52 | 53 | def span_starts_ends(node: Tree): 54 | if len(node.children) == 0: 55 | return 56 | for child in node.children: 57 | span_starts_ends(child) 58 | 59 | node.start = node.children[0].start 60 | node.end = node.children[-1].end 61 | 62 | 63 | def constituent_to_tree(tokenizer, constituent_string, sentence, word_offset, node_offset, num_orders=2): 64 | constituents = [] 65 | temp_str = "" 66 | for i, char in enumerate(constituent_string): 67 | if char == "(" or char == ")" or char == " ": 68 | if len(temp_str) != 0: 69 | constituents.append(temp_str) 70 | temp_str = "" 71 | if char != " ": 72 | constituents.append(char) 73 | else: 74 | temp_str += char 75 | # NP, PP등 노드 단위로 stack 76 | stack = [] 77 | for cons in constituents: 78 | if cons != ")": 79 | stack.append(cons) 80 | else: 81 | tail = stack.pop() 82 | temp_constituents = [] 83 | while tail != "(": 84 | temp_constituents.append(tail) 85 | tail = stack.pop() 86 | 87 | parent = Tree(temp_constituents[-1]) 88 | for i in range(len(temp_constituents) - 2, -1, -1): 89 | if isinstance(temp_constituents[i], Tree): 90 | parent.add_child(temp_constituents[i]) 91 | else: 92 | child = Tree(temp_constituents[i]) 93 | parent.add_child(child) 94 | stack.append(parent) 95 | root = stack[-1] 96 | 
map_count = 0 97 | words = [] 98 | subtokens = [] 99 | subtoken_map = [] 100 | for node in root: 101 | if len(node.children) == 0: 102 | node.is_leaf = True 103 | words.append(str(node)) 104 | node_token = tokenizer.tokenize(str(node)) 105 | if len(node_token) == 0: 106 | continue 107 | subtokens.extend(node_token) 108 | subtoken_map.extend([map_count]*len(node_token)) 109 | map_count += 1 110 | 111 | for node in root: 112 | if node.is_leaf: 113 | node.start = word_offset 114 | node.end = word_offset 115 | word_offset += 1 116 | span_starts_ends(root) 117 | 118 | node_sequence = [] 119 | # internal nodes are constituent nodes such as S, NP, VP, PP only; leaf word nodes such as 'one' or 'long' are not included 120 | internal_nodes = [] 121 | for node in root: 122 | if not node.is_leaf: 123 | internal_nodes.append(node) 124 | node_sequence.append(node) 125 | 126 | node_offset_original = node_offset 127 | for node in root: 128 | if node.is_leaf: 129 | continue 130 | node.idx = node_offset 131 | node_offset += 1 132 | 133 | constituent_sequence = [] # [(node idx, node start, node end, node type, parent idx)] 134 | num_internal_nodes = len(internal_nodes) 135 | # constituent_edge 136 | constituent_edge = [[0] * num_internal_nodes for _ in range(num_internal_nodes)] 137 | for i, node in enumerate(internal_nodes): 138 | parent_idx = node.parent.idx if node.parent else -1 139 | constituent_sequence.append((node.idx, node.start, node.end, node.type, parent_idx)) 140 | if parent_idx != -1: 141 | constituent_edge[node.idx - node_offset_original][parent_idx - node_offset_original] = 1 # together with the line right below, this stores the edge in both directions 142 | constituent_edge[parent_idx - node_offset_original][node.idx - node_offset_original] = 1 143 | # this part links edges that skip one level: e.g. S and PP are not directly connected in the original tree, but they are connected here 144 | high_order_sequence = [constituent_sequence] 145 | for i in range(1, num_orders): 146 | new_constituent_sequence = [] 147 | for idx, start, end, type, parent_idx in high_order_sequence[-1]: 148 | if parent_idx == -1: 149 | continue 150 | parent_node = constituent_sequence[parent_idx - node_offset_original] 151 | if parent_node[-1] == -1: 152 | continue 153 | new_constituent_sequence.append((idx, start, end, type, parent_node[-1])) 154 | constituent_edge[idx - node_offset_original][parent_node[-1] - node_offset_original] = 1 155 | constituent_edge[parent_node[-1] - node_offset_original][idx - node_offset_original] = 1 156 | high_order_sequence.append(new_constituent_sequence) 157 | return high_order_sequence, word_offset, node_offset 158 | 159 | 160 | def final_graph(constituent_list, first_graph, second_graph): 161 | cons_tag2id = get_cons_tag_vocab('../../data/IAM/constituent_gold_vocab.txt') 162 | forward_edge_type, backward_edge_type = 0, 2 163 | # it might be better to split the parent-child and grandparent-child graphs here 164 | constituent_labels_first = [] 165 | constituent_labels_second = [] 166 | prev_root_node_id = None 167 | print('first graph', first_graph.edges()) 168 | print('second graph', second_graph.edges()) 169 | one_order_sent_cons = constituent_list[0][0] 170 | two_order_sent_cons = constituent_list[0][1] 171 | for idx, start, end, label, parent_idx in one_order_sent_cons: 172 | idx_nodeid = idx 173 | # node without a parent (root) 174 | if parent_idx == -1: 175 | if prev_root_node_id is not None: 176 | first_graph.add_edges(prev_root_node_id, idx_nodeid, 177 | data={'cc_link': torch.tensor([1]), 178 | 'dtype': torch.tensor([1])}) 179 | # dual GAT 180 | first_graph.add_edges(idx_nodeid, prev_root_node_id, 181 | data={'cc_link': torch.tensor([1]), 182 | 'dtype': torch.tensor([1])}) 183 | prev_root_node_id = idx_nodeid 184 | # nodes with a parent
185 | if parent_idx != -1: 186 | parent_idx_nodeid = parent_idx 187 | first_graph.add_edges(parent_idx_nodeid, idx_nodeid, 188 | data={'cc_link': torch.tensor([1]), 189 | 'dtype': torch.tensor([1])}) 190 | first_graph.add_edges(idx_nodeid, parent_idx_nodeid, 191 | data={'cc_link': torch.tensor([1]), 192 | 'dtype': torch.tensor([1])}) 193 | 194 | # self-loop edge 195 | first_graph.add_edges(idx_nodeid, idx_nodeid, data={'cc_link': torch.tensor([1]), 196 | 'dtype': torch.tensor([1])}) 197 | constituent_labels_first.append(cons_tag2id[label]) 198 | # print('first graph', first_graph.edges()) 199 | 200 | for idx, start, end, label, parent_idx in two_order_sent_cons: 201 | idx_nodeid = idx 202 | # parent 없는 노드 203 | if parent_idx == -1: 204 | if prev_root_node_id is not None: 205 | second_graph.add_edges(prev_root_node_id, idx_nodeid, 206 | data={'cc_link': torch.tensor([1]), 207 | 'dtype': torch.tensor([1])}) 208 | # dual GAT 209 | second_graph.add_edges(idx_nodeid, prev_root_node_id, 210 | data={'cc_link': torch.tensor([1]), 211 | 'dtype': torch.tensor([1])}) 212 | prev_root_node_id = idx_nodeid 213 | # parent 없는 노드들 214 | if parent_idx != -1: 215 | parent_idx_nodeid = parent_idx 216 | second_graph.add_edges(parent_idx_nodeid, idx_nodeid, 217 | data={'cc_link': torch.tensor([1]), 218 | 'dtype': torch.tensor([1])}) 219 | second_graph.add_edges(idx_nodeid, parent_idx_nodeid, 220 | data={'cc_link': torch.tensor([1]), 221 | 'dtype': torch.tensor([1])}) 222 | constituent_labels_second.append(cons_tag2id[label]) 223 | print('second graph', second_graph.edges()) 224 | # for high_order_sent_cons in constituent_list: 225 | # # i = 0: parent - child/ i = 1: grand parent - grand child 226 | # for i, sent_cons in enumerate(high_order_sent_cons): 227 | # for idx, start, end, label, parent_idx in sent_cons: 228 | # idx_nodeid = idx 229 | # # parent 없는 노드 230 | # if parent_idx == -1: 231 | # if prev_root_node_id is not None: 232 | # graph.add_edges(prev_root_node_id, idx_nodeid, 233 | # data={'cc_link': torch.tensor([1]), 234 | # 'dtype': torch.tensor([1])}) 235 | # # dual GAT 236 | # graph.add_edges(idx_nodeid, prev_root_node_id, 237 | # data={'cc_link': torch.tensor([1]), 238 | # 'dtype': torch.tensor([1])}) 239 | # prev_root_node_id = idx_nodeid 240 | # # parent 없는 노드들 241 | # if parent_idx != -1: 242 | # parent_idx_nodeid = parent_idx 243 | # graph.add_edges(parent_idx_nodeid, idx_nodeid, 244 | # data={'cc_link': torch.tensor([1]), 245 | # 'dtype': torch.tensor([1])}) 246 | # graph.add_edges(idx_nodeid, parent_idx_nodeid, 247 | # data={'cc_link': torch.tensor([1]), 248 | # 'dtype': torch.tensor([1])}) 249 | # 250 | # if i == 0: 251 | # # self-loop edge 252 | # graph.add_edges(idx_nodeid, idx_nodeid, data={'cc_link': torch.tensor([1]), 253 | # 'dtype': torch.tensor([1])}) 254 | # constituent_labels.append(cons_tag2id[label]) 255 | # print(graph.edges(form='all')) 256 | 257 | constituent_labels_first = torch.tensor(constituent_labels_first, dtype=torch.long) 258 | constituent_labels_second = torch.tensor(constituent_labels_second, dtype=torch.long) 259 | return first_graph, second_graph, constituent_labels_first, constituent_labels_second 260 | 261 | 262 | def all_process_graph(nlp, tokenizer, sentence): 263 | sentence_doc = nlp(sentence) 264 | sentence_sent = list(sentence_doc.sents)[0] 265 | parse_string = sentence_sent._.parse_string 266 | word_offset, node_offset = 0, 0 267 | constituent = [] 268 | constituent_sequence, word_offset, node_offset = \ 269 | constituent_to_tree(tokenizer, parse_string, 
sentence, word_offset, node_offset) 270 | constituent.append(constituent_sequence) 271 | 272 | first_graph = dgl.graph([]) 273 | first_graph.set_n_initializer(dgl.frame.zero_initializer) 274 | num_cons = sum([len(sent_cons[0]) for sent_cons in constituent]) 275 | first_graph.add_nodes(num_cons) 276 | first_graph.ndata['unit'] = torch.ones(num_cons) 277 | first_graph.ndata['dtype'] = torch.ones(num_cons) 278 | second_graph = dgl.graph([]) 279 | second_graph.set_n_initializer(dgl.frame.zero_initializer) 280 | num_cons = sum([len(sent_cons[0]) for sent_cons in constituent]) 281 | second_graph.add_nodes(num_cons) 282 | second_graph.ndata['unit'] = torch.ones(num_cons) 283 | second_graph.ndata['dtype'] = torch.ones(num_cons) 284 | 285 | claim_first_graph, claim_second_graph, constituent_labels_frist, constituent_labels_second = \ 286 | final_graph(constituent, first_graph, second_graph) 287 | return claim_first_graph, claim_second_graph, constituent_labels_frist, constituent_labels_second 288 | 289 | 290 | if __name__ == "__main__": 291 | 292 | nlp = spacy.load('en_core_web_sm') 293 | nlp.add_pipe('benepar', config={'model': 'benepar_en3'}) 294 | printable = set(string.printable) 295 | 296 | tokenizer = AutoTokenizer.from_pretrained('roberta-base', do_lower_case=False, use_fast=False) 297 | 298 | train_data = pd.read_csv('../../data/IAM/claims/train.txt', sep='\t', header=None, quoting=csv.QUOTE_NONE) 299 | train_data.columns = ['claim_label', 'topic_sentence', 'claim_sentence', 'article_id', 'stance_label'] 300 | train_data = train_data.dropna(axis=0) 301 | dev_data = pd.read_csv('../../data/IAM/claims/dev.txt', sep='\t', header=None, quoting=csv.QUOTE_NONE) 302 | dev_data.columns = ['claim_label', 'topic_sentence', 'claim_sentence', 'article_id', 'stance_label'] 303 | dev_data = dev_data.dropna(axis=0) 304 | 305 | train_sentences = train_data['claim_sentence'].tolist()[:10] 306 | dev_sentences = dev_data['claim_sentence'].tolist()[:10] 307 | total_train = [] 308 | total_dev = [] 309 | for idx, train in tqdm(enumerate(train_sentences), total=len(train_sentences)): 310 | train = train.lower().replace('“', '"').replace('”', '"') 311 | train = "".join(filter(lambda x : x in printable, train)) 312 | 313 | train_first_graph, train_second_graph, train_constituent_labels_first, train_constituent_labels_second \ 314 | = all_process_graph(nlp, tokenizer, train) 315 | dgl.save_graphs('../../data/IAM/claims/graphs/train_first_graph_{}.dgl'.format(idx), train_first_graph) 316 | dgl.save_graphs('../../data/IAM/claims/graphs/train_second_graph_{}.dgl'.format(idx), train_second_graph) 317 | total_train.append([train_constituent_labels_first.tolist(), train_constituent_labels_second.tolist()]) 318 | 319 | for idx, dev in tqdm(enumerate(dev_sentences), total=len(dev_sentences)): 320 | dev = dev.lower().replace('“', '"').replace('”', '"') 321 | dev = "".join(filter(lambda x : x in printable, dev)) 322 | dev_first_graph, dev_second_graph, dev_constituent_label_first, dev_constituent_label_second \ 323 | = all_process_graph(nlp, tokenizer, dev) 324 | dgl.save_graphs('../../data/IAM/claims/graphs/dev_first_graph_{}.dgl'.format(idx), dev_first_graph) 325 | dgl.save_graphs('../../data/IAM/claims/graphs/dev_second_graph_{}.dgl'.format(idx), dev_second_graph) 326 | total_dev.append([dev_constituent_label_first.tolist(), dev_constituent_label_second.tolist()]) 327 | 328 | with open('../../data/IAM/claims/graphs/train_constituent_test.txt', 'w', encoding='utf-8') as f: 329 | for line in total_train: 330 | 
f.write(str(line)+'\n') 331 | 332 | with open('../../data/IAM/claims/graphs/dev_constituent_test.txt', 'w', encoding='utf-8') as f: 333 | for line in total_dev: 334 | f.write(str(line)+'\n') 335 | -------------------------------------------------------------------------------- /mine_next/functions/sent_to_graph.py: -------------------------------------------------------------------------------- 1 | import spacy 2 | import dgl 3 | import dgl.frame 4 | import torch 5 | import os, csv 6 | import pandas as pd 7 | from tqdm import tqdm 8 | import benepar 9 | from transformers import AutoTokenizer 10 | import string 11 | 12 | 13 | class Tree(object): 14 | def __init__(self, type): 15 | # self.start, self.end 단어 기준으로 세는것. one 이 0번째, . 이 27번쨰라 root 노드가 start=0, end=27을 가지고 있는거임 16 | self.parent = None 17 | self.num_children = 0 18 | self.children = list() 19 | self.type = type 20 | self.is_leaf = False 21 | self.start = -1 22 | self.end = -1 23 | self.idx = -1 24 | 25 | def add_child(self, child): 26 | child.parent = self 27 | self.num_children += 1 28 | self.children.append(child) 29 | 30 | def size(self): 31 | count = 1 32 | for i in range(self.num_children): 33 | count += self.children[i].size() 34 | return count 35 | 36 | def __str__(self): 37 | return self.type 38 | 39 | def __iter__(self): 40 | yield self 41 | for c in self.children: 42 | for x in c: 43 | yield x 44 | 45 | 46 | def get_cons_tag_vocab(data_path): 47 | tag2id = {} 48 | with open(data_path) as f: 49 | for line in f.readlines(): 50 | tag, idx = line.strip().split('\t') 51 | tag2id[tag] = int(idx) 52 | return tag2id 53 | 54 | 55 | def span_starts_ends(node: Tree): 56 | if len(node.children) == 0: 57 | return 58 | for child in node.children: 59 | span_starts_ends(child) 60 | 61 | node.start = node.children[0].start 62 | node.end = node.children[-1].end 63 | 64 | 65 | def constituent_to_tree(tokenizer, constituent_string, sentence, word_offset, node_offset, num_orders=2): 66 | constituents = [] 67 | temp_str = "" 68 | # 괄호 ['(', 'S', '(', 'NP', '(', 'NP', '(', 'CD', 'one', ')', '(', 'ADJP', '(', 'RB', 'long', ')' ... 
] 이런식으로 (,)나 단어, constituent 단위로 분리 69 | for i, char in enumerate(constituent_string): 70 | if char == "(" or char == ")" or char == " ": 71 | if len(temp_str) != 0: 72 | constituents.append(temp_str) 73 | temp_str = "" 74 | if char != " ": 75 | constituents.append(char) 76 | else: 77 | temp_str += char 78 | # NP, PP등 노드 단위로 stack 79 | stack = [] 80 | for cons in constituents: 81 | if cons != ")": 82 | stack.append(cons) 83 | else: 84 | tail = stack.pop() 85 | temp_constituents = [] 86 | while tail != "(": 87 | temp_constituents.append(tail) 88 | tail = stack.pop() 89 | 90 | parent = Tree(temp_constituents[-1]) 91 | for i in range(len(temp_constituents) - 2, -1, -1): 92 | if isinstance(temp_constituents[i], Tree): 93 | parent.add_child(temp_constituents[i]) 94 | else: 95 | child = Tree(temp_constituents[i]) 96 | parent.add_child(child) 97 | stack.append(parent) 98 | root = stack[-1] 99 | map_count = 0 100 | words = [] 101 | subtokens = [] 102 | subtoken_map = [] 103 | for node in root: 104 | if len(node.children) == 0: 105 | node.is_leaf = True 106 | words.append(str(node)) 107 | node_token = tokenizer.tokenize(str(node)) 108 | if len(node_token) == 0: 109 | continue 110 | subtokens.extend(node_token) 111 | subtoken_map.extend([map_count]*len(node_token)) 112 | map_count += 1 113 | 114 | for node in root: 115 | if node.is_leaf: 116 | node.start = word_offset 117 | node.end = word_offset 118 | word_offset += 1 119 | span_starts_ends(root) 120 | 121 | node_sequence = [] 122 | # internal nodes 는 S, NP VP, PP 와같은 노드만. one, lone과같은 노드는 없음 123 | internal_nodes = [] 124 | for node in root: 125 | if not node.is_leaf: 126 | internal_nodes.append(node) 127 | node_sequence.append(node) 128 | 129 | node_offset_original = node_offset 130 | for node in root: 131 | if node.is_leaf: 132 | continue 133 | node.idx = node_offset 134 | node_offset += 1 135 | 136 | constituent_sequence = [] # [(node idx, node start, node end, node type, parent idx)] 137 | num_internal_nodes = len(internal_nodes) 138 | # constituent_edge 139 | constituent_edge = [[0] * num_internal_nodes for _ in range(num_internal_nodes)] 140 | for i, node in enumerate(internal_nodes): 141 | parent_idx = node.parent.idx if node.parent else -1 142 | constituent_sequence.append((node.idx, node.start, node.end, node.type, parent_idx)) 143 | if parent_idx != -1: 144 | constituent_edge[node.idx - node_offset_original][parent_idx - node_offset_original] = 1 #바로 아래 코드랑 보면 양방향 엣지 포함하는거임 145 | constituent_edge[parent_idx - node_offset_original][node.idx - node_offset_original] = 1 146 | # 이부분은 한계층 건너 뛰어서 엣지 이어 주는 식임. 
원래 S랑 PP는 안이어져있는데 여기서 이어줌 147 | high_order_sequence = [constituent_sequence] 148 | for i in range(1, num_orders): 149 | new_constituent_sequence = [] 150 | for idx, start, end, type, parent_idx in high_order_sequence[-1]: 151 | if parent_idx == -1: 152 | new_constituent_sequence.append((idx, start, end, type, parent_idx)) 153 | continue 154 | parent_node = constituent_sequence[parent_idx - node_offset_original] 155 | if parent_node[-1] == -1: 156 | continue 157 | new_constituent_sequence.append((idx, start, end, type, parent_node[-1])) 158 | constituent_edge[idx - node_offset_original][parent_node[-1] - node_offset_original] = 1 159 | constituent_edge[parent_node[-1] - node_offset_original][idx - node_offset_original] = 1 160 | high_order_sequence.append(new_constituent_sequence) 161 | 162 | return high_order_sequence, word_offset, node_offset 163 | 164 | 165 | def final_graph(constituent_list, first_graph, second_graph): 166 | cons_tag2id = get_cons_tag_vocab('../../data/IAM/constituent_gold_vocab.txt') 167 | constituent_labels_first = [] 168 | constituent_labels_second = [] 169 | prev_root_node_id = None 170 | one_order_sent_cons = constituent_list[0][0] 171 | two_order_sent_cons = constituent_list[0][1] 172 | two_order_sent_cons_idx = [idx[0] for idx in two_order_sent_cons] 173 | 174 | for idx, start, end, label, parent_idx in one_order_sent_cons: 175 | idx_nodeid = idx 176 | if parent_idx == -1: 177 | if prev_root_node_id is not None: 178 | first_graph.add_edges(prev_root_node_id, idx_nodeid, 179 | data={'cc_link': torch.tensor([1]), 180 | 'dtype': torch.tensor([1])}) 181 | first_graph.add_edges(idx_nodeid, prev_root_node_id, 182 | data={'cc_link': torch.tensor([1]), 183 | 'dtype': torch.tensor([1])}) 184 | prev_root_node_id = idx_nodeid 185 | if parent_idx != -1: 186 | parent_idx_nodeid = parent_idx 187 | first_graph.add_edges(parent_idx_nodeid, idx_nodeid, 188 | data={'cc_link': torch.tensor([1]), 189 | 'dtype': torch.tensor([1])}) 190 | first_graph.add_edges(idx_nodeid, parent_idx_nodeid, 191 | data={'cc_link': torch.tensor([1]), 192 | 'dtype': torch.tensor([1])}) 193 | first_graph.add_edges(idx_nodeid, idx_nodeid, data={'cc_link': torch.tensor([1]), 194 | 'dtype': torch.tensor([1])}) 195 | constituent_labels_first.append(cons_tag2id[label]) 196 | 197 | prev_root_node_id = None 198 | for idx, start, end, label, parent_idx in two_order_sent_cons: 199 | idx_nodeid = idx 200 | if parent_idx == -1: 201 | if prev_root_node_id is not None: 202 | second_graph.add_edges(prev_root_node_id, idx_nodeid, 203 | data={'cc_link': torch.tensor([1]), 204 | 'dtype': torch.tensor([1])}) 205 | second_graph.add_edges(idx_nodeid, prev_root_node_id, 206 | data={'cc_link': torch.tensor([1]), 207 | 'dtype': torch.tensor([1])}) 208 | prev_root_node_id = idx_nodeid 209 | if parent_idx != -1: 210 | parent_idx_nodeid = parent_idx 211 | second_graph.add_edges(parent_idx_nodeid, idx_nodeid, 212 | data={'cc_link': torch.tensor([1]), 213 | 'dtype': torch.tensor([1])}) 214 | second_graph.add_edges(idx_nodeid, parent_idx_nodeid, 215 | data={'cc_link': torch.tensor([1]), 216 | 'dtype': torch.tensor([1])}) 217 | second_graph = dgl.add_self_loop(second_graph) 218 | # for high_order_sent_cons in constituent_list: 219 | # for i, sent_cons in enumerate(high_order_sent_cons): 220 | # for idx, start, end, label, parent_idx in sent_cons: 221 | # idx_nodeid = idx # 원래는 constituent_start_idx = 0, node_id_offset = 406(token id까지였음. 1063중 406이 토큰이고 이후가 node 였었음.) 
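# (added note) The commented-out block below is the earlier single-graph version, in which both the parent-child edges (i == 0) and the grandparent-grandchild edges (i == 1) were added to one shared graph. The active code above splits them instead: first_graph holds the parent-child edges and gets an explicit self-loop per node, while second_graph holds the grandparent-grandchild edges and gets its self-loops through dgl.add_self_loop(second_graph).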
222 | # # parent 없는 노드 223 | # if parent_idx == -1: 224 | # if prev_root_node_id is not None: 225 | # graph.add_edges(prev_root_node_id, idx_nodeid, 226 | # data={'cc_link': torch.tensor([1]), 227 | # 'dtype': torch.tensor([1])}) 228 | # # dual GAT 229 | # graph.add_edges(idx_nodeid, prev_root_node_id, 230 | # data={'cc_link': torch.tensor([1]), 231 | # 'dtype': torch.tensor([1])}) 232 | # prev_root_node_id = idx_nodeid 233 | # # parent 없는 노드들 234 | # if parent_idx != -1: 235 | # parent_idx_nodeid = parent_idx 236 | # graph.add_edges(parent_idx_nodeid, idx_nodeid, 237 | # data={'cc_link': torch.tensor([1]), 238 | # 'dtype': torch.tensor([1])}) 239 | # graph.add_edges(idx_nodeid, parent_idx_nodeid, 240 | # data={'cc_link': torch.tensor([1]), 241 | # 'dtype': torch.tensor([1])}) 242 | # 243 | # if i == 0: 244 | # # self-loop edge 245 | # graph.add_edges(idx_nodeid, idx_nodeid, data={'cc_link': torch.tensor([1]), 246 | # 'dtype': torch.tensor([1])}) 247 | # constituent_labels.append(cons_tag2id[label]) 248 | # constituent_labels = torch.tensor(constituent_labels,dtype=torch.long) 249 | # return graph, constituent_labels 250 | constituent_labels_first = torch.tensor(constituent_labels_first, dtype=torch.long) 251 | constituent_labels_second = torch.tensor(constituent_labels_first, dtype=torch.long) # 라벨 개수 동일하게 252 | return first_graph, second_graph, constituent_labels_first, constituent_labels_second 253 | 254 | 255 | def all_process_graph(nlp, tokenizer, sentence): 256 | sentence_doc = nlp(sentence) 257 | sentence_sent = list(sentence_doc.sents)[0] 258 | parse_string = sentence_sent._.parse_string 259 | word_offset, node_offset = 0, 0 260 | constituent = [] 261 | constituent_sequence, word_offset, node_offset = \ 262 | constituent_to_tree(tokenizer, parse_string, sentence, word_offset, node_offset) 263 | constituent.append(constituent_sequence) 264 | 265 | first_graph = dgl.graph([]) 266 | first_graph.set_n_initializer(dgl.frame.zero_initializer) 267 | num_cons = sum([len(sent_cons[0]) for sent_cons in constituent]) 268 | first_graph.add_nodes(num_cons) 269 | first_graph.ndata['unit'] = torch.ones(num_cons) 270 | first_graph.ndata['dtype'] = torch.ones(num_cons) 271 | 272 | second_graph = dgl.graph([]) 273 | second_graph.set_n_initializer(dgl.frame.zero_initializer) 274 | num_cons = sum([len(sent_cons[0]) for sent_cons in constituent]) 275 | second_graph.add_nodes(num_cons) 276 | second_graph.ndata['unit'] = torch.ones(num_cons) 277 | second_graph.ndata['dtype'] = torch.ones(num_cons) 278 | 279 | claim_first_graph, claim_second_graph, constituent_labels_first, constituent_labels_second = \ 280 | final_graph(constituent, first_graph, second_graph) 281 | return claim_first_graph, claim_second_graph, constituent_labels_first, constituent_labels_second 282 | 283 | 284 | 285 | if __name__ == "__main__": 286 | nlp = spacy.load('en_core_web_sm') 287 | nlp.add_pipe('benepar', config={'model': 'benepar_en3'}) 288 | tokenizer = AutoTokenizer.from_pretrained('roberta-base', do_lower_case=False, use_fast=False) 289 | 290 | # train_data = pd.read_csv('../../data/IAM/claims/train.txt', sep='\t', header=None, quoting=csv.QUOTE_NONE) 291 | # train_data.columns = ['claim_label', 'topic_sentence', 'claim_sentence', 'article_id', 'stance_label'] 292 | # train_data = train_data.dropna(axis=0) 293 | # dev_data = pd.read_csv('../../data/IAM/claims/dev.txt', sep='\t', header=None, quoting=csv.QUOTE_NONE) 294 | # dev_data.columns = ['claim_label', 'topic_sentence', 'claim_sentence', 'article_id', 'stance_label'] 
295 | # dev_data = dev_data.dropna(axis=0) 296 | test_data = pd.read_csv('../../data/IAM/claims/test.txt', sep='\t', header=None, quoting=csv.QUOTE_NONE) 297 | test_data.columns = ['claim_label', 'topic_sentence', 'claim_sentence', 'article_id', 'stance_label'] 298 | test_data = test_data.dropna(axis=0) 299 | 300 | # train_sentences = train_data['claim_sentence'].tolist() 301 | # dev_sentences = dev_data['claim_sentence'].tolist() 302 | test_sentences = test_data['claim_sentence'].tolist() 303 | 304 | total_train = [] 305 | total_dev = [] 306 | total_test = [] 307 | printable = set(string.printable) 308 | 309 | for idx, test in tqdm(enumerate(test_sentences), total=len(test_sentences)): 310 | test = test.lower().replace('“', '"').replace('”', '"') 311 | test = "".join(filter(lambda x : x in printable, test)) 312 | test_first_graph, test_second_graph, test_constituent_labels_first, test_constituent_labels_second = \ 313 | all_process_graph(nlp, tokenizer, test) 314 | #dgl.save_graphs('../../data/IAM/claims/graphs/test_first_graph_{}.dgl'.format(idx), test_first_graph) 315 | #dgl.save_graphs('../../data/IAM/claims/graphs/test_second_graph_{}.dgl'.format(idx), test_second_graph) 316 | total_test.append([test_constituent_labels_first.tolist(), test_constituent_labels_second.tolist()]) 317 | 318 | # for idx, train in tqdm(enumerate(train_sentences), total=len(train_sentences)): 319 | # # train = train.lower().replace("\xa0", '').replace('...', '').replace('—', ' ').replace('“', '').replace('’', "'").strip() 320 | # train = train.lower().replace('“', '"').replace('”', '"') 321 | # train = "".join(filter(lambda x : x in printable, train)) 322 | # 323 | # train_first_graph, train_second_graph, train_constituent_labels_first, train_constituent_labels_second = \ 324 | # all_process_graph(nlp, tokenizer, train) 325 | # dgl.save_graphs('../../data/IAM/claims/graphs/train_first_graph_{}.dgl'.format(idx), train_first_graph) 326 | # dgl.save_graphs('../../data/IAM/claims/graphs/train_second_graph_{}.dgl'.format(idx), train_second_graph) 327 | # total_train.append([train_constituent_labels_first.tolist(), train_constituent_labels_second.tolist()]) 328 | # 329 | # for idx, dev in tqdm(enumerate(dev_sentences), total=len(dev_sentences)): 330 | # dev = dev.lower().replace('“', '"').replace('”', '"') 331 | # dev = "".join(filter(lambda x: x in printable, dev)) 332 | # dev_first_graph, dev_second_graph, dev_constituent_label_first, dev_constituent_label_second \ 333 | # = all_process_graph(nlp, tokenizer, dev) 334 | # dgl.save_graphs('../../data/IAM/claims/graphs/dev_first_graph_{}.dgl'.format(idx), dev_first_graph) 335 | # dgl.save_graphs('../../data/IAM/claims/graphs/dev_second_graph_{}.dgl'.format(idx), dev_second_graph) 336 | # total_dev.append([dev_constituent_label_first.tolist(), dev_constituent_label_second.tolist()]) 337 | 338 | # with open('../../data/IAM/claims/graphs/train_constituent_first_second.txt', 'w', encoding='utf-8') as f: 339 | # for line in total_train: 340 | # f.write(str(line)+'\n') 341 | # 342 | # with open('../../data/IAM/claims/graphs/dev_constituent_first_second.txt', 'w', encoding='utf-8') as f: 343 | # for line in total_dev: 344 | # f.write(str(line)+'\n') 345 | 346 | # with open('../../data/IAM/claims/graphs/test_constituent_first_second.txt', 'w', encoding='utf-8') as f: 347 | # for line in total_test: 348 | # f.write(str(line) + '\n') 349 | -------------------------------------------------------------------------------- /mine_next/functions/stance_main_func.py: 
-------------------------------------------------------------------------------- 1 | import os.path 2 | import os 3 | from tqdm import tqdm 4 | import torch 5 | from torch.utils.data import DataLoader, RandomSampler, SequentialSampler 6 | from transformers.optimization import AdamW, get_linear_schedule_with_warmup 7 | from transformers import AutoConfig, AutoTokenizer 8 | from sklearn.metrics import classification_report, accuracy_score 9 | from sklearn.utils import resample 10 | 11 | import csv 12 | import numpy as np 13 | import pandas as pd 14 | import json 15 | 16 | # from source.claim_classification.model.modeling import KoElectraForClaimClassification 17 | # from source.claim_classification.func.dataset import convert_data2tensordataset 18 | 19 | from mine_next.model.modeling import RobertaForClassification 20 | from mine_next.functions.dataset import ( 21 | convert_data2tensordataset, 22 | convert_stance_data2tensordataset, 23 | convert_only_sentence2tensordataset, 24 | ) 25 | 26 | 27 | def random_downsampling(dataset): 28 | major = dataset[dataset['claim_label'] == 'O'] 29 | minor = dataset[dataset['claim_label'] == 'C'] 30 | sampling_data = resample(major, replace=True, n_samples=len(minor)*5, random_state=42) 31 | train_data = pd.concat([sampling_data, minor]) 32 | return train_data 33 | 34 | 35 | def random_upsampling(dataset): 36 | major = dataset[dataset['claim_label'] == 'O'] 37 | minor = dataset[dataset['claim_label'] == 'C'] 38 | sampling_data = resample(minor, replace=True, n_samples=len(major), random_state=42) 39 | train_data = pd.concat([sampling_data, major]) 40 | return train_data 41 | 42 | def do_train(config, model, optimizer, scheduler, train_dataloader, epoch, global_step): 43 | losses = [] 44 | total_predicts, total_corrects = [], [] 45 | for step, batch in tqdm(enumerate(train_dataloader), desc='do_train(epoch_{})'.format(epoch), total=len(train_dataloader)): 46 | batch = tuple(t.cuda() for t in batch) 47 | # graph 같이 학습할 경우 48 | # idx, input_ids, attention_mask, token_type_ids, labels, sim_labels = batch[0], batch[1], batch[2], batch[3], batch[4], batch[5] 49 | # constituent_labels = batch[6] 50 | # loss, predicts = model( 51 | # idx=idx, 52 | # input_ids=input_ids, 53 | # attention_mask=attention_mask, 54 | # token_type_ids=token_type_ids, 55 | # labels=labels, 56 | # sim_labels=sim_labels, 57 | # all_graph=total_graph, 58 | # constituent_labels=constituent_labels 59 | # ) 60 | # base 61 | idx, input_ids, attention_mask, token_type_ids, stance_labels = batch[0], batch[1], batch[2], batch[3], \ 62 | batch[4] 63 | loss, predicts = model( 64 | idx=idx, 65 | input_ids=input_ids, 66 | attention_mask=attention_mask, 67 | token_type_ids=token_type_ids, 68 | labels=stance_labels, 69 | sim_labels=None, 70 | ) 71 | predicts = predicts.argmax(dim=-1) 72 | predicts = predicts.cpu().detach().numpy().tolist() 73 | labels = stance_labels.cpu().detach().numpy().tolist() 74 | 75 | total_predicts.extend(predicts) 76 | total_corrects.extend(labels) 77 | 78 | if config.gradient_accumulation_steps > 1: 79 | loss = loss / config.gradient_accumulation_steps 80 | # 원래는 tensor(0.7255)이런식 81 | loss.backward() 82 | losses.append(loss.data.item()) 83 | if (step + 1) % config.gradient_accumulation_steps == 0 or \ 84 | (len(train_dataloader) <= config.gradient_accumulation_steps and (step + 1) == len( 85 | train_dataloader)): 86 | torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_grad_norm) 87 | optimizer.step() 88 | scheduler.step() 89 | 90 | model.zero_grad() 91 | global_step 
+= 1 92 | target_names = ['class 0', 'class 1'] 93 | print(classification_report(total_corrects, total_predicts, target_names=target_names, digits=4)) 94 | accuracy = accuracy_score(total_corrects, total_predicts) 95 | return accuracy, np.mean(losses), global_step 96 | 97 | 98 | def do_evaluate(model, dev_dataloader): 99 | total_predicts, total_corrects = [], [] 100 | for step, batch in tqdm(enumerate(dev_dataloader), desc="do_evaluate", total=len(dev_dataloader)): 101 | batch = tuple(t.cuda() for t in batch) 102 | # graph 학습할 경우 103 | # idx, input_ids, attention_mask, token_type_ids, labels, sim_labels = batch[0], batch[1], batch[2], batch[3], batch[4], batch[5] 104 | # constituent_labels = batch[6] 105 | # predicts = model( 106 | # idx=idx, 107 | # input_ids=input_ids, 108 | # attention_mask=attention_mask, 109 | # token_type_ids=token_type_ids, 110 | # all_graph=total_graph, 111 | # constituent_labels=constituent_labels 112 | # ) 113 | # base 114 | idx, input_ids, attention_mask, token_type_ids, stance_labels = batch[0], batch[1], batch[2], batch[3], \ 115 | batch[4] 116 | predicts = model( 117 | idx=idx, 118 | input_ids=input_ids, 119 | attention_mask=attention_mask, 120 | token_type_ids=token_type_ids, 121 | ) 122 | predicts = predicts.argmax(dim=-1) 123 | predicts = predicts.detach().cpu().tolist() 124 | labels = stance_labels.detach().cpu().tolist() 125 | total_predicts.extend(predicts) 126 | total_corrects.extend(labels) 127 | target_names = ['class 0', 'class 1'] 128 | print(classification_report(total_corrects, total_predicts, target_names=target_names, digits=4)) 129 | accuracy = accuracy_score(total_corrects, total_predicts) 130 | return accuracy, total_predicts 131 | 132 | 133 | def train(config, model, tokenizer): 134 | 135 | # 데이터셋 로드 136 | train_data = pd.read_csv(config.stance_train, sep='\t', header=None, quoting=csv.QUOTE_NONE) 137 | #train_data.columns = ['claim_label', 'topic_sentence', 'claim_sentence', 'article_id', 'stance_label'] 138 | train_data.columns = ['claim_label', 'topic_sentence', 'claim_sentence', 'id', 'stance_labels'] 139 | train_data = train_data.dropna(axis=0) 140 | # train_data = train_data[:100] 141 | dev_data = pd.read_csv(config.stance_dev, sep='\t', header=None, quoting=csv.QUOTE_NONE) 142 | dev_data.columns = ['claim_label', 'topic_sentence', 'claim_sentence', 'id', 'stance_labels'] 143 | #dev_data.columns = ['claim_label', 'topic_sentence', 'claim_sentence', 'article_id', 'stance_label'] 144 | dev_data = dev_data.dropna(axis=0) 145 | # dev_data = dev_data[:100] 146 | 147 | #train_data = random_upsampling(train_data) 148 | train_dataset = convert_stance_data2tensordataset(train_data, tokenizer, config.max_length, 'train') 149 | dev_dataset = convert_stance_data2tensordataset(dev_data, tokenizer, config.max_length, 'dev') 150 | 151 | 152 | train_sampler = RandomSampler(train_dataset) 153 | train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=config.batch_size) 154 | dev_sampler = SequentialSampler(dev_dataset) 155 | dev_dataloader = DataLoader(dev_dataset, sampler=dev_sampler, batch_size=config.batch_size) 156 | 157 | t_total = len(train_dataloader) // config.gradient_accumulation_steps * config.epoch 158 | optimizer = AdamW(model.parameters(), lr=config.learning_rate) 159 | scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=config.warmup_steps, num_training_steps=t_total) 160 | 161 | global_step = 0 162 | max_test_accuracy = 0 163 | model.zero_grad() 164 | for epoch in range(config.epoch): 165 | 
model.train() 166 | train_accuracy, average_loss, global_step = do_train( 167 | config=config, model=model, 168 | optimizer=optimizer, scheduler=scheduler, 169 | train_dataloader=train_dataloader, epoch=epoch+1, global_step=global_step) 170 | print("train_accuracy : {}\taverage_loss : {}\n".format(round(train_accuracy, 4), round(average_loss, 4))) 171 | 172 | model.eval() 173 | test_accuracy, _ = do_evaluate(model=model, dev_dataloader=dev_dataloader) 174 | print("test_accuracy : {}\n".format(round(test_accuracy, 4))) 175 | output_dir = os.path.join(config.save_dir, "checkpoint-{}".format(epoch)) 176 | if not os.path.exists(output_dir): 177 | os.makedirs(output_dir) 178 | model_to_save = model.module if hasattr(model, "module") else model 179 | model_to_save.save_pretrained(output_dir) 180 | tokenizer.save_pretrained(output_dir) 181 | torch.save(config, os.path.join(output_dir, "training_args.bin")) 182 | 183 | 184 | def evaluate(config, model, tokenizer): 185 | dev_data = pd.read_csv(config.claim_dev, sep='\t', header=None, quoting=csv.QUOTE_NONE) 186 | dev_data.columns = ['claim_label', 'topic_sentence', 'claim_sentence', 'article_id', 'stance_label'] 187 | dev_data = dev_data.dropna(axis=0) 188 | # dev_data = dev_data[:10] 189 | # dev_dataset = convert_stance_data2tensordataset(dev_data, tokenizer, config.max_length) 190 | dev_dataset, total_graph = convert_only_sentence2tensordataset(dev_data, tokenizer, config.max_length, 'dev') 191 | 192 | dev_sampler = SequentialSampler(dev_dataset) 193 | dev_dataloader = DataLoader(dev_dataset, sampler=dev_sampler, batch_size=config.batch_size) 194 | 195 | test_accuracy, total_predicts = do_evaluate(model=model, dev_dataloader=dev_dataloader, total_graph=total_graph) 196 | print("test accuracy : {}".format(round(test_accuracy,4))) 197 | total_corrects = dev_data['claim_label'].tolist() 198 | total_corrects = [1 if correct == 'C' else 0 for correct in total_corrects] 199 | totaL_claim_sentence = dev_data['claim_sentence'].tolist() 200 | error_list = [] 201 | for predict, correct, claim in zip(total_predicts, total_corrects, totaL_claim_sentence): 202 | if predict != correct: 203 | error = {} 204 | error['predict'] = predict 205 | error['correct'] = correct 206 | error['claim_sentence'] = claim 207 | error_list.append(error) 208 | 209 | with open('../mine/functions/dev_error.json', 'w', encoding='utf-8') as f: 210 | json.dump(error_list, f, indent=4) 211 | -------------------------------------------------------------------------------- /mine_next/functions/test.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import csv 3 | from transformers import RobertaTokenizer 4 | import string 5 | 6 | s = "“Let’s say there’s a government-run test." 
7 | printable = set(string.printable) 8 | print(printable) 9 | print("".join(filter(lambda x: x in printable, s))) 10 | 11 | 12 | all_claim = [] 13 | 14 | def extract_claim(data_file): 15 | data = pd.read_csv(data_file, sep='\t', header=None, quoting=csv.QUOTE_NONE) 16 | data.columns = ['claim_label', 'topic_sentence', 'claim_sentence', 'article_id', 'stance_label'] 17 | data = data.dropna(axis=0) 18 | data = data[data['claim_label'] == 'C'] 19 | claim_data = data['claim_sentence'] 20 | return claim_data.tolist() 21 | # tokenizer = RobertaTokenizer.from_pretrained('roberta-base') 22 | # input_str = 'a significant number of republicans assert that hereditary monarchy is unfair and elitist' 23 | #print(tokenizer.tokenize(input_str)) 24 | 25 | # train_claim = extract_claim('../../data/IAM/claims/train.txt') 26 | # print(train_claim) 27 | # dev_claim = extract_claim('../../data/IAM/claims/dev.txt') 28 | # test_claim = extract_claim('../../data/IAM/claims/test.txt') 29 | # 30 | # all_claim.extend(train_claim) 31 | # all_claim.extend(dev_claim) 32 | # all_claim.extend(test_claim) 33 | # 34 | # with open('../../data/IAM/all_claim_sentence.txt', 'w', encoding='utf-8') as txt_file: 35 | # for claim in all_claim: 36 | # txt_file.write(claim) 37 | # txt_file.write('\n') 38 | 39 | -------------------------------------------------------------------------------- /mine_next/functions/textrank.py: -------------------------------------------------------------------------------- 1 | import spacy 2 | import pytextrank 3 | import os 4 | import pandas as pd 5 | import csv 6 | import json, string 7 | from tqdm import tqdm 8 | 9 | 10 | def make_article_dict(): 11 | topic_dir_list = os.listdir('../../data/IAM/origin/test') 12 | topic_dir_list = [os.path.join('../../data/IAM/origin/test', topic) for topic in topic_dir_list] 13 | article_dict = {} 14 | for topic_dir in topic_dir_list: 15 | file_list = os.listdir(topic_dir) # [21_3.txt, 21_7.txt, ... 
] 16 | file_list_open = [os.path.join(topic_dir, file) for file in file_list] # ../../data/IAM/origin/train/Should_commercial_advertisements_be_allowed_to_be_fictitious/21_3.txt 17 | 18 | for idx, file in zip(file_list, file_list_open): 19 | article_id = idx.split('.')[0] 20 | 21 | num = file.split('/')[-1] 22 | assert idx == num 23 | sentences = [] 24 | with open(file, 'r', encoding='utf-8') as f: 25 | article = f.readlines() 26 | for line in article: 27 | article_sentence = line.split('\t')[0] 28 | sentences.append(article_sentence) 29 | article_dict[article_id] = sentences 30 | 31 | with open('../../data/IAM/origin/test_article_dict.json', 'w', encoding='utf-8') as outfile: 32 | json.dump(article_dict, outfile, indent='\t', ensure_ascii=False) 33 | 34 | def make_pseudo_topic_with_textrank(): 35 | printable = set(string.printable) 36 | nlp = spacy.load("en_core_web_sm") 37 | nlp.add_pipe("textrank") 38 | 39 | datas = json.load(open('../../data/IAM/origin/dev_article_dict.json', encoding='utf-8')) 40 | pseudo_topic = {} 41 | for key, value in tqdm(datas.items(), total=len(datas)): 42 | article_text = " ".join(value) 43 | article_text = article_text.lower().replace('“', '"').replace('”', '"') 44 | article_text = "".join(filter(lambda x : x in printable, article_text)) 45 | doc = nlp(article_text) 46 | topic = [] 47 | for phrase in doc._.phrases[:10]: 48 | topic.append(phrase.text) 49 | # pseudo_topic[key] = " ".join(topic) 50 | pseudo_topic[key] = topic 51 | with open('../../data/IAM/origin/dev_pseudo_topic_with_textrank_list.json', 'w', encoding='utf-8') as file: 52 | json.dump(pseudo_topic, file, indent='\t', ensure_ascii=False) 53 | 54 | #make_article_dict() 55 | # make_pseudo_topic_with_textrank() 56 | # data = json.load(open('../../data/IAM/origin/dev_pseudo_topic_with_textrank_list.json', encoding='utf-8')) 57 | # print(data) 58 | 59 | 60 | # doc = nlp(article.lower()) 61 | # 62 | # # examine the top-ranked phrases in the document 63 | # pseudo_topic = [] 64 | # for phrase in doc._.phrases[:10]: 65 | # #print(phrase) 66 | # print(phrase.text) 67 | # # print(phrase.rank, phrase.count) 68 | # # print(phrase.chunks) 69 | # print() 70 | # 71 | 72 | 73 | total_char_count = 0 74 | total_word_count = 0 75 | topic_dir_list = os.listdir('../../data/IAM/origin/test') 76 | topic_dir_list = [os.path.join('../../data/IAM/origin/test', topic) for topic in topic_dir_list] 77 | article_dict = {} 78 | for topic_dir in topic_dir_list: 79 | file_list = os.listdir(topic_dir) # [21_3.txt, 21_7.txt, ... 
] 80 | file_list_open = [os.path.join(topic_dir, file) for file in file_list] # ../../data/IAM/origin/train/Should_commercial_advertisements_be_allowed_to_be_fictitious/21_3.txt 81 | 82 | for idx, file in zip(file_list, file_list_open): 83 | article_id = idx.split('.')[0] 84 | 85 | num = file.split('/')[-1] 86 | assert idx == num 87 | sentences = [] 88 | with open(file, 'r', encoding='utf-8') as f: 89 | article = f.readlines() 90 | for line in article: 91 | article_sentence = line.split('\t')[0] 92 | word_of_sentence = article_sentence.split(' ') 93 | total_char_count += len(article_sentence) 94 | total_word_count += len(word_of_sentence) 95 | print(total_char_count) 96 | print(total_word_count) 97 | -------------------------------------------------------------------------------- /mine_next/functions/txt2json.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import json 3 | 4 | import pandas as pd 5 | 6 | dataset = pd.read_csv('../../data/IAM/stance/test.txt', sep='\t', header=None, quoting=csv.QUOTE_NONE) 7 | dataset.columns = ['claim_label', 'topic_sentence', 'claim_sentence', 'article_id', 'stance_label'] 8 | dataset = dataset.dropna(axis=0) 9 | 10 | # claim_sentences = dataset['claim_sentence'].tolist() 11 | # claim_labels = ['non claim' if label is 'O' else 'claim' for label in dataset['claim_label'].tolist()] 12 | stance_sentences = dataset['claim_sentence'] 13 | stance_label = dataset['stance_label'] 14 | 15 | label_dict = {} 16 | label_dict['-1'] = 'contest' 17 | label_dict['1'] = 'support' 18 | label_dict['0'] = 'non-claim' 19 | data_json = [] 20 | for sentence, label in zip(stance_sentences, stance_label): 21 | content = {} 22 | content['text'] = sentence 23 | content['label'] = label_dict[str(label)] 24 | data_json.append(content) 25 | 26 | with open('../../data/IAM/stance/IAM_stance_test.json', 'w', encoding='utf-8') as outfile: 27 | json.dump(data_json, outfile, indent='\t', ensure_ascii=False) -------------------------------------------------------------------------------- /mine_next/functions/use_bertopic.py: -------------------------------------------------------------------------------- 1 | from bertopic import BERTopic 2 | from sklearn.datasets import fetch_20newsgroups 3 | from hdbscan import HDBSCAN 4 | from transformers import BertModel 5 | from sentence_transformers import SentenceTransformer 6 | import pandas as pd 7 | import os, json 8 | from os import listdir 9 | from os.path import isfile, join 10 | from sklearn.manifold import TSNE 11 | from tqdm import tqdm 12 | 13 | 14 | def topic_sentences(mode): 15 | sentences = [] 16 | article_ids = [] 17 | topic_dir_list = os.listdir('../../data/IAM/origin/{}'.format(mode)) 18 | topic_dir_list = sorted([os.path.join('../../data/IAM/origin/{}'.format(mode), topic) for topic in topic_dir_list]) 19 | 20 | for topic_dir in topic_dir_list: 21 | file_list = os.listdir(topic_dir) 22 | file_list_open = sorted([os.path.join(topic_dir, file) for file in file_list]) 23 | 24 | for idx, file in zip(file_list, file_list_open): 25 | article_id = idx.split('.')[0] 26 | sentence = [] 27 | with open(file, 'r', encoding='utf-8') as f: 28 | article = f.readlines() 29 | for line in article: 30 | article_sentence = line.split('\t')[0] 31 | #sentences.append(article_sentence) 32 | sentence.append(article_sentence) 33 | sentences.append(' '.join(sent for sent in sentence)) 34 | article_ids.append(article_id) 35 | return article_ids, sentences 36 | 37 | train_ids, train_sentences = 
topic_sentences('train') 38 | dev_ids, dev_sentences = topic_sentences('dev') 39 | test_ids, test_sentences = topic_sentences('test') 40 | 41 | def topic_modeling(): 42 | ''' 43 | 1. extract embeddings 44 | 2. reduce dimensionality 45 | 3. cluster reduced embeddings 46 | 4. tokenize topics 47 | 5. create topic representatioin 48 | ''' 49 | embedding_model = SentenceTransformer("all-MiniLM-L12-v2") 50 | hdbscan_model = HDBSCAN(min_cluster_size=3, metric='euclidean', cluster_selection_method='eom', prediction_data=True) 51 | topic_model = BERTopic( 52 | embedding_model=embedding_model, 53 | hdbscan_model=hdbscan_model, 54 | # diversity=0.2 55 | ) 56 | topic_model.save('../../data/IAM/origin/topic_modeling_div0.0') 57 | 58 | topic_modeling() 59 | topic_model = BERTopic.load('../../data/IAM/origin/topic_modeling_div0.0') 60 | topics, probs = topic_model.fit_transform(train_sentences) 61 | print(topic_model.get_topic_info()) 62 | # topic_model.visualize_topics().write_html("../../data/IAM/origin/intertopic_dist_map_div0.2.html") 63 | # topic_model.visualize_documents(train_sentences).write_html("../../data/IAM/origin/projections_div0.2.html") 64 | 65 | 66 | def make_pseudo_topic_with_bertopic(ids, sentences, topic_model, mode): 67 | pseudo_topic_dict = {} 68 | for idx, sentence in tqdm(zip(ids, sentences), total=len(ids), desc='{} processing ...'.format(mode)): 69 | # 여기서 sentence 는 기사 하나라고 생각하면 된다 70 | pseudo_topic = topic_model.get_topic(topic=topic_model.transform(sentence)[0][0]) 71 | pseudo_topic = ' '.join([topic_word[0] for topic_word in pseudo_topic]) 72 | pseudo_topic_dict[idx] = pseudo_topic 73 | with open('../../data/IAM/origin/{}_pseudo_topic_with_bertopic_div0.0.json'.format(mode), 'w', encoding='utf-8') as file: 74 | json.dump(pseudo_topic_dict, file, indent='\t', ensure_ascii=False) 75 | 76 | make_pseudo_topic_with_bertopic(train_ids, train_sentences, topic_model, 'train') 77 | make_pseudo_topic_with_bertopic(dev_ids, dev_sentences, topic_model, 'dev') 78 | make_pseudo_topic_with_bertopic(test_ids, test_sentences, topic_model, 'test') 79 | 80 | 81 | # with open('../../data/IAM/origin/test_pseudo_topic_with_bertopic.json', 'w', encoding='utf-8') as file: 82 | # json.dump(pseudo_topic_dict, file, indent='\t', ensure_ascii=False) 83 | 84 | # embedding_model = SentenceTransformer("all-MiniLM-L12-v2") 85 | # # embedding_model = SentenceTransformer("all-mpnet-base-v2") 86 | # cluster_model = HDBSCAN(min_cluster_size=3, metric='euclidean', cluster_selection_method='eom', prediction_data=True) 87 | # topic_model = BERTopic(embedding_model=embedding_model, hdbscan_model=cluster_model) 88 | #topic_model.save('../../data/IAM/origin/topic_model') 89 | -------------------------------------------------------------------------------- /mine_next/functions/use_bertopic2.py: -------------------------------------------------------------------------------- 1 | from bertopic import BERTopic 2 | from sklearn.datasets import fetch_20newsgroups 3 | from hdbscan import HDBSCAN 4 | from transformers import BertModel 5 | from sentence_transformers import SentenceTransformer 6 | import pandas as pd 7 | import os, json 8 | from os import listdir 9 | from os.path import isfile, join 10 | from sklearn.manifold import TSNE 11 | from tqdm import tqdm 12 | 13 | 14 | def topic_sentences(mode): 15 | sentences = [] 16 | article_ids = [] 17 | topic_dir_list = os.listdir('../../data/IAM/origin/{}'.format(mode)) 18 | topic_dir_list = sorted([os.path.join('../../data/IAM/origin/{}'.format(mode), topic) for topic in 
topic_dir_list]) 19 | 20 | for topic_dir in topic_dir_list: 21 | file_list = os.listdir(topic_dir) 22 | file_list_open = sorted([os.path.join(topic_dir, file) for file in file_list]) 23 | 24 | for idx, file in zip(file_list, file_list_open): 25 | article_id = idx.split('.')[0] 26 | sentence = [] 27 | with open(file, 'r', encoding='utf-8') as f: 28 | article = f.readlines() 29 | for line in article: 30 | article_sentence = line.split('\t')[0] 31 | #sentences.append(article_sentence) 32 | sentence.append(article_sentence) 33 | sentences.append(' '.join(sent for sent in sentence)) 34 | article_ids.append(article_id) 35 | return article_ids, sentences 36 | 37 | train_ids, train_sentences = topic_sentences('train') 38 | dev_ids, dev_sentences = topic_sentences('dev') 39 | test_ids, test_sentences = topic_sentences('test') 40 | 41 | def topic_modeling(): 42 | ''' 43 | 1. extract embeddings 44 | 2. reduce dimensionality 45 | 3. cluster reduced embeddings 46 | 4. tokenize topics 47 | 5. create topic representatioin 48 | ''' 49 | embedding_model = SentenceTransformer("all-MiniLM-L12-v2") 50 | hdbscan_model = HDBSCAN(min_cluster_size=3, metric='euclidean', cluster_selection_method='eom', prediction_data=True) 51 | topic_model = BERTopic( 52 | embedding_model=embedding_model, 53 | hdbscan_model=hdbscan_model, 54 | diversity=0.2 55 | ) 56 | topic_model.save('../../data/IAM/origin/topic_modeling_div0.2') 57 | 58 | topic_modeling() 59 | topic_model = BERTopic.load('../../data/IAM/origin/topic_modeling_div0.2') 60 | topics, probs = topic_model.fit_transform(train_sentences) 61 | print(topic_model.get_topic_info()) 62 | # topic_model.visualize_topics().write_html("../../data/IAM/origin/intertopic_dist_map_div0.2.html") 63 | # topic_model.visualize_documents(train_sentences).write_html("../../data/IAM/origin/projections_div0.2.html") 64 | 65 | 66 | def make_pseudo_topic_with_bertopic(ids, sentences, topic_model, mode): 67 | pseudo_topic_dict = {} 68 | for idx, sentence in tqdm(zip(ids, sentences), total=len(ids), desc='{} processing ...'.format(mode)): 69 | # 여기서 sentence 는 기사 하나라고 생각하면 된다 70 | pseudo_topic = topic_model.get_topic(topic=topic_model.transform(sentence)[0][0]) 71 | pseudo_topic = ' '.join([topic_word[0] for topic_word in pseudo_topic]) 72 | pseudo_topic_dict[idx] = pseudo_topic 73 | with open('../../data/IAM/origin/{}_pseudo_topic_with_bertopic_div0.2.json'.format(mode), 'w', encoding='utf-8') as file: 74 | json.dump(pseudo_topic_dict, file, indent='\t', ensure_ascii=False) 75 | 76 | make_pseudo_topic_with_bertopic(train_ids, train_sentences, topic_model, 'train') 77 | make_pseudo_topic_with_bertopic(dev_ids, dev_sentences, topic_model, 'dev') 78 | make_pseudo_topic_with_bertopic(test_ids, test_sentences, topic_model, 'test') 79 | 80 | 81 | # with open('../../data/IAM/origin/test_pseudo_topic_with_bertopic.json', 'w', encoding='utf-8') as file: 82 | # json.dump(pseudo_topic_dict, file, indent='\t', ensure_ascii=False) 83 | 84 | # embedding_model = SentenceTransformer("all-MiniLM-L12-v2") 85 | # # embedding_model = SentenceTransformer("all-mpnet-base-v2") 86 | # cluster_model = HDBSCAN(min_cluster_size=3, metric='euclidean', cluster_selection_method='eom', prediction_data=True) 87 | # topic_model = BERTopic(embedding_model=embedding_model, hdbscan_model=cluster_model) 88 | #topic_model.save('../../data/IAM/origin/topic_model') 89 | -------------------------------------------------------------------------------- /mine_next/functions/use_bertopic3.py: 
-------------------------------------------------------------------------------- 1 | from bertopic import BERTopic 2 | from sklearn.datasets import fetch_20newsgroups 3 | from hdbscan import HDBSCAN 4 | from transformers import BertModel 5 | from sentence_transformers import SentenceTransformer 6 | import pandas as pd 7 | import os, json 8 | from os import listdir 9 | from os.path import isfile, join 10 | from sklearn.manifold import TSNE 11 | from tqdm import tqdm 12 | 13 | 14 | def topic_sentences(mode): 15 | sentences = [] 16 | article_ids = [] 17 | topic_dir_list = os.listdir('../../data/IAM/origin/{}'.format(mode)) 18 | topic_dir_list = sorted([os.path.join('../../data/IAM/origin/{}'.format(mode), topic) for topic in topic_dir_list]) 19 | 20 | for topic_dir in topic_dir_list: 21 | file_list = os.listdir(topic_dir) 22 | file_list_open = sorted([os.path.join(topic_dir, file) for file in file_list]) 23 | 24 | for idx, file in zip(file_list, file_list_open): 25 | article_id = idx.split('.')[0] 26 | sentence = [] 27 | with open(file, 'r', encoding='utf-8') as f: 28 | article = f.readlines() 29 | for line in article: 30 | article_sentence = line.split('\t')[0] 31 | #sentences.append(article_sentence) 32 | sentence.append(article_sentence) 33 | sentences.append(' '.join(sent for sent in sentence)) 34 | article_ids.append(article_id) 35 | return article_ids, sentences 36 | 37 | train_ids, train_sentences = topic_sentences('train') 38 | dev_ids, dev_sentences = topic_sentences('dev') 39 | test_ids, test_sentences = topic_sentences('test') 40 | temp_ids = [ids.split('_')[0] for ids in train_ids] 41 | 42 | def topic_modeling(): 43 | ''' 44 | 1. extract embeddings 45 | 2. reduce dimensionality 46 | 3. cluster reduced embeddings 47 | 4. tokenize topics 48 | 5. 
create topic representatioin 49 | ''' 50 | embedding_model = SentenceTransformer("all-MiniLM-L12-v2") 51 | hdbscan_model = HDBSCAN(min_cluster_size=3, metric='euclidean', cluster_selection_method='eom', prediction_data=True) 52 | topic_model = BERTopic( 53 | embedding_model=embedding_model, 54 | hdbscan_model=hdbscan_model, 55 | diversity=0.1 56 | ) 57 | topic_model.save('../../data/IAM/origin/topic_modeling_div0.1') 58 | 59 | #topic_modeling() 60 | topic_model = BERTopic.load('../../data/IAM/origin/topic_modeling_div0.1') 61 | topics, probs = topic_model.fit_transform(train_sentences) 62 | print(topic_model.get_topic_info()) 63 | #print(topic_model.topic_embeddings_) 64 | # 88개의 topic_embeddings_ 65 | #topic_model.visualize_topics(width=1500, height=1500).write_html("../../data/IAM/origin/intertopic_dist_map_div0.1.html") 66 | #topic_model.visualize_documents(train_sentences, width=2000, height=2000).write_html("../../data/IAM/origin/projections_div0.1.html") 67 | topic_model.visualize_topics_per_class(topic_model.topics_per_class(train_sentences, temp_ids), width=1500, height=1500, top_n_topics=20).write_html('../../data/IAM/origin/topic_per_class_div.0.1.html') 68 | # for i in range(10): 69 | # print(topic_model.transform(train_sentences[i])) 70 | 71 | def make_pseudo_topic_with_bertopic(ids, sentences, topic_model, mode): 72 | pseudo_topic_dict = {} 73 | for idx, sentence in tqdm(zip(ids, sentences), total=len(ids), desc='{} processing ...'.format(mode)): 74 | # 여기서 sentence 는 기사 하나라고 생각하면 된다 75 | pseudo_topic = topic_model.get_topic(topic=topic_model.transform(sentence)[0][0]) 76 | pseudo_topic = ' '.join([topic_word[0] for topic_word in pseudo_topic]) 77 | pseudo_topic_dict[idx] = pseudo_topic 78 | with open('../../data/IAM/origin/{}_pseudo_topic_with_bertopic_div0.1.json'.format(mode), 'w', encoding='utf-8') as file: 79 | json.dump(pseudo_topic_dict, file, indent='\t', ensure_ascii=False) 80 | 81 | # make_pseudo_topic_with_bertopic(train_ids, train_sentences, topic_model, 'train') 82 | # make_pseudo_topic_with_bertopic(dev_ids, dev_sentences, topic_model, 'dev') 83 | # make_pseudo_topic_with_bertopic(test_ids, test_sentences, topic_model, 'test') 84 | 85 | 86 | # with open('../../data/IAM/origin/test_pseudo_topic_with_bertopic.json', 'w', encoding='utf-8') as file: 87 | # json.dump(pseudo_topic_dict, file, indent='\t', ensure_ascii=False) 88 | 89 | # embedding_model = SentenceTransformer("all-MiniLM-L12-v2") 90 | # # embedding_model = SentenceTransformer("all-mpnet-base-v2") 91 | # cluster_model = HDBSCAN(min_cluster_size=3, metric='euclidean', cluster_selection_method='eom', prediction_data=True) 92 | # topic_model = BERTopic(embedding_model=embedding_model, hdbscan_model=cluster_model) 93 | #topic_model.save('../../data/IAM/origin/topic_model') 94 | -------------------------------------------------------------------------------- /mine_next/functions/use_firstsent.py: -------------------------------------------------------------------------------- 1 | from bertopic import BERTopic 2 | from sklearn.datasets import fetch_20newsgroups 3 | from hdbscan import HDBSCAN 4 | from transformers import BertModel 5 | from sentence_transformers import SentenceTransformer 6 | import pandas as pd 7 | import os, json 8 | from os import listdir 9 | from os.path import isfile, join 10 | 11 | def topic_sentences(mode): 12 | sentences = [] 13 | article_ids = [] 14 | topic_dir_list = os.listdir('../../data/IAM/origin/{}'.format(mode)) 15 | topic_dir_list = 
[os.path.join('../../data/IAM/origin/{}'.format(mode), topic) for topic in topic_dir_list] 16 | 17 | for topic_dir in topic_dir_list: 18 | file_list = os.listdir(topic_dir) 19 | file_list_open = [os.path.join(topic_dir, file) for file in file_list] 20 | 21 | for idx, file in zip(file_list, file_list_open): 22 | article_id = idx.split('.')[0] 23 | sentence = [] 24 | with open(file, 'r', encoding='utf-8') as f: 25 | article = f.readlines() 26 | for line in article: 27 | article_sentence = line.split('\t')[0] 28 | #sentences.append(article_sentence) 29 | sentence.append(article_sentence) 30 | sentences.append(sentence[0]) 31 | #sentences.append(' '.join(sent for sent in sentence)) 32 | article_ids.append(article_id) 33 | return article_ids, sentences 34 | 35 | train_ids, train_sentences = topic_sentences('train') 36 | dev_ids, dev_sentences = topic_sentences('dev') 37 | fit_data = train_sentences + dev_sentences 38 | test_ids, test_sentences = topic_sentences('test') 39 | # embedding_model = SentenceTransformer("all-MiniLM-L12-v2") 40 | # # embedding_model = SentenceTransformer("all-mpnet-base-v2") 41 | # cluster_model = HDBSCAN(min_cluster_size=3, metric='euclidean', cluster_selection_method='eom', prediction_data=True) 42 | # topic_model = BERTopic(embedding_model=embedding_model, hdbscan_model=cluster_model) 43 | #topic_model.save('../../data/IAM/origin/topic_model') 44 | # 45 | 46 | 47 | #topic_model = BERTopic.load('../../data/IAM/origin/topic_model') 48 | #topics, probs = topic_model.fit_transform(fit_data) 49 | 50 | #print(topic_model.get_topic_info()) 51 | 52 | def make_pseudo_topic_with_bertopic(ids, sentences, mode): 53 | pseudo_topic_dict = {} 54 | for idx, sentence in zip(ids, sentences): 55 | #pseudo_topic = topic_model.get_topic(topic=topic_model.transform(sentence)[0][0]) 56 | #pseudo_topic = ' '.join([topic_word[0] for topic_word in pseudo_topic]) 57 | pseudo_topic_dict[idx] = sentence 58 | with open('../../data/IAM/origin/{}_pseudo_topic_with_first_sent.json'.format(mode), 'w', encoding='utf-8') as file: 59 | json.dump(pseudo_topic_dict, file, indent='\t', ensure_ascii=False) 60 | 61 | make_pseudo_topic_with_bertopic(train_ids, train_sentences, 'train') 62 | make_pseudo_topic_with_bertopic(dev_ids, dev_sentences, 'dev') 63 | make_pseudo_topic_with_bertopic(test_ids, test_sentences, 'test') 64 | 65 | 66 | # with open('../../data/IAM/origin/test_pseudo_topic_with_bertopic.json', 'w', encoding='utf-8') as file: 67 | # json.dump(pseudo_topic_dict, file, indent='\t', ensure_ascii=False) -------------------------------------------------------------------------------- /mine_next/model/__pycache__/modeling.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KUNLP/Argument_Classifier/a44f3764516e0a3450e545fbc9e8caa12f8ee466/mine_next/model/__pycache__/modeling.cpython-37.pyc -------------------------------------------------------------------------------- /mine_next/model/modeling.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoTokenizer, AutoModel, AutoConfig, \ 2 | RobertaPreTrainedModel, RobertaModel 3 | import torch 4 | import torch.nn as nn 5 | from abc import ABC 6 | import dgl 7 | import dgl.function as fn 8 | import dgl.nn.pytorch as dglnn 9 | import torch.nn.functional as F 10 | from dgl import DGLGraph 11 | 12 | 13 | class CGATLayer(nn.Module, ABC): 14 | """ Constituent-Constituent GATLayer """ 15 | 16 | def __init__(self, 
in_dim, feat_embed_size, out_dim, num_heads): 17 | super(CGATLayer, self).__init__() 18 | self.fc = nn.Linear(in_dim, out_dim * num_heads, bias=False) 19 | self.attn_fc = nn.Linear(2 * out_dim, 1, bias=False) 20 | self.num_heads = num_heads 21 | self.reset_parameters() 22 | 23 | def reset_parameters(self): 24 | gain = nn.init.calculate_gain('relu') 25 | nn.init.xavier_normal_(self.fc.weight, gain=gain) 26 | nn.init.xavier_normal_(self.attn_fc.weight, gain=gain) 27 | 28 | def edge_attention(self, edges): 29 | z2 = torch.cat([edges.src['z'], edges.dst['z']], dim=2) 30 | a = self.attn_fc(z2) 31 | return {'e': F.leaky_relu(a)} 32 | 33 | def message_func(self, edges): 34 | return {'z': edges.src['z'], 'e': edges.data['e']} 35 | 36 | def reduce_func(self, nodes): 37 | alpha = F.softmax(nodes.mailbox['e'], dim=1) 38 | h = torch.sum(alpha * nodes.mailbox['z'], dim=1) 39 | return {'h': h} 40 | 41 | def forward(self, g, h, edge_type=None): 42 | z = self.fc(h) 43 | num_tokens, emb_size = z.size() 44 | z = z.reshape([num_tokens, self.num_heads, emb_size // self.num_heads]) 45 | cons_node_ids = g.filter_nodes(lambda nodes: nodes.data['dtype'] == 1) 46 | cc_edge_id = g.filter_edges(lambda edges: edges.data["dtype"] == edge_type) 47 | self_edge_id = g.filter_edges(lambda edges: edges.data["dtype"] == 4) 48 | cc_edge_id = torch.cat([cc_edge_id, self_edge_id], dim=0) 49 | g.nodes[cons_node_ids].data['z'] = z 50 | g.apply_edges(self.edge_attention, edges=cc_edge_id) 51 | g.pull(cons_node_ids, self.message_func, self.reduce_func) 52 | g.ndata.pop('z') 53 | h = g.ndata.pop('h') 54 | return h[cons_node_ids] 55 | 56 | 57 | class CTGATLayer(nn.Module, ABC): 58 | """ Constituent-Token GATLayer """ 59 | 60 | def __init__(self, in_dim, feat_embed_size, out_dim, num_heads): 61 | super(CTGATLayer, self).__init__() 62 | self.fc = nn.Linear(in_dim, out_dim * num_heads, bias=False) 63 | self.attn_fc = nn.Linear(2 * out_dim, 1, bias=False) 64 | self.num_heads = num_heads 65 | self.reset_parameters() 66 | 67 | def reset_parameters(self): 68 | gain = nn.init.calculate_gain('relu') 69 | nn.init.xavier_normal_(self.fc.weight, gain=gain) 70 | nn.init.xavier_normal_(self.attn_fc.weight, gain=gain) 71 | 72 | def edge_attention(self, edges): 73 | z2 = torch.cat([edges.src['z'], edges.dst['z']], dim=2) 74 | a = self.attn_fc(z2) 75 | return {'e': F.leaky_relu(a)} 76 | 77 | def message_func(self, edges): 78 | return {'z': edges.src['z'], 'e': edges.data['e']} 79 | 80 | def reduce_func(self, nodes): 81 | alpha = F.softmax(nodes.mailbox['e'], dim=1) 82 | h = torch.sum(alpha * nodes.mailbox['z'], dim=1) 83 | return {'h': h} 84 | 85 | def forward(self, g, h, edge_type=None): 86 | z = self.fc(h) 87 | num_tokens, emb_size = z.size() 88 | z = z.reshape([num_tokens, self.num_heads, emb_size // self.num_heads]) 89 | token_node_ids = g.filter_nodes(lambda nodes: nodes.data['dtype'] == 0) 90 | cons_node_ids = g.filter_nodes(lambda nodes: nodes.data['dtype'] == 1) 91 | ct_edge_id = g.filter_edges(lambda edges: edges.data["dtype"] == 5) 92 | g.nodes[cons_node_ids].data['z'] = z 93 | g.apply_edges(self.edge_attention, edges=ct_edge_id) 94 | g.pull(token_node_ids, self.message_func, self.reduce_func) 95 | g.ndata.pop('z') 96 | h = g.ndata.pop('h') 97 | return h[token_node_ids] 98 | 99 | 100 | class MultiHeadGATLayer(nn.Module, ABC): 101 | def __init__(self, layer, in_size, out_size, feat_embed_size, num_heads, config, merge='cat', layer_norm_eps=1e-12): 102 | super(MultiHeadGATLayer, self).__init__() 103 | self.heads = nn.ModuleList() 104 | 
out_dim = out_size // num_heads 105 | self.layer = layer(in_size, feat_embed_size, out_dim, num_heads) 106 | self.merge = merge 107 | self.dropout = nn.Dropout(p=0.2) 108 | self.LayerNorm = nn.LayerNorm(out_size, eps=layer_norm_eps) 109 | 110 | def forward(self, g, o, h, edge_type=None): 111 | head_outs = self.layer(g, self.dropout(h), edge_type) 112 | num_tokens = head_outs.size()[0] 113 | if self.merge == 'cat': 114 | out = head_outs.reshape([num_tokens, -1]) 115 | else: 116 | out = torch.mean(head_outs, dim=1) 117 | out = o + F.elu(out) 118 | out = self.LayerNorm(out) 119 | return out 120 | 121 | 122 | class GCNLayer(nn.Module): 123 | def __init__(self, in_feats, out_feats): 124 | super(GCNLayer, self).__init__() 125 | self.linear = nn.Linear(in_feats, out_feats) 126 | self.gcn_msg = fn.copy_u(u='h', out='m') 127 | self.gcn_reduce = fn.sum(msg='m', out='h') 128 | def forward(self, g, feature): 129 | with g.local_scope(): 130 | g.ndata['h'] = feature 131 | g.update_all(self.gcn_msg, self.gcn_reduce) 132 | h = g.ndata['h'] 133 | return self.linear(h) 134 | 135 | 136 | class MultiCGNLayer(nn.Module): 137 | def __init__(self): 138 | super(MultiCGNLayer, self).__init__() 139 | self.hidden_size * 2 + self.cons_hidden_size 140 | self.layer1 = GCNLayer() 141 | 142 | 143 | class GraphEmbedding(nn.Module): 144 | def __init__(self, in_dim, hidden_dim): 145 | super(GraphEmbedding, self).__init__() 146 | self.conv1 = dglnn.GraphConv(in_dim, hidden_dim) 147 | self.conv2 = dglnn.GraphConv(hidden_dim, hidden_dim) 148 | self.conv3 = dglnn.GraphConv(hidden_dim, hidden_dim) 149 | nn.init.xavier_normal_(self.conv1.weight) 150 | nn.init.xavier_normal_(self.conv2.weight) 151 | nn.init.xavier_normal_(self.conv3.weight) 152 | def forward(self, g, h): 153 | h = F.relu(self.conv1(g, h)) 154 | h = F.relu(self.conv2(g, h)) 155 | h = F.relu(self.conv3(g, h)) 156 | #with g.local_scope(): 157 | g.ndata['h'] = h 158 | hg = dgl.mean_nodes(g, 'h') 159 | return hg 160 | 161 | 162 | class GraphEmbedding2(nn.Module): 163 | def __init__(self, in_dim, hidden_dim, out_dim): 164 | super(GraphEmbedding2, self).__init__() 165 | self.conv1 = dglnn.GraphConv(in_dim, hidden_dim, allow_zero_in_degree=True) 166 | self.conv2 = dglnn.GraphConv(hidden_dim, hidden_dim, allow_zero_in_degree=True) 167 | self.conv3 = dglnn.GraphConv(hidden_dim, out_dim, allow_zero_in_degree=True) 168 | # nn.init.xavier_normal_(self.conv1.weight) 169 | # nn.init.xavier_normal_(self.conv2.weight) 170 | # nn.init.xavier_normal_(self.conv3.weight) 171 | def forward(self, g, h): 172 | h = F.relu(self.conv1(g, h)) 173 | h = F.relu(self.conv2(g, h)) 174 | h = F.relu(self.conv3(g, h)) 175 | with g.local_scope(): 176 | g.ndata['h'] = h 177 | hg = dgl.mean_nodes(g, 'h') 178 | return hg 179 | 180 | 181 | class RobertaReflectGraphWithGrandEdgeClassification(RobertaPreTrainedModel): 182 | def __init__(self, config): 183 | super().__init__(config) 184 | self.num_labels = config.num_labels 185 | self.hidden_size = config.hidden_size 186 | self.cons_hidden_size = config.cons_hidden_size 187 | self.roberta = RobertaModel(config) 188 | self.feature_size = config.feature_size 189 | # when using both graphs 190 | # self.claim_layer = nn.Sequential( 191 | # nn.Linear(in_features=self.hidden_size+2*self.feature_size, out_features=self.hidden_size+2*self.feature_size), 192 | # nn.ReLU(), 193 | # nn.Linear(in_features=self.hidden_size+2*self.feature_size, out_features=self.num_labels), 194 | # ) 195 | # when using only one graph 196 | self.claim_layer = nn.Sequential( 197 |
nn.Linear(in_features=self.hidden_size + self.feature_size, 198 | out_features=self.hidden_size + self.feature_size), 199 | nn.ReLU(), 200 | nn.Linear(in_features=self.hidden_size + self.feature_size, out_features=self.num_labels), 201 | ) 202 | # 원래는 self.hidden_size + self.cons_hidden_size 203 | self.cons_type_embeddings = nn.Embedding(len(config.cons_tag2id), self.cons_hidden_size) 204 | # nn.init.uniform_(self.cons_type_embeddings.weight, -1.0, 1.0) 205 | self.softmax = nn.Softmax(dim=-1) 206 | self.graph_embedding = GraphEmbedding2(self.cons_hidden_size, self.cons_hidden_size, self.feature_size) 207 | 208 | def forward(self, idx=None, input_ids=None, attention_mask=None, token_type_ids=None, labels=None, sim_labels=None, all_graph=None, 209 | constituent_labels_first=None, constituent_labels_second=None): 210 | output = self.roberta(input_ids=input_ids, attention_mask=attention_mask) 211 | output = output.last_hidden_state 212 | 213 | graph_conv_first = [] 214 | graph_conv_second = [] 215 | for (graph_id, first, second) in zip(idx, constituent_labels_first, constituent_labels_second): 216 | curr_first_g = all_graph[0][int(graph_id.item())].to("cuda") 217 | first_mask = first != -1 218 | first_label = first[first_mask] 219 | first_cons_node_feature = self.cons_type_embeddings(first_label) 220 | curr_first_g_conv = self.graph_embedding(curr_first_g, first_cons_node_feature) 221 | graph_conv_first.append(curr_first_g_conv) 222 | 223 | curr_second_g = all_graph[1][int(graph_id.item())].to("cuda") 224 | second_mask = second != -1 225 | second_label = second[second_mask] 226 | second_cons_node_feature = self.cons_type_embeddings(second_label) 227 | curr_second_g_conv = self.graph_embedding(curr_second_g, second_cons_node_feature) 228 | graph_conv_second.append(curr_second_g_conv) 229 | 230 | # graph_conv_reult = torch.stack(graph_conv_reult, dim=0) 231 | # cls = output[:, 0, : ] -> (4, 768) 232 | graph_conv_first = torch.stack(graph_conv_first, dim=0) 233 | graph_conv_second = torch.stack(graph_conv_second, dim=0) 234 | 235 | cls_token = output[:, 0, :].unsqueeze(dim=1) 236 | #cls_graph_concat = torch.cat([cls_token, graph_conv_first, graph_conv_second], dim=-1) 237 | cls_graph_concat = torch.cat([cls_token, graph_conv_second], dim=-1) 238 | #cls_graph_concat = torch.cat([cls_token, graph_conv_second], dim=-1) 239 | 240 | logit = self.claim_layer(cls_graph_concat) 241 | logit = logit.squeeze(dim=1) 242 | if labels is not None: 243 | loss_func = nn.CrossEntropyLoss() 244 | loss = loss_func(logit, labels) 245 | return loss, self.softmax(logit) 246 | else: 247 | return logit 248 | 249 | 250 | class RobertaReflectGraphClassification(RobertaPreTrainedModel): 251 | def __init__(self, config): 252 | super().__init__(config) 253 | self.num_labels = config.num_labels 254 | self.hidden_size = config.hidden_size 255 | self.cons_hidden_size = config.cons_hidden_size 256 | self.roberta = RobertaModel(config) 257 | self.feature_size = config.feature_size 258 | self.claim_layer = nn.Sequential( 259 | nn.Linear(in_features=self.hidden_size+2*self.feature_size, out_features=self.hidden_size+2*self.feature_size), 260 | nn.ReLU(), 261 | nn.Linear(in_features=self.hidden_size+2*self.feature_size, out_features=self.num_labels), 262 | )# 원래는 self.hidden_size + self.cons_hidden_size 263 | self.cons_type_embeddings = nn.Embedding(len(config.cons_tag2id), self.cons_hidden_size) 264 | # nn.init.uniform_(self.cons_type_embeddings.weight, -1.0, 1.0) 265 | self.softmax = nn.Softmax(dim=-1) 266 | self.graph_embedding = 
GraphEmbedding2(self.cons_hidden_size, self.cons_hidden_size, self.feature_size) 267 | 268 | def forward(self, idx=None, input_ids=None, attention_mask=None, token_type_ids=None, labels=None, sim_labels=None, all_graph=None, 269 | constituent_labels_first=None, constituent_labels_second=None): 270 | output = self.roberta(input_ids=input_ids, attention_mask=attention_mask) 271 | output = output.last_hidden_state 272 | 273 | graph_conv_first = [] 274 | graph_conv_second = [] 275 | for (graph_id, first, second) in zip(idx, constituent_labels_first, constituent_labels_second): 276 | curr_first_g = all_graph[0][int(graph_id.item())].to("cuda") 277 | first_mask = first != -1 278 | first_label = first[first_mask] 279 | first_cons_node_feature = self.cons_type_embeddings(first_label) 280 | curr_first_g_conv = self.graph_embedding(curr_first_g, first_cons_node_feature) 281 | graph_conv_first.append(curr_first_g_conv) 282 | 283 | curr_second_g = all_graph[1][int(graph_id.item())].to("cuda") 284 | second_mask = second != -1 285 | second_label = second[second_mask] 286 | second_cons_node_feature = self.cons_type_embeddings(second_label) 287 | curr_second_g_conv = self.graph_embedding(curr_second_g, second_cons_node_feature) 288 | graph_conv_second.append(curr_second_g_conv) 289 | 290 | # graph_conv_reult = torch.stack(graph_conv_reult, dim=0) 291 | # cls = output[:, 0, : ] -> (4, 768) 292 | graph_conv_first = torch.stack(graph_conv_first, dim=0) 293 | graph_conv_second = torch.stack(graph_conv_second, dim=0) 294 | 295 | cls_token = output[:, 0, :].unsqueeze(dim=1) 296 | # cls_graph_concat = torch.cat([cls_token, graph_conv_first], dim=-1) 297 | #cls_graph_concat = torch.cat([graph_conv_first,cls_token], dim=-1) 298 | cls_graph_concat = torch.cat([cls_token, graph_conv_first, graph_conv_second], dim=-1) 299 | logit = self.claim_layer(cls_graph_concat) 300 | logit = logit.squeeze(dim=1) 301 | if labels is not None: 302 | loss_func = nn.CrossEntropyLoss() 303 | loss = loss_func(logit, labels) 304 | return loss, self.softmax(logit) 305 | else: 306 | return logit 307 | 308 | 309 | class RobertaForClassification(RobertaPreTrainedModel): 310 | def __init__(self, config): 311 | super().__init__(config) 312 | self.num_labels = config.num_labels 313 | self.hidden_size = config.hidden_size 314 | self.roberta = RobertaModel(config) 315 | self.claim_layer = nn.Sequential( 316 | nn.Linear(in_features=self.hidden_size, out_features=self.hidden_size), 317 | nn.ReLU(), 318 | nn.Linear(in_features=self.hidden_size, out_features=self.num_labels), 319 | ) 320 | self.softmax = nn.Softmax(dim=-1) 321 | 322 | def forward(self, idx=None, input_ids=None, attention_mask=None, token_type_ids=None, labels=None, sim_labels=None): 323 | output = self.roberta(input_ids=input_ids, attention_mask=attention_mask) 324 | output = output.last_hidden_state 325 | 326 | logit = self.claim_layer(output[:, 0, :]) 327 | if labels is not None: 328 | loss_func = nn.CrossEntropyLoss() 329 | loss = loss_func(logit, labels) 330 | return loss, self.softmax(logit) 331 | else: 332 | return logit 333 | 334 | 335 | class RobertaForStanceClassification(RobertaPreTrainedModel): 336 | def __init__(self, config): 337 | super().__init__(config) 338 | self.num_labels = config.num_labels 339 | self.hidden_size = config.hidden_size 340 | self.roberta = RobertaModel(config) 341 | self.claim_layer = nn.Sequential( 342 | nn.Linear(in_features=self.hidden_size, out_features=self.hidden_size), 343 | nn.ReLU(), 344 | nn.Linear(in_features=self.hidden_size, 
out_features=self.num_labels), 345 | ) 346 | self.softmax = nn.Softmax(dim=-1) 347 | 348 | def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, labels=None): 349 | output = self.roberta(input_ids=input_ids, attention_mask=attention_mask) 350 | output = output.last_hidden_state 351 | 352 | logit = self.claim_layer(output[:, 0, :]) 353 | if labels is not None: 354 | loss_func = nn.CrossEntropyLoss() 355 | loss = loss_func(logit, labels) 356 | return loss, self.softmax(logit) 357 | else: 358 | return logit 359 | 360 | 361 | class RobertaForSTANCY(RobertaPreTrainedModel): 362 | def __init__(self, config): 363 | super().__init__(config) 364 | self.num_labels = config.num_labels 365 | self.hidden_size = config.hidden_size 366 | self.roberta = RobertaModel(config) 367 | self.claim_layer = nn.Sequential( 368 | nn.Linear(in_features=self.hidden_size+1, out_features=self.hidden_size+1), 369 | nn.ReLU(), 370 | nn.Linear(in_features=self.hidden_size+1, out_features=self.num_labels), 371 | ) 372 | self.softmax = nn.Softmax(dim=-1) 373 | self.cosine = nn.CosineSimilarity() 374 | 375 | def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, labels=None, sim_labels=None): 376 | 377 | output_combine = self.roberta(input_ids=input_ids, attention_mask=attention_mask) 378 | #output_combine = output_combine.last_hidden_state 379 | output_combine = output_combine.pooler_output 380 | sent_attention_mask = (1-token_type_ids) * attention_mask 381 | output_sent = self.roberta(input_ids=input_ids, attention_mask=sent_attention_mask) 382 | #output_sent = output_sent.last_hidden_state 383 | output_sent = output_sent.pooler_output 384 | cos_sim = self.cosine(output_combine, output_sent).unsqueeze(1) 385 | combined = torch.cat([output_combine, cos_sim], dim=1) 386 | 387 | logit = self.claim_layer(combined) 388 | 389 | if labels is not None: 390 | loss_func = nn.CrossEntropyLoss() 391 | loss_bert = loss_func(logit, labels) 392 | 393 | loss_cosine = nn.CosineEmbeddingLoss() 394 | loss_claim = loss_cosine(output_combine, output_sent, sim_labels) 395 | loss = loss_bert + loss_claim 396 | return loss, self.softmax(logit) 397 | else: 398 | return logit 399 | -------------------------------------------------------------------------------- /mine_next/run_base.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from mine_next.functions.main_function import train, evaluate 3 | import random, os 4 | import numpy as np 5 | import torch 6 | from transformers import AutoConfig, AutoTokenizer 7 | from mine_next.model.modeling import RobertaForClassification, RobertaForSTANCY, RobertaForStanceClassification 8 | from mine_next.model.modeling import RobertaReflectGraphClassification 9 | from mine_next.functions.sent_to_graph import get_cons_tag_vocab 10 | 11 | def create_model(args): 12 | config = AutoConfig.from_pretrained( 13 | args.language_model, 14 | num_labels=args.num_labels, 15 | max_length=args.max_length, 16 | # local_files_only=True 17 | ) 18 | tokenizer = AutoTokenizer.from_pretrained(args.language_model, do_lower_case=False, use_fast=False) 19 | setattr(config, 'cons_hidden_size', args.cons_hidden_size) 20 | setattr(config, 'cons_tag2id', args.cons_tag2id) 21 | model = RobertaForClassification.from_pretrained( 22 | args.language_model if args.init_weight else os.path.join(args.output_dir, "checkpoint-{}".format(args.checkpoint)), 23 | config=config, 24 | # local_files_only=True 25 | ) 26 | return config, tokenizer, model 27 | 
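# Illustrative sketch (assumption, not part of the original run_base.py): train() in this repo writes each epoch with
# save_pretrained() into <save_dir>/checkpoint-<epoch> (see stance_main_func.train), and create_model() above reloads
# such a directory when init_weight is False. A minimal standalone version of that reload step, with a hypothetical
# checkpoint path, would look like this:
#   from transformers import AutoConfig, AutoTokenizer
#   from mine_next.model.modeling import RobertaForClassification
#   ckpt_dir = 'only_sentence_base_3e_5/checkpoint-3'   # hypothetical example path
#   config = AutoConfig.from_pretrained(ckpt_dir)
#   tokenizer = AutoTokenizer.from_pretrained(ckpt_dir, use_fast=False)
#   model = RobertaForClassification.from_pretrained(ckpt_dir, config=config)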
28 | 29 | def set_seed(args): 30 | random.seed(args.seed) 31 | np.random.seed(args.seed) 32 | torch.manual_seed(args.seed) 33 | if torch.cuda.is_available(): 34 | torch.cuda.manual_seed_all(args.seed) 35 | 36 | 37 | def main(args): 38 | set_seed(args) 39 | config, tokenizer, model = create_model(args) 40 | model.to(args.device) 41 | 42 | if args.mode == 'train': 43 | train(args, model, tokenizer) 44 | elif args.mode == 'dev': 45 | evaluate(args, model, tokenizer) 46 | 47 | 48 | if __name__ == '__main__': 49 | parser = argparse.ArgumentParser(description='main') 50 | parser.add_argument('--claim_train', type=str, default='../data/IAM/claims/train.txt') 51 | parser.add_argument('--claim_dev', type=str, default='../data/IAM/claims/dev.txt') 52 | parser.add_argument('--save_dir', type=str, default='only_sentence_base_3e_5') # 모델 불러올 dir 53 | parser.add_argument('--output_dir', type=str, default='only_sentence_base_3e_5') # 모델 저장할 dir 54 | parser.add_argument('--constituent_gold_vocab', type=str, default='../data/IAM/constituent_gold_vocab.txt') 55 | parser.add_argument('--init_weight', type=bool, default=True) # False면 학습된거. True면 쌩 로버타 56 | parser.add_argument('--device', type=str, default="cuda") 57 | #model 58 | parser.add_argument('--num_labels', type=int, default=2) 59 | parser.add_argument('--max_length', type=int, default=512) 60 | parser.add_argument('--batch_size', type=int, default=16) 61 | parser.add_argument('--cons_tag2id', type=dict, default=get_cons_tag_vocab('../data/IAM/constituent_gold_vocab.txt')) 62 | parser.add_argument('--cons_hidden_size', type=int, default=128) 63 | 64 | parser.add_argument("--gradient_accumulation_steps", type=int, default=1) 65 | parser.add_argument("--learning_rate", type=float, default=3e-5) 66 | parser.add_argument("--warmup_steps", type=int, default=0) 67 | parser.add_argument("--max_grad_norm", type=float, default=5.0) 68 | parser.add_argument('--mode', type=str, default='dev') 69 | parser.add_argument('--seed', type=int, default=42) 70 | parser.add_argument('--checkpoint', type=int, default=1) 71 | parser.add_argument('--language_model', type=str, default='roberta-base') 72 | parser.add_argument("--epoch", type=int, default=15) 73 | 74 | args = parser.parse_args() 75 | main(args) 76 | -------------------------------------------------------------------------------- /mine_next/run_debug.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from mine_next.functions.main_function2 import train, evaluate 3 | import random, os 4 | import numpy as np 5 | import torch 6 | from transformers import AutoConfig, AutoTokenizer 7 | from mine_next.model.modeling import RobertaForClassification, RobertaForSTANCY, RobertaForStanceClassification 8 | from mine_next.model.modeling import RobertaReflectGraphClassification 9 | from mine_next.functions.sent_to_graph import get_cons_tag_vocab 10 | 11 | 12 | def create_model(args): 13 | config = AutoConfig.from_pretrained( 14 | args.language_model, 15 | num_labels=args.num_labels, 16 | max_length=args.max_length, 17 | # local_files_only=True 18 | ) 19 | tokenizer = AutoTokenizer.from_pretrained(args.language_model, do_lower_case=False, use_fast=False) 20 | setattr(config, 'cons_hidden_size', args.cons_hidden_size) 21 | setattr(config, 'cons_tag2id', args.cons_tag2id) 22 | model = RobertaReflectGraphClassification.from_pretrained( 23 | args.language_model if args.init_weight else os.path.join(args.output_dir, "checkpoint-{}".format(args.checkpoint)), 24 | 
config=config, 25 | # local_files_only=True 26 | ) 27 | return config, tokenizer, model 28 | 29 | def set_seed(args): 30 | random.seed(args.seed) 31 | np.random.seed(args.seed) 32 | torch.manual_seed(args.seed) 33 | if torch.cuda.is_available(): 34 | torch.cuda.manual_seed_all(args.seed) 35 | 36 | def main(args): 37 | set_seed(args) 38 | config, tokenizer, model = create_model(args) 39 | model.to(args.device) 40 | 41 | if args.mode == 'train': 42 | train(args, model, tokenizer) 43 | elif args.mode == 'dev': 44 | evaluate(args, model, tokenizer) 45 | 46 | if __name__ == '__main__': 47 | parser = argparse.ArgumentParser(description='main') 48 | parser.add_argument('--claim_train', type=str, default='../data/IAM/claims/train.txt') 49 | parser.add_argument('--claim_dev', type=str, default='../data/IAM/claims/dev.txt') 50 | parser.add_argument('--save_dir', type=str, default='only_sentence_base_graph_cons_256') # 모델 불러올 dir 51 | parser.add_argument('--output_dir', type=str, default='only_sentence_base_graph_cons_256') # 모델 저장할 dir 52 | parser.add_argument('--constituent_gold_vocab', type=str, default='../data/IAM/constituent_gold_vocab.txt') 53 | parser.add_argument('--init_weight', type=bool, default=True) # False면 학습된거. True면 쌩 로버타 54 | parser.add_argument('--device', type=str, default="cuda") 55 | #model 56 | parser.add_argument('--num_labels', type=int, default=2) 57 | parser.add_argument('--max_length', type=int, default=512) 58 | parser.add_argument('--batch_size', type=int, default=16) 59 | parser.add_argument('--cons_tag2id', type=dict, default=get_cons_tag_vocab('../data/IAM/constituent_gold_vocab.txt')) 60 | parser.add_argument('--cons_hidden_size', type=int, default=256) 61 | 62 | parser.add_argument("--gradient_accumulation_steps", type=int, default=1) 63 | parser.add_argument("--learning_rate", type=float, default=3e-5) 64 | parser.add_argument("--warmup_steps", type=int, default=0) 65 | parser.add_argument("--max_grad_norm", type=float, default=5.0) 66 | parser.add_argument('--mode', type=str, default='train') 67 | parser.add_argument('--seed', type=int, default=42) 68 | parser.add_argument('--checkpoint', type=int, default=5) 69 | parser.add_argument('--language_model', type=str, default='roberta-base') 70 | parser.add_argument("--epoch", type=int, default=30) 71 | 72 | args = parser.parse_args() 73 | main(args) 74 | -------------------------------------------------------------------------------- /mine_next/run_grad1.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from mine_next.functions.main_function2 import train, evaluate, test 3 | import random, os 4 | import numpy as np 5 | import torch 6 | from transformers import AutoConfig, AutoTokenizer 7 | from mine_next.model.modeling import RobertaForClassification, RobertaForSTANCY, RobertaForStanceClassification 8 | from mine_next.model.modeling import RobertaReflectGraphClassification, RobertaReflectGraphWithGrandEdgeClassification 9 | from mine_next.functions.sent_to_graph import get_cons_tag_vocab 10 | 11 | def create_model(args): 12 | config = AutoConfig.from_pretrained( 13 | args.language_model, 14 | num_labels=args.num_labels, 15 | max_length=args.max_length, 16 | # local_files_only=True 17 | ) 18 | tokenizer = AutoTokenizer.from_pretrained(args.language_model, do_lower_case=False, use_fast=False) 19 | setattr(config, 'cons_hidden_size', args.cons_hidden_size) 20 | setattr(config, 'feature_size', args.feature_size) 21 | setattr(config, 'cons_tag2id', 
args.cons_tag2id) 22 | model = RobertaReflectGraphClassification.from_pretrained( 23 | args.language_model if args.init_weight else os.path.join(args.output_dir, "checkpoint-{}".format(args.checkpoint)), 24 | config=config, 25 | # local_files_only=True 26 | ) 27 | return config, tokenizer, model 28 | 29 | 30 | def set_seed(args): 31 | random.seed(args.seed) 32 | np.random.seed(args.seed) 33 | torch.manual_seed(args.seed) 34 | if torch.cuda.is_available(): 35 | torch.cuda.manual_seed_all(args.seed) 36 | 37 | 38 | def main(args): 39 | set_seed(args) 40 | config, tokenizer, model = create_model(args) 41 | model.to(args.device) 42 | 43 | if args.mode == 'train': 44 | train(args, model, tokenizer) 45 | elif args.mode == 'dev': 46 | evaluate(args, model, tokenizer) 47 | elif args.mode == 'test': 48 | test(args, model, tokenizer) 49 | 50 | 51 | if __name__ == '__main__': 52 | parser = argparse.ArgumentParser(description='main') 53 | parser.add_argument('--claim_train', type=str, default='../data/IAM/claims/train.txt') 54 | parser.add_argument('--claim_dev', type=str, default='../data/IAM/claims/dev.txt') 55 | parser.add_argument('--claim_test', type=str, default='../data/IAM/claims/test.txt') 56 | parser.add_argument('--constituent_gold_vocab', type=str, default='../data/IAM/constituent_gold_vocab.txt') 57 | # parser.add_argument('--train_pseudo_topic', type=str, default='../data/IAM/origin/train_pseudo_topic_with_first_sent.json') 58 | # parser.add_argument('--dev_pseudo_topic', type=str, default='../data/IAM/origin/dev_pseudo_topic_with_first_sent.json') 59 | # parser.add_argument('--test_pseudo_topic', type=str, default='../data/IAM/origin/test_pseudo_topic_with_first_sent.json') 60 | parser.add_argument('--train_pseudo_topic', type=str, 61 | default='../data/IAM/origin/train_pseudo_topic_with_bertopic_div0.2.json') 62 | parser.add_argument('--dev_pseudo_topic', type=str, 63 | default='../data/IAM/origin/dev_pseudo_topic_with_bertopic_div0.2.json') 64 | parser.add_argument('--test_pseudo_topic', type=str, 65 | default='../data/IAM/origin/test_pseudo_topic_with_bertopic_div0.2.json') 66 | parser.add_argument('--init_weight', type=bool, default=True) # False면 학습된거. 
True면 쌩 로버타 67 | parser.add_argument('--device', type=str, default="cuda") 68 | #model 69 | parser.add_argument('--num_labels', type=int, default=2) 70 | parser.add_argument('--max_length', type=int, default=256) 71 | parser.add_argument('--batch_size', type=int, default=32) 72 | parser.add_argument('--cons_tag2id', type=dict, default=get_cons_tag_vocab('../data/IAM/constituent_gold_vocab.txt')) 73 | parser.add_argument('--cons_hidden_size', type=int, default=768) 74 | parser.add_argument('--feature_size', type=int, default=384) 75 | 76 | parser.add_argument("--gradient_accumulation_steps", type=int, default=1) 77 | parser.add_argument("--learning_rate", type=float, default=3e-5) 78 | parser.add_argument("--warmup_steps", type=int, default=0) 79 | parser.add_argument("--max_grad_norm", type=float, default=5.0) 80 | parser.add_argument('--mode', type=str, default='train') 81 | parser.add_argument('--seed', type=int, default=42) 82 | parser.add_argument('--checkpoint', type=int, default=1) 83 | parser.add_argument('--language_model', type=str, default='roberta-base') 84 | parser.add_argument("--epoch", type=int, default=40) 85 | parser.add_argument('--save_dir', type=str, default='pseudo_topic_with_bertopic_div02_base_graph_normal_only_pc_cons_768_feat_384_max_length_256_lr_3e5') # 모델 저장할 dir 86 | parser.add_argument('--output_dir', type=str, default='pseudo_topic_with_bertopic_div02_base_graph_normal_only_pc_cons_768_feat_384_max_length_256_lr_3e5') # 모델 불러올 dir 87 | 88 | args = parser.parse_args() 89 | main(args) 90 | -------------------------------------------------------------------------------- /mine_next/run_grand.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from mine_next.functions.main_function2 import train, evaluate, test 3 | import random, os 4 | import numpy as np 5 | import torch 6 | from transformers import AutoConfig, AutoTokenizer 7 | from mine_next.model.modeling import RobertaForClassification, RobertaForSTANCY, RobertaForStanceClassification 8 | from mine_next.model.modeling import RobertaReflectGraphClassification, RobertaReflectGraphWithGrandEdgeClassification 9 | from mine_next.functions.sent_to_graph import get_cons_tag_vocab 10 | 11 | def create_model(args): 12 | config = AutoConfig.from_pretrained( 13 | args.language_model, 14 | num_labels=args.num_labels, 15 | max_length=args.max_length, 16 | # local_files_only=True 17 | ) 18 | tokenizer = AutoTokenizer.from_pretrained(args.language_model, do_lower_case=False, use_fast=False) 19 | setattr(config, 'cons_hidden_size', args.cons_hidden_size) 20 | setattr(config, 'feature_size', args.feature_size) 21 | setattr(config, 'cons_tag2id', args.cons_tag2id) 22 | model = RobertaReflectGraphClassification.from_pretrained( 23 | args.language_model if args.init_weight else os.path.join(args.output_dir, "checkpoint-{}".format(args.checkpoint)), 24 | config=config, 25 | # local_files_only=True 26 | ) 27 | return config, tokenizer, model 28 | 29 | 30 | def set_seed(args): 31 | random.seed(args.seed) 32 | np.random.seed(args.seed) 33 | torch.manual_seed(args.seed) 34 | if torch.cuda.is_available(): 35 | torch.cuda.manual_seed_all(args.seed) 36 | 37 | 38 | def main(args): 39 | set_seed(args) 40 | config, tokenizer, model = create_model(args) 41 | model.to(args.device) 42 | 43 | if args.mode == 'train': 44 | train(args, model, tokenizer) 45 | elif args.mode == 'dev': 46 | evaluate(args, model, tokenizer) 47 | elif args.mode == 'test': 48 | test(args, model, tokenizer) 49 | 
50 | 51 | if __name__ == '__main__': 52 | parser = argparse.ArgumentParser(description='main') 53 | parser.add_argument('--claim_train', type=str, default='../data/IAM/claims/train.txt') 54 | parser.add_argument('--claim_dev', type=str, default='../data/IAM/claims/dev.txt') 55 | parser.add_argument('--claim_test', type=str, default='../data/IAM/claims/test.txt') 56 | parser.add_argument('--constituent_gold_vocab', type=str, default='../data/IAM/constituent_gold_vocab.txt') 57 | # parser.add_argument('--train_pseudo_topic', type=str, default='../data/IAM/origin/train_pseudo_topic_with_first_sent.json') 58 | # parser.add_argument('--dev_pseudo_topic', type=str, default='../data/IAM/origin/dev_pseudo_topic_with_first_sent.json') 59 | # parser.add_argument('--test_pseudo_topic', type=str, default='../data/IAM/origin/test_pseudo_topic_with_first_sent.json') 60 | parser.add_argument('--train_pseudo_topic', type=str, 61 | default='../data/IAM/origin/train_pseudo_topic_with_bertopic_div0.1.json') 62 | parser.add_argument('--dev_pseudo_topic', type=str, 63 | default='../data/IAM/origin/dev_pseudo_topic_with_bertopic_div0.1.json') 64 | parser.add_argument('--test_pseudo_topic', type=str, 65 | default='../data/IAM/origin/test_pseudo_topic_with_bertopic_div0.1.json') 66 | parser.add_argument('--init_weight', type=bool, default=True) # False면 학습된거. True면 쌩 로버타 67 | parser.add_argument('--device', type=str, default="cuda") 68 | #model 69 | parser.add_argument('--num_labels', type=int, default=2) 70 | parser.add_argument('--max_length', type=int, default=256) 71 | parser.add_argument('--batch_size', type=int, default=32) 72 | parser.add_argument('--cons_tag2id', type=dict, default=get_cons_tag_vocab('../data/IAM/constituent_gold_vocab.txt')) 73 | parser.add_argument('--cons_hidden_size', type=int, default=768) 74 | parser.add_argument('--feature_size', type=int, default=384) 75 | 76 | parser.add_argument("--gradient_accumulation_steps", type=int, default=1) 77 | parser.add_argument("--learning_rate", type=float, default=3e-5) 78 | parser.add_argument("--warmup_steps", type=int, default=0) 79 | parser.add_argument("--max_grad_norm", type=float, default=5.0) 80 | parser.add_argument('--mode', type=str, default='train') 81 | parser.add_argument('--seed', type=int, default=42) 82 | parser.add_argument('--checkpoint', type=int, default=1) 83 | parser.add_argument('--language_model', type=str, default='roberta-base') 84 | parser.add_argument("--epoch", type=int, default=40) 85 | parser.add_argument('--save_dir', type=str, default='pseudo_topic_with_bertopic_div01_base_graph_normal_only_pc_cons_768_feat_384_max_length_256_lr_3e5') # 모델 저장할 dir 86 | parser.add_argument('--output_dir', type=str, default='pseudo_topic_with_bertopic_div01_base_graph_normal_only_pc_cons_768_feat_384_max_length_256_lr_3e5') # 모델 불러올 dir 87 | 88 | args = parser.parse_args() 89 | main(args) 90 | -------------------------------------------------------------------------------- /mine_next/run_grand2: -------------------------------------------------------------------------------- 1 | import argparse 2 | from mine_next.functions.main_function2 import train, evaluate, test 3 | import random, os 4 | import numpy as np 5 | import torch 6 | from transformers import AutoConfig, AutoTokenizer 7 | from mine_next.model.modeling import RobertaForClassification, RobertaForSTANCY, RobertaForStanceClassification 8 | from mine_next.model.modeling import RobertaReflectGraphClassification, RobertaReflectGraphWithGrandEdgeClassification 9 | from 
mine_next.functions.sent_to_graph import get_cons_tag_vocab 10 | 11 | 12 | def create_model(args): 13 | config = AutoConfig.from_pretrained( 14 | args.language_model, 15 | num_labels=args.num_labels, 16 | max_length=args.max_length, 17 | # local_files_only=True 18 | ) 19 | tokenizer = AutoTokenizer.from_pretrained(args.language_model, do_lower_case=False, use_fast=False) 20 | setattr(config, 'cons_hidden_size', args.cons_hidden_size) 21 | setattr(config, 'feature_size', args.feature_size) 22 | setattr(config, 'cons_tag2id', args.cons_tag2id) 23 | model = RobertaReflectGraphWithGrandEdgeClassification.from_pretrained( 24 | args.language_model if args.init_weight else os.path.join(args.output_dir, "checkpoint-{}".format(args.checkpoint)), 25 | config=config, 26 | # local_files_only=True 27 | ) 28 | return config, tokenizer, model 29 | 30 | def set_seed(args): 31 | random.seed(args.seed) 32 | np.random.seed(args.seed) 33 | torch.manual_seed(args.seed) 34 | if torch.cuda.is_available(): 35 | torch.cuda.manual_seed_all(args.seed) 36 | 37 | def main(args): 38 | set_seed(args) 39 | config, tokenizer, model = create_model(args) 40 | model.to(args.device) 41 | 42 | if args.mode == 'train': 43 | train(args, model, tokenizer) 44 | elif args.mode == 'dev': 45 | evaluate(args, model, tokenizer) 46 | elif args.mode == 'test': 47 | test(args, model, tokenizer) 48 | 49 | if __name__ == '__main__': 50 | parser = argparse.ArgumentParser(description='main') 51 | parser.add_argument('--claim_train', type=str, default='../data/IAM/claims/train.txt') 52 | parser.add_argument('--claim_dev', type=str, default='../data/IAM/claims/dev.txt') 53 | parser.add_argument('--claim_test', type=str, default='../data/IAM/claims/test.txt') 54 | parser.add_argument('--constituent_gold_vocab', type=str, default='../data/IAM/constituent_gold_vocab.txt') 55 | parser.add_argument('--train_pseudo_topic', type=str, default='../data/IAM/origin/train_pseudo_topic_with_bertopic.json') 56 | parser.add_argument('--dev_pseudo_topic', type=str, default='../data/IAM/origin/dev_pseudo_topic_with_bertopic.json') 57 | parser.add_argument('--test_pseudo_topic', type=str, default='../data/IAM/origin/test_pseudo_topic_with_bertopic.json') 58 | parser.add_argument('--init_weight', type=bool, default=True) # False면 학습된거. 
True면 쌩 로버타 59 | parser.add_argument('--device', type=str, default="cuda") 60 | #model 61 | parser.add_argument('--num_labels', type=int, default=2) 62 | parser.add_argument('--max_length', type=int, default=256) 63 | parser.add_argument('--batch_size', type=int, default=32) 64 | parser.add_argument('--cons_tag2id', type=dict, default=get_cons_tag_vocab('../data/IAM/constituent_gold_vocab.txt')) 65 | parser.add_argument('--cons_hidden_size', type=int, default=768) 66 | parser.add_argument('--feature_size', type=int, default=384) 67 | 68 | parser.add_argument("--gradient_accumulation_steps", type=int, default=1) 69 | parser.add_argument("--learning_rate", type=float, default=2e-5) 70 | parser.add_argument("--warmup_steps", type=int, default=0) 71 | parser.add_argument("--max_grad_norm", type=float, default=5.0) 72 | parser.add_argument('--mode', type=str, default='train') 73 | parser.add_argument('--seed', type=int, default=42) 74 | parser.add_argument('--checkpoint', type=int, default=5) 75 | parser.add_argument('--language_model', type=str, default='roberta-base') 76 | parser.add_argument("--epoch", type=int, default=40) 77 | parser.add_argument('--save_dir', type=str, default='pseudo_topic_with_bertopic_sentence_base_two_graph_cons_768_feat_384_max_length_256_lr_2e5') # 모델 저장할 dir 78 | parser.add_argument('--output_dir', type=str, default='pseudo_topic_with_bertopic_sentence_base_two_granh_cons_768_feat_384_max_length_256_lr_2e5') # 모델 불러올 dir 79 | 80 | args = parser.parse_args() 81 | main(args) 82 | -------------------------------------------------------------------------------- /mine_next/run_grand2.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from mine_next.functions.main_function2 import train, evaluate, test 3 | import random, os 4 | import numpy as np 5 | import torch 6 | from transformers import AutoConfig, AutoTokenizer 7 | from mine_next.model.modeling import RobertaForClassification, RobertaForSTANCY, RobertaForStanceClassification 8 | from mine_next.model.modeling import RobertaReflectGraphClassification, RobertaReflectGraphWithGrandEdgeClassification 9 | from mine_next.functions.sent_to_graph import get_cons_tag_vocab 10 | 11 | def create_model(args): 12 | config = AutoConfig.from_pretrained( 13 | args.language_model, 14 | num_labels=args.num_labels, 15 | max_length=args.max_length, 16 | # local_files_only=True 17 | ) 18 | tokenizer = AutoTokenizer.from_pretrained(args.language_model, do_lower_case=False, use_fast=False) 19 | setattr(config, 'cons_hidden_size', args.cons_hidden_size) 20 | setattr(config, 'feature_size', args.feature_size) 21 | setattr(config, 'cons_tag2id', args.cons_tag2id) 22 | model = RobertaReflectGraphClassification.from_pretrained( 23 | args.language_model if args.init_weight else os.path.join(args.output_dir, "checkpoint-{}".format(args.checkpoint)), 24 | config=config, 25 | # local_files_only=True 26 | ) 27 | return config, tokenizer, model 28 | 29 | 30 | def set_seed(args): 31 | random.seed(args.seed) 32 | np.random.seed(args.seed) 33 | torch.manual_seed(args.seed) 34 | if torch.cuda.is_available(): 35 | torch.cuda.manual_seed_all(args.seed) 36 | 37 | 38 | def main(args): 39 | set_seed(args) 40 | config, tokenizer, model = create_model(args) 41 | model.to(args.device) 42 | 43 | if args.mode == 'train': 44 | train(args, model, tokenizer) 45 | elif args.mode == 'dev': 46 | evaluate(args, model, tokenizer) 47 | elif args.mode == 'test': 48 | test(args, model, tokenizer) 49 | 50 | 51 | if 
__name__ == '__main__': 52 | parser = argparse.ArgumentParser(description='main') 53 | parser.add_argument('--claim_train', type=str, default='../data/IAM/claims/train.txt') 54 | parser.add_argument('--claim_dev', type=str, default='../data/IAM/claims/dev.txt') 55 | parser.add_argument('--claim_test', type=str, default='../data/IAM/claims/test.txt') 56 | parser.add_argument('--constituent_gold_vocab', type=str, default='../data/IAM/constituent_gold_vocab.txt') 57 | parser.add_argument('--train_pseudo_topic', type=str, default='../data/IAM/origin/train_pseudo_topic_with_bertopic.json') 58 | parser.add_argument('--dev_pseudo_topic', type=str, default='../data/IAM/origin/dev_pseudo_topic_with_bertopic.json') 59 | parser.add_argument('--test_pseudo_topic', type=str, default='../data/IAM/origin/test_pseudo_topic_with_bertopic.json') 60 | parser.add_argument('--init_weight', type=bool, default=False) # False면 학습된거. True면 쌩 로버타 61 | parser.add_argument('--device', type=str, default="cuda") 62 | #model 63 | parser.add_argument('--num_labels', type=int, default=2) 64 | parser.add_argument('--max_length', type=int, default=256) 65 | parser.add_argument('--batch_size', type=int, default=32) 66 | parser.add_argument('--cons_tag2id', type=dict, default=get_cons_tag_vocab('../data/IAM/constituent_gold_vocab.txt')) 67 | parser.add_argument('--cons_hidden_size', type=int, default=768) 68 | parser.add_argument('--feature_size', type=int, default=384) 69 | 70 | parser.add_argument("--gradient_accumulation_steps", type=int, default=1) 71 | parser.add_argument("--learning_rate", type=float, default=3e-5) 72 | parser.add_argument("--warmup_steps", type=int, default=0) 73 | parser.add_argument("--max_grad_norm", type=float, default=5.0) 74 | parser.add_argument('--mode', type=str, default='test') 75 | parser.add_argument('--seed', type=int, default=42) 76 | parser.add_argument('--checkpoint', type=int, default=24) 77 | parser.add_argument('--language_model', type=str, default='roberta-base') 78 | parser.add_argument("--epoch", type=int, default=40) 79 | parser.add_argument('--save_dir', type=str, default='only_sentence_base_graph_normal_only_pc_cons_768_feat_384_max_length_256_lr_3e5') # 모델 저장할 dir 80 | parser.add_argument('--output_dir', type=str, default='only_sentence_base_graph_normal_only_pc_cons_768_feat_384_max_length_256_lr_3e5') # 모델 불러올 dir 81 | 82 | args = parser.parse_args() 83 | main(args) 84 | -------------------------------------------------------------------------------- /mine_next/run_grand3_test.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from mine_next.functions.main_function2 import train, evaluate, test 3 | import random, os 4 | import numpy as np 5 | import torch 6 | from transformers import AutoConfig, AutoTokenizer 7 | from mine_next.model.modeling import RobertaForClassification, RobertaForSTANCY, RobertaForStanceClassification 8 | from mine_next.model.modeling import RobertaReflectGraphClassification, RobertaReflectGraphWithGrandEdgeClassification 9 | from mine_next.functions.sent_to_graph import get_cons_tag_vocab 10 | 11 | 12 | def create_model(args): 13 | config = AutoConfig.from_pretrained( 14 | args.language_model, 15 | num_labels=args.num_labels, 16 | max_length=args.max_length, 17 | # local_files_only=True 18 | ) 19 | tokenizer = AutoTokenizer.from_pretrained(args.language_model, do_lower_case=False, use_fast=False) 20 | setattr(config, 'cons_hidden_size', args.cons_hidden_size) 21 | setattr(config, 'feature_size', 
args.feature_size) 22 | setattr(config, 'cons_tag2id', args.cons_tag2id) 23 | model = RobertaReflectGraphWithGrandEdgeClassification.from_pretrained( 24 | args.language_model if args.init_weight else os.path.join(args.output_dir, "checkpoint-{}".format(args.checkpoint)), 25 | config=config, 26 | # local_files_only=True 27 | ) 28 | return config, tokenizer, model 29 | 30 | def set_seed(args): 31 | random.seed(args.seed) 32 | np.random.seed(args.seed) 33 | torch.manual_seed(args.seed) 34 | if torch.cuda.is_available(): 35 | torch.cuda.manual_seed_all(args.seed) 36 | 37 | def main(args): 38 | set_seed(args) 39 | config, tokenizer, model = create_model(args) 40 | model.to(args.device) 41 | 42 | if args.mode == 'train': 43 | train(args, model, tokenizer) 44 | elif args.mode == 'dev': 45 | evaluate(args, model, tokenizer) 46 | elif args.mode == 'test': 47 | test(args, model, tokenizer) 48 | 49 | if __name__ == '__main__': 50 | parser = argparse.ArgumentParser(description='main') 51 | parser.add_argument('--claim_train', type=str, default='../data/IAM/claims/train.txt') 52 | parser.add_argument('--claim_dev', type=str, default='../data/IAM/claims/dev.txt') 53 | parser.add_argument('--claim_test', type=str, default='../data/IAM/claims/test.txt') 54 | parser.add_argument('--constituent_gold_vocab', type=str, default='../data/IAM/constituent_gold_vocab.txt') 55 | parser.add_argument('--train_pseudo_topic', type=str, default='../data/IAM/origin/train_pseudo_topic_with_bertopic.json') 56 | parser.add_argument('--dev_pseudo_topic', type=str, default='../data/IAM/origin/dev_pseudo_topic_with_bertopic.json') 57 | parser.add_argument('--test_pseudo_topic', type=str, default='../data/IAM/origin/test_pseudo_topic_with_bertopic.json') 58 | parser.add_argument('--init_weight', type=bool, default=False) # False면 학습된거. 
True면 쌩 로버타 59 | parser.add_argument('--device', type=str, default="cuda") 60 | #model 61 | parser.add_argument('--num_labels', type=int, default=2) 62 | parser.add_argument('--max_length', type=int, default=256) 63 | parser.add_argument('--batch_size', type=int, default=32) 64 | parser.add_argument('--cons_tag2id', type=dict, default=get_cons_tag_vocab('../data/IAM/constituent_gold_vocab.txt')) 65 | parser.add_argument('--cons_hidden_size', type=int, default=768) 66 | parser.add_argument('--feature_size', type=int, default=384) 67 | 68 | parser.add_argument("--gradient_accumulation_steps", type=int, default=1) 69 | parser.add_argument("--learning_rate", type=float, default=3e-5) 70 | parser.add_argument("--warmup_steps", type=int, default=0) 71 | parser.add_argument("--max_grad_norm", type=float, default=5.0) 72 | parser.add_argument('--mode', type=str, default='test') 73 | parser.add_argument('--seed', type=int, default=42) 74 | parser.add_argument('--checkpoint', type=int, default=7) 75 | parser.add_argument('--language_model', type=str, default='roberta-base') 76 | parser.add_argument("--epoch", type=int, default=40) 77 | parser.add_argument('--save_dir', type=str, default='only_sentence_base_only_grand_cons_768_feat_384_max_length_256_lr_3e5') # 모델 저장할 dir 78 | parser.add_argument('--output_dir', type=str, default='only_sentence_base_only_grand_cons_768_feat_384_max_length_256_lr_3e5') # 모델 불러올 dir 79 | 80 | args = parser.parse_args() 81 | main(args) 82 | -------------------------------------------------------------------------------- /mine_next/run_one.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from mine_next.functions.main_function2 import train, evaluate 3 | import random, os 4 | import numpy as np 5 | import torch 6 | from transformers import AutoConfig, AutoTokenizer 7 | from mine_next.model.modeling import RobertaForClassification, RobertaForSTANCY, RobertaForStanceClassification 8 | from mine_next.model.modeling import RobertaReflectGraphClassification, RobertaReflectGraphWithGrandEdgeClassification 9 | from mine_next.functions.sent_to_graph import get_cons_tag_vocab 10 | 11 | 12 | def create_model(args): 13 | config = AutoConfig.from_pretrained( 14 | args.language_model, 15 | num_labels=args.num_labels, 16 | max_length=args.max_length, 17 | # local_files_only=True 18 | ) 19 | tokenizer = AutoTokenizer.from_pretrained(args.language_model, do_lower_case=False, use_fast=False) 20 | setattr(config, 'cons_hidden_size', args.cons_hidden_size) 21 | setattr(config, 'feature_size', args.feature_size) 22 | setattr(config, 'cons_tag2id', args.cons_tag2id) 23 | model = RobertaReflectGraphClassification.from_pretrained( 24 | args.language_model if args.init_weight else os.path.join(args.output_dir, "checkpoint-{}".format(args.checkpoint)), 25 | config=config, 26 | # local_files_only=True 27 | ) 28 | return config, tokenizer, model 29 | 30 | def set_seed(args): 31 | random.seed(args.seed) 32 | np.random.seed(args.seed) 33 | torch.manual_seed(args.seed) 34 | if torch.cuda.is_available(): 35 | torch.cuda.manual_seed_all(args.seed) 36 | 37 | def main(args): 38 | set_seed(args) 39 | config, tokenizer, model = create_model(args) 40 | model.to(args.device) 41 | 42 | if args.mode == 'train': 43 | train(args, model, tokenizer) 44 | elif args.mode == 'dev': 45 | evaluate(args, model, tokenizer) 46 | 47 | if __name__ == '__main__': 48 | parser = argparse.ArgumentParser(description='main') 49 | parser.add_argument('--claim_train', 
type=str, default='../data/IAM/claims/train.txt') 50 | parser.add_argument('--claim_dev', type=str, default='../data/IAM/claims/dev.txt') 51 | parser.add_argument('--constituent_gold_vocab', type=str, default='../data/IAM/constituent_gold_vocab.txt') 52 | parser.add_argument('--init_weight', type=bool, default=True) # False면 학습된거. True면 쌩 로버타 53 | parser.add_argument('--device', type=str, default="cuda") 54 | #model 55 | parser.add_argument('--num_labels', type=int, default=2) 56 | parser.add_argument('--max_length', type=int, default=256) 57 | parser.add_argument('--batch_size', type=int, default=32) 58 | parser.add_argument('--cons_tag2id', type=dict, default=get_cons_tag_vocab('../data/IAM/constituent_gold_vocab.txt')) 59 | parser.add_argument('--cons_hidden_size', type=int, default=768) 60 | parser.add_argument('--feature_size', type=int, default=384) 61 | 62 | parser.add_argument("--gradient_accumulation_steps", type=int, default=1) 63 | parser.add_argument("--learning_rate", type=float, default=3e-5) 64 | parser.add_argument("--warmup_steps", type=int, default=0) 65 | parser.add_argument("--max_grad_norm", type=float, default=5.0) 66 | parser.add_argument('--mode', type=str, default='train') 67 | parser.add_argument('--seed', type=int, default=42) 68 | parser.add_argument('--checkpoint', type=int, default=8) 69 | parser.add_argument('--language_model', type=str, default='roberta-base') 70 | parser.add_argument("--epoch", type=int, default=30) 71 | parser.add_argument('--output_dir', type=str, default='only_sentence_base_graph_normal_only_pc_cons_768_feat_384_max_length_256_lr_3e5_with_pseudo_topic') # 모델 불러올 dir 72 | parser.add_argument('--save_dir', type=str, default='only_sentence_base_graph_normal_only_pc_cons_768_feat_384_max_length_256_lr_3e5_with_pseudo_topic') # 모델 저장할 dir 73 | 74 | 75 | args = parser.parse_args() 76 | main(args) 77 | -------------------------------------------------------------------------------- /mine_next/run_stance.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | #from mine_next.functions.main_function import train, evaluate 3 | from mine_next.functions.stance_main_func import train, evaluate 4 | import random, os 5 | import numpy as np 6 | import torch 7 | from transformers import AutoConfig, AutoTokenizer 8 | from mine_next.model.modeling import RobertaForClassification, RobertaForSTANCY, RobertaForStanceClassification 9 | from mine_next.model.modeling import RobertaReflectGraphClassification 10 | from mine_next.functions.sent_to_graph import get_cons_tag_vocab 11 | 12 | 13 | def create_model(args): 14 | config = AutoConfig.from_pretrained( 15 | args.language_model, 16 | num_labels=args.num_labels, 17 | max_length=args.max_length, 18 | # local_files_only=True 19 | ) 20 | tokenizer = AutoTokenizer.from_pretrained(args.language_model, do_lower_case=False, use_fast=False) 21 | setattr(config, 'cons_hidden_size', args.cons_hidden_size) 22 | setattr(config, 'cons_tag2id', args.cons_tag2id) 23 | model = RobertaForClassification.from_pretrained( 24 | args.language_model if args.init_weight else os.path.join(args.output_dir, "checkpoint-{}".format(args.checkpoint)), 25 | config=config, 26 | # local_files_only=True 27 | ) 28 | return config, tokenizer, model 29 | 30 | def set_seed(args): 31 | random.seed(args.seed) 32 | np.random.seed(args.seed) 33 | torch.manual_seed(args.seed) 34 | if torch.cuda.is_available(): 35 | torch.cuda.manual_seed_all(args.seed) 36 | 37 | def main(args): 38 | set_seed(args) 39 
| config, tokenizer, model = create_model(args) 40 | model.to(args.device) 41 | 42 | if args.mode == 'train': 43 | train(args, model, tokenizer) 44 | elif args.mode == 'dev': 45 | evaluate(args, model, tokenizer) 46 | 47 | if __name__ == '__main__': 48 | parser = argparse.ArgumentParser(description='main') 49 | parser.add_argument('--claim_train', type=str, default='../data/IAM/claims/train.txt') 50 | parser.add_argument('--claim_dev', type=str, default='../data/IAM/claims/dev.txt') 51 | parser.add_argument('--stance_train', type=str, default='../data/IAM/stance/train.txt') 52 | parser.add_argument('--stance_dev', type=str, default='../data/IAM/stance/dev.txt') 53 | parser.add_argument('--save_dir', type=str, default='stance_test') # 모델 불러올 dir 54 | parser.add_argument('--output_dir', type=str, default='stance_test') # 모델 저장할 dir 55 | parser.add_argument('--constituent_gold_vocab', type=str, default='../data/IAM/constituent_gold_vocab.txt') 56 | parser.add_argument('--init_weight', type=bool, default=True) # False면 학습된거. True면 쌩 로버타 57 | parser.add_argument('--device', type=str, default="cuda") 58 | #model 59 | parser.add_argument('--num_labels', type=int, default=2) 60 | parser.add_argument('--max_length', type=int, default=256) 61 | parser.add_argument('--batch_size', type=int, default=32) 62 | parser.add_argument('--cons_tag2id', type=dict, default=get_cons_tag_vocab('../data/IAM/constituent_gold_vocab.txt')) 63 | parser.add_argument('--cons_hidden_size', type=int, default=128) 64 | 65 | parser.add_argument("--gradient_accumulation_steps", type=int, default=1) 66 | parser.add_argument("--learning_rate", type=float, default=3e-5) 67 | parser.add_argument("--warmup_steps", type=int, default=0) 68 | parser.add_argument("--max_grad_norm", type=float, default=5.0) 69 | parser.add_argument('--mode', type=str, default='train') 70 | parser.add_argument('--seed', type=int, default=42) 71 | parser.add_argument('--checkpoint', type=int, default=1) 72 | parser.add_argument('--language_model', type=str, default='roberta-base') 73 | parser.add_argument("--epoch", type=int, default=15) 74 | 75 | args = parser.parse_args() 76 | main(args) 77 | --------------------------------------------------------------------------------